Re: [PATCH v2 08/17] vdpa_sim: add supported_features field in vdpasim_dev_attr
On 2020/11/26 下午10:49, Stefano Garzarella wrote: Introduce a new VDPASIM_FEATURES macro with the generic features supported by the vDPA simulator, and VDPASIM_NET_FEATURES macro with vDPA-net features. Add 'supported_features' field in vdpasim_dev_attr, to allow devices to specify their features. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 29 ++--- 1 file changed, 18 insertions(+), 11 deletions(-) Acked-by: Jason Wang diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 393b54a9f0e4..36677fc3631b 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -49,12 +49,15 @@ struct vdpasim_virtqueue { #define VDPASIM_VQ_NUM 0x2 #define VDPASIM_NAME "vdpasim-netdev" -static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | - (1ULL << VIRTIO_F_VERSION_1) | - (1ULL << VIRTIO_F_ACCESS_PLATFORM) | - (1ULL << VIRTIO_NET_F_MAC); +#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \ +(1ULL << VIRTIO_F_VERSION_1) | \ +(1ULL << VIRTIO_F_ACCESS_PLATFORM)) + +#define VDPASIM_NET_FEATURES (VDPASIM_FEATURES | \ +(1ULL << VIRTIO_NET_F_MAC)) struct vdpasim_dev_attr { + u64 supported_features; int nvqs; u32 id; }; @@ -112,7 +115,7 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = >vqs[idx]; - vringh_init_iotlb(>vring, vdpasim_features, + vringh_init_iotlb(>vring, vdpasim->dev_attr.supported_features, VDPASIM_QUEUE_MAX, false, (struct vring_desc *)(uintptr_t)vq->desc_addr, (struct vring_avail *) @@ -121,7 +124,8 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) (uintptr_t)vq->device_addr); } -static void vdpasim_vq_reset(struct vdpasim_virtqueue *vq) +static void vdpasim_vq_reset(struct vdpasim *vdpasim, +struct vdpasim_virtqueue *vq) { vq->ready = false; vq->desc_addr = 0; @@ -129,8 +133,8 @@ static void vdpasim_vq_reset(struct 
vdpasim_virtqueue *vq) vq->device_addr = 0; vq->cb = NULL; vq->private = NULL; - vringh_init_iotlb(>vring, vdpasim_features, VDPASIM_QUEUE_MAX, - false, NULL, NULL, NULL); + vringh_init_iotlb(>vring, vdpasim->dev_attr.supported_features, + VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL); } static void vdpasim_reset(struct vdpasim *vdpasim) @@ -138,7 +142,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) int i; for (i = 0; i < vdpasim->dev_attr.nvqs; i++) - vdpasim_vq_reset(>vqs[i]); + vdpasim_vq_reset(vdpasim, >vqs[i]); spin_lock(>iommu_lock); vhost_iotlb_reset(vdpasim->iommu); @@ -498,7 +502,9 @@ static u32 vdpasim_get_vq_align(struct vdpa_device *vdpa) static u64 vdpasim_get_features(struct vdpa_device *vdpa) { - return vdpasim_features; + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.supported_features; } static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) @@ -510,7 +516,7 @@ static int vdpasim_set_features(struct vdpa_device *vdpa, u64 features) if (!(features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) return -EINVAL; - vdpasim->features = features & vdpasim_features; + vdpasim->features = features & vdpasim->dev_attr.supported_features; /* We generally only know whether guest is using the legacy interface * here, so generally that's the earliest we can set config fields. @@ -722,6 +728,7 @@ static int __init vdpasim_dev_init(void) struct vdpasim_dev_attr dev_attr = {}; dev_attr.id = VIRTIO_ID_NET; + dev_attr.supported_features = VDPASIM_NET_FEATURES; dev_attr.nvqs = VDPASIM_VQ_NUM; vdpasim_dev = vdpasim_create(_attr);
Re: [PATCH v2 09/17] vdpa_sim: add work_fn in vdpasim_dev_attr
On 2020/11/26 下午10:49, Stefano Garzarella wrote: Rename vdpasim_work() in vdpasim_net_work() and add it to the vdpasim_dev_attr structure. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) Acked-by: Jason Wang diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 36677fc3631b..b84d9acd130c 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -60,6 +60,8 @@ struct vdpasim_dev_attr { u64 supported_features; int nvqs; u32 id; + + work_func_t work_fn; }; /* State of each vdpasim device */ @@ -153,7 +155,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) ++vdpasim->generation; } -static void vdpasim_work(struct work_struct *work) +static void vdpasim_net_work(struct work_struct *work) { struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); @@ -360,7 +362,7 @@ static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) goto err_alloc; vdpasim->dev_attr = *dev_attr; - INIT_WORK(>work, vdpasim_work); + INIT_WORK(>work, dev_attr->work_fn); spin_lock_init(>lock); spin_lock_init(>iommu_lock); @@ -730,6 +732,7 @@ static int __init vdpasim_dev_init(void) dev_attr.id = VIRTIO_ID_NET; dev_attr.supported_features = VDPASIM_NET_FEATURES; dev_attr.nvqs = VDPASIM_VQ_NUM; + dev_attr.work_fn = vdpasim_net_work; vdpasim_dev = vdpasim_create(_attr);
Re: [PATCH v2 07/17] vdpa_sim: add device id field in vdpasim_dev_attr
On 2020/11/26 下午10:49, Stefano Garzarella wrote: Remove VDPASIM_DEVICE_ID macro and add 'id' field in vdpasim_dev_attr, that will be returned by vdpasim_get_device_id(). Use VIRTIO_ID_NET for vDPA-net simulator device id. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) Acked-by: Jason Wang diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index f98262add0e1..393b54a9f0e4 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -44,7 +44,6 @@ struct vdpasim_virtqueue { #define VDPASIM_QUEUE_ALIGN PAGE_SIZE #define VDPASIM_QUEUE_MAX 256 -#define VDPASIM_DEVICE_ID 0x1 #define VDPASIM_VENDOR_ID 0 #define VDPASIM_IOTLB_LIMIT 0 /* unlimited */ #define VDPASIM_VQ_NUM 0x2 @@ -57,6 +56,7 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | struct vdpasim_dev_attr { int nvqs; + u32 id; }; /* State of each vdpasim device */ @@ -536,7 +536,9 @@ static u16 vdpasim_get_vq_num_max(struct vdpa_device *vdpa) static u32 vdpasim_get_device_id(struct vdpa_device *vdpa) { - return VDPASIM_DEVICE_ID; + struct vdpasim *vdpasim = vdpa_to_sim(vdpa); + + return vdpasim->dev_attr.id; } static u32 vdpasim_get_vendor_id(struct vdpa_device *vdpa) @@ -719,6 +721,7 @@ static int __init vdpasim_dev_init(void) { struct vdpasim_dev_attr dev_attr = {}; + dev_attr.id = VIRTIO_ID_NET; dev_attr.nvqs = VDPASIM_VQ_NUM; vdpasim_dev = vdpasim_create(_attr);
Re: [PATCH v2 06/17] vdpa_sim: add struct vdpasim_dev_attr for device attributes
On 2020/11/26 下午10:49, Stefano Garzarella wrote: vdpasim_dev_attr will contain device specific attributes. We starting moving the number of virtqueues (i.e. nvqs) to vdpasim_dev_attr. vdpasim_create() creates a new vDPA simulator following the device attributes defined in the vdpasim_dev_attr parameter. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 25 + 1 file changed, 17 insertions(+), 8 deletions(-) Acked-by: Jason Wang diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 62204e064841..f98262add0e1 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -55,11 +55,16 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | (1ULL << VIRTIO_F_ACCESS_PLATFORM) | (1ULL << VIRTIO_NET_F_MAC); +struct vdpasim_dev_attr { + int nvqs; +}; + /* State of each vdpasim device */ struct vdpasim { struct vdpa_device vdpa; struct vdpasim_virtqueue *vqs; struct work_struct work; + struct vdpasim_dev_attr dev_attr; /* spinlock to synchronize virtqueue state */ spinlock_t lock; struct virtio_net_config config; @@ -68,7 +73,6 @@ struct vdpasim { u32 status; u32 generation; u64 features; - int nvqs; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; @@ -133,7 +137,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) { int i; - for (i = 0; i < vdpasim->nvqs; i++) + for (i = 0; i < vdpasim->dev_attr.nvqs; i++) vdpasim_vq_reset(>vqs[i]); spin_lock(>iommu_lock); @@ -334,7 +338,7 @@ static const struct dma_map_ops vdpasim_dma_ops = { static const struct vdpa_config_ops vdpasim_config_ops; static const struct vdpa_config_ops vdpasim_batch_config_ops; -static struct vdpasim *vdpasim_create(void) +static struct vdpasim *vdpasim_create(struct vdpasim_dev_attr *dev_attr) { const struct vdpa_config_ops *ops; struct vdpasim *vdpasim; @@ -346,11 +350,12 @@ static struct vdpasim *vdpasim_create(void) else ops = 
_config_ops; - vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM); + vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, + dev_attr->nvqs); if (!vdpasim) goto err_alloc; - vdpasim->nvqs = VDPASIM_VQ_NUM; + vdpasim->dev_attr = *dev_attr; INIT_WORK(>work, vdpasim_work); spin_lock_init(>lock); spin_lock_init(>iommu_lock); @@ -361,7 +366,7 @@ static struct vdpasim *vdpasim_create(void) goto err_iommu; set_dma_ops(dev, _dma_ops); - vdpasim->vqs = kcalloc(vdpasim->nvqs, sizeof(struct vdpasim_virtqueue), + vdpasim->vqs = kcalloc(dev_attr->nvqs, sizeof(struct vdpasim_virtqueue), GFP_KERNEL); if (!vdpasim->vqs) goto err_iommu; @@ -384,7 +389,7 @@ static struct vdpasim *vdpasim_create(void) eth_random_addr(vdpasim->config.mac); } - for (i = 0; i < vdpasim->nvqs; i++) + for (i = 0; i < dev_attr->nvqs; i++) vringh_set_iotlb(>vqs[i].vring, vdpasim->iommu); vdpasim->vdpa.dma_dev = dev; @@ -712,7 +717,11 @@ static const struct vdpa_config_ops vdpasim_batch_config_ops = { static int __init vdpasim_dev_init(void) { - vdpasim_dev = vdpasim_create(); + struct vdpasim_dev_attr dev_attr = {}; + + dev_attr.nvqs = VDPASIM_VQ_NUM; + + vdpasim_dev = vdpasim_create(_attr); if (!IS_ERR(vdpasim_dev)) return 0;
Re: [PATCH v2 05/17] vdpa_sim: rename vdpasim_config_ops variables
On 2020/11/26 下午10:49, Stefano Garzarella wrote: These variables stores generic callbacks used by the vDPA simulator core, so we can remove the 'net' word in their names. Co-developed-by: Max Gurtovoy Signed-off-by: Max Gurtovoy Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 ++-- 1 file changed, 6 insertions(+), 6 deletions(-) Acked-by: Jason Wang diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 40664d87f303..62204e064841 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -331,8 +331,8 @@ static const struct dma_map_ops vdpasim_dma_ops = { .free = vdpasim_free_coherent, }; -static const struct vdpa_config_ops vdpasim_net_config_ops; -static const struct vdpa_config_ops vdpasim_net_batch_config_ops; +static const struct vdpa_config_ops vdpasim_config_ops; +static const struct vdpa_config_ops vdpasim_batch_config_ops; static struct vdpasim *vdpasim_create(void) { @@ -342,9 +342,9 @@ static struct vdpasim *vdpasim_create(void) int i, ret = -ENOMEM; if (batch_mapping) - ops = _net_batch_config_ops; + ops = _batch_config_ops; else - ops = _net_config_ops; + ops = _config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM); if (!vdpasim) @@ -657,7 +657,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) kfree(vdpasim->vqs); } -static const struct vdpa_config_ops vdpasim_net_config_ops = { +static const struct vdpa_config_ops vdpasim_config_ops = { .set_vq_address = vdpasim_set_vq_address, .set_vq_num = vdpasim_set_vq_num, .kick_vq= vdpasim_kick_vq, @@ -684,7 +684,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = { .free = vdpasim_free, }; -static const struct vdpa_config_ops vdpasim_net_batch_config_ops = { +static const struct vdpa_config_ops vdpasim_batch_config_ops = { .set_vq_address = vdpasim_set_vq_address, .set_vq_num = vdpasim_set_vq_num, .kick_vq= vdpasim_kick_vq,
Re: [PATCH v2 04/17] vdpa_sim: remove the limit of IOTLB entries
On 2020/11/26 下午10:49, Stefano Garzarella wrote: The simulated devices can support multiple queues, so this limit should be defined according to the number of queues supported by the device. Since we are in a simulator, let's simply remove that limit. Suggested-by: Jason Wang Acked-by: Jason Wang Signed-off-by: Stefano Garzarella --- v2: - added VDPASIM_IOTLB_LIMIT macro [Jason] Sorry for being unclear. I meant adding a macro like VHOST_IOTLB_UNLIMITED 0 in vhost_iotlb.h. And use that in vdpa_sim. Thanks --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index ad72f7b1a4eb..40664d87f303 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -46,6 +46,7 @@ struct vdpasim_virtqueue { #define VDPASIM_QUEUE_MAX 256 #define VDPASIM_DEVICE_ID 0x1 #define VDPASIM_VENDOR_ID 0 +#define VDPASIM_IOTLB_LIMIT 0 /* unlimited */ #define VDPASIM_VQ_NUM 0x2 #define VDPASIM_NAME "vdpasim-netdev" @@ -365,7 +366,7 @@ static struct vdpasim *vdpasim_create(void) if (!vdpasim->vqs) goto err_iommu; - vdpasim->iommu = vhost_iotlb_alloc(2048, 0); + vdpasim->iommu = vhost_iotlb_alloc(VDPASIM_IOTLB_LIMIT, 0); if (!vdpasim->iommu) goto err_iommu;
Re: [PATCH v2 03/17] vdpa_sim: remove hard-coded virtq count
On 2020/11/26 下午10:49, Stefano Garzarella wrote: From: Max Gurtovoy Add a new attribute that will define the number of virt queues to be created for the vdpasim device. Signed-off-by: Max Gurtovoy [sgarzare: replace kmalloc_array() with kcalloc()] Signed-off-by: Stefano Garzarella Acked-by: Jason Wang --- v1: - use kcalloc() instead of kmalloc_array() since some function expects variables initialized to zero --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 18 +- 1 file changed, 13 insertions(+), 5 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index c6eaf62df8ec..ad72f7b1a4eb 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -57,7 +57,7 @@ static u64 vdpasim_features = (1ULL << VIRTIO_F_ANY_LAYOUT) | /* State of each vdpasim device */ struct vdpasim { struct vdpa_device vdpa; - struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM]; + struct vdpasim_virtqueue *vqs; struct work_struct work; /* spinlock to synchronize virtqueue state */ spinlock_t lock; @@ -67,6 +67,7 @@ struct vdpasim { u32 status; u32 generation; u64 features; + int nvqs; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; @@ -131,7 +132,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) { int i; - for (i = 0; i < VDPASIM_VQ_NUM; i++) + for (i = 0; i < vdpasim->nvqs; i++) vdpasim_vq_reset(>vqs[i]); spin_lock(>iommu_lock); @@ -337,7 +338,7 @@ static struct vdpasim *vdpasim_create(void) const struct vdpa_config_ops *ops; struct vdpasim *vdpasim; struct device *dev; - int ret = -ENOMEM; + int i, ret = -ENOMEM; if (batch_mapping) ops = _net_batch_config_ops; @@ -348,6 +349,7 @@ static struct vdpasim *vdpasim_create(void) if (!vdpasim) goto err_alloc; + vdpasim->nvqs = VDPASIM_VQ_NUM; INIT_WORK(>work, vdpasim_work); spin_lock_init(>lock); spin_lock_init(>iommu_lock); @@ -358,6 +360,11 @@ static struct vdpasim *vdpasim_create(void) goto err_iommu; set_dma_ops(dev, _dma_ops); + vdpasim->vqs = kcalloc(vdpasim->nvqs, 
sizeof(struct vdpasim_virtqueue), + GFP_KERNEL); + if (!vdpasim->vqs) + goto err_iommu; + vdpasim->iommu = vhost_iotlb_alloc(2048, 0); if (!vdpasim->iommu) goto err_iommu; @@ -376,8 +383,8 @@ static struct vdpasim *vdpasim_create(void) eth_random_addr(vdpasim->config.mac); } - vringh_set_iotlb(>vqs[0].vring, vdpasim->iommu); - vringh_set_iotlb(>vqs[1].vring, vdpasim->iommu); + for (i = 0; i < vdpasim->nvqs; i++) + vringh_set_iotlb(>vqs[i].vring, vdpasim->iommu); vdpasim->vdpa.dma_dev = dev; ret = vdpa_register_device(>vdpa); @@ -646,6 +653,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) kfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); + kfree(vdpasim->vqs); } static const struct vdpa_config_ops vdpasim_net_config_ops = {
Re: [PATCH v2 02/17] vdpa_sim: remove unnecessary headers inclusion
On 2020/11/26 下午10:49, Stefano Garzarella wrote: Some headers are not necessary, so let's remove them to do some cleaning. Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 13 - 1 file changed, 13 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 6a90fdb9cbfc..c6eaf62df8ec 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -7,24 +7,11 @@ * */ -#include #include -#include I think the rule is to make sure e.g the structure definition can be via direct inclusion. E.g struct device {} is defined in this file. -#include -#include -#include -#include -#include -#include -#include -#include #include -#include -#include #include #include #include -#include And the __cpu_to_virtio16 is defined in this file. Thanks #include #include #include
Re: [PATCH v2 01/17] vdpa: remove unnecessary 'default n' in Kconfig entries
On 2020/11/26 下午10:49, Stefano Garzarella wrote: 'default n' is not necessary since it is already the default when nothing is specified. Suggested-by: Jason Wang Signed-off-by: Stefano Garzarella Acked-by: Jason Wang --- drivers/vdpa/Kconfig | 3 --- 1 file changed, 3 deletions(-) diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index 358f6048dd3c..4019ceb88181 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -14,7 +14,6 @@ config VDPA_SIM select DMA_OPS select VHOST_RING select GENERIC_NET_UTILS - default n help vDPA networking device simulator which loop TX traffic back to RX. This device is used for testing, prototyping and @@ -23,7 +22,6 @@ config VDPA_SIM config IFCVF tristate "Intel IFC VF vDPA driver" depends on PCI_MSI - default n help This kernel module can drive Intel IFC VF NIC to offload virtio dataplane traffic to hardware. @@ -41,7 +39,6 @@ config MLX5_VDPA_NET tristate "vDPA driver for ConnectX devices" select MLX5_VDPA depends on MLX5_CORE - default n help VDPA network driver for ConnectX6 and newer. Provides offloading of virtio net datapath such that descriptors put on the ring will
Re: [PATCH] vdpa: ifcvf: Use dma_set_mask_and_coherent to simplify code
On 2020/11/29 下午8:54, Christophe JAILLET wrote: 'pci_set_dma_mask()' + 'pci_set_consistent_dma_mask()' can be replaced by an equivalent 'dma_set_mask_and_coherent()' which is much less verbose. While at it, fix a typo (s/confiugration/configuration) Signed-off-by: Christophe JAILLET --- Acked-by: Jason Wang drivers/vdpa/ifcvf/ifcvf_main.c | 11 ++- 1 file changed, 2 insertions(+), 9 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 8b4028556cb6..fa1af301cf55 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -417,16 +417,9 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) return ret; } - ret = pci_set_dma_mask(pdev, DMA_BIT_MASK(64)); + ret = dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64)); if (ret) { - IFCVF_ERR(pdev, "No usable DMA confiugration\n"); - return ret; - } - - ret = pci_set_consistent_dma_mask(pdev, DMA_BIT_MASK(64)); - if (ret) { - IFCVF_ERR(pdev, - "No usable coherent DMA confiugration\n"); + IFCVF_ERR(pdev, "No usable DMA configuration\n"); return ret; }
Re: [PATCH v4] vdpa: mlx5: fix vdpa/vhost dependencies
On 2020/11/29 上午5:39, Randy Dunlap wrote: drivers/vdpa/mlx5/ uses vhost_iotlb*() interfaces, so select VHOST_IOTLB to make them be built. However, if VHOST_IOTLB is the only VHOST symbol that is set/enabled, the object file still won't be built because drivers/Makefile won't descend into drivers/vhost/ to build it, so make drivers/Makefile build the needed binary whenever VHOST_IOTLB is set, like it does for VHOST_RING. Fixes these build errors: ERROR: modpost: "vhost_iotlb_itree_next" [drivers/vdpa/mlx5/mlx5_vdpa.ko] undefined! ERROR: modpost: "vhost_iotlb_itree_first" [drivers/vdpa/mlx5/mlx5_vdpa.ko] undefined! Fixes: 29064bfdabd5 ("vdpa/mlx5: Add support library for mlx5 VDPA implementation") Fixes: aff90770e54c ("vdpa/mlx5: Fix dependency on MLX5_CORE") Reported-by: kernel test robot Signed-off-by: Randy Dunlap Cc: Eli Cohen Cc: Parav Pandit Cc: "Michael S. Tsirkin" Cc: Jason Wang Cc: virtualizat...@lists.linux-foundation.org Cc: Saeed Mahameed Cc: Leon Romanovsky Cc: net...@vger.kernel.org --- v2: change from select to depends on VHOST (Saeed) v3: change to depends on VHOST_IOTLB (Jason) v4: use select VHOST_IOTLB (Michael); also add to drivers/Makefile drivers/Makefile |1 + drivers/vdpa/Kconfig |1 + 2 files changed, 2 insertions(+) --- linux-next-20201127.orig/drivers/vdpa/Kconfig +++ linux-next-20201127/drivers/vdpa/Kconfig @@ -32,6 +32,7 @@ config IFCVF config MLX5_VDPA bool + select VHOST_IOTLB help Support library for Mellanox VDPA drivers. Provides code that is common for all types of VDPA drivers. The following drivers are planned: --- linux-next-20201127.orig/drivers/Makefile +++ linux-next-20201127/drivers/Makefile @@ -143,6 +143,7 @@ obj-$(CONFIG_OF)+= of/ obj-$(CONFIG_SSB) += ssb/ obj-$(CONFIG_BCMA)+= bcma/ obj-$(CONFIG_VHOST_RING) += vhost/ +obj-$(CONFIG_VHOST_IOTLB) += vhost/ obj-$(CONFIG_VHOST) += vhost/ obj-$(CONFIG_VLYNQ) += vlynq/ obj-$(CONFIG_GREYBUS) += greybus/ Acked-by: Jason Wang Thanks
Re: [PATCH V2 02/14] virtio-pci: switch to use devres for modern devices
On 2020/11/26 下午9:57, Michael S. Tsirkin wrote: On Thu, Nov 26, 2020 at 05:25:52PM +0800, Jason Wang wrote: This patch tries to convert the modern device to use devres to manage its resources (iomaps). Before this patch the IO address is mapped individually according to the capability. After this patch, we simply map the whole BAR. I think the point of mapping capability was e.g. for devices with huge BARs. We don't want to waste virtual memory for e.g. 32 bit guests. And in particular the spec says: The drivers SHOULD only map part of configuration structure large enough for device operation. The drivers MUST handle an unexpectedly large length, but MAY check that length is large enough for device operation. Good point, so I will stick to devres but not use the shortcut like whole BAR mapping. I also wonder how would this interact with cases where device memory is mapped for different reasons, such as for MSI table access, into userspace as it has resources such as virtio mem, etc. I think it depends on the driver, e.g for virtio-pci and vDPA, the upper layer driver (virtio bus or vDPA bus) know nothing about transport specific thing. It should be ok. E.g. don't e.g. intel CPUs disallow mapping the same address twice with different attributes? Do you mean it doesn't allow one VA is mapped as UC but the other is not? I don't know. But anyhow my understanding is that virtio-pci/vp_vdpa tries to hide the details so we can not have two mappings here. Thanks
Re: [PATCH V2 01/14] virtio-pci: do not access iomem via virtio_pci_device directly
On 2020/11/26 下午9:46, Michael S. Tsirkin wrote: On Thu, Nov 26, 2020 at 05:25:51PM +0800, Jason Wang wrote: Instead of accessing iomem via virito_pci_device directly. Add an indirect level well this patch does not add any indirection it's just refactoring. which is ok of course let's just say it as is. to ease the life of splitting out modern virito-pci typo Will fix. Thanks
[PATCH V2 06/14] virtio-pci-modern: introduce vp_modern_queue_address()
This patch introduce a helper to set virtqueue address for modern address. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_modern.c | 33 -- 1 file changed, 27 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index bacc05cbc762..3125987973d3 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -174,6 +174,30 @@ static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev, return vp_ioread16(>queue_msix_vector); } +/* + * vp_modern_queue_address - set the virtqueue address + * @mdev: the modern virtio-pci device + * @index: the queue index + * @desc_addr: address of the descriptor area + * @driver_addr: address of the driver area + * @device_addr: address of the device area + */ +static void vp_modern_queue_address(struct virtio_pci_modern_device *mdev, + u16 index, u64 desc_addr, u64 driver_addr, + u64 device_addr) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite16(index, >queue_select); + + vp_iowrite64_twopart(desc_addr, >queue_desc_lo, +>queue_desc_hi); + vp_iowrite64_twopart(driver_addr, >queue_avail_lo, +>queue_avail_hi); + vp_iowrite64_twopart(device_addr, >queue_used_lo, +>queue_used_hi); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -396,12 +420,9 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* activate the queue */ vp_iowrite16(virtqueue_get_vring_size(vq), >queue_size); - vp_iowrite64_twopart(virtqueue_get_desc_addr(vq), ->queue_desc_lo, >queue_desc_hi); - vp_iowrite64_twopart(virtqueue_get_avail_addr(vq), ->queue_avail_lo, >queue_avail_hi); - vp_iowrite64_twopart(virtqueue_get_used_addr(vq), ->queue_used_lo, >queue_used_hi); + vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), + virtqueue_get_avail_addr(vq), + virtqueue_get_used_addr(vq)); vq->priv = (void __force *)mdev->notify_base + 
off * mdev->notify_offset_multiplier; -- 2.25.1
[PATCH V2 14/14] vdpa: introduce virtio pci driver
This patch introduce a vDPA driver for virtio-pci device. It bridges the virtio-pci control command to the vDPA bus. This will be used for features prototyping and testing. Note that get/restore virtqueue state is not supported which needs extension on the virtio specification. Signed-off-by: Jason Wang --- drivers/vdpa/Kconfig | 6 + drivers/vdpa/Makefile | 1 + drivers/vdpa/virtio_pci/Makefile | 2 + drivers/vdpa/virtio_pci/vp_vdpa.c | 450 ++ 4 files changed, 459 insertions(+) create mode 100644 drivers/vdpa/virtio_pci/Makefile create mode 100644 drivers/vdpa/virtio_pci/vp_vdpa.c diff --git a/drivers/vdpa/Kconfig b/drivers/vdpa/Kconfig index d7d32b656102..4cca53114cc4 100644 --- a/drivers/vdpa/Kconfig +++ b/drivers/vdpa/Kconfig @@ -47,4 +47,10 @@ config MLX5_VDPA_NET be executed by the hardware. It also supports a variety of stateless offloads depending on the actual device used and firmware version. +config VP_VDPA + tristate "Virtio PCI bridge vDPA driver" + depends on PCI_MSI && VIRTIO_PCI_MODERN + help + This kernel module that bridges virtio PCI device to vDPA bus. 
+ endif # VDPA diff --git a/drivers/vdpa/Makefile b/drivers/vdpa/Makefile index d160e9b63a66..67fe7f3d6943 100644 --- a/drivers/vdpa/Makefile +++ b/drivers/vdpa/Makefile @@ -3,3 +3,4 @@ obj-$(CONFIG_VDPA) += vdpa.o obj-$(CONFIG_VDPA_SIM) += vdpa_sim/ obj-$(CONFIG_IFCVF)+= ifcvf/ obj-$(CONFIG_MLX5_VDPA) += mlx5/ +obj-$(CONFIG_VP_VDPA)+= virtio_pci/ diff --git a/drivers/vdpa/virtio_pci/Makefile b/drivers/vdpa/virtio_pci/Makefile new file mode 100644 index ..231088d3af7d --- /dev/null +++ b/drivers/vdpa/virtio_pci/Makefile @@ -0,0 +1,2 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_VP_VDPA) += vp_vdpa.o diff --git a/drivers/vdpa/virtio_pci/vp_vdpa.c b/drivers/vdpa/virtio_pci/vp_vdpa.c new file mode 100644 index ..6458fa470566 --- /dev/null +++ b/drivers/vdpa/virtio_pci/vp_vdpa.c @@ -0,0 +1,450 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * vDPA bridge driver for modern virtio-pci device + * + * Copyright (c) 2020, Red Hat Inc. All rights reserved. + * Author: Jason Wang + * + * Based on virtio_pci_modern.c. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define VP_VDPA_QUEUE_MAX 256 +#define VP_VDPA_DRIVER_NAME "vp_vdpa" + +struct vp_vring { + void __iomem *notify; + char msix_name[256]; + struct vdpa_callback cb; + int irq; +}; + +struct vp_vdpa { + struct vdpa_device vdpa; + struct virtio_pci_modern_device mdev; + struct vp_vring *vring; + struct vdpa_callback cb; + char msix_name[256]; + int config_irq; + int queues; + int vectors; +}; + +static struct vp_vdpa *vdpa_to_vp(struct vdpa_device *vdpa) +{ + return container_of(vdpa, struct vp_vdpa, vdpa); +} + +static struct virtio_pci_modern_device *vdpa_to_mdev(struct vdpa_device *vdpa) +{ + struct vp_vdpa *vp_vdpa = vdpa_to_vp(vdpa); + + return _vdpa->mdev; +} + +static u64 vp_vdpa_get_features(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_features(mdev); +} + +static int vp_vdpa_set_features(struct vdpa_device *vdpa, u64 features) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + vp_modern_set_features(mdev, features); + + return 0; +} + +static u8 vp_vdpa_get_status(struct vdpa_device *vdpa) +{ + struct virtio_pci_modern_device *mdev = vdpa_to_mdev(vdpa); + + return vp_modern_get_status(mdev); +} + +static void vp_vdpa_free_irq(struct vp_vdpa *vp_vdpa) +{ + struct virtio_pci_modern_device *mdev = _vdpa->mdev; + struct pci_dev *pdev = mdev->pci_dev; + int i; + + for (i = 0; i < vp_vdpa->queues; i++) { + if (vp_vdpa->vring[i].irq != VIRTIO_MSI_NO_VECTOR) { + vp_modern_queue_vector(mdev, i, VIRTIO_MSI_NO_VECTOR); + devm_free_irq(>dev, vp_vdpa->vring[i].irq, + _vdpa->vring[i]); + vp_vdpa->vring[i].irq = VIRTIO_MSI_NO_VECTOR; + } + } + + if (vp_vdpa->config_irq != VIRTIO_MSI_NO_VECTOR) { + vp_modern_config_vector(mdev, VIRTIO_MSI_NO_VECTOR); + devm_free_irq(>dev, vp_vdpa->config_irq, vp_vdpa); + vp_vdpa->config_irq = VIRTIO_MSI_NO_VECTOR; + } + + if (vp_vdpa->vectors) { + 
pci_free_irq_vectors(pdev); + vp_vdpa->vectors = 0; + } +} + +static irqreturn_t vp_vdpa_vq_handler(int irq, void *arg) +{ + struct vp_vring *vring = arg; + + if (vring->cb.callback) + return vring->cb.callback(vring->cb.pr
[PATCH V2 10/14] virtio-pci-modern: introduce helper to get notification offset
This patch introduces help to get notification offset of modern device. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_modern.c | 21 - 1 file changed, 16 insertions(+), 5 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 0b86a36998c8..8f1f274724be 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -267,6 +267,21 @@ static u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev) return vp_ioread16(>common->num_queues); } +/* + * vp_modern_get_queue_notify_off - get notification offset for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the notification offset for a virtqueue + */ +static u16 vp_modern_get_queue_notify_off(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, >common->queue_select); + + return vp_ioread16(>common->queue_notify_off); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -453,7 +468,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, { struct virtio_pci_modern_device *mdev = _dev->mdev; - struct virtio_pci_common_cfg __iomem *cfg = mdev->common; struct virtqueue *vq; u16 num, off; int err; @@ -461,9 +475,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, if (index >= vp_modern_get_num_queues(mdev)) return ERR_PTR(-ENOENT); - /* Select the queue we're interested in */ - vp_iowrite16(index, >queue_select); - /* Check if queue is either not available or already active. */ num = vp_modern_get_queue_size(mdev, index); if (!num || vp_modern_get_queue_enable(mdev, index)) @@ -475,7 +486,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, } /* get offset of notification word for this vq */ - off = vp_ioread16(>queue_notify_off); + off = vp_modern_get_queue_notify_off(mdev, index); info->msix_vector = msix_vec; -- 2.25.1
[PATCH V2 11/14] virtio-pci: introduce modern device module
This patch introduce an separate module that implement the low level device probe and access logic for modern device. The goal is let the module to be reused by other driver (e.g vDPA driver that will be introduced soon). Note that, the shared memory cap is not converted since there's no user currently. We can do that in the future if necessary. Signed-off-by: Jason Wang --- drivers/virtio/Kconfig | 10 +- drivers/virtio/Makefile| 1 + drivers/virtio/virtio_pci_common.h | 28 +- drivers/virtio/virtio_pci_modern.c | 462 - drivers/virtio/virtio_pci_modern_dev.c | 462 + include/linux/virtio_pci_modern.h | 107 ++ 6 files changed, 580 insertions(+), 490 deletions(-) create mode 100644 drivers/virtio/virtio_pci_modern_dev.c create mode 100644 include/linux/virtio_pci_modern.h diff --git a/drivers/virtio/Kconfig b/drivers/virtio/Kconfig index e76e9b9ba93c..26491b6e7e10 100644 --- a/drivers/virtio/Kconfig +++ b/drivers/virtio/Kconfig @@ -6,6 +6,14 @@ config VIRTIO bus, such as CONFIG_VIRTIO_PCI, CONFIG_VIRTIO_MMIO, CONFIG_RPMSG or CONFIG_S390_GUEST. +config VIRTIO_PCI_MODERN + tristate "Modern Virtio PCI Device" + depends on PCI + help + Modern PCI device implementation. This module implement the + basic probe and control for devices which is based on modern + PCI device with possible vendor specific extensions. 
+ menuconfig VIRTIO_MENU bool "Virtio drivers" default y @@ -14,7 +22,7 @@ if VIRTIO_MENU config VIRTIO_PCI tristate "PCI driver for virtio devices" - depends on PCI + depends on PCI && VIRTIO_PCI_MODERN select VIRTIO help This driver provides support for virtio based paravirtual device diff --git a/drivers/virtio/Makefile b/drivers/virtio/Makefile index 591e6f72aa54..f097578aaa8f 100644 --- a/drivers/virtio/Makefile +++ b/drivers/virtio/Makefile @@ -1,5 +1,6 @@ # SPDX-License-Identifier: GPL-2.0 obj-$(CONFIG_VIRTIO) += virtio.o virtio_ring.o +obj-$(CONFIG_VIRTIO_PCI_MODERN) += virtio_pci_modern_dev.o obj-$(CONFIG_VIRTIO_MMIO) += virtio_mmio.o obj-$(CONFIG_VIRTIO_PCI) += virtio_pci.o virtio_pci-y := virtio_pci_modern.o virtio_pci_common.o diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index d32af8ff56f9..4025b940f74e 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -25,6 +25,7 @@ #include #include #include +#include #include #include @@ -39,33 +40,6 @@ struct virtio_pci_vq_info { unsigned msix_vector; }; -struct virtio_pci_modern_device { - struct pci_dev *pci_dev; - - /* The IO mapping for the PCI BARs */ - void __iomem * const *base; - - /* The IO mapping for the PCI config space */ - struct virtio_pci_common_cfg __iomem *common; - /* Device-specific data (non-legacy mode) */ - void __iomem *device; - /* Base of vq notifications (non-legacy mode). */ - void __iomem *notify_base; - /* Where to read and clear interrupt */ - u8 __iomem *isr; - - /* So we can sanity-check accesses. */ - size_t notify_len; - size_t device_len; - - /* Multiply queue_notify_off by this value. (non-legacy mode). 
*/ - u32 notify_offset_multiplier; - - int modern_bars; - - struct virtio_device_id id; -}; - /* Our device structure */ struct virtio_pci_device { struct virtio_device vdev; diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 8f1f274724be..8dfdc3b57502 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -19,113 +19,6 @@ #define VIRTIO_RING_NO_LEGACY #include "virtio_pci_common.h" -/* - * Type-safe wrappers for io accesses. - * Use these to enforce at compile time the following spec requirement: - * - * The driver MUST access each field using the “natural” access - * method, i.e. 32-bit accesses for 32-bit fields, 16-bit accesses - * for 16-bit fields and 8-bit accesses for 8-bit fields. - */ -static inline u8 vp_ioread8(const u8 __iomem *addr) -{ - return ioread8(addr); -} -static inline u16 vp_ioread16 (const __le16 __iomem *addr) -{ - return ioread16(addr); -} - -static inline u32 vp_ioread32(const __le32 __iomem *addr) -{ - return ioread32(addr); -} - -static inline void vp_iowrite8(u8 value, u8 __iomem *addr) -{ - iowrite8(value, addr); -} - -static inline void vp_iowrite16(u16 value, __le16 __iomem *addr) -{ - iowrite16(value, addr); -} - -static inline void vp_iowrite32(u32 value, __le32 __iomem *addr) -{ - iowrite32(value, addr); -} - -static void vp_iowrite64_twopart(u64 val, -__le32 __iomem *lo, __le32 __iomem *hi) -{ - vp_iowrite32((
[PATCH V2 12/14] vdpa: set the virtqueue num during register
This patch delay the queue number setting to vDPA device registering. This allows us to probe the virtqueue numbers between device allocation and registering. Signed-off-by: Jason Wang --- drivers/vdpa/ifcvf/ifcvf_main.c | 5 ++--- drivers/vdpa/mlx5/net/mlx5_vnet.c | 5 ++--- drivers/vdpa/vdpa.c | 8 drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 ++-- include/linux/vdpa.h | 7 +++ 5 files changed, 13 insertions(+), 16 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 8b4028556cb6..d65f3221d8ed 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -438,8 +438,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) } adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, - dev, _vdpa_ops, - IFCVF_MAX_QUEUE_PAIRS * 2); + dev, _vdpa_ops); if (adapter == NULL) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return -ENOMEM; @@ -463,7 +462,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) for (i = 0; i < IFCVF_MAX_QUEUE_PAIRS * 2; i++) vf->vring[i].irq = -EINVAL; - ret = vdpa_register_device(>vdpa); + ret = vdpa_register_device(>vdpa, IFCVF_MAX_QUEUE_PAIRS * 2); if (ret) { IFCVF_ERR(pdev, "Failed to register ifcvf to vdpa bus"); goto err; diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 74264e590695..baa6be16f3e5 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1932,8 +1932,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) max_vqs = MLX5_CAP_DEV_VDPA_EMULATION(mdev, max_num_virtio_queues); max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); - ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, _vdpa_ops, -2 * mlx5_vdpa_max_qps(max_vqs)); + ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, _vdpa_ops); if (IS_ERR(ndev)) return ndev; @@ -1960,7 +1959,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) if 
(err) goto err_res; - err = vdpa_register_device(>vdev); + err = vdpa_register_device(>vdev, 2 * mlx5_vdpa_max_qps(max_vqs)); if (err) goto err_reg; diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index a69ffc991e13..ba89238f9898 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -61,7 +61,6 @@ static void vdpa_release_dev(struct device *d) * initialized but before registered. * @parent: the parent device * @config: the bus operations that is supported by this device - * @nvqs: number of virtqueues supported by this device * @size: size of the parent structure that contains private data * * Driver should use vdpa_alloc_device() wrapper macro instead of @@ -72,7 +71,6 @@ static void vdpa_release_dev(struct device *d) */ struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, - int nvqs, size_t size) { struct vdpa_device *vdev; @@ -99,7 +97,6 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->index = err; vdev->config = config; vdev->features_valid = false; - vdev->nvqs = nvqs; err = dev_set_name(>dev, "vdpa%u", vdev->index); if (err) @@ -122,11 +119,14 @@ EXPORT_SYMBOL_GPL(__vdpa_alloc_device); * vdpa_register_device - register a vDPA device * Callers must have a succeed call of vdpa_alloc_device() before. 
* @vdev: the vdpa device to be registered to vDPA bus + * @nvqs: number of virtqueues supported by this device * * Returns an error when fail to add to vDPA bus */ -int vdpa_register_device(struct vdpa_device *vdev) +int vdpa_register_device(struct vdpa_device *vdev, int nvqs) { + vdev->nvqs = nvqs; + return device_add(>dev); } EXPORT_SYMBOL_GPL(vdpa_register_device); diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index fb3e7d46870f..e3108bd77610 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -352,7 +352,7 @@ static struct vdpasim *vdpasim_create(void) else ops = _net_config_ops; - vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM); + vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops); if (!vdpasim) goto err_alloc; @@ -378,7 +378,7 @@ static struct vdpasi
[PATCH V2 13/14] virtio_vdpa: don't warn when failing to disable vq
There's no guarantee that the device can disable a specific virtqueue through set_vq_ready(). One example is the modern virtio-pci device. So this patch removes the warning. Signed-off-by: Jason Wang --- drivers/virtio/virtio_vdpa.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_vdpa.c b/drivers/virtio/virtio_vdpa.c index 4a9ddb44b2a7..e28acf482e0c 100644 --- a/drivers/virtio/virtio_vdpa.c +++ b/drivers/virtio/virtio_vdpa.c @@ -225,9 +225,8 @@ static void virtio_vdpa_del_vq(struct virtqueue *vq) list_del(>node); spin_unlock_irqrestore(_dev->lock, flags); - /* Select and deactivate the queue */ + /* Select and deactivate the queue (best effort) */ ops->set_vq_ready(vdpa, index, 0); - WARN_ON(ops->get_vq_ready(vdpa, index)); vring_del_virtqueue(vq); -- 2.25.1
[PATCH V2 09/14] virtio-pci-modern: introduce helper for getting queue nums
This patch introduces helper for getting queue num of modern device. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_modern.c | 13 - 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index f85216ccc6df..0b86a36998c8 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -256,6 +256,17 @@ static u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, } +/* + * vp_modern_get_num_queues - get the number of virtqueues + * @mdev: the modern virtio-pci device + * + * Returns the number of virtqueues + */ +static u16 vp_modern_get_num_queues(struct virtio_pci_modern_device *mdev) +{ + return vp_ioread16(>common->num_queues); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -447,7 +458,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, u16 num, off; int err; - if (index >= vp_ioread16(>num_queues)) + if (index >= vp_modern_get_num_queues(mdev)) return ERR_PTR(-ENOENT); /* Select the queue we're interested in */ -- 2.25.1
[PATCH V2 08/14] virtio-pci-modern: introduce helper for setting/getting queue size
This patch introduces helper for setting/getting queue size for modern device. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_modern.c | 34 -- 1 file changed, 32 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index dcdda32b6182..f85216ccc6df 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -226,6 +226,36 @@ static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, return vp_ioread16(>common->queue_enable); } +/* + * vp_modern_set_queue_size - set size for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @size: the size of the virtqueue + */ +static void vp_modern_set_queue_size(struct virtio_pci_modern_device *mdev, +u16 index, u16 size) +{ + vp_iowrite16(index, >common->queue_select); + vp_iowrite16(size, >common->queue_size); + +} + +/* + * vp_modern_get_queue_size - get size for a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns the size of the virtqueue + */ +static u16 vp_modern_get_queue_size(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, >common->queue_select); + + return vp_ioread16(>common->queue_size); + +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -424,7 +454,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, vp_iowrite16(index, >queue_select); /* Check if queue is either not available or already active. 
*/ - num = vp_ioread16(>queue_size); + num = vp_modern_get_queue_size(mdev, index); if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); @@ -447,7 +477,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, return ERR_PTR(-ENOMEM); /* activate the queue */ - vp_iowrite16(virtqueue_get_vring_size(vq), >queue_size); + vp_modern_set_queue_size(mdev, index, virtqueue_get_vring_size(vq)); vp_modern_queue_address(mdev, index, virtqueue_get_desc_addr(vq), virtqueue_get_avail_addr(vq), virtqueue_get_used_addr(vq)); -- 2.25.1
[PATCH V2 04/14] virtio-pci: move the notification sanity check to vp_modern_probe()
This patch moves the notification sanity check to vp_modern_probe(). This can make sure the logic could be reused by modules other than virtio-pci. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_modern.c | 34 +++--- 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 02688c3b3fbd..d001c74beefe 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -384,17 +384,6 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, vp_iowrite64_twopart(virtqueue_get_used_addr(vq), >queue_used_lo, >queue_used_hi); - /* offset should not wrap */ - if ((u64)off * mdev->notify_offset_multiplier + 2 - > mdev->notify_len) { - dev_warn(_dev->pci_dev->dev, -"bad notification offset %u (x %u) " -"for queue %u > %zd", -off, mdev->notify_offset_multiplier, -index, mdev->notify_len); - err = -EINVAL; - goto err_map_notify; - } vq->priv = (void __force *)mdev->notify_base + off * mdev->notify_offset_multiplier; @@ -695,9 +684,11 @@ static inline void check_offsets(void) static int vp_modern_probe(struct virtio_pci_modern_device *mdev) { struct pci_dev *pci_dev = mdev->pci_dev; - int err, common, isr, notify, device; + int err, common, isr, notify, device, i; + unsigned int num_queues; u32 notify_length; u32 notify_offset; + u16 off; /* We only own devices >= 0x1000 and <= 0x107f: leave the rest. 
*/ if (pci_dev->device < 0x1000 || pci_dev->device > 0x107f) @@ -796,6 +787,25 @@ static int vp_modern_probe(struct virtio_pci_modern_device *mdev) if (!mdev->notify_base) goto err; + num_queues = vp_ioread16(>common->num_queues); + + /* offset should not wrap */ + for (i = 0; i < num_queues; i++) { + vp_iowrite16(i, >common->queue_select); + off = vp_ioread16(>common->queue_notify_off); + + if ((u64)off * mdev->notify_offset_multiplier + 2 + > mdev->notify_len) { + dev_warn(_dev->dev, +"bad notification offset %u (x %u) " +"for queue %u > %zd", +off, mdev->notify_offset_multiplier, +i, mdev->notify_len); + err = -EINVAL; + goto err; + } + } + /* We don't know how much we should map, but PAGE_SIZE * is more than enough for all existing devices. */ -- 2.25.1
[PATCH V2 07/14] virtio-pci-modern: introduce helper to set/get queue_enable
This patch introduces a helper to set/get queue_enable for modern device. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_modern.c | 37 +- 1 file changed, 31 insertions(+), 6 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 3125987973d3..dcdda32b6182 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -198,6 +198,34 @@ static void vp_modern_queue_address(struct virtio_pci_modern_device *mdev, >queue_used_hi); } +/* + * vp_modern_set_queue_enable - enable a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * @enable: whether the virtqueue is enable or not + */ +static void vp_modern_set_queue_enable(struct virtio_pci_modern_device *mdev, + u16 index, bool enable) +{ + vp_iowrite16(index, >common->queue_select); + vp_iowrite16(enable, >common->queue_enable); +} + +/* + * vp_modern_get_queue_enable - enable a virtqueue + * @mdev: the modern virtio-pci device + * @index: the queue index + * + * Returns whether a virtqueue is enabled or not + */ +static bool vp_modern_get_queue_enable(struct virtio_pci_modern_device *mdev, + u16 index) +{ + vp_iowrite16(index, >common->queue_select); + + return vp_ioread16(>common->queue_enable); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -397,7 +425,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, /* Check if queue is either not available or already active. 
*/ num = vp_ioread16(>queue_size); - if (!num || vp_ioread16(>queue_enable)) + if (!num || vp_modern_get_queue_enable(mdev, index)) return ERR_PTR(-ENOENT); if (num & (num - 1)) { @@ -454,7 +482,6 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, struct irq_affinity *desc) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->mdev.common; struct virtqueue *vq; int rc = vp_find_vqs(vdev, nvqs, vqs, callbacks, names, ctx, desc); @@ -464,10 +491,8 @@ static int vp_modern_find_vqs(struct virtio_device *vdev, unsigned nvqs, /* Select and activate all queues. Has to be done last: once we do * this, there's no way to go back except reset. */ - list_for_each_entry(vq, >vqs, list) { - vp_iowrite16(vq->index, >queue_select); - vp_iowrite16(1, >queue_enable); - } + list_for_each_entry(vq, >vqs, list) + vp_modern_set_queue_enable(_dev->mdev, vq->index, true); return 0; } -- 2.25.1
[PATCH V2 05/14] virtio-pci-modern: introduce vp_modern_set_queue_vector()
This patch introduces a helper to set virtqueue MSI vector. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_modern.c | 35 -- 1 file changed, 24 insertions(+), 11 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index d001c74beefe..bacc05cbc762 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -155,6 +155,25 @@ static void vp_modern_set_features(struct virtio_pci_modern_device *mdev, vp_iowrite32(features >> 32, >guest_feature); } +/* + * vp_modern_queue_vector - set the MSIX vector for a specific virtqueue + * @mdev: the modern virtio-pci device + * @index: queue index + * @vector: the config vector + * + * Returns the config vector read from the device + */ +static u16 vp_modern_queue_vector(struct virtio_pci_modern_device *mdev, + u16 index, u16 vector) +{ + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + + vp_iowrite16(index, >queue_select); + vp_iowrite16(vector, >queue_msix_vector); + /* Flush the write out to device */ + return vp_ioread16(>queue_msix_vector); +} + /* virtio config->finalize_features() implementation */ static int vp_finalize_features(struct virtio_device *vdev) { @@ -393,8 +412,7 @@ static struct virtqueue *setup_vq(struct virtio_pci_device *vp_dev, } if (msix_vec != VIRTIO_MSI_NO_VECTOR) { - vp_iowrite16(msix_vec, >queue_msix_vector); - msix_vec = vp_ioread16(>queue_msix_vector); + msix_vec = vp_modern_queue_vector(mdev, index, msix_vec); if (msix_vec == VIRTIO_MSI_NO_VECTOR) { err = -EBUSY; goto err_map_notify; @@ -437,16 +455,11 @@ static void del_vq(struct virtio_pci_vq_info *info) { struct virtqueue *vq = info->vq; struct virtio_pci_device *vp_dev = to_vp_device(vq->vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->mdev.common; - - vp_iowrite16(vq->index, >queue_select); + struct virtio_pci_modern_device *mdev = _dev->mdev; - if (vp_dev->msix_enabled) { - vp_iowrite16(VIRTIO_MSI_NO_VECTOR, ->queue_msix_vector); - /* 
Flush the write out to device */ - vp_ioread16(>queue_msix_vector); - } + if (vp_dev->msix_enabled) + vp_modern_queue_vector(mdev, vq->index, + VIRTIO_MSI_NO_VECTOR); vring_del_virtqueue(vq); } -- 2.25.1
[PATCH V2 03/14] virtio-pci: split out modern device
This patch splits out the virtio-pci modern device only attributes into another structure. While at it, a dedicated probe method for modern only attributes is introduced. This may help for split the logic into a dedicated module. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_common.h | 33 +++-- drivers/virtio/virtio_pci_modern.c | 224 ++--- 2 files changed, 158 insertions(+), 99 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 1d23420f7ed6..d32af8ff56f9 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -39,37 +39,43 @@ struct virtio_pci_vq_info { unsigned msix_vector; }; -/* Our device structure */ -struct virtio_pci_device { - struct virtio_device vdev; +struct virtio_pci_modern_device { struct pci_dev *pci_dev; - /* In legacy mode, these two point to within ->legacy. */ - /* Where to read and clear interrupt */ - u8 __iomem *isr; - - /* Modern only fields */ - /* The IO mapping for the BARs */ + /* The IO mapping for the PCI BARs */ void __iomem * const *base; - /* The IO mapping for the PCI config space (non-legacy mode) */ + + /* The IO mapping for the PCI config space */ struct virtio_pci_common_cfg __iomem *common; /* Device-specific data (non-legacy mode) */ void __iomem *device; /* Base of vq notifications (non-legacy mode). */ void __iomem *notify_base; + /* Where to read and clear interrupt */ + u8 __iomem *isr; /* So we can sanity-check accesses. */ size_t notify_len; size_t device_len; - /* Capability for when we need to map notifications per-vq. */ - int notify_map_cap; - /* Multiply queue_notify_off by this value. (non-legacy mode). */ u32 notify_offset_multiplier; int modern_bars; + struct virtio_device_id id; +}; + +/* Our device structure */ +struct virtio_pci_device { + struct virtio_device vdev; + struct pci_dev *pci_dev; + struct virtio_pci_modern_device mdev; + + /* In legacy mode, these two point to within ->legacy. 
*/ + /* Where to read and clear interrupt */ + u8 __iomem *isr; + /* Legacy only field */ /* the IO mapping for the PCI config space */ void __iomem *ioaddr; @@ -157,6 +163,5 @@ static inline void virtio_pci_legacy_remove(struct virtio_pci_device *vp_dev) } #endif int virtio_pci_modern_probe(struct virtio_pci_device *); -void virtio_pci_modern_remove(struct virtio_pci_device *); #endif diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 33cc21b818de..02688c3b3fbd 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -63,13 +63,11 @@ static void vp_iowrite64_twopart(u64 val, vp_iowrite32(val >> 32, hi); } -static void __iomem *map_capability(struct virtio_pci_device *vp_dev, int off, - size_t minlen, - u32 align, - u32 size, - size_t *len) +static void __iomem *map_capability(struct virtio_pci_modern_device *mdev, + int off, size_t minlen, u32 align, + u32 size, size_t *len) { - struct pci_dev *dev = vp_dev->pci_dev; + struct pci_dev *dev = mdev->pci_dev; u8 bar; u32 offset, length; @@ -111,14 +109,13 @@ static void __iomem *map_capability(struct virtio_pci_device *vp_dev, int off, return NULL; } - return vp_dev->base[bar] + offset; + return mdev->base[bar] + offset; } -/* virtio config->get_features() implementation */ -static u64 vp_get_features(struct virtio_device *vdev) +static u64 vp_modern_get_features(struct virtio_pci_modern_device *mdev) { - struct virtio_pci_device *vp_dev = to_vp_device(vdev); - struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; + struct virtio_pci_common_cfg __iomem *cfg = mdev->common; + u64 features; vp_iowrite32(0, >device_feature_select); @@ -129,6 +126,14 @@ static u64 vp_get_features(struct virtio_device *vdev) return features; } +/* virtio config->get_features() implementation */ +static u64 vp_get_features(struct virtio_device *vdev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(vdev); + + return vp_modern_get_features(_dev->mdev); +} + 
static void vp_transport_features(struct virtio_device *vdev, u64 features) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); @@ -139,11 +144,21 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features) __virtio_set_bit(vdev, VIRTIO_F_SR_IOV); }
[PATCH V2 02/14] virtio-pci: switch to use devres for modern devices
This patch tries to convert the modern device to use devres to manage its resources (iomaps). Before this patch the IO address is mapped individually according to the capability. After this patch, we simply map the whole BAR. This simplify the work of splitting modern device logic into an separate module. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_common.c | 10 -- drivers/virtio/virtio_pci_common.h | 2 + drivers/virtio/virtio_pci_legacy.c | 13 ++- drivers/virtio/virtio_pci_modern.c | 141 + 4 files changed, 54 insertions(+), 112 deletions(-) diff --git a/drivers/virtio/virtio_pci_common.c b/drivers/virtio/virtio_pci_common.c index 222d630c41fc..e786701fa1b4 100644 --- a/drivers/virtio/virtio_pci_common.c +++ b/drivers/virtio/virtio_pci_common.c @@ -527,11 +527,6 @@ static int virtio_pci_probe(struct pci_dev *pci_dev, INIT_LIST_HEAD(_dev->virtqueues); spin_lock_init(_dev->lock); - /* enable the device */ - rc = pci_enable_device(pci_dev); - if (rc) - goto err_enable_device; - if (force_legacy) { rc = virtio_pci_legacy_probe(vp_dev); /* Also try modern mode if we can't map BAR0 (no IO space). 
*/ @@ -559,11 +554,8 @@ static int virtio_pci_probe(struct pci_dev *pci_dev, err_register: if (vp_dev->ioaddr) virtio_pci_legacy_remove(vp_dev); - else -virtio_pci_modern_remove(vp_dev); err_probe: pci_disable_device(pci_dev); -err_enable_device: if (reg_dev) put_device(_dev->vdev.dev); else @@ -582,8 +574,6 @@ static void virtio_pci_remove(struct pci_dev *pci_dev) if (vp_dev->ioaddr) virtio_pci_legacy_remove(vp_dev); - else - virtio_pci_modern_remove(vp_dev); pci_disable_device(pci_dev); put_device(dev); diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index b2f0eb4067cb..1d23420f7ed6 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -49,6 +49,8 @@ struct virtio_pci_device { u8 __iomem *isr; /* Modern only fields */ + /* The IO mapping for the BARs */ + void __iomem * const *base; /* The IO mapping for the PCI config space (non-legacy mode) */ struct virtio_pci_common_cfg __iomem *common; /* Device-specific data (non-legacy mode) */ diff --git a/drivers/virtio/virtio_pci_legacy.c b/drivers/virtio/virtio_pci_legacy.c index d62e9835aeec..890f155ff48c 100644 --- a/drivers/virtio/virtio_pci_legacy.c +++ b/drivers/virtio/virtio_pci_legacy.c @@ -214,14 +214,19 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) struct pci_dev *pci_dev = vp_dev->pci_dev; int rc; + rc = pci_enable_device(pci_dev); + if (rc) + return rc; + + rc = -ENODEV; /* We only own devices >= 0x1000 and <= 0x103f: leave the rest. 
*/ if (pci_dev->device < 0x1000 || pci_dev->device > 0x103f) - return -ENODEV; + goto err_id; if (pci_dev->revision != VIRTIO_PCI_ABI_VERSION) { printk(KERN_ERR "virtio_pci: expected ABI version %d, got %d\n", VIRTIO_PCI_ABI_VERSION, pci_dev->revision); - return -ENODEV; + goto err_id; } rc = dma_set_mask(_dev->dev, DMA_BIT_MASK(64)); @@ -241,7 +246,7 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) rc = pci_request_region(pci_dev, 0, "virtio-pci-legacy"); if (rc) - return rc; + goto err_id; rc = -ENOMEM; vp_dev->ioaddr = pci_iomap(pci_dev, 0, 0); @@ -267,6 +272,8 @@ int virtio_pci_legacy_probe(struct virtio_pci_device *vp_dev) err_iomap: pci_release_region(pci_dev, 0); +err_id: + pci_disable_device(pci_dev); return rc; } diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index df1481fd400c..33cc21b818de 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -63,15 +63,15 @@ static void vp_iowrite64_twopart(u64 val, vp_iowrite32(val >> 32, hi); } -static void __iomem *map_capability(struct pci_dev *dev, int off, +static void __iomem *map_capability(struct virtio_pci_device *vp_dev, int off, size_t minlen, u32 align, - u32 start, u32 size, + u32 size, size_t *len) { + struct pci_dev *dev = vp_dev->pci_dev; u8 bar; u32 offset, length; - void __iomem *p; pci_read_config_byte(dev, off + offsetof(struct virtio_pci_cap,
[PATCH V2 01/14] virtio-pci: do not access iomem via virtio_pci_device directly
Instead of accessing iomem via virito_pci_device directly. Add an indirect level to ease the life of splitting out modern virito-pci logic. Signed-off-by: Jason Wang --- drivers/virtio/virtio_pci_modern.c | 76 ++ 1 file changed, 46 insertions(+), 30 deletions(-) diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 3d6ae5a5e252..df1481fd400c 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -141,12 +141,13 @@ static void __iomem *map_capability(struct pci_dev *dev, int off, static u64 vp_get_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; u64 features; - vp_iowrite32(0, _dev->common->device_feature_select); - features = vp_ioread32(_dev->common->device_feature); - vp_iowrite32(1, _dev->common->device_feature_select); - features |= ((u64)vp_ioread32(_dev->common->device_feature) << 32); + vp_iowrite32(0, >device_feature_select); + features = vp_ioread32(>device_feature); + vp_iowrite32(1, >device_feature_select); + features |= ((u64)vp_ioread32(>device_feature) << 32); return features; } @@ -165,6 +166,7 @@ static void vp_transport_features(struct virtio_device *vdev, u64 features) static int vp_finalize_features(struct virtio_device *vdev) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + struct virtio_pci_common_cfg __iomem *cfg = vp_dev->common; u64 features = vdev->features; /* Give virtio_ring a chance to accept features. 
*/ @@ -179,10 +181,10 @@ static int vp_finalize_features(struct virtio_device *vdev) return -EINVAL; } - vp_iowrite32(0, _dev->common->guest_feature_select); - vp_iowrite32((u32)vdev->features, _dev->common->guest_feature); - vp_iowrite32(1, _dev->common->guest_feature_select); - vp_iowrite32(vdev->features >> 32, _dev->common->guest_feature); + vp_iowrite32(0, >guest_feature_select); + vp_iowrite32((u32)vdev->features, >guest_feature); + vp_iowrite32(1, >guest_feature_select); + vp_iowrite32(vdev->features >> 32, >guest_feature); return 0; } @@ -192,6 +194,7 @@ static void vp_get(struct virtio_device *vdev, unsigned offset, void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *device = vp_dev->device; u8 b; __le16 w; __le32 l; @@ -200,21 +203,21 @@ static void vp_get(struct virtio_device *vdev, unsigned offset, switch (len) { case 1: - b = ioread8(vp_dev->device + offset); + b = ioread8(device + offset); memcpy(buf, , sizeof b); break; case 2: - w = cpu_to_le16(ioread16(vp_dev->device + offset)); + w = cpu_to_le16(ioread16(device + offset)); memcpy(buf, , sizeof w); break; case 4: - l = cpu_to_le32(ioread32(vp_dev->device + offset)); + l = cpu_to_le32(ioread32(device + offset)); memcpy(buf, , sizeof l); break; case 8: - l = cpu_to_le32(ioread32(vp_dev->device + offset)); + l = cpu_to_le32(ioread32(device + offset)); memcpy(buf, , sizeof l); - l = cpu_to_le32(ioread32(vp_dev->device + offset + sizeof l)); + l = cpu_to_le32(ioread32(device + offset + sizeof l)); memcpy(buf + sizeof l, , sizeof l); break; default: @@ -228,6 +231,7 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, const void *buf, unsigned len) { struct virtio_pci_device *vp_dev = to_vp_device(vdev); + void __iomem *device = vp_dev->device; u8 b; __le16 w; __le32 l; @@ -237,21 +241,21 @@ static void vp_set(struct virtio_device *vdev, unsigned offset, switch (len) { case 1: memcpy(, buf, sizeof b); - iowrite8(b, vp_dev->device + offset); + 
iowrite8(b, device + offset); break; case 2: memcpy(, buf, sizeof w); - iowrite16(le16_to_cpu(w), vp_dev->device + offset); + iowrite16(le16_to_cpu(w), device + offset); break; case 4: memcpy(, buf, sizeof l); - iowrite32(le32_to_cpu(l), vp_dev->device + offset); + iowrite32(le32_to_cpu(l), device + offset); break; case 8: memcpy(, buf, sizeof l); - iowrite32(le
[PATCH V2 00/14] vDPA driver for virtio-pci device
Hi all: This series tries to implement a vDPA driver for virtio-pci device which will bridge between vDPA bus and virtio-pci device. This could be used for future feature prototyping and testing. Please review. Changes from V1: - Split common codes from virtio-pci and share it with vDPA driver - Use dynamic id in order to be less confusing with virtio-pci driver - No feature whitelist, supporting any features (mq, config etc) Thanks Jason Wang (14): virtio-pci: do not access iomem via virtio_pci_device directly virtio-pci: switch to use devres for modern devices virtio-pci: split out modern device virtio-pci: move the notification sanity check to vp_modern_probe() virtio-pci-modern: introduce vp_modern_set_queue_vector() virtio-pci-modern: introduce vp_modern_queue_address() virtio-pci-modern: introduce helper to set/get queue_enable virtio-pci-modern: introduce helper for setting/geting queue size virtio-pci-modern: introduce helper for getting queue nums virtio-pci-modern: introduce helper to get notification offset virtio-pci: introduce modern device module vdpa: set the virtqueue num during register virtio_vdpa: don't warn when fail to disable vq vdpa: introduce virtio pci driver drivers/vdpa/Kconfig | 6 + drivers/vdpa/Makefile | 1 + drivers/vdpa/ifcvf/ifcvf_main.c| 5 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 5 +- drivers/vdpa/vdpa.c| 8 +- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 +- drivers/vdpa/virtio_pci/Makefile | 2 + drivers/vdpa/virtio_pci/vp_vdpa.c | 450 drivers/virtio/Kconfig | 10 +- drivers/virtio/Makefile| 1 + drivers/virtio/virtio_pci_common.c | 10 - drivers/virtio/virtio_pci_common.h | 23 +- drivers/virtio/virtio_pci_legacy.c | 13 +- drivers/virtio/virtio_pci_modern.c | 442 +++ drivers/virtio/virtio_pci_modern_dev.c | 462 + drivers/virtio/virtio_vdpa.c | 3 +- include/linux/vdpa.h | 7 +- include/linux/virtio_pci_modern.h | 107 ++ 18 files changed, 1121 insertions(+), 438 deletions(-) create mode 100644 drivers/vdpa/virtio_pci/Makefile create mode 100644 
drivers/vdpa/virtio_pci/vp_vdpa.c create mode 100644 drivers/virtio/virtio_pci_modern_dev.c create mode 100644 include/linux/virtio_pci_modern.h -- 2.25.1
Re: [PATCH RFC 02/12] vdpa: split vdpasim to core and net modules
On 2020/11/18 下午9:14, Stefano Garzarella wrote: Hi Jason, I just discovered that I missed the other questions in this email, sorry for that! No problem :) On Mon, Nov 16, 2020 at 12:00:11PM +0800, Jason Wang wrote: On 2020/11/13 下午9:47, Stefano Garzarella wrote: From: Max Gurtovoy Introduce new vdpa_sim_net and vdpa_sim (core) drivers. This is a preparation for adding a vdpa simulator module for block devices. Signed-off-by: Max Gurtovoy [sgarzare: various cleanups/fixes] Signed-off-by: Stefano Garzarella --- v1: - Removed unused headers - Removed empty module_init() module_exit() - Moved vdpasim_is_little_endian() in vdpa_sim.h - Moved vdpasim16_to_cpu/cpu_to_vdpasim16() in vdpa_sim.h - Added vdpasim*_to_cpu/cpu_to_vdpasim*() also for 32 and 64 - Replaced 'select VDPA_SIM' with 'depends on VDPA_SIM' since selected option can not depend on other [Jason] If possible, I would suggest to split this patch further: 1) convert to use void *config, and an attribute for setting config size during allocation 2) introduce supported_features 3) other attributes (#vqs) 4) rename config ops (more generic one) 5) introduce ops for set|get_config, set_get_features 6) real split [...] -static const struct vdpa_config_ops vdpasim_net_config_ops; -static const struct vdpa_config_ops vdpasim_net_batch_config_ops; +static const struct vdpa_config_ops vdpasim_config_ops; +static const struct vdpa_config_ops vdpasim_batch_config_ops; -static struct vdpasim *vdpasim_create(void) +struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr) { const struct vdpa_config_ops *ops; struct vdpasim *vdpasim; + u32 device_id; struct device *dev; - int ret = -ENOMEM; + int i, size, ret = -ENOMEM; - if (batch_mapping) - ops = _net_batch_config_ops; + device_id = attr->device_id; + /* Currently, we only accept the network and block devices. 
*/ + if (device_id != VIRTIO_ID_NET && device_id != VIRTIO_ID_BLOCK) + return ERR_PTR(-EOPNOTSUPP); + + if (attr->batch_mapping) + ops = _batch_config_ops; else - ops = _net_config_ops; + ops = _config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, VDPASIM_VQ_NUM); if (!vdpasim) goto err_alloc; - INIT_WORK(>work, vdpasim_work); + if (device_id == VIRTIO_ID_NET) + size = sizeof(struct virtio_net_config); + else + size = sizeof(struct virtio_blk_config); It's better to avoid such if/else consider we may introduce more type of devices. Can we have an attribute of config size instead? Yes, I'll move the patch 7 before this. About config size and set/get_config ops, I'm not sure if it is better to hidden everything under the new set/get_config ops, allocating the config structure in each device, or leave the allocation in the core and update it like now. I think we'd better to avoid having any type specific codes in generic sim codes. [...] +config VDPA_SIM_NET + tristate "vDPA simulator for networking device" + depends on VDPA_SIM + default n I remember somebody told me that if we don't enable a module it was disabled by default. So, should I remove "default n" from vdpa_sim* entries? Yes, but please do that in another patch. Thanks Thanks, Stefano
Re: [PATCH] vringh: fix vringh_iov_push_*() documentation
On 2020/11/17 上午12:16, Stefano Garzarella wrote: vringh_iov_push_*() functions don't have 'dst' parameter, but have the 'src' parameter. Replace 'dst' description with 'src' description. Signed-off-by: Stefano Garzarella Acked-by: Jason Wang --- drivers/vhost/vringh.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 8bd8b403f087..b7403ba8e7f7 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -730,7 +730,7 @@ EXPORT_SYMBOL(vringh_iov_pull_user); /** * vringh_iov_push_user - copy bytes into vring_iov. * @wiov: the wiov as passed to vringh_getdesc_user() (updated as we consume) - * @dst: the place to copy. + * @src: the place to copy from. * @len: the maximum length to copy. * * Returns the bytes copied <= len or a negative errno. @@ -976,7 +976,7 @@ EXPORT_SYMBOL(vringh_iov_pull_kern); /** * vringh_iov_push_kern - copy bytes into vring_iov. * @wiov: the wiov as passed to vringh_getdesc_kern() (updated as we consume) - * @dst: the place to copy. + * @src: the place to copy from. * @len: the maximum length to copy. * * Returns the bytes copied <= len or a negative errno. @@ -1333,7 +1333,7 @@ EXPORT_SYMBOL(vringh_iov_pull_iotlb); * vringh_iov_push_iotlb - copy bytes into vring_iov. * @vrh: the vring. * @wiov: the wiov as passed to vringh_getdesc_iotlb() (updated as we consume) - * @dst: the place to copy. + * @src: the place to copy from. * @len: the maximum length to copy. * * Returns the bytes copied <= len or a negative errno.
Re: [PATCH RFC 12/12] vdpa_sim_blk: implement ramdisk behaviour
On 2020/11/13 下午9:47, Stefano Garzarella wrote: The previous implementation wrote only the status of each request. This patch implements a more accurate block device simulator, providing a ramdisk-like behavior. Also handle VIRTIO_BLK_T_GET_ID request, always answering the "vdpa_blk_sim" string. Let's use a separate patch for this. Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 151 +++ 1 file changed, 133 insertions(+), 18 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index 8e41b3ab98d5..68e74383322f 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -7,6 +7,7 @@ */ #include +#include #include #include "vdpa_sim.h" @@ -24,10 +25,137 @@ static struct vdpasim *vdpasim_blk_dev; +static int vdpasim_blk_handle_req(struct vdpasim *vdpasim, + struct vdpasim_virtqueue *vq) +{ + size_t wrote = 0, to_read = 0, to_write = 0; + struct virtio_blk_outhdr hdr; + uint8_t status; + uint32_t type; + ssize_t bytes; + loff_t offset; + int i, ret; + + vringh_kiov_cleanup(>riov); + vringh_kiov_cleanup(>wiov); It looks to me we should do those after vringh_get_desc_iotlb()? See comment above vringh_getdesc_kern(). + + ret = vringh_getdesc_iotlb(>vring, >riov, >wiov, + >head, GFP_ATOMIC); + if (ret != 1) + return ret; + + for (i = 0; i < vq->wiov.used; i++) + to_write += vq->wiov.iov[i].iov_len; It's better to introduce a helper for this (or consider to use iov iterator). 
+ to_write -= 1; /* last byte is the status */ + + for (i = 0; i < vq->riov.used; i++) + to_read += vq->riov.iov[i].iov_len; + + bytes = vringh_iov_pull_iotlb(>vring, >riov, , sizeof(hdr)); + if (bytes != sizeof(hdr)) + return 0; + + to_read -= bytes; + + type = le32_to_cpu(hdr.type); + offset = le64_to_cpu(hdr.sector) << SECTOR_SHIFT; + status = VIRTIO_BLK_S_OK; + + switch (type) { + case VIRTIO_BLK_T_IN: + if (offset + to_write > VDPASIM_BLK_CAPACITY << SECTOR_SHIFT) { + dev_err(>vdpa.dev, + "reading over the capacity - offset: 0x%llx len: 0x%lx\n", + offset, to_write); + status = VIRTIO_BLK_S_IOERR; + break; + } + + bytes = vringh_iov_push_iotlb(>vring, >wiov, + vdpasim->buffer + offset, + to_write); + if (bytes < 0) { + dev_err(>vdpa.dev, + "vringh_iov_push_iotlb() error: %ld offset: 0x%llx len: 0x%lx\n", + bytes, offset, to_write); + status = VIRTIO_BLK_S_IOERR; + break; + } + + wrote += bytes; + break; + + case VIRTIO_BLK_T_OUT: + if (offset + to_read > VDPASIM_BLK_CAPACITY << SECTOR_SHIFT) { + dev_err(>vdpa.dev, + "writing over the capacity - offset: 0x%llx len: 0x%lx\n", + offset, to_read); + status = VIRTIO_BLK_S_IOERR; + break; + } + + bytes = vringh_iov_pull_iotlb(>vring, >riov, + vdpasim->buffer + offset, + to_read); + if (bytes < 0) { + dev_err(>vdpa.dev, + "vringh_iov_pull_iotlb() error: %ld offset: 0x%llx len: 0x%lx\n", + bytes, offset, to_read); + status = VIRTIO_BLK_S_IOERR; + break; + } + break; + + case VIRTIO_BLK_T_GET_ID: { + char id[VIRTIO_BLK_ID_BYTES] = "vdpa_blk_sim"; Let's use a global static one? + + bytes = vringh_iov_push_iotlb(>vring, + >wiov, id, + VIRTIO_BLK_ID_BYTES); + if (bytes < 0) { + dev_err(>vdpa.dev, + "vringh_iov_push_iotlb() error: %ld\n", bytes); + status = VIRTIO_BLK_S_IOERR; + break; + } + + wrote += bytes; + break; + } + + default: + dev_warn(>vdpa.dev, +"Unsupported request type %d\n", type); + status = VIRTIO_BLK_S_IOERR; + break; + } + +
Re: [PATCH RFC 11/12] vringh: allow vringh_iov_xfer() to skip bytes when ptr is NULL
On 2020/11/13 下午9:47, Stefano Garzarella wrote: In some cases, it may be useful to provide a way to skip a number of bytes in a vringh_iov. In order to keep vringh_iov consistent, let's reuse vringh_iov_xfer() logic and skip bytes when the ptr is NULL. Signed-off-by: Stefano Garzarella --- I'm not sure if this is the best option, maybe we can add a new function vringh_iov_skip(). Suggestions? I might be worth to check whether we can convert vringh_iov to use iov iterator then we can use iov_iterator_advance() here. Thanks --- drivers/vhost/vringh.c | 16 +++- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index 8bd8b403f087..ed3290946ad7 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -75,7 +75,9 @@ static inline int __vringh_get_head(const struct vringh *vrh, return head; } -/* Copy some bytes to/from the iovec. Returns num copied. */ +/* Copy some bytes to/from the iovec. Returns num copied. + * If ptr is NULL, skips at most len bytes. + */ static inline ssize_t vringh_iov_xfer(struct vringh *vrh, struct vringh_kiov *iov, void *ptr, size_t len, @@ -89,12 +91,16 @@ static inline ssize_t vringh_iov_xfer(struct vringh *vrh, size_t partlen; partlen = min(iov->iov[iov->i].iov_len, len); - err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen); - if (err) - return err; + + if (ptr) { + err = xfer(vrh, iov->iov[iov->i].iov_base, ptr, partlen); + if (err) + return err; + ptr += partlen; + } + done += partlen; len -= partlen; - ptr += partlen; iov->consumed += partlen; iov->iov[iov->i].iov_len -= partlen; iov->iov[iov->i].iov_base += partlen;
Re: [PATCH RFC 10/12] vdpa_sim: split vdpasim_virtqueue's iov field in riov and wiov
On 2020/11/13 下午9:47, Stefano Garzarella wrote: vringh_getdesc_iotlb() manages 2 iovs for writable and readable descriptors. This is very useful for the block device, where for each request we have both types of descriptor. Let's split the vdpasim_virtqueue's iov field in riov and wiov to use them with vringh_getdesc_iotlb(). Signed-off-by: Stefano Garzarella Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim.h | 3 ++- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 6 +++--- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 8 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index cc21e07aa2f7..0d4629675e4b 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -27,7 +27,8 @@ struct vdpasim; struct vdpasim_virtqueue { struct vringh vring; - struct vringh_kiov iov; + struct vringh_kiov riov; + struct vringh_kiov wiov; unsigned short head; bool ready; u64 desc_addr; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index 122a3c039507..8e41b3ab98d5 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -41,13 +41,13 @@ static void vdpasim_blk_work(struct work_struct *work) if (!vq->ready) continue; - while (vringh_getdesc_iotlb(>vring, >iov, >iov, + while (vringh_getdesc_iotlb(>vring, >riov, >wiov, >head, GFP_ATOMIC) > 0) { int write; - vq->iov.i = vq->iov.used - 1; - write = vringh_iov_push_iotlb(>vring, >iov, , 1); + vq->wiov.i = vq->wiov.used - 1; + write = vringh_iov_push_iotlb(>vring, >wiov, , 1); if (write <= 0) break; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index d0a1403f64b2..783b1e85b09c 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -47,12 +47,12 @@ static void vdpasim_net_work(struct work_struct *work) while (true) { total_write = 0; - err = vringh_getdesc_iotlb(>vring, >iov, NULL, + 
err = vringh_getdesc_iotlb(>vring, >riov, NULL, >head, GFP_ATOMIC); if (err <= 0) break; - err = vringh_getdesc_iotlb(>vring, NULL, >iov, + err = vringh_getdesc_iotlb(>vring, NULL, >wiov, >head, GFP_ATOMIC); if (err <= 0) { vringh_complete_iotlb(>vring, txq->head, 0); @@ -60,13 +60,13 @@ static void vdpasim_net_work(struct work_struct *work) } while (true) { - read = vringh_iov_pull_iotlb(>vring, >iov, + read = vringh_iov_pull_iotlb(>vring, >riov, vdpasim->buffer, PAGE_SIZE); if (read <= 0) break; - write = vringh_iov_push_iotlb(>vring, >iov, + write = vringh_iov_push_iotlb(>vring, >wiov, vdpasim->buffer, read); if (write <= 0) break;
Re: [PATCH RFC 09/12] vdpa_sim: make vdpasim->buffer size configurable
On 2020/11/13 下午9:47, Stefano Garzarella wrote: Allow each device to specify the size of the buffer allocated in vdpa_sim. Signed-off-by: Stefano Garzarella Acked-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim.h | 1 + drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 1 + drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 1 + 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index f7e1fe0a88d3..cc21e07aa2f7 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -49,6 +49,7 @@ struct vdpasim_device { struct vdpasim_init_attr { struct vdpasim_device device; + size_t buffer_size; int batch_mapping; work_func_t work_fn; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index bd034fbf4683..3863d49e0d6d 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -223,7 +223,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr) if (!vdpasim->iommu) goto err_iommu; - vdpasim->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL); + vdpasim->buffer = kvmalloc(attr->buffer_size, GFP_KERNEL); if (!vdpasim->buffer) goto err_iommu; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c index f456a0e4e097..122a3c039507 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -100,6 +100,7 @@ static int __init vdpasim_blk_init(void) attr.device.update_config = vdpasim_blk_update_config; attr.work_fn = vdpasim_blk_work; + attr.buffer_size = PAGE_SIZE; vdpasim_blk_dev = vdpasim_create(); if (IS_ERR(vdpasim_blk_dev)) { diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c index b9372fdf2415..d0a1403f64b2 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim_net.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_net.c @@ -124,6 +124,7 @@ static int __init vdpasim_net_init(void) attr.work_fn = vdpasim_net_work; 
attr.batch_mapping = batch_mapping; + attr.buffer_size = PAGE_SIZE; vdpasim_net_dev = vdpasim_create(); if (IS_ERR(vdpasim_net_dev)) {
Re: [PATCH RFC 07/12] vdpa_sim: move config management outside of the core
On 2020/11/13 下午9:47, Stefano Garzarella wrote: In order to simplify the code of the vdpa_sim core, we move the config management in each device simulator. The device must provide the size of config structure and a callback to update this structure called during the vdpasim_set_features(). Similarly, I suggest to do this before patch 2, then there's no need for the conversion of blk device. Signed-off-by: Stefano Garzarella --- drivers/vdpa/vdpa_sim/vdpa_sim.h | 5 +++-- drivers/vdpa/vdpa_sim/vdpa_sim.c | 29 +--- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 27 -- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 12 4 files changed, 37 insertions(+), 36 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 76e642042eb0..f7e1fe0a88d3 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -10,8 +10,6 @@ #include #include #include -#include -#include #define DRV_VERSION "0.1" #define DRV_AUTHOR "Jason Wang " @@ -42,8 +40,11 @@ struct vdpasim_virtqueue { struct vdpasim_device { u64 supported_features; + size_t config_size; u32 id; int nvqs; + + void (*update_config)(struct vdpasim *vdpasim); Let's use set_config/get_config to align with virtio/vhost. Other looks good. Thanks
Re: [PATCH RFC 08/12] vdpa_sim: use kvmalloc to allocate vdpasim->buffer
On 2020/11/13 下午9:47, Stefano Garzarella wrote: The next patch will make the buffer size configurable from each device. Since the buffer could be larger than a page, we use kvmalloc() instead of kmalloc(). Signed-off-by: Stefano Garzarella Acked-by: Jason Wang Thanks --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 9c29c2013661..bd034fbf4683 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -223,7 +223,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr) if (!vdpasim->iommu) goto err_iommu; - vdpasim->buffer = kmalloc(PAGE_SIZE, GFP_KERNEL); + vdpasim->buffer = kvmalloc(PAGE_SIZE, GFP_KERNEL); if (!vdpasim->buffer) goto err_iommu; @@ -495,7 +495,7 @@ static void vdpasim_free(struct vdpa_device *vdpa) struct vdpasim *vdpasim = vdpa_to_sim(vdpa); cancel_work_sync(>work); - kfree(vdpasim->buffer); + kvfree(vdpasim->buffer); if (vdpasim->iommu) vhost_iotlb_free(vdpasim->iommu); kfree(vdpasim->vqs);
Re: [PATCH RFC 05/12] vdpa_sim: remove the limit of IOTLB entries
On 2020/11/13 下午9:47, Stefano Garzarella wrote: The simulated devices can support multiple queues, so this limit should be defined according to the number of queues supported by the device. Since we are in a simulator, let's simply remove that limit. Suggested-by: Jason Wang Signed-off-by: Stefano Garzarella Acked-by: Jason Wang It would be good to introduce a macro instead of using the magic 0 here. Thanks --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 2b4fea354413..9c9717441bbe 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -230,7 +230,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr) goto err_iommu; set_dma_ops(dev, _dma_ops); - vdpasim->iommu = vhost_iotlb_alloc(2048, 0); + vdpasim->iommu = vhost_iotlb_alloc(0, 0); if (!vdpasim->iommu) goto err_iommu;
Re: [PATCH RFC 06/12] vdpa_sim: add struct vdpasim_device to store device properties
On 2020/11/13 下午9:47, Stefano Garzarella wrote: Move device properties used during the entire life cycle in a new structure to simplify the copy of these fields during the vdpasim initialization. Signed-off-by: Stefano Garzarella It would be better to do it before patch 2. --- drivers/vdpa/vdpa_sim/vdpa_sim.h | 17 -- drivers/vdpa/vdpa_sim/vdpa_sim.c | 33 ++-- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 8 +-- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 9 +--- 4 files changed, 38 insertions(+), 29 deletions(-) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h index 6a1267c40d5e..76e642042eb0 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.h +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -40,12 +40,17 @@ struct vdpasim_virtqueue { irqreturn_t (*cb)(void *data); }; +struct vdpasim_device { + u64 supported_features; + u32 id; + int nvqs; +}; + struct vdpasim_init_attr { - u32 device_id; - u64 features; + struct vdpasim_device device; + int batch_mapping; + work_func_t work_fn; - int batch_mapping; - int nvqs; }; /* State of each vdpasim device */ @@ -53,18 +58,16 @@ struct vdpasim { struct vdpa_device vdpa; struct vdpasim_virtqueue *vqs; struct work_struct work; + struct vdpasim_device device; /* spinlock to synchronize virtqueue state */ spinlock_t lock; /* virtio config according to device type */ void *config; struct vhost_iotlb *iommu; void *buffer; - u32 device_id; u32 status; u32 generation; u64 features; - u64 supported_features; - int nvqs; /* spinlock to synchronize iommu table */ spinlock_t iommu_lock; }; diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 9c9717441bbe..d053bd14b3f8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -28,7 +28,7 @@ static void vdpasim_queue_ready(struct vdpasim *vdpasim, unsigned int idx) { struct vdpasim_virtqueue *vq = >vqs[idx]; - vringh_init_iotlb(>vring, vdpasim->supported_features, + vringh_init_iotlb(>vring, 
vdpasim->device.supported_features, VDPASIM_QUEUE_MAX, false, (struct vring_desc *)(uintptr_t)vq->desc_addr, (struct vring_avail *) @@ -46,7 +46,7 @@ static void vdpasim_vq_reset(struct vdpasim *vdpasim, vq->device_addr = 0; vq->cb = NULL; vq->private = NULL; - vringh_init_iotlb(>vring, vdpasim->supported_features, + vringh_init_iotlb(>vring, vdpasim->device.supported_features, VDPASIM_QUEUE_MAX, false, NULL, NULL, NULL); } @@ -54,7 +54,7 @@ static void vdpasim_reset(struct vdpasim *vdpasim) { int i; - for (i = 0; i < vdpasim->nvqs; i++) + for (i = 0; i < vdpasim->device.nvqs; i++) vdpasim_vq_reset(vdpasim, >vqs[i]); spin_lock(>iommu_lock); @@ -189,7 +189,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr) struct device *dev; int i, size, ret = -ENOMEM; - device_id = attr->device_id; + device_id = attr->device.id; /* Currently, we only accept the network and block devices. */ if (device_id != VIRTIO_ID_NET && device_id != VIRTIO_ID_BLOCK) return ERR_PTR(-EOPNOTSUPP); @@ -200,10 +200,12 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr) ops = _config_ops; vdpasim = vdpa_alloc_device(struct vdpasim, vdpa, NULL, ops, - attr->nvqs); + attr->device.nvqs); if (!vdpasim) goto err_alloc; + vdpasim->device = attr->device; + if (device_id == VIRTIO_ID_NET) size = sizeof(struct virtio_net_config); else @@ -212,14 +214,11 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr) if (!vdpasim->config) goto err_iommu; - vdpasim->vqs = kcalloc(attr->nvqs, sizeof(struct vdpasim_virtqueue), - GFP_KERNEL); + vdpasim->vqs = kcalloc(vdpasim->device.nvqs, + sizeof(struct vdpasim_virtqueue), GFP_KERNEL); if (!vdpasim->vqs) goto err_iommu; - vdpasim->device_id = device_id; - vdpasim->supported_features = attr->features; - vdpasim->nvqs = attr->nvqs; INIT_WORK(>work, attr->work_fn); spin_lock_init(>lock); spin_lock_init(>iommu_lock); @@ -238,7 +237,7 @@ struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr) if (!vdpasim->buffer) goto 
err_iommu; - for (i = 0; i < vdpasim->nvqs; i++) + for (i = 0; i <
Re: [PATCH RFC 04/12] vdpa: add vdpa simulator for block device
On 2020/11/13 下午9:47, Stefano Garzarella wrote: From: Max Gurtovoy This will allow running vDPA for virtio block protocol. Signed-off-by: Max Gurtovoy [sgarzare: various cleanups/fixes] Signed-off-by: Stefano Garzarella --- v1: - Removed unused headers - Used cpu_to_vdpasim*() to store config fields - Replaced 'select VDPA_SIM' with 'depends on VDPA_SIM' since selected option can not depend on other [Jason] - Start with a single queue for now [Jason] - Add comments to memory barriers --- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 124 +++ drivers/vdpa/Kconfig | 9 ++ drivers/vdpa/vdpa_sim/Makefile | 1 + 3 files changed, 134 insertions(+) create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_blk.c diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c new file mode 100644 index ..386dbb2f7138 --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim_blk.c @@ -0,0 +1,124 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * VDPA simulator for block device. + * + * Copyright (c) 2020, Mellanox Technologies. All rights reserved. 
+ * + */ + +#include + +#include "vdpa_sim.h" + +#define VDPASIM_BLK_FEATURES ((1ULL << VIRTIO_BLK_F_SIZE_MAX) | \ +(1ULL << VIRTIO_BLK_F_SEG_MAX) | \ +(1ULL << VIRTIO_BLK_F_BLK_SIZE) | \ +(1ULL << VIRTIO_BLK_F_TOPOLOGY) | \ +(1ULL << VIRTIO_BLK_F_MQ)) + +#define VDPASIM_BLK_CAPACITY 0x4 +#define VDPASIM_BLK_SIZE_MAX 0x1000 +#define VDPASIM_BLK_SEG_MAX 32 +#define VDPASIM_BLK_VQ_NUM 1 + +static struct vdpasim *vdpasim_blk_dev; + +static void vdpasim_blk_work(struct work_struct *work) +{ + struct vdpasim *vdpasim = container_of(work, struct vdpasim, work); + u8 status = VIRTIO_BLK_S_OK; + int i; + + spin_lock(>lock); + + if (!(vdpasim->status & VIRTIO_CONFIG_S_DRIVER_OK)) + goto out; + + for (i = 0; i < VDPASIM_BLK_VQ_NUM; i++) { + struct vdpasim_virtqueue *vq = >vqs[i]; + + if (!vq->ready) + continue; + + while (vringh_getdesc_iotlb(>vring, >iov, >iov, + >head, GFP_ATOMIC) > 0) { + + int write; + + vq->iov.i = vq->iov.used - 1; + write = vringh_iov_push_iotlb(>vring, >iov, , 1); + if (write <= 0) + break; + + /* Make sure data is wrote before advancing index */ + smp_wmb(); + + vringh_complete_iotlb(>vring, vq->head, write); + + /* Make sure used is visible before rasing the interrupt. */ + smp_wmb(); + + if (vringh_need_notify_iotlb(>vring) > 0) + vringh_notify(>vring); Do we initialize vrh->notify anywhere? And This seems duplicated with the following vq->cb. I think the correct way is to initialize vrh->notify and use vringh_need_notify_iotlb()/vringh_notify() instead of the vq->cb here. And while at it, it's better to convert net simulator to do the same. 
Thanks + + local_bh_disable(); + if (vq->cb) + vq->cb(vq->private); + local_bh_enable(); + } + } +out: + spin_unlock(>lock); + +} + +static int __init vdpasim_blk_init(void) +{ + struct vdpasim_init_attr attr = {}; + struct virtio_blk_config *config; + int ret; + + attr.device_id = VIRTIO_ID_BLOCK; + attr.features = VDPASIM_FEATURES | VDPASIM_BLK_FEATURES; + attr.work_fn = vdpasim_blk_work; + vdpasim_blk_dev = vdpasim_create(); + if (IS_ERR(vdpasim_blk_dev)) { + ret = PTR_ERR(vdpasim_blk_dev); + goto out; + } + + config = (struct virtio_blk_config *)vdpasim_blk_dev->config; + config->capacity = cpu_to_vdpasim64(vdpasim_blk_dev, VDPASIM_BLK_CAPACITY); + config->size_max = cpu_to_vdpasim32(vdpasim_blk_dev, VDPASIM_BLK_SIZE_MAX); + config->seg_max = cpu_to_vdpasim32(vdpasim_blk_dev, VDPASIM_BLK_SEG_MAX); + config->num_queues = cpu_to_vdpasim16(vdpasim_blk_dev, VDPASIM_BLK_VQ_NUM); + config->min_io_size = cpu_to_vdpasim16(vdpasim_blk_dev, 1); + config->opt_io_size = cpu_to_vdpasim32(vdpasim_blk_dev, 1); + config->blk_size = cpu_to_vdpasim32(vdpasim_blk_dev, 512); + + ret = vdpa_register_device(_blk_dev->vdpa); + if (ret) + goto put_dev; + + return 0; + +put_dev: + put_device(_blk_dev->vdpa.dev); +out: + return ret; +} + +static void __exit vdpasim_blk_exit(void) +{ + struct vdpa_device *vdpa = _blk_dev->vdpa; + + vdpa_unregister_device(vdpa); +} +
Re: [PATCH RFC 03/12] vdpa_sim: remove hard-coded virtq count
On 2020/11/13 下午9:47, Stefano Garzarella wrote: From: Max Gurtovoy Add a new attribute that will define the number of virt queues to be created for the vdpasim device. Signed-off-by: Max Gurtovoy [sgarzare: replace kmalloc_array() with kcalloc()] Signed-off-by: Stefano Garzarella --- v1: - use kcalloc() instead of kmalloc_array() since some function expects variables initialized to zero Looks good, one nit, I prefer to do this before patch 2. Thanks
Re: [PATCH RFC 02/12] vdpa: split vdpasim to core and net modules
On 2020/11/13 下午9:47, Stefano Garzarella wrote: From: Max Gurtovoy Introduce new vdpa_sim_net and vdpa_sim (core) drivers. This is a preparation for adding a vdpa simulator module for block devices. Signed-off-by: Max Gurtovoy [sgarzare: various cleanups/fixes] Signed-off-by: Stefano Garzarella --- v1: - Removed unused headers - Removed empty module_init() module_exit() - Moved vdpasim_is_little_endian() in vdpa_sim.h - Moved vdpasim16_to_cpu/cpu_to_vdpasim16() in vdpa_sim.h - Added vdpasim*_to_cpu/cpu_to_vdpasim*() also for 32 and 64 - Replaced 'select VDPA_SIM' with 'depends on VDPA_SIM' since selected option can not depend on other [Jason] If possible, I would suggest to split this patch further: 1) convert to use void *config, and an attribute for setting config size during allocation 2) introduce supported_features 3) other attributes (#vqs) 4) rename config ops (more generic one) 5) introduce ops for set|get_config, set_get_features 6) real split --- drivers/vdpa/vdpa_sim/vdpa_sim.h | 110 +++ drivers/vdpa/vdpa_sim/vdpa_sim.c | 285 ++- drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 153 ++ drivers/vdpa/Kconfig | 7 +- drivers/vdpa/vdpa_sim/Makefile | 1 + 5 files changed, 329 insertions(+), 227 deletions(-) create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim.h create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_net.c diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.h b/drivers/vdpa/vdpa_sim/vdpa_sim.h new file mode 100644 index ..33613c49888c --- /dev/null +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.h @@ -0,0 +1,110 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +/* + * Copyright (c) 2020, Red Hat Inc. All rights reserved. 
+ */ + +#ifndef _VDPA_SIM_H +#define _VDPA_SIM_H + +#include +#include +#include +#include +#include +#include + +#define DRV_VERSION "0.1" +#define DRV_AUTHOR "Jason Wang " +#define DRV_LICENSE "GPL v2" + +#define VDPASIM_QUEUE_ALIGN PAGE_SIZE +#define VDPASIM_QUEUE_MAX 256 +#define VDPASIM_VENDOR_ID 0 +#define VDPASIM_VQ_NUM 0x2 + +#define VDPASIM_FEATURES ((1ULL << VIRTIO_F_ANY_LAYOUT) | \ +(1ULL << VIRTIO_F_VERSION_1) | \ +(1ULL << VIRTIO_F_ACCESS_PLATFORM)) + +struct vdpasim; + +struct vdpasim_virtqueue { + struct vringh vring; + struct vringh_kiov iov; + unsigned short head; + bool ready; + u64 desc_addr; + u64 device_addr; + u64 driver_addr; + u32 num; + void *private; + irqreturn_t (*cb)(void *data); +}; + +struct vdpasim_init_attr { + u32 device_id; + u64 features; + work_func_t work_fn; + int batch_mapping; +}; + +/* State of each vdpasim device */ +struct vdpasim { + struct vdpa_device vdpa; + struct vdpasim_virtqueue vqs[VDPASIM_VQ_NUM]; + struct work_struct work; + /* spinlock to synchronize virtqueue state */ + spinlock_t lock; + /* virtio config according to device type */ + void *config; + struct vhost_iotlb *iommu; + void *buffer; + u32 device_id; + u32 status; + u32 generation; + u64 features; + u64 supported_features; + /* spinlock to synchronize iommu table */ + spinlock_t iommu_lock; +}; + +struct vdpasim *vdpasim_create(struct vdpasim_init_attr *attr); + +/* TODO: cross-endian support */ +static inline bool vdpasim_is_little_endian(struct vdpasim *vdpasim) +{ + return virtio_legacy_is_little_endian() || + (vdpasim->features & (1ULL << VIRTIO_F_VERSION_1)); +} + +static inline u16 vdpasim16_to_cpu(struct vdpasim *vdpasim, __virtio16 val) +{ + return __virtio16_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio16 cpu_to_vdpasim16(struct vdpasim *vdpasim, u16 val) +{ + return __cpu_to_virtio16(vdpasim_is_little_endian(vdpasim), val); +} + +static inline u32 vdpasim32_to_cpu(struct vdpasim *vdpasim, __virtio32 val) +{ + 
return __virtio32_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio32 cpu_to_vdpasim32(struct vdpasim *vdpasim, u32 val) +{ + return __cpu_to_virtio32(vdpasim_is_little_endian(vdpasim), val); +} + +static inline u64 vdpasim64_to_cpu(struct vdpasim *vdpasim, __virtio64 val) +{ + return __virtio64_to_cpu(vdpasim_is_little_endian(vdpasim), val); +} + +static inline __virtio64 cpu_to_vdpasim64(struct vdpasim *vdpasim, u64 val) +{ + return __cpu_to_virtio64(vdpasim_is_little_endian(vdpasim), val); +} + +#endif diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 6a90fdb9cbfc..04f9dc9ce8c8 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -1,107 +1,16 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * VDPA networking devi
Re: [PATCH RFC 00/12] vdpa: generalize vdpa simulator and add block device
On 2020/11/13 下午9:47, Stefano Garzarella wrote: Thanks to Max that started this work! I took his patches, and extended the block simulator a bit. This series moves the network device simulator in a new module (vdpa_sim_net) and leaves the generic functions in the vdpa_sim core module, allowing the possibility to add new vDPA device simulators. Then we added a new vdpa_sim_blk module to simulate a block device. I'm not sure about patch 11 ("vringh: allow vringh_iov_xfer() to skip bytes when ptr is NULL"), maybe we can add a new function instead of modifying vringh_iov_xfer(). As Max reported, I'm also seeing errors with vdpa_sim_blk related to iotlb and vringh when there is high load, these are some of the error messages I can see randomly: vringh: Failed to access avail idx at e8deb2cc vringh: Failed to read head: idx 6289 address e1ad1d50 vringh: Failed to get flags at 6635d7a3 virtio_vdpa vdpa0: vringh_iov_push_iotlb() error: -14 offset: 0x284 len: 0x2 virtio_vdpa vdpa0: vringh_iov_pull_iotlb() error: -14 offset: 0x58ee000 len: 0x3000 These errors should all be related to the fact that iotlb_translate() fails with -EINVAL, so it seems that we miss some mapping. Is this only reproducible when there are multiple concurrent accesses of the IOTLB? If yes, it's probably a hint that some kind of synchronization is still missed somewhere. It might be useful to log the dma_map/unmap in both virtio_ring and vringh to see who is missing the map. Thanks I'll debug more carefully, in the meantime can you give a first review? 
Thanks, Stefano Max Gurtovoy (4): vhost-vdpa: add support for vDPA blk devices vdpa: split vdpasim to core and net modules vdpa_sim: remove hard-coded virtq count vdpa: add vdpa simulator for block device Stefano Garzarella (8): vdpa_sim: remove the limit of IOTLB entries vdpa_sim: add struct vdpasim_device to store device properties vdpa_sim: move config management outside of the core vdpa_sim: use kvmalloc to allocate vdpasim->buffer vdpa_sim: make vdpasim->buffer size configurable vdpa_sim: split vdpasim_virtqueue's iov field in riov and wiov vringh: allow vringh_iov_xfer() to skip bytes when ptr is NULL vdpa_sim_blk: implement ramdisk behaviour drivers/vdpa/vdpa_sim/vdpa_sim.h | 117 +++ drivers/vdpa/vdpa_sim/vdpa_sim.c | 283 +-- drivers/vdpa/vdpa_sim/vdpa_sim_blk.c | 251 drivers/vdpa/vdpa_sim/vdpa_sim_net.c | 172 drivers/vhost/vdpa.c | 11 +- drivers/vhost/vringh.c | 16 +- drivers/vdpa/Kconfig | 16 +- drivers/vdpa/vdpa_sim/Makefile | 2 + 8 files changed, 628 insertions(+), 240 deletions(-) create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim.h create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_blk.c create mode 100644 drivers/vdpa/vdpa_sim/vdpa_sim_net.c
Re: [PATCH v3] vhost-vdpa: fix page pinning leakage in error path (rework)
On 2020/11/6 上午7:26, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. The memory usage for bookkeeping pinned pages is reverted to what it was before: only one single free page is needed. This helps reduce the host memory demand for VM with a large amount of memory, or in the situation where host is running short of free memory. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- Changes in v3: - Turn explicit last_pfn check to a WARN_ON() (Jason) Changes in v2: - Drop the reversion patch - Fix unhandled page leak towards the end of page_list Acked-by: Jason Wang Thanks drivers/vhost/vdpa.c | 80 1 file changed, 62 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b6d9016..5b13dfd 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + else + atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm); return r; } @@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long lock_limit, sz2pin, nchunks, i; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + /* Limit the use of memory for bookkeeping */ page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -607,52 +611,75 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; - if (!npages) - return -EINVAL; + if (!npages) { + 
ret = -EINVAL; + goto free; + } mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; + nchunks = 0; while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, -gup_flags, page_list, NULL); - if (ret != pinned) + sz2pin = min_t(unsigned long, npages, list_size); + pinned = pin_user_pages(cur_base, sz2pin, + gup_flags, page_list, NULL); + if (sz2pin != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } goto out; + } + nchunks++; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); - for (i = 0; i < ret; i++) { + for (i = 0; i < pinned; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) + ret = vhost_vdpa_map(v, iova, csize, +map_pfn << PAGE_SHIFT, +msg->perm); + if (ret) { + /* +* Unpin the pages that are left unmapped +* from this point on in the current +
Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)
On 2020/11/10 上午7:56, si-wei liu wrote: On 11/9/2020 2:42 PM, Michael S. Tsirkin wrote: On Mon, Nov 09, 2020 at 01:44:03PM -0800, si-wei liu wrote: On 11/8/2020 7:21 PM, Jason Wang wrote: On 2020/11/6 上午6:57, si-wei liu wrote: On 11/4/2020 7:26 PM, Jason Wang wrote: On 2020/11/5 上午7:33, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. The memory usage for bookkeeping pinned pages is reverted to what it was before: only one single free page is needed. This helps reduce the host memory demand for VM with a large amount of memory, or in the situation where host is running short of free memory. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- Changes in v2: - Drop the reversion patch - Fix unhandled page leak towards the end of page_list drivers/vhost/vdpa.c | 79 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b6d9016..e112854 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + else + atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm); return r; } @@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long lock_limit, sz2pin, nchunks, i; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + /* Limit the use of memory for bookkeeping */ page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -607,52 +611,75 @@ static int 
vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) PAGE_SHIFT; - if (!npages) - return -EINVAL; + if (!npages) { + ret = -EINVAL; + goto free; + } mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; + nchunks = 0; while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, - gup_flags, page_list, NULL); - if (ret != pinned) + sz2pin = min_t(unsigned long, npages, list_size); + pinned = pin_user_pages(cur_base, sz2pin, + gup_flags, page_list, NULL); + if (sz2pin != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } goto out; + } + nchunks++; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); - for (i = 0; i < ret; i++) { + for (i = 0; i < pinned; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) + ret = vhost_vdpa_map(v, iova, csize, + map_pfn << PAGE_SHIFT, + msg->perm); + if (ret) { + /* + * Unpin the pages that are left unmapped + * from this point on in the current + * page_list. The remaining outstanding + * ones which may stride across several + * chunks will be covered in the common + * error path subsequently. + */ + unpin_user_pages(_list[i], + pinned - i); Can we simply do last_pfn = this_pfn here? Nope. They are not contiguous segments
Re: [PATCH virtio] virtio: virtio_console: fix DMA memory allocation for rproc serial
On 2020/11/5 下午8:22, Alexander Lobakin wrote: From: Jason Wang Date: Thu, 5 Nov 2020 11:10:24 +0800 Hi Jason, On 2020/11/4 下午11:31, Alexander Lobakin wrote: Since commit 086d08725d34 ("remoteproc: create vdev subdevice with specific dma memory pool"), every remoteproc has a DMA subdevice ("remoteprocX#vdevYbuffer") for each virtio device, which inherits DMA capabilities from the corresponding platform device. This allowed to associate different DMA pools with each vdev, and required from virtio drivers to perform DMA operations with the parent device (vdev->dev.parent) instead of grandparent (vdev->dev.parent->parent). virtio_rpmsg_bus was already changed in the same merge cycle with commit d999b622fcfb ("rpmsg: virtio: allocate buffer from parent"), but virtio_console did not. In fact, operations using the grandparent worked fine while the grandparent was the platform device, but since commit c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for vdev") this was changed, and now the grandparent device is the remoteproc device without any DMA capabilities. So, starting v5.8-rc1 the following warning is observed: [2.483925] [ cut here ] [2.489148] WARNING: CPU: 3 PID: 101 at kernel/dma/mapping.c:427 0x80e7eee8 [2.489152] Modules linked in: virtio_console(+) [2.503737] virtio_rpmsg_bus rpmsg_core [2.508903] [2.528898] [2.913043] [2.914907] ---[ end trace 93ac8746beab612c ]--- [2.920102] virtio-ports vport1p0: Error allocating inbufs kernel/dma/mapping.c:427 is: WARN_ON_ONCE(!dev->coherent_dma_mask); obviously because the grandparent now is remoteproc dev without any DMA caps: [3.104943] Parent: remoteproc0#vdev1buffer, grandparent: remoteproc0 Fix this the same way as it was for virtio_rpmsg_bus, using just the parent device (vdev->dev.parent, "remoteprocX#vdevYbuffer") for DMA operations. This also allows now to reserve DMA pools/buffers for rproc serial via Device Tree. 
Fixes: c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for vdev") Cc: sta...@vger.kernel.org # 5.1+ Signed-off-by: Alexander Lobakin --- drivers/char/virtio_console.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index a2da8f768b94..1836cc56e357 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -435,12 +435,12 @@ static struct port_buffer *alloc_buf(struct virtio_device *vdev, size_t buf_size /* * Allocate DMA memory from ancestor. When a virtio * device is created by remoteproc, the DMA memory is -* associated with the grandparent device: -* vdev => rproc => platform-dev. +* associated with the parent device: +* virtioY => remoteprocX#vdevYbuffer. */ - if (!vdev->dev.parent || !vdev->dev.parent->parent) + buf->dev = vdev->dev.parent; + if (!buf->dev) goto free_buf; - buf->dev = vdev->dev.parent->parent; I wonder it could be the right time to introduce dma_dev for virtio instead of depending on something magic via parent. This patch are meant to hit RC window and stable trees as a fix of the bug that is present since v5.8-rc1. So any new features are out of scope of this particular fix. Right. The idea of DMAing through "dev->parent" is that "virtioX" itself is a logical dev, not the real one, but its parent *is*. This logic is used across the whole tree -- every subsystem creates its own logical device, but drivers should always use the backing PCI/platform/etc. devices for DMA operations, which represent the real hardware. Yes, so what I meant is to use different variables for DMA and hierarchy. So it's the responsibility of the lower layer to pass a correct "dma_dev" to the upper layer instead of depending parent. Anyway for this patch. 
Acked-by: Jason Wang Thanks (Btw I don't even notice that there's transport specific code in virtio console, it's better to avoid it) Thanks Thanks, Al /* Increase device refcnt to avoid freeing it */ get_device(buf->dev);
Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)
On 2020/11/6 上午6:57, si-wei liu wrote: On 11/4/2020 7:26 PM, Jason Wang wrote: On 2020/11/5 上午7:33, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. The memory usage for bookkeeping pinned pages is reverted to what it was before: only one single free page is needed. This helps reduce the host memory demand for VM with a large amount of memory, or in the situation where host is running short of free memory. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- Changes in v2: - Drop the reversion patch - Fix unhandled page leak towards the end of page_list drivers/vhost/vdpa.c | 79 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b6d9016..e112854 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + else + atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm); return r; } @@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long lock_limit, sz2pin, nchunks, i; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + /* Limit the use of memory for bookkeeping */ page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -607,52 +611,75 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; - if (!npages) - return -EINVAL; + if (!npages) { + ret = -EINVAL; + 
goto free; + } mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; + nchunks = 0; while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, - gup_flags, page_list, NULL); - if (ret != pinned) + sz2pin = min_t(unsigned long, npages, list_size); + pinned = pin_user_pages(cur_base, sz2pin, + gup_flags, page_list, NULL); + if (sz2pin != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } goto out; + } + nchunks++; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); - for (i = 0; i < ret; i++) { + for (i = 0; i < pinned; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) + ret = vhost_vdpa_map(v, iova, csize, + map_pfn << PAGE_SHIFT, + msg->perm); + if (ret) { + /* + * Unpin the pages that are left unmapped + * from this point on in the current + * page_list. The remaining outstanding + * ones which may stride across several + * chunks will be covered in the common + * error path subsequently. + */ + unpin_user_pages(_list[i], + pinned - i); Can we simply do last_pfn = this_pfn here? Nope. They are not contiguous segments of memory. Noted the conditional (this_pfn != last_pfn + 1) being held here. Right. goto out; + } + map_p
Re: [PATCH v2] vhost-vdpa: fix page pinning leakage in error path (rework)
On 2020/11/5 上午7:33, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. The memory usage for bookkeeping pinned pages is reverted to what it was before: only one single free page is needed. This helps reduce the host memory demand for VM with a large amount of memory, or in the situation where host is running short of free memory. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- Changes in v2: - Drop the reversion patch - Fix unhandled page leak towards the end of page_list drivers/vhost/vdpa.c | 79 1 file changed, 61 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b6d9016..e112854 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + else + atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm); return r; } @@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long lock_limit, sz2pin, nchunks, i; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + /* Limit the use of memory for bookkeeping */ page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -607,52 +611,75 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; - if (!npages) - return -EINVAL; + if (!npages) { + ret = -EINVAL; + goto free; + } mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, 
>mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; + nchunks = 0; while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, -gup_flags, page_list, NULL); - if (ret != pinned) + sz2pin = min_t(unsigned long, npages, list_size); + pinned = pin_user_pages(cur_base, sz2pin, + gup_flags, page_list, NULL); + if (sz2pin != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } goto out; + } + nchunks++; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); - for (i = 0; i < ret; i++) { + for (i = 0; i < pinned; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) + ret = vhost_vdpa_map(v, iova, csize, +map_pfn << PAGE_SHIFT, +msg->perm); + if (ret) { + /* +* Unpin the pages that are left unmapped +* from this point on in the current +* page_list. The remaining outstanding +* ones which may stride across several +* chunks will be covered in the common +* error path
Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)
On 2020/11/5 上午7:40, si-wei liu wrote: On 11/3/2020 6:42 PM, Jason Wang wrote: On 2020/10/30 下午3:45, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. The memory usage for bookkeeping pinned pages is reverted to what it was before: only one single free page is needed. This helps reduce the host memory demand for VM with a large amount of memory, or in the situation where host is running short of free memory. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- drivers/vhost/vdpa.c | 64 +--- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b6d9016..8da8558 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + else + atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm); return r; } @@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long lock_limit, sz2pin, nchunks, i; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + /* Limit the use of memory for bookkeeping */ page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -607,52 +611,64 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; - if (!npages) - return -EINVAL; + if (!npages) { + ret = -EINVAL; + goto free; + } mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, 
>mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; + nchunks = 0; while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, - gup_flags, page_list, NULL); - if (ret != pinned) + sz2pin = min_t(unsigned long, npages, list_size); + pinned = pin_user_pages(cur_base, sz2pin, + gup_flags, page_list, NULL); + if (sz2pin != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } goto out; + } + nchunks++; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); - for (i = 0; i < ret; i++) { + for (i = 0; i < pinned; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) + ret = vhost_vdpa_map(v, iova, csize, + map_pfn << PAGE_SHIFT, + msg->perm); + if (ret) goto out; + map_pfn = this_pfn; iova += csize; + nchunks = 0; } last_pfn = this_pfn; } - cur_base += ret << PAGE_SHIFT; - npages -= ret; + cur_base += pinned << PAGE_SHIFT; + npages -= pinned; } /* Pin the rest chunk */ @@ -660,10 +676,22 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, map_pfn << PAGE_SHIFT, msg->perm); out: if (ret) { + if (nchunks && last_pfn) { Can we decrease npages where you did "nchunks++" then we can check npages here instead? Hmmm, I am not sure I get what you want... @nchunks gets reset to 0 whenever a certain range of pinned pages is successfully mapped. The conditional (when nchunks i
Re: [PATCH virtio] virtio: virtio_console: fix DMA memory allocation for rproc serial
On 2020/11/4 下午11:31, Alexander Lobakin wrote: Since commit 086d08725d34 ("remoteproc: create vdev subdevice with specific dma memory pool"), every remoteproc has a DMA subdevice ("remoteprocX#vdevYbuffer") for each virtio device, which inherits DMA capabilities from the corresponding platform device. This allowed to associate different DMA pools with each vdev, and required from virtio drivers to perform DMA operations with the parent device (vdev->dev.parent) instead of grandparent (vdev->dev.parent->parent). virtio_rpmsg_bus was already changed in the same merge cycle with commit d999b622fcfb ("rpmsg: virtio: allocate buffer from parent"), but virtio_console did not. In fact, operations using the grandparent worked fine while the grandparent was the platform device, but since commit c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for vdev") this was changed, and now the grandparent device is the remoteproc device without any DMA capabilities. So, starting v5.8-rc1 the following warning is observed: [2.483925] [ cut here ] [2.489148] WARNING: CPU: 3 PID: 101 at kernel/dma/mapping.c:427 0x80e7eee8 [2.489152] Modules linked in: virtio_console(+) [2.503737] virtio_rpmsg_bus rpmsg_core [2.508903] [2.528898] [2.913043] [2.914907] ---[ end trace 93ac8746beab612c ]--- [2.920102] virtio-ports vport1p0: Error allocating inbufs kernel/dma/mapping.c:427 is: WARN_ON_ONCE(!dev->coherent_dma_mask); obviously because the grandparent now is remoteproc dev without any DMA caps: [3.104943] Parent: remoteproc0#vdev1buffer, grandparent: remoteproc0 Fix this the same way as it was for virtio_rpmsg_bus, using just the parent device (vdev->dev.parent, "remoteprocX#vdevYbuffer") for DMA operations. This also allows now to reserve DMA pools/buffers for rproc serial via Device Tree. 
Fixes: c774ad010873 ("remoteproc: Fix and restore the parenting hierarchy for vdev") Cc: sta...@vger.kernel.org # 5.1+ Signed-off-by: Alexander Lobakin --- drivers/char/virtio_console.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index a2da8f768b94..1836cc56e357 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -435,12 +435,12 @@ static struct port_buffer *alloc_buf(struct virtio_device *vdev, size_t buf_size /* * Allocate DMA memory from ancestor. When a virtio * device is created by remoteproc, the DMA memory is -* associated with the grandparent device: -* vdev => rproc => platform-dev. +* associated with the parent device: +* virtioY => remoteprocX#vdevYbuffer. */ - if (!vdev->dev.parent || !vdev->dev.parent->parent) + buf->dev = vdev->dev.parent; + if (!buf->dev) goto free_buf; - buf->dev = vdev->dev.parent->parent; I wonder it could be the right time to introduce dma_dev for virtio instead of depending on something magic via parent. (Btw I don't even notice that there's transport specific code in virtio console, it's better to avoid it) Thanks /* Increase device refcnt to avoid freeing it */ get_device(buf->dev);
Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)
On 2020/10/30 下午3:45, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. The memory usage for bookkeeping pinned pages is reverted to what it was before: only one single free page is needed. This helps reduce the host memory demand for VM with a large amount of memory, or in the situation where host is running short of free memory. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- drivers/vhost/vdpa.c | 64 +--- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b6d9016..8da8558 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + else + atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm); return r; } @@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long lock_limit, sz2pin, nchunks, i; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + /* Limit the use of memory for bookkeeping */ page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -607,52 +611,64 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; - if (!npages) - return -EINVAL; + if (!npages) { + ret = -EINVAL; + goto free; + } mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > 
lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; + nchunks = 0; while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, -gup_flags, page_list, NULL); - if (ret != pinned) + sz2pin = min_t(unsigned long, npages, list_size); + pinned = pin_user_pages(cur_base, sz2pin, + gup_flags, page_list, NULL); + if (sz2pin != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } goto out; + } + nchunks++; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); - for (i = 0; i < ret; i++) { + for (i = 0; i < pinned; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) + ret = vhost_vdpa_map(v, iova, csize, +map_pfn << PAGE_SHIFT, +msg->perm); + if (ret) goto out; + map_pfn = this_pfn; iova += csize; + nchunks = 0; } last_pfn = this_pfn; } - cur_base += ret << PAGE_SHIFT; - npages -= ret; + cur_base += pinned << PAGE_SHIFT; + npages -= pinned; } /* Pin the rest chunk */ @@ -660,10 +676,22 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, map_pfn << PAGE_SHIFT,
Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)
On 2020/11/4 上午9:08, si-wei liu wrote: On 11/3/2020 5:06 PM, si-wei liu wrote: On 11/3/2020 5:00 AM, Jason Wang wrote: On 2020/10/30 下午3:45, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. The memory usage for bookkeeping pinned pages is reverted to what it was before: only one single free page is needed. This helps reduce the host memory demand for VM with a large amount of memory, or in the situation where host is running short of free memory. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- drivers/vhost/vdpa.c | 64 +--- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b6d9016..8da8558 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + else + atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm); return r; } @@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long lock_limit, sz2pin, nchunks, i; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + /* Limit the use of memory for bookkeeping */ page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -607,52 +611,64 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; - if (!npages) - return -EINVAL; + if (!npages) { + ret = -EINVAL; + goto free; + } mmap_read_lock(dev->mm); - locked = 
atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; + nchunks = 0; while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, - gup_flags, page_list, NULL); - if (ret != pinned) + sz2pin = min_t(unsigned long, npages, list_size); + pinned = pin_user_pages(cur_base, sz2pin, + gup_flags, page_list, NULL); + if (sz2pin != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } goto out; + } + nchunks++; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); - for (i = 0; i < ret; i++) { + for (i = 0; i < pinned; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) + ret = vhost_vdpa_map(v, iova, csize, + map_pfn << PAGE_SHIFT, + msg->perm); + if (ret) goto out; + map_pfn = this_pfn; iova += csize; + nchunks = 0; } last_pfn = this_pfn; } - cur_base += ret << PAGE_SHIFT; - npages -= ret; + cur_base += pinned << PAGE_SHIFT; + npages -= pinned; } /* Pin the rest chunk */ @@ -660,10 +676,22 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, map_pfn << PAGE_SHIFT, msg->perm); out: if (ret) { + if (nchunks && last_pfn) { + unsigned long pfn; + + /* + * Unpin the outstanding pages which are unmapped. + * Mapped pages are accounted in vdpa_map(), thus +
Re: [PATCH 2/2] vhost-vdpa: fix page pinning leakage in error path (rework)
On 2020/10/30 下午3:45, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. The memory usage for bookkeeping pinned pages is reverted to what it was before: only one single free page is needed. This helps reduce the host memory demand for VM with a large amount of memory, or in the situation where host is running short of free memory. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- drivers/vhost/vdpa.c | 64 +--- 1 file changed, 46 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index b6d9016..8da8558 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -560,6 +560,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, if (r) vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + else + atomic64_add(size >> PAGE_SHIFT, >mm->pinned_vm); return r; } @@ -591,14 +593,16 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long lock_limit, sz2pin, nchunks, i; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + /* Limit the use of memory for bookkeeping */ page_list = (struct page **) __get_free_page(GFP_KERNEL); if (!page_list) return -ENOMEM; @@ -607,52 +611,64 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, gup_flags |= FOLL_WRITE; npages = PAGE_ALIGN(msg->size + (iova & ~PAGE_MASK)) >> PAGE_SHIFT; - if (!npages) - return -EINVAL; + if (!npages) { + ret = -EINVAL; + goto free; + } mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > 
lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; + nchunks = 0; while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, -gup_flags, page_list, NULL); - if (ret != pinned) + sz2pin = min_t(unsigned long, npages, list_size); + pinned = pin_user_pages(cur_base, sz2pin, + gup_flags, page_list, NULL); + if (sz2pin != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } goto out; + } + nchunks++; if (!last_pfn) map_pfn = page_to_pfn(page_list[0]); - for (i = 0; i < ret; i++) { + for (i = 0; i < pinned; i++) { unsigned long this_pfn = page_to_pfn(page_list[i]); u64 csize; if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) + ret = vhost_vdpa_map(v, iova, csize, +map_pfn << PAGE_SHIFT, +msg->perm); + if (ret) goto out; + map_pfn = this_pfn; iova += csize; + nchunks = 0; } last_pfn = this_pfn; } - cur_base += ret << PAGE_SHIFT; - npages -= ret; + cur_base += pinned << PAGE_SHIFT; + npages -= pinned; } /* Pin the rest chunk */ @@ -660,10 +676,22 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, map_pfn << PAGE_SHIFT,
Re: [PATCH] vhost/vsock: add IOTLB API support
On 2020/11/3 上午1:11, Stefano Garzarella wrote: On Fri, Oct 30, 2020 at 07:44:43PM +0800, Jason Wang wrote: On 2020/10/30 下午6:54, Stefano Garzarella wrote: On Fri, Oct 30, 2020 at 06:02:18PM +0800, Jason Wang wrote: On 2020/10/30 上午1:43, Stefano Garzarella wrote: This patch enables the IOTLB API support for vhost-vsock devices, allowing the userspace to emulate an IOMMU for the guest. These changes were made following vhost-net, in details this patch: - exposes VIRTIO_F_ACCESS_PLATFORM feature and inits the iotlb device if the feature is acked - implements VHOST_GET_BACKEND_FEATURES and VHOST_SET_BACKEND_FEATURES ioctls - calls vq_meta_prefetch() before vq processing to prefetch vq metadata address in IOTLB - provides .read_iter, .write_iter, and .poll callbacks for the chardev; they are used by the userspace to exchange IOTLB messages This patch was tested with QEMU and a patch applied [1] to fix a simple issue: $ qemu -M q35,accel=kvm,kernel-irqchip=split \ -drive file=fedora.qcow2,format=qcow2,if=virtio \ -device intel-iommu,intremap=on \ -device vhost-vsock-pci,guest-cid=3,iommu_platform=on Patch looks good, but a question: It looks to me you don't enable ATS which means vhost won't get any invalidation request or did I miss anything? You're right, I didn't see invalidation requests, only miss and updates. Now I have tried to enable 'ats' and 'device-iotlb' but I still don't see any invalidation. How can I test it? (Sorry but I don't have much experience yet with vIOMMU) I guess it's because the batched unmap. Maybe you can try to use "intel_iommu=strict" in guest kernel command line to see if it works. Btw, make sure the qemu contains the patch [1]. Otherwise ATS won't be enabled for recent Linux Kernel in the guest. The problem was my kernel, it was built with a tiny configuration. Using fedora stock kernel I can see the 'invalidate' requests, but I also had the following issues. Do they make you ring any bells? 
$ ./qemu -m 4G -smp 4 -M q35,accel=kvm,kernel-irqchip=split \ -drive file=fedora.qcow2,format=qcow2,if=virtio \ -device intel-iommu,intremap=on,device-iotlb=on \ -device vhost-vsock-pci,guest-cid=6,iommu_platform=on,ats=on,id=v1 qemu-system-x86_64: vtd_iova_to_slpte: detected IOVA overflow (iova=0x1d4030c0) It's a hint that IOVA exceeds the AW. It might be worth to check whether the missed IOVA reported from IOTLB is legal. Thanks qemu-system-x86_64: vtd_iommu_translate: detected translation failure (dev=00:03:00, iova=0x1d4030c0) qemu-system-x86_64: New fault is not recorded due to compression of faults Guest kernel messages: [ 44.940872] DMAR: DRHD: handling fault status reg 2 [ 44.941989] DMAR: [DMA Read] Request device [00:03.0] PASID fault addr 88W [ 49.785884] DMAR: DRHD: handling fault status reg 2 [ 49.788874] DMAR: [DMA Read] Request device [00:03.0] PASID fault addr 88W QEMU: b149dea55c Merge remote-tracking branch 'remotes/cschoenebeck/tags/pull-9p-20201102' into staging Linux guest: 5.8.16-200.fc32.x86_64 Thanks, Stefano
Re: [PATCH 1/2] Revert "vhost-vdpa: fix page pinning leakage in error path"
On 2020/10/30 下午3:45, Si-Wei Liu wrote: This reverts commit 7ed9e3d97c32d969caded2dfb6e67c1a2cc5a0b1. Signed-off-by: Si-Wei Liu --- drivers/vhost/vdpa.c | 119 +-- 1 file changed, 48 insertions(+), 71 deletions(-) I saw this has been reverted there https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/vhost?id=5e1a3149eec8675c2767cc465903f5e4829de5b0. :) Thanks diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a2dbc85..b6d9016 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -588,19 +588,21 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, struct vhost_dev *dev = >vdev; struct vhost_iotlb *iotlb = dev->iotlb; struct page **page_list; - struct vm_area_struct **vmas; + unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; - unsigned long map_pfn, last_pfn = 0; - unsigned long npages, lock_limit; - unsigned long i, nmap = 0; + unsigned long npages, cur_base, map_pfn, last_pfn = 0; + unsigned long locked, lock_limit, pinned, i; u64 iova = msg->iova; - long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; + page_list = (struct page **) __get_free_page(GFP_KERNEL); + if (!page_list) + return -ENOMEM; + if (msg->perm & VHOST_ACCESS_WO) gup_flags |= FOLL_WRITE; @@ -608,86 +610,61 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, if (!npages) return -EINVAL; - page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); - vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *), - GFP_KERNEL); - if (!page_list || !vmas) { - ret = -ENOMEM; - goto free; - } - mmap_read_lock(dev->mm); + locked = atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { - ret = -ENOMEM; - goto unlock; - } - pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags, - page_list, 
vmas); - if (npages != pinned) { - if (pinned < 0) { - ret = pinned; - } else { - unpin_user_pages(page_list, pinned); - ret = -ENOMEM; - } - goto unlock; + if (locked > lock_limit) { + ret = -ENOMEM; + goto out; } + cur_base = msg->uaddr & PAGE_MASK; iova &= PAGE_MASK; - map_pfn = page_to_pfn(page_list[0]); - - /* One more iteration to avoid extra vdpa_map() call out of loop. */ - for (i = 0; i <= npages; i++) { - unsigned long this_pfn; - u64 csize; - - /* The last chunk may have no valid PFN next to it */ - this_pfn = i < npages ? page_to_pfn(page_list[i]) : -1UL; - - if (last_pfn && (this_pfn == -1UL || -this_pfn != last_pfn + 1)) { - /* Pin a contiguous chunk of memory */ - csize = last_pfn - map_pfn + 1; - ret = vhost_vdpa_map(v, iova, csize << PAGE_SHIFT, -map_pfn << PAGE_SHIFT, -msg->perm); - if (ret) { - /* -* Unpin the rest chunks of memory on the -* flight with no corresponding vdpa_map() -* calls having been made yet. On the other -* hand, vdpa_unmap() in the failure path -* is in charge of accounting the number of -* pinned pages for its own. -* This asymmetrical pattern of accounting -* is for efficiency to pin all pages at -* once, while there is no other callsite -* of vdpa_map() than here above. -*/ - unpin_user_pages(_list[nmap], -npages - nmap); - goto out; + + while (npages) { + pinned = min_t(unsigned long, npages, list_size); + ret = pin_user_pages(cur_base, pinned, +gup_flags, page_list, NULL); + if (ret != pinned) +
Re: [PATCH] vhost/vsock: add IOTLB API support
On 2020/10/30 下午6:54, Stefano Garzarella wrote: On Fri, Oct 30, 2020 at 06:02:18PM +0800, Jason Wang wrote: On 2020/10/30 上午1:43, Stefano Garzarella wrote: This patch enables the IOTLB API support for vhost-vsock devices, allowing the userspace to emulate an IOMMU for the guest. These changes were made following vhost-net, in details this patch: - exposes VIRTIO_F_ACCESS_PLATFORM feature and inits the iotlb device if the feature is acked - implements VHOST_GET_BACKEND_FEATURES and VHOST_SET_BACKEND_FEATURES ioctls - calls vq_meta_prefetch() before vq processing to prefetch vq metadata address in IOTLB - provides .read_iter, .write_iter, and .poll callbacks for the chardev; they are used by the userspace to exchange IOTLB messages This patch was tested with QEMU and a patch applied [1] to fix a simple issue: $ qemu -M q35,accel=kvm,kernel-irqchip=split \ -drive file=fedora.qcow2,format=qcow2,if=virtio \ -device intel-iommu,intremap=on \ -device vhost-vsock-pci,guest-cid=3,iommu_platform=on Patch looks good, but a question: It looks to me you don't enable ATS which means vhost won't get any invalidation request or did I miss anything? You're right, I didn't see invalidation requests, only miss and updates. Now I have tried to enable 'ats' and 'device-iotlb' but I still don't see any invalidation. How can I test it? (Sorry but I don't have much experience yet with vIOMMU) I guess it's because the batched unmap. Maybe you can try to use "intel_iommu=strict" in guest kernel command line to see if it works. Btw, make sure the qemu contains the patch [1]. Otherwise ATS won't be enabled for recent Linux Kernel in the guest. Thanks [1] https://patchew.org/QEMU/20200909081731.24688-1-jasow...@redhat.com/ Thanks, Stefano
Re: [PATCH] vhost/vsock: add IOTLB API support
On 2020/10/30 上午1:43, Stefano Garzarella wrote: This patch enables the IOTLB API support for vhost-vsock devices, allowing the userspace to emulate an IOMMU for the guest. These changes were made following vhost-net, in details this patch: - exposes VIRTIO_F_ACCESS_PLATFORM feature and inits the iotlb device if the feature is acked - implements VHOST_GET_BACKEND_FEATURES and VHOST_SET_BACKEND_FEATURES ioctls - calls vq_meta_prefetch() before vq processing to prefetch vq metadata address in IOTLB - provides .read_iter, .write_iter, and .poll callbacks for the chardev; they are used by the userspace to exchange IOTLB messages This patch was tested with QEMU and a patch applied [1] to fix a simple issue: $ qemu -M q35,accel=kvm,kernel-irqchip=split \ -drive file=fedora.qcow2,format=qcow2,if=virtio \ -device intel-iommu,intremap=on \ -device vhost-vsock-pci,guest-cid=3,iommu_platform=on Patch looks good, but a question: It looks to me you don't enable ATS which means vhost won't get any invalidation request or did I miss anything? Thanks [1] https://lists.gnu.org/archive/html/qemu-devel/2020-10/msg09077.html Signed-off-by: Stefano Garzarella --- drivers/vhost/vsock.c | 68 +-- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c index a483cec31d5c..5e78fb719602 100644 --- a/drivers/vhost/vsock.c +++ b/drivers/vhost/vsock.c @@ -30,7 +30,12 @@ #define VHOST_VSOCK_PKT_WEIGHT 256 enum { - VHOST_VSOCK_FEATURES = VHOST_FEATURES, + VHOST_VSOCK_FEATURES = VHOST_FEATURES | + (1ULL << VIRTIO_F_ACCESS_PLATFORM) +}; + +enum { + VHOST_VSOCK_BACKEND_FEATURES = (1ULL << VHOST_BACKEND_F_IOTLB_MSG_V2) }; /* Used to track all the vhost_vsock instances on the system. 
*/ @@ -94,6 +99,9 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, if (!vhost_vq_get_backend(vq)) goto out; + if (!vq_meta_prefetch(vq)) + goto out; + /* Avoid further vmexits, we're already processing the virtqueue */ vhost_disable_notify(>dev, vq); @@ -449,6 +457,9 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) if (!vhost_vq_get_backend(vq)) goto out; + if (!vq_meta_prefetch(vq)) + goto out; + vhost_disable_notify(>dev, vq); do { u32 len; @@ -766,8 +777,12 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) mutex_lock(>dev.mutex); if ((features & (1 << VHOST_F_LOG_ALL)) && !vhost_log_access_ok(>dev)) { - mutex_unlock(>dev.mutex); - return -EFAULT; + goto err; + } + + if ((features & (1ULL << VIRTIO_F_ACCESS_PLATFORM))) { + if (vhost_init_device_iotlb(>dev, true)) + goto err; } for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++) { @@ -778,6 +793,10 @@ static int vhost_vsock_set_features(struct vhost_vsock *vsock, u64 features) } mutex_unlock(>dev.mutex); return 0; + +err: + mutex_unlock(>dev.mutex); + return -EFAULT; } static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, @@ -811,6 +830,18 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, if (copy_from_user(, argp, sizeof(features))) return -EFAULT; return vhost_vsock_set_features(vsock, features); + case VHOST_GET_BACKEND_FEATURES: + features = VHOST_VSOCK_BACKEND_FEATURES; + if (copy_to_user(argp, , sizeof(features))) + return -EFAULT; + return 0; + case VHOST_SET_BACKEND_FEATURES: + if (copy_from_user(, argp, sizeof(features))) + return -EFAULT; + if (features & ~VHOST_VSOCK_BACKEND_FEATURES) + return -EOPNOTSUPP; + vhost_set_backend_features(>dev, features); + return 0; default: mutex_lock(>dev.mutex); r = vhost_dev_ioctl(>dev, ioctl, argp); @@ -823,6 +854,34 @@ static long vhost_vsock_dev_ioctl(struct file *f, unsigned int ioctl, } } +static ssize_t vhost_vsock_chr_read_iter(struct kiocb *iocb, struct iov_iter *to) +{ + 
struct file *file = iocb->ki_filp; + struct vhost_vsock *vsock = file->private_data; + struct vhost_dev *dev = >dev; + int noblock = file->f_flags & O_NONBLOCK; + + return vhost_chr_read_iter(dev, to, noblock); +} + +static ssize_t vhost_vsock_chr_write_iter(struct kiocb *iocb, + struct iov_iter *from) +{ + struct file *file = iocb->ki_filp; + struct vhost_vsock *vsock = file->private_data; + struct
Re: [PATCH 0/2] vdpasim: allow to set MAC address
On 2020/10/29 下午8:20, Laurent Vivier wrote: This series starts by fixing a bug: vdpa_sim generates a MAC address that is never shown to the upper layer, and thus virtio-net generates another random MAC address, that changes each time virtio-net is loaded (even if vdpa_sim is not unloaded). Then it adds a parameter to the vdpa_sim module to allow the user to set the MAC address. With that we use vdpa_sim with a stable MAC address, that doesn't change between reboots. Laurent Vivier (2): vdpasim: fix MAC address configuration vdpasim: allow to assign a MAC address drivers/vdpa/vdpa_sim/vdpa_sim.c | 17 +++-- 1 file changed, 15 insertions(+), 2 deletions(-) Acked-by: Jason Wang
Re: [PATCH] vdpa_sim: Fix DMA mask
--- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -361,7 +361,9 @@ static struct vdpasim *vdpasim_create(void) spin_lock_init(>iommu_lock); dev = >vdpa.dev; - dev->coherent_dma_mask = DMA_BIT_MASK(64); + dev->dma_mask = >coherent_dma_mask; + if (dma_set_mask_and_coherent(dev, DMA_BIT_MASK(64))) + goto err_iommu; set_dma_ops(dev, _dma_ops); vdpasim->iommu = vhost_iotlb_alloc(2048, 0); Acked-by: Jason Wang
Re: [PATCH] vdpa/mlx5: Fix error return in map_direct_mr()
On 2020/10/26 下午3:06, Jing Xiangfeng wrote: Fix to return the variable "err" from the error handling case instead of "ret". Fixes: 94abbccdf291 ("vdpa/mlx5: Add shared memory registration code") Signed-off-by: Jing Xiangfeng --- drivers/vdpa/mlx5/core/mr.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/vdpa/mlx5/core/mr.c b/drivers/vdpa/mlx5/core/mr.c index ef1c550f8266..4b6195666c58 100644 --- a/drivers/vdpa/mlx5/core/mr.c +++ b/drivers/vdpa/mlx5/core/mr.c @@ -239,7 +239,6 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr u64 paend; struct scatterlist *sg; struct device *dma = mvdev->mdev->device; - int ret; for (map = vhost_iotlb_itree_first(iotlb, mr->start, mr->end - 1); map; map = vhost_iotlb_itree_next(map, start, mr->end - 1)) { @@ -277,8 +276,8 @@ static int map_direct_mr(struct mlx5_vdpa_dev *mvdev, struct mlx5_vdpa_direct_mr done: mr->log_size = log_entity_size; mr->nsg = nsg; - ret = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); - if (!ret) + err = dma_map_sg_attrs(dma, mr->sg_head.sgl, mr->nsg, DMA_BIDIRECTIONAL, 0); + if (!err) goto err_map; err = create_direct_mr(mvdev, mr); Acked-by: Jason Wang
[PATCH V4 3/3] vdpa_sim: implement get_iova_range()
This implements a sample get_iova_range() for the simulator which advertise [0, ULLONG_MAX] as the valid range. Signed-off-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 1 file changed, 12 insertions(+) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 62d640327145..ff6c9fd8d879 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -574,6 +574,16 @@ static u32 vdpasim_get_generation(struct vdpa_device *vdpa) return vdpasim->generation; } +static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa) +{ + struct vdpa_iova_range range = { + .first = 0ULL, + .last = ULLONG_MAX, + }; + + return range; +} + static int vdpasim_set_map(struct vdpa_device *vdpa, struct vhost_iotlb *iotlb) { @@ -657,6 +667,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = { .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, .get_generation = vdpasim_get_generation, + .get_iova_range = vdpasim_get_iova_range, .dma_map= vdpasim_dma_map, .dma_unmap = vdpasim_dma_unmap, .free = vdpasim_free, @@ -683,6 +694,7 @@ static const struct vdpa_config_ops vdpasim_net_batch_config_ops = { .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, .get_generation = vdpasim_get_generation, + .get_iova_range = vdpasim_get_iova_range, .set_map= vdpasim_set_map, .free = vdpasim_free, }; -- 2.20.1
[PATCH V4 0/3] vDPA: API for reporting IOVA range
Hi All: This series introduces API for reporting IOVA range. This is a must for userspace to work correctly: - for the process that uses vhost-vDPA directly, the IOVA must be allocated from this range. - for VM(qemu), when vIOMMU is not enabled, fail early if GPA is out of range - for VM(qemu), when vIOMMU is enabled, determine a valid guest address width and then guest IOVA allocator can behave correctly. Please review. Changes from V3: - really silent build warnings Changes from V2: - silent build warnings Changes from V1: - do not mandate get_iova_range() for device with its own DMA translation logic and assume a [0, ULLONG_MAX] range - mandate IOVA range only for IOMMU that forcing aperture - forbid the map which is out of the IOVA range in vhost-vDPA Jason Wang (3): vdpa: introduce config op to get valid iova range vhost: vdpa: report iova range vdpa_sim: implement get_iova_range() drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 ++ drivers/vhost/vdpa.c | 41 include/linux/vdpa.h | 15 include/uapi/linux/vhost.h | 4 include/uapi/linux/vhost_types.h | 9 +++ 5 files changed, 81 insertions(+) -- 2.20.1
[PATCH V4 2/3] vhost: vdpa: report iova range
This patch introduces a new ioctl for vhost-vdpa device that can report the iova range by the device. For device that implements get_iova_range() method, we fetch it from the vDPA device. If device doesn't implement get_iova_range() but depends on platform IOMMU, we will query via DOMAIN_ATTR_GEOMETRY, otherwise [0, ULLONG_MAX] is assumed. For safety, this patch also rules out the map request which is not in the valid range. Signed-off-by: Jason Wang --- drivers/vhost/vdpa.c | 41 include/uapi/linux/vhost.h | 4 include/uapi/linux/vhost_types.h | 9 +++ 3 files changed, 54 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a2dbc85e0b0d..846de69d9c01 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -47,6 +47,7 @@ struct vhost_vdpa { int minor; struct eventfd_ctx *config_ctx; int in_batch; + struct vdpa_iova_range range; }; static DEFINE_IDA(vhost_vdpa_ida); @@ -337,6 +338,16 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp) return 0; } +static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp) +{ + struct vhost_vdpa_iova_range range = { + .first = v->range.first, + .last = v->range.last, + }; + + return copy_to_user(argp, , sizeof(range)); +} + static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { @@ -471,6 +482,9 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, features = VHOST_VDPA_BACKEND_FEATURES; r = copy_to_user(featurep, , sizeof(features)); break; + case VHOST_VDPA_GET_IOVA_RANGE: + r = vhost_vdpa_get_iova_range(v, argp); + break; default: r = vhost_dev_ioctl(>vdev, cmd, argp); if (r == -ENOIOCTLCMD) @@ -597,6 +611,10 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, long pinned; int ret = 0; + if (msg->iova < v->range.first || + msg->iova + msg->size - 1 > v->range.last) + return -EINVAL; + if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; @@ -783,6 +801,27 
@@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v) v->domain = NULL; } +static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v) +{ + struct vdpa_iova_range *range = >range; + struct iommu_domain_geometry geo; + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (ops->get_iova_range) { + *range = ops->get_iova_range(vdpa); + } else if (v->domain && + !iommu_domain_get_attr(v->domain, + DOMAIN_ATTR_GEOMETRY, ) && + geo.force_aperture) { + range->first = geo.aperture_start; + range->last = geo.aperture_end; + } else { + range->first = 0; + range->last = ULLONG_MAX; + } +} + static int vhost_vdpa_open(struct inode *inode, struct file *filep) { struct vhost_vdpa *v; @@ -823,6 +862,8 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) if (r) goto err_init_iotlb; + vhost_vdpa_set_iova_range(v); + filep->private_data = v; return 0; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 75232185324a..c998860d7bbc 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -146,4 +146,8 @@ /* Set event fd for config interrupt*/ #define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int) + +/* Get the valid iova range */ +#define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ +struct vhost_vdpa_iova_range) #endif diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 9a269a88a6ff..f7f6a3a28977 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -138,6 +138,15 @@ struct vhost_vdpa_config { __u8 buf[0]; }; +/* vhost vdpa IOVA range + * @first: First address that can be mapped by vhost-vDPA + * @last: Last address that can be mapped by vhost-vDPA + */ +struct vhost_vdpa_iova_range { + __u64 first; + __u64 last; +}; + /* Feature bits */ /* Log all write descriptors. Can be changed while device is active. */ #define VHOST_F_LOG_ALL 26 -- 2.20.1
[PATCH V4 1/3] vdpa: introduce config op to get valid iova range
This patch introduce a config op to get valid iova range from the vDPA device. Signed-off-by: Jason Wang --- include/linux/vdpa.h | 15 +++ 1 file changed, 15 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index eae0bfd87d91..30bc7a7223bb 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -52,6 +52,16 @@ struct vdpa_device { int nvqs; }; +/** + * vDPA IOVA range - the IOVA range support by the device + * @first: start of the IOVA range + * @last: end of the IOVA range + */ +struct vdpa_iova_range { + u64 first; + u64 last; +}; + /** * vDPA_config_ops - operations for configuring a vDPA device. * Note: vDPA device drivers are required to implement all of the @@ -151,6 +161,10 @@ struct vdpa_device { * @get_generation:Get device config generation (optional) * @vdev: vdpa device * Returns u32: device generation + * @get_iova_range:Get supported iova range (optional) + * @vdev: vdpa device + * Returns the iova range supported by + * the device. * @set_map: Set device memory mapping (optional) * Needed for device that using device * specific DMA translation (on-chip IOMMU) @@ -216,6 +230,7 @@ struct vdpa_config_ops { void (*set_config)(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int len); u32 (*get_generation)(struct vdpa_device *vdev); + struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev); /* DMA ops */ int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); -- 2.20.1
Re: [PATCH V3 2/3] vhost: vdpa: report iova range
On 2020/10/23 下午1:28, kernel test robot wrote: Hi Jason, I love your patch! Perhaps something to improve: [auto build test WARNING on vhost/linux-next] [also build test WARNING on linus/master v5.9 next-20201023] [cannot apply to linux/master] [If your patch is applied to the wrong git tree, kindly drop us a note. And when submitting patch, we suggest to use '--base' as documented in https://git-scm.com/docs/git-format-patch] url: https://github.com/0day-ci/linux/commits/Jason-Wang/vDPA-API-for-reporting-IOVA-range/20201023-102708 base: https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git linux-next config: m68k-randconfig-r034-20201022 (attached as .config) compiler: m68k-linux-gcc (GCC) 9.3.0 reproduce (this is a W=1 build): wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross # https://github.com/0day-ci/linux/commit/446e7b97838ebf87f1acd61580137716fdad104a git remote add linux-review https://github.com/0day-ci/linux git fetch --no-tags linux-review Jason-Wang/vDPA-API-for-reporting-IOVA-range/20201023-102708 git checkout 446e7b97838ebf87f1acd61580137716fdad104a # save the attached .config to linux build tree COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-9.3.0 make.cross ARCH=m68k If you fix the issue, kindly add following tag as appropriate Reported-by: kernel test robot All warnings (new ones prefixed by >>): drivers/vhost/vdpa.c: In function 'vhost_vdpa_setup_vq_irq': drivers/vhost/vdpa.c:94:6: warning: variable 'ret' set but not used [-Wunused-but-set-variable] 94 | int ret, irq; | ^~~ drivers/vhost/vdpa.c: In function 'vhost_vdpa_unlocked_ioctl': This looks like another issue that needs to be fixed. drivers/vhost/vdpa.c:483:5: warning: this statement may fall through [-Wimplicit-fallthrough=] 483 | r = copy_to_user(featurep, , sizeof(features)); | ~~^ drivers/vhost/vdpa.c:484:2: note: here 484 | case VHOST_VDPA_GET_IOVA_RANGE: | ^~~~ vim +483 drivers/vhost/vdpa.c My bad. 
V4 is on the road. Thanks 4c8cf31885f69e8 Tiwei Bie2020-03-26 426 4c8cf31885f69e8 Tiwei Bie2020-03-26 427 static long vhost_vdpa_unlocked_ioctl(struct file *filep, 4c8cf31885f69e8 Tiwei Bie2020-03-26 428 unsigned int cmd, unsigned long arg) 4c8cf31885f69e8 Tiwei Bie2020-03-26 429 { 4c8cf31885f69e8 Tiwei Bie2020-03-26 430struct vhost_vdpa *v = filep->private_data; 4c8cf31885f69e8 Tiwei Bie2020-03-26 431struct vhost_dev *d = >vdev; 4c8cf31885f69e8 Tiwei Bie2020-03-26 432void __user *argp = (void __user *)arg; a127c5bbb6a8eee Jason Wang 2020-09-07 433u64 __user *featurep = argp; a127c5bbb6a8eee Jason Wang 2020-09-07 434u64 features; 4c8cf31885f69e8 Tiwei Bie2020-03-26 435long r; 4c8cf31885f69e8 Tiwei Bie2020-03-26 436 a127c5bbb6a8eee Jason Wang 2020-09-07 437if (cmd == VHOST_SET_BACKEND_FEATURES) { a127c5bbb6a8eee Jason Wang 2020-09-07 438r = copy_from_user(, featurep, sizeof(features)); a127c5bbb6a8eee Jason Wang 2020-09-07 439if (r) a127c5bbb6a8eee Jason Wang 2020-09-07 440return r; a127c5bbb6a8eee Jason Wang 2020-09-07 441if (features & ~VHOST_VDPA_BACKEND_FEATURES) a127c5bbb6a8eee Jason Wang 2020-09-07 442return -EOPNOTSUPP; a127c5bbb6a8eee Jason Wang 2020-09-07 443 vhost_set_backend_features(>vdev, features); a127c5bbb6a8eee Jason Wang 2020-09-07 444return 0; a127c5bbb6a8eee Jason Wang 2020-09-07 445 } a127c5bbb6a8eee Jason Wang 2020-09-07 446 4c8cf31885f69e8 Tiwei Bie2020-03-26 447mutex_lock(>mutex); 4c8cf31885f69e8 Tiwei Bie2020-03-26 448 4c8cf31885f69e8 Tiwei Bie2020-03-26 449switch (cmd) { 4c8cf31885f69e8 Tiwei Bie2020-03-26 450case VHOST_VDPA_GET_DEVICE_ID: 4c8cf31885f69e8 Tiwei Bie2020-03-26 451r = vhost_vdpa_get_device_id(v, argp); 4c8cf31885f69e8 Tiwei Bie2020-03-26 452break; 4c8cf31885f69e8 Tiwei Bie2020-03-26 453case VHOST_VDPA_GET_STATUS: 4c8cf31885f69e8 Tiwei Bie2020-03-26 454r = vhost_vdpa_get_status(v, argp); 4c8cf31885f69e8 Tiwei Bie2020-03-26 455break; 4c8cf31885f69e8 Tiwei Bie2020-03-26 456case VHOST_VDPA_SET_STATUS: 4c8cf31885f69e8 Tiwei 
Bie2020-03-26 457r = vhost_vdpa_set_status(v, argp); 4c8cf31885f69e8 Tiwei Bie2020-03-26 458break; 4c8cf31885f69e8 Tiwei Bie2020-03-26 459case VHOST_VDPA_GET_CONFIG: 4c8cf31885f69e8 Tiwei Bie2020-03-26 460r = vhost_vdpa_get_config(v, argp); 4c8cf31885f69e8 Tiwei Bie2020-03-26 461break; 4c8
[PATCH V3 2/3] vhost: vdpa: report iova range
This patch introduces a new ioctl for vhost-vdpa device that can report the iova range by the device. For device that implements get_iova_range() method, we fetch it from the vDPA device. If device doesn't implement get_iova_range() but depends on platform IOMMU, we will query via DOMAIN_ATTR_GEOMETRY, otherwise [0, ULLONG_MAX] is assumed. For safety, this patch also rules out the map request which is not in the valid range. Signed-off-by: Jason Wang --- drivers/vhost/vdpa.c | 40 include/uapi/linux/vhost.h | 4 include/uapi/linux/vhost_types.h | 9 +++ 3 files changed, 53 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index a2dbc85e0b0d..562ed99116d1 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -47,6 +47,7 @@ struct vhost_vdpa { int minor; struct eventfd_ctx *config_ctx; int in_batch; + struct vdpa_iova_range range; }; static DEFINE_IDA(vhost_vdpa_ida); @@ -337,6 +338,16 @@ static long vhost_vdpa_set_config_call(struct vhost_vdpa *v, u32 __user *argp) return 0; } +static long vhost_vdpa_get_iova_range(struct vhost_vdpa *v, u32 __user *argp) +{ + struct vhost_vdpa_iova_range range = { + .first = v->range.first, + .last = v->range.last, + }; + + return copy_to_user(argp, , sizeof(range)); +} + static long vhost_vdpa_vring_ioctl(struct vhost_vdpa *v, unsigned int cmd, void __user *argp) { @@ -470,6 +481,8 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, case VHOST_GET_BACKEND_FEATURES: features = VHOST_VDPA_BACKEND_FEATURES; r = copy_to_user(featurep, , sizeof(features)); + case VHOST_VDPA_GET_IOVA_RANGE: + r = vhost_vdpa_get_iova_range(v, argp); break; default: r = vhost_dev_ioctl(>vdev, cmd, argp); @@ -597,6 +610,10 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, long pinned; int ret = 0; + if (msg->iova < v->range.first || + msg->iova + msg->size - 1 > v->range.last) + return -EINVAL; + if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; @@ -783,6 +800,27 
@@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v) v->domain = NULL; } +static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v) +{ + struct vdpa_iova_range *range = >range; + struct iommu_domain_geometry geo; + struct vdpa_device *vdpa = v->vdpa; + const struct vdpa_config_ops *ops = vdpa->config; + + if (ops->get_iova_range) { + *range = ops->get_iova_range(vdpa); + } else if (v->domain && + !iommu_domain_get_attr(v->domain, + DOMAIN_ATTR_GEOMETRY, ) && + geo.force_aperture) { + range->first = geo.aperture_start; + range->last = geo.aperture_end; + } else { + range->first = 0; + range->last = ULLONG_MAX; + } +} + static int vhost_vdpa_open(struct inode *inode, struct file *filep) { struct vhost_vdpa *v; @@ -823,6 +861,8 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) if (r) goto err_init_iotlb; + vhost_vdpa_set_iova_range(v); + filep->private_data = v; return 0; diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h index 75232185324a..c998860d7bbc 100644 --- a/include/uapi/linux/vhost.h +++ b/include/uapi/linux/vhost.h @@ -146,4 +146,8 @@ /* Set event fd for config interrupt*/ #define VHOST_VDPA_SET_CONFIG_CALL _IOW(VHOST_VIRTIO, 0x77, int) + +/* Get the valid iova range */ +#define VHOST_VDPA_GET_IOVA_RANGE _IOR(VHOST_VIRTIO, 0x78, \ +struct vhost_vdpa_iova_range) #endif diff --git a/include/uapi/linux/vhost_types.h b/include/uapi/linux/vhost_types.h index 9a269a88a6ff..f7f6a3a28977 100644 --- a/include/uapi/linux/vhost_types.h +++ b/include/uapi/linux/vhost_types.h @@ -138,6 +138,15 @@ struct vhost_vdpa_config { __u8 buf[0]; }; +/* vhost vdpa IOVA range + * @first: First address that can be mapped by vhost-vDPA + * @last: Last address that can be mapped by vhost-vDPA + */ +struct vhost_vdpa_iova_range { + __u64 first; + __u64 last; +}; + /* Feature bits */ /* Log all write descriptors. Can be changed while device is active. */ #define VHOST_F_LOG_ALL 26 -- 2.20.1
[PATCH V3 3/3] vdpa_sim: implement get_iova_range()
This implements a sample get_iova_range() for the simulator which advertise [0, ULLONG_MAX] as the valid range. Signed-off-by: Jason Wang --- drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 1 file changed, 12 insertions(+) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 62d640327145..ff6c9fd8d879 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -574,6 +574,16 @@ static u32 vdpasim_get_generation(struct vdpa_device *vdpa) return vdpasim->generation; } +static struct vdpa_iova_range vdpasim_get_iova_range(struct vdpa_device *vdpa) +{ + struct vdpa_iova_range range = { + .first = 0ULL, + .last = ULLONG_MAX, + }; + + return range; +} + static int vdpasim_set_map(struct vdpa_device *vdpa, struct vhost_iotlb *iotlb) { @@ -657,6 +667,7 @@ static const struct vdpa_config_ops vdpasim_net_config_ops = { .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, .get_generation = vdpasim_get_generation, + .get_iova_range = vdpasim_get_iova_range, .dma_map= vdpasim_dma_map, .dma_unmap = vdpasim_dma_unmap, .free = vdpasim_free, @@ -683,6 +694,7 @@ static const struct vdpa_config_ops vdpasim_net_batch_config_ops = { .get_config = vdpasim_get_config, .set_config = vdpasim_set_config, .get_generation = vdpasim_get_generation, + .get_iova_range = vdpasim_get_iova_range, .set_map= vdpasim_set_map, .free = vdpasim_free, }; -- 2.20.1
[PATCH V3 0/3] vDPA: API for reporting IOVA range
Hi All: This series introduces API for reporting IOVA range. This is a must for userspace to work correctly: - for the process that uses vhost-vDPA directly, the IOVA must be allocated from this range. - for VM(qemu), when vIOMMU is not enabled, fail early if GPA is out of range - for VM(qemu), when vIOMMU is enabled, determine a valid guest address width and then guest IOVA allocator can behave correctly. Please review. Changes from V2: - silence build warnings Changes from V1: - do not mandate get_iova_range() for device with its own DMA translation logic and assume a [0, ULLONG_MAX] range - mandate IOVA range only for IOMMU that forces aperture - forbid the map which is out of the IOVA range in vhost-vDPA Jason Wang (3): vdpa: introduce config op to get valid iova range vhost: vdpa: report iova range vdpa_sim: implement get_iova_range() drivers/vdpa/vdpa_sim/vdpa_sim.c | 12 ++ drivers/vhost/vdpa.c | 40 include/linux/vdpa.h | 15 include/uapi/linux/vhost.h | 4 include/uapi/linux/vhost_types.h | 9 +++ 5 files changed, 80 insertions(+) -- 2.20.1
[PATCH V3 1/3] vdpa: introduce config op to get valid iova range
This patch introduce a config op to get valid iova range from the vDPA device. Signed-off-by: Jason Wang --- include/linux/vdpa.h | 15 +++ 1 file changed, 15 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index eae0bfd87d91..30bc7a7223bb 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -52,6 +52,16 @@ struct vdpa_device { int nvqs; }; +/** + * vDPA IOVA range - the IOVA range support by the device + * @first: start of the IOVA range + * @last: end of the IOVA range + */ +struct vdpa_iova_range { + u64 first; + u64 last; +}; + /** * vDPA_config_ops - operations for configuring a vDPA device. * Note: vDPA device drivers are required to implement all of the @@ -151,6 +161,10 @@ struct vdpa_device { * @get_generation:Get device config generation (optional) * @vdev: vdpa device * Returns u32: device generation + * @get_iova_range:Get supported iova range (optional) + * @vdev: vdpa device + * Returns the iova range supported by + * the device. * @set_map: Set device memory mapping (optional) * Needed for device that using device * specific DMA translation (on-chip IOMMU) @@ -216,6 +230,7 @@ struct vdpa_config_ops { void (*set_config)(struct vdpa_device *vdev, unsigned int offset, const void *buf, unsigned int len); u32 (*get_generation)(struct vdpa_device *vdev); + struct vdpa_iova_range (*get_iova_range)(struct vdpa_device *vdev); /* DMA ops */ int (*set_map)(struct vdpa_device *vdev, struct vhost_iotlb *iotlb); -- 2.20.1
Re: [PATCH 0/4] vDPA: API for reporting IOVA range
On 2020/10/21 下午10:45, Michael S. Tsirkin wrote: On Wed, Jun 17, 2020 at 11:29:43AM +0800, Jason Wang wrote: Hi All: This series introduces API for reporting IOVA range. This is a must for userspace to work correctly: - for the process that uses vhost-vDPA directly to properly allocate IOVA - for VM(qemu), when vIOMMU is not enabled, fail early if GPA is out of range - for VM(qemu), when vIOMMU is enabled, determine a valid guest address width Please review. Thanks OK so what is the plan here? Change begin-end->first-last and repost? I've posted V2 with this change, but it got some warnings from the buildbot. Will post a V3. Thanks Jason Wang (4): vdpa: introduce config op to get valid iova range vdpa_sim: implement get_iova_range bus operation vdpa: get_iova_range() is mandatory for device specific DMA translation vhost: vdpa: report iova range drivers/vdpa/vdpa.c | 4 drivers/vdpa/vdpa_sim/vdpa_sim.c | 11 +++ drivers/vhost/vdpa.c | 27 +++ include/linux/vdpa.h | 14 ++ include/uapi/linux/vhost.h | 4 include/uapi/linux/vhost_types.h | 5 + 6 files changed, 65 insertions(+) -- 2.20.1
Re: [PATCH v4] Revert "virtio-net: ethtool configurable RXCSUM"
On 2020/10/21 下午10:30, Michael S. Tsirkin wrote: This reverts commit 3618ad2a7c0e78e4258386394d5d5f92a3dbccf8. When control vq is not negotiated, that commit causes a crash: [ 72.229171] kernel BUG at drivers/net/virtio_net.c:1667! [ 72.230266] invalid opcode: [#1] PREEMPT SMP [ 72.231172] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc8-02934-g3618ad2a7c0e7 #1 [ 72.231172] EIP: virtnet_send_command+0x120/0x140 [ 72.231172] Code: 00 0f 94 c0 8b 7d f0 65 33 3d 14 00 00 00 75 1c 8d 65 f4 5b 5e 5f 5d c3 66 90 be 01 00 00 00 e9 6e ff ff ff 8d b6 00 +00 00 00 <0f> 0b e8 d9 bb 82 00 eb 17 8d b4 26 00 00 00 00 8d b4 26 00 00 00 [ 72.231172] EAX: 000d EBX: f72895c0 ECX: 0017 EDX: 0011 [ 72.231172] ESI: f7197800 EDI: ed69bd00 EBP: ed69bcf4 ESP: ed69bc98 [ 72.231172] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010246 [ 72.231172] CR0: 80050033 CR2: CR3: 02c84000 CR4: 000406f0 [ 72.231172] Call Trace: [ 72.231172] ? __virt_addr_valid+0x45/0x60 [ 72.231172] ? ___cache_free+0x51f/0x760 [ 72.231172] ? kobject_uevent_env+0xf4/0x560 [ 72.231172] virtnet_set_guest_offloads+0x4d/0x80 [ 72.231172] virtnet_set_features+0x85/0x120 [ 72.231172] ? virtnet_set_guest_offloads+0x80/0x80 [ 72.231172] __netdev_update_features+0x27a/0x8e0 [ 72.231172] ? kobject_uevent+0xa/0x20 [ 72.231172] ? netdev_register_kobject+0x12c/0x160 [ 72.231172] register_netdevice+0x4fe/0x740 [ 72.231172] register_netdev+0x1c/0x40 [ 72.231172] virtnet_probe+0x728/0xb60 [ 72.231172] ? _raw_spin_unlock+0x1d/0x40 [ 72.231172] ? virtio_vdpa_get_status+0x1c/0x20 [ 72.231172] virtio_dev_probe+0x1c6/0x271 [ 72.231172] really_probe+0x195/0x2e0 [ 72.231172] driver_probe_device+0x26/0x60 [ 72.231172] device_driver_attach+0x49/0x60 [ 72.231172] __driver_attach+0x46/0xc0 [ 72.231172] ? device_driver_attach+0x60/0x60 [ 72.231172] bus_add_driver+0x197/0x1c0 [ 72.231172] driver_register+0x66/0xc0 [ 72.231172] register_virtio_driver+0x1b/0x40 [ 72.231172] virtio_net_driver_init+0x61/0x86 [ 72.231172] ? 
veth_init+0x14/0x14 [ 72.231172] do_one_initcall+0x76/0x2e4 [ 72.231172] ? rdinit_setup+0x2a/0x2a [ 72.231172] do_initcalls+0xb2/0xd5 [ 72.231172] kernel_init_freeable+0x14f/0x179 [ 72.231172] ? rest_init+0x100/0x100 [ 72.231172] kernel_init+0xd/0xe0 [ 72.231172] ret_from_fork+0x1c/0x30 [ 72.231172] Modules linked in: [ 72.269563] ---[ end trace a6ebc4afea0e6cb1 ]--- The reason is that virtnet_set_features now calls virtnet_set_guest_offloads unconditionally, it used to only call it when there is something to configure. If device does not have a control vq, everything breaks. Revert the original commit for now. Cc: Tonghao Zhang Cc: Willem de Bruijn Fixes: 3618ad2a7c0e7 ("virtio-net: ethtool configurable RXCSUM") Reported-by: kernel test robot Signed-off-by: Michael S. Tsirkin --- Acked-by: Jason Wang Same patch as all of v1-v3, just tweaking the commit log. drivers/net/virtio_net.c | 50 +++- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d2d2c4a53cf2..21b71148c532 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -68,8 +68,6 @@ static const unsigned long guest_offloads[] = { (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_NET_F_GUEST_UFO)) -#define GUEST_OFFLOAD_CSUM_MASK (1ULL << VIRTIO_NET_F_GUEST_CSUM) - struct virtnet_stat_desc { char desc[ETH_GSTRING_LEN]; size_t offset; @@ -2524,48 +2522,29 @@ static int virtnet_get_phys_port_name(struct net_device *dev, char *buf, return 0; } -static netdev_features_t virtnet_fix_features(struct net_device *netdev, - netdev_features_t features) -{ - /* If Rx checksum is disabled, LRO should also be disabled. 
*/ - if (!(features & NETIF_F_RXCSUM)) - features &= ~NETIF_F_LRO; - - return features; -} - static int virtnet_set_features(struct net_device *dev, netdev_features_t features) { struct virtnet_info *vi = netdev_priv(dev); - u64 offloads = vi->guest_offloads; + u64 offloads; int err; - /* Don't allow configuration while XDP is active. */ - if (vi->xdp_queue_pairs) - return -EBUSY; - if ((dev->features ^ features) & NETIF_F_LRO) { + if (vi->xdp_queue_pairs) + return -EBUSY; + if (features & NETIF_F_LRO) - offloads |= GUEST_OFFLOAD_LRO_MASK & - vi->guest_offloads_capable; + offloads = vi->guest_offloads_capable;
Re: [PATCH 2/2] KVM: not link irqfd with a fake IRQ bypass producer
On 2020/10/19 下午5:06, Zhenzhong Duan wrote: In case failure to setup Post interrupt for an IRQ, it make no sense to assign irqfd->producer to the producer. This change makes code more robust. It's better to describe what issue we will get without this patch. Thanks Signed-off-by: Zhenzhong Duan --- arch/x86/kvm/x86.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ce856e0..277e961 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -10683,13 +10683,14 @@ int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, container_of(cons, struct kvm_kernel_irqfd, consumer); int ret; - irqfd->producer = prod; kvm_arch_start_assignment(irqfd->kvm); ret = kvm_x86_ops.update_pi_irte(irqfd->kvm, prod->irq, irqfd->gsi, 1); if (ret) kvm_arch_end_assignment(irqfd->kvm); + else + irqfd->producer = prod; return ret; }
Re: [PATCH 1/2] KVM: not register a IRQ bypass producer if unsupported or disabled
On 2020/10/19 下午5:06, Zhenzhong Duan wrote: If Post interrupt is disabled due to hardware limit or forcely disabled by "intremap=nopost" parameter, return -EINVAL so that the legacy mode IRQ isn't registered as IRQ bypass producer. Is there any side effect if it was still registered? With this change, below message is printed: "vfio-pci :db:00.0: irq bypass producer (token 60c8cda5) registration fails: -22" I may miss something, but the patch only touches vhost-vDPA instead of VFIO? Thanks ..which also hints us if a vfio or vdpa device works in PI mode or legacy remapping mode. Add a print to vdpa code just like what vfio_msi_set_vector_signal() does. Signed-off-by: Zhenzhong Duan --- arch/x86/kvm/svm/avic.c | 3 +-- arch/x86/kvm/vmx/vmx.c | 5 ++--- drivers/vhost/vdpa.c| 5 + 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/svm/avic.c b/arch/x86/kvm/svm/avic.c index ac830cd..316142a 100644 --- a/arch/x86/kvm/svm/avic.c +++ b/arch/x86/kvm/svm/avic.c @@ -814,7 +814,7 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP)) - return 0; + return ret; pr_debug("SVM: %s: host_irq=%#x, guest_irq=%#x, set=%#x\n", __func__, host_irq, guest_irq, set); @@ -899,7 +899,6 @@ int svm_update_pi_irte(struct kvm *kvm, unsigned int host_irq, } } - ret = 0; out: srcu_read_unlock(>irq_srcu, idx); return ret; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f0a9954..1fed6d6 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -7716,12 +7716,12 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, struct kvm_lapic_irq irq; struct kvm_vcpu *vcpu; struct vcpu_data vcpu_info; - int idx, ret = 0; + int idx, ret = -EINVAL; if (!kvm_arch_has_assigned_device(kvm) || !irq_remapping_cap(IRQ_POSTING_CAP) || !kvm_vcpu_apicv_active(kvm->vcpus[0])) - return 0; + return ret; idx = srcu_read_lock(>irq_srcu); irq_rt = 
srcu_dereference(kvm->irq_routing, >irq_srcu); @@ -7787,7 +7787,6 @@ static int vmx_update_pi_irte(struct kvm *kvm, unsigned int host_irq, } } - ret = 0; out: srcu_read_unlock(>irq_srcu, idx); return ret; diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 62a9bb0..b20060a 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -107,6 +107,11 @@ static void vhost_vdpa_setup_vq_irq(struct vhost_vdpa *v, u16 qid) vq->call_ctx.producer.token = vq->call_ctx.ctx; vq->call_ctx.producer.irq = irq; ret = irq_bypass_register_producer(>call_ctx.producer); + if (unlikely(ret)) + dev_info(>dev, + "irq bypass producer (token %p) registration fails: %d\n", + vq->call_ctx.producer.token, ret); + spin_unlock(>call_ctx.ctx_lock); }
Re: [PATCH net v2] Revert "virtio-net: ethtool configurable RXCSUM"
On 2020/10/20 上午1:32, Michael S. Tsirkin wrote: This reverts commit 3618ad2a7c0e78e4258386394d5d5f92a3dbccf8. When the device does not have a control vq (e.g. when using a version of QEMU based on upstream v0.10 or older, or when specifying ctrl_vq=off,ctrl_rx=off,ctrl_vlan=off,ctrl_rx_extra=off,ctrl_mac_addr=off for the device on the QEMU command line), that commit causes a crash: [ 72.229171] kernel BUG at drivers/net/virtio_net.c:1667! [ 72.230266] invalid opcode: [#1] PREEMPT SMP [ 72.231172] CPU: 0 PID: 1 Comm: swapper/0 Not tainted 5.9.0-rc8-02934-g3618ad2a7c0e7 #1 [ 72.231172] EIP: virtnet_send_command+0x120/0x140 [ 72.231172] Code: 00 0f 94 c0 8b 7d f0 65 33 3d 14 00 00 00 75 1c 8d 65 f4 5b 5e 5f 5d c3 66 90 be 01 00 00 00 e9 6e ff ff ff 8d b6 00 +00 00 00 <0f> 0b e8 d9 bb 82 00 eb 17 8d b4 26 00 00 00 00 8d b4 26 00 00 00 [ 72.231172] EAX: 000d EBX: f72895c0 ECX: 0017 EDX: 0011 [ 72.231172] ESI: f7197800 EDI: ed69bd00 EBP: ed69bcf4 ESP: ed69bc98 [ 72.231172] DS: 007b ES: 007b FS: 00d8 GS: 00e0 SS: 0068 EFLAGS: 00010246 [ 72.231172] CR0: 80050033 CR2: CR3: 02c84000 CR4: 000406f0 [ 72.231172] Call Trace: [ 72.231172] ? __virt_addr_valid+0x45/0x60 [ 72.231172] ? ___cache_free+0x51f/0x760 [ 72.231172] ? kobject_uevent_env+0xf4/0x560 [ 72.231172] virtnet_set_guest_offloads+0x4d/0x80 [ 72.231172] virtnet_set_features+0x85/0x120 [ 72.231172] ? virtnet_set_guest_offloads+0x80/0x80 [ 72.231172] __netdev_update_features+0x27a/0x8e0 [ 72.231172] ? kobject_uevent+0xa/0x20 [ 72.231172] ? netdev_register_kobject+0x12c/0x160 [ 72.231172] register_netdevice+0x4fe/0x740 [ 72.231172] register_netdev+0x1c/0x40 [ 72.231172] virtnet_probe+0x728/0xb60 [ 72.231172] ? _raw_spin_unlock+0x1d/0x40 [ 72.231172] ? virtio_vdpa_get_status+0x1c/0x20 [ 72.231172] virtio_dev_probe+0x1c6/0x271 [ 72.231172] really_probe+0x195/0x2e0 [ 72.231172] driver_probe_device+0x26/0x60 [ 72.231172] device_driver_attach+0x49/0x60 [ 72.231172] __driver_attach+0x46/0xc0 [ 72.231172] ? 
device_driver_attach+0x60/0x60 [ 72.231172] bus_add_driver+0x197/0x1c0 [ 72.231172] driver_register+0x66/0xc0 [ 72.231172] register_virtio_driver+0x1b/0x40 [ 72.231172] virtio_net_driver_init+0x61/0x86 [ 72.231172] ? veth_init+0x14/0x14 [ 72.231172] do_one_initcall+0x76/0x2e4 [ 72.231172] ? rdinit_setup+0x2a/0x2a [ 72.231172] do_initcalls+0xb2/0xd5 [ 72.231172] kernel_init_freeable+0x14f/0x179 [ 72.231172] ? rest_init+0x100/0x100 [ 72.231172] kernel_init+0xd/0xe0 [ 72.231172] ret_from_fork+0x1c/0x30 [ 72.231172] Modules linked in: [ 72.269563] ---[ end trace a6ebc4afea0e6cb1 ]--- The reason is that virtnet_set_features now calls virtnet_set_guest_offloads unconditionally, it used to only call it when there is something to configure. If device does not have a control vq, everything breaks. Looking at this some more, I noticed that it's not really checking the hardware too much. E.g. if ((dev->features ^ features) & NETIF_F_LRO) { if (features & NETIF_F_LRO) offloads |= GUEST_OFFLOAD_LRO_MASK & vi->guest_offloads_capable; else offloads &= ~GUEST_OFFLOAD_LRO_MASK; } and (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ (1ULL << VIRTIO_NET_F_GUEST_UFO)) But there's no guarantee that e.g. VIRTIO_NET_F_GUEST_TSO6 is set. If it isn't command should not send it. Further static int virtnet_set_features(struct net_device *dev, netdev_features_t features) { struct virtnet_info *vi = netdev_priv(dev); u64 offloads = vi->guest_offloads; seems wrong since guest_offloads is zero initialized, I'm not sure I get here. Did you mean vi->guest_offloads? We initialize it during probe for (i = 0; i < ARRAY_SIZE(guest_offloads); i++) if (virtio_has_feature(vi->vdev, guest_offloads[i])) set_bit(guest_offloads[i], >guest_offloads); it does not reflect the state after reset which comes from the features. Revert the original commit for now. 
Cc: Tonghao Zhang Cc: Willem de Bruijn Fixes: 3618ad2a7c0e7 ("virtio-net: ethtool configurable RXCSUM") Reported-by: kernel test robot Signed-off-by: Michael S. Tsirkin --- changes from v1: - clarify how to reproduce the bug in the log drivers/net/virtio_net.c | 50 +++- 1 file changed, 13 insertions(+), 37 deletions(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index d2d2c4a53cf2..21b71148c532 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -68,8 +68,6 @@ static const unsigned long guest_offloads[] = { (1ULL <<
Re: [PATCH v3] i2c: virtio: add a virtio i2c frontend driver
On 2020/10/14 下午4:37, Jie Deng wrote: On 2020/10/13 16:00, Jason Wang wrote: + + virtqueue_kick(vq); + + time_left = wait_for_completion_timeout(>completion, adap->timeout); + if (!time_left) { + dev_err(>dev, "msg[%d]: addr=0x%x timeout.\n", i, msgs[i].addr); + break; + } You don't set error number here. Is this intended? And using a timeout here is not good, and if the request is finished just after the timeout, in the next xfer you may hit the following check. It's better to use either interrupt here. Could you check the I2C drivers in the kernel ? The "wait_for_completion_timeout" mechanism is commonly used by I2C bus drivers in their i2c_algorithm.master_xfer. There's a major difference between virtio-i2c and other drivers. In the case of virtio, the device could be a software device emulated by a remote process. This means the timeout might not be rare. I don't see how timeout is properly handled in this patch (e.g did you notice that you don't set any error when timeout? or is this intended?) The backend software may operate the physical device. The timeout depends on how the backend is designed. Here if the timeout happens, it will return the actual number of messages successfully processed to the I2C core. Let the I2C core decides how to do next. So let's consider the following case: 1) driver:virtio_i2c_add_msg(msgA) 2) driver:timeout, and return to I2C core 3) driver:virtio_i2c_add_msg(msgB) 4) device: complete msgA 5) driver: virtqueue_get_buf() returns msgA, since the token is always vi->vmsg, the driver may think msgB has been completed. If this case does happen, it is exactly a case that the condition "((!vmsg) || (vmsg != >vmsg))" are met. I may miss something, but you always use vi->vmsg as token so vmsg is equal to >vmsg here Currently, the timeout value is hard-coded in the driver. Generally speaking, timeout rarely happens. Well, it's better to no have such assumption consider the device could be a emulated one. 
It can also be designed as a device configuration if needed. In any case, the timeout should be handled correctly regardless of its frequency. Thanks. Thanks. + + vmsg = (struct virtio_i2c_msg *)virtqueue_get_buf(vq, ); + /* vmsg should point to the same address with >vmsg */ + if ((!vmsg) || (vmsg != >vmsg)) { + dev_err(>dev, "msg[%d]: addr=0x%x virtqueue error.\n", + i, msgs[i].addr); + break; + } So I think we can remove this check. Consider only one descriptor will be used at most, unless there's a bug in the device (and no other driver to the similar check), we should not hit this. Btw, as I replied in the previous version, the device should be cacpable of dealing of a batch of requests through the virtqueue, otherwise it's meaningless to use a queue here. We should not assume there is no bug in the device. I don't think we can remove this check if we want our code to be robust. Can you tell when at which case you may hit !vmsg or vmsg != vi->vmsg? Normally, it won't hit here. But the API "virtqueue_get_buf" tells me "It *may *return NULL or the "data" token handed to virtqueue_add_*()." Note that we had the following check already in virtqueue_get_buf_ctx(), so the the virtio core had already have the ability to figure out the wrong head. if (unlikely(id >= vq->packed.vring.num)) { BAD_RING(vq, "id %u out of range\n", id); return NULL; } if (unlikely(!vq->packed.desc_state[id].data)) { BAD_RING(vq, "id %u is not a head!\n", id); return NULL; } And when it returns a NULL, it's not necessarily an error of the device, it might just require more time to finish the processing. That's why we just returned the actual number of messages successfully processed in this case, and let the I2C core to try one more time. Actually we have no idea if this is a device error or not. Try one more time can also fail if it is a backend error. Of course, there is another option. We can return error for timeout, no matter what reason. Thanks. 
From the perspective of a caller, I just don't care when it happens. To make the code robust, what I care about is what I should do if this is not our case since the doc says it*may *happen. If you insist on removing this check, I will remove "vmsg != vi->vmsg" and keep the check for !vmsg. As Dan reported in v2, we should at least check here for NULL. Thanks. As I said, currently, we are using the virtqueue to send the msg one by one to the backend. The mechanism is described in the spec. Which part of the spec describes such "one by one" mechanism? If there is one, I'd happily give a NACK since it
Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path
On 2020/10/14 上午7:42, si-wei liu wrote: So what I suggest is to fix the pinning leakage first and do the possible optimization on top (which is still questionable to me). OK. Unfortunately, this was picked and got merged in upstream. So I will post a follow up patch set to 1) revert the commit to the original __get_free_page() implementation, and 2) fix the accounting and leakage on top. Will it be fine? Fine. Thanks
Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path
On 2020/10/14 下午2:52, Michael S. Tsirkin wrote: On Tue, Oct 13, 2020 at 04:42:59PM -0700, si-wei liu wrote: On 10/9/2020 7:27 PM, Jason Wang wrote: On 2020/10/3 下午1:02, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. As the inflight pinned pages, specifically for memory region that strides across multiple chunks, would need more than one free page for book keeping and accounting. For simplicity, pin pages for all memory in the IOVA range in one go rather than have multiple pin_user_pages calls to make up the entire region. This way it's easier to track and account the pages already mapped, particularly for clean-up in the error path. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- Changes in v3: - Factor out vhost_vdpa_map() change to a separate patch Changes in v2: - Fix incorrect target SHA1 referenced drivers/vhost/vdpa.c | 119 ++- 1 file changed, 71 insertions(+), 48 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 0f27919..dad41dae 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -595,21 +595,19 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, struct vhost_dev *dev = >vdev; struct vhost_iotlb *iotlb = dev->iotlb; struct page **page_list; -unsigned long list_size = PAGE_SIZE / sizeof(struct page *); +struct vm_area_struct **vmas; unsigned int gup_flags = FOLL_LONGTERM; -unsigned long npages, cur_base, map_pfn, last_pfn = 0; -unsigned long locked, lock_limit, pinned, i; +unsigned long map_pfn, last_pfn = 0; +unsigned long npages, lock_limit; +unsigned long i, nmap = 0; u64 iova = msg->iova; +long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; -page_list = (struct page **) __get_free_page(GFP_KERNEL); -if (!page_list) -return -ENOMEM; - if (msg->perm & VHOST_ACCESS_WO) gup_flags |= 
FOLL_WRITE; @@ -617,61 +615,86 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, if (!npages) return -EINVAL; +page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); +vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *), + GFP_KERNEL); This will result high order memory allocation which was what the code tried to avoid originally. Using an unlimited size will cause a lot of side effects consider VM or userspace may try to pin several TB of memory. Hmmm, that's a good point. Indeed, if the guest memory demand is huge or the host system is running short of free pages, kvmalloc will be problematic and less efficient than the __get_free_page implementation. OK so ... Jason, what's the plan? How about you send a patchset with 1. revert this change 2. fix error handling leak Work for me, but it looks like siwei want to do this. So it's better for to send the patchset. Thanks
Re: [PATCH v3] i2c: virtio: add a virtio i2c frontend driver
On 2020/10/13 下午3:16, Jie Deng wrote: On 2020/10/12 11:43, Jason Wang wrote: On 2020/10/12 上午10:45, Jie Deng wrote: On 2020/10/10 11:14, Jason Wang wrote: + + virtqueue_kick(vq); + + time_left = wait_for_completion_timeout(>completion, adap->timeout); + if (!time_left) { + dev_err(>dev, "msg[%d]: addr=0x%x timeout.\n", i, msgs[i].addr); + break; + } You don't set error number here. Is this intended? And using a timeout here is not good, and if the request is finished just after the timeout, in the next xfer you may hit the following check. It's better to use either interrupt here. Could you check the I2C drivers in the kernel ? The "wait_for_completion_timeout" mechanism is commonly used by I2C bus drivers in their i2c_algorithm.master_xfer. There's a major difference between virtio-i2c and other drivers. In the case of virtio, the device could be a software device emulated by a remote process. This means the timeout might not be rare. I don't see how timeout is properly handled in this patch (e.g did you notice that you don't set any error when timeout? or is this intended?) The backend software may operate the physical device. The timeout depends on how the backend is designed. Here if the timeout happens, it will return the actual number of messages successfully processed to the I2C core. Let the I2C core decides how to do next. So let's consider the following case: 1) driver:virtio_i2c_add_msg(msgA) 2) driver:timeout, and return to I2C core 3) driver:virtio_i2c_add_msg(msgB) 4) device: complete msgA 5) driver: virtqueue_get_buf() returns msgA, since the token is always vi->vmsg, the driver may think msgB has been completed. Thanks. + + vmsg = (struct virtio_i2c_msg *)virtqueue_get_buf(vq, ); + /* vmsg should point to the same address with >vmsg */ + if ((!vmsg) || (vmsg != >vmsg)) { + dev_err(>dev, "msg[%d]: addr=0x%x virtqueue error.\n", + i, msgs[i].addr); + break; + } So I think we can remove this check. 
Consider only one descriptor will be used at most, unless there's a bug in the device (and no other driver to the similar check), we should not hit this. Btw, as I replied in the previous version, the device should be cacpable of dealing of a batch of requests through the virtqueue, otherwise it's meaningless to use a queue here. We should not assume there is no bug in the device. I don't think we can remove this check if we want our code to be robust. Can you tell when at which case you may hit !vmsg or vmsg != vi->vmsg? Normally, it won't hit here. But the API "virtqueue_get_buf" tells me "It *may *return NULL or the "data" token handed to virtqueue_add_*()." Note that we had the following check already in virtqueue_get_buf_ctx(), so the the virtio core had already have the ability to figure out the wrong head. if (unlikely(id >= vq->packed.vring.num)) { BAD_RING(vq, "id %u out of range\n", id); return NULL; } if (unlikely(!vq->packed.desc_state[id].data)) { BAD_RING(vq, "id %u is not a head!\n", id); return NULL; } And when it returns a NULL, it's not necessarily an error of the device, it might just require more time to finish the processing. From the perspective of a caller, I just don't care when it happens. To make the code robust, what I care about is what I should do if this is not our case since the doc says it*may *happen. If you insist on removing this check, I will remove "vmsg != vi->vmsg" and keep the check for !vmsg. As Dan reported in v2, we should at least check here for NULL. Thanks. As I said, currently, we are using the virtqueue to send the msg one by one to the backend. The mechanism is described in the spec. Which part of the spec describes such "one by one" mechanism? If there is one, I'd happily give a NACK since it doesn't require a queue to work which is conflict with the concept of the virtqueue. What's the concept of the virtqueue ? Why do you want to restrict how users use virtqueue ? So I think there's some misunderstanding here. 
The point is not to restrict how to use virtqueue. What I meant is: - we should not invent a device with a virtqueue that can only accept one buffer at a time - I don't see any mechanism like "one by one" described in the spec, so it's ok but if it'd happen to have, I will NACK It's like you provide a water glass to a user. The user can fill a full glass of water and drink once or fill half a glass of water and drink twice. It is a user behavior and should not be restricted by the glass provider. That's my point as well, we should not describe the "once" behavior in the spec. Thanks.
Re: [RFC PATCH 10/24] vdpa: introduce config operations for associating ASID to a virtqueue group
On 2020/10/12 下午4:17, Eli Cohen wrote: On Mon, Oct 12, 2020 at 03:45:10PM +0800, Jason Wang wrote: So in theory we can have several asid's (for different virtqueues), each one should be followed by a specific set_map call. If this is so, how do I know if I met all the conditions run my driver? Maybe we need another callback to let the driver know it should not expect more set_maps(). This should work similarly as in the past. Two parts of the work is expected to be done by the driver: 1) store the mapping somewhere (e.g hardware) during set_map() 2) associating mapping with a specific virtqueue The only difference is that more than one mapping is used now. ok, so like today, I will always get DRIVER_OK after I got all the set_maps(), right? Yes. Thanks For the issue of more set_maps(), driver should be always ready for the new set_maps() call instead of not expecting new set_maps() since guest memory topology could be changed due to several reasons. Qemu or vhost-vDPA will try their best to avoid the frequency of set_maps() for better performance (e.g through batched IOTLB updating). E.g there should be at most one set_map() during one time of guest booting.
Re: [RFC PATCH 10/24] vdpa: introduce config operations for associating ASID to a virtqueue group
On 2020/10/12 下午2:59, Eli Cohen wrote: On Fri, Oct 09, 2020 at 11:56:45AM +0800, Jason Wang wrote: On 2020/10/1 下午9:29, Eli Cohen wrote: On Thu, Sep 24, 2020 at 11:21:11AM +0800, Jason Wang wrote: This patch introduces a new bus operation to allow the vDPA bus driver to associate an ASID to a virtqueue group. So in case of virtio_net, I would expect that all the data virtqueues will be associated with the same address space identifier. Right. I will add the codes to do this in the next version. It should be more explicit than have this assumption by default. Moreover, this assignment should be provided before the set_map call that provides the iotlb for the address space, correct? I think it's better not have this limitation, note that set_map() now takes a asid argument. So for hardware if the associated as is changed, the driver needs to program the hardware to switch to the new mapping. Does this work for mlx5? So in theory we can have several asid's (for different virtqueues), each one should be followed by a specific set_map call. If this is so, how do I know if I met all the conditions run my driver? Maybe we need another callback to let the driver know it should not expect more set_maps(). This should work similarly as in the past. Two parts of the work is expected to be done by the driver: 1) store the mapping somewhere (e.g hardware) during set_map() 2) associating mapping with a specific virtqueue The only difference is that more than one mapping is used now. For the issue of more set_maps(), driver should be always ready for the new set_maps() call instead of not expecting new set_maps() since guest memory topology could be changed due to several reasons. Qemu or vhost-vDPA will try their best to avoid the frequency of set_maps() for better performance (e.g through batched IOTLB updating). E.g there should be at most one set_map() during one time of guest booting. Thanks
Re: [PATCH v3] i2c: virtio: add a virtio i2c frontend driver
On 2020/10/12 上午10:45, Jie Deng wrote: On 2020/10/10 11:14, Jason Wang wrote: + + virtqueue_kick(vq); + + time_left = wait_for_completion_timeout(>completion, adap->timeout); + if (!time_left) { + dev_err(>dev, "msg[%d]: addr=0x%x timeout.\n", i, msgs[i].addr); + break; + } You don't set error number here. Is this intended? And using a timeout here is not good, and if the request is finished just after the timeout, in the next xfer you may hit the following check. It's better to use either interrupt here. Could you check the I2C drivers in the kernel ? The "wait_for_completion_timeout" mechanism is commonly used by I2C bus drivers in their i2c_algorithm.master_xfer. There's a major difference between virtio-i2c and other drivers. In the case of virtio, the device could be a software device emulated by a remote process. This means the timeout might not be rare. I don't see how timeout is properly handled in this patch (e.g did you notice that you don't set any error when timeout? or is this intended?) + + vmsg = (struct virtio_i2c_msg *)virtqueue_get_buf(vq, ); + /* vmsg should point to the same address with >vmsg */ + if ((!vmsg) || (vmsg != >vmsg)) { + dev_err(>dev, "msg[%d]: addr=0x%x virtqueue error.\n", + i, msgs[i].addr); + break; + } So I think we can remove this check. Consider only one descriptor will be used at most, unless there's a bug in the device (and no other driver to the similar check), we should not hit this. Btw, as I replied in the previous version, the device should be cacpable of dealing of a batch of requests through the virtqueue, otherwise it's meaningless to use a queue here. We should not assume there is no bug in the device. I don't think we can remove this check if we want our code to be robust. Can you tell when at which case you may hit !vmsg or vmsg != vi->vmsg? As I said, currently, we are using the virtqueue to send the msg one by one to the backend. The mechanism is described in the spec. 
Which part of the spec describes such "one by one" mechanism? If there is one, I'd happily give a NACK since it doesn't require a queue to work which is conflict with the concept of the virtqueue. Thanks. + + +#ifndef _UAPI_LINUX_VIRTIO_I2C_H +#define _UAPI_LINUX_VIRTIO_I2C_H + +#include +#include +#include + +/** + * struct virtio_i2c_hdr - the virtio I2C message header structure + * @addr: i2c_msg addr, the slave address + * @flags: i2c_msg flags + * @len: i2c_msg len + */ +struct virtio_i2c_hdr { + __le16 addr; + __le16 flags; + __le16 len; +}; I'm afraid this is not complete. E.g the status is missed. I suspect what virtio-scsi use is better. Which split the in from the out instead of reusing the same buffer. And it can ease the uAPI header export. Thanks I think following definition in uAPI for the status is enough. There is no need to provide a "u8" status in the structure. /* The final status written by the device */ #define VIRTIO_I2C_MSG_OK 0 #define VIRTIO_I2C_MSG_ERR 1 You can see an example in virtio_blk. In the spec: struct virtio_blk_req { le32 type; le32 reserved; le64 sector; u8 data[]; u8 status; }; In virtio_blk.h, there is only following definitions. #define VIRTIO_BLK_S_OK 0 #define VIRTIO_BLK_S_IOERR 1 #define VIRTIO_BLK_S_UNSUPP 2 virtio-blk is a bad example, it's just too late to fix. For any new introduced uAPI it should be a complete one. Thanks Thanks.
Re: [PATCH] vdpa/mlx5: should keep avail_index despite device status
On 2020/10/2 上午4:18, Si-Wei Liu wrote: A VM with mlx5 vDPA has below warnings while being reset: vhost VQ 0 ring restore failed: -1: Resource temporarily unavailable (11) vhost VQ 1 ring restore failed: -1: Resource temporarily unavailable (11) We should allow userspace emulating the virtio device be able to get to vq's avail_index, regardless of vDPA device status. Save the index that was last seen when virtq was stopped, so that userspace doesn't complain. Signed-off-by: Si-Wei Liu Acked-by: Jason Wang --- drivers/vdpa/mlx5/net/mlx5_vnet.c | 20 ++-- 1 file changed, 14 insertions(+), 6 deletions(-) diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 70676a6..74264e59 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1133,15 +1133,17 @@ static void suspend_vq(struct mlx5_vdpa_net *ndev, struct mlx5_vdpa_virtqueue *m if (!mvq->initialized) return; - if (query_virtqueue(ndev, mvq, )) { - mlx5_vdpa_warn(>mvdev, "failed to query virtqueue\n"); - return; - } if (mvq->fw_state != MLX5_VIRTIO_NET_Q_OBJECT_STATE_RDY) return; if (modify_virtqueue(ndev, mvq, MLX5_VIRTIO_NET_Q_OBJECT_STATE_SUSPEND)) mlx5_vdpa_warn(>mvdev, "modify to suspend failed\n"); + + if (query_virtqueue(ndev, mvq, )) { + mlx5_vdpa_warn(>mvdev, "failed to query virtqueue\n"); + return; + } + mvq->avail_idx = attr.available_index; } static void suspend_vqs(struct mlx5_vdpa_net *ndev) @@ -1411,8 +1413,14 @@ static int mlx5_vdpa_get_vq_state(struct vdpa_device *vdev, u16 idx, struct vdpa struct mlx5_virtq_attr attr; int err; - if (!mvq->initialized) - return -EAGAIN; + /* If the virtq object was destroyed, use the value saved at +* the last minute of suspend_vq. This caters for userspace +* that cares about emulating the index after vq is stopped. +*/ + if (!mvq->initialized) { + state->avail_index = mvq->avail_idx; + return 0; + } err = query_virtqueue(ndev, mvq, ); if (err) {
Re: [PATCH v3] i2c: virtio: add a virtio i2c frontend driver
On 2020/9/22 上午10:58, Jie Deng wrote: Add an I2C bus driver for virtio para-virtualization. The controller can be emulated by the backend driver in any device model software by following the virtio protocol. This driver communicates with the backend driver through a virtio I2C message structure which includes following parts: - Header: i2c_msg addr, flags, len. - Data buffer: the pointer to the I2C msg data. - Status: the processing result from the backend. People may implement different backend drivers to emulate different controllers according to their needs. A backend example can be found in the device model of the open source project ACRN. For more information, please refer to https://projectacrn.org. The virtio device ID 34 is used for this I2C adpter since IDs before 34 have been reserved by other virtio devices. Co-developed-by: Conghui Chen Signed-off-by: Conghui Chen Signed-off-by: Jie Deng Reviewed-by: Shuo Liu Reviewed-by: Andy Shevchenko --- The device ID request: https://github.com/oasis-tcs/virtio-spec/issues/85 The specification: https://lists.oasis-open.org/archives/virtio-comment/202009/msg00021.html Changes in v3: - Move the interface into uAPI according to Jason. - Fix issues reported by Dan Carpenter. - Fix typo reported by Randy. Changes in v2: - Addressed comments received from Michael, Andy and Jason. drivers/i2c/busses/Kconfig | 11 ++ drivers/i2c/busses/Makefile | 3 + drivers/i2c/busses/i2c-virtio.c | 256 include/uapi/linux/virtio_i2c.h | 31 + include/uapi/linux/virtio_ids.h | 1 + 5 files changed, 302 insertions(+) create mode 100644 drivers/i2c/busses/i2c-virtio.c create mode 100644 include/uapi/linux/virtio_i2c.h diff --git a/drivers/i2c/busses/Kconfig b/drivers/i2c/busses/Kconfig index 293e7a0..f2f6543 100644 --- a/drivers/i2c/busses/Kconfig +++ b/drivers/i2c/busses/Kconfig @@ -21,6 +21,17 @@ config I2C_ALI1535 This driver can also be built as a module. If so, the module will be called i2c-ali1535. 
+config I2C_VIRTIO + tristate "Virtio I2C Adapter" + depends on VIRTIO + help + If you say yes to this option, support will be included for the virtio + I2C adapter driver. The hardware can be emulated by any device model + software according to the virtio protocol. + + This driver can also be built as a module. If so, the module + will be called i2c-virtio. + config I2C_ALI1563 tristate "ALI 1563" depends on PCI diff --git a/drivers/i2c/busses/Makefile b/drivers/i2c/busses/Makefile index 19aff0e..821acfa 100644 --- a/drivers/i2c/busses/Makefile +++ b/drivers/i2c/busses/Makefile @@ -6,6 +6,9 @@ # ACPI drivers obj-$(CONFIG_I2C_SCMI)+= i2c-scmi.o +# VIRTIO I2C host controller driver +obj-$(CONFIG_I2C_VIRTIO) += i2c-virtio.o + # PC SMBus host controller drivers obj-$(CONFIG_I2C_ALI1535) += i2c-ali1535.o obj-$(CONFIG_I2C_ALI1563) += i2c-ali1563.o diff --git a/drivers/i2c/busses/i2c-virtio.c b/drivers/i2c/busses/i2c-virtio.c new file mode 100644 index 000..48fd780 --- /dev/null +++ b/drivers/i2c/busses/i2c-virtio.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Virtio I2C Bus Driver + * + * Copyright (c) 2020 Intel Corporation. All rights reserved. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/** + * struct virtio_i2c_msg - the virtio I2C message structure + * @hdr: the virtio I2C message header + * @buf: virtio I2C message data buffer + * @status: the processing result from the backend + */ +struct virtio_i2c_msg { + struct virtio_i2c_hdr hdr; + u8 *buf; + u8 status; +}; + +/** + * struct virtio_i2c - virtio I2C data + * @vdev: virtio device for this controller + * @completion: completion of virtio I2C message + * @vmsg: the virtio I2C message for communication + * @adap: I2C adapter for this controller + * @i2c_lock: lock for virtqueue processing + * @vq: the virtio virtqueue for communication + */ +struct virtio_i2c { + struct virtio_device *vdev; + struct completion completion; + struct virtio_i2c_msg vmsg; + struct i2c_adapter adap; + struct mutex i2c_lock; + struct virtqueue *vq; +}; + +static void virtio_i2c_msg_done(struct virtqueue *vq) +{ + struct virtio_i2c *vi = vq->vdev->priv; + + complete(>completion); +} + +static int virtio_i2c_add_msg(struct virtqueue *vq, + struct virtio_i2c_msg *vmsg, + struct i2c_msg *msg) +{ + struct scatterlist *sgs[3], hdr, bout, bin, status; + int outcnt = 0, incnt = 0; + + if (!msg->len) + return -EINVAL; + + vmsg->hdr.addr = cpu_to_le16(msg->addr); + vmsg->hdr.flags =
Re: [PATCH v3 3/3] vhost: Don't call log_access_ok() when using IOTLB
On 2020/10/3 下午6:02, Greg Kurz wrote: When the IOTLB device is enabled, the log_guest_addr that is passed by userspace to the VHOST_SET_VRING_ADDR ioctl, and which is then written to vq->log_addr, is a GIOVA. All writes to this address are translated by log_user() to writes to an HVA, and then ultimately logged through the corresponding GPAs in log_write_hva(). No logging will ever occur with vq->log_addr in this case. It is thus wrong to pass vq->log_addr and log_guest_addr to log_access_vq() which assumes they are actual GPAs. Introduce a new vq_log_used_access_ok() helper that only checks accesses to the log for the used structure when there isn't an IOTLB device around. Signed-off-by: Greg Kurz Acked-by: Jason Wang In the future, we may consider deprecating log_guest_addr since in any case regardless of IOTLB enablement we can get GPA from either IOTLB or MEM table. Thanks --- drivers/vhost/vhost.c | 23 ++- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 9d2c225fb518..9ad45e1d27f0 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1370,6 +1370,20 @@ bool vhost_log_access_ok(struct vhost_dev *dev) } EXPORT_SYMBOL_GPL(vhost_log_access_ok); +static bool vq_log_used_access_ok(struct vhost_virtqueue *vq, + void __user *log_base, + bool log_used, + u64 log_addr) +{ + /* If an IOTLB device is present, log_addr is a GIOVA that +* will never be logged by log_used(). */ + if (vq->iotlb) + return true; + + return !log_used || log_access_ok(log_base, log_addr, + vhost_get_used_size(vq, vq->num)); +} + /* Verify access for write logging. 
*/ /* Caller should have vq mutex and device mutex */ static bool vq_log_access_ok(struct vhost_virtqueue *vq, @@ -1377,8 +1391,7 @@ static bool vq_log_access_ok(struct vhost_virtqueue *vq, { return vq_memory_access_ok(log_base, vq->umem, vhost_has_feature(vq, VHOST_F_LOG_ALL)) && - (!vq->log_used || log_access_ok(log_base, vq->log_addr, - vhost_get_used_size(vq, vq->num))); + vq_log_used_access_ok(vq, log_base, vq->log_used, vq->log_addr); } /* Can we start vq? */ @@ -1517,9 +1530,9 @@ static long vhost_vring_set_addr(struct vhost_dev *d, return -EINVAL; /* Also validate log access for used ring if enabled. */ - if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && - !log_access_ok(vq->log_base, a.log_guest_addr, - vhost_get_used_size(vq, vq->num))) + if (!vq_log_used_access_ok(vq, vq->log_base, + a.flags & (0x1 << VHOST_VRING_F_LOG), + a.log_guest_addr)) return -EINVAL; }
Re: [PATCH v3 2/3] vhost: Use vhost_get_used_size() in vhost_vring_set_addr()
On 2020/10/3 下午6:02, Greg Kurz wrote: The open-coded computation of the used size doesn't take the event into account when the VIRTIO_RING_F_EVENT_IDX feature is present. Fix that by using vhost_get_used_size(). Signed-off-by: Greg Kurz --- drivers/vhost/vhost.c |3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index c3b49975dc28..9d2c225fb518 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -1519,8 +1519,7 @@ static long vhost_vring_set_addr(struct vhost_dev *d, /* Also validate log access for used ring if enabled. */ if ((a.flags & (0x1 << VHOST_VRING_F_LOG)) && !log_access_ok(vq->log_base, a.log_guest_addr, - sizeof *vq->used + - vq->num * sizeof *vq->used->ring)) + vhost_get_used_size(vq, vq->num))) return -EINVAL; } Acked-by: Jason Wang
Re: [PATCH v3 2/2] vhost-vdpa: fix page pinning leakage in error path
On 2020/10/3 下午1:02, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. As the inflight pinned pages, specifically for memory region that strides across multiple chunks, would need more than one free page for book keeping and accounting. For simplicity, pin pages for all memory in the IOVA range in one go rather than have multiple pin_user_pages calls to make up the entire region. This way it's easier to track and account the pages already mapped, particularly for clean-up in the error path. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- Changes in v3: - Factor out vhost_vdpa_map() change to a separate patch Changes in v2: - Fix incorrect target SHA1 referenced drivers/vhost/vdpa.c | 119 ++- 1 file changed, 71 insertions(+), 48 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 0f27919..dad41dae 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -595,21 +595,19 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, struct vhost_dev *dev = >vdev; struct vhost_iotlb *iotlb = dev->iotlb; struct page **page_list; - unsigned long list_size = PAGE_SIZE / sizeof(struct page *); + struct vm_area_struct **vmas; unsigned int gup_flags = FOLL_LONGTERM; - unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long map_pfn, last_pfn = 0; + unsigned long npages, lock_limit; + unsigned long i, nmap = 0; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; - page_list = (struct page **) __get_free_page(GFP_KERNEL); - if (!page_list) - return -ENOMEM; - if (msg->perm & VHOST_ACCESS_WO) gup_flags |= FOLL_WRITE; @@ -617,61 +615,86 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, if (!npages) return -EINVAL; + page_list 
= kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); + vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *), + GFP_KERNEL); This will result high order memory allocation which was what the code tried to avoid originally. Using an unlimited size will cause a lot of side effects consider VM or userspace may try to pin several TB of memory. + if (!page_list || !vmas) { + ret = -ENOMEM; + goto free; + } Any reason that you want to use vmas? + mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } - cur_base = msg->uaddr & PAGE_MASK; - iova &= PAGE_MASK; + pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags, + page_list, vmas); + if (npages != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } + goto unlock; + } - while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, -gup_flags, page_list, NULL); - if (ret != pinned) - goto out; - - if (!last_pfn) - map_pfn = page_to_pfn(page_list[0]); - - for (i = 0; i < ret; i++) { - unsigned long this_pfn = page_to_pfn(page_list[i]); - u64 csize; - - if (last_pfn && (this_pfn != last_pfn + 1)) { - /* Pin a contiguous chunk of memory */ - csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) - goto out; - map_pfn = this_pfn; - iova += csize; + iova &= PAGE_MASK; + map_pfn = page_to_pfn(page_list[0]); + + /* One more iteration to avoid extra vdpa_map() call out of loop. */ + for (i = 0; i <= npages; i++) { +
Re: [PATCH v3 1/2] vhost-vdpa: fix vhost_vdpa_map() on error condition
On 2020/10/3 下午1:02, Si-Wei Liu wrote: vhost_vdpa_map() should remove the iotlb entry just added if the corresponding mapping fails to set up properly. Fixes: 4c8cf31885f6 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- drivers/vhost/vdpa.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 796fe97..0f27919 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -565,6 +565,9 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, perm_to_iommu_flags(perm)); } + if (r) + vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); + return r; } Acked-by: Jason Wang
Re: [PATCH v2] vringh: fix __vringh_iov() when riov and wiov are different
On 2020/10/9 上午4:42, Stefano Garzarella wrote: If riov and wiov are both defined and they point to different objects, only riov is initialized. If the wiov is not initialized by the caller, the function fails returning -EINVAL and printing "Readable desc 0x... after writable" error message. This issue happens when descriptors have both readable and writable buffers (eg. virtio-blk devices has virtio_blk_outhdr in the readable buffer and status as last byte of writable buffer) and we call __vringh_iov() to get both type of buffers in two different iovecs. Let's replace the 'else if' clause with 'if' to initialize both riov and wiov if they are not NULL. As checkpatch pointed out, we also avoid crashing the kernel when riov and wiov are both NULL, replacing BUG() with WARN_ON() and returning -EINVAL. It looks like I met the exact similar issue when developing ctrl vq support (which requires both READ and WRITE descriptor). While I was trying to fix the issue I found the following comment: * Note that you may need to clean up riov and wiov, even on error! */ int vringh_getdesc_iotlb(struct vringh *vrh, I saw some driver call vringh_kiov_cleanup(). So I just follow to use that. I'm not quite sure which one is better. Thanks Fixes: f87d0fbb5798 ("vringh: host-side implementation of virtio rings.") Cc: sta...@vger.kernel.org Signed-off-by: Stefano Garzarella --- drivers/vhost/vringh.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/drivers/vhost/vringh.c b/drivers/vhost/vringh.c index e059a9a47cdf..8bd8b403f087 100644 --- a/drivers/vhost/vringh.c +++ b/drivers/vhost/vringh.c @@ -284,13 +284,14 @@ __vringh_iov(struct vringh *vrh, u16 i, desc_max = vrh->vring.num; up_next = -1; + /* You must want something! */ + if (WARN_ON(!riov && !wiov)) + return -EINVAL; + if (riov) riov->i = riov->used = 0; - else if (wiov) + if (wiov) wiov->i = wiov->used = 0; - else - /* You must want something! */ - BUG(); for (;;) { void *addr;
Re: [RFC PATCH 18/24] vhost-vdpa: support ASID based IOTLB API
On 2020/9/28 下午11:44, Eugenio Perez Martin wrote: -u64 iova, u64 size) +static int vhost_vdpa_unmap(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, + u64 iova, u64 size) { struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; + u32 asid = iotlb_to_asid(iotlb); + + if (!iotlb) + return -EINVAL; This should be reordered to check for (!iotlb) before use at `asid = iotlb_to_asid()`, shouldn't it? Thanks! Yes, will fix in the next version. Thanks
Re: [RFC PATCH 10/24] vdpa: introduce config operations for associating ASID to a virtqueue group
On 2020/10/1 下午9:29, Eli Cohen wrote: On Thu, Sep 24, 2020 at 11:21:11AM +0800, Jason Wang wrote: This patch introduces a new bus operation to allow the vDPA bus driver to associate an ASID to a virtqueue group. So in case of virtio_net, I would expect that all the data virtqueues will be associated with the same address space identifier. Right. I will add the codes to do this in the next version. It should be more explicit than have this assumption by default. Moreover, this assignment should be provided before the set_map call that provides the iotlb for the address space, correct? I think it's better not have this limitation, note that set_map() now takes a asid argument. So for hardware if the associated as is changed, the driver needs to program the hardware to switch to the new mapping. Does this work for mlx5? Signed-off-by: Jason Wang --- include/linux/vdpa.h | 10 ++ 1 file changed, 10 insertions(+) diff --git a/include/linux/vdpa.h b/include/linux/vdpa.h index 1e1163daa352..e2394995a3cd 100644 --- a/include/linux/vdpa.h +++ b/include/linux/vdpa.h @@ -160,6 +160,12 @@ struct vdpa_device { * @get_generation: Get device config generation (optional) *@vdev: vdpa device *Returns u32: device generation + * @set_group_asid:Set address space identifier for a + * virtqueue group + * @vdev: vdpa device + * @group: virtqueue group + * @asid: address space id for this group + * Returns integer: success (0) or error (< 0) * @set_map: Set device memory mapping (optional) *Needed for device that using device *specific DMA translation (on-chip IOMMU) @@ -237,6 +243,10 @@ struct vdpa_config_ops { u64 iova, u64 size, u64 pa, u32 perm); int (*dma_unmap)(struct vdpa_device *vdev, unsigned int asid, u64 iova, u64 size); + int (*set_group_asid)(struct vdpa_device *vdev, unsigned int group, + unsigned int asid); + + Extra space Will fix. Thanks /* Free device resources */ void (*free)(struct vdpa_device *vdev); -- 2.20.1
Re: [RFC PATCH 09/24] vdpa: multiple address spaces support
On 2020/10/1 下午9:23, Eli Cohen wrote: + /* Only support 1 address space */ + if (vdpa->ngroups != 1) + return -ENOTSUPP; Checkpatch warning: prefer EOPNOTSUPP Will fix. Thanks
Re: [RFC PATCH 09/24] vdpa: multiple address spaces support
On 2020/10/1 下午9:21, Eli Cohen wrote: On Thu, Sep 24, 2020 at 11:21:10AM +0800, Jason Wang wrote: This patch introduces the multiple address spaces support for vDPA device. This idea is to identify a specific address space via a dedicated identifier - ASID. During vDPA device allocation, vDPA device driver needs to report the number of address spaces supported by the device then the DMA mapping ops of the vDPA device needs to be extended to support ASID. This helps to isolate the DMA among the virtqueues. E.g in the case of virtio-net, the control virtqueue will not be assigned directly to guest. This RFC patch only converts for the device that wants its own IOMMU/DMA translation logic. So it will reject the device with more than 1 address space that depends on platform IOMMU. The plan to This is not apparent from the code. Instead you enforce number of groups to 1. Yes, will fix. moving all the DMA mapping logic to the vDPA device driver instead of doing it in vhost-vDPA (otherwise it could result in very complicated APIs and actually vhost-vDPA doesn't care about how the actual composition/emulation were done in the device driver). 
Signed-off-by: Jason Wang --- drivers/vdpa/ifcvf/ifcvf_main.c | 2 +- drivers/vdpa/mlx5/net/mlx5_vnet.c | 5 +++-- drivers/vdpa/vdpa.c | 4 +++- drivers/vdpa/vdpa_sim/vdpa_sim.c | 10 ++ drivers/vhost/vdpa.c | 14 +- include/linux/vdpa.h | 23 --- 6 files changed, 38 insertions(+), 20 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index e6a0be374e51..86cdf5f8bcae 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -440,7 +440,7 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, dev, _vdpa_ops, - IFCVF_MAX_QUEUE_PAIRS * 2, 1); + IFCVF_MAX_QUEUE_PAIRS * 2, 1, 1); if (adapter == NULL) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); diff --git a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 4e480f4f754e..db7404e121bf 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1788,7 +1788,8 @@ static u32 mlx5_vdpa_get_generation(struct vdpa_device *vdev) return mvdev->generation; } -static int mlx5_vdpa_set_map(struct vdpa_device *vdev, struct vhost_iotlb *iotlb) +static int mlx5_vdpa_set_map(struct vdpa_device *vdev, unsigned int asid, +struct vhost_iotlb *iotlb) { struct mlx5_vdpa_dev *mvdev = to_mvdev(vdev); struct mlx5_vdpa_net *ndev = to_mlx5_vdpa_ndev(mvdev); @@ -1931,7 +1932,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, _vdpa_ops, -2 * mlx5_vdpa_max_qps(max_vqs), 1); +2 * mlx5_vdpa_max_qps(max_vqs), 1, 1); if (IS_ERR(ndev)) return ndev; diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index 46399746ec7c..05195fa7865d 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -63,6 +63,7 @@ static void vdpa_release_dev(struct device *d) * @config: the bus operations that is supported by this 
device * @nvqs: number of virtqueues supported by this device * @ngroups: number of groups supported by this device + * @nas: number of address spaces supported by this device * @size: size of the parent structure that contains private data * * Driver should use vdpa_alloc_device() wrapper macro instead of @@ -74,7 +75,7 @@ static void vdpa_release_dev(struct device *d) struct vdpa_device *__vdpa_alloc_device(struct device *parent, const struct vdpa_config_ops *config, int nvqs, unsigned int ngroups, - size_t size) + unsigned int nas, size_t size) { struct vdpa_device *vdev; int err = -EINVAL; @@ -102,6 +103,7 @@ struct vdpa_device *__vdpa_alloc_device(struct device *parent, vdev->features_valid = false; vdev->nvqs = nvqs; vdev->ngroups = ngroups; + vdev->nas = nas; err = dev_set_name(>dev, "vdpa%u", vdev->index); if (err) diff --git a/drivers/vdpa/vdpa_sim/vdpa_sim.c b/drivers/vdpa/vdpa_sim/vdpa_sim.c index 6669c561bc6e..5dc04ec271bb 100644 --- a/drivers/vdpa/vdpa_sim/vdpa_sim.c +++ b/drivers/vdpa/vdpa_sim/vdpa_sim.c @@ -354,7 +354,7 @@ static struct vdpasim *vdpasim_create(void) ops = _net_config_o
Re: [RFC PATCH 08/24] vdpa: introduce virtqueue groups
On 2020/9/28 下午11:44, Eugenio Perez Martin wrote: On Thu, Sep 24, 2020 at 5:23 AM Jason Wang wrote: This patch introduces virtqueue groups to vDPA device. The virtqueue group is the minimal set of virtqueues that must share an address space. And the address space identifier could only be attached to a specific virtqueue group. A new mandated bus operation is introduced to get the virtqueue group ID for a specific virtqueue. All the vDPA device drivers were converted to simply support a single virtqueue group. Signed-off-by: Jason Wang --- drivers/vdpa/ifcvf/ifcvf_main.c | 9 - drivers/vdpa/mlx5/net/mlx5_vnet.c | 8 +++- drivers/vdpa/vdpa.c | 4 +++- drivers/vdpa/vdpa_sim/vdpa_sim.c | 11 ++- include/linux/vdpa.h | 12 +--- 5 files changed, 37 insertions(+), 7 deletions(-) diff --git a/drivers/vdpa/ifcvf/ifcvf_main.c b/drivers/vdpa/ifcvf/ifcvf_main.c index 076d7ac5e723..e6a0be374e51 100644 --- a/drivers/vdpa/ifcvf/ifcvf_main.c +++ b/drivers/vdpa/ifcvf/ifcvf_main.c @@ -327,6 +327,11 @@ static u32 ifcvf_vdpa_get_vq_align(struct vdpa_device *vdpa_dev) return IFCVF_QUEUE_ALIGNMENT; } +static u32 ifcvf_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + return 0; +} + static void ifcvf_vdpa_get_config(struct vdpa_device *vdpa_dev, unsigned int offset, void *buf, unsigned int len) @@ -387,6 +392,7 @@ static const struct vdpa_config_ops ifc_vdpa_ops = { .get_device_id = ifcvf_vdpa_get_device_id, .get_vendor_id = ifcvf_vdpa_get_vendor_id, .get_vq_align = ifcvf_vdpa_get_vq_align, + .get_vq_group = ifcvf_vdpa_get_vq_group, .get_config = ifcvf_vdpa_get_config, .set_config = ifcvf_vdpa_set_config, .set_config_cb = ifcvf_vdpa_set_config_cb, @@ -434,7 +440,8 @@ static int ifcvf_probe(struct pci_dev *pdev, const struct pci_device_id *id) adapter = vdpa_alloc_device(struct ifcvf_adapter, vdpa, dev, _vdpa_ops, - IFCVF_MAX_QUEUE_PAIRS * 2); + IFCVF_MAX_QUEUE_PAIRS * 2, 1); + if (adapter == NULL) { IFCVF_ERR(pdev, "Failed to allocate vDPA structure"); return -ENOMEM; diff --git 
a/drivers/vdpa/mlx5/net/mlx5_vnet.c b/drivers/vdpa/mlx5/net/mlx5_vnet.c index 9df69d5efe8c..4e480f4f754e 100644 --- a/drivers/vdpa/mlx5/net/mlx5_vnet.c +++ b/drivers/vdpa/mlx5/net/mlx5_vnet.c @@ -1428,6 +1428,11 @@ static u32 mlx5_vdpa_get_vq_align(struct vdpa_device *vdev) return PAGE_SIZE; } +static u32 mlx5_vdpa_get_vq_group(struct vdpa_device *vdpa, u16 idx) +{ + return 0; +} + enum { MLX5_VIRTIO_NET_F_GUEST_CSUM = 1 << 9, MLX5_VIRTIO_NET_F_CSUM = 1 << 10, MLX5_VIRTIO_NET_F_HOST_TSO6 = 1 << 11, @@ -1838,6 +1843,7 @@ static const struct vdpa_config_ops mlx5_vdpa_ops = { .get_vq_notification = mlx5_get_vq_notification, .get_vq_irq = mlx5_get_vq_irq, .get_vq_align = mlx5_vdpa_get_vq_align, + .get_vq_group = mlx5_vdpa_get_vq_group, .get_features = mlx5_vdpa_get_features, .set_features = mlx5_vdpa_set_features, .set_config_cb = mlx5_vdpa_set_config_cb, @@ -1925,7 +1931,7 @@ void *mlx5_vdpa_add_dev(struct mlx5_core_dev *mdev) max_vqs = min_t(u32, max_vqs, MLX5_MAX_SUPPORTED_VQS); ndev = vdpa_alloc_device(struct mlx5_vdpa_net, mvdev.vdev, mdev->device, _vdpa_ops, -2 * mlx5_vdpa_max_qps(max_vqs)); +2 * mlx5_vdpa_max_qps(max_vqs), 1); if (IS_ERR(ndev)) return ndev; diff --git a/drivers/vdpa/vdpa.c b/drivers/vdpa/vdpa.c index a69ffc991e13..46399746ec7c 100644 --- a/drivers/vdpa/vdpa.c +++ b/drivers/vdpa/vdpa.c @@ -62,6 +62,7 @@ static void vdpa_release_dev(struct device *d) * @parent: the parent device * @config: the bus operations that is supported by this device * @nvqs: number of virtqueues supported by this device + * @ngroups: number of groups supported by this device Hi! Maybe the description of "ngroups" could be "number of*virtqueue* groups supported by this device"? I think that it could be needed in some contexts reading the code. Exactly. Will fix. Thanks Thanks!
Re: [RFC PATCH 06/24] vhost-vdpa: switch to use vhost-vdpa specific IOTLB
On 2020/9/30 下午8:02, Eli Cohen wrote: On Thu, Sep 24, 2020 at 11:21:07AM +0800, Jason Wang wrote: To ease the implementation of per group ASID support for vDPA device. This patch switches to use a vhost-vdpa specific IOTLB to avoid the unnecessary refactoring of the vhost core. Signed-off-by: Jason Wang --- drivers/vhost/vdpa.c | 14 -- 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 74bef1c15a70..ec3c94f706c1 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -40,6 +40,7 @@ struct vhost_vdpa { struct vhost_virtqueue *vqs; struct completion completion; struct vdpa_device *vdpa; + struct vhost_iotlb *iotlb; struct device dev; struct cdev cdev; atomic_t opened; @@ -514,12 +515,11 @@ static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) { - struct vhost_dev *dev = >vdev; - struct vhost_iotlb *iotlb = dev->iotlb; + struct vhost_iotlb *iotlb = v->iotlb; vhost_vdpa_iotlb_unmap(v, iotlb, 0ULL, 0ULL - 1); - kfree(dev->iotlb); - dev->iotlb = NULL; + kfree(v->iotlb); + v->iotlb = NULL; } static int perm_to_iommu_flags(u32 perm) @@ -681,7 +681,7 @@ static int vhost_vdpa_process_iotlb_msg(struct vhost_dev *dev, struct vhost_vdpa *v = container_of(dev, struct vhost_vdpa, vdev); struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - struct vhost_iotlb *iotlb = dev->iotlb; + struct vhost_iotlb *iotlb = v->iotlb; int r = 0; r = vhost_dev_check_owner(dev); @@ -812,12 +812,14 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) r = vhost_vdpa_alloc_domain(v); if (r) - goto err_init_iotlb; + goto err_alloc_domain; You're still using this: dev->iotlb = vhost_iotlb_alloc(0, 0); Shouldn't you use v->iotlb = host_iotlb_alloc(0, 0); to set the vdpa device iotlb field? Yes, you're right. Will fix. 
Thanks filep->private_data = v; return 0; +err_alloc_domain: + vhost_vdpa_iotlb_free(v); err_init_iotlb: vhost_vdpa_cleanup(v); err: -- 2.20.1
Re: [RFC PATCH 05/24] vhost-vdpa: passing iotlb to IOMMU mapping helpers
On 2020/9/30 下午7:26, Eli Cohen wrote: On Thu, Sep 24, 2020 at 11:21:06AM +0800, Jason Wang wrote: To prepare for the ASID support for vhost-vdpa, try to pass IOTLB object to dma helpers. Maybe it's worth mentioning here that this patch does not change any functionality and is presented as a preparation for passing different iotlb's instead of using dev->iotlb Right, let me add them in the next version. Thanks Signed-off-by: Jason Wang --- drivers/vhost/vdpa.c | 40 ++-- 1 file changed, 22 insertions(+), 18 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 9c641274b9f3..74bef1c15a70 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -489,10 +489,11 @@ static long vhost_vdpa_unlocked_ioctl(struct file *filep, return r; } -static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last) +static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, + u64 start, u64 last) { struct vhost_dev *dev = >vdev; - struct vhost_iotlb *iotlb = dev->iotlb; struct vhost_iotlb_map *map; struct page *page; unsigned long pfn, pinned; @@ -514,8 +515,9 @@ static void vhost_vdpa_iotlb_unmap(struct vhost_vdpa *v, u64 start, u64 last) static void vhost_vdpa_iotlb_free(struct vhost_vdpa *v) { struct vhost_dev *dev = >vdev; + struct vhost_iotlb *iotlb = dev->iotlb; - vhost_vdpa_iotlb_unmap(v, 0ULL, 0ULL - 1); + vhost_vdpa_iotlb_unmap(v, iotlb, 0ULL, 0ULL - 1); kfree(dev->iotlb); dev->iotlb = NULL; } @@ -542,15 +544,14 @@ static int perm_to_iommu_flags(u32 perm) return flags | IOMMU_CACHE; } -static int vhost_vdpa_map(struct vhost_vdpa *v, +static int vhost_vdpa_map(struct vhost_vdpa *v, struct vhost_iotlb *iotlb, u64 iova, u64 size, u64 pa, u32 perm) { - struct vhost_dev *dev = >vdev; struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; int r = 0; - r = vhost_iotlb_add_range(dev->iotlb, iova, iova + size - 1, + r = vhost_iotlb_add_range(iotlb, iova, iova + size - 1, pa, perm); if (r) 
return r; @@ -559,7 +560,7 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, r = ops->dma_map(vdpa, iova, size, pa, perm); } else if (ops->set_map) { if (!v->in_batch) - r = ops->set_map(vdpa, dev->iotlb); + r = ops->set_map(vdpa, iotlb); } else { r = iommu_map(v->domain, iova, pa, size, perm_to_iommu_flags(perm)); @@ -568,29 +569,30 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, return r; } -static void vhost_vdpa_unmap(struct vhost_vdpa *v, u64 iova, u64 size) +static void vhost_vdpa_unmap(struct vhost_vdpa *v, +struct vhost_iotlb *iotlb, +u64 iova, u64 size) { - struct vhost_dev *dev = >vdev; struct vdpa_device *vdpa = v->vdpa; const struct vdpa_config_ops *ops = vdpa->config; - vhost_vdpa_iotlb_unmap(v, iova, iova + size - 1); + vhost_vdpa_iotlb_unmap(v, iotlb, iova, iova + size - 1); if (ops->dma_map) { ops->dma_unmap(vdpa, iova, size); } else if (ops->set_map) { if (!v->in_batch) - ops->set_map(vdpa, dev->iotlb); + ops->set_map(vdpa, iotlb); } else { iommu_unmap(v->domain, iova, size); } } static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, + struct vhost_iotlb *iotlb, struct vhost_iotlb_msg *msg) { struct vhost_dev *dev = >vdev; - struct vhost_iotlb *iotlb = dev->iotlb; struct page **page_list; unsigned long list_size = PAGE_SIZE / sizeof(struct page *); unsigned int gup_flags = FOLL_LONGTERM; @@ -644,7 +646,7 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, if (last_pfn && (this_pfn != last_pfn + 1)) { /* Pin a contiguous chunk of memory */ csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, + if (vhost_vdpa_map(v, iotlb, iova, csize, map_pfn << PAGE_SHIFT, msg->perm)) goto out; @@ -660,11 +662,12 @@ static int vhost_vdpa_
Re: [PATCH] vhost-vdpa: fix page pinning leakage in error path
On 2020/10/2 上午4:23, Si-Wei Liu wrote: Pinned pages are not properly accounted particularly when mapping error occurs on IOTLB update. Clean up dangling pinned pages for the error path. As the inflight pinned pages, specifically for memory region that strides across multiple chunks, would need more than one free page for book keeping and accounting. For simplicity, pin pages for all memory in the IOVA range in one go rather than have multiple pin_user_pages calls to make up the entire region. This way it's easier to track and account the pages already mapped, particularly for clean-up in the error path. Fixes: 20453a45fb06 ("vhost: introduce vDPA-based backend") Signed-off-by: Si-Wei Liu --- drivers/vhost/vdpa.c | 121 +++ 1 file changed, 73 insertions(+), 48 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 796fe97..abc4aa2 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -565,6 +565,8 @@ static int vhost_vdpa_map(struct vhost_vdpa *v, perm_to_iommu_flags(perm)); } + if (r) + vhost_iotlb_del_range(dev->iotlb, iova, iova + size - 1); return r; } Please use a separate patch for this fix. 
@@ -592,21 +594,19 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, struct vhost_dev *dev = >vdev; struct vhost_iotlb *iotlb = dev->iotlb; struct page **page_list; - unsigned long list_size = PAGE_SIZE / sizeof(struct page *); + struct vm_area_struct **vmas; unsigned int gup_flags = FOLL_LONGTERM; - unsigned long npages, cur_base, map_pfn, last_pfn = 0; - unsigned long locked, lock_limit, pinned, i; + unsigned long map_pfn, last_pfn = 0; + unsigned long npages, lock_limit; + unsigned long i, nmap = 0; u64 iova = msg->iova; + long pinned; int ret = 0; if (vhost_iotlb_itree_first(iotlb, msg->iova, msg->iova + msg->size - 1)) return -EEXIST; - page_list = (struct page **) __get_free_page(GFP_KERNEL); - if (!page_list) - return -ENOMEM; - if (msg->perm & VHOST_ACCESS_WO) gup_flags |= FOLL_WRITE; @@ -614,61 +614,86 @@ static int vhost_vdpa_process_iotlb_update(struct vhost_vdpa *v, if (!npages) return -EINVAL; + page_list = kvmalloc_array(npages, sizeof(struct page *), GFP_KERNEL); + vmas = kvmalloc_array(npages, sizeof(struct vm_area_struct *), + GFP_KERNEL); + if (!page_list || !vmas) { + ret = -ENOMEM; + goto free; + } + mmap_read_lock(dev->mm); - locked = atomic64_add_return(npages, >mm->pinned_vm); lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT; - - if (locked > lock_limit) { + if (npages + atomic64_read(>mm->pinned_vm) > lock_limit) { ret = -ENOMEM; - goto out; + goto unlock; } - cur_base = msg->uaddr & PAGE_MASK; - iova &= PAGE_MASK; + pinned = pin_user_pages(msg->uaddr & PAGE_MASK, npages, gup_flags, + page_list, vmas); + if (npages != pinned) { + if (pinned < 0) { + ret = pinned; + } else { + unpin_user_pages(page_list, pinned); + ret = -ENOMEM; + } + goto unlock; + } - while (npages) { - pinned = min_t(unsigned long, npages, list_size); - ret = pin_user_pages(cur_base, pinned, -gup_flags, page_list, NULL); - if (ret != pinned) - goto out; - - if (!last_pfn) - map_pfn = page_to_pfn(page_list[0]); - - for (i = 0; i < ret; i++) { - unsigned 
long this_pfn = page_to_pfn(page_list[i]); - u64 csize; - - if (last_pfn && (this_pfn != last_pfn + 1)) { - /* Pin a contiguous chunk of memory */ - csize = (last_pfn - map_pfn + 1) << PAGE_SHIFT; - if (vhost_vdpa_map(v, iova, csize, - map_pfn << PAGE_SHIFT, - msg->perm)) - goto out; - map_pfn = this_pfn; - iova += csize; + iova &= PAGE_MASK; + map_pfn = page_to_pfn(page_list[0]); + + /* One more iteration to avoid extra vdpa_map() call out of loop. */ + for (i = 0; i <= npages; i++) { + unsigned long this_pfn; + u64 csize; + + /* The last chunk may have no valid PFN
Re: [RFC PATCH 02/24] vhost-vdpa: fix vqs leak in vhost_vdpa_open()
On 2020/9/24 下午3:48, Eli Cohen wrote: On Thu, Sep 24, 2020 at 11:21:03AM +0800, Jason Wang wrote: We need to free vqs during the err path after it has been allocated since vhost won't do that for us. Signed-off-by: Jason Wang --- drivers/vhost/vdpa.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 796fe979f997..9c641274b9f3 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -764,6 +764,12 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v) v->domain = NULL; } +static void vhost_vdpa_cleanup(struct vhost_vdpa *v) +{ + vhost_dev_cleanup(>vdev); + kfree(v->vdev.vqs); +} + Wouldn't it be cleaner to call kfree(vqs) explicilty inside vhost_vdpa_open() in case of failure and keep the symetry of vhost_dev_init()/vhost_dev_cleanup()? That's also fine. See https://www.mail-archive.com/virtualization@lists.linux-foundation.org/msg42558.html I will use that for the next version. Thanks. static int vhost_vdpa_open(struct inode *inode, struct file *filep) { struct vhost_vdpa *v; @@ -809,7 +815,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) return 0; err_init_iotlb: - vhost_dev_cleanup(>vdev); + vhost_vdpa_cleanup(v); err: atomic_dec(>opened); return r; @@ -840,8 +846,7 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep) vhost_vdpa_free_domain(v); vhost_vdpa_config_put(v); vhost_vdpa_clean_irq(v); - vhost_dev_cleanup(>vdev); - kfree(v->vdev.vqs); + vhost_vdpa_cleanup(v); mutex_unlock(>mutex); atomic_dec(>opened); -- 2.20.1
Re: [RFC PATCH 00/24] Control VQ support in vDPA
On 2020/9/24 下午6:17, Stefan Hajnoczi wrote: On Thu, Sep 24, 2020 at 11:21:01AM +0800, Jason Wang wrote: This series tries to add the support for control virtqueue in vDPA. Please include documentation for both driver authors and vhost-vdpa ioctl users. vhost-vdpa ioctls are only documented with a single sentence. Please add full information on arguments, return values, and a high-level explanation of the feature (like this cover letter) to introduce the API. Right, this is in the TODO list. (And we probably need to start with documenting vDPA bus operations first). What is the policy for using virtqueue groups? My guess is: 1. virtio_vdpa simply enables all virtqueue groups. 2. vhost_vdpa relies on userspace policy on how to use virtqueue groups. Are the semantics of virtqueue groups documented somewhere so userspace knows what to do? If a vDPA driver author decides to create N virtqueue groups, N/2 virtqueue groups, or just 1 virtqueue group, how will userspace know what to do? So the mapping from virtqueue to virtqueue group is mandated by the vDPA device(driver). vDPA bus driver (like vhost-vDPA), can only change the association between virtqueue groups and ASID. By default, it is required all virtqueue groups to be associated to address space 0. This make sure virtio_vdpa can work without any special groups/asid configuration. I admit we need document all those semantics/polices. Maybe a document is needed to describe the recommended device-specific virtqueue groups that vDPA drivers should implement (e.g. "put the net control vq into its own virtqueue group")? Yes, note that this depends on the hardware capability actually. It can only put control vq in other virtqueue group if: 1) hardware support to isolate control vq DMA from the rest virtqueues (PASID or simply using PA (translated address) for control vq) or 2) the control vq is emulated by vDPA device driver (like vdpa_sim did). This could become messy with guidelines. 
For example, drivers might be shipped that aren't usable for certain use cases just because the author didn't know that a certain virtqueue grouping is advantageous. Right. BTW I like how general this feature is. It seems to allow vDPA devices to be split into sub-devices for further passthrough. Who will write the first vDPA-on-vDPA driver? :) Yes, that's an interesting question. For now, I can imagine we can emulate SR-IOV based virtio-net VFs via this. If we want to expose the ASID setting to the guest as well, it probably needs more thought. Thanks Stefan
Re: [PATCH v3 -next] vdpa: mlx5: change Kconfig depends to fix build errors
On 2020/9/25 下午6:19, Michael S. Tsirkin wrote: On Fri, Sep 25, 2020 at 10:20:05AM +0300, Leon Romanovsky wrote: On Thu, Sep 24, 2020 at 12:02:43PM -0400, Michael S. Tsirkin wrote: On Thu, Sep 24, 2020 at 08:47:05AM -0700, Randy Dunlap wrote: On 9/24/20 3:24 AM, Eli Cohen wrote: On Thu, Sep 24, 2020 at 05:30:55AM -0400, Michael S. Tsirkin wrote: --- linux-next-20200917.orig/drivers/vdpa/Kconfig +++ linux-next-20200917/drivers/vdpa/Kconfig @@ -31,7 +31,7 @@ config IFCVF config MLX5_VDPA bool "MLX5 VDPA support library for ConnectX devices" - depends on MLX5_CORE + depends on VHOST_IOTLB && MLX5_CORE default n While we are here, can anyone who apply this patch delete the "default n" line? It is by default "n". I can do that Thanks Hmm other drivers select VHOST_IOTLB, why not do the same? v1 used select, but Saeed requested use of depends instead because select can cause problems. I can't see another driver doing that. Perhaps I can set dependency on VHOST which by itself depends on VHOST_IOTLB? help Support library for Mellanox VDPA drivers. Provides code that is Saeed what kind of problems? It's used with select in other places, isn't it? IMHO, "depends" is much more explicit than "select". Thanks This is now how VHOST_IOTLB has been designed though. If you want to change VHOST_IOTLB to depends I think we should do it consistently all over. config VHOST_IOTLB tristate help Generic IOTLB implementation for vhost and vringh. This option is selected by any driver which needs to support an IOMMU in software. Yes, since there's no prompt for VHOST_IOTLB which means, if there's no other symbol that select VHOST_IOTLB, you can't enable MLX5 at all. See kconfig-language.rst: In general use select only for non-visible symbols (no prompts anywhere) and for symbols with no dependencies. That will limit the usefulness but on the other hand avoid the illegal configurations all over. Thanks -- ~Randy
Re: [RFC PATCH 02/24] vhost-vdpa: fix vqs leak in vhost_vdpa_open()
On 2020/9/24 下午5:31, Michael S. Tsirkin wrote: On Thu, Sep 24, 2020 at 11:21:03AM +0800, Jason Wang wrote: We need to free vqs during the err path after it has been allocated since vhost won't do that for us. Signed-off-by: Jason Wang This is a bugfix too right? I don't see it posted separately ... A patch that is functional equivalent is posted here: https://www.mail-archive.com/virtualization@lists.linux-foundation.org/msg42558.html I'm a little bit lazy to use that one since this patch is probably wrote before that one. Thanks --- drivers/vhost/vdpa.c | 11 --- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c index 796fe979f997..9c641274b9f3 100644 --- a/drivers/vhost/vdpa.c +++ b/drivers/vhost/vdpa.c @@ -764,6 +764,12 @@ static void vhost_vdpa_free_domain(struct vhost_vdpa *v) v->domain = NULL; } +static void vhost_vdpa_cleanup(struct vhost_vdpa *v) +{ + vhost_dev_cleanup(>vdev); + kfree(v->vdev.vqs); +} + static int vhost_vdpa_open(struct inode *inode, struct file *filep) { struct vhost_vdpa *v; @@ -809,7 +815,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file *filep) return 0; err_init_iotlb: - vhost_dev_cleanup(>vdev); + vhost_vdpa_cleanup(v); err: atomic_dec(>opened); return r; @@ -840,8 +846,7 @@ static int vhost_vdpa_release(struct inode *inode, struct file *filep) vhost_vdpa_free_domain(v); vhost_vdpa_config_put(v); vhost_vdpa_clean_irq(v); - vhost_dev_cleanup(>vdev); - kfree(v->vdev.vqs); + vhost_vdpa_cleanup(v); mutex_unlock(>mutex); atomic_dec(>opened); -- 2.20.1
Re: [RFC PATCH 01/24] vhost-vdpa: fix backend feature ioctls
On 2020/9/24 下午3:50, Michael S. Tsirkin wrote: On Thu, Sep 24, 2020 at 11:21:02AM +0800, Jason Wang wrote: Commit 653055b9acd4 ("vhost-vdpa: support get/set backend features") introduces two malfunction backend features ioctls: 1) the ioctls was blindly added to vring ioctl instead of vdpa device ioctl 2) vhost_set_backend_features() was called when dev mutex has already been held which will lead a deadlock This patch fixes the above issues. Cc: Eli Cohen Reported-by: Zhu Lingshan Fixes: 653055b9acd4 ("vhost-vdpa: support get/set backend features") Signed-off-by: Jason Wang Don't we want the fixes queued right now, as opposed to the rest of the RFC? Yes, actually I've posted in before[1]. Adding the patch here is to simplify the work for the guys that want to do the work on top. E.g for Cindy to start the Qemu prototype. Thanks [1] https://www.spinics.net/lists/netdev/msg681247.html