[PATCH v3 1/2] vhost: Add worker backend callouts

2023-12-04 Thread Mike Christie
This adds the vhost backend callouts for the worker ioctls added in the
Linux 6.4 kernel commit:

c1ecd8e95007 ("vhost: allow userspace to create workers")

Signed-off-by: Mike Christie 
Reviewed-by: Stefano Garzarella 
Reviewed-by: Stefan Hajnoczi 

---
 hw/virtio/vhost-backend.c | 28 
 include/hw/virtio/vhost-backend.h | 14 ++
 2 files changed, 42 insertions(+)

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 17f3fc6a0823..833804dd40f2 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -158,6 +158,30 @@ static int vhost_kernel_set_vring_busyloop_timeout(struct vhost_dev *dev,
 return vhost_kernel_call(dev, VHOST_SET_VRING_BUSYLOOP_TIMEOUT, s);
 }
 
+static int vhost_kernel_new_worker(struct vhost_dev *dev,
+   struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_NEW_WORKER, worker);
+}
+
+static int vhost_kernel_free_worker(struct vhost_dev *dev,
+struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_FREE_WORKER, worker);
+}
+
+static int vhost_kernel_attach_vring_worker(struct vhost_dev *dev,
+struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_ATTACH_VRING_WORKER, worker);
+}
+
+static int vhost_kernel_get_vring_worker(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker);
+}
+
 static int vhost_kernel_set_features(struct vhost_dev *dev,
  uint64_t features)
 {
@@ -313,6 +337,10 @@ const VhostOps kernel_ops = {
 .vhost_set_vring_err = vhost_kernel_set_vring_err,
 .vhost_set_vring_busyloop_timeout =
 vhost_kernel_set_vring_busyloop_timeout,
+.vhost_get_vring_worker = vhost_kernel_get_vring_worker,
+.vhost_attach_vring_worker = vhost_kernel_attach_vring_worker,
+.vhost_new_worker = vhost_kernel_new_worker,
+.vhost_free_worker = vhost_kernel_free_worker,
 .vhost_set_features = vhost_kernel_set_features,
 .vhost_get_features = vhost_kernel_get_features,
 .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index a86d103f8245..70c2e8ffeee5 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -45,6 +45,8 @@ struct vhost_memory;
 struct vhost_vring_file;
 struct vhost_vring_state;
 struct vhost_vring_addr;
+struct vhost_vring_worker;
+struct vhost_worker_state;
 struct vhost_scsi_target;
 struct vhost_iotlb_msg;
 struct vhost_virtqueue;
@@ -85,6 +87,14 @@ typedef int (*vhost_set_vring_err_op)(struct vhost_dev *dev,
   struct vhost_vring_file *file);
 typedef int (*vhost_set_vring_busyloop_timeout_op)(struct vhost_dev *dev,
                                                    struct vhost_vring_state *r);
+typedef int (*vhost_attach_vring_worker_op)(struct vhost_dev *dev,
+struct vhost_vring_worker *worker);
+typedef int (*vhost_get_vring_worker_op)(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker);
+typedef int (*vhost_new_worker_op)(struct vhost_dev *dev,
+   struct vhost_worker_state *worker);
+typedef int (*vhost_free_worker_op)(struct vhost_dev *dev,
+struct vhost_worker_state *worker);
 typedef int (*vhost_set_features_op)(struct vhost_dev *dev,
  uint64_t features);
 typedef int (*vhost_get_features_op)(struct vhost_dev *dev,
@@ -172,6 +182,10 @@ typedef struct VhostOps {
 vhost_set_vring_call_op vhost_set_vring_call;
 vhost_set_vring_err_op vhost_set_vring_err;
 vhost_set_vring_busyloop_timeout_op vhost_set_vring_busyloop_timeout;
+vhost_new_worker_op vhost_new_worker;
+vhost_free_worker_op vhost_free_worker;
+vhost_get_vring_worker_op vhost_get_vring_worker;
+vhost_attach_vring_worker_op vhost_attach_vring_worker;
 vhost_set_features_op vhost_set_features;
 vhost_get_features_op vhost_get_features;
 vhost_set_backend_cap_op vhost_set_backend_cap;
-- 
2.34.1




[PATCH v3 0/2] vhost-scsi: Support worker ioctls

2023-12-04 Thread Mike Christie
The following patches allow users to configure the vhost worker threads
for vhost-scsi. With vhost-net we get a worker thread per rx/tx virtqueue
pair, but for vhost-scsi we get one worker for all virtqueues. This
becomes a bottleneck after 2 queues are used.

In the upstream Linux kernel commit:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/vhost/vhost.c?id=c1ecd8e9500797748ae4f79657971955d452d69d

we enabled the vhost layer to be able to create a worker thread and
attach it to a virtqueue.

This patchset adds support to vhost-scsi to use these ioctls so we are
no longer limited to the single worker.
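For reference, the uapi for these ioctls is small. The structures below are
paraphrased from the vhost_types.h additions in that commit (check your
kernel headers for the authoritative definitions):

struct vhost_worker_state {
    /*
     * For VHOST_NEW_WORKER the kernel returns the new vhost_worker id.
     * For VHOST_FREE_WORKER this must be set to the id of the vhost_worker
     * to free.
     */
    unsigned int worker_id;
};

struct vhost_vring_worker {
    /* vring index */
    unsigned int index;
    /* The id of the vhost_worker returned from VHOST_NEW_WORKER */
    unsigned int worker_id;
};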

v3:
- Warn user if they have set worker_per_virtqueue=true but the kernel
doesn't support it.
v2:
- Make config option a bool instead of an int.





[PATCH v3 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-12-04 Thread Mike Christie
This adds support for vhost-scsi to be able to create a worker thread
per virtqueue. Right now for vhost-net we get a worker thread per
tx/rx virtqueue pair which scales nicely as we add more virtqueues and
CPUs, but for scsi we get the single worker thread that's shared by all
virtqueues. When trying to send IO to more than 2 virtqueues the single
thread becomes a bottleneck.

This patch adds a new setting, worker_per_virtqueue, which can be set
to:

false: Existing behavior where we get the single worker thread.
true: Create a worker per IO virtqueue.
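For anyone wanting to try this, the command line would look something like
the following (the WWPN and queue count are made-up example values):

  -device vhost-scsi-pci,wwpn=naa.5001405df3e34a84,num_queues=8,worker_per_virtqueue=true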

Signed-off-by: Mike Christie 
Reviewed-by: Stefan Hajnoczi 

---
 hw/scsi/vhost-scsi.c| 62 +
 include/hw/virtio/virtio-scsi.h |  1 +
 2 files changed, 63 insertions(+)

diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 3126df9e1d9d..08aa7534df51 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -165,6 +165,59 @@ static const VMStateDescription vmstate_virtio_vhost_scsi = {
 .pre_save = vhost_scsi_pre_save,
 };
 
+static int vhost_scsi_set_workers(VHostSCSICommon *vsc, bool per_virtqueue)
+{
+struct vhost_dev *dev = &vsc->dev;
+struct vhost_vring_worker vq_worker;
+struct vhost_worker_state worker;
+int i, ret;
+
+/* Use default worker */
+if (!per_virtqueue || dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
+return 0;
+}
+
+/*
+ * ctl/evt share the first worker since it will be rare for them
+ * to send cmds while IO is running.
+ */
+for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
+memset(&worker, 0, sizeof(worker));
+
+ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
+if (ret == -ENOTTY) {
+/*
+ * worker ioctls are not implemented, so just ignore them and
+ * continue device setup.
+ */
+warn_report("vhost-scsi: Backend supports a single worker. "
+"Ignoring worker_per_virtqueue=true setting.");
+ret = 0;
+break;
+} else if (ret) {
+break;
+}
+
+memset(&vq_worker, 0, sizeof(vq_worker));
+vq_worker.worker_id = worker.worker_id;
+vq_worker.index = i;
+
+ret = dev->vhost_ops->vhost_attach_vring_worker(dev, &vq_worker);
+if (ret == -ENOTTY) {
+/*
+ * It's a bug for the kernel to have supported the worker creation
+ * ioctl but not attach.
+ */
+dev->vhost_ops->vhost_free_worker(dev, &worker);
+break;
+} else if (ret) {
+break;
+}
+}
+
+return ret;
+}
+
 static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 {
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
@@ -232,6 +285,13 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 goto free_vqs;
 }
 
+ret = vhost_scsi_set_workers(vsc, vs->conf.worker_per_virtqueue);
+if (ret < 0) {
+error_setg(errp, "vhost-scsi: vhost worker setup failed: %s",
+   strerror(-ret));
+goto free_vqs;
+}
+
 /* At present, channel and lun both are 0 for bootable vhost-scsi disk */
 vsc->channel = 0;
 vsc->lun = 0;
@@ -297,6 +357,8 @@ static Property vhost_scsi_properties[] = {
  VIRTIO_SCSI_F_T10_PI,
  false),
 DEFINE_PROP_BOOL("migratable", VHostSCSICommon, migratable, false),
+DEFINE_PROP_BOOL("worker_per_virtqueue", VirtIOSCSICommon,
+ conf.worker_per_virtqueue, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 779568ab5d28..0e9a1867665e 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -51,6 +51,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig;
 struct VirtIOSCSIConf {
 uint32_t num_queues;
 uint32_t virtqueue_size;
+bool worker_per_virtqueue;
 bool seg_max_adjust;
 uint32_t max_sectors;
 uint32_t cmd_per_lun;
-- 
2.34.1




Re: [PATCH v2 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-29 Thread Mike Christie
On 11/29/23 3:30 AM, Stefano Garzarella wrote:
> On Sun, Nov 26, 2023 at 06:28:34PM -0600, Mike Christie wrote:
>> This adds support for vhost-scsi to be able to create a worker thread
>> per virtqueue. Right now for vhost-net we get a worker thread per
>> tx/rx virtqueue pair which scales nicely as we add more virtqueues and
>> CPUs, but for scsi we get the single worker thread that's shared by all
>> virtqueues. When trying to send IO to more than 2 virtqueues the single
>> thread becomes a bottleneck.
>>
>> This patch adds a new setting, worker_per_virtqueue, which can be set
>> to:
>>
>> false: Existing behavior where we get the single worker thread.
>> true: Create a worker per IO virtqueue.
>>
>> Signed-off-by: Mike Christie 
>> ---
>> hw/scsi/vhost-scsi.c    | 60 +
>> include/hw/virtio/virtio-scsi.h |  1 +
>> 2 files changed, 61 insertions(+)
>>
>> diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
>> index 3126df9e1d9d..77eef9474c23 100644
>> --- a/hw/scsi/vhost-scsi.c
>> +++ b/hw/scsi/vhost-scsi.c
>> @@ -165,6 +165,57 @@ static const VMStateDescription vmstate_virtio_vhost_scsi = {
>>     .pre_save = vhost_scsi_pre_save,
>> };
>>
>> +static int vhost_scsi_set_workers(VHostSCSICommon *vsc, bool per_virtqueue)
>> +{
>> +    struct vhost_dev *dev = &vsc->dev;
>> +    struct vhost_vring_worker vq_worker;
>> +    struct vhost_worker_state worker;
>> +    int i, ret;
>> +
>> +    /* Use default worker */
>> +    if (!per_virtqueue || dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
>> +    return 0;
>> +    }
>> +
>> +    /*
>> + * ctl/evt share the first worker since it will be rare for them
>> + * to send cmds while IO is running.
>> + */
>> +    for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
>> +    memset(&worker, 0, sizeof(worker));
>> +
>> +    ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
>> +    if (ret == -ENOTTY) {
>> +    /*
>> + * worker ioctls are not implemented, so just ignore them and
>> + * continue device setup.
>> + */
> 
> IIUC here the user has asked to use a worker for each virtqueue, but the
> kernel does not support it so we ignore it.
> 
> Should we at least print a warning?
> 

We should. I'll add it.
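For reference, v3 of the patch above addresses this with:

    warn_report("vhost-scsi: Backend supports a single worker. "
                "Ignoring worker_per_virtqueue=true setting.");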




[PATCH v2 1/2] vhost: Add worker backend callouts

2023-11-26 Thread Mike Christie
This adds the vhost backend callouts for the worker ioctls added in the
Linux 6.4 kernel commit:

c1ecd8e95007 ("vhost: allow userspace to create workers")

Signed-off-by: Mike Christie 
---
 hw/virtio/vhost-backend.c | 28 
 include/hw/virtio/vhost-backend.h | 14 ++
 2 files changed, 42 insertions(+)

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 17f3fc6a0823..833804dd40f2 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -158,6 +158,30 @@ static int vhost_kernel_set_vring_busyloop_timeout(struct vhost_dev *dev,
 return vhost_kernel_call(dev, VHOST_SET_VRING_BUSYLOOP_TIMEOUT, s);
 }
 
+static int vhost_kernel_new_worker(struct vhost_dev *dev,
+   struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_NEW_WORKER, worker);
+}
+
+static int vhost_kernel_free_worker(struct vhost_dev *dev,
+struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_FREE_WORKER, worker);
+}
+
+static int vhost_kernel_attach_vring_worker(struct vhost_dev *dev,
+struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_ATTACH_VRING_WORKER, worker);
+}
+
+static int vhost_kernel_get_vring_worker(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker);
+}
+
 static int vhost_kernel_set_features(struct vhost_dev *dev,
  uint64_t features)
 {
@@ -313,6 +337,10 @@ const VhostOps kernel_ops = {
 .vhost_set_vring_err = vhost_kernel_set_vring_err,
 .vhost_set_vring_busyloop_timeout =
 vhost_kernel_set_vring_busyloop_timeout,
+.vhost_get_vring_worker = vhost_kernel_get_vring_worker,
+.vhost_attach_vring_worker = vhost_kernel_attach_vring_worker,
+.vhost_new_worker = vhost_kernel_new_worker,
+.vhost_free_worker = vhost_kernel_free_worker,
 .vhost_set_features = vhost_kernel_set_features,
 .vhost_get_features = vhost_kernel_get_features,
 .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index 96ccc18cd33b..9f16d0884e8f 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -33,6 +33,8 @@ struct vhost_memory;
 struct vhost_vring_file;
 struct vhost_vring_state;
 struct vhost_vring_addr;
+struct vhost_vring_worker;
+struct vhost_worker_state;
 struct vhost_scsi_target;
 struct vhost_iotlb_msg;
 struct vhost_virtqueue;
@@ -73,6 +75,14 @@ typedef int (*vhost_set_vring_err_op)(struct vhost_dev *dev,
   struct vhost_vring_file *file);
 typedef int (*vhost_set_vring_busyloop_timeout_op)(struct vhost_dev *dev,
                                                    struct vhost_vring_state *r);
+typedef int (*vhost_attach_vring_worker_op)(struct vhost_dev *dev,
+struct vhost_vring_worker *worker);
+typedef int (*vhost_get_vring_worker_op)(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker);
+typedef int (*vhost_new_worker_op)(struct vhost_dev *dev,
+   struct vhost_worker_state *worker);
+typedef int (*vhost_free_worker_op)(struct vhost_dev *dev,
+struct vhost_worker_state *worker);
 typedef int (*vhost_set_features_op)(struct vhost_dev *dev,
  uint64_t features);
 typedef int (*vhost_get_features_op)(struct vhost_dev *dev,
@@ -151,6 +161,10 @@ typedef struct VhostOps {
 vhost_set_vring_call_op vhost_set_vring_call;
 vhost_set_vring_err_op vhost_set_vring_err;
 vhost_set_vring_busyloop_timeout_op vhost_set_vring_busyloop_timeout;
+vhost_new_worker_op vhost_new_worker;
+vhost_free_worker_op vhost_free_worker;
+vhost_get_vring_worker_op vhost_get_vring_worker;
+vhost_attach_vring_worker_op vhost_attach_vring_worker;
 vhost_set_features_op vhost_set_features;
 vhost_get_features_op vhost_get_features;
 vhost_set_backend_cap_op vhost_set_backend_cap;
-- 
2.34.1




[PATCH v2 0/2] vhost-scsi: Support worker ioctls

2023-11-26 Thread Mike Christie
The following patches allow users to configure the vhost worker threads
for vhost-scsi. With vhost-net we get a worker thread per rx/tx virtqueue
pair, but for vhost-scsi we get one worker for all virtqueues. This
becomes a bottleneck after 2 queues are used.

In the upstream Linux kernel commit:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/vhost/vhost.c?id=c1ecd8e9500797748ae4f79657971955d452d69d

we enabled the vhost layer to be able to create a worker thread and
attach it to a virtqueue.

This patchset adds support to vhost-scsi to use these ioctls so we are
no longer limited to the single worker.

v2:
- Make config option a bool instead of an int.





[PATCH v2 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-26 Thread Mike Christie
This adds support for vhost-scsi to be able to create a worker thread
per virtqueue. Right now for vhost-net we get a worker thread per
tx/rx virtqueue pair which scales nicely as we add more virtqueues and
CPUs, but for scsi we get the single worker thread that's shared by all
virtqueues. When trying to send IO to more than 2 virtqueues the single
thread becomes a bottleneck.

This patch adds a new setting, worker_per_virtqueue, which can be set
to:

false: Existing behavior where we get the single worker thread.
true: Create a worker per IO virtqueue.

Signed-off-by: Mike Christie 
---
 hw/scsi/vhost-scsi.c| 60 +
 include/hw/virtio/virtio-scsi.h |  1 +
 2 files changed, 61 insertions(+)

diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 3126df9e1d9d..77eef9474c23 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -165,6 +165,57 @@ static const VMStateDescription vmstate_virtio_vhost_scsi = {
 .pre_save = vhost_scsi_pre_save,
 };
 
+static int vhost_scsi_set_workers(VHostSCSICommon *vsc, bool per_virtqueue)
+{
+struct vhost_dev *dev = &vsc->dev;
+struct vhost_vring_worker vq_worker;
+struct vhost_worker_state worker;
+int i, ret;
+
+/* Use default worker */
+if (!per_virtqueue || dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
+return 0;
+}
+
+/*
+ * ctl/evt share the first worker since it will be rare for them
+ * to send cmds while IO is running.
+ */
+for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
+memset(&worker, 0, sizeof(worker));
+
+ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
+if (ret == -ENOTTY) {
+/*
+ * worker ioctls are not implemented, so just ignore them and
+ * continue device setup.
+ */
+ret = 0;
+break;
+} else if (ret) {
+break;
+}
+
+memset(&vq_worker, 0, sizeof(vq_worker));
+vq_worker.worker_id = worker.worker_id;
+vq_worker.index = i;
+
+ret = dev->vhost_ops->vhost_attach_vring_worker(dev, &vq_worker);
+if (ret == -ENOTTY) {
+/*
+ * It's a bug for the kernel to have supported the worker creation
+ * ioctl but not attach.
+ */
+dev->vhost_ops->vhost_free_worker(dev, &worker);
+break;
+} else if (ret) {
+break;
+}
+}
+
+return ret;
+}
+
 static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 {
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
@@ -232,6 +283,13 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 goto free_vqs;
 }
 
+ret = vhost_scsi_set_workers(vsc, vs->conf.worker_per_virtqueue);
+if (ret < 0) {
+error_setg(errp, "vhost-scsi: vhost worker setup failed: %s",
+   strerror(-ret));
+goto free_vqs;
+}
+
 /* At present, channel and lun both are 0 for bootable vhost-scsi disk */
 vsc->channel = 0;
 vsc->lun = 0;
@@ -297,6 +355,8 @@ static Property vhost_scsi_properties[] = {
  VIRTIO_SCSI_F_T10_PI,
  false),
 DEFINE_PROP_BOOL("migratable", VHostSCSICommon, migratable, false),
+DEFINE_PROP_BOOL("worker_per_virtqueue", VirtIOSCSICommon,
+ conf.worker_per_virtqueue, false),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 779568ab5d28..0e9a1867665e 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -51,6 +51,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig;
 struct VirtIOSCSIConf {
 uint32_t num_queues;
 uint32_t virtqueue_size;
+bool worker_per_virtqueue;
 bool seg_max_adjust;
 uint32_t max_sectors;
 uint32_t cmd_per_lun;
-- 
2.34.1




Re: [PATCH 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-15 Thread Mike Christie
On 11/15/23 6:57 AM, Stefan Hajnoczi wrote:
> On Wed, Nov 15, 2023 at 12:43:02PM +0100, Stefano Garzarella wrote:
>> On Mon, Nov 13, 2023 at 06:36:44PM -0600, Mike Christie wrote:
>>> This adds support for vhost-scsi to be able to create a worker thread
>>> per virtqueue. Right now for vhost-net we get a worker thread per
>>> tx/rx virtqueue pair which scales nicely as we add more virtqueues and
>>> CPUs, but for scsi we get the single worker thread that's shared by all
>>> virtqueues. When trying to send IO to more than 2 virtqueues the single
>>> thread becomes a bottleneck.
>>>
>>> This patch adds a new setting, virtqueue_workers, which can be set to:
>>>
>>> 1: Existing behavior where we get the single thread.
>>> -1: Create a worker per IO virtqueue.
>>
>> I find this setting a bit odd. What about a boolean instead?
>>
>> `per_virtqueue_workers`:
>> false: Existing behavior where we get the single thread.
>> true: Create a worker per IO virtqueue.
> 
> Me too, I thought there would be round-robin assignment for 1 <
> worker_cnt < (dev->nvqs - VHOST_SCSI_VQ_NUM_FIXED) but instead only 1
> and -1 have any meaning.
> 
> Do you want to implement round-robin assignment?
> 

It was an int because I originally did round robin but at some point
dropped it. I found that our users, at least:

1. Are used to configuring the number of virtqueues.
2. In the guest OS, are used to checking the queue-to-CPU mappings to
figure out how their app should optimize itself (see the example below).
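A minimal sketch of that guest-side check, assuming a blk-mq disk at sda
(the device name is just an example):

  cat /sys/block/sda/mq/*/cpu_list

Each hw queue's cpu_list shows the CPUs that submit to it.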

So users would just do a virtqueue per vCPU, or if trying to reduce
memory usage would do N virtqueues < vCPUs. In both cases they just did
a worker per virtqueue.

However, I left it an int in case someone wanted round robin in the
future.





Re: [PATCH 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-15 Thread Mike Christie
On 11/15/23 5:43 AM, Stefano Garzarella wrote:
> On Mon, Nov 13, 2023 at 06:36:44PM -0600, Mike Christie wrote:
>> This adds support for vhost-scsi to be able to create a worker thread
>> per virtqueue. Right now for vhost-net we get a worker thread per
>> tx/rx virtqueue pair which scales nicely as we add more virtqueues and
>> CPUs, but for scsi we get the single worker thread that's shared by all
>> virtqueues. When trying to send IO to more than 2 virtqueues the single
>> thread becomes a bottleneck.
>>
>> This patch adds a new setting, virtqueue_workers, which can be set to:
>>
>> 1: Existing behavior where we get the single thread.
>> -1: Create a worker per IO virtqueue.
> 
> I find this setting a bit odd. What about a boolean instead?
> 
> `per_virtqueue_workers`:
>     false: Existing behavior where we get the single thread.
>     true: Create a worker per IO virtqueue.

Sound good.


> 
>>
>> Signed-off-by: Mike Christie 
>> ---
>> hw/scsi/vhost-scsi.c    | 68 +
>> include/hw/virtio/virtio-scsi.h |  1 +
>> 2 files changed, 69 insertions(+)
>>
>> diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
>> index 3126df9e1d9d..5cf669b6563b 100644
>> --- a/hw/scsi/vhost-scsi.c
>> +++ b/hw/scsi/vhost-scsi.c
>> @@ -31,6 +31,9 @@
>> #include "qemu/cutils.h"
>> #include "sysemu/sysemu.h"
>>
>> +#define VHOST_SCSI_WORKER_PER_VQ    -1
>> +#define VHOST_SCSI_WORKER_DEF    1
>> +
>> /* Features supported by host kernel. */
>> static const int kernel_feature_bits[] = {
>>     VIRTIO_F_NOTIFY_ON_EMPTY,
>> @@ -165,6 +168,62 @@ static const VMStateDescription vmstate_virtio_vhost_scsi = {
>>     .pre_save = vhost_scsi_pre_save,
>> };
>>
>> +static int vhost_scsi_set_workers(VHostSCSICommon *vsc, int workers_cnt)
>> +{
>> +    struct vhost_dev *dev = &vsc->dev;
>> +    struct vhost_vring_worker vq_worker;
>> +    struct vhost_worker_state worker;
>> +    int i, ret;
>> +
>> +    /* Use default worker */
>> +    if (workers_cnt == VHOST_SCSI_WORKER_DEF ||
>> +    dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
>> +    return 0;
>> +    }
>> +
>> +    if (workers_cnt != VHOST_SCSI_WORKER_PER_VQ) {
>> +    return -EINVAL;
>> +    }
>> +
>> +    /*
>> + * ctl/evt share the first worker since it will be rare for them
>> + * to send cmds while IO is running.
>> + */
>> +    for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
>> +    memset(&worker, 0, sizeof(worker));
>> +
>> +    ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
> 
> Should we call vhost_free_worker() in the vhost_scsi_unrealize() or are
> workers automatically freed when `vhostfd` is closed?
> 

All worker threads are freed automatically like how the default worker
created from VHOST_SET_OWNER is freed on close.




[PATCH 1/2] vhost: Add worker backend callouts

2023-11-13 Thread Mike Christie
This adds the vhost backend callouts for the worker ioctls added in the
Linux 6.4 kernel commit:

c1ecd8e95007 ("vhost: allow userspace to create workers")

Signed-off-by: Mike Christie 
---
 hw/virtio/vhost-backend.c | 28 
 include/hw/virtio/vhost-backend.h | 14 ++
 2 files changed, 42 insertions(+)

diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 17f3fc6a0823..833804dd40f2 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -158,6 +158,30 @@ static int vhost_kernel_set_vring_busyloop_timeout(struct vhost_dev *dev,
 return vhost_kernel_call(dev, VHOST_SET_VRING_BUSYLOOP_TIMEOUT, s);
 }
 
+static int vhost_kernel_new_worker(struct vhost_dev *dev,
+   struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_NEW_WORKER, worker);
+}
+
+static int vhost_kernel_free_worker(struct vhost_dev *dev,
+struct vhost_worker_state *worker)
+{
+return vhost_kernel_call(dev, VHOST_FREE_WORKER, worker);
+}
+
+static int vhost_kernel_attach_vring_worker(struct vhost_dev *dev,
+struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_ATTACH_VRING_WORKER, worker);
+}
+
+static int vhost_kernel_get_vring_worker(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker)
+{
+return vhost_kernel_call(dev, VHOST_GET_VRING_WORKER, worker);
+}
+
 static int vhost_kernel_set_features(struct vhost_dev *dev,
  uint64_t features)
 {
@@ -313,6 +337,10 @@ const VhostOps kernel_ops = {
 .vhost_set_vring_err = vhost_kernel_set_vring_err,
 .vhost_set_vring_busyloop_timeout =
 vhost_kernel_set_vring_busyloop_timeout,
+.vhost_get_vring_worker = vhost_kernel_get_vring_worker,
+.vhost_attach_vring_worker = vhost_kernel_attach_vring_worker,
+.vhost_new_worker = vhost_kernel_new_worker,
+.vhost_free_worker = vhost_kernel_free_worker,
 .vhost_set_features = vhost_kernel_set_features,
 .vhost_get_features = vhost_kernel_get_features,
 .vhost_set_backend_cap = vhost_kernel_set_backend_cap,
diff --git a/include/hw/virtio/vhost-backend.h b/include/hw/virtio/vhost-backend.h
index 96ccc18cd33b..9f16d0884e8f 100644
--- a/include/hw/virtio/vhost-backend.h
+++ b/include/hw/virtio/vhost-backend.h
@@ -33,6 +33,8 @@ struct vhost_memory;
 struct vhost_vring_file;
 struct vhost_vring_state;
 struct vhost_vring_addr;
+struct vhost_vring_worker;
+struct vhost_worker_state;
 struct vhost_scsi_target;
 struct vhost_iotlb_msg;
 struct vhost_virtqueue;
@@ -73,6 +75,14 @@ typedef int (*vhost_set_vring_err_op)(struct vhost_dev *dev,
   struct vhost_vring_file *file);
 typedef int (*vhost_set_vring_busyloop_timeout_op)(struct vhost_dev *dev,
                                                    struct vhost_vring_state *r);
+typedef int (*vhost_attach_vring_worker_op)(struct vhost_dev *dev,
+struct vhost_vring_worker *worker);
+typedef int (*vhost_get_vring_worker_op)(struct vhost_dev *dev,
+ struct vhost_vring_worker *worker);
+typedef int (*vhost_new_worker_op)(struct vhost_dev *dev,
+   struct vhost_worker_state *worker);
+typedef int (*vhost_free_worker_op)(struct vhost_dev *dev,
+struct vhost_worker_state *worker);
 typedef int (*vhost_set_features_op)(struct vhost_dev *dev,
  uint64_t features);
 typedef int (*vhost_get_features_op)(struct vhost_dev *dev,
@@ -151,6 +161,10 @@ typedef struct VhostOps {
 vhost_set_vring_call_op vhost_set_vring_call;
 vhost_set_vring_err_op vhost_set_vring_err;
 vhost_set_vring_busyloop_timeout_op vhost_set_vring_busyloop_timeout;
+vhost_new_worker_op vhost_new_worker;
+vhost_free_worker_op vhost_free_worker;
+vhost_get_vring_worker_op vhost_get_vring_worker;
+vhost_attach_vring_worker_op vhost_attach_vring_worker;
 vhost_set_features_op vhost_set_features;
 vhost_get_features_op vhost_get_features;
 vhost_set_backend_cap_op vhost_set_backend_cap;
-- 
2.34.1




[PATCH 0/2] vhost-scsi: Support worker ioctls

2023-11-13 Thread Mike Christie
The following patches allow users to configure the vhost worker threads
for vhost-scsi. With vhost-net we get a worker thread per rx/tx virtqueue
pair, but for vhost-scsi we get one worker for all virtqueues. This
becomes a bottleneck after 2 queues are used.

In the upstream Linux kernel commit:

https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/commit/drivers/vhost/vhost.c?id=c1ecd8e9500797748ae4f79657971955d452d69d

we enabled the vhost layer to be able to create a worker thread and
attach it to a virtqueue.

This patchset adds support to vhost-scsi to use these ioctls so we are
no longer limited to the single worker.





[PATCH 2/2] vhost-scsi: Add support for a worker thread per virtqueue

2023-11-13 Thread Mike Christie
This adds support for vhost-scsi to be able to create a worker thread
per virtqueue. Right now for vhost-net we get a worker thread per
tx/rx virtqueue pair which scales nicely as we add more virtqueues and
CPUs, but for scsi we get the single worker thread that's shared by all
virtqueues. When trying to send IO to more than 2 virtqueues the single
thread becomes a bottleneck.

This patch adds a new setting, virtqueue_workers, which can be set to:

1: Existing behavior where we get the single thread.
-1: Create a worker per IO virtqueue.

Signed-off-by: Mike Christie 
---
 hw/scsi/vhost-scsi.c| 68 +
 include/hw/virtio/virtio-scsi.h |  1 +
 2 files changed, 69 insertions(+)

diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 3126df9e1d9d..5cf669b6563b 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -31,6 +31,9 @@
 #include "qemu/cutils.h"
 #include "sysemu/sysemu.h"
 
+#define VHOST_SCSI_WORKER_PER_VQ    -1
+#define VHOST_SCSI_WORKER_DEF    1
+
 /* Features supported by host kernel. */
 static const int kernel_feature_bits[] = {
 VIRTIO_F_NOTIFY_ON_EMPTY,
@@ -165,6 +168,62 @@ static const VMStateDescription vmstate_virtio_vhost_scsi = {
 .pre_save = vhost_scsi_pre_save,
 };
 
+static int vhost_scsi_set_workers(VHostSCSICommon *vsc, int workers_cnt)
+{
+struct vhost_dev *dev = &vsc->dev;
+struct vhost_vring_worker vq_worker;
+struct vhost_worker_state worker;
+int i, ret;
+
+/* Use default worker */
+if (workers_cnt == VHOST_SCSI_WORKER_DEF ||
+dev->nvqs == VHOST_SCSI_VQ_NUM_FIXED + 1) {
+return 0;
+}
+
+if (workers_cnt != VHOST_SCSI_WORKER_PER_VQ) {
+return -EINVAL;
+}
+
+/*
+ * ctl/evt share the first worker since it will be rare for them
+ * to send cmds while IO is running.
+ */
+for (i = VHOST_SCSI_VQ_NUM_FIXED + 1; i < dev->nvqs; i++) {
+memset(&worker, 0, sizeof(worker));
+
+ret = dev->vhost_ops->vhost_new_worker(dev, &worker);
+if (ret == -ENOTTY) {
+/*
+ * worker ioctls are not implemented, so just ignore them and
+ * continue device setup.
+ */
+ret = 0;
+break;
+} else if (ret) {
+break;
+}
+
+memset(&vq_worker, 0, sizeof(vq_worker));
+vq_worker.worker_id = worker.worker_id;
+vq_worker.index = i;
+
+ret = dev->vhost_ops->vhost_attach_vring_worker(dev, &vq_worker);
+if (ret == -ENOTTY) {
+/*
+ * It's a bug for the kernel to have supported the worker creation
+ * ioctl but not attach.
+ */
+dev->vhost_ops->vhost_free_worker(dev, &worker);
+break;
+} else if (ret) {
+break;
+}
+}
+
+return ret;
+}
+
 static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 {
 VirtIOSCSICommon *vs = VIRTIO_SCSI_COMMON(dev);
@@ -232,6 +291,13 @@ static void vhost_scsi_realize(DeviceState *dev, Error **errp)
 goto free_vqs;
 }
 
+ret = vhost_scsi_set_workers(vsc, vs->conf.virtqueue_workers);
+if (ret < 0) {
+error_setg(errp, "vhost-scsi: vhost worker setup failed: %s",
+   strerror(-ret));
+goto free_vqs;
+}
+
 /* At present, channel and lun both are 0 for bootable vhost-scsi disk */
 vsc->channel = 0;
 vsc->lun = 0;
@@ -297,6 +363,8 @@ static Property vhost_scsi_properties[] = {
  VIRTIO_SCSI_F_T10_PI,
  false),
 DEFINE_PROP_BOOL("migratable", VHostSCSICommon, migratable, false),
+DEFINE_PROP_INT32("virtqueue_workers", VirtIOSCSICommon,
+  conf.virtqueue_workers, VHOST_SCSI_WORKER_DEF),
 DEFINE_PROP_END_OF_LIST(),
 };
 
diff --git a/include/hw/virtio/virtio-scsi.h b/include/hw/virtio/virtio-scsi.h
index 779568ab5d28..f70624ece564 100644
--- a/include/hw/virtio/virtio-scsi.h
+++ b/include/hw/virtio/virtio-scsi.h
@@ -51,6 +51,7 @@ typedef struct virtio_scsi_config VirtIOSCSIConfig;
 struct VirtIOSCSIConf {
 uint32_t num_queues;
 uint32_t virtqueue_size;
+int virtqueue_workers;
 bool seg_max_adjust;
 uint32_t max_sectors;
 uint32_t cmd_per_lun;
-- 
2.34.1




Re: [PATCH] Revert "virtio-scsi: Send "REPORTED LUNS CHANGED" sense data upon disk hotplug events"

2023-07-11 Thread Mike Christie
What was the issue you are seeing?

Was it something like: you get the UA, we retry, and then on one of the
retries the sense is not set up correctly, so the SCSI error handler
runs? That fails and the device goes offline?

If you turn on scsi debugging you would see:


[  335.445922] sd 0:0:0:0: [sda] tag#15 Add. Sense: Reported luns data has changed
[  335.445922] sd 0:0:0:0: [sda] tag#16 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  335.445925] sd 0:0:0:0: [sda] tag#16 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  335.445929] sd 0:0:0:0: [sda] tag#17 Done: FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
[  335.445932] sd 0:0:0:0: [sda] tag#17 CDB: Write(10) 2a 00 00 db 4f c0 00 00 20 00
[  335.445934] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  335.445936] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  335.445938] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  335.445940] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  335.445942] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  335.445945] sd 0:0:0:0: [sda] tag#17 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[  335.451447] scsi host0: scsi_eh_0: waking up 0/2/2
[  335.451453] scsi host0: Total of 2 commands on 1 devices require eh work
[  335.451457] sd 0:0:0:0: [sda] tag#16 scsi_eh_0: requesting sense


I don't know the QEMU SCSI code well, but I scanned the code for my
co-worker, and my guess was that commit 8cc5583abe6419e7faaebc9fbd109f34f4c850f2
had a race in it.

How is locking done when it is a bus-level UA but there are multiple
devices on the bus?

Is it possible that devA is clearing the sense on devB? For example, thread1
for devA is doing scsi_clear_unit_attention, but thread2 for devB has already
seen bus->unit_attention, so it set the req ops to reqops_unit_attention. But
when we run reqops_unit_attention.send_command, scsi_unit_attention does not
see req->bus->unit_attention set anymore, so we get a CC with no sense.
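Spelled out as a timeline, the suspected interleaving would be something like
the sketch below. This just restates the hypothesis above; the function names
are from QEMU's SCSI emulation, but the ordering is a guess, not a confirmed
trace:

/* thread1 (request on devA)         thread2 (request on devB)
 *
 * sees bus->unit_attention set
 *                                   sees bus->unit_attention set,
 *                                   picks reqops_unit_attention
 * scsi_clear_unit_attention()
 *   clears bus->unit_attention
 *                                   reqops_unit_attention.send_command ->
 *                                   scsi_unit_attention() finds no UA
 *                                   -> CHECK CONDITION with no sense
 */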

If the Linux kernel SCSI layer sees a CC with no sense, then it fires the
SCSI error handler, as seen above in the logs.


On 7/11/23 12:06 PM, Stefano Garzarella wrote:
> CCing `./scripts/get_maintainer.pl -f drivers/scsi/virtio_scsi.c`,
> since I found a few things in the virtio-scsi driver...
> 
> FYI we have seen that Linux has problems with a QEMU patch for the
> virtio-scsi device (details at the bottom of this email in the revert
> commit message and BZ).
> 
> 
> This is what I found when I looked at the Linux code:
> 
> In scsi_report_sense() in linux/drivers/scsi/scsi_error.c, Linux calls
> scsi_report_lun_change(), which sets `sdev_target->expecting_lun_change =
> 1` when we receive a UNIT ATTENTION with REPORT LUNS CHANGED
> (sshdr->asc == 0x3f && sshdr->ascq == 0x0e).
> 
> When `sdev_target->expecting_lun_change = 1` is set and we call
> scsi_check_sense(), for example to check the next UNIT ATTENTION, it
> will return NEEDS_RETRY, that I think will cause the issues we are
> seeing.
> 
> `sdev_target->expecting_lun_change` is reset only in
> scsi_decide_disposition() when `REPORT_LUNS` command returns with
> SAM_STAT_GOOD.
> That command is issued in scsi_report_lun_scan() called by
> __scsi_scan_target(), called for example by scsi_scan_target(),
> scsi_scan_host(), etc.
> 
> So, checking QEMU, we send VIRTIO_SCSI_EVT_RESET_RESCAN during hotplug
> and VIRTIO_SCSI_EVT_RESET_REMOVED during hotunplug. In both cases now we
> send also the UNIT ATTENTION.
> 
> In the virtio-scsi driver, when we receive VIRTIO_SCSI_EVT_RESET_RESCAN
> (hotplug) we call scsi_scan_target() or scsi_add_device(). Both of them
> will call __scsi_scan_target() at some points, sending `REPORT_LUNS`
> command to the device. This does not happen for
> VIRTIO_SCSI_EVT_RESET_REMOVED (hotunplug). Indeed if I remove the
> UNIT ATTENTION from the hotunplug in QEMU, everything works well.
> 
> So, I tried to add a scan also for VIRTIO_SCSI_EVT_RESET_REMOVED:
> 
> diff --git a/drivers/scsi/virtio_scsi.c b/drivers/scsi/virtio_scsi.c
> index bd5633667d01..c57658a63097 100644
> --- a/drivers/scsi/virtio_scsi.c
> +++ b/drivers/scsi/virtio_scsi.c
> @@ -291,6 +291,7 @@ static void virtscsi_handle_transport_reset(struct virtio_scsi *vscsi,
>     }
>     break;
>     case VIRTIO_SCSI_EVT_RESET_REMOVED:
> +   scsi_scan_host(shost);
>     sdev = scsi_device_lookup(shost, 0, target, lun);
>     if (sdev) {
>     scsi_remove_device(sdev);
> 
> This somehow helps: now Linux only breaks if the plug/unplug frequency
> is really high. If I put a 5 second sleep between plug/unplug events, it
> doesn't break (at least for the duration of my test which has been
> running for about 30 minutes, before it used to break after about a
> minute).
> 
> Another thing I noticed is that in QEMU maybe we should set the UNIT
> ATTENTION fi

Re: question on vhost, limiting kernel threads and NPROC

2021-09-13 Thread Mike Christie
I just realized I forgot to cc the virt list so adding now.

Christian, see the very bottom for a different fork patch.

On 7/12/21 7:05 AM, Stefan Hajnoczi wrote:
> On Fri, Jul 09, 2021 at 11:25:37AM -0500, Mike Christie wrote:
>> Hi,
>>
>> The goal of this email is to try and figure how we want to track/limit the
>> number of kernel threads created by vhost devices.
>>
>> Background:
>> ---
>> For vhost-scsi, we've hit an issue where the single vhost worker thread can't
>> handle all the IO being sent from multiple queues. IOPS is stuck at around
>> 500K. To fix this, we did this patchset:
>>
>> https://lore.kernel.org/linux-scsi/20210525180600.6349-1-michael.chris...@oracle.com/
>>
>> which allows userspace to create N threads and map them to a dev's 
>> virtqueues.
>> With this we can get around 1.4M IOPs.
>>
>> Problem:
>> 
>> While those patches were being reviewed, a concern about tracking all these
>> new possible threads was raised here:
>>
>> https://lore.kernel.org/linux-scsi/YL45CfpHyzSEcAJv@stefanha-x1.localdomain/
>>
>> To save you some time, the question is what does other kernel code using the
>> kthread API do to track the number of kernel threads created on behalf of
>> a userspace thread. The answer is they don't do anything so we will have to
>> add that code.
>>
>> I started to do that here:
>>
>> https://lkml.org/lkml/2021/6/23/1233
>>
>> where those patches would charge/check the vhost device owner's RLIMIT_NPROC
>> value. But, the question of if we really want to do this has come up which is
>> why I'm bugging lists like libvirt now.
>>
>> Question/Solution:
>> --
>> I'm bugging everyone so we can figure out:
>>
>> If we need to specifically track the number of kernel threads being made
>> for the vhost kernel use case by the RLIMIT_NPROC limit?
>>
>> Or, is it ok to limit the number of devices with the RLIMIT_NOFILE limit.
>> Then each device has a limit on the number of threads it can create.
> 
> Do we want to add an interface where an unprivileged userspace process
> can create large numbers of kthreads? The number is indirectly bounded
> by RLIMIT_NOFILE * num_virtqueues, but there is no practical way to
> use that rlimit since num_virtqueues varies across vhost devices and
> RLIMIT_NOFILE might need to have a specific value to control file
> descriptors.
> 
> io_uring worker threads are limited by RLIMIT_NPROC. I think it makes
> sense in vhost too where the device instance is owned by a specific
> userspace process and can be accounted against that process' rlimit.
> 
> I don't have a specific use case other than that I think vhost should be
> safe and well-behaved.
> 

Sorry for the late reply. I finally got to go on PTO and used like 2
years worth in one super long vacation :)

I still don't have an RLIMIT_NPROC use case, and it wasn't clear to
me if that has to be handled before merging. However, I might have got
lucky and found a bug where the fix will handle your request too.

It looks like cgroup v2 is supposed to work, but for vhost threads
it doesn't because the kernel functions we use just support v1. If
we change the vhost layer to create threads like how io_uring does
then we get the RLIMIT_NPROC checks and also cgroup v2 support.

Christian, if you didn't like this patch

https://lkml.org/lkml/2021/6/23/1233

then I'm not sure how much you will like what is needed to support the
above. Here is a patch which includes what we would need from the
fork-related code. On one hand, it's nicer because it fits into the PF flag
code like you requested. But I have to add a no_files arg. See below:


--


From 351d476e8db0a78b9bdf22d77dd1abe66c0eac40 Mon Sep 17 00:00:00 2001
From: Mike Christie 
Date: Mon, 13 Sep 2021 11:20:20 -0500
Subject: [PATCH] fork: allow cloning of userspace procs from kernel

Userspace apps/processes like QEMU call into the vhost layer to create
worker threads which execute IO on behalf of VMs. If users set RLIMIT
or cgroup limits, or set up v2 cgroups or namespaces, the worker thread
is not accounted for or even set up correctly. The reason is that vhost
uses the kthread API, which inherits those attributes/values from the
kthreadd thread. This patch allows kernel modules to work like the
io_uring code, which can call kernel_clone() from the userspace thread's
context, directly inherit its attributes like cgroups, and check limits
like RLIMIT_NPROC against that userspace thread.

Note: this patch combines 2 changes that should be separate patches. I'm
in

question on vhost, limiting kernel threads and NPROC

2021-07-09 Thread Mike Christie
Hi,

The goal of this email is to try and figure how we want to track/limit the
number of kernel threads created by vhost devices.

Background:
---
For vhost-scsi, we've hit an issue where the single vhost worker thread can't
handle all the IO being sent from multiple queues. IOPS is stuck at around
500K. To fix this, we did this patchset:

https://lore.kernel.org/linux-scsi/20210525180600.6349-1-michael.chris...@oracle.com/

which allows userspace to create N threads and map them to a dev's virtqueues.
With this we can get around 1.4M IOPs.

Problem:

While those patches were being reviewed, a concern about tracking all these
new possible threads was raised here:

https://lore.kernel.org/linux-scsi/YL45CfpHyzSEcAJv@stefanha-x1.localdomain/

To save you some time, the question is what does other kernel code using the
kthread API do to track the number of kernel threads created on behalf of
a userspace thread. The answer is they don't do anything so we will have to
add that code.

I started to do that here:

https://lkml.org/lkml/2021/6/23/1233

where those patches would charge/check the vhost device owner's RLIMIT_NPROC
value. But, the question of if we really want to do this has come up which is
why I'm bugging lists like libvirt now.

Question/Solution:
--
I'm bugging everyone so we can figure out:

If we need to specifically track the number of kernel threads being made
for the vhost kernel use case by the RLIMIT_NPROC limit?

Or, is it ok to limit the number of devices with the RLIMIT_NOFILE limit.
Then each device has a limit on the number of threads it can create.



Re: [PATCH 00/10] vhost/qemu: thread per IO SCSI vq

2020-11-19 Thread Mike Christie

On 11/19/20 10:24 AM, Stefan Hajnoczi wrote:

On Thu, Nov 19, 2020 at 4:13 PM Mike Christie
 wrote:


On 11/19/20 8:46 AM, Michael S. Tsirkin wrote:

On Wed, Nov 18, 2020 at 11:31:17AM +, Stefan Hajnoczi wrote:

My preference has been:

1. If we were to ditch cgroups, then add a new interface that would allow
us to bind threads to a specific CPU, so that it lines up with the guest's
mq to CPU mapping.


A 1:1 vCPU/vq->CPU mapping isn't desirable in all cases.

The CPU affinity is a userspace policy decision. The host kernel should
provide a mechanism but not the policy. That way userspace can decide
which workers are shared by multiple vqs and on which physical CPUs they
should run.


So if we let userspace dictate the threading policy then I think binding
vqs to userspace threads and running there makes the most sense,
no need to create the threads.



Just to make sure I am on the same page, in one of the first postings of
this set at the bottom of the mail:

https://www.spinics.net/lists/linux-scsi/msg148322.html

I asked about a new interface and had done something more like what
Stefan posted:

struct vhost_vq_worker_info {
/*
 * The pid of an existing vhost worker that this vq will be
 * assigned to. When pid is 0 the virtqueue is assigned to the
 * default vhost worker. When pid is -1 a new worker thread is
 * created for this virtqueue. When pid is -2 the virtqueue's
 * worker thread is unchanged.
 *
 * If a vhost worker no longer has any virtqueues assigned to it
 * then it will terminate.
 *
 * The pid of the vhost worker is stored to this field when the
 * ioctl completes successfully. Use pid -2 to query the current
 * vhost worker pid.
 */
__kernel_pid_t pid;  /* in/out */

/* The virtqueue index*/
unsigned int vq_idx; /* in */
};

This approach is simple and it allowed me to have userspace map queues
and threads optimally for our setups.
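A call site for that proposed interface would have looked something like the
sketch below. Note that VHOST_SET_VQ_WORKER is an invented ioctl name used
only for illustration; the thread never settled on one and this interface was
never merged:

#include <stdio.h>
#include <sys/ioctl.h>

/* vhost_fd is assumed to be an open vhost device fd, e.g. /dev/vhost-scsi */
static int assign_new_worker_to_vq(int vhost_fd, unsigned int vq_idx)
{
    struct vhost_vq_worker_info info = {
        .pid = -1,        /* -1: create a new worker for this vq */
        .vq_idx = vq_idx, /* the virtqueue to assign */
    };

    /* on success the kernel writes the worker's pid back into info.pid,
     * which userspace could then use to set CPU affinity */
    if (ioctl(vhost_fd, VHOST_SET_VQ_WORKER, &info) < 0)
        return -1;

    printf("vq %u served by worker pid %d\n", vq_idx, info.pid);
    return 0;
}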

Note: Stefan, in response to your previous comment, I am just using my
1:1 mapping as an example and would make it configurable from userspace.

In the email above are you guys suggesting to execute the SCSI/vhost
requests in userspace? We should not do that because:

1. It negates part of what makes vhost fast where we do not have to kick
out to userspace then back to the kernel.

2. It's not doable or becomes a crazy mess because vhost-scsi is tied to
the scsi/target layer in the kernel. You can't process the scsi command
in userspace since the scsi state machine and all its configuration info
is in the kernel's scsi/target layer.

For example, I was just the maintainer of the target_core_user module
that hooks into LIO/target on the backend (vhost-scsi hooks in on the
front end) and passes commands to userspace and there we have a
semi-shadow state machine. It gets nasty to try and maintain/sync state
between lio/target core in the kernel and in userspace. We also see the
perf loss I mentioned in #1.


No, if I understand Michael correctly he has suggested a different approach.

My suggestion was that the kernel continues to manage the worker
threads but an ioctl allows userspace to control the policy.

I think Michael is saying that the kernel shouldn't manage/create
threads. Userspace should create threads and then invoke an ioctl from
those threads.

The ioctl will call into the vhost driver where it will execute
something similar to vhost_worker(). So this ioctl will block while
the kernel is using the thread to process vqs.

What isn't clear to me is how to tell the kernel which vqs are
processed by a thread. We could try to pass that information into the
ioctl. I'm not sure what the cleanest solution is here.

Maybe something like:

struct vhost_run_worker_info {
 struct timespec *timeout;
 sigset_t *sigmask;

 /* List of virtqueues to process */
 unsigned nvqs;
 unsigned vqs[];
};

/* This blocks until the timeout is reached, a signal is received, or
the vhost device is destroyed */
int ret = ioctl(vhost_fd, VHOST_RUN_WORKER, &info);

As you can see, userspace isn't involved with dealing with the
requests. It just acts as a thread donor to the vhost driver.

We would want the VHOST_RUN_WORKER calls to be infrequent to avoid the
penalty of switching into the kernel, copying in the arguments, etc.


I didn't get this part. Why have the timeout? When the timeout expires, 
does userspace just call right back down to the kernel or does it do 
some sort of processing/operation?


You could have the worker function that runs from that ioctl wait for a
signal or a wake-up call from the vhost_work/poll functions.





Michael: is this the kind of thing you were thinking of?

Stefan






Re: [PATCH 00/10] vhost/qemu: thread per IO SCSI vq

2020-11-19 Thread Mike Christie

On 11/19/20 8:46 AM, Michael S. Tsirkin wrote:

On Wed, Nov 18, 2020 at 11:31:17AM +, Stefan Hajnoczi wrote:

My preference has been:

1. If we were to ditch cgroups, then add a new interface that would allow
us to bind threads to a specific CPU, so that it lines up with the guest's
mq to CPU mapping.


A 1:1 vCPU/vq->CPU mapping isn't desirable in all cases.

The CPU affinity is a userspace policy decision. The host kernel should
provide a mechanism but not the policy. That way userspace can decide
which workers are shared by multiple vqs and on which physical CPUs they
should run.


So if we let userspace dictate the threading policy then I think binding
vqs to userspace threads and running there makes the most sense,
no need to create the threads.



Just to make sure I am on the same page, in one of the first postings of 
this set at the bottom of the mail:


https://www.spinics.net/lists/linux-scsi/msg148322.html

I asked about a new interface and had done something more like what 
Stefan posted:


  struct vhost_vq_worker_info {
  /*
   * The pid of an existing vhost worker that this vq will be
   * assigned to. When pid is 0 the virtqueue is assigned to the
   * default vhost worker. When pid is -1 a new worker thread is
   * created for this virtqueue. When pid is -2 the virtqueue's
   * worker thread is unchanged.
   *
   * If a vhost worker no longer has any virtqueues assigned to it
   * then it will terminate.
   *
   * The pid of the vhost worker is stored to this field when the
   * ioctl completes successfully. Use pid -2 to query the current
   * vhost worker pid.
   */
  __kernel_pid_t pid;  /* in/out */

  /* The virtqueue index*/
  unsigned int vq_idx; /* in */
  };

This approach is simple and it allowed me to have userspace map queues 
and threads optimally for our setups.


Note: Stefan, in response to your previous comment, I am just using my 
1:1 mapping as an example and would make it configurable from userspace.


In the email above are you guys suggesting to execute the SCSI/vhost 
requests in userspace? We should not do that because:


1. It negates part of what makes vhost fast where we do not have to kick 
out to userspace then back to the kernel.


2. It's not doable or becomes a crazy mess because vhost-scsi is tied to 
the scsi/target layer in the kernel. You can't process the scsi command 
in userspace since the scsi state machine and all its configuration info 
is in the kernel's scsi/target layer.


For example, I was just the maintainer of the target_core_user module 
that hooks into LIO/target on the backend (vhost-scsi hooks in on the 
front end) and passes commands to userspace and there we have a 
semi-shadow state machine. It gets nasty to try and maintain/sync state 
between lio/target core in the kernel and in userspace. We also see the 
perf loss I mentioned in #1.




Re: [PATCH 00/10] vhost/qemu: thread per IO SCSI vq

2020-11-19 Thread Mike Christie

On 11/18/20 10:35 PM, Jason Wang wrote:

it's just extra code. This patch:
https://www.spinics.net/lists/linux-scsi/msg150151.html
would work without the ENABLE ioctl, I mean.



That seems to pre-allocate all workers. If we don't care about the
resource consumption (127 workers) it could be fine.


It only creates what the user requested via num_queues.

That patch will:
1. For the default case of num_queues=1, use the single worker created
from the SET_OWNER ioctl.

2. If num_queues > 1, create a worker thread for each IO virtqueue.








And if you guys want to do the completely new interface, then none of 
this matters I guess :)


For disable see below.






My issue/concern is that in general these calls seem useful, but we don't
really need them for scsi because vhost-scsi is already stuck creating vqs
like it does due to existing users. If we do the ifcvf_vdpa_set_vq_ready
type of design where we just set some bit, then the new ioctl does not
give us a lot. It's just an extra check and extra code.

And for the mlx5_vdpa_set_vq_ready type of design, it doesn't seem like
it's going to happen a lot that the admin will want to remove vqs from a
running device.



In this case, qemu may just disable the queues of vhost-scsi via 
SET_VRING_ENABLE and then we can free resources?



Some SCSI background in case it doesn't work like net:
---
When the user sets up mq for vhost-scsi/virtio-scsi, for max perf and
no concern about memory use, they would normally set num_queues based on
the number of vCPUs and MSI-X vectors. I think the default in QEMU now is
to try and detect that value.


When the virtio_scsi driver is loaded into the guest kernel, it takes 
the num_queues value and tells the scsi/block mq layer to create 
num_queues multiqueue hw queues.
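The guest-side plumbing for that is essentially a one-liner in the driver's
probe path; in current kernels it is roughly the following (paraphrased from
drivers/scsi/virtio_scsi.c):

    /* tell the SCSI midlayer/blk-mq how many hardware queues to create */
    shost->nr_hw_queues = num_queues;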



If I read the code correctly, for modern devices, the guest will set
queue_enable for the queues that it wants to use. So in this ideal case,
qemu can forward them to VRING_ENABLE and reset VRING_ENABLE during 
device reset.


I was thinking more that you want an event like when a device/LUN is
added to or removed from a host. Instead of kicking off a device scan, you could
call the block helper to remap queues. It would then not be too invasive 
to running IO. I'll look into reset some more.





But it would be complicated to support legacy devices and QEMU.




--

I was trying to say in the previous email that if all we do is set
some bits to indicate the queue is disabled, free its resources, stop
polling/queueing in the scsi/target layer, flush, etc., it does not seem
useful. I was trying to ask when a user would only want this behavior.



I think it's device reset; the semantics are that unless the queue is
enabled, we should treat it as disabled.




Ah ok. I'll look into that some more. A funny thing is that I was 
trying to test that a while ago, but it wasn't helpful. I'm guessing it 
didn't work because it didn't implement what you wanted for disable 
right now :)




Re: [PATCH 00/10] vhost/qemu: thread per IO SCSI vq

2020-11-18 Thread Mike Christie

On 11/18/20 1:54 AM, Jason Wang wrote:


On 2020/11/18 2:57 PM, Mike Christie wrote:

On 11/17/20 11:17 PM, Jason Wang wrote:

On 2020/11/18 12:40 AM, Stefan Hajnoczi wrote:

On Thu, Nov 12, 2020 at 05:18:59PM -0600, Mike Christie wrote:

The following kernel patches were made over Michael's vhost branch:

https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/log/?h=vhost


and the vhost-scsi bug fix patchset:

https://lore.kernel.org/linux-scsi/20201112170008.GB1555653@stefanha-x1.localdomain/T/#t


And the qemu patch was made over the qemu master branch.

vhost-scsi currently supports multiple queues with the num_queues
setting, but we end up with a setup where the guest's scsi/block
layer can do a queue per vCPU and the layers below vhost can do
a queue per CPU. vhost-scsi will then create num_queues virtqueues,
but all IO gets submitted to and completed on a single vhost-scsi thread.
After 2 - 4 vqs this becomes a bottleneck.

This patchset allows us to create a worker thread per IO vq, so we
can better utilize multiple CPUs with the multiple queues. It
implements Jason's suggestion to create the initial worker like
normal, then create the extra workers for IO vqs with the
VHOST_SET_VRING_ENABLE ioctl command added in this patchset.

How does userspace find out the tids and set their CPU affinity?

What is the meaning of the new VHOST_SET_VRING_ENABLE ioctl? It doesn't
really "enable" or "disable" the vq, requests are processed regardless.


Actually I think it should do the real "enable/disable" that tries to 
follow the virtio spec.



What does real mean here?



I think it means when a vq is disabled, vhost won't process any request 
from that virtqueue.




For the vdpa enable call for example, would it be like
ifcvf_vdpa_set_vq_ready where it sets the ready bit or more like 
mlx5_vdpa_set_vq_ready

where it can do some more work in the disable case?



For vDPA, it would be more complicated.

E.g. for IFCVF, it just delays the setting of queue_enable until it gets
DRIVER_OK. Technically it can pass the queue_enable through to the
hardware as mlx5e did.





For net and something like ifcvf_vdpa_set_vq_ready's design, would we have
vhost_ring_ioctl() set some vhost_virtqueue enable bit? We would then have
some helper vhost_vq_is_enabled() and some code to detect whether userspace
supports the new ioctl.



Yes, vhost supports backend capabilities. When userspace negotiates the 
new capability, we should depend on SET_VRING_ENABLE; if not, we can do 
vhost_vq_is_enabled().
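
A minimal sketch of what such a helper could look like. This is an
assumption for illustration, not code from the patchset: the vq->enabled
flag and the VHOST_BACKEND_F_ENABLE capability bit are both
hypothetical, while vhost_backend_has_feature() is an existing vhost
helper.

static bool vhost_vq_is_enabled(struct vhost_virtqueue *vq)
{
	/* Keep the legacy behavior for userspace that never
	 * negotiated the (hypothetical) enable capability: every
	 * vq is implicitly enabled. */
	if (!vhost_backend_has_feature(vq, VHOST_BACKEND_F_ENABLE))
		return true;

	return vq->enabled;
}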



And then in vhost_net_set_backend do we call vhost_vq_is_enabled()? 
What is done for disable then?



It needs more thought, but the question is not specific to 
SET_VRING_ENABLE. Consider guest may zero ring address as well.


For disabling, we can simply flush the work and disable all the polls.
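
A rough sketch of that disable path. The wrapper function is an
assumption; vhost_poll_stop() and vhost_work_dev_flush() are the
existing vhost helpers used elsewhere in this thread's patches.

static void vhost_vq_disable(struct vhost_dev *dev,
			     struct vhost_virtqueue *vq)
{
	/* Stop queueing new work from the kick eventfd, then flush
	 * any work already queued for the worker. */
	if (vq->handle_kick)
		vhost_poll_stop(&vq->poll);
	vhost_work_dev_flush(dev);
}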



It doesn't seem to buy a lot of new functionality. Is it just
so we follow the spec?



My understanding is that, since the spec defines queue_enable, we should 
support it in vhost. And we can piggyback the delayed vq creation on 
this feature. Otherwise we will duplicate the functionality if we want 
to support queue_enable later.



I had actually given up on the delayed vq creation goal. I'm still not 
sure how it's related to ENABLE and I think it gets pretty gross.


1. If we started from a semi-clean slate, used the ENABLE ioctl more 
like a CREATE ioctl, and did the ENABLE after vhost dev open() but 
before any other ioctls, we could allocate the vq when we get the ENABLE 
ioctl. This fixes the issue where vhost scsi is allocating 128 vqs at 
open() time. We could then allocate metadata like the iovecs at ENABLE 
time or when we get a setup ioctl that is related to the metadata, so it 
fixes that too.


That makes sense how ENABLE is related to delayed vq allocation and why 
we would want it.
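
A rough sketch of that clean-slate ENABLE-as-CREATE idea. The function
name, the d->vqs[idx] test, and the error handling are all assumptions
made for illustration; vhost_vq_reset() is the existing init helper.

static long vhost_vring_enable(struct vhost_dev *d, unsigned int idx)
{
	struct vhost_virtqueue *vq;

	if (d->vqs[idx])
		return 0;	/* already created */

	vq = kzalloc(sizeof(*vq), GFP_KERNEL);
	if (!vq)
		return -ENOMEM;

	/* iovecs and other per-vq metadata would be allocated here
	 * instead of at open() time. */
	vhost_vq_reset(d, vq);
	d->vqs[idx] = vq;
	return 0;
}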


If we now need to support old tools though, then you lose me. To try and 
keep the code paths using the same code, then at vhost dev open() time 
do we start vhost_dev_init with zero vqs like with the allocate at 
ENABLE time case? Then when we get the first vring or dev ioctl, do we 
allocate the vq and related metadata? If so, the ENABLE does not buy us 
a lot since we get the delayed allocation from the compat code. Also 
this compat case gets really messy when we are delaying the actual vq 
and not just the metadata.


If for the compat case, we keep the code that before/during 
vhost_dev_init allocates all the vqs and does the initialization, then 
we end up with 2 very very different code paths. And we also need a new 
modparam or something to tell the drivers to do the old or new open() 
behavior.


2. If we do an approach that is less invasive to

Re: [PATCH 00/10] vhost/qemu: thread per IO SCSI vq

2020-11-17 Thread Mike Christie
On 11/18/20 12:57 AM, Mike Christie wrote:
> On 11/17/20 11:17 PM, Jason Wang wrote:
>>
>> On 2020/11/18 上午12:40, Stefan Hajnoczi wrote:
>>> On Thu, Nov 12, 2020 at 05:18:59PM -0600, Mike Christie wrote:
>>>> The following kernel patches were made over Michael's vhost branch:
>>>>
>>>> https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/log/?h=vhost
>>>> and the vhost-scsi bug fix patchset:
>>>>
>>>> https://lore.kernel.org/linux-scsi/20201112170008.GB1555653@stefanha-x1.localdomain/T/#t
>>>> And the qemu patch was made over the qemu master branch.
>>>>
>>>> vhost-scsi currently supports multiple queues with the num_queues
>>>> setting, but we end up with a setup where the guest's scsi/block
>>>> layer can do a queue per vCPU and the layers below vhost can do
>>>> a queue per CPU. vhost-scsi will then create num_queues virtqueues,
>>>> but all IO gets sent and completed on a single vhost-scsi thread.
>>>> After 2 - 4 vqs this becomes a bottleneck.
>>>>
>>>> This patchset allows us to create a worker thread per IO vq, so we
>>>> can better utilize multiple CPUs with the multiple queues. It
>>>> implements Jason's suggestion to create the initial worker like
>>>> normal, then create the extra workers for IO vqs with the
>>>> VHOST_SET_VRING_ENABLE ioctl command added in this patchset.
>>> How does userspace find out the tids and set their CPU affinity?
>>>
>>> What is the meaning of the new VHOST_SET_VRING_ENABLE ioctl? It doesn't
>>> really "enable" or "disable" the vq, requests are processed regardless.
>>
>>
>> Actually I think it should do the real "enable/disable" that tries to follow 
>> the virtio spec.
>>
> 
> What does real mean here? For the vdpa enable call for example, would
> it be like ifcvf_vdpa_set_vq_ready where it sets the ready bit, or more
> like mlx5_vdpa_set_vq_ready where it can do some more work in the
> disable case?
> 
> For net, with something like ifcvf_vdpa_set_vq_ready's design, would we
> have vhost_ring_ioctl() set some vhost_virtqueue enable bit? We would
> then have some helper vhost_vq_is_enabled() and some code to detect if
> userspace supports the new ioctl.
> And then in vhost_net_set_backend do we call vhost_vq_is_enabled()?
> What is done for disable then? It doesn't seem to buy a lot of new
> functionality. Is it just so we follow the spec?
> 
> Or do you want it to work more like mlx5_vdpa_set_vq_ready? For this,
> in vhost_ring_ioctl when we get the new ioctl we would call into the
> drivers and have them start and stop queues? For enable, what would you
> do for net in this case? For disable, would you do something like
> vhost_net_stop_vq (we don't free up anything allocated in
> vhost_vring_ioctl calls, but we can stop what we set up in the net
> driver)?
> Is this useful for the current net mq design or is this for something like 
> where
> you would do one vhost net device with multiple vqs?
> 
> My issue/concern is that in general these calls seem useful, but we
> don't really need them for scsi because vhost scsi is already stuck
> creating vqs the way it does due to existing users. If we do the
> ifcvf_vdpa_set_vq_ready type of design where we just set some bit, then
> the new ioctl does not give us a lot. It's just an extra check and
> extra code.
> 
> And for the mlx5_vdpa_set_vq_ready type of design, it doesn't seem
> like it's going to happen a lot that the admin is going to want to
> remove vqs from a running device. And for both addition/removal for
> scsi we would need code in virtio scsi to handle hot plug
> removal/addition of a queue and then redo the multiqueue mappings,
> which would be difficult to add with no one requesting it.

Actually I want to half take this last chunk back. When I said in
general these calls seem useful, I meant for the mlx5_vdpa_set_vq_ready
type design. For example, if a user was going to remove/add vCPUs, then
this functionality where we are completely adding/removing virtqueues
would be useful. We would need a lot more than just the new ioctl
though, because we would want to completely create/setup a new
virtqueue.

I do not have any of our users asking for this. You guys work on this more so
you know better.

Another option is to kick it down the road again since I'm not sure my patches
here have a lot to do with this. We could also just do the kernel only approach
(no new ioctl) and then add some new design when we have users asking for it.



Re: [PATCH 00/10] vhost/qemu: thread per IO SCSI vq

2020-11-17 Thread Mike Christie
On 11/17/20 11:17 PM, Jason Wang wrote:
> 
> On 2020/11/18 上午12:40, Stefan Hajnoczi wrote:
>> On Thu, Nov 12, 2020 at 05:18:59PM -0600, Mike Christie wrote:
>>> The following kernel patches were made over Michael's vhost branch:
>>>
>>> https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/log/?h=vhost
>>> and the vhost-scsi bug fix patchset:
>>>
>>> https://lore.kernel.org/linux-scsi/20201112170008.GB1555653@stefanha-x1.localdomain/T/#t
>>> And the qemu patch was made over the qemu master branch.
>>>
>>> vhost-scsi currently supports multiple queues with the num_queues
>>> setting, but we end up with a setup where the guest's scsi/block
>>> layer can do a queue per vCPU and the layers below vhost can do
>>> a queue per CPU. vhost-scsi will then create num_queues virtqueues,
>>> but all IO gets sent and completed on a single vhost-scsi thread.
>>> After 2 - 4 vqs this becomes a bottleneck.
>>>
>>> This patchset allows us to create a worker thread per IO vq, so we
>>> can better utilize multiple CPUs with the multiple queues. It
>>> implements Jason's suggestion to create the initial worker like
>>> normal, then create the extra workers for IO vqs with the
>>> VHOST_SET_VRING_ENABLE ioctl command added in this patchset.
>> How does userspace find out the tids and set their CPU affinity?
>>
>> What is the meaning of the new VHOST_SET_VRING_ENABLE ioctl? It doesn't
>> really "enable" or "disable" the vq, requests are processed regardless.
> 
> 
> Actually I think it should do the real "enable/disable" that tries to follow 
> the virtio spec.
> 

What does real mean here? For the vdpa enable call for example, would
it be like ifcvf_vdpa_set_vq_ready where it sets the ready bit, or more
like mlx5_vdpa_set_vq_ready where it can do some more work in the
disable case?

For net, with something like ifcvf_vdpa_set_vq_ready's design, would we
have vhost_ring_ioctl() set some vhost_virtqueue enable bit? We would
then have some helper vhost_vq_is_enabled() and some code to detect if
userspace supports the new ioctl.
And then in vhost_net_set_backend do we call vhost_vq_is_enabled()? What is done
for disable then? It doesn't seem to buy a lot of new functionality. Is it just
so we follow the spec?

Or do you want it to work more like mlx5_vdpa_set_vq_ready? For this,
in vhost_ring_ioctl when we get the new ioctl we would call into the
drivers and have them start and stop queues? For enable, what would you
do for net in this case? For disable, would you do something like
vhost_net_stop_vq (we don't free up anything allocated in
vhost_vring_ioctl calls, but we can stop what we set up in the net
driver)?
Is this useful for the current net mq design or is this for something like where
you would do one vhost net device with multiple vqs?

My issue/concern is that in general these calls seem useful, but we
don't really need them for scsi because vhost scsi is already stuck
creating vqs the way it does due to existing users. If we do the
ifcvf_vdpa_set_vq_ready type of design where we just set some bit, then
the new ioctl does not give us a lot. It's just an extra check and
extra code.

And for the mlx5_vdpa_set_vq_ready type of design, it doesn't seem like
it's going to happen a lot that the admin is going to want to remove vqs
from a running device. And for both addition/removal for scsi we would
need code in virtio scsi to handle hot plug removal/addition of a queue
and then redo the multiqueue mappings, which would be difficult to add
with no one requesting it.



Re: [PATCH 00/10] vhost/qemu: thread per IO SCSI vq

2020-11-17 Thread Mike Christie
On 11/17/20 10:40 AM, Stefan Hajnoczi wrote:
> On Thu, Nov 12, 2020 at 05:18:59PM -0600, Mike Christie wrote:
>> The following kernel patches were made over Michael's vhost branch:
>>
>> https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/log/?h=vhost
>>
>> and the vhost-scsi bug fix patchset:
>>
>> https://lore.kernel.org/linux-scsi/20201112170008.GB1555653@stefanha-x1.localdomain/T/#t
>>
>> And the qemu patch was made over the qemu master branch.
>>
>> vhost-scsi currently supports multiple queues with the num_queues
>> setting, but we end up with a setup where the guest's scsi/block
>> layer can do a queue per vCPU and the layers below vhost can do
>> a queue per CPU. vhost-scsi will then create num_queues virtqueues,
>> but all IO gets sent and completed on a single vhost-scsi thread.
>> After 2 - 4 vqs this becomes a bottleneck.
>>
>> This patchset allows us to create a worker thread per IO vq, so we
>> can better utilize multiple CPUs with the multiple queues. It
>> implements Jason's suggestion to create the initial worker like
>> normal, then create the extra workers for IO vqs with the
>> VHOST_SET_VRING_ENABLE ioctl command added in this patchset.
> 
> How does userspace find out the tids and set their CPU affinity?
> 

When we create the worker thread we add it to the device owner's cgroup,
so we end up inheriting those settings like affinity.

However, are you asking more about finer control, like if the guest is
doing mq, and the mq hw queue is bound to cpu0, it would perform
better if we could bind the vhost vq's worker thread to cpu0? I think
the problem might be that if you are in the cgroup then we can't set a
specific thread's CPU affinity to just one specific CPU. So you can
either do cgroups or not.
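
For reference, if userspace did have a worker's tid, pinning it would
just be sched_setaffinity(2). This sketch assumes the tid was obtained
through whatever interface we end up settling on:

#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <sys/types.h>

static int pin_worker(pid_t worker_tid, int cpu)
{
	cpu_set_t set;

	CPU_ZERO(&set);
	CPU_SET(cpu, &set);
	/* This fails if cpu is outside the thread's cpuset cgroup,
	 * which is the cgroup conflict described above. */
	if (sched_setaffinity(worker_tid, sizeof(set), &set)) {
		perror("sched_setaffinity");
		return -1;
	}
	return 0;
}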


> What is the meaning of the new VHOST_SET_VRING_ENABLE ioctl? It doesn't
> really "enable" or "disable" the vq, requests are processed regardless.
> 

Yeah, I agree. The problem I've mentioned before is:

1. For net and vsock, it's not useful because the vqs are hard coded in
the kernel and userspace, so you can't disable a vq and you never need
to enable one.

2. vdpa has its own enable ioctl.

3. For scsi, because we are already doing multiple vqs based on the
num_queues value, we have to have some sort of compat support and
code to detect if userspace is even going to send the new ioctl.
In this patchset, compat just means enabling/disabling the extra
worker threads for a vq. We will still use the vq if userspace set
it up.


> The purpose of the ioctl isn't clear to me because the kernel could
> automatically create 1 thread per vq without a new ioctl. On the other
> hand, if userspace is supposed to control worker threads then a
> different interface would be more powerful:
> 

My preference has been:

1. If we were to ditch cgroups, then add a new interface that would allow
us to bind threads to a specific CPU, so that it lines up with the guest's
mq to CPU mapping.

2. If we continue with cgroups then I think just creating the worker
threads from vhost_scsi_set_endpoint is best, because that is the point
where we do the other final vq setup ops, vhost_vq_set_backend and
vhost_vq_init_access.

For option number 2 it would be simple. Instead of the vring enable patches:

[PATCH 08/10] vhost: move msg_handler to new ops struct
[PATCH 09/10] vhost: add VHOST_SET_VRING_ENABLE support
[PATCH 10/10] vhost-scsi: create a worker per IO vq
and
[PATCH 1/1] qemu vhost scsi: add VHOST_SET_VRING_ENABLE support
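
A rough sketch of what option 2 could look like inside
vhost_scsi_set_endpoint(), reusing vhost_vq_worker_add() from patch
04/10. The placement and loop bounds are assumptions, not posted code:

	int i, ret;

	/* worker0 already exists and stays shared by the ctl, evt and
	 * first IO vq; give each additional IO vq its own worker. */
	for (i = VHOST_SCSI_VQ_IO + 1; i < vs->dev.nvqs; i++) {
		ret = vhost_vq_worker_add(&vs->dev, &vs->vqs[i].vq);
		if (ret)
			break;	/* unwind with vhost_vq_worker_remove() */
	}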

[PATCH 09/10] vhost: add VHOST_SET_VRING_ENABLE support

2020-11-12 Thread Mike Christie
This adds a new ioctl, VHOST_SET_VRING_ENABLE, for which the vhost
drivers can implement a callout and execute an operation when the vq is
enabled/disabled.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c  | 25 +
 drivers/vhost/vhost.h  |  1 +
 include/uapi/linux/vhost.h |  1 +
 3 files changed, 27 insertions(+)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 2f98b81..e953031 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1736,6 +1736,28 @@ static long vhost_vring_set_num_addr(struct vhost_dev *d,
 
return r;
 }
+
+static long vhost_vring_set_enable(struct vhost_dev *d,
+  struct vhost_virtqueue *vq,
+  void __user *argp)
+{
+   struct vhost_vring_state s;
+   int ret = 0;
+
+   if (vq->private_data)
+   return -EBUSY;
+
+   if (copy_from_user(&s, argp, sizeof s))
+   return -EFAULT;
+
+   if (s.num != 1 && s.num != 0)
+   return -EINVAL;
+
+   if (d->ops && d->ops->enable_vring)
+   ret = d->ops->enable_vring(vq, s.num);
+   return ret;
+}
+
 long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user 
*argp)
 {
struct file *eventfp, *filep = NULL;
@@ -1765,6 +1787,9 @@ long vhost_vring_ioctl(struct vhost_dev *d, unsigned int 
ioctl, void __user *arg
mutex_lock(&vq->mutex);
 
switch (ioctl) {
+   case VHOST_SET_VRING_ENABLE:
+   r = vhost_vring_set_enable(d, vq, argp);
+   break;
case VHOST_SET_VRING_BASE:
/* Moving base with an active backend?
 * You don't want to do that. */
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index a293f48..1279c09 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -158,6 +158,7 @@ struct vhost_msg_node {
 
 struct vhost_dev_ops {
int (*msg_handler)(struct vhost_dev *dev, struct vhost_iotlb_msg *msg);
+   int (*enable_vring)(struct vhost_virtqueue *vq, bool enable);
 };
 
 struct vhost_dev {
diff --git a/include/uapi/linux/vhost.h b/include/uapi/linux/vhost.h
index c998860..3ffd133 100644
--- a/include/uapi/linux/vhost.h
+++ b/include/uapi/linux/vhost.h
@@ -70,6 +70,7 @@
 #define VHOST_VRING_BIG_ENDIAN 1
 #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct 
vhost_vring_state)
 #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct 
vhost_vring_state)
+#define VHOST_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x15, struct 
vhost_vring_state)
 
 /* The following ioctls use eventfd file descriptors to signal and poll
  * for events. */
-- 
1.8.3.1




[PATCH 03/10] vhost poll: fix coding style

2020-11-12 Thread Mike Christie
We use like 3 coding styles in this struct. Switch to just tabs.

Signed-off-by: Mike Christie 
Reviewed-by: Chaitanya Kulkarni 
---
 drivers/vhost/vhost.h | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 1ba8e81..575c818 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -28,12 +28,12 @@ struct vhost_work {
 /* Poll a file (eventfd or socket) */
 /* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
-   poll_tabletable;
-   wait_queue_head_t*wqh;
-   wait_queue_entry_t  wait;
-   struct vhost_work work;
-   __poll_t  mask;
-   struct vhost_dev *dev;
+   poll_table  table;
+   wait_queue_head_t   *wqh;
+   wait_queue_entry_t  wait;
+   struct vhost_work   work;
+   __poll_tmask;
+   struct vhost_dev*dev;
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
-- 
1.8.3.1




[PATCH 06/10] vhost scsi: make SCSI cmd completion per vq

2020-11-12 Thread Mike Christie
In the last patches we are going to have a worker thread per IO vq.
This patch separates the scsi cmd completion code paths so we can
complete cmds based on their vq instead of having all cmds complete
on the same worker thread.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c | 48 +---
 1 file changed, 25 insertions(+), 23 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 4725a08..2bbe1a8 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -178,6 +178,7 @@ enum {
 
 struct vhost_scsi_virtqueue {
struct vhost_virtqueue vq;
+   struct vhost_scsi *vs;
/*
 * Reference counting for inflight reqs, used for flush operation. At
 * each time, one reference tracks new commands submitted, while we
@@ -192,6 +193,9 @@ struct vhost_scsi_virtqueue {
struct vhost_scsi_cmd *scsi_cmds;
struct sbitmap scsi_tags;
int max_cmds;
+
+   struct vhost_work completion_work;
+   struct llist_head completion_list;
 };
 
 struct vhost_scsi {
@@ -202,9 +206,6 @@ struct vhost_scsi {
struct vhost_dev dev;
struct vhost_scsi_virtqueue vqs[VHOST_SCSI_MAX_VQ];
 
-   struct vhost_work vs_completion_work; /* cmd completion work item */
-   struct llist_head vs_completion_list; /* cmd completion queue */
-
struct vhost_work vs_event_work; /* evt injection work item */
struct llist_head vs_event_list; /* evt injection queue */
 
@@ -380,10 +381,11 @@ static void vhost_scsi_release_cmd(struct se_cmd *se_cmd)
} else {
struct vhost_scsi_cmd *cmd = container_of(se_cmd,
struct vhost_scsi_cmd, tvc_se_cmd);
-   struct vhost_scsi *vs = cmd->tvc_vhost;
+   struct vhost_scsi_virtqueue *svq =  container_of(cmd->tvc_vq,
+   struct vhost_scsi_virtqueue, vq);
 
-   llist_add(&cmd->tvc_completion_list, &vs->vs_completion_list);
-   vhost_work_queue(&vs->dev, &vs->vs_completion_work);
+   llist_add(&cmd->tvc_completion_list, &svq->completion_list);
+   vhost_vq_work_queue(&svq->vq, &svq->completion_work);
}
 }
 
@@ -545,18 +547,17 @@ static void vhost_scsi_evt_work(struct vhost_work *work)
  */
 static void vhost_scsi_complete_cmd_work(struct vhost_work *work)
 {
-   struct vhost_scsi *vs = container_of(work, struct vhost_scsi,
-   vs_completion_work);
-   DECLARE_BITMAP(signal, VHOST_SCSI_MAX_VQ);
+   struct vhost_scsi_virtqueue *svq = container_of(work,
+   struct vhost_scsi_virtqueue, completion_work);
struct virtio_scsi_cmd_resp v_rsp;
struct vhost_scsi_cmd *cmd, *t;
struct llist_node *llnode;
struct se_cmd *se_cmd;
struct iov_iter iov_iter;
-   int ret, vq;
+   bool signal = false;
+   int ret;
 
-   bitmap_zero(signal, VHOST_SCSI_MAX_VQ);
-   llnode = llist_del_all(&vs->vs_completion_list);
+   llnode = llist_del_all(&svq->completion_list);
llist_for_each_entry_safe(cmd, t, llnode, tvc_completion_list) {
se_cmd = &cmd->tvc_se_cmd;
 
@@ -576,21 +577,16 @@ static void vhost_scsi_complete_cmd_work(struct 
vhost_work *work)
  cmd->tvc_in_iovs, sizeof(v_rsp));
ret = copy_to_iter(&v_rsp, sizeof(v_rsp), &iov_iter);
if (likely(ret == sizeof(v_rsp))) {
-   struct vhost_scsi_virtqueue *q;
+   signal = true;
vhost_add_used(cmd->tvc_vq, cmd->tvc_vq_desc, 0);
-   q = container_of(cmd->tvc_vq, struct 
vhost_scsi_virtqueue, vq);
-   vq = q - vs->vqs;
-   __set_bit(vq, signal);
} else
pr_err("Faulted on virtio_scsi_cmd_resp\n");
 
vhost_scsi_release_cmd_res(se_cmd);
}
 
-   vq = -1;
-   while ((vq = find_next_bit(signal, VHOST_SCSI_MAX_VQ, vq + 1))
-   < VHOST_SCSI_MAX_VQ)
-   vhost_signal(&vs->dev, &vs->vqs[vq].vq);
+   if (signal)
+   vhost_signal(&svq->vs->dev, &svq->vq);
 }
 
 static struct vhost_scsi_cmd *
@@ -1799,6 +1795,7 @@ static int vhost_scsi_set_features(struct vhost_scsi *vs, 
u64 features)
 
 static int vhost_scsi_open(struct inode *inode, struct file *f)
 {
+   struct vhost_scsi_virtqueue *svq;
struct vhost_scsi *vs;
struct vhost_virtqueue **vqs;
int r = -ENOMEM, i;
@@ -1814,7 +1811,6 @@ static int vhost_scsi_open(struct inode *inode, struct 
file *f)
if (!vqs)
goto err_vqs;
 
-   vhost_work_init(&vs->vs_completi

[PATCH 00/10] vhost/qemu: thread per IO SCSI vq

2020-11-12 Thread Mike Christie
The following kernel patches were made over Michael's vhost branch:

https://git.kernel.org/pub/scm/linux/kernel/git/mst/vhost.git/log/?h=vhost

and the vhost-scsi bug fix patchset:

https://lore.kernel.org/linux-scsi/20201112170008.GB1555653@stefanha-x1.localdomain/T/#t

And the qemu patch was made over the qemu master branch.

vhost-scsi currently supports multiple queues with the num_queues
setting, but we end up with a setup where the guest's scsi/block
layer can do a queue per vCPU and the layers below vhost can do
a queue per CPU. vhost-scsi will then create num_queues virtqueues,
but all IO gets sent and completed on a single vhost-scsi thread.
After 2 - 4 vqs this becomes a bottleneck.

This patchset allows us to create a worker thread per IO vq, so we
can better utilize multiple CPUs with the multiple queues. It
implements Jason's suggestion to create the initial worker like
normal, then create the extra workers for IO vqs with the
VHOST_SET_VRING_ENABLE ioctl command added in this patchset.
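
For userspace, enabling a vq with the new ioctl would look roughly like
the fragment below. The fd and index are just for illustration, and
VHOST_SET_VRING_ENABLE only exists with this patchset's header update:

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vhost.h>

	/* Enable IO vq 2 on an already configured vhost-scsi fd. */
	struct vhost_vring_state state = { .index = 2, .num = 1 };

	if (ioctl(vhost_fd, VHOST_SET_VRING_ENABLE, &state))
		perror("VHOST_SET_VRING_ENABLE");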





[PATCH 10/10] vhost-scsi: create a worker per IO vq

2020-11-12 Thread Mike Christie
This patch has vhost-scsi create a worker thread per IO vq.
It also adds a modparam to enable the feature, because I was thinking
existing setups might not be expecting the extra threading use, so the
default is to use the old single thread multiple vq behavior.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c | 38 +-
 1 file changed, 37 insertions(+), 1 deletion(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 612359d..3fb147f 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -62,6 +62,12 @@
  */
 #define VHOST_SCSI_WEIGHT 256
 
+static bool vhost_scsi_worker_per_io_vq;
+module_param_named(thread_per_io_virtqueue, vhost_scsi_worker_per_io_vq, bool,
+  0644);
+MODULE_PARM_DESC(thread_per_io_virtqueue,
+"Create a worker thread per IO virtqueue. Set to true to turn 
on. Default is false where all virtqueues share a thread.");
+
 struct vhost_scsi_inflight {
/* Wait for the flush operation to finish */
struct completion comp;
@@ -1805,6 +1811,36 @@ static int vhost_scsi_set_features(struct vhost_scsi 
*vs, u64 features)
return 0;
 }
 
+static int vhost_scsi_enable_vring(struct vhost_virtqueue *vq, bool enable)
+{
+   struct vhost_scsi *vs = container_of(vq->dev, struct vhost_scsi, dev);
+   /*
+* For compat, we have the evt, ctl and first IO vq share worker0 like
+* is setup by default. Additional vqs get their own worker.
+*/
+   if (vq == &vs->vqs[VHOST_SCSI_VQ_CTL].vq ||
+   vq == &vs->vqs[VHOST_SCSI_VQ_EVT].vq ||
+   vq == &vs->vqs[VHOST_SCSI_VQ_IO].vq)
+   return 0;
+
+   if (enable) {
+   if (!vhost_scsi_worker_per_io_vq)
+   return 0;
+   if (vq->worker_id != 0)
+   return 0;
+   return vhost_vq_worker_add(vq->dev, vq);
+   } else {
+   if (vq->worker_id == 0)
+   return 0;
+   vhost_vq_worker_remove(vq->dev, vq);
+   return 0;
+   }
+}
+
+static struct vhost_dev_ops vhost_scsi_dev_ops = {
+   .enable_vring = vhost_scsi_enable_vring,
+};
+
 static int vhost_scsi_open(struct inode *inode, struct file *f)
 {
struct vhost_scsi_virtqueue *svq;
@@ -1843,7 +1879,7 @@ static int vhost_scsi_open(struct inode *inode, struct 
file *f)
svq->vq.handle_kick = vhost_scsi_handle_kick;
}
vhost_dev_init(&vs->dev, vqs, VHOST_SCSI_MAX_VQ, UIO_MAXIOV,
-  VHOST_SCSI_WEIGHT, 0, true, NULL);
+  VHOST_SCSI_WEIGHT, 0, true, &vhost_scsi_dev_ops);
 
vhost_scsi_init_inflight(vs, NULL);
 
-- 
1.8.3.1




[PATCH 05/10] vhost: poll support support multiple workers

2020-11-12 Thread Mike Christie
The final patches are going to have vhost scsi create a vhost worker
per IO vq. This patch converts the poll code to poll and queue work on
the worker that is tied to the vq (in this patch we maintain the old
behavior where all vqs use a single worker).

For drivers that do not convert over to the multiple worker support,
or for the case where the user just does not want to allocate the
resources, we maintain support for the single worker case.

Note: This adds a new function vhost_vq_work_queue. It's used by
this patch and also the next one, so I exported it here.

Signed-off-by: Mike Christie 
---
 drivers/vhost/net.c   |  6 --
 drivers/vhost/vhost.c | 14 +++---
 drivers/vhost/vhost.h |  6 --
 3 files changed, 19 insertions(+), 7 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 531a00d..6a27fe6 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -1330,8 +1330,10 @@ static int vhost_net_open(struct inode *inode, struct 
file *f)
   VHOST_NET_PKT_WEIGHT, VHOST_NET_WEIGHT, true,
   NULL);
 
-   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, 
dev);
-   vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev);
+   vhost_poll_init(n->poll + VHOST_NET_VQ_TX, handle_tx_net, EPOLLOUT, dev,
+   vqs[VHOST_NET_VQ_TX]);
+   vhost_poll_init(n->poll + VHOST_NET_VQ_RX, handle_rx_net, EPOLLIN, dev,
+   vqs[VHOST_NET_VQ_RX]);
 
f->private_data = n;
n->page_frag.page = NULL;
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index d229515..9eeb8c7 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -187,13 +187,15 @@ void vhost_work_init(struct vhost_work *work, 
vhost_work_fn_t fn)
 
 /* Init poll structure */
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-__poll_t mask, struct vhost_dev *dev)
+__poll_t mask, struct vhost_dev *dev,
+struct vhost_virtqueue *vq)
 {
init_waitqueue_func_entry(&poll->wait, vhost_poll_wakeup);
init_poll_funcptr(&poll->table, vhost_poll_func);
poll->mask = mask;
poll->dev = dev;
poll->wqh = NULL;
+   poll->vq = vq;
 
vhost_work_init(&poll->work, fn);
 }
@@ -284,6 +286,12 @@ void vhost_poll_flush(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_flush);
 
+void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work)
+{
+   vhost_work_queue_on(vq->dev, work, vq->worker_id);
+}
+EXPORT_SYMBOL_GPL(vhost_vq_work_queue);
+
 /* A lockless hint for busy polling code to exit the loop */
 bool vhost_has_work(struct vhost_dev *dev)
 {
@@ -301,7 +309,7 @@ bool vhost_has_work(struct vhost_dev *dev)
 
 void vhost_poll_queue(struct vhost_poll *poll)
 {
-   vhost_work_queue(poll->dev, &poll->work);
+   vhost_vq_work_queue(poll->vq, &poll->work);
 }
 EXPORT_SYMBOL_GPL(vhost_poll_queue);
 
@@ -528,7 +536,7 @@ void vhost_dev_init(struct vhost_dev *dev,
vhost_vq_reset(dev, vq);
if (vq->handle_kick)
vhost_poll_init(&vq->poll, vq->handle_kick,
-   EPOLLIN, dev);
+   EPOLLIN, dev, vq);
}
 }
 EXPORT_SYMBOL_GPL(vhost_dev_init);
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index f334e90..232c5f9 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -33,7 +33,6 @@ struct vhost_worker {
 };
 
 /* Poll a file (eventfd or socket) */
-/* Note: there's nothing vhost specific about this structure. */
 struct vhost_poll {
poll_table  table;
wait_queue_head_t   *wqh;
@@ -41,16 +40,19 @@ struct vhost_poll {
struct vhost_work   work;
__poll_tmask;
struct vhost_dev*dev;
+   struct vhost_virtqueue  *vq;
 };
 
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 bool vhost_has_work(struct vhost_dev *dev);
+void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 int vhost_vq_worker_add(struct vhost_dev *dev, struct vhost_virtqueue *vq);
 void vhost_vq_worker_remove(struct vhost_dev *dev, struct vhost_virtqueue *vq);
 
 void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t fn,
-__poll_t mask, struct vhost_dev *dev);
+__poll_t mask, struct vhost_dev *dev,
+struct vhost_virtqueue *vq);
 int vhost_poll_start(struct vhost_poll *poll, struct file *file);
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
-- 
1.8.3.1




[PATCH 04/10] vhost: support multiple worker threads

2020-11-12 Thread Mike Christie
This is a prep patch to support multiple vhost worker threads per vhost
dev. This patch converts the code that had assumed a single worker
thread by:

1. Moving worker related fields to a new struct vhost_worker (see the
sketch after this list).
2. Converting vhost.c code to use the new struct and assume we will
have an array of workers.
3. It also exports 2 helper functions that will be used in the last
patch when vhost-scsi is converted to use this new functionality.
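
The vhost_worker struct implied by point 1 and used in the hunks below
would look roughly like this; the task and work_list fields match the
code in this patch, while the id field is an assumption:

struct vhost_worker {
	struct task_struct	*task;
	struct llist_head	work_list;
	int			id;
};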

Why do we need multiple worker threads?

The admin can set num_queues > 1 and the guest OS will run in
multiqueue mode where, depending on num_queues, you might get
a queue per CPU. The layers below vhost-scsi are also doing
multiqueue. So, while vhost-scsi will create num_queues vqs, every
IO on every CPU we are using has to be sent from and completed
on one thread on one CPU, and we can't fully utilize the multiple
queues above and below us.

With the null_blk driver we max out at 360K IOPs when doing a random
workload like:

fio --direct=1 --rw=randrw --bs=4k --ioengine=libaio \
--iodepth=VQ_QUEUE_DEPTH --numjobs=NUM_VQS --filename=/dev/sdXYZ

where NUM_VQS gets up to 8 (number of cores per numa node on my system)
and VQ_QUEUE_DEPTH can be anywhere from 32 to 128.

With the patches in this set and the patches to remove the sess_cmd_lock
and execution_lock from lio's IO path in the SCSI tree for 5.11, we are
able to get IOPs from a single LUN up to 700K.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vhost.c | 260 +++---
 drivers/vhost/vhost.h |  14 ++-
 2 files changed, 217 insertions(+), 57 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 78d9535..d229515 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -231,16 +231,47 @@ void vhost_poll_stop(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
-void vhost_work_dev_flush(struct vhost_dev *dev)
+static void vhost_work_queue_on(struct vhost_dev *dev, struct vhost_work *work,
+   int worker_id)
+{
+   if (!dev->num_workers)
+   return;
+
+   if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
+   /* We can only add the work to the list after we're
+* sure it was not in the list.
+* test_and_set_bit() implies a memory barrier.
+*/
+   llist_add(&work->node, &dev->workers[worker_id]->work_list);
+   wake_up_process(dev->workers[worker_id]->task);
+   }
+}
+
+void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+{
+   vhost_work_queue_on(dev, work, 0);
+}
+EXPORT_SYMBOL_GPL(vhost_work_queue);
+
+static void vhost_work_flush_on(struct vhost_dev *dev, int worker_id)
 {
struct vhost_flush_struct flush;
 
-   if (dev->worker) {
-   init_completion(&flush.wait_event);
-   vhost_work_init(&flush.work, vhost_flush_work);
+   init_completion(&flush.wait_event);
+   vhost_work_init(&flush.work, vhost_flush_work);
+
+   vhost_work_queue_on(dev, &flush.work, worker_id);
+   wait_for_completion(&flush.wait_event);
+}
+
+void vhost_work_dev_flush(struct vhost_dev *dev)
+{
+   int i;
 
-   vhost_work_queue(dev, &flush.work);
-   wait_for_completion(&flush.wait_event);
+   for (i = 0; i < dev->num_workers; i++) {
+   if (!dev->workers[i])
+   continue;
+   vhost_work_flush_on(dev, i);
}
 }
 EXPORT_SYMBOL_GPL(vhost_work_dev_flush);
@@ -253,26 +284,18 @@ void vhost_poll_flush(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_flush);
 
-void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work)
+/* A lockless hint for busy polling code to exit the loop */
+bool vhost_has_work(struct vhost_dev *dev)
 {
-   if (!dev->worker)
-   return;
+   int i;
 
-   if (!test_and_set_bit(VHOST_WORK_QUEUED, &work->flags)) {
-   /* We can only add the work to the list after we're
-* sure it was not in the list.
-* test_and_set_bit() implies a memory barrier.
-*/
-   llist_add(&work->node, &dev->work_list);
-   wake_up_process(dev->worker);
+   for (i = 0; i < dev->num_workers; i++) {
+   if (dev->workers[i] &&
+   !llist_empty(&dev->workers[i]->work_list))
+   return true;
}
-}
-EXPORT_SYMBOL_GPL(vhost_work_queue);
 
-/* A lockless hint for busy polling code to exit the loop */
-bool vhost_has_work(struct vhost_dev *dev)
-{
-   return !llist_empty(&dev->work_list);
+   return false;
 }
 EXPORT_SYMBOL_GPL(vhost_has_work);
 
@@ -343,7 +366,8 @@ static void vhost_vq_reset(struct vhost_dev *dev,
 
 sta

[PATCH 07/10] vhost, vhost-scsi: flush IO vqs then send TMF rsp

2020-11-12 Thread Mike Christie
With one worker we will always send the scsi cmd responses then send the
TMF rsp, because LIO will always complete the scsi cmds first, which
calls vhost_scsi_release_cmd to add them to the work queue.

When the next patch adds multiple worker support, the worker threads
could still be sending their responses when the tmf's work is run.
So this patch has vhost-scsi flush the IO vqs on other worker threads
before we send the tmf response.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c  | 16 ++--
 drivers/vhost/vhost.c |  6 ++
 drivers/vhost/vhost.h |  1 +
 3 files changed, 21 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 2bbe1a8..612359d 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1178,11 +1178,23 @@ static void vhost_scsi_tmf_resp_work(struct vhost_work 
*work)
struct vhost_scsi_tmf *tmf = container_of(work, struct vhost_scsi_tmf,
  vwork);
int resp_code;
+   int i;
+
+   if (tmf->se_cmd.se_tmr_req->response == TMR_FUNCTION_COMPLETE) {
+   /*
+* When processing a TMF, lio completes the cmds then the
+* TMF, so with one worker the TMF always completes after
+* cmds. For multiple worker support, we must flush the
+* IO vqs that do not share a worker with the ctl vq (vqs
+* 3 and up) to make sure they have completed their cmds.
+*/
+   for (i = 1; i < tmf->vhost->dev.num_workers; i++)
+   vhost_vq_work_flush(&tmf->vhost->vqs[i + 
VHOST_SCSI_VQ_IO].vq);
 
-   if (tmf->se_cmd.se_tmr_req->response == TMR_FUNCTION_COMPLETE)
resp_code = VIRTIO_SCSI_S_FUNCTION_SUCCEEDED;
-   else
+   } else {
resp_code = VIRTIO_SCSI_S_FUNCTION_REJECTED;
+   }
 
vhost_scsi_send_tmf_resp(tmf->vhost, &tmf->svq->vq, tmf->in_iovs,
 tmf->vq_desc, &tmf->resp_iov, resp_code);
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 9eeb8c7..6a6abfc 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -278,6 +278,12 @@ void vhost_work_dev_flush(struct vhost_dev *dev)
 }
 EXPORT_SYMBOL_GPL(vhost_work_dev_flush);
 
+void vhost_vq_work_flush(struct vhost_virtqueue *vq)
+{
+   vhost_work_flush_on(vq->dev, vq->worker_id);
+}
+EXPORT_SYMBOL_GPL(vhost_vq_work_flush);
+
 /* Flush any work that has been scheduled. When calling this, don't hold any
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 232c5f9..0837133 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -46,6 +46,7 @@ struct vhost_poll {
 void vhost_work_init(struct vhost_work *work, vhost_work_fn_t fn);
 void vhost_work_queue(struct vhost_dev *dev, struct vhost_work *work);
 bool vhost_has_work(struct vhost_dev *dev);
+void vhost_vq_work_flush(struct vhost_virtqueue *vq);
 void vhost_vq_work_queue(struct vhost_virtqueue *vq, struct vhost_work *work);
 int vhost_vq_worker_add(struct vhost_dev *dev, struct vhost_virtqueue *vq);
 void vhost_vq_worker_remove(struct vhost_dev *dev, struct vhost_virtqueue *vq);
-- 
1.8.3.1




[PATCH 1/1] qemu vhost scsi: add VHOST_SET_VRING_ENABLE support

2020-11-12 Thread Mike Christie
This patch made over the master branch allows the vhost-scsi
driver to call into the kernel and tell it to enable/disable
a virtqueue.

The kernel patches included with this set, will create
a worker per IO vq when multiple IO queues have been setup.

Signed-off-by: Mike Christie 
---
 hw/scsi/vhost-scsi.c|  6 ++
 hw/virtio/vhost-backend.c   | 30 ++
 linux-headers/linux/vhost.h |  1 +
 3 files changed, 37 insertions(+)

diff --git a/hw/scsi/vhost-scsi.c b/hw/scsi/vhost-scsi.c
index 4d70fa0..bbb2ba3 100644
--- a/hw/scsi/vhost-scsi.c
+++ b/hw/scsi/vhost-scsi.c
@@ -93,6 +93,13 @@ static int vhost_scsi_start(VHostSCSI *s)
 return ret;
 }
 
+ret = vsc->dev.vhost_ops->vhost_set_vring_enable(&vsc->dev, 1);
+if (ret) {
+error_report("Error enabling vhost-scsi vqs %d", ret);
+vhost_scsi_common_stop(vsc);
+return ret;
+}
+
 ret = vhost_scsi_set_endpoint(s);
 if (ret < 0) {
 error_report("Error setting vhost-scsi endpoint");
diff --git a/hw/virtio/vhost-backend.c b/hw/virtio/vhost-backend.c
index 88c8ecc..e190c8e 100644
--- a/hw/virtio/vhost-backend.c
+++ b/hw/virtio/vhost-backend.c
@@ -102,6 +102,35 @@ static int vhost_kernel_set_mem_table(struct vhost_dev 
*dev,
 return vhost_kernel_call(dev, VHOST_SET_MEM_TABLE, mem);
 }
 
+static int vhost_kernel_set_vring_enable(struct vhost_dev *dev, int enable)
+{
+struct vhost_vring_state s;
+int i, ret;
+
+s.num = 1;
+for (i = 0; i < dev->nvqs; ++i) {
+s.index = i;
+
+ret = vhost_kernel_call(dev, VHOST_SET_VRING_ENABLE, &s);
+/* Ignore kernels that do not support the cmd */
+if (ret == -EPERM)
+return 0;
+if (ret)
+goto disable_vrings;
+}
+
+return 0;
+
+disable_vrings:
+s.num = 0;
+/* Disable the vrings we already enabled, in reverse order. */
+for (i--; i >= 0; i--) {
+s.index = i;
+vhost_kernel_call(dev, VHOST_SET_VRING_ENABLE, &s);
+}
+
+return ret;
+}
+
 static int vhost_kernel_set_vring_addr(struct vhost_dev *dev,
struct vhost_vring_addr *addr)
 {
@@ -302,6 +331,7 @@ static const VhostOps kernel_ops = {
 .vhost_scsi_get_abi_version = vhost_kernel_scsi_get_abi_version,
 .vhost_set_log_base = vhost_kernel_set_log_base,
 .vhost_set_mem_table = vhost_kernel_set_mem_table,
+.vhost_set_vring_enable = vhost_kernel_set_vring_enable,
 .vhost_set_vring_addr = vhost_kernel_set_vring_addr,
 .vhost_set_vring_endian = vhost_kernel_set_vring_endian,
 .vhost_set_vring_num = vhost_kernel_set_vring_num,
diff --git a/linux-headers/linux/vhost.h b/linux-headers/linux/vhost.h
index 7523218..98dd919 100644
--- a/linux-headers/linux/vhost.h
+++ b/linux-headers/linux/vhost.h
@@ -70,6 +70,7 @@
 #define VHOST_VRING_BIG_ENDIAN 1
 #define VHOST_SET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x13, struct 
vhost_vring_state)
 #define VHOST_GET_VRING_ENDIAN _IOW(VHOST_VIRTIO, 0x14, struct 
vhost_vring_state)
+#define VHOST_SET_VRING_ENABLE _IOW(VHOST_VIRTIO, 0x15, struct 
vhost_vring_state)
 
 /* The following ioctls use eventfd file descriptors to signal and poll
  * for events. */
-- 
1.8.3.1




[PATCH 02/10] vhost scsi: remove extra flushes

2020-11-12 Thread Mike Christie
The vhost work flush function was flushing the entire work queue, so
there is no need for the double vhost_work_dev_flush calls in
vhost_scsi_flush.

And we do not need to call vhost_poll_flush for each poller because
that call also ends up flushing the same work queue thread the
vhost_work_dev_flush call flushed.

Signed-off-by: Mike Christie 
---
 drivers/vhost/scsi.c | 8 
 1 file changed, 8 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index 8795fd3..4725a08 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1443,11 +1443,6 @@ static void vhost_scsi_handle_kick(struct vhost_work 
*work)
vhost_scsi_handle_vq(vs, vq);
 }
 
-static void vhost_scsi_flush_vq(struct vhost_scsi *vs, int index)
-{
-   vhost_poll_flush(&vs->vqs[index].vq.poll);
-}
-
 /* Callers must hold dev mutex */
 static void vhost_scsi_flush(struct vhost_scsi *vs)
 {
@@ -1466,9 +1461,6 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
kref_put(&old_inflight[i]->kref, vhost_scsi_done_inflight);
 
/* Flush both the vhost poll and vhost work */
-   for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
-   vhost_scsi_flush_vq(vs, i);
-   vhost_work_dev_flush(&vs->dev);
vhost_work_dev_flush(&vs->dev);
 
/* Wait for all reqs issued before the flush to be finished */
-- 
1.8.3.1




[PATCH 08/10] vhost: move msg_handler to new ops struct

2020-11-12 Thread Mike Christie
The next patch adds a callout so drivers can perform some action when we
get a VHOST_SET_VRING_ENABLE, so this patch moves the msg_handler callout
to a new vhost_dev_ops struct just to keep all the callouts better
organized.

Signed-off-by: Mike Christie 
---
 drivers/vhost/vdpa.c  |  7 +--
 drivers/vhost/vhost.c | 10 --
 drivers/vhost/vhost.h | 11 ++-
 3 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/drivers/vhost/vdpa.c b/drivers/vhost/vdpa.c
index 2754f30..f271f42 100644
--- a/drivers/vhost/vdpa.c
+++ b/drivers/vhost/vdpa.c
@@ -802,6 +802,10 @@ static void vhost_vdpa_set_iova_range(struct vhost_vdpa *v)
}
 }
 
+static struct vhost_dev_ops vdpa_dev_ops = {
+   .msg_handler= vhost_vdpa_process_iotlb_msg,
+};
+
 static int vhost_vdpa_open(struct inode *inode, struct file *filep)
 {
struct vhost_vdpa *v;
@@ -829,8 +833,7 @@ static int vhost_vdpa_open(struct inode *inode, struct file 
*filep)
vqs[i] = &v->vqs[i];
vqs[i]->handle_kick = handle_vq_kick;
}
-   vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false,
-  vhost_vdpa_process_iotlb_msg);
+   vhost_dev_init(dev, vqs, nvqs, 0, 0, 0, false, &vdpa_dev_ops);
 
dev->iotlb = vhost_iotlb_alloc(0, 0);
if (!dev->iotlb) {
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index 6a6abfc..2f98b81 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -504,9 +504,7 @@ static size_t vhost_get_desc_size(struct vhost_virtqueue 
*vq,
 void vhost_dev_init(struct vhost_dev *dev,
struct vhost_virtqueue **vqs, int nvqs,
int iov_limit, int weight, int byte_weight,
-   bool use_worker,
-   int (*msg_handler)(struct vhost_dev *dev,
-  struct vhost_iotlb_msg *msg))
+   bool use_worker, struct vhost_dev_ops *ops)
 {
struct vhost_virtqueue *vq;
int i;
@@ -524,7 +522,7 @@ void vhost_dev_init(struct vhost_dev *dev,
dev->weight = weight;
dev->byte_weight = byte_weight;
dev->use_worker = use_worker;
-   dev->msg_handler = msg_handler;
+   dev->ops = ops;
init_waitqueue_head(&dev->wait);
INIT_LIST_HEAD(&dev->read_list);
INIT_LIST_HEAD(&dev->pending_list);
@@ -1328,8 +1326,8 @@ ssize_t vhost_chr_write_iter(struct vhost_dev *dev,
goto done;
}
 
-   if (dev->msg_handler)
-   ret = dev->msg_handler(dev, &msg);
+   if (dev->ops && dev->ops->msg_handler)
+   ret = dev->ops->msg_handler(dev, &msg);
else
ret = vhost_process_iotlb_msg(dev, &msg);
if (ret) {
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index 0837133..a293f48 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -156,6 +156,10 @@ struct vhost_msg_node {
   struct list_head node;
 };
 
+struct vhost_dev_ops {
+   int (*msg_handler)(struct vhost_dev *dev, struct vhost_iotlb_msg *msg);
+};
+
 struct vhost_dev {
struct mm_struct *mm;
struct mutex mutex;
@@ -175,16 +179,13 @@ struct vhost_dev {
int byte_weight;
u64 kcov_handle;
bool use_worker;
-   int (*msg_handler)(struct vhost_dev *dev,
-  struct vhost_iotlb_msg *msg);
+   struct vhost_dev_ops *ops;
 };
 
 bool vhost_exceeds_weight(struct vhost_virtqueue *vq, int pkts, int total_len);
 void vhost_dev_init(struct vhost_dev *, struct vhost_virtqueue **vqs,
int nvqs, int iov_limit, int weight, int byte_weight,
-   bool use_worker,
-   int (*msg_handler)(struct vhost_dev *dev,
-  struct vhost_iotlb_msg *msg));
+   bool use_worker, struct vhost_dev_ops *ops);
 long vhost_dev_set_owner(struct vhost_dev *dev);
 bool vhost_dev_has_owner(struct vhost_dev *dev);
 long vhost_dev_check_owner(struct vhost_dev *);
-- 
1.8.3.1




[PATCH 01/10] vhost: remove work arg from vhost_work_flush

2020-11-12 Thread Mike Christie
vhost_work_flush doesn't do anything with the work arg. This patch drops
it and then renames vhost_work_flush to vhost_work_dev_flush to reflect
that the function flushes all the works in the dev and not just a
specific queue or work item.

Signed-off-by: Mike Christie 
Acked-by: Jason Wang 
Reviewed-by: Chaitanya Kulkarni 
---
 drivers/vhost/scsi.c  | 4 ++--
 drivers/vhost/vhost.c | 8 
 drivers/vhost/vhost.h | 2 +-
 drivers/vhost/vsock.c | 2 +-
 4 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/drivers/vhost/scsi.c b/drivers/vhost/scsi.c
index f22fce5..8795fd3 100644
--- a/drivers/vhost/scsi.c
+++ b/drivers/vhost/scsi.c
@@ -1468,8 +1468,8 @@ static void vhost_scsi_flush(struct vhost_scsi *vs)
/* Flush both the vhost poll and vhost work */
for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
vhost_scsi_flush_vq(vs, i);
-   vhost_work_flush(&vs->dev, &vs->vs_completion_work);
-   vhost_work_flush(&vs->dev, &vs->vs_event_work);
+   vhost_work_dev_flush(&vs->dev);
+   vhost_work_dev_flush(&vs->dev);
 
/* Wait for all reqs issued before the flush to be finished */
for (i = 0; i < VHOST_SCSI_MAX_VQ; i++)
diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index a262e12..78d9535 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -231,7 +231,7 @@ void vhost_poll_stop(struct vhost_poll *poll)
 }
 EXPORT_SYMBOL_GPL(vhost_poll_stop);
 
-void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work)
+void vhost_work_dev_flush(struct vhost_dev *dev)
 {
struct vhost_flush_struct flush;
 
@@ -243,13 +243,13 @@ void vhost_work_flush(struct vhost_dev *dev, struct 
vhost_work *work)
wait_for_completion(&flush.wait_event);
}
 }
-EXPORT_SYMBOL_GPL(vhost_work_flush);
+EXPORT_SYMBOL_GPL(vhost_work_dev_flush);
 
 /* Flush any work that has been scheduled. When calling this, don't hold any
  * locks that are also used by the callback. */
 void vhost_poll_flush(struct vhost_poll *poll)
 {
-   vhost_work_flush(poll->dev, &poll->work);
+   vhost_work_dev_flush(poll->dev);
 }
 EXPORT_SYMBOL_GPL(vhost_poll_flush);
 
@@ -538,7 +538,7 @@ static int vhost_attach_cgroups(struct vhost_dev *dev)
attach.owner = current;
vhost_work_init(&attach.work, vhost_attach_cgroups_work);
vhost_work_queue(dev, &attach.work);
-   vhost_work_flush(dev, &attach.work);
+   vhost_work_dev_flush(dev);
return attach.ret;
 }
 
diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h
index b063324..1ba8e81 100644
--- a/drivers/vhost/vhost.h
+++ b/drivers/vhost/vhost.h
@@ -46,7 +46,7 @@ void vhost_poll_init(struct vhost_poll *poll, vhost_work_fn_t 
fn,
 void vhost_poll_stop(struct vhost_poll *poll);
 void vhost_poll_flush(struct vhost_poll *poll);
 void vhost_poll_queue(struct vhost_poll *poll);
-void vhost_work_flush(struct vhost_dev *dev, struct vhost_work *work);
+void vhost_work_dev_flush(struct vhost_dev *dev);
 long vhost_vring_ioctl(struct vhost_dev *d, unsigned int ioctl, void __user 
*argp);
 
 struct vhost_log {
diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
index a483cec..f40205f 100644
--- a/drivers/vhost/vsock.c
+++ b/drivers/vhost/vsock.c
@@ -652,7 +652,7 @@ static void vhost_vsock_flush(struct vhost_vsock *vsock)
for (i = 0; i < ARRAY_SIZE(vsock->vqs); i++)
if (vsock->vqs[i].handle_kick)
vhost_poll_flush(&vsock->vqs[i].poll);
-   vhost_work_flush(&vsock->dev, &vsock->send_pkt_work);
+   vhost_work_dev_flush(&vsock->dev);
 }
 
 static void vhost_vsock_reset_orphans(struct sock *sk)
-- 
1.8.3.1




Re: [Qemu-devel] QEMU desired libiscsi.so clashes with libiscsi.so from iscsi-initiator-utils

2012-03-06 Thread Mike Christie
On 03/06/2012 07:51 PM, ronnie sahlberg wrote:
> Hi Mike,
> 
> Thanks!
> 
> That would be great if you rename it to something less generic and
> specific to libiscsi-utils.
> That means I can continue using libiscsi as the name for my
> multiplatform library.
> 
> By the way,  if the only user today and in the future of the library
> in libiscsi-utils is anaconda,

The only current user that I know of is anaconda, but I have been
working on making a more generic lib that others can use. I think Qlogic
was going to use it too, but I have been trying to advise them not to
since it was not upstream.

For anaconda we need to be able to find targets/disks and setup the
install on them. We want to then set things up so the OS uses those
disks and whatever driver (software iscsi or one of the offload drivers
or iser) they selected for root/boot and the other partitions.

In general, I just want to have a way to login, discover, and manage
targets and HBAs using the software iscsi or iser driver or one of the
offload cards we support. So something like IMA. I wanted to keep it
close to IMA so other vendors can use it for a plugin if needed.

If your lib can or would like to do stuff like that, I would be very
very very happy :)



Re: [Qemu-devel] QEMU desired libiscsi.so clashes with libiscsi.so from iscsi-initiator-utils

2012-03-06 Thread Mike Christie
On 03/06/2012 01:58 PM, Mike Christie wrote:
> On 03/06/2012 06:19 AM, Hannes Reinecke wrote:
>> On 03/06/2012 12:06 PM, ronnie sahlberg wrote:
>>> Sorry about this.
>>>
>>> First, libiscsi is a really good name for a general purpose
>>> multiplatform library, like libiscsi.
>>> Second,  a generic name like this is a horribly poor idea for a single
>>> distribution/ single use / obscure private library.
>>>
>> Yes.
>>
>>>
>>> I want to solve a problem  to make it available on all platforms.
>>> I don't like to choose a suboptimal name for this reason, but I thought
>>> I had no choice.
>>>
>> Fully agreed. I'm perfectly fine with libiscsi ...
>>
>>>
>>> I am not really excited with the concept of "obscure single use
>>> private library polluting the namespace like this" but what are my
>>> options ?
>>> I can live with renaming my library if that is what it takes.
>>>
>> IMO the only sane option here is to have a separate package,
>> containing libiscsi _only_.
>>
>> It can be a sub-package of existing iscsi-related things, ie
>> contained within the existing open-iscsi _source_ repository.
>>
>> But having it packaged _together_ with another rpm is really bad.
>> Plus it makes updating a nightmare.
>>
>> So, Mike, what about having it as a sub-package to open-iscsi?
>> I'm all for it, especially as some partners are already asking for
>> it ...
>>
> 
> Not sure what we are talking about. Are you talking about the libiscsi
> that ships in fedora/rhel? That comes in the iscsi-initiator-utils-devel
> rpm already. It should not get installed with just the
> iscsi-initiator-utils rpm.
> 
> That lib really should only be used by anaconda. That lib is not a good
> general purpose lib. It is kinda awkward. Some of the iscsi concepts are
> mixed up. My preference is that only anaconda ever uses that lib,
> because I am not supporting it upstream. It is not merged upstream for
> example. The upstream iscsi tools do not use it (anaconda people
> actually wrote it so it works for them, but it does not work for
> iscsiadm for example). I only support it for rhel/fedora for anaconda use.


Oh yeah, if the libiscsi from iscsi-initiator-utils-devel is causing
naming conflicts then I am fine with renaming that to something else if
that fixes the issue.



Re: [Qemu-devel] QEMU desired libiscsi.so clashes with libiscsi.so from iscsi-initiator-utils

2012-03-06 Thread Mike Christie
On 03/06/2012 06:19 AM, Hannes Reinecke wrote:
> On 03/06/2012 12:06 PM, ronnie sahlberg wrote:
>> Sorry about this.
>>
>> First, libiscsi is a really good name for a general purpose
>> multiplatform library, like libiscsi.
>> Second,  a generic name like this is a horribly poor idea for a single
>> distribution/ single use / obscure private library.
>>
> Yes.
> 
>>
>> I want to solve a problem  to make it available on all platforms.
>> I don't like to choose a suboptimal name for this reason, but I thought
>> I had no choice.
>>
> Fully agreed. I'm perfectly fine with libiscsi ...
> 
>>
>> I am not really excited with the concept of "obscure single use
>> private library polluting the namespace like this" but what are my
>> options ?
>> I can live with renaming my library if that is what it takes.
>>
> IMO the only sane option here is to have a separate package,
> containing libiscsi _only_.
> 
> It can be a sub-package of existing iscsi-related things, ie
> contained within the existing open-iscsi _source_ repository.
> 
> But having it packaged _together_ with another rpm is really bad.
> Plus it makes updating a nightmare.
> 
> So, Mike, what about having it as a sub-package to open-iscsi?
> I'm all for it, especially as some partners are already asking for
> it ...
> 

Not sure what we are talking about. Are you talking about the libiscsi
that ships in fedora/rhel? That comes in the iscsi-initiator-utils-devel
rpm already. It should not get installed with just the
iscsi-initiator-utils rpm.

That lib really should only be used by anaconda. That lib is not a good
general purpose lib. It is kinda awkward. Some of the iscsi concepts are
mixed up. My preference is that only anaconda ever uses that lib,
because I am not supporting it upstream. It is not merged upstream for
example. The upstream iscsi tools do not use it (anaconda people
actually wrote it so it works for them, but it does not work for
iscsiadm for example). I only support it for rhel/fedora for anaconda use.