Re: move vmd vioblk ring processing to another thread

2017-09-17 Thread Mike Larkin
On Mon, Sep 18, 2017 at 10:48:36AM +1000, David Gwynne wrote:
> vmd currently performs disk io for the guest synchronously in the
> same thread the guest is running in. it also uses bounce buffers
> between the guest's "physical" memory and the reads and writes
> against the disk image.
> 
> this diff moves the vioblk ring processing into a taskq, ie, another
> thread. this allows the guest to run while vmd is doing the reads
> and writes against the disk image. currently vmd only creates a
> single taskq for all vioblk devices to use. when the guest posts
> a queue notify write, vmd simply task_adds the ring processing
> to the taskq. when the ring is processed, vmd updates the completion
> queue and posts an interrupt to the guest.
> 
> this diff also takes advantage of the new vaddr_mem and iovec_mem
> APIs to avoid using bounce buffers between the guest and the read/writes.
> when the guest configures a ring address, vaddr_mem is used to get
> direct access to the ring in vmd's address space. reads and writes
> by the guest use iovec_mem to fill out an iovec array, which is
> then passed directly to preadv and pwritev.
> 
> because ring processing and io are now performed on a different
> thread to the guest's vcpu, memory ordering becomes a consideration.
> this also adds the use of membar_consumer and membar_producer
> from sys/atomic.h to ensure the ring updates become visible to
> the guest in the correct order.
> 
> i would appreciate tests, particularly with vmd on i386 so i can
> know if the atomic api is available there.
> 

vmd on i386 needs to be brought up to date again with amd64. Nothing
special, just need to spend a few hours diffing trees. For now though,
let's not let that block moving forward here.

> the vm.c chunk is a result of testing with seabios, which likes to
> place the ring at the end of the physical address space.
> 
> ok?
> 
> Index: Makefile
> ===
> RCS file: /cvs/src/usr.sbin/vmd/Makefile,v
> retrieving revision 1.16
> diff -u -p -r1.16 Makefile
> --- Makefile  3 Jul 2017 22:21:47 -   1.16
> +++ Makefile  18 Sep 2017 00:39:00 -
> @@ -7,6 +7,7 @@ SRCS= vmd.c control.c log.c priv.c proc
>  SRCS+=   vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
>  SRCS+=   ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c packet.c
>  SRCS+=   parse.y atomicio.c
> +SRCS+=   task.c
>  
>  CFLAGS+= -Wall -I${.CURDIR}
>  CFLAGS+= -Wstrict-prototypes -Wmissing-prototypes
> Index: virtio.c
> ===
> RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
> retrieving revision 1.54
> diff -u -p -r1.54 virtio.c
> --- virtio.c  17 Sep 2017 23:07:56 -  1.54
> +++ virtio.c  18 Sep 2017 00:39:00 -
> @@ -18,6 +18,7 @@
>  
>  #include <sys/param.h>	/* PAGE_SIZE */
>  #include <sys/socket.h>
> +#include <sys/atomic.h>	/* membars */
>  
>  #include <machine/vmmvar.h>
>  #include <dev/pci/pcireg.h>
> @@ -43,14 +44,48 @@
>  #include "virtio.h"
>  #include "loadfile.h"
>  #include "atomicio.h"
> +#include "task.h"
> +
> +#ifndef MIN
> +#define MIN(_a, _b)  ((_a) < (_b) ? (_a) : (_b))
> +#endif
> +
> +#ifndef nitems
> +#define nitems(_a)   (sizeof(_a) / sizeof((_a)[0]))
> +#endif
>  
>  extern char *__progname;
>  
> +struct vioblk_queue {
> + struct vioblk_dev   *dev;
> + void*ring;
> + struct virtio_vq_info vq;
> + struct task  t;
> + struct event ev;
> +};
> +
> +struct vioblk_dev {
> + struct virtio_io_cfg cfg;
> +
> + struct vioblk_queue q[VIRTIO_MAX_QUEUES];
> +
> + int fd;
> + uint64_t sz;
> + uint32_t max_xfer;
> +
> + uint32_t vm_id;
> + int irq;
> +
> + uint8_t pci_id;
> +};
> +
>  struct viornd_dev viornd;
>  struct vioblk_dev *vioblk;
>  struct vionet_dev *vionet;
>  struct vmmci_dev vmmci;
>  
> +struct taskq *iotq;
> +
>  int nr_vionet;
>  int nr_vioblk;
>  
> @@ -62,13 +97,12 @@ int nr_vioblk;
>  #define VMMCI_F_ACK  (1<<1)
>  #define VMMCI_F_SYNCRTC  (1<<2)
>  
> -struct ioinfo {
> - uint8_t *buf;
> - ssize_t len;
> - off_t offset;
> - int fd;
> - int error;
> -};
> +int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
> +int vioblk_dump(int);
> +int vioblk_restore(int, struct vm_create_params *, int *);
> +void vioblk_update_qs(struct vioblk_dev *);
> +void vioblk_update_qa(struct vioblk_dev *);
> +int vioblk_notifyq(struct vioblk_dev *);
>  
>  const char *
>  vioblk_cmd_name(uint32_t type)
> @@ -85,6 +119,7 @@ vioblk_cmd_name(uint32_t type)
>   }
>  }
>  
> +#if 0
>  static void
>  dump_descriptor_chain(struct vring_desc *desc, int16_t dxx)
>  {
> @@ -108,6 +143,7 @@ dump_descriptor_chain(struct vring_desc 
>   desc[dxx].flags,
>   desc[dxx].next);
>  }
> +#endif
>  
>  static const char *
>  virtio_reg_name(uint8_t reg)
> @@ -323,7 +359,10 @@ vioblk_update_qa(struct vioblk_dev *dev)
>   

move vmd vioblk ring processing to another thread

2017-09-17 Thread David Gwynne
vmd currently performs disk io for the guest synchronously in the
same thread the guest is running in. it also uses bounce buffers
between the guest's "physical" memory and the reads and writes
against the disk image.
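
for contrast, the bounce-buffer pattern this diff removes looks
roughly like the sketch below. it is reconstructed from the removed
struct ioinfo in the diff; write_mem() is vmd's guest-memory copy
helper (signature assumed) and ioinfo_read() is an illustrative name.

#include <sys/types.h>
#include <unistd.h>

static int
ioinfo_read(struct ioinfo *io, paddr_t gpa)
{
	/* first copy: disk image into vmd's private buffer */
	if (pread(io->fd, io->buf, io->len, io->offset) != io->len)
		return (-1);

	/* second copy: bounce the buffer into guest "physical" memory */
	if (write_mem(gpa, io->buf, io->len))
		return (-1);

	return (0);
}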

this diff moves the vioblk ring processing into a taskq, ie, another
thread. this allows the guest to run while vmd is doing the reads
and writes against the disk image. currently vmd only creates a
single taskq for all vioblk devices to use. when the guest posts
a queue notify write, vmd simply task_adds the ring processing
to the taskq. when the ring is processed, vmd updates the completion
queue and posts an interrupt to the guest.
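
to make that concrete, a minimal sketch of the notify path, assuming
the userland task api mirrors the kernel's task_set()/task_add().
iotq and struct vioblk_queue come from the diff below; vioblk_run(),
vioblk_notify(), and vioblk_process_ring() are illustrative names,
and vcpu_assert_pic_irq() is vmd's interrupt assertion helper.

#include "task.h"

static void
vioblk_run(void *arg)
{
	struct vioblk_queue *q = arg;

	/* on the iotq thread: walk the avail ring, do the io,
	 * fill the used ring, then interrupt the guest */
	vioblk_process_ring(q);
	vcpu_assert_pic_irq(q->dev->vm_id, 0, q->dev->irq);
}

static void
vioblk_notify(struct vioblk_dev *dev, uint16_t queue)
{
	/* the vcpu thread returns to the guest straight away */
	task_add(iotq, &dev->q[queue].t);
}

task_set(&q->t, vioblk_run, q) would be done once, when the queue is
set up.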

this diff also takes advantage of the new vaddr_mem and iovec_mem
APIs to avoid using bounce buffers between the guest and the read/writes.
when the guest configures a ring address, vaddr_mem is used to get
direct access to the ring in vmd's address space. reads and writes
by the guest use iovec_mem to fill out an iovec array, which is
then passed directly to preadv and pwritev.
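
a sketch of the resulting read path. the exact iovec_mem() signature
is an assumption here: it is taken to translate a guest physical
range into iovec entries over vmd's mapping of guest memory,
returning the entry count or -1 for an unmapped range. the segment
limit is illustrative.

#include <sys/types.h>
#include <sys/uio.h>
#include <unistd.h>

static ssize_t
vioblk_pread(struct vioblk_dev *dev, paddr_t gpa, size_t len, off_t off)
{
	struct iovec iov[16];
	int niov;

	niov = iovec_mem(gpa, len, iov, 16);
	if (niov == -1)
		return (-1);

	/* the disk image is read straight into guest memory */
	return (preadv(dev->fd, iov, niov, off));
}

writes are symmetric with pwritev().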

because ring processing and io are now performed on a different
thread to the guest's vcpu, memory ordering becomes a consideration.
this also adds the use of membar_consumer and membar_producer
from sys/atomic.h to ensure the ring updates become visible to
the guest in the correct order.
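
for illustration, the producer side of that ordering when completing
a request. struct vring_used is the standard vring layout from
dev/pv/virtioreg.h; VIOBLK_QUEUE_SIZE is vmd's queue size constant
and vioblk_publish() is an illustrative name.

#include <sys/atomic.h>

static void
vioblk_publish(struct vring_used *used, uint32_t id, uint32_t len)
{
	uint16_t idx = used->idx;

	/* fill in the used element before publishing it */
	used->ring[idx & (VIOBLK_QUEUE_SIZE - 1)].id = id;
	used->ring[idx & (VIOBLK_QUEUE_SIZE - 1)].len = len;

	/* order the element stores before the index store */
	membar_producer();

	/* the guest may now observe a complete entry */
	used->idx = idx + 1;
}

the consumer side is symmetric: read avail->idx, membar_consumer(),
then read the ring entries behind it.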

i would appreciate tests, particularly with vmd on i386 so i can
know if the atomic api is available there.

the vm.c chunk is a result of testing with seabios, which likes to
place the ring at the end of the physical address space.

ok?

Index: Makefile
===
RCS file: /cvs/src/usr.sbin/vmd/Makefile,v
retrieving revision 1.16
diff -u -p -r1.16 Makefile
--- Makefile3 Jul 2017 22:21:47 -   1.16
+++ Makefile18 Sep 2017 00:39:00 -
@@ -7,6 +7,7 @@ SRCS=   vmd.c control.c log.c priv.c proc
 SRCS+= vm.c loadfile_elf.c pci.c virtio.c i8259.c mc146818.c
 SRCS+= ns8250.c i8253.c vmboot.c ufs.c disklabel.c dhcp.c packet.c
 SRCS+= parse.y atomicio.c
+SRCS+= task.c
 
 CFLAGS+=   -Wall -I${.CURDIR}
 CFLAGS+=   -Wstrict-prototypes -Wmissing-prototypes
Index: virtio.c
===
RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v
retrieving revision 1.54
diff -u -p -r1.54 virtio.c
--- virtio.c17 Sep 2017 23:07:56 -  1.54
+++ virtio.c18 Sep 2017 00:39:00 -
@@ -18,6 +18,7 @@
 
 #include <sys/param.h>	/* PAGE_SIZE */
 #include <sys/socket.h>
+#include <sys/atomic.h>	/* membars */
 
 #include <machine/vmmvar.h>
 #include <dev/pci/pcireg.h>
@@ -43,14 +44,48 @@
 #include "virtio.h"
 #include "loadfile.h"
 #include "atomicio.h"
+#include "task.h"
+
+#ifndef MIN
+#define MIN(_a, _b)((_a) < (_b) ? (_a) : (_b))
+#endif
+
+#ifndef nitems
+#define nitems(_a) (sizeof(_a) / sizeof((_a)[0]))
+#endif
 
 extern char *__progname;
 
+struct vioblk_queue {
+   struct vioblk_dev   *dev;
+   void*ring;
+   struct virtio_vq_info vq;
+   struct task  t;
+   struct event ev;
+};
+
+struct vioblk_dev {
+   struct virtio_io_cfg cfg;
+
+   struct vioblk_queue q[VIRTIO_MAX_QUEUES];
+
+   int fd;
+   uint64_t sz;
+   uint32_t max_xfer;
+
+   uint32_t vm_id;
+   int irq;
+
+   uint8_t pci_id;
+};
+
 struct viornd_dev viornd;
 struct vioblk_dev *vioblk;
 struct vionet_dev *vionet;
 struct vmmci_dev vmmci;
 
+struct taskq *iotq;
+
 int nr_vionet;
 int nr_vioblk;
 
@@ -62,13 +97,12 @@ int nr_vioblk;
 #define VMMCI_F_ACK(1<<1)
 #define VMMCI_F_SYNCRTC(1<<2)
 
-struct ioinfo {
-   uint8_t *buf;
-   ssize_t len;
-   off_t offset;
-   int fd;
-   int error;
-};
+int virtio_blk_io(int, uint16_t, uint32_t *, uint8_t *, void *, uint8_t);
+int vioblk_dump(int);
+int vioblk_restore(int, struct vm_create_params *, int *);
+void vioblk_update_qs(struct vioblk_dev *);
+void vioblk_update_qa(struct vioblk_dev *);
+int vioblk_notifyq(struct vioblk_dev *);
 
 const char *
 vioblk_cmd_name(uint32_t type)
@@ -85,6 +119,7 @@ vioblk_cmd_name(uint32_t type)
}
 }
 
+#if 0
 static void
 dump_descriptor_chain(struct vring_desc *desc, int16_t dxx)
 {
@@ -108,6 +143,7 @@ dump_descriptor_chain(struct vring_desc 
desc[dxx].flags,
desc[dxx].next);
 }
+#endif
 
 static const char *
 virtio_reg_name(uint8_t reg)
@@ -323,7 +359,10 @@ vioblk_update_qa(struct vioblk_dev *dev)
if (dev->cfg.queue_select > 0)
return;
 
-   dev->vq[dev->cfg.queue_select].qa = dev->cfg.queue_address;
+   dev->q[dev->cfg.queue_select].vq.qa = dev->cfg.queue_address;
+   dev->q[dev->cfg.queue_select].ring = vaddr_mem(
+   dev->cfg.queue_address * VIRTIO_PAGE_SIZE,
+   vring_size(VIOBLK_QUEUE_SIZE));
 }
 
 void
@@ -336,375 +375,184 @@ vioblk_update_qs(struct vioblk_dev *dev)
}
 
/* Update queue address/size based on queue select */
-   dev->cf