This splits the read/write functions into top and bottom halves. It doesn't change much yet, but this is a requirement for async IO. The start funtion turns the request into an ioinfo (to be completed eventually by a thread) and the finish function retrives the result. (for now, we just do the work in finish.)
seems to work, but could probably use a little more load testing. Index: virtio.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/virtio.c,v retrieving revision 1.47 diff -u -p -r1.47 virtio.c --- virtio.c 27 May 2017 23:58:16 -0000 1.47 +++ virtio.c 30 May 2017 10:56:47 -0000 @@ -62,6 +62,14 @@ int nr_vioblk; #define VMMCI_F_ACK (1<<1) #define VMMCI_F_SYNCRTC (1<<2) +struct ioinfo { + uint8_t *buf; + ssize_t len; + off_t offset; + int fd; + int error; +}; + const char * vioblk_cmd_name(uint32_t type) { @@ -324,35 +332,87 @@ vioblk_update_qs(struct vioblk_dev *dev) dev->cfg.queue_size = dev->vq[dev->cfg.queue_select].qs; } -static char * -vioblk_do_read(struct vioblk_dev *dev, off_t sector, ssize_t sz) +static void +vioblk_free_info(struct ioinfo *info) { - char *buf; + if (!info) + return; + free(info->buf); + free(info); +} - buf = malloc(sz); - if (buf == NULL) { - log_warn("malloc errror vioblk read"); - return (NULL); - } +static struct ioinfo * +vioblk_start_read(struct vioblk_dev *dev, off_t sector, ssize_t sz) +{ + struct ioinfo *info; + + info = calloc(1, sizeof(*info)); + if (!info) + goto nomem; + info->buf = malloc(sz); + if (info->buf == NULL) + goto nomem; + info->len = sz; + info->offset = sector * VIRTIO_BLK_SECTOR_SIZE; + info->fd = dev->fd; + + return info; - if (pread(dev->fd, buf, sz, sector * VIRTIO_BLK_SECTOR_SIZE) != sz) { +nomem: + free(info); + log_warn("malloc errror vioblk read"); + return (NULL); +} + + +static const uint8_t * +vioblk_finish_read(struct ioinfo *info) +{ + if (pread(info->fd, info->buf, info->len, info->offset) != info->len) { + info->error = errno; log_warn("vioblk read error"); - free(buf); - return (NULL); + return NULL; } - return buf; + return info->buf; +} + +static struct ioinfo * +vioblk_start_write(struct vioblk_dev *dev, off_t sector, paddr_t addr, size_t len) +{ + struct ioinfo *info; + + info = calloc(1, sizeof(*info)); + if (!info) + goto nomem; + info->buf = malloc(len); + if (info->buf == NULL) + goto nomem; + info->len = len; + info->offset = sector * VIRTIO_BLK_SECTOR_SIZE; + info->fd = dev->fd; + + if (read_mem(addr, info->buf, len)) { + vioblk_free_info(info); + return NULL; + } + + return info; + +nomem: + free(info); + log_warn("malloc errror vioblk write"); + return (NULL); } static int -vioblk_do_write(struct vioblk_dev *dev, off_t sector, char *buf, ssize_t sz) +vioblk_finish_write(struct ioinfo *info) { - if (pwrite(dev->fd, buf, sz, sector * VIRTIO_BLK_SECTOR_SIZE) != sz) { + if (pwrite(info->fd, info->buf, info->len, info->offset) != info->len) { log_warn("vioblk write error"); - return (1); + return EIO; } - - return (0); + return 0; } /* @@ -368,7 +428,7 @@ vioblk_notifyq(struct vioblk_dev *dev) uint8_t ds; int ret; off_t secbias; - char *vr, *secdata; + char *vr; struct vring_desc *desc, *cmd_desc, *secdata_desc, *ds_desc; struct vring_avail *avail; struct vring_used *used; @@ -441,14 +501,16 @@ vioblk_notifyq(struct vioblk_dev *dev) secbias = 0; do { - /* read the data (use current data descriptor) */ - /* - * XXX waste to malloc secdata in vioblk_do_read - * and free it here over and over - */ - secdata = vioblk_do_read(dev, cmd.sector + secbias, + struct ioinfo *info; + const uint8_t *secdata; + + info = vioblk_start_read(dev, cmd.sector + secbias, (ssize_t)secdata_desc->len); + + /* read the data (use current data descriptor) */ + secdata = vioblk_finish_read(info); if (secdata == NULL) { + vioblk_free_info(info); log_warnx("vioblk: block read error, " "sector %lld", cmd.sector); goto out; @@ -460,11 +522,11 @@ vioblk_notifyq(struct vioblk_dev *dev) "data to gpa @ 0x%llx", secdata_desc->addr); dump_descriptor_chain(desc, cmd_desc_idx); - free(secdata); + vioblk_free_info(info); goto out; } - free(secdata); + vioblk_free_info(info); secbias += (secdata_desc->len / VIRTIO_BLK_SECTOR_SIZE); secdata_desc_idx = secdata_desc->next & @@ -514,35 +576,31 @@ vioblk_notifyq(struct vioblk_dev *dev) goto out; } - secdata = NULL; secbias = 0; do { - secdata = realloc(secdata, secdata_desc->len); - if (secdata == NULL) { - log_warn("wr vioblk: malloc error, " - "len %d", secdata_desc->len); - goto out; - } + struct ioinfo *info; - if (read_mem(secdata_desc->addr, secdata, - secdata_desc->len)) { + info = vioblk_start_write(dev, cmd.sector + secbias, + secdata_desc->addr, secdata_desc->len); + + if (info == NULL) { log_warnx("wr vioblk: can't read " "sector data @ 0x%llx", secdata_desc->addr); dump_descriptor_chain(desc, cmd_desc_idx); - free(secdata); goto out; } - if (vioblk_do_write(dev, cmd.sector + secbias, - secdata, (ssize_t)secdata_desc->len)) { + if (vioblk_finish_write(info)) { log_warnx("wr vioblk: disk write " "error"); - free(secdata); + vioblk_free_info(info); goto out; } + vioblk_free_info(info); + secbias += secdata_desc->len / VIRTIO_BLK_SECTOR_SIZE; @@ -550,8 +608,6 @@ vioblk_notifyq(struct vioblk_dev *dev) VIOBLK_QUEUE_MASK; secdata_desc = &desc[secdata_desc_idx]; } while (secdata_desc->flags & VRING_DESC_F_NEXT); - - free(secdata); ds_desc_idx = secdata_desc_idx; ds_desc = secdata_desc; Index: vm.c =================================================================== RCS file: /cvs/src/usr.sbin/vmd/vm.c,v retrieving revision 1.18 diff -u -p -r1.18 vm.c --- vm.c 28 May 2017 23:56:13 -0000 1.18 +++ vm.c 30 May 2017 10:54:53 -0000 @@ -1230,9 +1230,10 @@ find_gpa_range(struct vm_create_params * * exist in the guest. */ int -write_mem(paddr_t dst, void *buf, size_t len) +write_mem(paddr_t dst, const void *buf, size_t len) { - char *from = buf, *to; + const char *from = buf; + char *to; size_t n, off; struct vm_mem_range *vmr; Index: vmd.h =================================================================== RCS file: /cvs/src/usr.sbin/vmd/vmd.h,v retrieving revision 1.54 diff -u -p -r1.54 vmd.h --- vmd.h 4 May 2017 19:41:58 -0000 1.54 +++ vmd.h 30 May 2017 10:55:05 -0000 @@ -289,7 +289,7 @@ uint32_t vm_priv_addr(struct address *, /* vmm.c */ void vmm(struct privsep *, struct privsep_proc *); void vmm_shutdown(void); -int write_mem(paddr_t, void *buf, size_t); +int write_mem(paddr_t, const void *buf, size_t); int read_mem(paddr_t, void *buf, size_t); int opentap(char *); int fd_hasdata(int);