mbuhl@ found an issue where the emulated virtio block device can hang. The tl;dr: the emulated pic never injects an interrupt and the vioblk(4) driver in the guest starves, waiting to be told to check the virtqueue.
Flipping the bit in the i8259 using gdb causes the spice to flow once again. This diff fixes two related things (so could be committed in 2 parts): 1. the irq deassert on a virtio pci interrupt status register read should occur synchronously by the vcpu thread in the vm and not async. 2. always raise the irr bit in the pic, regardless of mask. The mask is used when finding an irq to ack and shouldn't be used to determine if the irr bit is raised. AFAICT from the old i8259 data sheets, masking should have no effect on if the irr bit is set. This is holding up another diff I want to share that reduces latency in interrupts, but it's shown to make this i8259 race condition worse, so I'd like to give folks a few days to check if the below diff causes regressions. Once this is committed, I'll feel safe sharing the latency diff with tech@. Any ok's pending a few days for folks to test? -dv diff refs/heads/master cdba90a89ee6e77b52c2b6955d36a260a23a3b85 commit - ad1cd1152fddbf55189657a2df9f2468409698ab commit + cdba90a89ee6e77b52c2b6955d36a260a23a3b85 blob - f7862f5e9d17f96f5260358271fab8f253b26505 blob + b6891c52c824d7b8c69e67e5323772152b1ed844 --- usr.sbin/vmd/i8259.c +++ usr.sbin/vmd/i8259.c @@ -222,20 +222,16 @@ i8259_assert_irq(uint8_t irq) { mutex_lock(&pic_mtx); if (irq <= 7) { - if (!ISSET(pics[MASTER].imr, 1 << irq)) { - SET(pics[MASTER].irr, 1 << irq); - pics[MASTER].asserted = 1; - } + SET(pics[MASTER].irr, 1 << irq); + pics[MASTER].asserted = 1; } else { irq -= 8; - if (!ISSET(pics[SLAVE].imr, 1 << irq)) { - SET(pics[SLAVE].irr, 1 << irq); - pics[SLAVE].asserted = 1; + SET(pics[SLAVE].irr, 1 << irq); + pics[SLAVE].asserted = 1; - /* Assert cascade IRQ on master PIC */ - SET(pics[MASTER].irr, 1 << 2); - pics[MASTER].asserted = 1; - } + /* Assert cascade IRQ on master PIC */ + SET(pics[MASTER].irr, 1 << 2); + pics[MASTER].asserted = 1; } mutex_unlock(&pic_mtx); } blob - 7bc76c4daed427dae82688452ec21985be679bc4 blob + 4d321fd4ff23514ffa7a4bee0eeb7a9324020d80 --- usr.sbin/vmd/vioblk.c +++ usr.sbin/vmd/vioblk.c @@ -39,7 +39,8 @@ static uint32_t handle_io_read(struct viodev_msg *, st extern struct vmd_vm *current_vm; static const char *disk_type(int); -static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *); +static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *, + int8_t *); static int handle_io_write(struct viodev_msg *, struct virtio_dev *); void vioblk_notify_rx(struct vioblk_dev *); int vioblk_notifyq(struct vioblk_dev *); @@ -723,6 +724,7 @@ handle_sync_io(int fd, short event, void *arg) struct viodev_msg msg; struct imsg imsg; ssize_t n; + int8_t intr = INTR_STATE_NOOP; if (event & EV_READ) { if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) @@ -770,8 +772,9 @@ handle_sync_io(int fd, short event, void *arg) } case VIODEV_MSG_IO_READ: /* Read IO: make sure to send a reply */ - msg.data = handle_io_read(&msg, dev); + msg.data = handle_io_read(&msg, dev, &intr); msg.data_valid = 1; + msg.state = intr; imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg, sizeof(msg)); break; @@ -844,7 +847,7 @@ handle_io_read(struct viodev_msg *msg, struct virtio_d } static uint32_t -handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev) +handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev, int8_t *intr) { struct vioblk_dev *vioblk = &dev->vioblk; uint8_t sz = msg->io_sz; @@ -1001,7 +1004,8 @@ handle_io_read(struct viodev_msg *msg, struct virtio_d case VIRTIO_CONFIG_ISR_STATUS: data = vioblk->cfg.isr_status; vioblk->cfg.isr_status = 0; - virtio_deassert_pic_irq(dev, 0); + if (intr != NULL) + *intr = INTR_STATE_DEASSERT; break; default: return (0xFFFFFFFF); blob - c16ad2635ea622fd7fce48b5145e2199dd451346 blob + 5c2a943ac7ad02dfbc4793f5caab3200a3ced803 --- usr.sbin/vmd/vionet.c +++ usr.sbin/vmd/vionet.c @@ -52,7 +52,8 @@ static uint32_t handle_io_read(struct viodev_msg *, st static int vionet_rx(struct vionet_dev *); static void vionet_rx_event(int, short, void *); -static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *); +static uint32_t handle_io_read(struct viodev_msg *, struct virtio_dev *, + int8_t *); static int handle_io_write(struct viodev_msg *, struct virtio_dev *); void vionet_notify_rx(struct virtio_dev *); int vionet_notifyq(struct virtio_dev *); @@ -757,6 +758,7 @@ handle_sync_io(int fd, short event, void *arg) struct viodev_msg msg; struct imsg imsg; ssize_t n; + int8_t intr = INTR_STATE_NOOP; if (event & EV_READ) { if ((n = imsg_read(ibuf)) == -1 && errno != EAGAIN) @@ -804,8 +806,9 @@ handle_sync_io(int fd, short event, void *arg) } case VIODEV_MSG_IO_READ: /* Read IO: make sure to send a reply */ - msg.data = handle_io_read(&msg, dev); + msg.data = handle_io_read(&msg, dev, &intr); msg.data_valid = 1; + msg.state = intr; imsg_compose_event(iev, IMSG_DEVOP_MSG, 0, 0, -1, &msg, sizeof(msg)); break; @@ -891,7 +894,7 @@ handle_io_read(struct viodev_msg *msg, struct virtio_d } static uint32_t -handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev) +handle_io_read(struct viodev_msg *msg, struct virtio_dev *dev, int8_t *intr) { struct vionet_dev *vionet = &dev->vionet; uint32_t data; @@ -930,7 +933,8 @@ handle_io_read(struct viodev_msg *msg, struct virtio_d case VIRTIO_CONFIG_ISR_STATUS: data = vionet->cfg.isr_status; vionet->cfg.isr_status = 0; - virtio_deassert_pic_irq(dev, 0); + if (intr != NULL) + *intr = INTR_STATE_DEASSERT; break; default: return (0xFFFFFFFF);