Hi Eugenio, Regarding the point about not enabling VIRTIO_NET_F_NOTF_COAL unconditionally: since feature negotiation happens during VM bootup and coalescing is configured via ethtool after the VM has already booted, how would we handle this under a conditional property?
If the feature isn't negotiated at the start, we wouldn't be able to enable it dynamically later. Could you clarify how you'd like the command-line regulation to interact with the guest's ability to enable it later? ------------------------------------------------------------ > @@ -37,6 +37,7 @@ bool vhost_svq_valid_features(uint64_t features, Error **errp) > case VIRTIO_RING_F_INDIRECT_DESC: > continue; > > + case VIRTIO_F_RING_RESET: This patch does not handle VIRTIO_F_RING_RESET, so this should not be enabled. => I included VIRTIO_F_RING_RESET in the valid features list because the vDPA host device offers it by default; without it, SVQ fails the feature negotiation check during initialization. qemu-system-x86_64: -netdev type=vhost-vdpa,vhostdev=/dev/vhost-vdpa-0,id=vhost-vdpa0,x-svq=on: SVQ Invalid device feature flags, offer: 0x1e0010330bfffa7, ok: 0x1e0000330bfffa7 Regards, Koushik Dutta On Thu, Mar 12, 2026 at 3:47 PM Eugenio Perez Martin <[email protected]> wrote: > On Wed, Mar 11, 2026 at 2:28 PM Koushik Dutta <[email protected]> wrote: > > > > Implement VirtIO Network Notification Coalescing (Bit 53). > > This allows the guest to manage interrupt frequency using ethtool > > -C for both RX and TX paths. > > > > - Added VIRTIO_NET_F_NOTF_COAL to host features. > > - Implemented VIRTIO_NET_CTRL_NOTF_COAL class handling in > > virtio_net_handle_ctrl_iov. > > - Added logic to store and apply rx/tx usecs and max_packets. > > - Added packet counters and threshold logic for both RX and TX data > paths. > > - Dynamic Dispatcher: Implemented a dispatcher mechanism that > > dynamically switches/activates the notification callback logic > > only after the guest enables TX coalescing via ethtool. > > > > This reduces interrupt overhead by batching notifications based on > > either a packet count or a time-based threshold. > > > > Signed-off-by: Koushik Dutta <[email protected]> > > --- > > hw/net/virtio-net.c | 114 ++++++++++++++++++++++++----- > > hw/virtio/vhost-shadow-virtqueue.c | 1 + > > hw/virtio/vhost-vdpa.c | 3 + > > include/hw/virtio/virtio-net.h | 8 ++ > > net/vhost-vdpa.c | 10 +++ > > 5 files changed, 119 insertions(+), 17 deletions(-) > > > > diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c > > index eccb48ad42..96aa70aea8 100644 > > --- a/hw/net/virtio-net.c > > +++ b/hw/net/virtio-net.c > > @@ -157,6 +157,15 @@ static void > flush_or_purge_queued_packets(NetClientState *nc) > > * - we could suppress RX interrupt if we were so inclined. > > */ > > > > +static void virtio_net_rx_notify(void *opaque) > > +{ > > + VirtIONet *n = opaque; > > + VirtIODevice *vdev = VIRTIO_DEVICE(n); > > + > > + n->rx_pkt_cnt = 0; > > + virtio_notify(vdev, n->vqs[0].rx_vq); > > +} > > + > > static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config) > > { > > VirtIONet *n = VIRTIO_NET(vdev); > > @@ -1081,6 +1090,29 @@ static int virtio_net_handle_offloads(VirtIONet > *n, uint8_t cmd, > > } > > } > > > > +static int virtio_net_handle_coal(VirtIONet *n, uint8_t cmd, > > + struct iovec *iov, unsigned int > iov_cnt) > > +{ > > + struct virtio_net_ctrl_coal coal; > > + size_t s; > > + > > + s = iov_to_buf(iov, iov_cnt, 0, &coal, sizeof(coal)); > > + if (s != sizeof(coal)) { > > + return VIRTIO_NET_ERR; > > + } > > + > > + if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_RX_SET) { > > + n->rx_coal_usecs = le32_to_cpu(coal.max_usecs); > > + n->rx_coal_packets = le32_to_cpu(coal.max_packets); > > + } else if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_TX_SET) { > > + n->tx_coal_usecs = le32_to_cpu(coal.max_usecs); > > + n->tx_coal_packets = le32_to_cpu(coal.max_packets); > > + n->tx_timeout = n->tx_coal_usecs * 1000; > > + } > > + > > + return VIRTIO_NET_OK; > > +} > > + > > static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, > > struct iovec *iov, unsigned int > iov_cnt) > > { > > @@ -1582,6 +1614,8 @@ size_t virtio_net_handle_ctrl_iov(VirtIODevice > *vdev, > > status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num); > > } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) { > > status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num); > > + } else if (ctrl.class == VIRTIO_NET_CTRL_NOTF_COAL) { > > + status = virtio_net_handle_coal(n, ctrl.cmd, iov, out_num); > > } > > > > s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status)); > > @@ -2041,7 +2075,18 @@ static ssize_t > virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, > > } > > > > virtqueue_flush(q->rx_vq, i); > > - virtio_notify(vdev, q->rx_vq); > > + > > + /* rx coalescing */ > > + n->rx_pkt_cnt += i; > > + if (n->rx_coal_usecs == 0 || n->rx_pkt_cnt >= n->rx_coal_packets) { > > + timer_del(n->rx_index_timer); > > + virtio_net_rx_notify(n); > > + } else { > > + if (n->rx_pkt_cnt == i) { > > Maybe it's better to check if !timer_pending? It wasn't immediately > clear to me that `n->rx_pkt_cnt == i` until I realized that > `n->rx_pkt_cnt` is 0 at the beginning of the function for that > condition to be true. > > > + timer_mod(n->rx_index_timer, > > + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + > n->rx_coal_usecs); > > + } > > + } > > > > return size; > > > > @@ -2900,6 +2945,7 @@ static void virtio_net_tx_timer(void *opaque) > > if (ret == -EBUSY || ret == -EINVAL) { > > return; > > } > > + n->tx_pkt_cnt -= ret; > > /* > > * If we flush a full burst of packets, assume there are > > * more coming and immediately rearm > > @@ -2919,6 +2965,7 @@ static void virtio_net_tx_timer(void *opaque) > > ret = virtio_net_flush_tx(q); > > if (ret > 0) { > > virtio_queue_set_notification(q->tx_vq, 0); > > + n->tx_pkt_cnt -= ret; > > If we got his way, we must control for underflow here. > > But I think it should be n->pt_pkt_cnt = 0, isn't it? > > > q->tx_waiting = 1; > > timer_mod(q->tx_timer, > > qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + > n->tx_timeout); > > @@ -2974,27 +3021,45 @@ static void virtio_net_tx_bh(void *opaque) > > } > > } > > > > -static void virtio_net_add_queue(VirtIONet *n, int index) > > +static void virtio_net_handle_tx_dispatch(VirtIODevice *vdev, VirtQueue > *vq) > > { > > - VirtIODevice *vdev = VIRTIO_DEVICE(n); > > + VirtIONet *n = VIRTIO_NET(vdev); > > > > - n->vqs[index].rx_vq = virtio_add_queue(vdev, > n->net_conf.rx_queue_size, > > - virtio_net_handle_rx); > > + n->tx_pkt_cnt++; > > + bool use_timer = n->tx_timer_activate || n->tx_coal_usecs > 0 || > > + n->tx_coal_packets > 0; > > No declarations in the middle of the function. > > Also, why use_timer = true as long as n->tx_coal_packets > 0? > > > > > - if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) { > > - n->vqs[index].tx_vq = > > - virtio_add_queue(vdev, n->net_conf.tx_queue_size, > > - virtio_net_handle_tx_timer); > > - n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, > > - virtio_net_tx_timer, > > - &n->vqs[index]); > > + if (use_timer && n->tx_pkt_cnt < n->tx_coal_packets) { > > + virtio_net_handle_tx_timer(vdev, vq); > > } else { > > - n->vqs[index].tx_vq = > > - virtio_add_queue(vdev, n->net_conf.tx_queue_size, > > - virtio_net_handle_tx_bh); > > - n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, > &n->vqs[index], > > - > &DEVICE(vdev)->mem_reentrancy_guard); > > + n->tx_pkt_cnt = 0; > > + virtio_net_handle_tx_bh(vdev, vq); > > } > > +} > > + > > +static void virtio_net_add_queue(VirtIONet *n, int index) > > +{ > > + VirtIODevice *vdev = VIRTIO_DEVICE(n); > > + > > + n->vqs[index].rx_vq = > > + virtio_add_queue(vdev, > > + n->net_conf.rx_queue_size, > > + virtio_net_handle_rx); > > + > > + n->vqs[index].tx_vq = > > + virtio_add_queue(vdev, > > + n->net_conf.tx_queue_size, > > + virtio_net_handle_tx_dispatch); > > + > > + n->vqs[index].tx_timer = > > + timer_new_ns(QEMU_CLOCK_VIRTUAL, > > + virtio_net_tx_timer, > > + &n->vqs[index]); > > We should avoid creating a new timer if it will not be used by > default. Create it only if cmdline enables it or the guest enable it > via control vq. > > > + > > + n->vqs[index].tx_bh = > > + qemu_bh_new_guarded(virtio_net_tx_bh, > > + &n->vqs[index], > > + &DEVICE(vdev)->mem_reentrancy_guard); > > > > n->vqs[index].tx_waiting = 0; > > n->vqs[index].n = n; > > @@ -3089,6 +3154,7 @@ static void virtio_net_get_features(VirtIODevice > *vdev, uint64_t *features, > > virtio_features_or(features, features, n->host_features_ex); > > > > virtio_add_feature_ex(features, VIRTIO_NET_F_MAC); > > + virtio_add_feature_ex(features, VIRTIO_NET_F_NOTF_COAL); > > We shouldn't enable VIRTIO_NET_F_NOTF_COAL unconditionally, it should > be regulated by the cmdline property. > > > > > if (!peer_has_vnet_hdr(n)) { > > virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM); > > @@ -3972,6 +4038,10 @@ static void virtio_net_device_realize(DeviceState > *dev, Error **errp) > > error_printf("Defaulting to \"bh\""); > > } > > > > + if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer")) { > > + n->tx_timer_activate = true; > > + } > > + > > n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n), > > n->net_conf.tx_queue_size); > > > > @@ -4048,6 +4118,14 @@ static void virtio_net_device_realize(DeviceState > *dev, Error **errp) > > n->rss_data.specified_hash_types.on_bits | > > n->rss_data.specified_hash_types.auto_bits; > > } > > + n->rx_pkt_cnt = 0; > > + n->tx_pkt_cnt = 0; > > + n->rx_coal_usecs = 0; > > + n->tx_coal_usecs = 0; > > + n->rx_coal_packets = 0; > > + n->tx_coal_packets = 0; > > + n->rx_index_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, > > + virtio_net_rx_notify, n); > > Same as the tx timer. Is it worth creating a new timer even if it will > not be used by default? > > > } > > > > static void virtio_net_device_unrealize(DeviceState *dev) > > @@ -4262,6 +4340,8 @@ static const Property virtio_net_properties[] = { > > VIRTIO_NET_F_GUEST_USO6, true), > > DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features, > > VIRTIO_NET_F_HOST_USO, true), > > + DEFINE_PROP_BIT64("vq_notf_coal", VirtIONet, host_features, > > + VIRTIO_NET_F_NOTF_COAL, true), > > DEFINE_PROP_ON_OFF_AUTO_BIT64("hash-ipv4", VirtIONet, > > rss_data.specified_hash_types, > > VIRTIO_NET_HASH_REPORT_IPv4 - 1, > > diff --git a/hw/virtio/vhost-shadow-virtqueue.c > b/hw/virtio/vhost-shadow-virtqueue.c > > index 6242aeb69c..104ed0ce8f 100644 > > --- a/hw/virtio/vhost-shadow-virtqueue.c > > +++ b/hw/virtio/vhost-shadow-virtqueue.c > > @@ -37,6 +37,7 @@ bool vhost_svq_valid_features(uint64_t features, Error > **errp) > > case VIRTIO_RING_F_INDIRECT_DESC: > > continue; > > > > + case VIRTIO_F_RING_RESET: > > This patch does not handle VIRTIO_F_RING_RESET, so this should not be > enabled. > > > case VIRTIO_F_ACCESS_PLATFORM: > > /* SVQ trust in the host's IOMMU to translate addresses */ > > case VIRTIO_F_VERSION_1: > > diff --git a/hw/virtio/vhost-vdpa.c b/hw/virtio/vhost-vdpa.c > > index 2f8f11df86..8eb9bdb961 100644 > > --- a/hw/virtio/vhost-vdpa.c > > +++ b/hw/virtio/vhost-vdpa.c > > @@ -1541,6 +1541,9 @@ static int vhost_vdpa_get_features(struct > vhost_dev *dev, > > if (ret == 0) { > > /* Add SVQ logging capabilities */ > > *features |= BIT_ULL(VHOST_F_LOG_ALL); > > + > > + /* Add notification coalescing features */ > > + *features |= BIT_ULL(VIRTIO_NET_F_NOTF_COAL); > > } > > > > return ret; > > diff --git a/include/hw/virtio/virtio-net.h > b/include/hw/virtio/virtio-net.h > > index 5b8ab7bda7..024501ed37 100644 > > --- a/include/hw/virtio/virtio-net.h > > +++ b/include/hw/virtio/virtio-net.h > > @@ -231,6 +231,14 @@ struct VirtIONet { > > struct EBPFRSSContext ebpf_rss; > > uint32_t nr_ebpf_rss_fds; > > char **ebpf_rss_fds; > > + QEMUTimer *rx_index_timer; > > + uint32_t rx_coal_usecs; > > + uint32_t rx_coal_packets; > > + uint32_t rx_pkt_cnt; > > + uint32_t tx_coal_usecs; > > + uint32_t tx_coal_packets; > > + uint32_t tx_pkt_cnt; > > + bool tx_timer_activate; > > }; > > > > size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev, > > diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c > > index 3df6091274..6a5d0f4cae 100644 > > --- a/net/vhost-vdpa.c > > +++ b/net/vhost-vdpa.c > > @@ -61,6 +61,7 @@ static const int vdpa_feature_bits[] = { > > VIRTIO_F_NOTIFY_ON_EMPTY, > > VIRTIO_F_RING_PACKED, > > VIRTIO_F_RING_RESET, > > + VIRTIO_F_ORDER_PLATFORM, > > VIRTIO_F_VERSION_1, > > VIRTIO_F_IN_ORDER, > > VIRTIO_F_NOTIFICATION_DATA, > > @@ -70,6 +71,8 @@ static const int vdpa_feature_bits[] = { > > VIRTIO_NET_F_CTRL_RX, > > VIRTIO_NET_F_CTRL_RX_EXTRA, > > VIRTIO_NET_F_CTRL_VLAN, > > + VIRTIO_NET_F_GUEST_ANNOUNCE, > > + VIRTIO_NET_F_NOTF_COAL, > > Only this feature can be added to vdpa_feature_bits. The rest of them > must not be added to any other array. > > > VIRTIO_NET_F_CTRL_VQ, > > VIRTIO_NET_F_GSO, > > VIRTIO_NET_F_GUEST_CSUM, > > @@ -115,6 +118,13 @@ static const uint64_t vdpa_svq_device_features = > > BIT_ULL(VIRTIO_NET_F_HOST_UFO) | > > BIT_ULL(VIRTIO_NET_F_MRG_RXBUF) | > > BIT_ULL(VIRTIO_NET_F_STATUS) | > > + BIT_ULL(VIRTIO_F_RING_RESET) | > > + BIT_ULL(VIRTIO_F_ORDER_PLATFORM) | > > + BIT_ULL(VIRTIO_NET_F_GUEST_USO4) | > > + BIT_ULL(VIRTIO_NET_F_GUEST_USO6) | > > + BIT_ULL(VIRTIO_NET_F_HOST_USO) | > > + BIT_ULL(VIRTIO_NET_F_GUEST_ANNOUNCE) | > > + BIT_ULL(VIRTIO_NET_F_NOTF_COAL) | > > BIT_ULL(VIRTIO_NET_F_CTRL_VQ) | > > BIT_ULL(VIRTIO_NET_F_GSO) | > > BIT_ULL(VIRTIO_NET_F_CTRL_RX) | > > -- > > 2.53.0 > > > >
