Implement VirtIO network notification coalescing (feature bit 53, VIRTIO_NET_F_NOTF_COAL). This allows the guest to tune interrupt frequency with ethtool -C for both the RX and TX paths.
- Added VIRTIO_NET_F_NOTF_COAL to host features. - Implemented VIRTIO_NET_CTRL_NOTF_COAL class handling in virtio_net_handle_ctrl_iov. - Added logic to store and apply rx/tx usecs and max_packets. - Added packet counters and threshold logic for both RX and TX data paths. - Dynamic Dispatcher: Implemented a dispatcher mechanism that dynamically switches/activates the notification callback logic only after the guest enables TX coalescing via ethtool. - Live migration: the coalescing parameters are migrated and persist in the destination VM. This reduces interrupt overhead by batching notifications based on either a packet count or a time-based threshold. Signed-off-by: Koushik Dutta <[email protected]> --- v7 changes: - Fixed time unit consistency: TX path now uses microseconds matching RX - Changed timer_new_ns to timer_new_us for TX timers - Changed qemu_clock_get_ns to qemu_clock_get_us in TX path - Convert txtimer from nanoseconds to microseconds (txtimer / 1000) - Removed unused rx_index_timer variable from struct and initialization - Fixed feature flag handling: removed conditional check in ctrl handler --- hw/net/virtio-net.c | 193 +++++++++++++++++++++++++++++---- include/hw/virtio/virtio-net.h | 9 +- net/passt.c | 1 + net/tap.c | 1 + net/vhost-user.c | 1 + net/vhost-vdpa.c | 1 + 6 files changed, 183 insertions(+), 23 deletions(-) diff --git a/hw/net/virtio-net.c b/hw/net/virtio-net.c index 2a5d642a64..5a98b52ded 100644 --- a/hw/net/virtio-net.c +++ b/hw/net/virtio-net.c @@ -157,6 +157,16 @@ static void flush_or_purge_queued_packets(NetClientState *nc) * - we could suppress RX interrupt if we were so inclined. 
*/ +static void virtio_net_rx_notify(void *opaque) +{ + VirtIONetQueue *q = opaque; + VirtIONet *n = q->n; + VirtIODevice *vdev = VIRTIO_DEVICE(n); + + n->rx_pkt_cnt = 0; + virtio_notify(vdev, q->rx_vq); +} + static void virtio_net_get_config(VirtIODevice *vdev, uint8_t *config) { VirtIONet *n = VIRTIO_NET(vdev); @@ -435,7 +445,7 @@ static int virtio_net_set_status(struct VirtIODevice *vdev, uint8_t status) if (queue_started) { if (q->tx_timer) { timer_mod(q->tx_timer, - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout); + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs); } else { replay_bh_schedule_event(q->tx_bh); } @@ -1080,6 +1090,52 @@ static int virtio_net_handle_offloads(VirtIONet *n, uint8_t cmd, } } +static void virtio_net_tx_timer(void *opaque); + +static int virtio_net_handle_coal(VirtIONet *n, uint8_t cmd, + struct iovec *iov, unsigned int iov_cnt) +{ + struct virtio_net_ctrl_coal coal; + VirtIONetQueue *q; + size_t s; + int i; + + s = iov_to_buf(iov, iov_cnt, 0, &coal, sizeof(coal)); + if (s != sizeof(coal)) { + return VIRTIO_NET_ERR; + } + + if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_RX_SET) { + n->rx_coal_usecs = le32_to_cpu(coal.max_usecs); + n->rx_coal_packets = le32_to_cpu(coal.max_packets); + if (n->rx_coal_usecs > 0) { + for (i = 0; i < n->max_queue_pairs; i++) { + q = &n->vqs[i]; + if (!q->rx_timer) { + q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, + virtio_net_rx_notify, + q); + } + } + } + } else if (cmd == VIRTIO_NET_CTRL_NOTF_COAL_TX_SET) { + n->tx_coal_usecs = le32_to_cpu(coal.max_usecs); + n->tx_coal_packets = le32_to_cpu(coal.max_packets); + if (n->tx_coal_usecs > 0) { + for (i = 0; i < n->max_queue_pairs; i++) { + q = &n->vqs[i]; + if (!q->tx_timer && n->tx_coal_usecs > 0) { + q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, + virtio_net_tx_timer, + q); + } + } + } + } + + return VIRTIO_NET_OK; +} + static int virtio_net_handle_mac(VirtIONet *n, uint8_t cmd, struct iovec *iov, unsigned int iov_cnt) { @@ -1581,6 +1637,8 @@ size_t 
virtio_net_handle_ctrl_iov(VirtIODevice *vdev, status = virtio_net_handle_mq(n, ctrl.cmd, iov, out_num); } else if (ctrl.class == VIRTIO_NET_CTRL_GUEST_OFFLOADS) { status = virtio_net_handle_offloads(n, ctrl.cmd, iov, out_num); + } else if (ctrl.class == VIRTIO_NET_CTRL_NOTF_COAL) { + status = virtio_net_handle_coal(n, ctrl.cmd, iov, out_num); } s = iov_from_buf(in_sg, in_num, 0, &status, sizeof(status)); @@ -2040,7 +2098,22 @@ static ssize_t virtio_net_receive_rcu(NetClientState *nc, const uint8_t *buf, } virtqueue_flush(q->rx_vq, i); - virtio_notify(vdev, q->rx_vq); + + /* rx coalescing */ + n->rx_pkt_cnt += i; + if (n->rx_coal_usecs == 0 || n->rx_pkt_cnt >= n->rx_coal_packets) { + if (q->rx_timer) { + timer_del(q->rx_timer); + } + virtio_net_rx_notify(q); + } else { + if (q->rx_timer) { + if (!timer_pending(q->rx_timer)) { + timer_mod(q->rx_timer, + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs); + } + } + } return size; @@ -2708,7 +2781,7 @@ static void virtio_net_tx_complete(NetClientState *nc, ssize_t len) replay_bh_schedule_event(q->tx_bh); } else { timer_mod(q->tx_timer, - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout); + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs); } q->tx_waiting = 1; } @@ -2817,7 +2890,6 @@ detach: return -EINVAL; } -static void virtio_net_tx_timer(void *opaque); static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq) { @@ -2842,7 +2914,7 @@ static void virtio_net_handle_tx_timer(VirtIODevice *vdev, VirtQueue *vq) } else { /* re-arm timer to flush it (and more) on next tick */ timer_mod(q->tx_timer, - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout); + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs); q->tx_waiting = 1; virtio_queue_set_notification(vq, 0); } @@ -2899,6 +2971,12 @@ static void virtio_net_tx_timer(void *opaque) if (ret == -EBUSY || ret == -EINVAL) { return; } + if (n->tx_pkt_cnt < ret) { + n->tx_pkt_cnt = 0; + } else { + n->tx_pkt_cnt -= ret; + } + /* * If 
we flush a full burst of packets, assume there are * more coming and immediately rearm @@ -2906,7 +2984,7 @@ static void virtio_net_tx_timer(void *opaque) if (ret >= n->tx_burst) { q->tx_waiting = 1; timer_mod(q->tx_timer, - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout); + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs); return; } /* @@ -2918,9 +2996,10 @@ static void virtio_net_tx_timer(void *opaque) ret = virtio_net_flush_tx(q); if (ret > 0) { virtio_queue_set_notification(q->tx_vq, 0); + n->tx_pkt_cnt -= ret; q->tx_waiting = 1; timer_mod(q->tx_timer, - qemu_clock_get_ns(QEMU_CLOCK_VIRTUAL) + n->tx_timeout); + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs); } } @@ -2973,6 +3052,32 @@ static void virtio_net_tx_bh(void *opaque) } } +static void virtio_net_handle_tx_dispatch(VirtIODevice *vdev, VirtQueue *vq) +{ + VirtIONet *n = VIRTIO_NET(vdev); + VirtIONetQueue *q = &n->vqs[vq2q(virtio_get_queue_index(vq))]; + bool use_timer = n->tx_timer_activate || n->tx_coal_usecs > 0 || + n->tx_coal_packets > 0; + bool pkt_limit = (n->tx_coal_packets > 0); + + if (use_timer) { + n->tx_pkt_cnt++; + if (!pkt_limit || n->tx_pkt_cnt < n->tx_coal_packets) { + if (q->tx_timer) { + virtio_net_handle_tx_timer(vdev, vq); + return; + } + } + n->tx_pkt_cnt = 0; + if (q->tx_timer) { + timer_del(q->tx_timer); + } + virtio_net_handle_tx_bh(vdev, vq); + } else { + virtio_net_handle_tx_bh(vdev, vq); + } +} + static void virtio_net_add_queue(VirtIONet *n, int index) { VirtIODevice *vdev = VIRTIO_DEVICE(n); @@ -2980,20 +3085,15 @@ static void virtio_net_add_queue(VirtIONet *n, int index) n->vqs[index].rx_vq = virtio_add_queue(vdev, n->net_conf.rx_queue_size, virtio_net_handle_rx); - if (n->net_conf.tx && !strcmp(n->net_conf.tx, "timer")) { - n->vqs[index].tx_vq = - virtio_add_queue(vdev, n->net_conf.tx_queue_size, - virtio_net_handle_tx_timer); - n->vqs[index].tx_timer = timer_new_ns(QEMU_CLOCK_VIRTUAL, - virtio_net_tx_timer, - &n->vqs[index]); - } else { - 
n->vqs[index].tx_vq = - virtio_add_queue(vdev, n->net_conf.tx_queue_size, - virtio_net_handle_tx_bh); - n->vqs[index].tx_bh = qemu_bh_new_guarded(virtio_net_tx_bh, &n->vqs[index], - &DEVICE(vdev)->mem_reentrancy_guard); - } + n->vqs[index].tx_vq = + virtio_add_queue(vdev, + n->net_conf.tx_queue_size, + virtio_net_handle_tx_dispatch); + + n->vqs[index].tx_bh = + qemu_bh_new_guarded(virtio_net_tx_bh, + &n->vqs[index], + &DEVICE(vdev)->mem_reentrancy_guard); n->vqs[index].tx_waiting = 0; n->vqs[index].n = n; @@ -3088,6 +3188,9 @@ static void virtio_net_get_features(VirtIODevice *vdev, uint64_t *features, virtio_features_or(features, features, n->host_features_ex); virtio_add_feature_ex(features, VIRTIO_NET_F_MAC); + if (n->tx_timer_activate) { + virtio_clear_feature_ex(features, VIRTIO_NET_F_NOTF_COAL); + } if (!peer_has_vnet_hdr(n)) { virtio_clear_feature_ex(features, VIRTIO_NET_F_CSUM); @@ -3242,6 +3345,35 @@ static int virtio_net_post_load_device(void *opaque, int version_id) } virtio_net_commit_rss_config(n); + if (n->tx_coal_usecs > 0 || n->rx_coal_usecs > 0) { + + for (i = 0; i < n->max_queue_pairs; i++) { + VirtIONetQueue *q = &n->vqs[i]; + if (!q->rx_timer && n->rx_coal_usecs > 0) { + q->rx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, + virtio_net_rx_notify, + q); + } + + if (!q->tx_timer && n->tx_coal_usecs > 0) { + q->tx_timer = timer_new_us(QEMU_CLOCK_VIRTUAL, + virtio_net_tx_timer, + q); + } + + if (n->tx_coal_usecs > 0 && q->tx_timer) { + n->tx_pkt_cnt = 0; + timer_mod(q->tx_timer, + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->tx_coal_usecs); + } + + if (n->rx_coal_usecs > 0 && q->rx_timer) { + timer_mod(q->rx_timer, + qemu_clock_get_us(QEMU_CLOCK_VIRTUAL) + n->rx_coal_usecs); + } + } + } + return 0; } @@ -3617,6 +3749,10 @@ static const VMStateDescription vmstate_virtio_net_device = { vmstate_virtio_net_tx_waiting), VMSTATE_UINT64_TEST(curr_guest_offloads, VirtIONet, has_ctrl_guest_offloads), + VMSTATE_UINT32(rx_coal_usecs, VirtIONet), + 
VMSTATE_UINT32(tx_coal_usecs, VirtIONet), + VMSTATE_UINT32(rx_coal_packets, VirtIONet), + VMSTATE_UINT32(tx_coal_packets, VirtIONet), VMSTATE_END_OF_LIST() }, .subsections = (const VMStateDescription * const []) { @@ -3960,7 +4096,6 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) } n->vqs = g_new0(VirtIONetQueue, n->max_queue_pairs); n->curr_queue_pairs = 1; - n->tx_timeout = n->net_conf.txtimer; if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer") && strcmp(n->net_conf.tx, "bh")) { @@ -3970,6 +4105,13 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) error_printf("Defaulting to \"bh\""); } + if (n->net_conf.tx && strcmp(n->net_conf.tx, "timer") == 0) { + n->tx_coal_usecs = n->net_conf.txtimer / 1000; + n->tx_timer_activate = true; + } else { + n->tx_coal_usecs = 0; + } + n->net_conf.tx_queue_size = MIN(virtio_net_max_tx_queue_size(n), n->net_conf.tx_queue_size); @@ -4046,6 +4188,11 @@ static void virtio_net_device_realize(DeviceState *dev, Error **errp) n->rss_data.specified_hash_types.on_bits | n->rss_data.specified_hash_types.auto_bits; } + n->rx_pkt_cnt = 0; + n->tx_pkt_cnt = 0; + n->rx_coal_usecs = 0; + n->rx_coal_packets = 0; + n->tx_coal_packets = 0; } static void virtio_net_device_unrealize(DeviceState *dev) @@ -4258,6 +4405,8 @@ static const Property virtio_net_properties[] = { VIRTIO_NET_F_GUEST_USO6, true), DEFINE_PROP_BIT64("host_uso", VirtIONet, host_features, VIRTIO_NET_F_HOST_USO, true), + DEFINE_PROP_BIT64("vq_notf_coal", VirtIONet, host_features, + VIRTIO_NET_F_NOTF_COAL, true), DEFINE_PROP_ON_OFF_AUTO_BIT64("hash-ipv4", VirtIONet, rss_data.specified_hash_types, VIRTIO_NET_HASH_REPORT_IPv4 - 1, diff --git a/include/hw/virtio/virtio-net.h b/include/hw/virtio/virtio-net.h index 371e376428..b3a7df5ad8 100644 --- a/include/hw/virtio/virtio-net.h +++ b/include/hw/virtio/virtio-net.h @@ -158,6 +158,7 @@ typedef struct VirtioNetRssData { typedef struct VirtIONetQueue { VirtQueue *rx_vq; VirtQueue *tx_vq; + 
QEMUTimer *rx_timer; QEMUTimer *tx_timer; QEMUBH *tx_bh; uint32_t tx_waiting; @@ -177,7 +178,6 @@ struct VirtIONet { /* RSC Chains - temporary storage of coalesced data, all these data are lost in case of migration */ QTAILQ_HEAD(, VirtioNetRscChain) rsc_chains; - uint32_t tx_timeout; int32_t tx_burst; uint32_t has_vnet_hdr; size_t host_hdr_len; @@ -230,6 +230,13 @@ struct VirtIONet { struct EBPFRSSContext ebpf_rss; uint32_t nr_ebpf_rss_fds; char **ebpf_rss_fds; + uint32_t rx_coal_usecs; + uint32_t rx_coal_packets; + uint32_t rx_pkt_cnt; + uint32_t tx_coal_usecs; + uint32_t tx_coal_packets; + uint32_t tx_pkt_cnt; + bool tx_timer_activate; }; size_t virtio_net_handle_ctrl_iov(VirtIODevice *vdev, diff --git a/net/passt.c b/net/passt.c index 4ff94ee509..0b0d9e222a 100644 --- a/net/passt.c +++ b/net/passt.c @@ -52,6 +52,7 @@ static const int user_feature_bits[] = { VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, VIRTIO_NET_F_HOST_USO, + VIRTIO_NET_F_NOTF_COAL, /* This bit implies RARP isn't sent by QEMU out of band */ VIRTIO_NET_F_GUEST_ANNOUNCE, diff --git a/net/tap.c b/net/tap.c index 8d7ab6ba6f..ea5987a3dc 100644 --- a/net/tap.c +++ b/net/tap.c @@ -62,6 +62,7 @@ static const int kernel_feature_bits[] = { VIRTIO_F_NOTIFICATION_DATA, VIRTIO_NET_F_RSC_EXT, VIRTIO_NET_F_HASH_REPORT, + VIRTIO_NET_F_NOTF_COAL, VIRTIO_NET_F_GUEST_UDP_TUNNEL_GSO, VIRTIO_NET_F_HOST_UDP_TUNNEL_GSO, VHOST_INVALID_FEATURE_BIT diff --git a/net/vhost-user.c b/net/vhost-user.c index a4bb49bbcf..f0b3752d7c 100644 --- a/net/vhost-user.c +++ b/net/vhost-user.c @@ -54,6 +54,7 @@ static const int user_feature_bits[] = { VIRTIO_NET_F_GUEST_USO4, VIRTIO_NET_F_GUEST_USO6, VIRTIO_NET_F_HOST_USO, + VIRTIO_NET_F_NOTF_COAL, /* This bit implies RARP isn't sent by QEMU out of band */ VIRTIO_NET_F_GUEST_ANNOUNCE, diff --git a/net/vhost-vdpa.c b/net/vhost-vdpa.c index 3df6091274..4ab8f26ceb 100644 --- a/net/vhost-vdpa.c +++ b/net/vhost-vdpa.c @@ -70,6 +70,7 @@ static const int vdpa_feature_bits[] = { 
VIRTIO_NET_F_CTRL_RX, VIRTIO_NET_F_CTRL_RX_EXTRA, VIRTIO_NET_F_CTRL_VLAN, + VIRTIO_NET_F_NOTF_COAL, VIRTIO_NET_F_CTRL_VQ, VIRTIO_NET_F_GSO, VIRTIO_NET_F_GUEST_CSUM, -- 2.53.0
