Hi Yuan,

This is a first review; I will certainly have more comments later.
On 6/2/21 10:31 AM, Yuan Wang wrote:
> This patch implements asynchronous dequeue data path for split ring.
> A new asynchronous dequeue function is introduced. With this function,
> the application can try to receive packets from the guest with
> offloading large copies to the DMA engine, thus saving precious CPU
> cycles.

Do you have any numbers to share?

> Signed-off-by: Wenwu Ma <wenwux...@intel.com>
> Signed-off-by: Yuan Wang <yuanx.w...@intel.com>
> Signed-off-by: Jiayu Hu <jiayu...@intel.com>
> ---
>  doc/guides/prog_guide/vhost_lib.rst |  10 +
>  examples/vhost/ioat.c               |  30 +-
>  examples/vhost/ioat.h               |   3 +
>  examples/vhost/main.c               |  60 +--
>  lib/vhost/rte_vhost_async.h         |  44 ++-
>  lib/vhost/version.map               |   3 +
>  lib/vhost/virtio_net.c              | 549 ++++++++++++++++++++++++++++
>  7 files changed, 664 insertions(+), 35 deletions(-)

Please split the patch into multiple parts. At the least, don't mix
example and lib changes in the same patch.

> diff --git a/doc/guides/prog_guide/vhost_lib.rst b/doc/guides/prog_guide/vhost_lib.rst
> index 6b7206bc1d..785ab0fb34 100644
> --- a/doc/guides/prog_guide/vhost_lib.rst
> +++ b/doc/guides/prog_guide/vhost_lib.rst
> @@ -281,6 +281,16 @@ The following is an overview of some key Vhost API functions:
>    Poll enqueue completion status from async data path. Completed packets
>    are returned to applications through ``pkts``.
>  
> +* ``rte_vhost_try_dequeue_burst(vid, queue_id, mbuf_pool, pkts, count, nr_inflight)``

The function name should contain "async".

BTW, I think we should also rename the APIs below while they are still
experimental, to highlight they are async-related:
 rte_vhost_submit_enqueue_burst
 rte_vhost_poll_enqueue_completed

> +
> +  Try to receive packets from the guest with offloading large packets
> +  to the DMA engine. Successfully dequeued packets are transfer
> +  completed and returned in ``pkts``. But there may be other packets
> +  that are sent from the guest but being transferred by the DMA engine,
> +  called in-flight packets. This function will return in-flight packets
> +  only after the DMA engine finishes transferring. The amount of
> +  in-flight packets by now is returned in ``nr_inflight``.
> +
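Also, to make sure I understand the semantics, below is how I would
expect an application to consume this API. Just a sketch: I am guessing
the exact prototype from the doc above (uint16_t return, nr_inflight
returned through an int pointer), and forward_pkt() is a placeholder
for the application's forwarding path:

	#include <rte_mbuf.h>
	#include <rte_vhost_async.h>

	#define VIRTIO_TXQ 1	/* guest TX ring, as in examples/vhost */
	#define MAX_PKT_BURST 32

	static void forward_pkt(struct rte_mbuf *m);	/* app specific */

	static void
	poll_guest_tx(int vid, struct rte_mempool *mbuf_pool)
	{
		struct rte_mbuf *pkts[MAX_PKT_BURST];
		int nr_inflight = 0;
		uint16_t nr_rx, i;

		nr_rx = rte_vhost_try_dequeue_burst(vid, VIRTIO_TXQ,
				mbuf_pool, pkts, MAX_PKT_BURST, &nr_inflight);

		/* Packets in pkts[] have fully completed their DMA
		 * copies and are safe to process. */
		for (i = 0; i < nr_rx; i++)
			forward_pkt(pkts[i]);

		/* A non-zero nr_inflight means some guest packets are
		 * still being copied by the DMA engine; the queue has
		 * to be polled again later so they can complete and be
		 * returned. */
	}

If that matches your intent, it may be worth adding such a snippet to
the programmer's guide.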
>  Vhost-user Implementations
>  --------------------------
>  
> diff --git a/examples/vhost/ioat.c b/examples/vhost/ioat.c
> index 2a2c2d7202..236306c9c7 100644
> --- a/examples/vhost/ioat.c
> +++ b/examples/vhost/ioat.c
> @@ -17,7 +17,6 @@ struct packet_tracker {
>  	unsigned short next_read;
>  	unsigned short next_write;
>  	unsigned short last_remain;
> -	unsigned short ioat_space;
>  };
>  
>  struct packet_tracker cb_tracker[MAX_VHOST_DEVICE];
> @@ -61,18 +60,30 @@ open_ioat(const char *value)
>  		goto out;
>  	}
>  	while (i < args_nr) {
> +		char *txd, *rxd;
> +		bool is_txd;
>  		char *arg_temp = dma_arg[i];
>  		uint8_t sub_nr;
> +
>  		sub_nr = rte_strsplit(arg_temp, strlen(arg_temp), ptrs, 2, '@');
>  		if (sub_nr != 2) {
>  			ret = -1;
>  			goto out;
>  		}
>  
> -		start = strstr(ptrs[0], "txd");
> -		if (start == NULL) {
> +		txd = strstr(ptrs[0], "txd");
> +		rxd = strstr(ptrs[0], "rxd");
> +		if (txd == NULL && rxd == NULL) {
>  			ret = -1;
>  			goto out;
> +		} else if (txd) {
> +			is_txd = true;
> +			start = txd;
> +			ret |= ASYNC_RX_VHOST;
> +		} else {
> +			is_txd = false;
> +			start = rxd;
> +			ret |= ASYNC_TX_VHOST;
>  		}
>  
>  		start += 3;
> @@ -82,7 +93,8 @@ open_ioat(const char *value)
>  			goto out;
>  		}
>  
> -		vring_id = 0 + VIRTIO_RXQ;
> +		vring_id = is_txd ? VIRTIO_RXQ : VIRTIO_TXQ;
> +
>  		if (rte_pci_addr_parse(ptrs[1],
>  				&(dma_info + vid)->dmas[vring_id].addr) < 0) {
>  			ret = -1;
> @@ -113,7 +125,6 @@ open_ioat(const char *value)
>  			goto out;
>  		}
>  		rte_rawdev_start(dev_id);
> -		cb_tracker[dev_id].ioat_space = IOAT_RING_SIZE - 1;
>  		dma_info->nr++;
>  		i++;
>  	}
> @@ -128,7 +139,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>  		struct rte_vhost_async_status *opaque_data, uint16_t count)
>  {
>  	uint32_t i_desc;
> -	uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2 + VIRTIO_RXQ].dev_id;
> +	uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;

It looks broken with regard to multiqueue (it was already broken before
this patch). In open_ioat(), only dma_bind[vid].dmas[VIRTIO_RXQ] and
dma_bind[vid].dmas[VIRTIO_TXQ] are set.

As the application does not seem to support multiqueue, it may be a
good idea to check the queue_id value before using it.
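Something like the below, for example (only a sketch; it would live in
ioat.c, which already has the needed includes, and VHOST_DATA is the
log type this example already uses):

	/* Reject queue ids that open_ioat() never configures, instead
	 * of indexing dmas[] with an arbitrary value. */
	static int
	check_async_queue_id(uint16_t queue_id)
	{
		if (queue_id != VIRTIO_RXQ && queue_id != VIRTIO_TXQ) {
			RTE_LOG(ERR, VHOST_DATA,
				"unexpected queue id %u, only one queue pair is supported\n",
				queue_id);
			return -1;
		}
		return 0;
	}

ioat_transfer_data_cb() and ioat_check_completed_copies_cb() could then
call it first and bail out early on error.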
>  	struct rte_vhost_iov_iter *src = NULL;
>  	struct rte_vhost_iov_iter *dst = NULL;
>  	unsigned long i_seg;
> @@ -140,7 +151,7 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>  			src = descs[i_desc].src;
>  			dst = descs[i_desc].dst;
>  			i_seg = 0;
> -			if (cb_tracker[dev_id].ioat_space < src->nr_segs)
> +			if (rte_ioat_burst_capacity(dev_id) < src->nr_segs)

This change should be in a dedicated patch; it is not related to
dequeue support.

>  				break;
>  			while (i_seg < src->nr_segs) {
>  				rte_ioat_enqueue_copy(dev_id,
> @@ -155,7 +166,6 @@ ioat_transfer_data_cb(int vid, uint16_t queue_id,
>  			}
>  			write &= mask;
>  			cb_tracker[dev_id].size_track[write] = src->nr_segs;
> -			cb_tracker[dev_id].ioat_space -= src->nr_segs;
>  			write++;
>  		}
>  	} else {
> @@ -181,8 +191,7 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
>  		unsigned short mask = MAX_ENQUEUED_SIZE - 1;
>  		unsigned short i;
>  
> -		uint16_t dev_id = dma_bind[vid].dmas[queue_id * 2
> -				+ VIRTIO_RXQ].dev_id;
> +		uint16_t dev_id = dma_bind[vid].dmas[queue_id].dev_id;
>  		n_seg = rte_ioat_completed_ops(dev_id, 255, NULL, NULL, dump,
>  				dump);
>  		if (n_seg < 0) {
>  			RTE_LOG(ERR,
> @@ -194,7 +203,6 @@ ioat_check_completed_copies_cb(int vid, uint16_t queue_id,
>  		if (n_seg == 0)
>  			return 0;
>  
> -		cb_tracker[dev_id].ioat_space += n_seg;
>  		n_seg += cb_tracker[dev_id].last_remain;
>  
>  		read = cb_tracker[dev_id].next_read;
> diff --git a/examples/vhost/ioat.h b/examples/vhost/ioat.h
> index 1aa28ed6a3..db7acefc02 100644
> --- a/examples/vhost/ioat.h
> +++ b/examples/vhost/ioat.h
> @@ -13,6 +13,9 @@
>  #define IOAT_RING_SIZE 4096
>  #define MAX_ENQUEUED_SIZE 4096
>  
> +#define ASYNC_RX_VHOST	1
> +#define ASYNC_TX_VHOST	2
> +
>  struct dma_info {
>  	struct rte_pci_addr addr;
>  	uint16_t dev_id;
> diff --git a/examples/vhost/main.c b/examples/vhost/main.c
> index d2179eadb9..a5662a1a91 100644
> --- a/examples/vhost/main.c
> +++ b/examples/vhost/main.c
> @@ -93,7 +93,8 @@ static int client_mode;
>  
>  static int builtin_net_driver;
>  
> -static int async_vhost_driver;
> +static int async_rx_vhost_driver;
> +static int async_tx_vhost_driver;
>  
>  static char *dma_type;
>  
> @@ -671,13 +672,17 @@ us_vhost_parse_args(int argc, char **argv)
>  			break;
>  
>  		case OPT_DMAS_NUM:
> -			if (open_dma(optarg) == -1) {
> +			ret = open_dma(optarg);
> +			if (ret == -1) {
>  				RTE_LOG(INFO, VHOST_CONFIG,
>  					"Wrong DMA args\n");
>  				us_vhost_usage(prgname);
>  				return -1;
>  			}
> -			async_vhost_driver = 1;
> +			if (ret & ASYNC_RX_VHOST)
> +				async_rx_vhost_driver = 1;
> +			if (ret & ASYNC_TX_VHOST)
> +				async_tx_vhost_driver = 1;
>  			break;
>  
>  		case OPT_CLIENT_NUM:
> @@ -887,7 +892,7 @@ drain_vhost(struct vhost_dev *vdev)
>  
>  	if (builtin_net_driver) {
>  		ret = vs_enqueue_pkts(vdev, VIRTIO_RXQ, m, nr_xmit);
> -	} else if (async_vhost_driver) {
> +	} else if (async_rx_vhost_driver) {

I think we should consider having ops for async and sync instead of all
these if/else blocks. It could be refactored as a preliminary patch for
this series; see the sketch at the end of this mail.

>  		uint32_t cpu_cpl_nr = 0;
>  		uint16_t enqueue_fail = 0;
>  		struct rte_mbuf *m_cpu_cpl[nr_xmit];
> @@ -914,7 +919,7 @@ drain_vhost(struct vhost_dev *vdev)
>  			__ATOMIC_SEQ_CST);
>  	}
>  
> -	if (!async_vhost_driver)
> +	if (!async_rx_vhost_driver)
>  		free_pkts(m, nr_xmit);
>  }
> 
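To illustrate the ops idea for drain_vhost() mentioned above, something
along these lines (hypothetical names; it assumes adding a pkt_ops
pointer to struct vhost_dev, and vs_enqueue_pkts() is the existing
builtin helper from this example):

	/* Select the datapath once per device instead of testing
	 * builtin/async/sync flags on every burst. */
	struct vhost_queue_ops {
		uint16_t (*enqueue_pkt_burst)(struct vhost_dev *vdev,
				uint16_t queue_id,
				struct rte_mbuf **pkts, uint32_t count);
	};

	static uint16_t
	builtin_enqueue_pkts(struct vhost_dev *vdev, uint16_t queue_id,
			struct rte_mbuf **pkts, uint32_t count)
	{
		return vs_enqueue_pkts(vdev, queue_id, pkts, count);
	}

	static const struct vhost_queue_ops builtin_ops = {
		.enqueue_pkt_burst = builtin_enqueue_pkts,
	};

	/* new_device() would set vdev->pkt_ops to &builtin_ops, to an
	 * async variant or to a sync variant, and drain_vhost()
	 * becomes a single indirect call:
	 *
	 *	ret = vdev->pkt_ops->enqueue_pkt_burst(vdev, VIRTIO_RXQ,
	 *			m, nr_xmit);
	 */

The same could be done for the dequeue side once this series adds it.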