On Tue, Dec 18, 2018 at 6:45 AM Björn Töpel <bjorn.to...@gmail.com> wrote:
>
> On Mon, Dec 17, 2018 at 8:40 PM William Tu <u9012...@gmail.com> wrote:
> >
> > The patch adds support for AF_XDP async xmit. Users can use
> > AF_XDP on both sides of the veth and get better performance, with
> > the cost of ksoftirqd doing the xmit. The veth_xsk_async_xmit
> > simply kicks the napi function, veth_poll, to run, and the transmit
> > logic is implemented there.
> >
> > Tested using two namespaces, one runs xdpsock and the other runs
> > xdp_rxq_info. A simple script comparing the performance with/without
> > AF_XDP shows improvement from 724Kpps to 1.1Mpps.
> >
> >   ip netns add at_ns0
> >   ip link add p0 type veth peer name p1
> >   ip link set p0 netns at_ns0
> >   ip link set dev p1 up
> >   ip netns exec at_ns0 ip link set dev p0 up
> >
> >   # receiver
> >   ip netns exec at_ns0 xdp_rxq_info --dev p0 --action XDP_DROP
> >
> >   # sender
> >   xdpsock -i p1 -t -N -z
> >   or
> >   xdpsock -i p1 -t -S
> >
> > Signed-off-by: William Tu <u9012...@gmail.com>
> > ---
> >  drivers/net/veth.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 245 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> > index f412ea1cef18..5171ddad5973 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -25,6 +25,10 @@
> >  #include <linux/ptr_ring.h>
> >  #include <linux/bpf_trace.h>
> >  #include <linux/net_tstamp.h>
> > +#include <net/xdp_sock.h>
> > +#include <linux/mm.h>
> > +#include <linux/slab.h>
> > +#include <net/page_pool.h>
> >
> >  #define DRV_NAME        "veth"
> >  #define DRV_VERSION     "1.0"
> > @@ -53,6 +57,7 @@ struct veth_rq {
> >         bool                    rx_notify_masked;
> >         struct ptr_ring         xdp_ring;
> >         struct xdp_rxq_info     xdp_rxq;
> > +       struct xdp_umem         *xsk_umem;
> >  };
> >
> >  struct veth_priv {
> > @@ -61,6 +66,11 @@ struct veth_priv {
> >         struct bpf_prog         *_xdp_prog;
> >         struct veth_rq          *rq;
> >         unsigned int            requested_headroom;
> > +
> > +       /* AF_XDP zero-copy */
> > +       struct xdp_umem **xsk_umems;
> > +       u16 num_xsk_umems_used;
> > +       u16 num_xsk_umems;
> >  };
> >
>
> The umems are since commit 661b8d1b0e3a ("net: add umem reference in
> netdev{_rx}_queue") part of the netdev structure, so instead of
> storing the umem in the priv area, you can pull it from the netdev
> device. Doing so means that you can remove all the *_umem() functions
> below.
>

Thanks, will use it in next version.
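Just to double-check that I'm reading that commit right, below is a rough,
untested sketch of what I have in mind for the next version. The helper
name is made up, and I'm assuming the per-rx-queue umem pointer that
661b8d1b0e3a added is what we want to read via netdev_get_rx_queue():

/* Untested sketch: look up the umem bound to rx queue 'qid' directly
 * from the netdev instead of caching it in veth_priv. Relies on the
 * umem pointer that 661b8d1b0e3a put into netdev_rx_queue, which is
 * only there with CONFIG_XDP_SOCKETS.
 */
static struct xdp_umem *veth_umem_from_qid(struct net_device *dev, u16 qid)
{
#ifdef CONFIG_XDP_SOCKETS
        if (qid < dev->real_num_rx_queues)
                return netdev_get_rx_queue(dev, qid)->umem;
#endif
        return NULL;
}

If a generic helper for this lookup gets exported at some point, I'd
rather call that instead of open-coding it in veth.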
> Jan (added as Cc:) just did that work on the i40e driver (patches
> posted on intel-wired-lan list).
>
> >  /*
> > @@ -742,10 +752,87 @@ static int veth_poll(struct napi_struct *napi, int budget)
> >         struct veth_rq *rq =
> >                 container_of(napi, struct veth_rq, xdp_napi);
> >         unsigned int xdp_xmit = 0;
> > -       int done;
> > +       int tx_budget = budget;
> > +       int done = 0;
> > +
> > +       /* tx: use netif_tx_napi_add or here? */
> > +       while (rq->xsk_umem && tx_budget--) {
> > +               struct veth_priv *priv, *peer_priv;
> > +               struct net_device *dev, *peer_dev;
> > +               unsigned int inner_xdp_xmit = 0;
> > +               unsigned int metasize = 0;
> > +               struct veth_rq *peer_rq;
> > +               struct xdp_frame *xdpf;
> > +               bool dropped = false;
> > +               struct sk_buff *skb;
> > +               struct page *page;
> > +               char *vaddr;
> > +               void *addr;
> > +               u32 len;
> > +
> > +               if (!xsk_umem_consume_tx_virtual(rq->xsk_umem, &vaddr, &len))
> > +                       break;
> > +
> > +               page = dev_alloc_page();
> > +               if (!page)
> > +                       return -ENOMEM;
> > +
> > +               addr = page_to_virt(page);
> > +               xdpf = addr;
> > +               memset(xdpf, 0, sizeof(*xdpf));
> > +
> > +               addr += sizeof(*xdpf);
> > +               memcpy(addr, vaddr, len);
> > +
> > +               xdpf->data = addr + metasize;
> > +               xdpf->len = len;
> > +               xdpf->headroom = 0;
> > +               xdpf->metasize = metasize;
> > +               xdpf->mem.type = MEM_TYPE_PAGE_SHARED;
> > +
> > +               /* Invoke peer rq to rcv */
> > +               dev = rq->dev;
> > +               priv = netdev_priv(dev);
> > +               peer_dev = priv->peer;
> > +               peer_priv = netdev_priv(peer_dev);
> > +               peer_rq = peer_priv->rq;

There is a bug here: in this case we'll always use peer_rq[0].
Instead, we should do peer_priv->rq[qid].

> > +
> > +               /* put into peer rq */
> > +               skb = veth_xdp_rcv_one(peer_rq, xdpf, &inner_xdp_xmit);
> > +               if (!skb) {
> > +                       /* Peer side has XDP program attached */
> > +                       if (inner_xdp_xmit & VETH_XDP_TX) {
> > +                               /* Not supported */
> > +                               xsk_umem_complete_tx(rq->xsk_umem, 1);
> > +                               xsk_umem_consume_tx_done(rq->xsk_umem);
> > +                               xdp_return_frame(xdpf);
> > +                               goto skip_tx;
> > +                       } else if (inner_xdp_xmit & VETH_XDP_REDIR) {
> > +                               xdp_do_flush_map();
> > +                       } else {
> > +                               dropped = true;
> > +                       }
> > +               } else {
> > +                       /* Peer side has no XDP attached */
> > +                       napi_gro_receive(&peer_rq->xdp_napi, skb);
> > +               }
> > +               xsk_umem_complete_tx(rq->xsk_umem, 1);
> > +               xsk_umem_consume_tx_done(rq->xsk_umem);
> > +
> > +               /* update peer stats */
> > +               u64_stats_update_begin(&peer_rq->stats.syncp);
> > +               peer_rq->stats.xdp_packets++;
> > +               peer_rq->stats.xdp_bytes += len;
> > +               if (dropped)
> > +                       rq->stats.xdp_drops++;
> > +               u64_stats_update_end(&peer_rq->stats.syncp);
> > +               done++;
> > +       }
>
> Refactor to a function?

Sure, will do it.

Regards,
William
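P.S. For the peer_rq[0] issue I noted above, this is roughly the fix I'm
planning, i.e. replacing "peer_rq = peer_priv->rq;". It's untested, and it
assumes a 1:1 mapping from this side's rq index to the peer's rq index,
falling back to queue 0 if the peer has fewer rx queues:

                /* Pair this rq with the peer rq of the same index,
                 * instead of always using peer_rq[0].
                 */
                int qid = rq - priv->rq;

                if (qid >= peer_dev->real_num_rx_queues)
                        qid = 0;
                peer_rq = &peer_priv->rq[qid];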