On Tue, Dec 18, 2018 at 6:45 AM Björn Töpel <bjorn.to...@gmail.com> wrote:
>
> Den mån 17 dec. 2018 kl 20:40 skrev William Tu <u9012...@gmail.com>:
> >
> > The patch adds support for AF_XDP async xmit.  Users can use
> > AF_XDP on both sides of the veth pair and get better performance, at
> > the cost of ksoftirqd doing the xmit.  The veth_xsk_async_xmit
> > callback simply kicks the napi function, veth_poll, to run, and the
> > transmit logic is implemented there.
> >
> > Tested using two namespaces: one runs xdpsock and the other runs
> > xdp_rxq_info.  A simple script comparing the performance with and
> > without AF_XDP shows an improvement from 724 Kpps to 1.1 Mpps.
> >
> >   ip netns add at_ns0
> >   ip link add p0 type veth peer name p1
> >   ip link set p0 netns at_ns0
> >   ip link set dev p1 up
> >   ip netns exec at_ns0 ip link set dev p0 up
> >
> >   # receiver
> >   ip netns exec at_ns0 xdp_rxq_info --dev p0 --action XDP_DROP
> >
> >   # sender
> >   xdpsock -i p1 -t -N -z
> >   or
> >   xdpsock -i p1 -t -S
> >
> > Signed-off-by: William Tu <u9012...@gmail.com>
> > ---
> >  drivers/net/veth.c | 247 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
> >  1 file changed, 245 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/net/veth.c b/drivers/net/veth.c
> > index f412ea1cef18..5171ddad5973 100644
> > --- a/drivers/net/veth.c
> > +++ b/drivers/net/veth.c
> > @@ -25,6 +25,10 @@
> >  #include <linux/ptr_ring.h>
> >  #include <linux/bpf_trace.h>
> >  #include <linux/net_tstamp.h>
> > +#include <net/xdp_sock.h>
> > +#include <linux/mm.h>
> > +#include <linux/slab.h>
> > +#include <net/page_pool.h>
> >
> >  #define DRV_NAME       "veth"
> >  #define DRV_VERSION    "1.0"
> > @@ -53,6 +57,7 @@ struct veth_rq {
> >         bool                    rx_notify_masked;
> >         struct ptr_ring         xdp_ring;
> >         struct xdp_rxq_info     xdp_rxq;
> > +       struct xdp_umem *xsk_umem;
> >  };
> >
> >  struct veth_priv {
> > @@ -61,6 +66,11 @@ struct veth_priv {
> >         struct bpf_prog         *_xdp_prog;
> >         struct veth_rq          *rq;
> >         unsigned int            requested_headroom;
> > +
> > +       /* AF_XDP zero-copy */
> > +       struct xdp_umem **xsk_umems;
> > +       u16 num_xsk_umems_used;
> > +       u16 num_xsk_umems;
> >  };
> >
>
> The umems are since commit 661b8d1b0e3a ("net: add umem reference in
> netdev{_rx}_queue") part of the netdev structure, so instead of
> storing the umem in the priv area, you can pull it from the netdev
> device. Doing so means that you can remove all the *_umem() functions
> below.
>
Thanks, I will use it in the next version.
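Something like this, I think (untested sketch; it assumes the
xdp_get_umem_from_qid() helper from that series is available here, and
that a veth_rq's index in priv->rq matches the queue id the umem was
bound to):

/* Look the umem up from the netdev per-queue state instead of
 * caching it in veth_priv / veth_rq.
 */
static struct xdp_umem *veth_rq_umem(struct veth_rq *rq)
{
	struct veth_priv *priv = netdev_priv(rq->dev);
	u16 qid = rq - priv->rq;	/* this rq's index in priv->rq */

	return xdp_get_umem_from_qid(rq->dev, qid);
}

With that, the xsk_umems array in veth_priv and the *_umem() helpers
can indeed go away.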

> Jan (added as Cc:) just did that work on the i40e driver (patches
> posted on intel-wired-lan list).
>
> >  /*
> > @@ -742,10 +752,87 @@ static int veth_poll(struct napi_struct *napi, int budget)
> >         struct veth_rq *rq =
> >                 container_of(napi, struct veth_rq, xdp_napi);
> >         unsigned int xdp_xmit = 0;
> > -       int done;
> > +       int tx_budget = budget;
> > +       int done = 0;
> > +
> > +       /* tx: use netif_tx_napi_add or here? */
> > +       while (rq->xsk_umem && tx_budget--) {
> > +               struct veth_priv *priv, *peer_priv;
> > +               struct net_device *dev, *peer_dev;
> > +               unsigned int inner_xdp_xmit = 0;
> > +               unsigned int metasize = 0;
> > +               struct veth_rq *peer_rq;
> > +               struct xdp_frame *xdpf;
> > +               bool dropped = false;
> > +               struct sk_buff *skb;
> > +               struct page *page;
> > +               char *vaddr;
> > +               void *addr;
> > +               u32 len;
> > +
> > +               if (!xsk_umem_consume_tx_virtual(rq->xsk_umem, &vaddr, &len))
> > +                       break;
> > +
> > +               page = dev_alloc_page();
> > +               if (!page)
> > +                       return -ENOMEM;
> > +
> > +               addr = page_to_virt(page);
> > +               xdpf = addr;
> > +               memset(xdpf, 0, sizeof(*xdpf));
> > +
> > +               addr += sizeof(*xdpf);
> > +               memcpy(addr, vaddr, len);
> > +
> > +               xdpf->data = addr + metasize;
> > +               xdpf->len = len;
> > +               xdpf->headroom = 0;
> > +               xdpf->metasize = metasize;
> > +               xdpf->mem.type = MEM_TYPE_PAGE_SHARED;
> > +
> > +               /* Invoke peer rq to rcv */
> > +               dev = rq->dev;
> > +               priv = netdev_priv(dev);
> > +               peer_dev = priv->peer;
> > +               peer_priv = netdev_priv(peer_dev);
> > +               peer_rq = peer_priv->rq;
There is a bug here: in this case we always end up using peer_rq[0].
Instead, we should do peer_priv->rq[qid], selecting the peer rq by
queue id.
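Roughly (untested; it assumes the sender's rq index corresponds to the
receiving rq index on the peer):

	int qid = rq - priv->rq;	/* this rq's index in our own array */

	peer_rq = &peer_priv->rq[qid];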

> > +
> > +               /* put into peer rq */
> > +               skb = veth_xdp_rcv_one(peer_rq, xdpf, &inner_xdp_xmit);
> > +               if (!skb) {
> > +                       /* Peer side has XDP program attached */
> > +                       if (inner_xdp_xmit & VETH_XDP_TX) {
> > +                               /* Not supported */
> > +                               xsk_umem_complete_tx(rq->xsk_umem, 1);
> > +                               xsk_umem_consume_tx_done(rq->xsk_umem);
> > +                               xdp_return_frame(xdpf);
> > +                               goto skip_tx;
> > +                       } else if (inner_xdp_xmit & VETH_XDP_REDIR) {
> > +                               xdp_do_flush_map();
> > +                       } else {
> > +                               dropped = true;
> > +                       }
> > +               } else {
> > +                       /* Peer side has no XDP attached */
> > +                       napi_gro_receive(&peer_rq->xdp_napi, skb);
> > +               }
> > +               xsk_umem_complete_tx(rq->xsk_umem, 1);
> > +               xsk_umem_consume_tx_done(rq->xsk_umem);
> > +
> > +               /* update peer stats */
> > +               u64_stats_update_begin(&peer_rq->stats.syncp);
> > +               peer_rq->stats.xdp_packets++;
> > +               peer_rq->stats.xdp_bytes += len;
> > +               if (dropped)
> > +                       rq->stats.xdp_drops++;
> > +               u64_stats_update_end(&peer_rq->stats.syncp);
> > +               done++;
> > +       }
>
> Refactor to a function?
Sure, will do it.
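Probably something along these lines (sketch only; veth_xsk_xmit() is
just a placeholder name):

/* Drain up to 'budget' descriptors from the umem tx ring bound to
 * this rq and inject them into the peer's rq.  Returns the number of
 * frames transmitted.
 */
static int veth_xsk_xmit(struct veth_rq *rq, int budget);

and then veth_poll() would just do done += veth_xsk_xmit(rq, budget)
before the rx processing.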

Regards,
William
