Yunjian Wang wrote:
> The zero-copy feature of AF_XDP sockets is now supported by some
> drivers, which can reduce CPU utilization of the XDP program.
> This patch set allows tun to support the AF_XDP Rx zero-copy feature.
> 
> This patch does this by:
> - Using peek_len to consume an xsk->desc and get its length.
> - Adding a check for an empty vq's array in vhost_net_buf_produce(),
>   since the array may be empty when tun supports AF_XDP Rx zero-copy.
> - Adding XDP_SETUP_XSK_POOL and ndo_xsk_wakeup callback support.
> - Adding a tun_put_user_desc() function to copy the Rx data to the VM.
> 
> Signed-off-by: Yunjian Wang <wangyunj...@huawei.com>

I don't fully understand the higher-level design of this feature yet.

But here are some initial comments at the code level.

> ---
>  drivers/net/tun.c   | 165 +++++++++++++++++++++++++++++++++++++++++++-
>  drivers/vhost/net.c |  18 +++--
>  2 files changed, 176 insertions(+), 7 deletions(-)
> 
> diff --git a/drivers/net/tun.c b/drivers/net/tun.c
> index afa5497f7c35..248b0f8e07d1 100644
> --- a/drivers/net/tun.c
> +++ b/drivers/net/tun.c
> @@ -77,6 +77,7 @@
>  #include <net/ax25.h>
>  #include <net/rose.h>
>  #include <net/6lowpan.h>
> +#include <net/xdp_sock_drv.h>
>  
>  #include <linux/uaccess.h>
>  #include <linux/proc_fs.h>
> @@ -145,6 +146,10 @@ struct tun_file {
>       struct tun_struct *detached;
>       struct ptr_ring tx_ring;
>       struct xdp_rxq_info xdp_rxq;
> +     struct xdp_desc desc;
> +     /* protects xsk pool */
> +     spinlock_t pool_lock;
> +     struct xsk_buff_pool *pool;
>  };
>  
>  struct tun_page {
> @@ -208,6 +213,8 @@ struct tun_struct {
>       struct bpf_prog __rcu *xdp_prog;
>       struct tun_prog __rcu *steering_prog;
>       struct tun_prog __rcu *filter_prog;
> +     /* tracks AF_XDP ZC enabled queues */
> +     unsigned long *af_xdp_zc_qps;
>       struct ethtool_link_ksettings link_ksettings;
>       /* init args */
>       struct file *file;
> @@ -795,6 +802,8 @@ static int tun_attach(struct tun_struct *tun, struct file *file,
>  
>       tfile->queue_index = tun->numqueues;
>       tfile->socket.sk->sk_shutdown &= ~RCV_SHUTDOWN;
> +     tfile->desc.len = 0;
> +     tfile->pool = NULL;
>  
>       if (tfile->detached) {
>               /* Re-attach detached tfile, updating XDP queue_index */
> @@ -989,6 +998,13 @@ static int tun_net_init(struct net_device *dev)
>               return err;
>       }
>  
> +     tun->af_xdp_zc_qps = bitmap_zalloc(MAX_TAP_QUEUES, GFP_KERNEL);
> +     if (!tun->af_xdp_zc_qps) {
> +             security_tun_dev_free_security(tun->security);
> +             free_percpu(dev->tstats);
> +             return -ENOMEM;
> +     }
> +
>       tun_flow_init(tun);
>  
>       dev->hw_features = NETIF_F_SG | NETIF_F_FRAGLIST |
> @@ -1009,6 +1025,7 @@ static int tun_net_init(struct net_device *dev)
>               tun_flow_uninit(tun);
>               security_tun_dev_free_security(tun->security);
>               free_percpu(dev->tstats);
> +             bitmap_free(tun->af_xdp_zc_qps);

Please release state in the inverse order of acquisition.
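
For this error path, something like this (an untested sketch, mirroring
the allocation order visible in the hunks above):

	tun_flow_uninit(tun);
	bitmap_free(tun->af_xdp_zc_qps);
	security_tun_dev_free_security(tun->security);
	free_percpu(dev->tstats);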

>               return err;
>       }
>       return 0;
> @@ -1222,11 +1239,77 @@ static int tun_xdp_set(struct net_device *dev, struct bpf_prog *prog,
>       return 0;
>  }
>  
> +static int tun_xsk_pool_enable(struct net_device *netdev,
> +                            struct xsk_buff_pool *pool,
> +                            u16 qid)
> +{
> +     struct tun_struct *tun = netdev_priv(netdev);
> +     struct tun_file *tfile;
> +     unsigned long flags;
> +
> +     rcu_read_lock();
> +     tfile = rtnl_dereference(tun->tfiles[qid]);
> +     if (!tfile) {
> +             rcu_read_unlock();
> +             return -ENODEV;
> +     }

No need for rcu_read_lock() with rtnl_dereference().

Consider ASSERT_RTNL() if unsure whether this path can be reached
without the RTNL held.
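
That is, roughly (a sketch, untested):

	static int tun_xsk_pool_enable(struct net_device *netdev,
				       struct xsk_buff_pool *pool,
				       u16 qid)
	{
		struct tun_struct *tun = netdev_priv(netdev);
		struct tun_file *tfile;
		unsigned long flags;

		ASSERT_RTNL();

		tfile = rtnl_dereference(tun->tfiles[qid]);
		if (!tfile)
			return -ENODEV;
		...
	}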

> +
> +     spin_lock_irqsave(&tfile->pool_lock, flags);
> +     xsk_pool_set_rxq_info(pool, &tfile->xdp_rxq);
> +     tfile->pool = pool;
> +     spin_unlock_irqrestore(&tfile->pool_lock, flags);
> +
> +     rcu_read_unlock();
> +     set_bit(qid, tun->af_xdp_zc_qps);

What are the concurrency semantics here: there's a spinlock that makes
the update to xdp_rxq and pool a critical section, but the bitmap is
not part of it? Please also document why the irqsave variant is needed.

> +
> +     return 0;
> +}
> +
> +static int tun_xsk_pool_disable(struct net_device *netdev, u16 qid)
> +{
> +     struct tun_struct *tun = netdev_priv(netdev);
> +     struct tun_file *tfile;
> +     unsigned long flags;
> +
> +     if (!test_bit(qid, tun->af_xdp_zc_qps))
> +             return 0;
> +
> +     clear_bit(qid, tun->af_xdp_zc_qps);

Is there a time-of-check to time-of-use race between the test and the
clear? Or is there no race because anything that clears the bit will
hold the RTNL? If so, please add a comment.
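
If the RTNL is indeed what serializes enable and disable, a comment
along these lines (hypothetical wording) would do:

	/* No race with tun_xsk_pool_enable(): both run under the RTNL */
	if (!test_bit(qid, tun->af_xdp_zc_qps))
		return 0;

	clear_bit(qid, tun->af_xdp_zc_qps);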

> +
> +     rcu_read_lock();
> +     tfile = rtnl_dereference(tun->tfiles[qid]);
> +     if (!tfile) {
> +             rcu_read_unlock();
> +             return 0;
> +     }
> +
> +     spin_lock_irqsave(&tfile->pool_lock, flags);
> +     if (tfile->desc.len) {
> +             xsk_tx_completed(tfile->pool, 1);
> +             tfile->desc.len = 0;
> +     }
> +     tfile->pool = NULL;
> +     spin_unlock_irqrestore(&tfile->pool_lock, flags);
> +
> +     rcu_read_unlock();
> +     return 0;
> +}
> +
> +int tun_xsk_pool_setup(struct net_device *dev, struct xsk_buff_pool *pool,
> +                    u16 qid)
> +{
> +     return pool ? tun_xsk_pool_enable(dev, pool, qid) :
> +             tun_xsk_pool_disable(dev, qid);
> +}
> +
>  static int tun_xdp(struct net_device *dev, struct netdev_bpf *xdp)
>  {
>       switch (xdp->command) {
>       case XDP_SETUP_PROG:
>               return tun_xdp_set(dev, xdp->prog, xdp->extack);
> +     case XDP_SETUP_XSK_POOL:
> +             return tun_xsk_pool_setup(dev, xdp->xsk.pool,
> +                                        xdp->xsk.queue_id);
>       default:
>               return -EINVAL;
>       }
> @@ -1331,6 +1414,19 @@ static int tun_xdp_tx(struct net_device *dev, struct xdp_buff *xdp)
>       return nxmit;
>  }
>  
> +static int tun_xsk_wakeup(struct net_device *dev, u32 qid, u32 flags)
> +{
> +     struct tun_struct *tun = netdev_priv(dev);
> +     struct tun_file *tfile;
> +
> +     rcu_read_lock();
> +     tfile = rcu_dereference(tun->tfiles[qid]);
> +     if (tfile)
> +             __tun_xdp_flush_tfile(tfile);
> +     rcu_read_unlock();
> +     return 0;
> +}
> +
>  static const struct net_device_ops tap_netdev_ops = {
>       .ndo_init               = tun_net_init,
>       .ndo_uninit             = tun_net_uninit,
> @@ -1347,6 +1443,7 @@ static const struct net_device_ops tap_netdev_ops = {
>       .ndo_get_stats64        = dev_get_tstats64,
>       .ndo_bpf                = tun_xdp,
>       .ndo_xdp_xmit           = tun_xdp_xmit,
> +     .ndo_xsk_wakeup         = tun_xsk_wakeup,
>       .ndo_change_carrier     = tun_net_change_carrier,
>  };
>  
> @@ -1404,7 +1501,8 @@ static void tun_net_initialize(struct net_device *dev)
>               /* Currently tun does not support XDP, only tap does. */
>               dev->xdp_features = NETDEV_XDP_ACT_BASIC |
>                                   NETDEV_XDP_ACT_REDIRECT |
> -                                 NETDEV_XDP_ACT_NDO_XMIT;
> +                                 NETDEV_XDP_ACT_NDO_XMIT |
> +                                 NETDEV_XDP_ACT_XSK_ZEROCOPY;
>  
>               break;
>       }
> @@ -2213,6 +2311,37 @@ static void *tun_ring_recv(struct tun_file *tfile, int noblock, int *err)
>       return ptr;
>  }
>  
> +static ssize_t tun_put_user_desc(struct tun_struct *tun,
> +                              struct tun_file *tfile,
> +                              struct xdp_desc *desc,
> +                              struct iov_iter *iter)
> +{
> +     size_t size = desc->len;
> +     int vnet_hdr_sz = 0;
> +     size_t ret;
> +
> +     if (tun->flags & IFF_VNET_HDR) {
> +             struct virtio_net_hdr_mrg_rxbuf gso = { 0 };
> +
> +             vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
> +             if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
> +                     return -EINVAL;
> +             if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
> +                          sizeof(gso)))
> +                     return -EFAULT;
> +             iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
> +     }
> +
> +     ret = copy_to_iter(xsk_buff_raw_get_data(tfile->pool, desc->addr),
> +                        size, iter) + vnet_hdr_sz;
> +
> +     preempt_disable();
> +     dev_sw_netstats_tx_add(tun->dev, 1, ret);
> +     preempt_enable();
> +
> +     return ret;
> +}
> +

This is almost a copy of tun_put_user_xdp(). Can we refactor to avoid
duplicating code? (This may apply elsewhere too.)
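
One possible shape for that refactor (a sketch only; the helper name is
made up, and the difference between struct virtio_net_hdr in
tun_put_user_xdp() and the mrg_rxbuf variant used here would still need
to be resolved):

	static ssize_t tun_put_user_data(struct tun_struct *tun,
					 void *data, size_t size,
					 struct iov_iter *iter)
	{
		int vnet_hdr_sz = 0;
		size_t ret;

		if (tun->flags & IFF_VNET_HDR) {
			struct virtio_net_hdr_mrg_rxbuf gso = { 0 };

			vnet_hdr_sz = READ_ONCE(tun->vnet_hdr_sz);
			if (unlikely(iov_iter_count(iter) < vnet_hdr_sz))
				return -EINVAL;
			if (unlikely(copy_to_iter(&gso, sizeof(gso), iter) !=
				     sizeof(gso)))
				return -EFAULT;
			iov_iter_advance(iter, vnet_hdr_sz - sizeof(gso));
		}

		ret = copy_to_iter(data, size, iter) + vnet_hdr_sz;

		preempt_disable();
		dev_sw_netstats_tx_add(tun->dev, 1, ret);
		preempt_enable();

		return ret;
	}

with tun_put_user_desc() reduced to passing
xsk_buff_raw_get_data(tfile->pool, desc->addr) and desc->len.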

>  static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>                          struct iov_iter *to,
>                          int noblock, void *ptr)
> @@ -2226,6 +2355,22 @@ static ssize_t tun_do_read(struct tun_struct *tun, struct tun_file *tfile,
>       }
>  
>       if (!ptr) {
> +             /* Read frames from xsk's desc */
> +             if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> +                     spin_lock(&tfile->pool_lock);
> +                     if (tfile->pool) {
> +                     ret = tun_put_user_desc(tun, tfile, &tfile->desc, to);
> +                             xsk_tx_completed(tfile->pool, 1);
> +                             if (xsk_uses_need_wakeup(tfile->pool))
> +                                     xsk_set_tx_need_wakeup(tfile->pool);

For the xsk maintainers, if they're reading along: this two-line
pattern is seen quite often. Does it deserve a helper in a header file?
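
E.g. something along these lines (hypothetical; name and location made
up):

	/* in include/net/xdp_sock_drv.h, say */
	static inline void xsk_tx_set_need_wakeup(struct xsk_buff_pool *pool)
	{
		if (xsk_uses_need_wakeup(pool))
			xsk_set_tx_need_wakeup(pool);
	}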

> +                             tfile->desc.len = 0;
> +                     } else {
> +                             ret = -EBADFD;
> +                     }
> +                     spin_unlock(&tfile->pool_lock);
> +                     return ret;
> +             }
> +
>               /* Read frames from ring */
>               ptr = tun_ring_recv(tfile, noblock, &err);
>               if (!ptr)
> @@ -2311,6 +2456,7 @@ static void tun_free_netdev(struct net_device *dev)
>  
>       BUG_ON(!(list_empty(&tun->disabled)));
>  
> +     bitmap_free(tun->af_xdp_zc_qps);
>       free_percpu(dev->tstats);
>       tun_flow_uninit(tun);
>       security_tun_dev_free_security(tun->security);
> @@ -2666,7 +2812,19 @@ static int tun_peek_len(struct socket *sock)
>       if (!tun)
>               return 0;
>  
> -     ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> +     if (test_bit(tfile->queue_index, tun->af_xdp_zc_qps)) {
> +             spin_lock(&tfile->pool_lock);
> +             if (tfile->pool && xsk_tx_peek_desc(tfile->pool, &tfile->desc)) {
> +                     xsk_tx_release(tfile->pool);
> +                     ret = tfile->desc.len;
> +                     /* The length of desc must be greater than 0 */
> +                     if (!ret)
> +                             xsk_tx_completed(tfile->pool, 1);

Peek semantics usually don't result in releasing the buffer. Am I
misunderstanding this operation?

> +             }
> +             spin_unlock(&tfile->pool_lock);
> +     } else {
> +             ret = PTR_RING_PEEK_CALL(&tfile->tx_ring, tun_ptr_peek_len);
> +     }
>       tun_put(tun);
>  
>       return ret;
> @@ -3469,8 +3627,11 @@ static int tun_chr_open(struct inode *inode, struct file * file)
>  
>       mutex_init(&tfile->napi_mutex);
>       RCU_INIT_POINTER(tfile->tun, NULL);
> +     spin_lock_init(&tfile->pool_lock);
>       tfile->flags = 0;
>       tfile->ifindex = 0;
> +     tfile->pool = NULL;
> +     tfile->desc.len = 0;
>  
>       init_waitqueue_head(&tfile->socket.wq.wait);
>  
> diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> index f2ed7167c848..a1f143ad2341 100644
> --- a/drivers/vhost/net.c
> +++ b/drivers/vhost/net.c

For the virtio maintainers: is it okay to have the tun and vhost/net
changes in the same patch, or is it better to split them?

> @@ -169,9 +169,10 @@ static int vhost_net_buf_is_empty(struct vhost_net_buf *rxq)
>  
>  static void *vhost_net_buf_consume(struct vhost_net_buf *rxq)
>  {
> -     void *ret = vhost_net_buf_get_ptr(rxq);
> -     ++rxq->head;
> -     return ret;
> +     if (rxq->tail == rxq->head)
> +             return NULL;
> +
> +     return rxq->queue[rxq->head++];

Why this change?

>  }
>  
>  static int vhost_net_buf_produce(struct vhost_net_virtqueue *nvq)
> @@ -993,12 +994,19 @@ static void handle_tx(struct vhost_net *net)
>  
>  static int peek_head_len(struct vhost_net_virtqueue *rvq, struct sock *sk)
>  {
> +     struct socket *sock = sk->sk_socket;
>       struct sk_buff *head;
>       int len = 0;
>       unsigned long flags;
>  
> -     if (rvq->rx_ring)
> -             return vhost_net_buf_peek(rvq);
> +     if (rvq->rx_ring) {
> +             len = vhost_net_buf_peek(rvq);
> +             if (likely(len))
> +                     return len;
> +     }
> +
> +     if (sock->ops->peek_len)
> +             return sock->ops->peek_len(sock);
>  
>       spin_lock_irqsave(&sk->sk_receive_queue.lock, flags);
>       head = skb_peek(&sk->sk_receive_queue);
> -- 
> 2.33.0
> 


