Add a TUN/TAP receive mode that exercises the napi_gro_frags()
interface. This mode is available only in TAP mode, as the interface
expects packets with Ethernet headers.

Furthermore, packets follow the layout of the iov_iter that was
received. The first iovec is the linear data, and every one after the
first is a fragment. If there are more fragments than MAX_SKB_FRAGS,
drop the packet. Additionally, invoke eth_get_headlen() to exercise flow
dissector code and to verify that the header resides in the linear data.
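
For illustration, a minimal userspace sketch of a write that follows
this layout (a hypothetical helper, not part of this patch; assumes fd
is a TAP fd opened with IFF_NAPI_FRAGS as in the sketch after the next
paragraph):

  #include <sys/uio.h>

  /* Hypothetical example. iov[0] becomes the skb linear data and must
   * hold at least the Ethernet header; every later iovec becomes one
   * page fragment. At most MAX_SKB_FRAGS + 1 segments are accepted,
   * and each fragment must be no larger than PAGE_SIZE.
   */
  static ssize_t write_one_packet(int fd)
  {
          unsigned char eth_hdr[14] = { 0 };   /* dst, src, ethertype */
          unsigned char payload[128] = { 0 };
          struct iovec iov[2] = {
                  { .iov_base = eth_hdr, .iov_len = sizeof(eth_hdr) },
                  { .iov_base = payload, .iov_len = sizeof(payload) },
          };

          return writev(fd, iov, 2);
  }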

The napi_gro_frags() mode requires setting the IFF_NAPI_FRAGS option.
This is imposed because this mode is intended for testing via tools like
syzkaller and packetdrill, and the increased flexibility it provides can
introduce security vulnerabilities.
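
As a usage sketch (again hypothetical, not part of this patch), the
flag is requested at TUNSETIFF time alongside IFF_TAP:

  #include <fcntl.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/ioctl.h>
  #include <linux/if.h>
  #include <linux/if_tun.h>

  /* Hypothetical example. IFF_NAPI_FRAGS is silently cleared by the
   * kernel unless CONFIG_TUN_NAPI is enabled and the device is a TAP
   * (not TUN) device.
   */
  static int open_tap_napi_frags(const char *name)
  {
          struct ifreq ifr;
          int fd = open("/dev/net/tun", O_RDWR);

          if (fd < 0)
                  return -1;

          memset(&ifr, 0, sizeof(ifr));
          strncpy(ifr.ifr_name, name, IFNAMSIZ - 1);
          /* IFF_NO_PI: no packet information header before the frame */
          ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_NAPI_FRAGS;

          if (ioctl(fd, TUNSETIFF, &ifr) < 0) {
                  close(fd);
                  return -1;
          }
          return fd;
  }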

Signed-off-by: Petar Penkov <ppen...@google.com>
Cc: Eric Dumazet <eduma...@google.com>
Cc: Mahesh Bandewar <mahe...@google.com>
Cc: Willem de Bruijn <will...@google.com>
Cc: da...@davemloft.net
Cc: ppen...@stanford.edu
---
 drivers/net/tun.c           | 135 ++++++++++++++++++++++++++++++++++++++++++--
 include/uapi/linux/if_tun.h |   1 +
 2 files changed, 130 insertions(+), 6 deletions(-)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index d5c824e3ec42..2ba9809ab6cd 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -75,6 +75,7 @@
 #include <linux/skb_array.h>
 #include <linux/bpf.h>
 #include <linux/bpf_trace.h>
+#include <linux/mutex.h>
 
 #include <linux/uaccess.h>
 
@@ -120,8 +121,15 @@ do {						\
 #define TUN_VNET_LE     0x80000000
 #define TUN_VNET_BE     0x40000000
 
+#if IS_ENABLED(CONFIG_TUN_NAPI)
+#define TUN_FEATURES_EXTRA IFF_NAPI_FRAGS
+#else
+#define TUN_FEATURES_EXTRA 0
+#endif
+
 #define TUN_FEATURES (IFF_NO_PI | IFF_ONE_QUEUE | IFF_VNET_HDR | \
-                     IFF_MULTI_QUEUE)
+                     IFF_MULTI_QUEUE | TUN_FEATURES_EXTRA)
+
 #define GOODCOPY_LEN 128
 
 #define FLT_EXACT_COUNT 8
@@ -173,6 +181,7 @@ struct tun_file {
                unsigned int ifindex;
        };
        struct napi_struct napi;
+       struct mutex napi_mutex;        /* Protects access to the above napi */
        struct list_head next;
        struct tun_struct *detached;
        struct skb_array tx_array;
@@ -276,6 +285,7 @@ static void tun_napi_init(struct tun_struct *tun, struct tun_file *tfile)
                netif_napi_add(tun->dev, &tfile->napi, tun_napi_poll,
                               NAPI_POLL_WEIGHT);
                napi_enable(&tfile->napi);
+               mutex_init(&tfile->napi_mutex);
        }
 }
 
@@ -291,6 +301,11 @@ static void tun_napi_del(struct tun_file *tfile)
                netif_napi_del(&tfile->napi);
 }
 
+static bool tun_napi_frags_enabled(const struct tun_struct *tun)
+{
+       return READ_ONCE(tun->flags) & IFF_NAPI_FRAGS;
+}
+
 #ifdef CONFIG_TUN_VNET_CROSS_LE
 static inline bool tun_legacy_is_little_endian(struct tun_struct *tun)
 {
@@ -1034,7 +1049,8 @@ static void tun_poll_controller(struct net_device *dev)
         * supports polling, which enables bridge devices in virt setups to
         * still use netconsole
         * If NAPI is enabled, however, we need to schedule polling for all
-        * queues.
+        * queues unless we are using napi_gro_frags(), which we call in
+        * process context and not in NAPI context.
         */
 
        if (IS_ENABLED(CONFIG_TUN_NAPI)) {
@@ -1042,6 +1058,9 @@ static void tun_poll_controller(struct net_device *dev)
                struct tun_file *tfile;
                int i;
 
+               if (tun_napi_frags_enabled(tun))
+                       return;
+
                rcu_read_lock();
                for (i = 0; i < tun->numqueues; i++) {
                        tfile = rcu_dereference(tun->tfiles[i]);
@@ -1264,6 +1283,64 @@ static unsigned int tun_chr_poll(struct file *file, poll_table *wait)
        return mask;
 }
 
+static struct sk_buff *tun_napi_alloc_frags(struct tun_file *tfile,
+                                           size_t len,
+                                           const struct iov_iter *it)
+{
+       struct sk_buff *skb;
+       size_t linear;
+       int err;
+       int i;
+
+       if (it->nr_segs > MAX_SKB_FRAGS + 1)
+               return ERR_PTR(-ENOMEM);
+
+       local_bh_disable();
+       skb = napi_get_frags(&tfile->napi);
+       local_bh_enable();
+       if (!skb)
+               return ERR_PTR(-ENOMEM);
+
+       linear = iov_iter_single_seg_count(it);
+       err = __skb_grow(skb, linear);
+       if (err)
+               goto free;
+
+       skb->len = len;
+       skb->data_len = len - linear;
+       skb->truesize += skb->data_len;
+
+       for (i = 1; i < it->nr_segs; i++) {
+               size_t fragsz = it->iov[i].iov_len;
+               unsigned long offset;
+               struct page *page;
+               void *data;
+
+               if (fragsz == 0 || fragsz > PAGE_SIZE) {
+                       err = -EINVAL;
+                       goto free;
+               }
+
+               local_bh_disable();
+               data = napi_alloc_frag(fragsz);
+               local_bh_enable();
+               if (!data) {
+                       err = -ENOMEM;
+                       goto free;
+               }
+
+               page = virt_to_page(data);
+               offset = offset_in_page(data);
+               skb_fill_page_desc(skb, i - 1, page, offset, fragsz);
+       }
+
+       return skb;
+free:
+       /* frees skb and all frags allocated with napi_alloc_frag() */
+       napi_free_frags(&tfile->napi);
+       return ERR_PTR(err);
+}
+
 /* prepad is the amount to reserve at front.  len is length after that.
  * linear is a hint as to how much to copy (usually headers). */
 static struct sk_buff *tun_alloc_skb(struct tun_file *tfile,
@@ -1466,6 +1543,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
        int err;
        u32 rxhash;
        int generic_xdp = 1;
+       bool frags = tun_napi_frags_enabled(tun);
 
        if (!(tun->dev->flags & IFF_UP))
                return -EIO;
@@ -1523,7 +1601,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                        zerocopy = true;
        }
 
-       if (tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
+       if (!frags && tun_can_build_skb(tun, tfile, len, noblock, zerocopy)) {
                skb = tun_build_skb(tun, tfile, from, &gso, len, &generic_xdp);
                if (IS_ERR(skb)) {
                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
@@ -1540,10 +1618,24 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                                linear = tun16_to_cpu(tun, gso.hdr_len);
                }
 
-               skb = tun_alloc_skb(tfile, align, copylen, linear, noblock);
+               if (frags) {
+                       mutex_lock(&tfile->napi_mutex);
+                       skb = tun_napi_alloc_frags(tfile, copylen, from);
+                       /* tun_napi_alloc_frags() enforces a layout for the skb.
+                        * If zerocopy is enabled, then this layout will be
+                        * overwritten by zerocopy_sg_from_iter().
+                        */
+                       zerocopy = false;
+               } else {
+                       skb = tun_alloc_skb(tfile, align, copylen, linear,
+                                           noblock);
+               }
+
                if (IS_ERR(skb)) {
                        if (PTR_ERR(skb) != -EAGAIN)
                                this_cpu_inc(tun->pcpu_stats->rx_dropped);
+                       if (frags)
+                               mutex_unlock(&tfile->napi_mutex);
                        return PTR_ERR(skb);
                }
 
@@ -1555,6 +1647,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                if (err) {
                        this_cpu_inc(tun->pcpu_stats->rx_dropped);
                        kfree_skb(skb);
+                       if (frags) {
+                               tfile->napi.skb = NULL;
+                               mutex_unlock(&tfile->napi_mutex);
+                       }
+
                        return -EFAULT;
                }
        }
@@ -1562,6 +1659,11 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
        if (virtio_net_hdr_to_skb(skb, &gso, tun_is_little_endian(tun))) {
                this_cpu_inc(tun->pcpu_stats->rx_frame_errors);
                kfree_skb(skb);
+               if (frags) {
+                       tfile->napi.skb = NULL;
+                       mutex_unlock(&tfile->napi_mutex);
+               }
+
                return -EINVAL;
        }
 
@@ -1587,7 +1689,8 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
                skb->dev = tun->dev;
                break;
        case IFF_TAP:
-               skb->protocol = eth_type_trans(skb, tun->dev);
+               if (!frags)
+                       skb->protocol = eth_type_trans(skb, tun->dev);
                break;
        }
 
@@ -1622,7 +1725,23 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
 
        rxhash = __skb_get_hash_symmetric(skb);
 
-       if (IS_ENABLED(CONFIG_TUN_NAPI)) {
+       if (frags) {
+               /* Exercise flow dissector code path. */
+               u32 headlen = eth_get_headlen(skb->data, skb_headlen(skb));
+
+               if (headlen > skb_headlen(skb) || headlen < ETH_HLEN) {
+                       this_cpu_inc(tun->pcpu_stats->rx_dropped);
+                       napi_free_frags(&tfile->napi);
+                       mutex_unlock(&tfile->napi_mutex);
+                       WARN_ON(1);
+                       return -ENOMEM;
+               }
+
+               local_bh_disable();
+               napi_gro_frags(&tfile->napi);
+               local_bh_enable();
+               mutex_unlock(&tfile->napi_mutex);
+       } else if (IS_ENABLED(CONFIG_TUN_NAPI)) {
                struct sk_buff_head *queue = &tfile->sk.sk_write_queue;
                int queue_len;
 
@@ -2168,6 +2287,10 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
        tun->flags = (tun->flags & ~TUN_FEATURES) |
                (ifr->ifr_flags & TUN_FEATURES);
 
+       if (!IS_ENABLED(CONFIG_TUN_NAPI) ||
+           (tun->flags & TUN_TYPE_MASK) != IFF_TAP)
+               tun->flags = tun->flags & ~IFF_NAPI_FRAGS;
+
        /* Make sure persistent devices do not get stuck in
         * xoff state.
         */
diff --git a/include/uapi/linux/if_tun.h b/include/uapi/linux/if_tun.h
index 3cb5e1d85ddd..1eb1eb42f151 100644
--- a/include/uapi/linux/if_tun.h
+++ b/include/uapi/linux/if_tun.h
@@ -60,6 +60,7 @@
 /* TUNSETIFF ifr flags */
 #define IFF_TUN                0x0001
 #define IFF_TAP                0x0002
+#define IFF_NAPI_FRAGS 0x0010
 #define IFF_NO_PI      0x1000
 /* This flag has no real effect */
 #define IFF_ONE_QUEUE  0x2000
-- 
2.14.1.581.gf28d330327-goog
