Fix PACKET_RX_RING bug for versions TPACKET_V1 and TPACKET_V2 which
causes the ring to get corrupted by allowing multiple kernel threads
to claim ownership of the same ring entry. Track ownership in a shadow
ring structure to prevent other kernel threads from reusing the same
entry before it's fully filled in, passed to user space, and then
eventually passed back to the kernel for use with a new packet.

Signed-off-by: Jon Rosen <jro...@cisco.com>
---

There is a bug in net/packet/af_packet.c:tpacket_rcv in how it manages
the PACKET_RX_RING for versions TPACKET_V1 and TPACKET_V2.  This bug makes
it possible for multiple kernel threads to claim ownership of the same
ring entry, corrupting the ring and the corresponding packet(s).

These diffs are the second proposed solution; the previous proposal was
described in https://www.mail-archive.com/netdev@vger.kernel.org/msg227468.html
under the subject "[RFC PATCH] packet: mark ring entry as in-use inside
spin_lock to prevent RX ring overrun".

Those diffs would have changed the binary interface and have broken certain
applications. Consensus was that such a change would be inappropriate.

These new diffs use a shadow ring in kernel space for tracking intermediate
state of an entry and prevent more than one kernel thread from simultaneously
allocating a ring entry. This avoids any impact to the binary interface
between kernel and userspace but comes at the additional cost of requiring a
second spin_lock when passing ownership of a ring entry to userspace.

Jon Rosen (1):
  packet: track ring entry use using a shadow ring to prevent RX ring
    overrun

 net/packet/af_packet.c | 64 ++++++++++++++++++++++++++++++++++++++++++++++++++
 net/packet/internal.h  | 14 +++++++++++
 2 files changed, 78 insertions(+)

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index e0f3f4a..4d08c8e 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2165,6 +2165,26 @@ static int packet_rcv(struct sk_buff *skb, struct net_device *dev,
        return 0;
 }
 
+/* Claim the shadow slot at the RX ring head; NULL if it is already inuse. */
+static inline void *packet_rx_shadow_aquire_head(struct packet_sock *po)
+{
+       struct packet_ring_shadow_entry *slot =
+               &po->rx_shadow.ring[po->rx_ring.head];
+
+       if (unlikely(slot->inuse))
+               return NULL;
+       slot->inuse = 1;
+       return slot;
+}
+
+/* Mark a shadow slot returned by packet_rx_shadow_aquire_head() as free. */
+static inline void packet_rx_shadow_release(void *_entry)
+{
+       struct packet_ring_shadow_entry *slot = _entry;
+
+       slot->inuse = 0;
+}
+
 static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
                       struct packet_type *pt, struct net_device *orig_dev)
 {
@@ -2182,6 +2202,7 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
        __u32 ts_status;
        bool is_drop_n_account = false;
        bool do_vnet = false;
+       void *rx_shadow_ring_entry = NULL;
 
        /* struct tpacket{2,3}_hdr is aligned to a multiple of TPACKET_ALIGNMENT.
         * We may add members to them until current aligned size without forcing
@@ -2277,7 +2298,15 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
        if (!h.raw)
                goto drop_n_account;
        if (po->tp_version <= TPACKET_V2) {
+               /* Attempt to allocate shadow ring entry.
+                * If already inuse then the ring is full.
+                */
+               rx_shadow_ring_entry = packet_rx_shadow_aquire_head(po);
+               if (unlikely(!rx_shadow_ring_entry))
+                       goto ring_is_full;
+
                packet_increment_rx_head(po, &po->rx_ring);
+
        /*
         * LOSING will be reported till you read the stats,
         * because it's COR - Clear On Read.
@@ -2383,7 +2412,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,
 #endif
 
        if (po->tp_version <= TPACKET_V2) {
+               spin_lock(&sk->sk_receive_queue.lock);
                __packet_set_status(po, h.raw, status);
+               packet_rx_shadow_release(rx_shadow_ring_entry);
+               spin_unlock(&sk->sk_receive_queue.lock);
+
                sk->sk_data_ready(sk);
        } else {
                prb_clear_blk_fill_status(&po->rx_ring);
@@ -4197,6 +4230,25 @@ static struct pgv *alloc_pg_vec(struct tpacket_req *req, int order)
        goto out;
 }
 
+/* Allocate the shadow ring that tracks in-kernel ownership of RX frames.
+ *
+ * @tp_frame_nr: number of entries to allocate, one per ring frame.
+ *
+ * kcalloc() is used because it rejects a tp_frame_nr * entry-size product
+ * that overflows (returning NULL instead of a short buffer) and hands back
+ * zeroed memory, so every entry starts with inuse == 0.
+ *
+ * Returns the new array, or NULL on failure.
+ *
+ * The caller owns the result and releases it with kfree().
+ */
+static struct packet_ring_shadow_entry *
+               packet_rx_shadow_alloc(unsigned int tp_frame_nr)
+{
+       return kcalloc(tp_frame_nr, sizeof(struct packet_ring_shadow_entry),
+                      GFP_KERNEL);
+}
+
 static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                int closing, int tx_ring)
 {
@@ -4209,6 +4261,7 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
        int err = -EINVAL;
        /* Added to avoid minimal code churn */
        struct tpacket_req *req = &req_u->req;
+       struct packet_ring_shadow_entry *rx_shadow_ring = NULL;
 
        lock_sock(sk);
 
@@ -4266,6 +4319,13 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                        goto out;
 
                err = -ENOMEM;
+               if (!tx_ring && po->tp_version <= TPACKET_V2) {
+                       rx_shadow_ring =
+                               packet_rx_shadow_alloc(req->tp_frame_nr);
+                       if (!rx_shadow_ring)
+                               goto out;
+               }
+
                order = get_order(req->tp_block_size);
                pg_vec = alloc_pg_vec(req, order);
                if (unlikely(!pg_vec))
@@ -4319,6 +4379,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
                rb->frame_max = (req->tp_frame_nr - 1);
                rb->head = 0;
                rb->frame_size = req->tp_frame_size;
+               if (!tx_ring)
+                       swap(po->rx_shadow.ring, rx_shadow_ring);
                spin_unlock_bh(&rb_queue->lock);
 
                swap(rb->pg_vec_order, order);
@@ -4349,6 +4411,8 @@ static int packet_set_ring(struct sock *sk, union tpacket_req_u *req_u,
        if (pg_vec)
                free_pg_vec(pg_vec, order, req->tp_block_nr);
 out:
+       kfree(rx_shadow_ring);
+
        release_sock(sk);
        return err;
 }
diff --git a/net/packet/internal.h b/net/packet/internal.h
index a1d2b23..d1a965e 100644
--- a/net/packet/internal.h
+++ b/net/packet/internal.h
@@ -73,6 +73,14 @@ struct packet_ring_buffer {
        struct tpacket_kbdq_core        prb_bdqc;
 };
 
+struct packet_ring_shadow_entry {      /* one slot of the RX shadow ring */
+       unsigned int            inuse;  /* 1 once acquired, 0 after release */
+};
+
+struct packet_ring_shadow {    /* kernel-side companion of the RX ring */
+       struct packet_ring_shadow_entry *ring;  /* indexed by rx_ring.head */
+};
+
 extern struct mutex fanout_mutex;
 #define PACKET_FANOUT_MAX      256
 
@@ -107,8 +115,14 @@ struct packet_sock {
        struct sock             sk;
        struct packet_fanout    *fanout;
        union  tpacket_stats_u  stats;
+       /* Do not separate rx/tx ring structures.
+        * They are treated as an array in af_packet.c:packet_mmap()
+        */
        struct packet_ring_buffer       rx_ring;
        struct packet_ring_buffer       tx_ring;
+       /* end of rings */
+
+       struct packet_ring_shadow       rx_shadow;
        int                     copy_thresh;
        spinlock_t              bind_lock;
        struct mutex            pg_vec_lock;
-- 
2.5.0

Reply via email to