[PATCH v2 2/4] net: xsk: add a simple buffer reuse queue
From: Jakub Kicinski XSK UMEM is strongly single producer single consumer so reuse of frames is challenging. Add a simple "stash" of FILL packets to reuse for drivers to optionally make use of. This is useful when driver has to free (ndo_stop) or resize a ring with an active AF_XDP ZC socket. v2: Fixed build issues for !CONFIG_XDP_SOCKETS. Signed-off-by: Jakub Kicinski --- include/net/xdp_sock.h | 69 ++ net/xdp/xdp_umem.c | 2 ++ net/xdp/xsk_queue.c| 55 + net/xdp/xsk_queue.h| 3 ++ 4 files changed, 129 insertions(+) diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h index 932ca0dad6f3..70a115bea4f4 100644 --- a/include/net/xdp_sock.h +++ b/include/net/xdp_sock.h @@ -21,6 +21,12 @@ struct xdp_umem_page { dma_addr_t dma; }; +struct xdp_umem_fq_reuse { + u32 nentries; + u32 length; + u64 handles[]; +}; + struct xdp_umem { struct xsk_queue *fq; struct xsk_queue *cq; @@ -37,6 +43,7 @@ struct xdp_umem { struct page **pgs; u32 npgs; struct net_device *dev; + struct xdp_umem_fq_reuse *fq_reuse; u16 queue_id; bool zc; spinlock_t xsk_list_lock; @@ -75,6 +82,10 @@ void xsk_umem_discard_addr(struct xdp_umem *umem); void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries); bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len); void xsk_umem_consume_tx_done(struct xdp_umem *umem); +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries); +struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq); +void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq); static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { @@ -85,6 +96,35 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) { return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1)); } + +/* Reuse-queue aware version of FILL queue helpers */ +static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + return xsk_umem_peek_addr(umem, addr); + + *addr = rq->handles[rq->length - 1]; + return addr; +} + +static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + if (!rq->length) + xsk_umem_discard_addr(umem); + else + rq->length--; +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ + struct xdp_umem_fq_reuse *rq = umem->fq_reuse; + + rq->handles[rq->length++] = addr; +} #else static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp) { @@ -128,6 +168,21 @@ static inline void xsk_umem_consume_tx_done(struct xdp_umem *umem) { } +static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + return NULL; +} + +static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap( + struct xdp_umem *umem, + struct xdp_umem_fq_reuse *newq) +{ + return NULL; +} +static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq) +{ +} + static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr) { return NULL; @@ -137,6 +192,20 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem *umem, u64 addr) { return 0; } + +static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr) +{ + return NULL; +} + +static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem) +{ +} + +static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr) +{ +} + #endif /* CONFIG_XDP_SOCKETS */ #endif /* _LINUX_XDP_SOCK_H */ diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c index b3b632c5aeae..555427b3e0fe 100644 --- a/net/xdp/xdp_umem.c +++ b/net/xdp/xdp_umem.c @@ -165,6 +165,8 @@ static void xdp_umem_release(struct xdp_umem *umem) umem->cq = NULL; } + xsk_reuseq_destroy(umem); + xdp_umem_unpin_pages(umem); task = get_pid_task(umem->pid, PIDTYPE_PID); diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c index 2dc1384d9f27..b66504592d9b 100644 --- a/net/xdp/xsk_queue.c +++ b/net/xdp/xsk_queue.c @@ -3,7 +3,9 @@ * Copyright(c) 2018 Intel Corporation. */ +#include #include +#include #include "xsk_queue.h" @@ -62,3 +64,56 @@ void xskq_destroy(struct xsk_queue *q) page_frag_free(q->ring); kfree(q); } + +struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries) +{ + struct xdp_umem_fq_reuse *newq; + + /* Check for overflow */ + if (nentries > (u32)roundup_pow_of_two(nentries)) + return NULL; + nentries = roundup_pow_of_two(nentries); + + newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL); +
[PATCH v2 1/4] i40e: clean zero-copy XDP Tx ring on shutdown/reset
From: Björn Töpel When the zero-copy enabled XDP Tx ring is torn down, due to configuration changes, outstandning frames on the hardware descriptor ring are queued on the completion ring. The completion ring has a back-pressure mechanism that will guarantee that there is sufficient space on the ring. Signed-off-by: Björn Töpel --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 17 +++ .../ethernet/intel/i40e/i40e_txrx_common.h| 2 ++ drivers/net/ethernet/intel/i40e/i40e_xsk.c| 30 +++ 3 files changed, 43 insertions(+), 6 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 37bd4e50ccde..7f85d4ba8b54 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -636,13 +636,18 @@ void i40e_clean_tx_ring(struct i40e_ring *tx_ring) unsigned long bi_size; u16 i; - /* ring already cleared, nothing to do */ - if (!tx_ring->tx_bi) - return; + if (ring_is_xdp(tx_ring) && tx_ring->xsk_umem) { + i40e_xsk_clean_tx_ring(tx_ring); + } else { + /* ring already cleared, nothing to do */ + if (!tx_ring->tx_bi) + return; - /* Free all the Tx ring sk_buffs */ - for (i = 0; i < tx_ring->count; i++) - i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]); + /* Free all the Tx ring sk_buffs */ + for (i = 0; i < tx_ring->count; i++) + i40e_unmap_and_free_tx_resource(tx_ring, + &tx_ring->tx_bi[i]); + } bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count; memset(tx_ring->tx_bi, 0, bi_size); diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h index b5afd479a9c5..29c68b29d36f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h @@ -87,4 +87,6 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring, } } +void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring); + #endif /* I40E_TXRX_COMMON_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 2ebfc78bbd09..99116277c4d2 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -830,3 +830,33 @@ int i40e_xsk_async_xmit(struct net_device *dev, u32 queue_id) return 0; } + +/** + * i40e_xsk_clean_xdp_ring - Clean the XDP Tx ring on shutdown + * @xdp_ring: XDP Tx ring + **/ +void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring) +{ + u16 ntc = tx_ring->next_to_clean, ntu = tx_ring->next_to_use; + struct xdp_umem *umem = tx_ring->xsk_umem; + struct i40e_tx_buffer *tx_bi; + u32 xsk_frames = 0; + + while (ntc != ntu) { + tx_bi = &tx_ring->tx_bi[ntc]; + + if (tx_bi->xdpf) + i40e_clean_xdp_tx_buffer(tx_ring, tx_bi); + else + xsk_frames++; + + tx_bi->xdpf = NULL; + + ntc++; + if (ntc > tx_ring->count) + ntc = 0; + } + + if (xsk_frames) + xsk_umem_complete_tx(umem, xsk_frames); +} -- 2.17.1
[PATCH v2 4/4] i40e: disallow changing the number of descriptors when AF_XDP is on
From: Björn Töpel When an AF_XDP UMEM is attached to any of the Rx rings, we disallow a user to change the number of descriptors via e.g. "ethtool -G IFNAME". Otherwise, the size of the stash/reuse queue can grow unbounded, which would result in OOM or leaking userspace buffers. Signed-off-by: Björn Töpel --- .../net/ethernet/intel/i40e/i40e_ethtool.c| 9 +++- .../ethernet/intel/i40e/i40e_txrx_common.h| 1 + drivers/net/ethernet/intel/i40e/i40e_xsk.c| 22 +++ 3 files changed, 31 insertions(+), 1 deletion(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index d7d3974beca2..3cd2c88c72f8 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -5,7 +5,7 @@ #include "i40e.h" #include "i40e_diag.h" - +#include "i40e_txrx_common.h" #include "i40e_ethtool_stats.h" #define I40E_PF_STAT(_name, _stat) \ @@ -1493,6 +1493,13 @@ static int i40e_set_ringparam(struct net_device *netdev, (new_rx_count == vsi->rx_rings[0]->count)) return 0; + /* If there is a AF_XDP UMEM attached to any of Rx rings, +* disallow changing the number of descriptors -- regardless +* if the netdev is running or not. +*/ + if (i40e_xsk_any_rx_ring_enabled(vsi)) + return -EBUSY; + while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state)) { timeout--; if (!timeout) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h index 8d46acff6f2e..09809dffe399 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h @@ -89,5 +89,6 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring, void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring); void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring); +bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi); #endif /* I40E_TXRX_COMMON_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index e4b62e871afc..119f59ec7cc0 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -944,3 +944,25 @@ void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring) if (xsk_frames) xsk_umem_complete_tx(umem, xsk_frames); } + +/** + * i40e_xsk_any_rx_ring_enabled - Checks whether any of the Rx rings + * has AF_XDP UMEM attached + * @vsi: vsi + * + * Returns true if any of the Rx rings has an AF_XDP UMEM attached + **/ +bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi) +{ + int i; + + if (!vsi->xsk_umems) + return false; + + for (i = 0; i < vsi->num_queue_pairs; i++) { + if (vsi->xsk_umems[i]) + return true; + } + + return false; +} -- 2.17.1
[PATCH v2 0/4] i40e AF_XDP zero-copy buffer leak fixes
From: Björn Töpel NB! The v1 was sent via the bpf-next tree. This time the series is routed via JeffK's Intel Wired tree to minimize the risk for i40e merge conflicts. This series addresses an AF_XDP zero-copy issue that buffers passed from userspace to the kernel was leaked when the hardware descriptor ring was torn down. The patches fixes the i40e AF_XDP zero-copy implementation. Thanks to Jakub Kicinski for pointing this out! Some background for folks that don't know the details: A zero-copy capable driver picks buffers off the fill ring and places them on the hardware Rx ring to be completed at a later point when DMA is complete. Similar on the Tx side; The driver picks buffers off the Tx ring and places them on the Tx hardware ring. In the typical flow, the Rx buffer will be placed onto an Rx ring (completed to the user), and the Tx buffer will be placed on the completion ring to notify the user that the transfer is done. However, if the driver needs to tear down the hardware rings for some reason (interface goes down, reconfiguration and such), the userspace buffers cannot be leaked. They have to be reused or completed back to userspace. The implementation does the following: * Outstanding Tx descriptors will be passed to the completion ring. The Tx code has back-pressure mechanism in place, so that enough empty space in the completion ring is guaranteed. * Outstanding Rx descriptors are temporarily stored on a stash/reuse queue. The reuse queue is based on Jakub's RFC. When/if the HW rings comes up again, entries from the stash are used to re-populate the ring. * When AF_XDP ZC is enabled, disallow changing the number of hardware descriptors via ethtool. Otherwise, the size of the stash/reuse queue can grow unbounded. Going forward, introducing a "zero-copy allocator" analogous to Jesper Brouer's page pool would be a more robust and reuseable solution. v1->v2: Address kbuild "undefined symbols" error when building with !CONFIG_XDP_SOCKETS. Thanks! Björn Björn Töpel (3): i40e: clean zero-copy XDP Tx ring on shutdown/reset i40e: clean zero-copy XDP Rx ring on shutdown/reset i40e: disallow changing the number of descriptors when AF_XDP is on Jakub Kicinski (1): net: xsk: add a simple buffer reuse queue .../net/ethernet/intel/i40e/i40e_ethtool.c| 9 +- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 21 ++- .../ethernet/intel/i40e/i40e_txrx_common.h| 4 + drivers/net/ethernet/intel/i40e/i40e_xsk.c| 152 +- include/net/xdp_sock.h| 69 net/xdp/xdp_umem.c| 2 + net/xdp/xsk_queue.c | 55 +++ net/xdp/xsk_queue.h | 3 + 8 files changed, 299 insertions(+), 16 deletions(-) -- 2.17.1
[PATCH v2 3/4] i40e: clean zero-copy XDP Rx ring on shutdown/reset
From: Björn Töpel Outstanding Rx descriptors are temporarily stored on a stash/reuse queue. When/if the HW rings comes up again, entries from the stash are used to re-populate the ring. The latter required some restructuring of the allocation scheme for the AF_XDP zero-copy implementation. There is now a fast, and a slow allocation. The "fast allocation" is used from the fast-path and obtains free buffers from the fill ring and the internal recycle mechanism. The "slow allocation" is only used in ring setup, and obtains buffers from the fill ring and the stash (if any). Signed-off-by: Björn Töpel --- drivers/net/ethernet/intel/i40e/i40e_txrx.c | 4 +- .../ethernet/intel/i40e/i40e_txrx_common.h| 1 + drivers/net/ethernet/intel/i40e/i40e_xsk.c| 100 -- 3 files changed, 96 insertions(+), 9 deletions(-) diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c index 7f85d4ba8b54..740ea58ba938 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c @@ -1355,8 +1355,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring) rx_ring->skb = NULL; } - if (rx_ring->xsk_umem) + if (rx_ring->xsk_umem) { + i40e_xsk_clean_rx_ring(rx_ring); goto skip_free; + } /* Free all the Rx ring sk_buffs */ for (i = 0; i < rx_ring->count; i++) { diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h index 29c68b29d36f..8d46acff6f2e 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h +++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h @@ -87,6 +87,7 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring, } } +void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring); void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring); #endif /* I40E_TXRX_COMMON_ */ diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c index 99116277c4d2..e4b62e871afc 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c +++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c @@ -140,6 +140,7 @@ static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, struct xdp_umem *umem) static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, u16 qid) { + struct xdp_umem_fq_reuse *reuseq; bool if_running; int err; @@ -156,6 +157,12 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem, return -EBUSY; } + reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count); + if (!reuseq) + return -ENOMEM; + + xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq)); + err = i40e_xsk_umem_dma_map(vsi, umem); if (err) return err; @@ -353,16 +360,46 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring *rx_ring, } /** - * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers + * i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer * @rx_ring: Rx ring - * @count: The number of buffers to allocate + * @bi: Rx buffer to populate * - * This function allocates a number of Rx buffers and places them on - * the Rx ring. + * This function allocates an Rx buffer. The buffer can come from fill + * queue, or via the reuse queue. * * Returns true for a successful allocation, false otherwise **/ -bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count) +static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *bi) +{ + struct xdp_umem *umem = rx_ring->xsk_umem; + u64 handle, hr; + + if (!xsk_umem_peek_addr_rq(umem, &handle)) { + rx_ring->rx_stats.alloc_page_failed++; + return false; + } + + handle &= rx_ring->xsk_umem->chunk_mask; + + hr = umem->headroom + XDP_PACKET_HEADROOM; + + bi->dma = xdp_umem_get_dma(umem, handle); + bi->dma += hr; + + bi->addr = xdp_umem_get_data(umem, handle); + bi->addr += hr; + + bi->handle = handle + umem->headroom; + + xsk_umem_discard_addr_rq(umem); + return true; +} + +static __always_inline bool __i40e_alloc_rx_buffers_zc( + struct i40e_ring *rx_ring, u16 count, + bool alloc(struct i40e_ring *rx_ring, + struct i40e_rx_buffer *bi)) { u16 ntu = rx_ring->next_to_use; union i40e_rx_desc *rx_desc; @@ -372,7 +409,7 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count) rx_desc = I40E_RX_DESC(rx_ring, ntu); bi = &rx_ring->rx_bi[ntu]; do { - if (!i40e_alloc_buffer_zc(rx_ring, bi)) { + if (!alloc(rx_ring, bi)) { ok = false; goto no_buffers; } @@ -404,6 +441,
Re: [PATCH net-next] net: sched: cls_flower: dump offload count value
On Thu, 6 Sep 2018 18:37:23 +0300, Vlad Buslov wrote: > Change flower in_hw_count type to fixed-size u32 and dump it as > TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared > blocks and re-offload functionality. > > Signed-off-by: Vlad Buslov > Acked-by: Jiri Pirko > --- > include/net/sch_generic.h| 2 +- > include/uapi/linux/pkt_cls.h | 2 ++ > net/sched/cls_flower.c | 5 - > 3 files changed, 7 insertions(+), 2 deletions(-) > > diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h > index a6d00093f35e..d68ac55539a5 100644 > --- a/include/net/sch_generic.h > +++ b/include/net/sch_generic.h > @@ -362,7 +362,7 @@ static inline void tcf_block_offload_dec(struct tcf_block > *block, u32 *flags) > } > > static inline void > -tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt, > +tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt, > u32 *flags, bool add) > { > if (add) { > diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h > index be382fb0592d..2824fb7ed1c9 100644 > --- a/include/uapi/linux/pkt_cls.h > +++ b/include/uapi/linux/pkt_cls.h > @@ -483,6 +483,8 @@ enum { > TCA_FLOWER_KEY_ENC_OPTS, > TCA_FLOWER_KEY_ENC_OPTS_MASK, > > + TCA_FLOWER_IN_HW_COUNT, /* be32 */ Why be32? I wish there was a good way to share this attribute between classifiers :( > __TCA_FLOWER_MAX, > }; > > diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c > index 6fd9bdd93796..4b8dd37dd4f8 100644 > --- a/net/sched/cls_flower.c > +++ b/net/sched/cls_flower.c > @@ -98,7 +98,7 @@ struct cls_fl_filter { > struct list_head list; > u32 handle; > u32 flags; > - unsigned int in_hw_count; > + u32 in_hw_count; > struct rcu_work rwork; > struct net_device *hw_dev; > }; > @@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto > *tp, void *fh, > if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) > goto nla_put_failure; > > + if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count)) > + goto nla_put_failure; > + > if (tcf_exts_dump(skb, &f->exts)) > goto nla_put_failure; >
Re: [PATCH net-next] net: sched: cls_flower: dump offload count value
On Fri 07 Sep 2018 at 09:11, Jakub Kicinski wrote: > On Thu, 6 Sep 2018 18:37:23 +0300, Vlad Buslov wrote: >> Change flower in_hw_count type to fixed-size u32 and dump it as >> TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared >> blocks and re-offload functionality. >> >> Signed-off-by: Vlad Buslov >> Acked-by: Jiri Pirko >> --- >> include/net/sch_generic.h| 2 +- >> include/uapi/linux/pkt_cls.h | 2 ++ >> net/sched/cls_flower.c | 5 - >> 3 files changed, 7 insertions(+), 2 deletions(-) >> >> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h >> index a6d00093f35e..d68ac55539a5 100644 >> --- a/include/net/sch_generic.h >> +++ b/include/net/sch_generic.h >> @@ -362,7 +362,7 @@ static inline void tcf_block_offload_dec(struct >> tcf_block *block, u32 *flags) >> } >> >> static inline void >> -tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt, >> +tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt, >>u32 *flags, bool add) >> { >> if (add) { >> diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h >> index be382fb0592d..2824fb7ed1c9 100644 >> --- a/include/uapi/linux/pkt_cls.h >> +++ b/include/uapi/linux/pkt_cls.h >> @@ -483,6 +483,8 @@ enum { >> TCA_FLOWER_KEY_ENC_OPTS, >> TCA_FLOWER_KEY_ENC_OPTS_MASK, >> >> +TCA_FLOWER_IN_HW_COUNT, /* be32 */ > > Why be32? This comment is wrong. Thanks for catching this. > > I wish there was a good way to share this attribute between > classifiers :( > >> __TCA_FLOWER_MAX, >> }; >> >> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c >> index 6fd9bdd93796..4b8dd37dd4f8 100644 >> --- a/net/sched/cls_flower.c >> +++ b/net/sched/cls_flower.c >> @@ -98,7 +98,7 @@ struct cls_fl_filter { >> struct list_head list; >> u32 handle; >> u32 flags; >> -unsigned int in_hw_count; >> +u32 in_hw_count; >> struct rcu_work rwork; >> struct net_device *hw_dev; >> }; >> @@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto >> *tp, void *fh, >> if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) >> goto nla_put_failure; >> >> +if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count)) >> +goto nla_put_failure; >> + >> if (tcf_exts_dump(skb, &f->exts)) >> goto nla_put_failure; >>
Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()
Hello, didn't respond as I've been on vacation. Am Freitag, 31. August 2018, 08:50:24 schrieb Steffen Klassert: > On Thu, Aug 30, 2018 at 08:53:50PM +0200, Wolfgang Walter wrote: > > Hello, > > > > kernels > 4.12 do not work on one of our main routers. They crash as soon > > as ipsec-tunnels are configured and ipsec-traffic actually flows. > > Can you please send the backtrace of this crash? > I'll try today. The oops quickly disappears because other problems arising from it pop up. The machine crashes and no logs are logged. I try to make foto or try to log to the serial console. At the moment I only see that there is xfrm_ stuff in the call trace as xfrm_lookup, xfrm_route_, and it is while routing a packet. With later kernels (4.18.5) the machine seems to crash without a call trace on console. Regards, -- Wolfgang Walter Studentenwerk München Anstalt des öffentlichen Rechts
[PATCH net-next] cxgb4: impose mandatory VLAN usage when non-zero TAG ID
From: Casey Leedom When a non-zero VLAN Tag ID is passed to t4_set_vlan_acl() then impose mandatory VLAN Usage with that VLAN ID. I.e any other VLAN ID should result in packets getting dropped. Signed-off-by: Casey Leedom Signed-off-by: Ganesh Goudar --- drivers/net/ethernet/chelsio/cxgb4/t4_hw.c| 3 +++ drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 1 + 2 files changed, 4 insertions(+) diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c index 5fe5d16..6f1bd7b 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c +++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c @@ -10210,6 +10210,9 @@ int t4_set_vlan_acl(struct adapter *adap, unsigned int mbox, unsigned int vf, vlan_cmd.en_to_len16 = cpu_to_be32(enable | FW_LEN16(vlan_cmd)); /* Drop all packets that donot match vlan id */ vlan_cmd.dropnovlan_fm = FW_ACL_VLAN_CMD_FM_F; + vlan_cmd.dropnovlan_fm = (enable + ? (FW_ACL_VLAN_CMD_DROPNOVLAN_F | +FW_ACL_VLAN_CMD_FM_F) : 0); if (enable != 0) { vlan_cmd.nvlan = 1; vlan_cmd.vlanid[0] = cpu_to_be16(vlan); diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h index 5dc6c41..6d2bc87 100644 --- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h +++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h @@ -2464,6 +2464,7 @@ struct fw_acl_vlan_cmd { #define FW_ACL_VLAN_CMD_DROPNOVLAN_S 7 #define FW_ACL_VLAN_CMD_DROPNOVLAN_V(x)((x) << FW_ACL_VLAN_CMD_DROPNOVLAN_S) +#define FW_ACL_VLAN_CMD_DROPNOVLAN_FFW_ACL_VLAN_CMD_DROPNOVLAN_V(1U) #define FW_ACL_VLAN_CMD_FM_S 6 #define FW_ACL_VLAN_CMD_FM_M 0x1 -- 2.1.0
Re: [PATCH v2 net-next 3/4] net: make listified RX functions return number of good packets
On 07/09/18 03:27, Eric Dumazet wrote: > On 09/06/2018 07:26 AM, Edward Cree wrote: >> Signed-off-by: Edward Cree > Lack of changelog here ? > > I do not know what is a good packet. The comment on netif_receive_skb_list() defines this as "skbs for which netif_receive_skb() would have returned %NET_RX_SUCCESS". But I shall put that into the changelog as well. > You are adding a lot of conditional expressions, that cpu > will mispredict quite often. I don't see an alternative, since this is needed by patch #4 and I daresay there are other drivers that will also want to get NET_RX_SUCCESS-like information (possibly from regular receives as well as GRO; I'm not quite sure why sfc only cares about the latter). > Typical micro benchmarks wont really notice. Any suggestions on how to construct a test that will?
Re: [PATCH v2 net-next 0/4] net: batched receive in GRO path
On 07/09/18 03:32, Eric Dumazet wrote: > Your performance numbers are not convincing, since TCP stream test should > get nominal GRO gains. I'm not quite sure what you mean here, could you explain a bit more? > Adding this complexity and icache pressure needs more experimental results. > > What about RPC workloads (eg 100 concurrent netperf -t TCP_RR -- -r > 8000,8000 ) I'll try that. Any other tests that would be worthwhile? -Ed
Re: [PATCH v2 net-next 3/4] net: make listified RX functions return number of good packets
On 09/07/2018 03:44 AM, Edward Cree wrote: > > Any suggestions on how to construct a test that will? > Say 50 concurrent netperf -t TCP_RR -- -r 8000,8000 This way you have a mix of GRO-candidates, and non GRO ones (pure acks) GRO sizes would be reasonable (not full size GRO packets).
Network device suspend/resume
Hi, I am bringing kernel bugzilla bug here https://bugzilla.kernel.org/show_bug.cgi?id=196399 This issue occured 2 months ago and we didn't see this again. Wondering if that appears again. Can you confirm if there is any bug in network suspend/resume in the case. One more instance here https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_4376/fi-skl-6600u/igt@gem_exec_susp...@basic-s4-devices.html Network device is 0b95:7720 ASIX Electronics Corp. AX88772 USB network card driver seems to be usbnet = Bug 196399: We have found out that since at least 4.11-rc1, some machines in the Intel GFX CI lab have been generating the following warning when suspending to s4 (suspend to disk): [ 287.212825] [ cut here ] [ 287.212829] WARNING: CPU: 0 PID: 3165 at net/sched/sch_generic.c:316 dev_watchdog+0x218/0x220 [ 287.212830] Modules linked in: mcs7830 usbnet mii snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul snd_hda_intel snd_hda_codec snd_hwdep ghash_clmulni_intel snd_hda_core snd_pcm i2c_designware_platform i2c_designware_core mei_me mei prime_numbers i2c_hid pinctrl_sunrisepoint pinctrl_intel [ 287.212864] CPU: 0 PID: 3165 Comm: gem_exec_suspen Tainted: G U 4.12.0-CI-CI_DRM_2829+ #1 [ 287.212865] Hardware name: Dell Inc. XPS 13 9360/093TW6, BIOS 1.3.2 01/18/2017 [ 287.212867] task: 8801b4084f40 task.stack: c91d8000 [ 287.212869] RIP: 0010:dev_watchdog+0x218/0x220 [ 287.212870] RSP: 0018:88027e403e38 EFLAGS: 00010292 [ 287.212872] RAX: 005a RBX: RCX: [ 287.212874] RDX: 0002 RSI: 81cbcf89 RDI: 81c9c627 [ 287.212875] RBP: 88027e403e68 R08: R09: 0001 [ 287.212876] R10: 28e9c215 R11: R12: 88026e08a848 [ 287.212877] R13: R14: 88026e050020 R15: 0001 [ 287.212878] FS: 7f345056a8c0() GS:88027e40() knlGS: [ 287.212880] CS: 0010 DS: ES: CR0: 80050033 [ 287.212881] CR2: 008d7008 CR3: 0001b4314000 CR4: 003406f0 [ 287.212882] Call Trace: [ 287.212883] [ 287.212886] ? qdisc_rcu_free+0x40/0x40 [ 287.212888] ? qdisc_rcu_free+0x40/0x40 [ 287.212891] call_timer_fn+0x8e/0x370 [ 287.212894] ? qdisc_rcu_free+0x40/0x40 [ 287.212896] expire_timers+0x150/0x1f0 [ 287.212899] run_timer_softirq+0x7c/0x160 [ 287.212903] __do_softirq+0x116/0x4a0 [ 287.212906] irq_exit+0xa9/0xc0 [ 287.212909] smp_apic_timer_interrupt+0x38/0x50 [ 287.212912] apic_timer_interrupt+0x90/0xa0 [ 287.212914] RIP: 0010:delay_tsc+0x33/0xc0 [ 287.212916] RSP: 0018:c91dbcd8 EFLAGS: 0286 ORIG_RAX: ff10 [ 287.212918] RAX: 8000 RBX: 0005964f23a0 RCX: 0001 [ 287.212919] RDX: 8001 RSI: 81c8e23a RDI: [ 287.212920] RBP: c91dbcf8 R08: R09: 0001 [ 287.212921] R10: R11: R12: 00059633478e [ 287.212922] R13: 00249f13 R14: R15: 880272eac008 [ 287.212924] [ 287.212929] ? delay_tsc+0x6b/0xc0 [ 287.212932] __delay+0xa/0x10 [ 287.212934] __const_udelay+0x31/0x40 [ 287.212936] hibernation_debug_sleep+0x20/0x30 [ 287.212938] hibernation_snapshot+0x2bc/0x5f0 [ 287.212940] hibernate+0x159/0x2f0 [ 287.212943] state_store+0xe0/0xf0 [ 287.212947] kobj_attr_store+0xf/0x20 [ 287.212949] sysfs_kf_write+0x40/0x50 [ 287.212951] kernfs_fop_write+0x130/0x1b0 [ 287.212955] __vfs_write+0x23/0x120 [ 287.212957] ? rcu_read_lock_sched_held+0x75/0x80 [ 287.212959] ? rcu_sync_lockdep_assert+0x2a/0x50 [ 287.212961] ? __sb_start_write+0xfa/0x1f0 [ 287.212964] vfs_write+0xc5/0x1d0 [ 287.212966] ? trace_hardirqs_on_caller+0xe7/0x1c0 [ 287.212969] SyS_write+0x44/0xb0 [ 287.212972] entry_SYSCALL_64_fastpath+0x1c/0xb1 [ 287.212973] RIP: 0033:0x7f344ed4a4a0 [ 287.212974] RSP: 002b:7ffef50dfaa8 EFLAGS: 0246 ORIG_RAX: 0001 [ 287.212977] RAX: ffda RBX: 81470683 RCX: 7f344ed4a4a0 [ 287.212978] RDX: 0004 RSI: 0041d211 RDI: 0006 [ 287.212979] RBP: c91dbf88 R08: 008d6a50 R09: [ 287.212980] R10: R11: 0246 R12: 0041d211 [ 287.212981] R13: 0006 R14: R15: [ 287.212984] ? __this_cpu_preempt_check+0x13/0x20 [ 287.212988] Code: 63 8e 18 04 00 00 eb 93 4c 89 f7 c6 05 77 5c 77 00 01 e8 dc 7f fd ff 89 d9 48 89 c2 4c 89 f6 48 c7 c7 18 f4 cf 81 e8 f1 c4 9d ff <0f> ff eb c3 0f 1f 40
network device suspend/resume
Hi, I am bringing kernel bugzilla bug here 196399 https://bugzilla.kernel.org/show_bug.cgi?id=196399 This issue occurred last time two months ago and I wonder if it appears again. Can you confirm if there is any issue related to network device suspend/resume. last instance is here https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_4376/fi-skl-6600u/igt@gem_exec_susp...@basic-s4-devices.html == 0b95:7720 ASIX Electronics Corp. AX88772 USB network card driver seems to be usbnet == Bug Report: 196399 We have found out that since at least 4.11-rc1, some machines in the Intel GFX CI lab have been generating the following warning when suspending to s4 (suspend to disk): [ 287.212825] [ cut here ] [ 287.212829] WARNING: CPU: 0 PID: 3165 at net/sched/sch_generic.c:316 dev_watchdog+0x218/0x220 [ 287.212830] Modules linked in: mcs7830 usbnet mii snd_hda_codec_hdmi snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul snd_hda_intel snd_hda_codec snd_hwdep ghash_clmulni_intel snd_hda_core snd_pcm i2c_designware_platform i2c_designware_core mei_me mei prime_numbers i2c_hid pinctrl_sunrisepoint pinctrl_intel [ 287.212864] CPU: 0 PID: 3165 Comm: gem_exec_suspen Tainted: G U 4.12.0-CI-CI_DRM_2829+ #1 [ 287.212865] Hardware name: Dell Inc. XPS 13 9360/093TW6, BIOS 1.3.2 01/18/2017 [ 287.212867] task: 8801b4084f40 task.stack: c91d8000 [ 287.212869] RIP: 0010:dev_watchdog+0x218/0x220 [ 287.212870] RSP: 0018:88027e403e38 EFLAGS: 00010292 [ 287.212872] RAX: 005a RBX: RCX: [ 287.212874] RDX: 0002 RSI: 81cbcf89 RDI: 81c9c627 [ 287.212875] RBP: 88027e403e68 R08: R09: 0001 [ 287.212876] R10: 28e9c215 R11: R12: 88026e08a848 [ 287.212877] R13: R14: 88026e050020 R15: 0001 [ 287.212878] FS: 7f345056a8c0() GS:88027e40() knlGS: [ 287.212880] CS: 0010 DS: ES: CR0: 80050033 [ 287.212881] CR2: 008d7008 CR3: 0001b4314000 CR4: 003406f0 [ 287.212882] Call Trace: [ 287.212883] [ 287.212886] ? qdisc_rcu_free+0x40/0x40 [ 287.212888] ? qdisc_rcu_free+0x40/0x40 [ 287.212891] call_timer_fn+0x8e/0x370 [ 287.212894] ? qdisc_rcu_free+0x40/0x40 [ 287.212896] expire_timers+0x150/0x1f0 [ 287.212899] run_timer_softirq+0x7c/0x160 [ 287.212903] __do_softirq+0x116/0x4a0 [ 287.212906] irq_exit+0xa9/0xc0 [ 287.212909] smp_apic_timer_interrupt+0x38/0x50 [ 287.212912] apic_timer_interrupt+0x90/0xa0 [ 287.212914] RIP: 0010:delay_tsc+0x33/0xc0 [ 287.212916] RSP: 0018:c91dbcd8 EFLAGS: 0286 ORIG_RAX: ff10 [ 287.212918] RAX: 8000 RBX: 0005964f23a0 RCX: 0001 [ 287.212919] RDX: 8001 RSI: 81c8e23a RDI: [ 287.212920] RBP: c91dbcf8 R08: R09: 0001 [ 287.212921] R10: R11: R12: 00059633478e [ 287.212922] R13: 00249f13 R14: R15: 880272eac008 [ 287.212924] [ 287.212929] ? delay_tsc+0x6b/0xc0 [ 287.212932] __delay+0xa/0x10 [ 287.212934] __const_udelay+0x31/0x40 [ 287.212936] hibernation_debug_sleep+0x20/0x30 [ 287.212938] hibernation_snapshot+0x2bc/0x5f0 [ 287.212940] hibernate+0x159/0x2f0 [ 287.212943] state_store+0xe0/0xf0 [ 287.212947] kobj_attr_store+0xf/0x20 [ 287.212949] sysfs_kf_write+0x40/0x50 [ 287.212951] kernfs_fop_write+0x130/0x1b0 [ 287.212955] __vfs_write+0x23/0x120 [ 287.212957] ? rcu_read_lock_sched_held+0x75/0x80 [ 287.212959] ? rcu_sync_lockdep_assert+0x2a/0x50 [ 287.212961] ? __sb_start_write+0xfa/0x1f0 [ 287.212964] vfs_write+0xc5/0x1d0 [ 287.212966] ? trace_hardirqs_on_caller+0xe7/0x1c0 [ 287.212969] SyS_write+0x44/0xb0 [ 287.212972] entry_SYSCALL_64_fastpath+0x1c/0xb1 [ 287.212973] RIP: 0033:0x7f344ed4a4a0 [ 287.212974] RSP: 002b:7ffef50dfaa8 EFLAGS: 0246 ORIG_RAX: 0001 [ 287.212977] RAX: ffda RBX: 81470683 RCX: 7f344ed4a4a0 [ 287.212978] RDX: 0004 RSI: 0041d211 RDI: 0006 [ 287.212979] RBP: c91dbf88 R08: 008d6a50 R09: [ 287.212980] R10: R11: 0246 R12: 0041d211 [ 287.212981] R13: 0006 R14: R15: [ 287.212984] ? __this_cpu_preempt_check+0x13/0x20 [ 287.212988] Code: 63 8e 18 04 00 00 eb 93 4c 89 f7 c6 05 77 5c 77 00 01 e8 dc 7f fd ff 89 d9 48 89 c2 4c 89 f6 48 c7 c7 18 f4 cf 81 e8 f1 c4 9d ff <0f> ff eb c3 0f 1f 40 00 48 c7 47 08 00 00 00 00 55 48 c7 07 00 [
Re: [PATCH 1/7] fix hnode refcounting
On 2018-09-06 10:35 p.m., Al Viro wrote: On Thu, Sep 06, 2018 at 06:21:09AM -0400, Jamal Hadi Salim wrote: [..] Argh... Unfortunately, there's this: in u32_delete() we have if (root_ht) { if (root_ht->refcnt > 1) { *last = false; goto ret; } if (root_ht->refcnt == 1) { if (!ht_empty(root_ht)) { *last = false; goto ret; } } } and that would need to be updated. It is not detrimental as you have it right now but you are right an adjustment is needed... Deleting of a root directly should not be allowed. But you can flush a whole tp. Consider this: -- sudo tc qdisc add dev $P ingress sudo tc filter add dev $P parent : protocol ip prio 10 \ u32 match ip protocol 1 0xff Which creates root ht 800 You shouldnt be allowed to do this: -- tc filter delete dev $P parent : protocol ip prio 10 handle 800: u32 --- But you can delete the tp entirely as such: --- tc filter delete dev $P parent : protocol ip prio 10 u32 -- The later will go via the destroy() path and flush all filters. You should also be able to delete individual filters. ex: $tc filter del dev $P parent : prio 10 handle 800:0:800 u32 Where that code you are referring to is important is when the last filter deleted - we need the caller to know and it destroys root. i.e you should return last=true when the last filter is deleted so root gets auto deleted (just like it was autocreated) However, that logics is bloody odd to start with. First of all, root_ht has come from struct tc_u_hnode *root_ht = rtnl_dereference(tp->root); and the only place where it's ever modified is rcu_assign_pointer(tp->root, root_ht); in u32_init(), where we'd bloody well checked that root_ht is non-NULL (see if (root_ht == NULL) return -ENOBUFS; upstream of that place) and where that assignment is inevitable on the way to returning 0. No matter what, if tp has passed u32_init() it will have non-NULL ->root, forever. And there is no way for tcf_proto to be seen outside of tcf_proto_create() without ->init() having returned 0 - it gets freed before anyone sees it. Yes, the check for root_ht is not necessary - but the check for the last filter (and testing for last) is needed. So this 'if (root_ht)' can't be false. What's more, what the hell is the whole thing checking? We are in u32_delete(). It's called (as ->delete()) from tfilter_del_notify(), which is called from tc_del_tfilter(). If we return 0 with *last true, we follow up calling tcf_proto_destroy(). OK, let's look at the logics in there: * if there are links to root hnode => false * if there's no links to root hnode and it has knodes => false (BTW, if we ever get there with root_ht->refcnt < 1, we are obviously screwed) * if there is a tcf_proto sharing tp->data => false (i.e. any filters with different prio - don't bother) * if tp is the only one with reference to tp->data and there are *any* knodes => false. Any extra links can come only from knodes in a non-empty hnode. And it's not a common case. Shouldn't thIe whole thing be * shared tp->data => false * any non-empty hnode => false instead? Perhaps even with the knode counter in tp->data, avoiding any loops in there, as well as the entire ht_empty()... Now, in the very beginning of u32_delete() we have this: struct tc_u_hnode *ht = arg; if (ht == NULL) goto out; OK, but the call of ->delete() is err = tp->ops->delete(tp, fh, last, extack); and arg == NULL seen in u32_delete() means fh == NULL in tfilter_del_notify(). Which is called in if (!fh) { ... } else { bool last; err = tfilter_del_notify(net, skb, n, tp, block, q, parent, fh, false, &last, extack); How can we ever get there with NULL fh? Try: tc filter delete dev $P parent : protocol ip prio 10 u32 tcm handle is 0, so will hit that code path. The whole thing makes very little sense; looks like it used to live in u32_destroy() prior to commit 763dbf6328e41 ("net_sched: move the empty tp check from ->destroy() to ->delete()"), but looking at the rationale in that commit... I don't see how it fixes anything - sure, now we remove tcf_proto from the list before calling ->destroy(). Without any RCU delays in between. How could it possibly solve any issues with ->classify() called in parallel with ->destroy()? cls_u32 (at least these days) does try to survive u32_destroy() in parallel with u32_classify(); if any other classifiers do not, they are still broken and that commit has not done anything for them. Anyway, adjus
Re: [PATCH 1/7] fix hnode refcounting
To clarify with an example i used to test your patches: #0 add ingress filter $TC qdisc add dev $P ingress #1 add filter $TC filter add dev $P parent : protocol ip prio 10 \ u32 match ip protocol 1 0xff #2 display $TC filter ls dev $P parent : #3 try to delete root $TC filter delete dev $P parent : protocol ip prio 10 \ handle 800: u32 #4 nothing changes.. $TC filter ls dev $P parent : #5 delete filter $TC filter delete dev $P parent : protocol ip prio 10 \ handle 800:0:800 u32 #6 filter gone but hash table still there.. $TC filter ls dev $P parent : #7 delete tp $TC filter delete dev $P parent : protocol ip prio 10 \ u32 #8 now it is gone.. $TC filter ls dev $P parent : your patches show #6 filter as still active. We want it to look like #8 Hope this helps. cheers, jamal
[PATCH net-next v2] net: sched: change tcf_del_walker() to take idrinfo->lock
Action API was changed to work with actions and action_idr in concurrency safe manner, however tcf_del_walker() still uses actions without taking a reference or idrinfo->lock first, and deletes them directly, disregarding possible concurrent delete. Add tc_action_wq workqueue to action API. Implement tcf_idr_release_unsafe() that assumes external synchronization by caller and delays blocking action cleanup part to tc_action_wq workqueue. Extend tcf_action_cleanup() with 'async' argument to indicate that function should free action asynchronously. Change tcf_del_walker() to take idrinfo->lock while iterating over actions and use new tcf_idr_release_unsafe() to release them while holding the lock. Signed-off-by: Vlad Buslov --- include/net/act_api.h | 1 + net/sched/act_api.c | 73 --- 2 files changed, 65 insertions(+), 9 deletions(-) diff --git a/include/net/act_api.h b/include/net/act_api.h index c6f195b3c706..4c5117bc4afb 100644 --- a/include/net/act_api.h +++ b/include/net/act_api.h @@ -38,6 +38,7 @@ struct tc_action { struct gnet_stats_queue __percpu *cpu_qstats; struct tc_cookie__rcu *act_cookie; struct tcf_chain*goto_chain; + struct work_struct work; }; #define tcf_index common.tcfa_index #define tcf_refcnt common.tcfa_refcnt diff --git a/net/sched/act_api.c b/net/sched/act_api.c index 6f118d62c731..4ad9062c34b3 100644 --- a/net/sched/act_api.c +++ b/net/sched/act_api.c @@ -90,13 +90,38 @@ static void free_tcf(struct tc_action *p) kfree(p); } -static void tcf_action_cleanup(struct tc_action *p) +static void tcf_action_free(struct tc_action *p) +{ + gen_kill_estimator(&p->tcfa_rate_est); + free_tcf(p); +} + +static void tcf_action_free_work(struct work_struct *work) +{ + struct tc_action *p = container_of(work, + struct tc_action, + work); + + tcf_action_free(p); +} + +static struct workqueue_struct *tc_action_wq; + +static bool tcf_action_queue_work(struct work_struct *work, work_func_t func) +{ + INIT_WORK(work, func); + return queue_work(tc_action_wq, work); +} + +static void tcf_action_cleanup(struct tc_action *p, bool async) { if (p->ops->cleanup) p->ops->cleanup(p); - gen_kill_estimator(&p->tcfa_rate_est); - free_tcf(p); + if (async) + tcf_action_queue_work(&p->work, tcf_action_free_work); + else + tcf_action_free(p); } static int __tcf_action_put(struct tc_action *p, bool bind) @@ -109,7 +134,7 @@ static int __tcf_action_put(struct tc_action *p, bool bind) idr_remove(&idrinfo->action_idr, p->tcfa_index); spin_unlock(&idrinfo->lock); - tcf_action_cleanup(p); + tcf_action_cleanup(p, false); return 1; } @@ -147,6 +172,24 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool strict) } EXPORT_SYMBOL(__tcf_idr_release); +/* Release idr without obtaining idrinfo->lock. Caller must prevent any + * concurrent modifications of idrinfo->action_idr! + */ + +static int tcf_idr_release_unsafe(struct tc_action *p) +{ + if (atomic_read(&p->tcfa_bindcnt) > 0) + return -EPERM; + + if (refcount_dec_and_test(&p->tcfa_refcnt)) { + idr_remove(&p->idrinfo->action_idr, p->tcfa_index); + tcf_action_cleanup(p, true); + return ACT_P_DELETED; + } + + return 0; +} + static size_t tcf_action_shared_attrs_size(const struct tc_action *act) { struct tc_cookie *act_cookie; @@ -262,20 +305,25 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, struct sk_buff *skb, if (nla_put_string(skb, TCA_KIND, ops->kind)) goto nla_put_failure; + spin_lock(&idrinfo->lock); idr_for_each_entry_ul(idr, p, id) { - ret = __tcf_idr_release(p, false, true); + ret = tcf_idr_release_unsafe(p); if (ret == ACT_P_DELETED) { module_put(ops->owner); n_i++; } else if (ret < 0) { - goto nla_put_failure; + goto nla_put_failure_locked; } } + spin_unlock(&idrinfo->lock); + if (nla_put_u32(skb, TCA_FCNT, n_i)) goto nla_put_failure; nla_nest_end(skb, nest); return n_i; +nla_put_failure_locked: + spin_unlock(&idrinfo->lock); nla_put_failure: nla_nest_cancel(skb, nest); return ret; @@ -341,7 +389,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo *idrinfo, u32 index) p->tcfa_index)); spin_unlock(&idrinfo->lock); - tcf_action_cleanup(p); + tcf_
[iproute2 PATCH] bridge: Correct json output
The current implementation adds configured vlans as "vlan": [ ... ] into an array. This is malformed json and fails to be parsed. This patch creates an object to include this key value pair. Test with: ip l a type bridge ./bridge/bridge -j vlan | jq fixes c7c1a1ef5 Signed-off-by: Tobias Jungel --- bridge/vlan.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/bridge/vlan.c b/bridge/vlan.c index 19a36b80..9376b985 100644 --- a/bridge/vlan.c +++ b/bridge/vlan.c @@ -632,6 +632,7 @@ void print_vlan_info(FILE *fp, struct rtattr *tb) if (!is_json_context()) fprintf(fp, "%s", _SL_); + open_json_object(NULL); open_json_array(PRINT_JSON, "vlan"); for (i = RTA_DATA(list); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) { @@ -659,6 +660,7 @@ void print_vlan_info(FILE *fp, struct rtattr *tb) } close_json_array(PRINT_ANY, "\n"); + close_json_object(); } int do_vlan(int argc, char **argv)
[PATCH net-next v2] net: sched: cls_flower: dump offload count value
Change flower in_hw_count type to fixed-size u32 and dump it as TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared blocks and re-offload functionality. Signed-off-by: Vlad Buslov Acked-by: Jiri Pirko --- include/net/sch_generic.h| 2 +- include/uapi/linux/pkt_cls.h | 2 ++ net/sched/cls_flower.c | 5 - 3 files changed, 7 insertions(+), 2 deletions(-) diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a6d00093f35e..d68ac55539a5 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -362,7 +362,7 @@ static inline void tcf_block_offload_dec(struct tcf_block *block, u32 *flags) } static inline void -tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt, +tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt, u32 *flags, bool add) { if (add) { diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h index be382fb0592d..401d0c1e612d 100644 --- a/include/uapi/linux/pkt_cls.h +++ b/include/uapi/linux/pkt_cls.h @@ -483,6 +483,8 @@ enum { TCA_FLOWER_KEY_ENC_OPTS, TCA_FLOWER_KEY_ENC_OPTS_MASK, + TCA_FLOWER_IN_HW_COUNT, + __TCA_FLOWER_MAX, }; diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c index 6fd9bdd93796..4b8dd37dd4f8 100644 --- a/net/sched/cls_flower.c +++ b/net/sched/cls_flower.c @@ -98,7 +98,7 @@ struct cls_fl_filter { struct list_head list; u32 handle; u32 flags; - unsigned int in_hw_count; + u32 in_hw_count; struct rcu_work rwork; struct net_device *hw_dev; }; @@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, void *fh, if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags)) goto nla_put_failure; + if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count)) + goto nla_put_failure; + if (tcf_exts_dump(skb, &f->exts)) goto nla_put_failure; -- 2.7.5
Re: [PATCH net-next v2] net: sched: cls_flower: dump offload count value
On Fri, 7 Sep 2018 17:22:21 +0300, Vlad Buslov wrote: > Change flower in_hw_count type to fixed-size u32 and dump it as > TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared > blocks and re-offload functionality. > > Signed-off-by: Vlad Buslov > Acked-by: Jiri Pirko LGTM, thanks for the respin!
Allow bpf_perf_event_output to access packet data
Re-sent due to HTML e-mail mess up, apologies. -- Forwarded message -- From: Lorenz Bauer Date: 7 September 2018 at 15:53 Subject: Allow bpf_perf_event_output to access packet data To: netdev@vger.kernel.org, Alexei Starovoitov , Daniel Borkmann Hello list, I'm attempting to use bpf_perf_event_output to do packet sampling from XDP. The code basically runs before our other XDP code, does a perf_event_output with the full packet (for now) and then tail calls into DDoS mitigation, etc. I've just discovered that perf_event_output isn't allowed to access packet data by the verifier. Is this something that could be allowed? Best Lorenz -- Lorenz Bauer | Systems Engineer 25 Lavington St., London SE1 0NZ www.cloudflare.com
for_each_netdev_feature() broken on big endian
Hi, On a MIPS 32 Big endian system the netdev_sync_upper_features() function does not work correctly. It does not disbale bit 15 (NETIF_F_LRO, 0x8000), but 47 (NETIF_F_HW_TC, 0x8000). The for_each_netdev_feature() macro is used to go over all netdev feature flags and calls for_each_set_bit() with a u64. This is the code: #define for_each_netdev_feature(mask_addr, bit) \ for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) https://elixir.bootlin.com/linux/v4.19-rc2/source/include/linux/netdev_features.h#L157 The for_each_set_bit() macro calls the find_first_bit() macro and works directly on the bits in the memory organized in 32 bit values in the generic implementation, see here: https://elixir.bootlin.com/linux/v4.19-rc2/source/lib/find_bit.c#L102 On big endian systems the u64 value is stored in a different order in memory compared to little endian systems. When I execute this code: netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES; printk("%s:%i: addr: 0x%llx, size: %i\n", __func__, __LINE__, upper_disables, NETDEV_FEATURE_COUNT); ret = find_first_bit((unsigned long *)&upper_disables, NETDEV_FEATURE_COUNT); printk("%s:%i: addr: 0x%llx, size: %i, ret: %li\n", __func__, __LINE__, upper_disables, NETDEV_FEATURE_COUNT, ret); I get this result: [ 35.227140] netdev_sync_upper_features:6912: addr: 0x8000, size: 48 [ 35.27] netdev_sync_upper_features:6914: addr: 0x8000, size: 48, ret: 47 Expected would be a ret: 15. As I understood the documentation for for_each_set_bit() this should find the first bit in memory and not in a integer, so for_each_netdev_feature() should not use for_each_set_bit(). I do not really know what a good fix would be, I could convert the feature flags to little endian before calling for_each_netdev_feature(), would this be ok? Hauke
Re: for_each_netdev_feature() broken on big endian
From: "Mehrtens, Hauke" Date: Fri, 7 Sep 2018 15:10:53 + > On a MIPS 32 Big endian system the netdev_sync_upper_features() function does > not work correctly. > It does not disbale bit 15 (NETIF_F_LRO, 0x8000), but 47 > (NETIF_F_HW_TC, 0x8000). > > The for_each_netdev_feature() macro is used to go over all netdev feature > flags and calls for_each_set_bit() with a u64. > This is the code: > #define for_each_netdev_feature(mask_addr, bit) \ > for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT) > https://elixir.bootlin.com/linux/v4.19-rc2/source/include/linux/netdev_features.h#L157 Good catch, yes we cannot use the the generic bit handling macros on the netdev feature flags because the feature flags are a u64 operated on as a unit whereas "long" may be 32-bit or 64-bit depending upon the architecture.
Re: [PATCH v4 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace
On Thu, Sep 06, 2018 at 05:51:12PM +0300, Arseny Maslennikov wrote: > Some tools may currently be using only the deprecated attribute; > let's print an elaborate and clear deprecation notice to kmsg. > > To do that, we have to replace the whole sysfs file, since we inherit > the original one from netdev. > > Signed-off-by: Arseny Maslennikov > drivers/infiniband/ulp/ipoib/ipoib_main.c | 31 +++ > 1 file changed, 31 insertions(+) > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c > b/drivers/infiniband/ulp/ipoib/ipoib_main.c > index 30f840f874b3..74732726ec6f 100644 > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c > @@ -2386,6 +2386,35 @@ int ipoib_add_pkey_attr(struct net_device *dev) > return device_create_file(&dev->dev, &dev_attr_pkey); > } > > +/* > + * We erroneously exposed the iface's port number in the dev_id > + * sysfs field long after dev_port was introduced for that purpose[1], > + * and we need to stop everyone from relying on that. > + * Let's overload the shower routine for the dev_id file here > + * to gently bring the issue up. > + * > + * [1] https://www.spinics.net/lists/netdev/msg272123.html > + */ > +static ssize_t dev_id_show(struct device *dev, > +struct device_attribute *attr, char *buf) > +{ > + struct net_device *ndev = to_net_dev(dev); > + > + if (ndev->dev_id == ndev->dev_port) > + netdev_info_once(ndev, > + "\"%s\" wants to know my dev_id. Should it look at > dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more > info.\n", > + current->comm); > + > + return sprintf(buf, "%#x\n", ndev->dev_id); > +} > +static DEVICE_ATTR_RO(dev_id); > + > +int ipoib_intercept_dev_id_attr(struct net_device *dev) > +{ > + device_remove_file(&dev->dev, &dev_attr_dev_id); > + return device_create_file(&dev->dev, &dev_attr_dev_id); > +} Isn't this racey with userspace? Ie what happens if udev is querying the dev_id right here? Do we know there is no userspace doing this? > static struct net_device *ipoib_add_port(const char *format, >struct ib_device *hca, u8 port) > { > @@ -2427,6 +2456,8 @@ static struct net_device *ipoib_add_port(const char > *format, >*/ > ndev->priv_destructor = ipoib_intf_free; > > + if (ipoib_intercept_dev_id_attr(ndev)) > + goto sysfs_failed; No device_remove_file needed? Jason
Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge
On 9/7/18 9:56 AM, D'Souza, Nelson wrote: > > *From:* David Ahern > *Sent:* Thursday, September 6, 2018 5:27 PM > *To:* D'Souza, Nelson; netdev@vger.kernel.org > *Subject:* Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge > > On 9/5/18 12:00 PM, D'Souza, Nelson wrote: >> Just following up would you be able to confirm that this is a > Linux VRF issue? > > I can confirm that I can reproduce the problem. Need to find time to dig > into it. bridge's netfilter hook is dropping the packet. bridge's netfilter code registers hook operations that are invoked when nh_hook is called. It then sees all subsequent calls to nf_hook. Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish, then resets in_prerouting flag to 0. Any subsequent calls to nf_hook invoke ip_sabotage_in. That function sees in_prerouting is not set and steals (drops) the packet. The simplest change is to have ip_sabotage_in recognize that the bridge can be enslaved to a VRF (L3 master device) and allow the packet to continue. Thanks to Ido for the hint on ip_sabotage_in. This patch works for me: diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6e0dc6bcd32a..37278dc280eb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; }
Re: [PATCH v3 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace
On Thu, 2018-09-06 at 16:03 +0300, Leon Romanovsky wrote: > On Thu, Sep 06, 2018 at 10:04:33AM +0300, Arseny Maslennikov wrote: > > On Wed, Sep 05, 2018 at 04:50:35PM +0300, Leon Romanovsky wrote: > > > On Mon, Sep 03, 2018 at 07:13:16PM +0300, Arseny Maslennikov wrote: > > > > Signed-off-by: Arseny Maslennikov > > > > --- > > > > drivers/infiniband/ulp/ipoib/ipoib_main.c | 38 +++ > > > > 1 file changed, 38 insertions(+) > > > > > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > index 30f840f874b3..7386e5bde3d3 100644 > > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > @@ -2386,6 +2386,42 @@ int ipoib_add_pkey_attr(struct net_device *dev) > > > > return device_create_file(&dev->dev, &dev_attr_pkey); > > > > } > > > > > > > > +/* > > > > + * We erroneously exposed the iface's port number in the dev_id > > > > + * sysfs field long after dev_port was introduced for that purpose[1], > > > > + * and we need to stop everyone from relying on that. > > > > + * Let's overload the shower routine for the dev_id file here > > > > + * to gently bring the issue up. > > > > + * > > > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html > > > > + */ > > > > +static ssize_t dev_id_show(struct device *dev, > > > > + struct device_attribute *attr, char *buf) > > > > +{ > > > > + struct net_device *ndev = to_net_dev(dev); > > > > + ssize_t ret = -EINVAL; > > > > + > > > > + if (ndev->dev_id == ndev->dev_port) { > > > > + netdev_info_once(ndev, > > > > + "\"%s\" wants to know my dev_id. " > > > > + "Should it look at dev_port instead?\n", > > > > + current->comm); > > > > + netdev_info_once(ndev, > > > > + "See Documentation/ABI/testing/sysfs-class-net > > > > for more info.\n"); > > > > + } > > > > + > > > > + ret = sprintf(buf, "%#x\n", ndev->dev_id); > > > > + > > > > + return ret; > > > > +} > > > > +static DEVICE_ATTR_RO(dev_id); > > > > + > > > > > > I don't see this field among exposed by IPoIB, why should we expose it > > > now? > > > > > > > To deviate from standard netdev behaviour, which only prints the > > field out. Doug wanted this to also print a deprecation message, and > > netdev (obviously) does not do that. See below. > > > > > > +int ipoib_intercept_dev_id_attr(struct net_device *dev) > > > > +{ > > > > + device_remove_file(&dev->dev, &dev_attr_dev_id); > > > > + return device_create_file(&dev->dev, &dev_attr_dev_id); > > > > > > Why isn't enough to rely on netdev code? > > > > > > > Netdev code relies on macros around a *static* function 'netdev_show', > > which is defined in net/core/net-sysfs.c; it is not listed in any header > > files, and the macros aren't as well. This all leads me to believe it > > was not really meant to be used from outside net/core/net-sysfs. > > > > The only way we could use any netdev code here is to set up our own > > handler (again), printk() a message, then call netdev_show — but we have > > no access to it. > > > > Of course, it also may be that I'm terribly missing a clue. > > Thanks, > > IMHO, the end result of adequate Doug's request is a little bit too much. > I don't think that it justifies such remove->create construction. > > Personal opinion. I agree with you that the end result is kinda bulky, *but*, we will need to know if there are things using the old dev_id before we can remove it. In particular, I'm concerned the IPoIB handling code of NetworkManager uses it. It's worth the cost I think. -- Doug Ledford GPG KeyID: B826A3330E572FDD Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD signature.asc Description: This is a digitally signed message part
Re: [PATCH v4 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace
On Fri, 2018-09-07 at 09:43 -0600, Jason Gunthorpe wrote: > On Thu, Sep 06, 2018 at 05:51:12PM +0300, Arseny Maslennikov wrote: > > Some tools may currently be using only the deprecated attribute; > > let's print an elaborate and clear deprecation notice to kmsg. > > > > To do that, we have to replace the whole sysfs file, since we inherit > > the original one from netdev. > > > > Signed-off-by: Arseny Maslennikov > > drivers/infiniband/ulp/ipoib/ipoib_main.c | 31 +++ > > 1 file changed, 31 insertions(+) > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > index 30f840f874b3..74732726ec6f 100644 > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > @@ -2386,6 +2386,35 @@ int ipoib_add_pkey_attr(struct net_device *dev) > > return device_create_file(&dev->dev, &dev_attr_pkey); > > } > > > > +/* > > + * We erroneously exposed the iface's port number in the dev_id > > + * sysfs field long after dev_port was introduced for that purpose[1], > > + * and we need to stop everyone from relying on that. > > + * Let's overload the shower routine for the dev_id file here > > + * to gently bring the issue up. > > + * > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html > > + */ > > +static ssize_t dev_id_show(struct device *dev, > > + struct device_attribute *attr, char *buf) > > +{ > > + struct net_device *ndev = to_net_dev(dev); > > + > > + if (ndev->dev_id == ndev->dev_port) > > + netdev_info_once(ndev, > > + "\"%s\" wants to know my dev_id. Should it look at > > dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more > > info.\n", > > + current->comm); > > + > > + return sprintf(buf, "%#x\n", ndev->dev_id); > > +} > > +static DEVICE_ATTR_RO(dev_id); > > + > > +int ipoib_intercept_dev_id_attr(struct net_device *dev) > > +{ > > + device_remove_file(&dev->dev, &dev_attr_dev_id); > > + return device_create_file(&dev->dev, &dev_attr_dev_id); > > +} > > Isn't this racey with userspace? Ie what happens if udev is querying > the dev_id right here? > > Do we know there is no userspace doing this? I don't think that it can race (or reasonably can). The intercept function is done as part of ipoib_add_port() so the port itself isn't live yet. Things like udev shouldn't be scanning it until after we've finished bringing it up and added it to the system, so any race here is unimportant IMO. > > static struct net_device *ipoib_add_port(const char *format, > > struct ib_device *hca, u8 port) > > { > > @@ -2427,6 +2456,8 @@ static struct net_device *ipoib_add_port(const char > > *format, > > */ > > ndev->priv_destructor = ipoib_intf_free; > > > > + if (ipoib_intercept_dev_id_attr(ndev)) > > + goto sysfs_failed; > > No device_remove_file needed? > > Jason -- Doug Ledford GPG KeyID: B826A3330E572FDD Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD signature.asc Description: This is a digitally signed message part
Re: [PATCH v4 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace
On Fri, Sep 07, 2018 at 01:14:51PM -0400, Doug Ledford wrote: > On Fri, 2018-09-07 at 09:43 -0600, Jason Gunthorpe wrote: > > On Thu, Sep 06, 2018 at 05:51:12PM +0300, Arseny Maslennikov wrote: > > > Some tools may currently be using only the deprecated attribute; > > > let's print an elaborate and clear deprecation notice to kmsg. > > > > > > To do that, we have to replace the whole sysfs file, since we inherit > > > the original one from netdev. > > > > > > Signed-off-by: Arseny Maslennikov > > > drivers/infiniband/ulp/ipoib/ipoib_main.c | 31 +++ > > > 1 file changed, 31 insertions(+) > > > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > index 30f840f874b3..74732726ec6f 100644 > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > @@ -2386,6 +2386,35 @@ int ipoib_add_pkey_attr(struct net_device *dev) > > > return device_create_file(&dev->dev, &dev_attr_pkey); > > > } > > > > > > +/* > > > + * We erroneously exposed the iface's port number in the dev_id > > > + * sysfs field long after dev_port was introduced for that purpose[1], > > > + * and we need to stop everyone from relying on that. > > > + * Let's overload the shower routine for the dev_id file here > > > + * to gently bring the issue up. > > > + * > > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html > > > + */ > > > +static ssize_t dev_id_show(struct device *dev, > > > +struct device_attribute *attr, char *buf) > > > +{ > > > + struct net_device *ndev = to_net_dev(dev); > > > + > > > + if (ndev->dev_id == ndev->dev_port) > > > + netdev_info_once(ndev, > > > + "\"%s\" wants to know my dev_id. Should it look at > > > dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more > > > info.\n", > > > + current->comm); > > > + > > > + return sprintf(buf, "%#x\n", ndev->dev_id); > > > +} > > > +static DEVICE_ATTR_RO(dev_id); > > > + > > > +int ipoib_intercept_dev_id_attr(struct net_device *dev) > > > +{ > > > + device_remove_file(&dev->dev, &dev_attr_dev_id); > > > + return device_create_file(&dev->dev, &dev_attr_dev_id); > > > +} > > > > Isn't this racey with userspace? Ie what happens if udev is querying > > the dev_id right here? > > > > Do we know there is no userspace doing this? > > I don't think that it can race (or reasonably can). The intercept > function is done as part of ipoib_add_port() so the port itself isn't > live yet. The above code is after register_netdev, so the port itself is certainly live as far as userspace is concerned.. All the other sysfs stuff in add_port is already wrong/racy.. See Parav's recent series fixing this for the main devices.. Jason
Re: [PATCH v3 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace
On Fri, Sep 07, 2018 at 01:14:37PM -0400, Doug Ledford wrote: > On Thu, 2018-09-06 at 16:03 +0300, Leon Romanovsky wrote: > > On Thu, Sep 06, 2018 at 10:04:33AM +0300, Arseny Maslennikov wrote: > > > On Wed, Sep 05, 2018 at 04:50:35PM +0300, Leon Romanovsky wrote: > > > > On Mon, Sep 03, 2018 at 07:13:16PM +0300, Arseny Maslennikov wrote: > > > > > Signed-off-by: Arseny Maslennikov > > > > > --- > > > > > drivers/infiniband/ulp/ipoib/ipoib_main.c | 38 > > > > > +++ > > > > > 1 file changed, 38 insertions(+) > > > > > > > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > > index 30f840f874b3..7386e5bde3d3 100644 > > > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > > @@ -2386,6 +2386,42 @@ int ipoib_add_pkey_attr(struct net_device *dev) > > > > > return device_create_file(&dev->dev, &dev_attr_pkey); > > > > > } > > > > > > > > > > +/* > > > > > + * We erroneously exposed the iface's port number in the dev_id > > > > > + * sysfs field long after dev_port was introduced for that > > > > > purpose[1], > > > > > + * and we need to stop everyone from relying on that. > > > > > + * Let's overload the shower routine for the dev_id file here > > > > > + * to gently bring the issue up. > > > > > + * > > > > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html > > > > > + */ > > > > > +static ssize_t dev_id_show(struct device *dev, > > > > > +struct device_attribute *attr, char *buf) > > > > > +{ > > > > > + struct net_device *ndev = to_net_dev(dev); > > > > > + ssize_t ret = -EINVAL; > > > > > + > > > > > + if (ndev->dev_id == ndev->dev_port) { > > > > > + netdev_info_once(ndev, > > > > > + "\"%s\" wants to know my dev_id. " > > > > > + "Should it look at dev_port instead?\n", > > > > > + current->comm); > > > > > + netdev_info_once(ndev, > > > > > + "See Documentation/ABI/testing/sysfs-class-net > > > > > for more info.\n"); > > > > > + } > > > > > + > > > > > + ret = sprintf(buf, "%#x\n", ndev->dev_id); > > > > > + > > > > > + return ret; > > > > > +} > > > > > +static DEVICE_ATTR_RO(dev_id); > > > > > + > > > > > > > > I don't see this field among exposed by IPoIB, why should we expose it > > > > now? > > > > > > > > > > To deviate from standard netdev behaviour, which only prints the > > > field out. Doug wanted this to also print a deprecation message, and > > > netdev (obviously) does not do that. See below. > > > > > > > > +int ipoib_intercept_dev_id_attr(struct net_device *dev) > > > > > +{ > > > > > + device_remove_file(&dev->dev, &dev_attr_dev_id); > > > > > + return device_create_file(&dev->dev, &dev_attr_dev_id); > > > > > > > > Why isn't enough to rely on netdev code? > > > > > > > > > > Netdev code relies on macros around a *static* function 'netdev_show', > > > which is defined in net/core/net-sysfs.c; it is not listed in any header > > > files, and the macros aren't as well. This all leads me to believe it > > > was not really meant to be used from outside net/core/net-sysfs. > > > > > > The only way we could use any netdev code here is to set up our own > > > handler (again), printk() a message, then call netdev_show — but we have > > > no access to it. > > > > > > Of course, it also may be that I'm terribly missing a clue. > > > > Thanks, > > > > IMHO, the end result of adequate Doug's request is a little bit too much. > > I don't think that it justifies such remove->create construction. > > > > Personal opinion. > > I agree with you that the end result is kinda bulky, *but*, we will need > to know if there are things using the old dev_id before we can remove > it. In particular, I'm concerned the IPoIB handling code of > NetworkManager uses it. It's worth the cost I think. I did my checks now and saw that ibdev2netdev uses the value provided from this dev_id, so every invocation of that script will generate the warning. See this line in Parav's repo https://github.com/Mellanox/container_scripts/blob/master/ibdev2netdev#L112 Thanks > > -- > Doug Ledford > GPG KeyID: B826A3330E572FDD > Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD signature.asc Description: PGP signature
Re: [PATCH net-next] veth: report NEWLINK event when moving the peer device in a new namespace
Hi David, On Mon, 2018-09-03 at 20:54 -0600, David Ahern wrote: > From init_net: > $ ip monitor all-nsid I thought the concern of the patch is the overhead of sending one additional RTM_NEWLINK message. This workaround has likely higher overhead. More importantly, it's so cumbersome, that I doubt anybody would implementing such a solution. When the events of one namespace are not sufficient to get all relevant information (local to the namespace itself), the solution is not monitor all-nsid. You might save complexity and performance overhead in kernel. But what you save here is just moved to user-space, which faces higher complexity (at multiple places/projects, where developers are not experts in netlink) and higher overhead. RTM_GETLINK/NLM_F_DUMP allows to fetch information. The same information is usually also emitted on changes via RTM_NEWLINK notifications. Yes, the information may be avilable somehow, but how cumbersome: a) receive RTM_DELLINK, recognize that the message belongs to a veth that was moved to another netns, and recognize that the peer's IFLA_LINK changed. This approach only works when the veth is moved away from the current namespace. b) or, enable monitor all-nsid, receive RTM_NEWLINK, recognize that the event is for moving a veth between netns, find the peer in the other netns and recognize that the peer's IFLA_LINK changed. best, THomas signature.asc Description: This is a digitally signed message part
Re: [PATCH net-next v2] net: sched: change tcf_del_walker() to take idrinfo->lock
On Fri, Sep 7, 2018 at 6:52 AM Vlad Buslov wrote: > > Action API was changed to work with actions and action_idr in concurrency > safe manner, however tcf_del_walker() still uses actions without taking a > reference or idrinfo->lock first, and deletes them directly, disregarding > possible concurrent delete. > > Add tc_action_wq workqueue to action API. Implement > tcf_idr_release_unsafe() that assumes external synchronization by caller > and delays blocking action cleanup part to tc_action_wq workqueue. Extend > tcf_action_cleanup() with 'async' argument to indicate that function should > free action asynchronously. Where exactly is blocking in tcf_action_cleanup()? >From your code, it looks like free_tcf(), but from my observation, the only blocking function inside is tcf_action_goto_chain_fini() which calls __tcf_chain_put(). But, __tcf_chain_put() is blocking _ONLY_ when tc_chain_notify() is called, for tc action it is never called. So, what else is blocking?
Re: [PATCH net-next 13/13] net: sched: add flags to Qdisc class ops struct
On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov wrote: > > Extend Qdisc_class_ops with flags. Create enum to hold possible class ops > flag values. Add first class ops flags value QDISC_CLASS_OPS_DOIT_UNLOCKED > to indicate that class ops functions can be called without taking rtnl > lock. We don't add anything that is not used. This is the last patch in this series, so I am pretty sure you split it in a wrong way, it certainly belongs to next series, not this series.
Re: [PATCH net-next 09/13] net: sched: extend tcf_block with rcu
On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov wrote: > > Extend tcf_block with rcu to allow safe deallocation when it is accessed > concurrently. This sucks, please fold this patch into where you call rcu_read_lock() on tcf block. This patch _alone_ is apparently not complete. This is not how we split patches.
Re: [PATCH v3 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace
On Fri, 2018-09-07 at 20:28 +0300, Leon Romanovsky wrote: > On Fri, Sep 07, 2018 at 01:14:37PM -0400, Doug Ledford wrote: > > On Thu, 2018-09-06 at 16:03 +0300, Leon Romanovsky wrote: > > > On Thu, Sep 06, 2018 at 10:04:33AM +0300, Arseny Maslennikov wrote: > > > > On Wed, Sep 05, 2018 at 04:50:35PM +0300, Leon Romanovsky wrote: > > > > > On Mon, Sep 03, 2018 at 07:13:16PM +0300, Arseny Maslennikov wrote: > > > > > > Signed-off-by: Arseny Maslennikov > > > > > > --- > > > > > > drivers/infiniband/ulp/ipoib/ipoib_main.c | 38 > > > > > > +++ > > > > > > 1 file changed, 38 insertions(+) > > > > > > > > > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > > > index 30f840f874b3..7386e5bde3d3 100644 > > > > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c > > > > > > @@ -2386,6 +2386,42 @@ int ipoib_add_pkey_attr(struct net_device > > > > > > *dev) > > > > > > return device_create_file(&dev->dev, &dev_attr_pkey); > > > > > > } > > > > > > > > > > > > +/* > > > > > > + * We erroneously exposed the iface's port number in the dev_id > > > > > > + * sysfs field long after dev_port was introduced for that > > > > > > purpose[1], > > > > > > + * and we need to stop everyone from relying on that. > > > > > > + * Let's overload the shower routine for the dev_id file here > > > > > > + * to gently bring the issue up. > > > > > > + * > > > > > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html > > > > > > + */ > > > > > > +static ssize_t dev_id_show(struct device *dev, > > > > > > + struct device_attribute *attr, char *buf) > > > > > > +{ > > > > > > + struct net_device *ndev = to_net_dev(dev); > > > > > > + ssize_t ret = -EINVAL; > > > > > > + > > > > > > + if (ndev->dev_id == ndev->dev_port) { > > > > > > + netdev_info_once(ndev, > > > > > > + "\"%s\" wants to know my dev_id. " > > > > > > + "Should it look at dev_port instead?\n", > > > > > > + current->comm); > > > > > > + netdev_info_once(ndev, > > > > > > + "See Documentation/ABI/testing/sysfs-class-net > > > > > > for more info.\n"); > > > > > > + } > > > > > > + > > > > > > + ret = sprintf(buf, "%#x\n", ndev->dev_id); > > > > > > + > > > > > > + return ret; > > > > > > +} > > > > > > +static DEVICE_ATTR_RO(dev_id); > > > > > > + > > > > > > > > > > I don't see this field among exposed by IPoIB, why should we expose > > > > > it now? > > > > > > > > > > > > > To deviate from standard netdev behaviour, which only prints the > > > > field out. Doug wanted this to also print a deprecation message, and > > > > netdev (obviously) does not do that. See below. > > > > > > > > > > +int ipoib_intercept_dev_id_attr(struct net_device *dev) > > > > > > +{ > > > > > > + device_remove_file(&dev->dev, &dev_attr_dev_id); > > > > > > + return device_create_file(&dev->dev, &dev_attr_dev_id); > > > > > > > > > > Why isn't enough to rely on netdev code? > > > > > > > > > > > > > Netdev code relies on macros around a *static* function 'netdev_show', > > > > which is defined in net/core/net-sysfs.c; it is not listed in any header > > > > files, and the macros aren't as well. This all leads me to believe it > > > > was not really meant to be used from outside net/core/net-sysfs. > > > > > > > > The only way we could use any netdev code here is to set up our own > > > > handler (again), printk() a message, then call netdev_show — but we have > > > > no access to it. > > > > > > > > Of course, it also may be that I'm terribly missing a clue. > > > > > > Thanks, > > > > > > IMHO, the end result of adequate Doug's request is a little bit too much. > > > I don't think that it justifies such remove->create construction. > > > > > > Personal opinion. > > > > I agree with you that the end result is kinda bulky, *but*, we will need > > to know if there are things using the old dev_id before we can remove > > it. In particular, I'm concerned the IPoIB handling code of > > NetworkManager uses it. It's worth the cost I think. > > I did my checks now and saw that ibdev2netdev uses the value provided > from this dev_id, so every invocation of that script will generate the > warning. > > See this line in Parav's repo > https://github.com/Mellanox/container_scripts/blob/master/ibdev2netdev#L112 I'm pretty sure libvirt + qemu will trigger it too. -- Doug Ledford GPG KeyID: B826A3330E572FDD Key fingerprint = AE6B 1BDA 122B 23B4 265B 1274 B826 A333 0E57 2FDD signature.asc Description: This is a digitally signed message part
Re: [PATCH net-next 08/13] net: sched: rename tcf_block_get{_ext}() and tcf_block_put{_ext}()
On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov wrote: > > Functions tcf_block_get{_ext}() and tcf_block_put{_ext}() actually > attach/detach block to specific Qdisc besides just taking/putting > reference. Rename them according to their purpose. Where exactly does it attach to? Each qdisc provides a pointer to a pointer of a block, like &cl->block. It is where the result is saved to. It takes a parameter of Qdisc* merely for read-only purpose. So, renaming it to *attach() is even confusing, at least not any better. Please find other names or leave them as they are.
Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()
Am Freitag, 31. August 2018, 08:50:24 schrieb Steffen Klassert: > On Thu, Aug 30, 2018 at 08:53:50PM +0200, Wolfgang Walter wrote: > > Hello, > > > > kernels > 4.12 do not work on one of our main routers. They crash as soon > > as ipsec-tunnels are configured and ipsec-traffic actually flows. > > Can you please send the backtrace of this crash? > I bootet the b838d5e1c5b6e57b10ec8af2268824041e3ea911 several times but I could not record the complete trace. I think I have to log to the serial console but I can't do that before next week. What I could record ist: There is a always ... the callrace. This is the part I could see: irq_exit+0x71/0x80 do_IRQ+0x4d/0xd0 common_interrup+07a/0x7a RIP: 010:cpuidle_enter_state+0x11d/0x200 RSP: 0018:c9000321bee0 EFLAGS: 0282 ORIG_RAX: ffc4 RAX: 88085efde450 RBX: 0004 RCX: 0003c9e63c13 RDX: 0003c9e63c13 RSI: ffb03103fe35ac43 RDI: RBP: e87cf600 R08: 000c R09: 0004 R10: 0400 R11: 0003c99e56fc R12: 0003c9e63c13 R13: 0003c9da9567 R14: 0004 R15: 822763e0 do_idle+0xd3/0x160 cpu_startup_entry+0x14/0x20 secondary_startup_64+0xa5/0xb0 Code: 00 0f b7 83 c0 00 00 00 80 7c 02 08 01 0f 86 d3 02 00 00 41 8b 8c 24 3c 10 00 00 48 8b 6b 58 85 c9 0f 84 2f 01 00 00 48 83 e5 fe 45 60 02 0f 84 4e 01 00 00 f6 43 38 01 74 0d 80 00 bd ab 00 00 RIP: ip_forward+0xd4/0x470 RSP: 88085efc3cb0 CR2: 0060 [ end trace 7205b53c25b7b35a ]--- Kernel panic - not syncing: Fatal exception in interrupt Kernel Offset: disabled Rebooting in 60 seconds.. I got an email from Tobias Hommel and I think it is the same problem. It is very clear that it is the difference from ipv4: call dst_hold_safe() properly to ipv4: mark DST_NOGC and remove the operation of dst_free() which triggers this bug. Regards, -- Wolfgang Walter Studentenwerk München Anstalt des öffentlichen Rechts
[Patch net-next] net_sched: remove redundant qdisc lock classes
We no longer take any spinlock on RX path for ingress qdisc, so this lockdep annotation is no longer needed. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang --- net/sched/sch_api.c | 7 --- 1 file changed, 7 deletions(-) diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c index 98541c6399db..411c40344b77 100644 --- a/net/sched/sch_api.c +++ b/net/sched/sch_api.c @@ -27,7 +27,6 @@ #include #include #include -#include #include #include @@ -1053,10 +1052,6 @@ static int qdisc_block_indexes_set(struct Qdisc *sch, struct nlattr **tca, return 0; } -/* lockdep annotation is needed for ingress; egress gets it only for name */ -static struct lock_class_key qdisc_tx_lock; -static struct lock_class_key qdisc_rx_lock; - /* Allocate and initialize new qdisc. @@ -1121,7 +1116,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (handle == TC_H_INGRESS) { sch->flags |= TCQ_F_INGRESS; handle = TC_H_MAKE(TC_H_INGRESS, 0); - lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock); } else { if (handle == 0) { handle = qdisc_alloc_handle(dev); @@ -1129,7 +1123,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev, if (handle == 0) goto err_out3; } - lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock); if (!netif_is_multiqueue(dev)) sch->flags |= TCQ_F_ONETXQUEUE; } -- 2.14.4
[Patch net-next] htb: use anonymous union for simplicity
cl->leaf.q is slightly more readable than cl->un.leaf.q. Cc: Jamal Hadi Salim Signed-off-by: Cong Wang --- net/sched/sch_htb.c | 98 ++--- 1 file changed, 49 insertions(+), 49 deletions(-) diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c index 43c4bfe625a9..2c45e97b7b87 100644 --- a/net/sched/sch_htb.c +++ b/net/sched/sch_htb.c @@ -132,7 +132,7 @@ struct htb_class { struct htb_class_inner { struct htb_prio clprio[TC_HTB_NUMPRIO]; } inner; - } un; + }; s64 pq_key; int prio_activity; /* for which prios are we active */ @@ -411,13 +411,13 @@ static void htb_activate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.clprio[prio].feed.rb_node) + if (p->inner.clprio[prio].feed.rb_node) /* parent already has its feed in use so that * reset bit in mask as parent is already ok */ mask &= ~(1 << prio); - htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, prio); + htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, prio); } p->prio_activity |= mask; cl = p; @@ -447,19 +447,19 @@ static void htb_deactivate_prios(struct htb_sched *q, struct htb_class *cl) int prio = ffz(~m); m &= ~(1 << prio); - if (p->un.inner.clprio[prio].ptr == cl->node + prio) { + if (p->inner.clprio[prio].ptr == cl->node + prio) { /* we are removing child which is pointed to from * parent feed - forget the pointer but remember * classid */ - p->un.inner.clprio[prio].last_ptr_id = cl->common.classid; - p->un.inner.clprio[prio].ptr = NULL; + p->inner.clprio[prio].last_ptr_id = cl->common.classid; + p->inner.clprio[prio].ptr = NULL; } htb_safe_rb_erase(cl->node + prio, - &p->un.inner.clprio[prio].feed); + &p->inner.clprio[prio].feed); - if (!p->un.inner.clprio[prio].feed.rb_node) + if (!p->inner.clprio[prio].feed.rb_node) mask |= 1 << prio; } @@ -555,7 +555,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class *cl, s64 *diff) */ static inline void htb_activate(struct htb_sched *q, struct htb_class *cl) { - WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen); + WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen); if (!cl->prio_activity) { cl->prio_activity = 1 << cl->prio; @@ -615,7 +615,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc *sch, __qdisc_drop(skb, to_free); return ret; #endif - } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q, + } else if ((ret = qdisc_enqueue(skb, cl->leaf.q, to_free)) != NET_XMIT_SUCCESS) { if (net_xmit_drop_count(ret)) { qdisc_qstats_drop(sch); @@ -823,7 +823,7 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio *hprio, const int prio) cl = rb_entry(*sp->pptr, struct htb_class, node[prio]); if (!cl->level) return cl; - clp = &cl->un.inner.clprio[prio]; + clp = &cl->inner.clprio[prio]; (++sp)->root = clp->feed.rb_node; sp->pptr = &clp->ptr; sp->pid = &clp->last_ptr_id; @@ -857,7 +857,7 @@ static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio, * graft operation on the leaf since last dequeue; * simply deactivate and skip such class */ - if (unlikely(cl->un.leaf.q->q.qlen == 0)) { + if (unlikely(cl->leaf.q->q.qlen == 0)) { struct htb_class *next; htb_deactivate(q, cl); @@ -873,12 +873,12 @@ static struct sk_buff *htb_dequeue_tree(struct htb_sched *q, const int prio, goto next; } - skb = cl->un.leaf.q->dequeue(cl->un.leaf.q); + skb = cl->leaf.q->dequeue(cl->leaf.q);
Re: [RFC PATCH bpf-next v2 0/4] Implement bpf queue/stack maps
On 09/06/2018 07:13 PM, Alexei Starovoitov wrote: On Fri, Aug 31, 2018 at 11:25:48PM +0200, Mauricio Vasquez B wrote: In some applications this is needed have a pool of free elements, like for example the list of free L4 ports in a SNAT. None of the current maps allow to do it as it is not possibleto get an any element without having they key it is associated to. This patchset implements two new kind of eBPF maps: queue and stack. Those maps provide to eBPF programs the peek, push and pop operations, and for userspace applications a new bpf_map_lookup_and_delete_elem() is added. Signed-off-by: Mauricio Vasquez B --- I am sending this as an RFC because there is still an issue I am not sure how to solve. The queue/stack maps have a linked list for saving the nodes, and a preallocation schema based on the pcpu_freelist already implemented and used in the htabmap. Each time an element is pushed into the map, a node from the pcpu_freelist is taken and then added to the linked list. The pop operation takes and *removes* the first node from the linked list, then it uses *call_rcu* to postpose freeing the node, i.e, the node is only returned to the pcpu_freelist when the rcu callback is executed. This is needed because an element returned by the pop() operation should remain valid for the whole duration of the eBPF program. The problem is that elements are not immediately returned to the free list, so in some cases the push operation could fail because there are not free nodes in the pcpu_freelist. The following code snippet exposes that problem. ... /* Push MAP_SIZE elements */ for (i = 0; i < MAP_SIZE; i++) assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0); /* Pop all elements */ for (i = 0; i < MAP_SIZE; i++) assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == 0 && val == vals[i]); // sleep(1) <-- If I put this sleep, everything works. /* Push MAP_SIZE elements */ for (i = 0; i < MAP_SIZE; i++) assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0); ^^^ This fails because there are not available elements in pcpu_freelist ... I think a possible solution is to oversize the pcpu_freelist (no idea by how much, maybe double or, or make it 1.5 time the max elements in the map?) I also have concerns about it, it would waste that memory in many cases and this is also probably that it doesn't solve the issue because that code snippet is puhsing and popping elements too fast, so even if the pcpu_freelist is much large a certain time instant all the elements could be used. Is this really an important issue? Any idea of how to solve it? It is important issue indeed and a difficult one to solve. We have the same issue with hash map. If the prog is doing: value = lookup(key); delete(key); // here the prog shouldn't be accessing the value anymore, since the memory // could have been reused, but value pointer is still valid and points to // allocated memory Just to notice that for the queue map it is a little bit worse because there isn't a way to mark an element to be reused, hence in some cases the pool of free elements could be exhausted. bpf_map_pop_elem() is trying to do lookup_and_delete and preserve validity of value without races. With pcpu_freelist I don't think there is a solution. We can have this queue/stack map without prealloc and use kmalloc/kfree back and forth. Performance will not be as great, but for your use case, I suspect, it will be good enough. I agree, for our use case we are not that worried about the performance, it is still in the dataplane but let's say it is not in the "hot" path. The key issue with kmalloc/kfree is unbounded time of rcu callbacks. If somebody starts doing push/pop for every packet, the rcu subsystem will struggle and nothing we can do about it. The only way I could think of to resolve this problem is to reuse the logic that Joe is working on for socket lookups inside the program. Joe, how is that going? Could you repost the latest patches? In such case the api for stack map will look like: elem = bpf_map_pop_elem(stack); // access elem bpf_map_free_elem(elem); // here prog is not allowed to access elem and verifier will catch that elem = bpf_map_alloc_elem(stack); // populate elem bpf_map_push_elem(elem); // here prog is not allowed to access elem and verifier will catch that Then both pre-allocated elems and kmalloc/kfree will work fine and no unbounded rcu issues in both cases. I read the Joe's proposal and using that for this problem looks like a nice solution. I think a good trade-off for now would be to go ahead with a queue/stack map without preallocating support (or maybe include it having always in mind that this issue has to be solved in the near future) and then, as a separated work, try to use Joe's proposal in the map helpers. What do you think? Thanks, Mauricio.
Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()
Hello Steffen, in one of your emails to Thomas you wrote: > xfrm_lookup+0x2a is at the very beginning of xfrm_lookup(), here we > find: > > u16 family = dst_orig->ops->family; > > ops has an offset of 32 bytes (20 hex) in dst_orig, so looks like > dst_orig is NULL. > > In the forwarding case, we get dst_orig from the skb and dst_orig > can't be NULL here unless the skb itself is already fishy. Is this really true? If xfrm_lookup is called from __xfrm_route_forward(): int __xfrm_route_forward(struct sk_buff *skb, unsigned short family) { struct net *net = dev_net(skb->dev); struct flowi fl; struct dst_entry *dst; int res = 1; if (xfrm_decode_session(skb, &fl, family) < 0) { XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR); return 0; } skb_dst_force(skb); dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE); if (IS_ERR(dst)) { res = 0; dst = NULL; } skb_dst_set(skb, dst); return res; } couldn't it be possible that skb_dst_force(skb) actually sets dst to NULL if it cannot safely lock it? If it is absolutely sure that skb_dst_force() never can set dst to NULL I wonder why it is called at all? Here is skb_dst_force() static inline void skb_dst_force(struct sk_buff *skb) { if (skb_dst_is_noref(skb)) { struct dst_entry *dst = skb_dst(skb); WARN_ON(!rcu_read_lock_held()); if (!dst_hold_safe(dst)) dst = NULL; skb->_skb_refdst = (unsigned long)dst; } } and dst_hold_safe() is static inline bool dst_hold_safe(struct dst_entry *dst) { return atomic_inc_not_zero(&dst->__refcnt); } Am Freitag, 7. September 2018, 22:22:39 schrieb Wolfgang Walter: > Am Freitag, 31. August 2018, 08:50:24 schrieb Steffen Klassert: > > On Thu, Aug 30, 2018 at 08:53:50PM +0200, Wolfgang Walter wrote: > > > Hello, > > > > > > kernels > 4.12 do not work on one of our main routers. They crash as > > > soon > > > as ipsec-tunnels are configured and ipsec-traffic actually flows. > > > > Can you please send the backtrace of this crash? > > I bootet the b838d5e1c5b6e57b10ec8af2268824041e3ea911 several times but I > could not record the complete trace. I think I have to log to the serial > console but I can't do that before next week. > > > What I could record ist: > > There is a always >... > the callrace. > > This is the part I could see: > > > irq_exit+0x71/0x80 > do_IRQ+0x4d/0xd0 > common_interrup+07a/0x7a > > RIP: 010:cpuidle_enter_state+0x11d/0x200 > RSP: 0018:c9000321bee0 EFLAGS: 0282 ORIG_RAX: ffc4 > RAX: 88085efde450 RBX: 0004 RCX: 0003c9e63c13 > RDX: 0003c9e63c13 RSI: ffb03103fe35ac43 RDI: > RBP: e87cf600 R08: 000c R09: 0004 > R10: 0400 R11: 0003c99e56fc R12: 0003c9e63c13 > R13: 0003c9da9567 R14: 0004 R15: 822763e0 > do_idle+0xd3/0x160 > cpu_startup_entry+0x14/0x20 > secondary_startup_64+0xa5/0xb0 > Code: 00 0f b7 83 c0 00 00 00 80 7c 02 08 01 0f 86 d3 02 00 00 41 > 8b 8c 24 3c 10 00 00 48 8b 6b 58 85 c9 0f 84 2f 01 00 00 48 83 e5 fe 45 > 60 > 02 0f 84 4e 01 00 00 f6 43 38 01 74 0d 80 00 bd ab 00 00 > RIP: ip_forward+0xd4/0x470 RSP: 88085efc3cb0 > CR2: 0060 > [ end trace 7205b53c25b7b35a ]--- > Kernel panic - not syncing: Fatal exception in interrupt > Kernel Offset: disabled > Rebooting in 60 seconds.. > > > I got an email from Tobias Hommel and I think it is the same problem. > > It is very clear that it is the difference from > > ipv4: call dst_hold_safe() properly > > to > > ipv4: mark DST_NOGC and remove the operation of dst_free() > > which triggers this bug. > > Regards, Regards -- Wolfgang Walter Studentenwerk München Anstalt des öffentlichen Rechts
[net-next] i40e(vf): remove i40e_ethtool_stats.h header file
Essentially reverts commit 8fd75c58a09a ("i40e: move ethtool stats boiler plate code to i40e_ethtool_stats.h", 2018-08-30), and additionally moves the similar code in i40evf into i40evf_ethtool.c. The code was intially moved from i40e_ethtool.c into i40e_ethtool_stats.h as a way of better logically organizing the code. This has two problems. First, we can't have an inline function with variadic arguments on all platforms. Second, it gave the appearance that we had plans to share code between the i40e and i40evf drivers, due to having a near copy of the contents in the i40evf/i40e_ethtool_stats.h file. Patches which actually attempt to combine or share code between the i40e and i40evf drivers have not materialized, and are likely a ways off. Rather than fixing the one function which causes build issues, just move this code back into the i40e_ethtool.c and i40evf_ethtool.c files. Note that we also change these functions back from static inlines to just statics, since they're no longer in a header file. We can revisit this if/when work is done to actually attempt to share code between drivers. Alternatively, this stats code could be made more generic so that it can be shared across drivers as part of ethtool kernel work. Signed-off-by: Jacob Keller --- Sorry aboutthe delay, our iwl queue maintainer Jeff is on vacation, and discussion about how best to resolve this issue was/is ongoing in the IWL list. I opted to just move the contents of these files back into i40e_ethtool.c and i40evf_ethtool.c respectively, as I think it's better than only moving the single offending function back into the .c file. This is different from the approach suggested by Wang, Dongsheng on Intel Wired LAN, but I think it's a better approach for now. I've also kindly asked if Aaron Brown could test this out. .../net/ethernet/intel/i40e/i40e_ethtool.c| 219 - .../ethernet/intel/i40e/i40e_ethtool_stats.h | 221 -- .../intel/i40evf/i40e_ethtool_stats.h | 221 -- .../ethernet/intel/i40evf/i40evf_ethtool.c| 219 - 4 files changed, 436 insertions(+), 444 deletions(-) delete mode 100644 drivers/net/ethernet/intel/i40e/i40e_ethtool_stats.h delete mode 100644 drivers/net/ethernet/intel/i40evf/i40e_ethtool_stats.h diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c index d7d3974beca2..87fe2e60602f 100644 --- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c +++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c @@ -6,7 +6,224 @@ #include "i40e.h" #include "i40e_diag.h" -#include "i40e_ethtool_stats.h" +/* ethtool statistics helpers */ + +/** + * struct i40e_stats - definition for an ethtool statistic + * @stat_string: statistic name to display in ethtool -S output + * @sizeof_stat: the sizeof() the stat, must be no greater than sizeof(u64) + * @stat_offset: offsetof() the stat from a base pointer + * + * This structure defines a statistic to be added to the ethtool stats buffer. + * It defines a statistic as offset from a common base pointer. Stats should + * be defined in constant arrays using the I40E_STAT macro, with every element + * of the array using the same _type for calculating the sizeof_stat and + * stat_offset. + * + * The @sizeof_stat is expected to be sizeof(u8), sizeof(u16), sizeof(u32) or + * sizeof(u64). Other sizes are not expected and will produce a WARN_ONCE from + * the i40e_add_ethtool_stat() helper function. + * + * The @stat_string is interpreted as a format string, allowing formatted + * values to be inserted while looping over multiple structures for a given + * statistics array. Thus, every statistic string in an array should have the + * same type and number of format specifiers, to be formatted by variadic + * arguments to the i40e_add_stat_string() helper function. + **/ +struct i40e_stats { + char stat_string[ETH_GSTRING_LEN]; + int sizeof_stat; + int stat_offset; +}; + +/* Helper macro to define an i40e_stat structure with proper size and type. + * Use this when defining constant statistics arrays. Note that @_type expects + * only a type name and is used multiple times. + */ +#define I40E_STAT(_type, _name, _stat) { \ + .stat_string = _name, \ + .sizeof_stat = FIELD_SIZEOF(_type, _stat), \ + .stat_offset = offsetof(_type, _stat) \ +} + +/* Helper macro for defining some statistics directly copied from the netdev + * stats structure. + */ +#define I40E_NETDEV_STAT(_net_stat) \ + I40E_STAT(struct rtnl_link_stats64, #_net_stat, _net_stat) + +/* Helper macro for defining some statistics related to queues */ +#define I40E_QUEUE_STAT(_name, _stat) \ + I40E_STAT(struct i40e_ring, _name, _stat) + +/* Stats associated with a Tx or Rx ring */ +static const struct i40e_stats i40e_gstrings_queue_stats[] = { + I40E_QUEUE_STAT("%s-%u.packets", stats.packets), + I40E_QUEUE_STAT("%s-%u.bytes", stats.bytes), +}; + +/*
Re: [net-next] i40e(vf): remove i40e_ethtool_stats.h header file
From: Jacob Keller Date: Fri, 7 Sep 2018 14:55:44 -0700 > Essentially reverts commit 8fd75c58a09a ("i40e: move ethtool > stats boiler plate code to i40e_ethtool_stats.h", 2018-08-30), and > additionally moves the similar code in i40evf into i40evf_ethtool.c. > > The code was intially moved from i40e_ethtool.c into i40e_ethtool_stats.h > as a way of better logically organizing the code. This has two problems. > First, we can't have an inline function with variadic arguments on all > platforms. Second, it gave the appearance that we had plans to share > code between the i40e and i40evf drivers, due to having a near copy of > the contents in the i40evf/i40e_ethtool_stats.h file. > > Patches which actually attempt to combine or share code between the i40e > and i40evf drivers have not materialized, and are likely a ways off. > > Rather than fixing the one function which causes build issues, just move > this code back into the i40e_ethtool.c and i40evf_ethtool.c files. Note > that we also change these functions back from static inlines to just > statics, since they're no longer in a header file. > > We can revisit this if/when work is done to actually attempt to share > code between drivers. Alternatively, this stats code could be made more > generic so that it can be shared across drivers as part of ethtool > kernel work. > > Signed-off-by: Jacob Keller Applied and after the build checks out will be pushed to net-next. Thanks.
[PATCH net] netfilter: bridge: Don't sabotage nf_hook calls from an l3mdev
From: David Ahern For starters, the bridge netfilter code registers operations that are invoked any time nh_hook is called. Specifically, ip_sabotage_in watches for nested calls for NF_INET_PRE_ROUTING when a bridge is in the stack. Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish, then resets in_prerouting flag to 0 and the packet continues up the stack. The packet eventually makes it to the VRF driver and it invokes nf_hook for NF_INET_PRE_ROUTING in case any rules have been added against the vrf device. Because of the registered operations the call to nf_hook causes ip_sabotage_in to be invoked. That function sees the nf_bridge on the skb and that in_prerouting is not set. Thinking it is an invalid nested call it steals (drops) the packet. Update ip_sabotage_in to recognize that the bridge or one of its upper devices (e.g., vlan) can be enslaved to a VRF (L3 master device) and allow the packet to go through the nf_hook a second time. Fixes: 73e20b761acf ("net: vrf: Add support for PREROUTING rules on vrf device") Reported-by: D'Souza, Nelson Signed-off-by: David Ahern --- net/bridge/br_netfilter_hooks.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6e0dc6bcd32a..37278dc280eb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; } -- 2.11.0
Re: GPL compliance issue with liquidio/lio_23xx_vsw.bin firmware
On Wed, Aug 29, 2018 at 09:26:35PM +0100, Ben Hutchings wrote: > Date: Wed, 29 Aug 2018 21:26:35 +0100 > From: Ben Hutchings > To: Felix Manlunas , Florian Weimer > > CC: linux-firmw...@kernel.org, "netdev@vger.kernel.org" > , Derek Chickles > , Satanand Burla > , Felix Manlunas > , Raghu Vatsavayi > , Manish Awasthi > , manojkumar.panic...@cavium.com > Subject: Re: GPL compliance issue with liquidio/lio_23xx_vsw.bin firmware > User-Agent: Evolution 3.29.92-1 > > On Mon, 2018-08-27 at 17:04 -0700, Felix Manlunas wrote: > > On Mon, Aug 27, 2018 at 05:01:10PM +0200, Florian Weimer wrote: > > > liquidio/lio_23xx_vsw.bin contains a compiled MIPS Linux kernel: > > > > > > $ tail --bytes=+1313 liquidio/lio_23xx_vsw.bin > elf > > > $ readelf -aW elf > > > […] > > > [ 6] __ksymtab PROGBITS80e495f8 64a5f8 00d130 > > > 00 A 0 0 8 > > > [ 7] __ksymtab_gpl PROGBITS80e56728 657728 008400 > > > 00 A 0 0 8 > > > [ 8] __ksymtab_strings PROGBITS80e5eb28 65fb28 018868 > > > 00 A 0 0 1 > > > […] > > > Symbol table '.symtab' contains 1349 entries: > > >Num:Value Size TypeBind Vis Ndx Name > > > 0: 0 NOTYPE LOCAL DEFAULT UND > > > 1: 0 FILELOCAL DEFAULT ABS > > > arch/mips/kernel/head.o > > > 2: 0 FILELOCAL DEFAULT ABS init/main.c > > > 3: 0 FILELOCAL DEFAULT ABS > > > include/linux/types.h > > > […] > > > > > > Yet there is no corresponding source provided, and LICENCE.cavium lacks > > > the required notices. > > > > > > Thanks, > > > Florian > > > > Cavium apologizes for the oversight. Cavium has been advertising the > > appropriate license terms including the existence of GPL in the firmware > > in our outbox releases. We will update the license terms in LICENCE.cavium > > in our upstream contribution in collaboration with our legal team. > > Everything added to linux-firmware.git needs to be safe for Linux > distributions to redistribute. (There are some ancient firmware images > with unclear licensing, but we don't want to add any more.) > > My understanding is that GPL v2 requires that commercial distributors > either provide the complete and corresponding source alongside the > binaries, or include a written offer to provide it later. They > *cannot* simply point to your offer to do this (only non-commercial > distributors can do that). So the source needs to be published too. > > Adding the complete Linux kernel source code to linux-firmware.git > doesn't seem like a sensible step, so maybe this particular firmware > needs to live elsewhere. > > Ben. Apologies for the delay. We're committed to provide sources for the GPL components of LiquidIO firmware. After Cavium's recent merger with Marvell, we are revisiting our Licensing and expect to share the updated license soon. Since keeping Linux kernel source code in linux-firmware.git doesn't make sense to us too, we're also working internally on identifying the appropriate mechanism to share those sources when requested. That said, it is our opinion that for all practical purposes (for example, distribution of driver + firmware) the firmware binary for LiquidIO adapters should continue to reside in the linux-firmware.git as long as all the prerequisites for sharing the firmware are met. In the case of liquidio_23xx_vsw.bin, this would mean an additional license type in the LICENCE.cavium file for this firmware including a reference to the accompanying sources for the Linux kernel bundled in the firmware. As we understand, there is no precedent in the linux-firmware.git for device firmware that include a Linux kernel so we also request comments from community on the best ways to support the LiquidIO firmware and similar firmwares that may appear in future. Cavium LiquidIO Team
Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge
Thanks David and Ido, for finding the root-cause for bridge Rx packets getting dropped, also for coming up with a patch. Regards, Nelson On 9/7/18, 9:09 AM, "David Ahern" wrote: On 9/7/18 9:56 AM, D'Souza, Nelson wrote: > > *From:* David Ahern > *Sent:* Thursday, September 6, 2018 5:27 PM > *To:* D'Souza, Nelson; netdev@vger.kernel.org > *Subject:* Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge > > On 9/5/18 12:00 PM, D'Souza, Nelson wrote: >> Just following up would you be able to confirm that this is a > Linux VRF issue? > > I can confirm that I can reproduce the problem. Need to find time to dig > into it. bridge's netfilter hook is dropping the packet. bridge's netfilter code registers hook operations that are invoked when nh_hook is called. It then sees all subsequent calls to nf_hook. Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish, then resets in_prerouting flag to 0. Any subsequent calls to nf_hook invoke ip_sabotage_in. That function sees in_prerouting is not set and steals (drops) the packet. The simplest change is to have ip_sabotage_in recognize that the bridge can be enslaved to a VRF (L3 master device) and allow the packet to continue. Thanks to Ido for the hint on ip_sabotage_in. This patch works for me: diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c index 6e0dc6bcd32a..37278dc280eb 100644 --- a/net/bridge/br_netfilter_hooks.c +++ b/net/bridge/br_netfilter_hooks.c @@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv, struct sk_buff *skb, const struct nf_hook_state *state) { - if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) { + if (skb->nf_bridge && !skb->nf_bridge->in_prerouting && + !netif_is_l3_master(skb->dev)) { state->okfn(state->net, state->sk, skb); return NF_STOLEN; }
[bpf-next, v2 1/3] flow_dissector: implements flow dissector BPF hook
From: Petar Penkov Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector path. The BPF program is per-network namespace. Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn --- include/linux/bpf.h| 1 + include/linux/bpf_types.h | 1 + include/linux/skbuff.h | 7 ++ include/net/net_namespace.h| 3 + include/net/sch_generic.h | 12 ++- include/uapi/linux/bpf.h | 25 ++ kernel/bpf/syscall.c | 8 ++ kernel/bpf/verifier.c | 32 net/core/filter.c | 67 net/core/flow_dissector.c | 136 + tools/bpf/bpftool/prog.c | 1 + tools/include/uapi/linux/bpf.h | 25 ++ tools/lib/bpf/libbpf.c | 2 + 13 files changed, 317 insertions(+), 3 deletions(-) diff --git a/include/linux/bpf.h b/include/linux/bpf.h index 523481a3471b..988a00797bcd 100644 --- a/include/linux/bpf.h +++ b/include/linux/bpf.h @@ -212,6 +212,7 @@ enum bpf_reg_type { PTR_TO_PACKET_META, /* skb->data - meta_len */ PTR_TO_PACKET, /* reg points to skb->data */ PTR_TO_PACKET_END, /* skb->data + headlen */ + PTR_TO_FLOW_KEYS,/* reg points to bpf_flow_keys */ }; /* The information passed from prog-specific *_is_valid_access diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h index cd26c090e7c0..22083712dd18 100644 --- a/include/linux/bpf_types.h +++ b/include/linux/bpf_types.h @@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2) #ifdef CONFIG_INET BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport) #endif +BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector) BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops) BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 17a13e4785fc..ce0e863f02a2 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -243,6 +243,8 @@ struct scatterlist; struct pipe_inode_info; struct iov_iter; struct napi_struct; +struct bpf_prog; +union bpf_attr; #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE) struct nf_conntrack { @@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector *flow_dissector, const struct flow_dissector_key *key, unsigned int key_count); +int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr, + struct bpf_prog *prog); + +int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr); + bool __skb_flow_dissect(const struct sk_buff *skb, struct flow_dissector *flow_dissector, void *target_container, diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h index 9b5fdc50519a..99d4148e0f90 100644 --- a/include/net/net_namespace.h +++ b/include/net/net_namespace.h @@ -43,6 +43,7 @@ struct ctl_table_header; struct net_generic; struct uevent_sock; struct netns_ipvs; +struct bpf_prog; #define NETDEV_HASHBITS8 @@ -145,6 +146,8 @@ struct net { #endif struct net_generic __rcu*gen; + struct bpf_prog __rcu *flow_dissector_prog; + /* Note : following structs are cache line aligned */ #ifdef CONFIG_XFRM struct netns_xfrm xfrm; diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h index a6d00093f35e..1b81ba85fd2d 100644 --- a/include/net/sch_generic.h +++ b/include/net/sch_generic.h @@ -19,6 +19,7 @@ struct Qdisc_ops; struct qdisc_walker; struct tcf_walker; struct module; +struct bpf_flow_keys; typedef int tc_setup_cb_t(enum tc_setup_type type, void *type_data, void *cb_priv); @@ -307,9 +308,14 @@ struct tcf_proto { }; struct qdisc_skb_cb { - unsigned intpkt_len; - u16 slave_dev_queue_mapping; - u16 tc_classid; + union { + struct { + unsigned intpkt_len; + u16 slave_dev_queue_mapping; + u16 tc_classid; + }; + struct bpf_flow_keys *flow_keys; + }; #define QDISC_CB_PRIV_LEN 20 unsigned char data[QDISC_CB_PRIV_LEN]; }; diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h index 66917a4eba27..3064706fcaaa 100644 --- a/include/uapi/linux/bpf.h +++ b/include/uapi/linux/bpf.h @@ -152,6 +152,7 @@ enum bpf_prog_type { BPF_PROG_TYPE_LWT_SEG6LOCAL, BPF_PROG_TYPE_LIRC_MODE2, BPF_PROG_TYPE_SK_REUSEPORT, + BPF_PROG_TYPE_FLOW_DISSECTOR, }; enum bpf_attach_type { @@ -172,6 +173,7 @@ enum bpf_attach_type { BPF_CGROUP_UDP4_SENDMSG, BPF_CGROUP_UDP6_SEN
[bpf-next, v2 0/3] Introduce eBPF flow dissector
From: Petar Penkov This patch series hardens the RX stack by allowing flow dissection in BPF, as previously discussed [1]. Because of the rigorous checks of the BPF verifier, this provides significant security guarantees. In particular, the BPF flow dissector cannot get inside of an infinite loop, as with CVE-2013-4348, because BPF programs are guaranteed to terminate. It cannot read outside of packet bounds, because all memory accesses are checked. Also, with BPF the administrator can decide which protocols to support, reducing potential attack surface. Rarely encountered protocols can be excluded from dissection and the program can be updated without kernel recompile or reboot if a bug is discovered. Patch 1 adds infrastructure to execute a BPF program in __skb_flow_dissect. This includes a new BPF program and attach type. Patch 2 adds a flow dissector program in BPF. This parses most protocols in __skb_flow_dissect in BPF for a subset of flow keys (basic, control, ports, and address types). Patch 3 adds a selftest that attaches the BPF program to the flow dissector and sends traffic with different levels of encapsulation. Performance Evaluation: The in-kernel implementation was compared against the demo program from patch 2 using the test in patch 3 with IPv4/UDP traffic over 10 seconds. $perf record -a -C 4 taskset -c 4 ./test_flow_dissector -i 4 -f 8 \ -t 10 In-kernel Dissector: __skb_flow_dissect overhead: 2.12% Total Packets: 3,272,597 (from output of ./test_flow_dissector) BPF Dissector: __skb_flow_dissect overhead: 1.63% Total Packets: 3,232,356 (from output of ./test_flow_dissector) No-op BPF Dissector: __skb_flow_dissect overhead: 1.52% Total Packets: 3,330,635 (from output of ./test_flow_dissector) Changes since v1: 1/ (Patch 1) LD_ABS instructions now disallowed for the new BPF prog type 2/ (Patch 1) now checks if skb is NULL in __skb_flow_dissect() 3/ (Patch 1) fixed incorrect accesses in flow_dissector_is_valid_access() - writes to the flow_keys field now disallowed - reads/writes to tc_classid and data_meta now disallowed 4/ (Patch 2) headers pulled with bpf_skb_load_data if direct access fails Changes since RFC: 1/ (Patch 1) Flow dissector hook changed from global to per-netns 2/ (Patch 1) Defined struct bpf_flow_keys to be used in BPF flow dissector programs instead of exposing the internal flow keys layout. Added a function to translate from bpf_flow_keys to the internal layout after BPF dissection is complete. The pointer to this struct is stored in qdisc_skb_cb rather than inside of the 20 byte control block which simplifies verification and allows access to all 20 bytes of the cb. 3/ (Patch 2) Removed GUE parsing as it relied on a hardcoded port 4/ (Patch 2) MPLS parsing now stops at the first label which is consistent with the in-kernel flow dissector 5/ (Patch 2) Refactored to use direct packet access and to write out to struct bpf_flow_keys [1] http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf Petar Penkov (3): flow_dissector: implements flow dissector BPF hook flow_dissector: implements eBPF parser selftests/bpf: test bpf flow dissection include/linux/bpf.h | 1 + include/linux/bpf_types.h | 1 + include/linux/skbuff.h| 7 + include/net/net_namespace.h | 3 + include/net/sch_generic.h | 12 +- include/uapi/linux/bpf.h | 25 + kernel/bpf/syscall.c | 8 + kernel/bpf/verifier.c | 32 + net/core/filter.c | 67 ++ net/core/flow_dissector.c | 136 +++ tools/bpf/bpftool/prog.c | 1 + tools/include/uapi/linux/bpf.h| 25 + tools/lib/bpf/libbpf.c| 2 + tools/testing/selftests/bpf/.gitignore| 2 + tools/testing/selftests/bpf/Makefile | 8 +- tools/testing/selftests/bpf/bpf_flow.c| 386 + tools/testing/selftests/bpf/config| 1 + .../selftests/bpf/flow_dissector_load.c | 140 .../selftests/bpf/test_flow_dissector.c | 782 ++ .../selftests/bpf/test_flow_dissector.sh | 115 +++ tools/testing/selftests/bpf/with_addr.sh | 54 ++ tools/testing/selftests/bpf/with_tunnels.sh | 36 + 22 files changed, 1838 insertions(+), 6 deletions(-) create mode 100644 tools/testing/selftests/bpf/bpf_flow.c create mode 100644 tools/testing/selftests/bpf/flow_dissector_load.c create mode 100644 tools/testing/selftests/bpf/test_flow_dissector.c create mode 100755 tools/testing/selftests/bpf/test_flow_dissector.sh create mode 100755 tools/testing/selftests/bpf/with_addr.sh create mode 100755 tools/testing/selftests/bpf/with_tunnels.sh -- 2.19.0.rc2.392.g5ba43deb5a-goog
[bpf-next, v2 2/3] flow_dissector: implements eBPF parser
From: Petar Penkov This eBPF program extracts basic/control/ip address/ports keys from incoming packets. It supports recursive parsing for IP encapsulation, and VLAN, along with IPv4/IPv6 and extension headers. This program is meant to show how flow dissection and key extraction can be done in eBPF. Link: http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn --- tools/testing/selftests/bpf/Makefile | 2 +- tools/testing/selftests/bpf/bpf_flow.c | 386 + 2 files changed, 387 insertions(+), 1 deletion(-) create mode 100644 tools/testing/selftests/bpf/bpf_flow.c diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index fff7fb1285fc..e65f50f9185e 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o test_tcp_estats.o test test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \ test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o test_lirc_mode2_kern.o \ get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \ - test_skb_cgroup_id_kern.o + test_skb_cgroup_id_kern.o bpf_flow.o # Order correspond to 'make run_tests' order TEST_PROGS := test_kmod.sh \ diff --git a/tools/testing/selftests/bpf/bpf_flow.c b/tools/testing/selftests/bpf/bpf_flow.c new file mode 100644 index ..f1e33a574841 --- /dev/null +++ b/tools/testing/selftests/bpf/bpf_flow.c @@ -0,0 +1,386 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "bpf_helpers.h" +#include "bpf_endian.h" + +int _version SEC("version") = 1; +#define PROG(F) SEC(#F) int bpf_func_##F + +/* These are the identifiers of the BPF programs that will be used in tail + * calls. Name is limited to 16 characters, with the terminating character and + * bpf_func_ above, we have only 6 to work with, anything after will be cropped. + */ +enum { + IP, + IPV6, + IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */ + IPV6FR, /* Fragmentation IPv6 Extension Header */ + MPLS, + VLAN, +}; + +#define IP_MF 0x2000 +#define IP_OFFSET 0x1FFF +#define IP6_MF 0x0001 +#define IP6_OFFSET 0xFFF8 + +struct vlan_hdr { + __be16 h_vlan_TCI; + __be16 h_vlan_encapsulated_proto; +}; + +struct gre_hdr { + __be16 flags; + __be16 proto; +}; + +struct frag_hdr { + __u8 nexthdr; + __u8 reserved; + __be16 frag_off; + __be32 identification; +}; + +struct bpf_map_def SEC("maps") jmp_table = { + .type = BPF_MAP_TYPE_PROG_ARRAY, + .key_size = sizeof(__u32), + .value_size = sizeof(__u32), + .max_entries = 8 +}; + +struct bpf_dissect_cb { + __u16 nhoff; +}; + +static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb, +__u16 hdr_size, +void *buffer) +{ + struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb); + void *data_end = (__u8 *)(long)skb->data_end; + void *data = (__u8 *)(long)skb->data; + __u8 *hdr; + + /* Verifies this variable offset does not overflow */ + if (cb->nhoff > (USHRT_MAX - hdr_size)) + return NULL; + + hdr = data + cb->nhoff; + if (hdr + hdr_size <= data_end) + return hdr; + + if (bpf_skb_load_bytes(skb, cb->nhoff, buffer, hdr_size)) + return NULL; + + return buffer; +} + +/* Dispatches on ETHERTYPE */ +static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto) +{ + struct bpf_flow_keys *keys = (struct bpf_flow_keys *)(long)(skb->flow_keys); + + keys->n_proto = proto; + switch (proto) { + case bpf_htons(ETH_P_IP): + bpf_tail_call(skb, &jmp_table, IP); + break; + case bpf_htons(ETH_P_IPV6): + bpf_tail_call(skb, &jmp_table, IPV6); + break; + case bpf_htons(ETH_P_MPLS_MC): + case bpf_htons(ETH_P_MPLS_UC): + bpf_tail_call(skb, &jmp_table, MPLS); + break; + case bpf_htons(ETH_P_8021Q): + case bpf_htons(ETH_P_8021AD): + bpf_tail_call(skb, &jmp_table, VLAN); + break; + default: + /* Protocol not supported */ + return BPF_DROP; + } + + return BPF_DROP; +} + +SEC("dissect") +int dissect(struct __sk_buff *skb) +{ + if (!skb->vlan_present) + return parse_eth_proto(skb, skb->protocol); + else + return parse_eth_proto(skb, skb->v
[bpf-next, v2 3/3] selftests/bpf: test bpf flow dissection
From: Petar Penkov Adds a test that sends different types of packets over multiple tunnels and verifies that valid packets are dissected correctly. To do so, a tc-flower rule is added to drop packets on UDP src port 9, and packets are sent from ports 8, 9, and 10. Only the packets on port 9 should be dropped. Because tc-flower relies on the flow dissector to match flows, correct classification demonstrates correct dissection. Also add support logic to load the BPF program and to inject the test packets. Signed-off-by: Petar Penkov Signed-off-by: Willem de Bruijn --- tools/testing/selftests/bpf/.gitignore| 2 + tools/testing/selftests/bpf/Makefile | 6 +- tools/testing/selftests/bpf/config| 1 + .../selftests/bpf/flow_dissector_load.c | 140 .../selftests/bpf/test_flow_dissector.c | 782 ++ .../selftests/bpf/test_flow_dissector.sh | 115 +++ tools/testing/selftests/bpf/with_addr.sh | 54 ++ tools/testing/selftests/bpf/with_tunnels.sh | 36 + 8 files changed, 1134 insertions(+), 2 deletions(-) create mode 100644 tools/testing/selftests/bpf/flow_dissector_load.c create mode 100644 tools/testing/selftests/bpf/test_flow_dissector.c create mode 100755 tools/testing/selftests/bpf/test_flow_dissector.sh create mode 100755 tools/testing/selftests/bpf/with_addr.sh create mode 100755 tools/testing/selftests/bpf/with_tunnels.sh diff --git a/tools/testing/selftests/bpf/.gitignore b/tools/testing/selftests/bpf/.gitignore index 4d789c1e5167..8a60c9b9892d 100644 --- a/tools/testing/selftests/bpf/.gitignore +++ b/tools/testing/selftests/bpf/.gitignore @@ -23,3 +23,5 @@ test_skb_cgroup_id_user test_socket_cookie test_cgroup_storage test_select_reuseport +test_flow_dissector +flow_dissector_load diff --git a/tools/testing/selftests/bpf/Makefile b/tools/testing/selftests/bpf/Makefile index e65f50f9185e..fd3851d5c079 100644 --- a/tools/testing/selftests/bpf/Makefile +++ b/tools/testing/selftests/bpf/Makefile @@ -47,10 +47,12 @@ TEST_PROGS := test_kmod.sh \ test_tunnel.sh \ test_lwt_seg6local.sh \ test_lirc_mode2.sh \ - test_skb_cgroup_id.sh + test_skb_cgroup_id.sh \ + test_flow_dissector.sh # Compile but not part of 'make run_tests' -TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user +TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr test_skb_cgroup_id_user \ + flow_dissector_load test_flow_dissector include ../lib.mk diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config index b4994a94968b..3655508f95fd 100644 --- a/tools/testing/selftests/bpf/config +++ b/tools/testing/selftests/bpf/config @@ -18,3 +18,4 @@ CONFIG_CRYPTO_HMAC=m CONFIG_CRYPTO_SHA256=m CONFIG_VXLAN=y CONFIG_GENEVE=y +CONFIG_NET_CLS_FLOWER=m diff --git a/tools/testing/selftests/bpf/flow_dissector_load.c b/tools/testing/selftests/bpf/flow_dissector_load.c new file mode 100644 index ..d3273b5b3173 --- /dev/null +++ b/tools/testing/selftests/bpf/flow_dissector_load.c @@ -0,0 +1,140 @@ +// SPDX-License-Identifier: GPL-2.0 +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector"; +const char *cfg_map_name = "jmp_table"; +bool cfg_attach = true; +char *cfg_section_name; +char *cfg_path_name; + +static void load_and_attach_program(void) +{ + struct bpf_program *prog, *main_prog; + struct bpf_map *prog_array; + int i, fd, prog_fd, ret; + struct bpf_object *obj; + int prog_array_fd; + + ret = bpf_prog_load(cfg_path_name, BPF_PROG_TYPE_FLOW_DISSECTOR, &obj, + &prog_fd); + if (ret) + error(1, 0, "bpf_prog_load %s", cfg_path_name); + + main_prog = bpf_object__find_program_by_title(obj, cfg_section_name); + if (!main_prog) + error(1, 0, "bpf_object__find_program_by_title %s", + cfg_section_name); + + prog_fd = bpf_program__fd(main_prog); + if (prog_fd < 0) + error(1, 0, "bpf_program__fd"); + + prog_array = bpf_object__find_map_by_name(obj, cfg_map_name); + if (!prog_array) + error(1, 0, "bpf_object__find_map_by_name %s", cfg_map_name); + + prog_array_fd = bpf_map__fd(prog_array); + if (prog_array_fd < 0) + error(1, 0, "bpf_map__fd %s", cfg_map_name); + + i = 0; + bpf_object__for_each_program(prog, obj) { + fd = bpf_program__fd(prog); + if (fd < 0) + error(1, 0, "bpf_program__fd"); + + if (fd != prog_fd) { + printf("%d: %s\n", i, bpf_program__title(prog, false)); + bpf_map_update_elem(prog_array_fd, &i, &fd, BPF_ANY); + ++i; + } + } + + ret = bpf_
Re: RE packet: fix reserve calculation - net/packet/af_packet.c
On Fri, Sep 7, 2018 at 11:50 AM, Willem de Bruijn wrote: > Hi James, > > Thanks for the report. In the future please always include > netdev@vger.kernel.org in technical discussions. > > On Fri, Sep 7, 2018 at 1:00 AM James Sakalaukus wrote: >> >> Hello Willem and David, >> >> I have an unpolished Ethernet driver for a PCIe FPGA subsystem, and >> the following commit has the side effect of moving the data alignment >> for SOCK_RAW packets. >> >> >> commit b84bbaf7a6c8cca24f8acf25a2c8e46913a947ba >> net/packet/af_packet.c >> >> These changes to packet_snd() moves the data and tail pointers >> backwards by net_device->hard_header_len, which is nominally ETH_HLEN. >> The .ndo_start_xmit driver function now gets a struct sk_buff with >> data alignment on a 2-byte boundary. My DMA core is not happy about >> it. >> >> >> commit 9aad13b087ab0a588cd68259de618f100053360e >> >> This commit changed the previous fix from a skb_push to skb_reserve. >> The functionality from my end did not change though. .ndo_start_xmit >> still gets a struct sk_buff with 2 byte alignment. >> >> >> This may be causing problems for other network drivers with DMA >> alignment requirements, but maybe its just me. > > This is the crux of the question. > > > Did the PACKET_TX_RING variant work with your device? > > A quick scan seems to indicate that it is common to allocate a linear > buffer and then reserve hard_header_len aligned up to 16B > (HH_MOD_LEN). The actual alignment of both link layer and network > header then depends on the alignment with which kmalloc returned. It > is probably safe to assume that the buddy allocator returns a multiple > of 4B at least for allocations of this size. Then the network layer > header is 4B aligned. But for Ethernet, the skb_push in eth_header() > would make the link layer header 2B aligned. > > If you are not seeing these problems with other protocols, I must be > misreading that code. > > I will take a closer look. > I actually have not tested the device with any protocols other than SOCK_RAW. This device is on a real time network that does not use standard network layer protocols. PACKET_TX_RING is something I haven't had time to delve into. It may be the case that other protocols always hand over a 2-byte aligned buffer. All I know for sure is that I was getting a 4-byte aligned buffer before the update. >> I've only been working >> on the driver for about 6 months, so its not released, though I would >> like to put it out there for others. I updated my distro kernel this >> week, and then proceeded to beat my head against the wall trying to >> figure out why my driver mysteriously stopped working. >> >> >> Thanks for your time, >> >> James Sakalaukus >> ja...@sakalaukus.com
Re: RE packet: fix reserve calculation - net/packet/af_packet.c
On Fri, Sep 7, 2018 at 1:34 PM, David Miller wrote: > From: Willem de Bruijn > Date: Fri, 7 Sep 2018 11:50:02 -0400 > >> If you are not seeing these problems with other protocols, I must be >> misreading that code. > > Right, the ethernet header is only guaranteed to be 2 byte aligned on > transmit. > > Nothing ever gave larger alignment guarantees. > > This is the _only_ reasonable situation. > > If we made the ethernet header to be 4 byte aligned, that means the > ipv4 addresses in the ipv4 header are not 4 byte aligned so we would > do unaligned memory accesses when building the headers which are > either expensive or trap depending upon the architecture. Understood. I thought SOCK_RAW made no assumptions with respect to the network layer headers. Previously, the buffer was allocated and then data was copied into it without giving thought to alignment. Sounds like I got lucky with the alignment and I should have expected 2-bytes from day 1. I've been using this device with only SOCK_RAW on a real time network, and I have not even tried to use SOCK_STREAM or SOCK_DGRAM. So, I just tried to use TCP on the device, and the buffer alignment is in fact 2-bytes inside my transmit function, and my DMA core does not work. So...I guess I need to bring my hardware into the 21st century or do a copy in the driver transmit function. Thanks for your time
Re: [Patch net] net_sched: properly cancel netlink dump on failure
From: Cong Wang Date: Thu, 6 Sep 2018 14:50:16 -0700 > When nla_put*() fails after nla_nest_start(), we need > to call nla_nest_cancel() to cancel the message, otherwise > we end up calling nla_nest_end() like a success. > > Fixes: 0ed5269f9e41 ("net/sched: add tunnel option support to act_tunnel_key") > Cc: Davide Caratti > Cc: Simon Horman > Signed-off-by: Cong Wang Applied, thanks.
Re: [PATCH net-next] cxgb4: impose mandatory VLAN usage when non-zero TAG ID
From: Ganesh Goudar Date: Fri, 7 Sep 2018 15:59:33 +0530 > From: Casey Leedom > > When a non-zero VLAN Tag ID is passed to t4_set_vlan_acl() > then impose mandatory VLAN Usage with that VLAN ID. > I.e any other VLAN ID should result in packets getting > dropped. > > Signed-off-by: Casey Leedom > Signed-off-by: Ganesh Goudar Applied.
Re: [PATCH] tcp: really ignore MSG_ZEROCOPY if no SO_ZEROCOPY
From: Vincent Whitchurch Date: Thu, 6 Sep 2018 15:54:59 +0200 > According to the documentation in msg_zerocopy.rst, the SO_ZEROCOPY > flag was introduced because send(2) ignores unknown message flags and > any legacy application which was accidentally passing the equivalent of > MSG_ZEROCOPY earlier should not see any new behaviour. > > Before commit f214f915e7db ("tcp: enable MSG_ZEROCOPY"), a send(2) call > which passed the equivalent of MSG_ZEROCOPY without setting SO_ZEROCOPY > would succeed. However, after that commit, it fails with -ENOBUFS. So > it appears that the SO_ZEROCOPY flag fails to fulfill its intended > purpose. Fix it. > > Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY") > Signed-off-by: Vincent Whitchurch Applied and queued up for -stable, thanks.