[PATCH v2 2/4] net: xsk: add a simple buffer reuse queue

2018-09-07 Thread Björn Töpel
From: Jakub Kicinski 

XSK UMEM is strongly single producer single consumer so reuse of
frames is challenging.  Add a simple "stash" of FILL packets to
reuse for drivers to optionally make use of.  This is useful
when driver has to free (ndo_stop) or resize a ring with an active
AF_XDP ZC socket.

v2: Fixed build issues for !CONFIG_XDP_SOCKETS.

Signed-off-by: Jakub Kicinski 
---
 include/net/xdp_sock.h | 69 ++
 net/xdp/xdp_umem.c |  2 ++
 net/xdp/xsk_queue.c| 55 +
 net/xdp/xsk_queue.h|  3 ++
 4 files changed, 129 insertions(+)

diff --git a/include/net/xdp_sock.h b/include/net/xdp_sock.h
index 932ca0dad6f3..70a115bea4f4 100644
--- a/include/net/xdp_sock.h
+++ b/include/net/xdp_sock.h
@@ -21,6 +21,12 @@ struct xdp_umem_page {
dma_addr_t dma;
 };
 
+struct xdp_umem_fq_reuse {
+   u32 nentries;
+   u32 length;
+   u64 handles[];
+};
+
 struct xdp_umem {
struct xsk_queue *fq;
struct xsk_queue *cq;
@@ -37,6 +43,7 @@ struct xdp_umem {
struct page **pgs;
u32 npgs;
struct net_device *dev;
+   struct xdp_umem_fq_reuse *fq_reuse;
u16 queue_id;
bool zc;
spinlock_t xsk_list_lock;
@@ -75,6 +82,10 @@ void xsk_umem_discard_addr(struct xdp_umem *umem);
 void xsk_umem_complete_tx(struct xdp_umem *umem, u32 nb_entries);
 bool xsk_umem_consume_tx(struct xdp_umem *umem, dma_addr_t *dma, u32 *len);
 void xsk_umem_consume_tx_done(struct xdp_umem *umem);
+struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries);
+struct xdp_umem_fq_reuse *xsk_reuseq_swap(struct xdp_umem *umem,
+ struct xdp_umem_fq_reuse *newq);
+void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq);
 
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
@@ -85,6 +96,35 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem 
*umem, u64 addr)
 {
return umem->pages[addr >> PAGE_SHIFT].dma + (addr & (PAGE_SIZE - 1));
 }
+
+/* Reuse-queue aware version of FILL queue helpers */
+static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
+{
+   struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+   if (!rq->length)
+   return xsk_umem_peek_addr(umem, addr);
+
+   *addr = rq->handles[rq->length - 1];
+   return addr;
+}
+
+static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem)
+{
+   struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+   if (!rq->length)
+   xsk_umem_discard_addr(umem);
+   else
+   rq->length--;
+}
+
+static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
+{
+   struct xdp_umem_fq_reuse *rq = umem->fq_reuse;
+
+   rq->handles[rq->length++] = addr;
+}
 #else
 static inline int xsk_generic_rcv(struct xdp_sock *xs, struct xdp_buff *xdp)
 {
@@ -128,6 +168,21 @@ static inline void xsk_umem_consume_tx_done(struct 
xdp_umem *umem)
 {
 }
 
+static inline struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
+{
+   return NULL;
+}
+
+static inline struct xdp_umem_fq_reuse *xsk_reuseq_swap(
+   struct xdp_umem *umem,
+   struct xdp_umem_fq_reuse *newq)
+{
+   return NULL;
+}
+static inline void xsk_reuseq_free(struct xdp_umem_fq_reuse *rq)
+{
+}
+
 static inline char *xdp_umem_get_data(struct xdp_umem *umem, u64 addr)
 {
return NULL;
@@ -137,6 +192,20 @@ static inline dma_addr_t xdp_umem_get_dma(struct xdp_umem 
*umem, u64 addr)
 {
return 0;
 }
+
+static inline u64 *xsk_umem_peek_addr_rq(struct xdp_umem *umem, u64 *addr)
+{
+   return NULL;
+}
+
+static inline void xsk_umem_discard_addr_rq(struct xdp_umem *umem)
+{
+}
+
+static inline void xsk_umem_fq_reuse(struct xdp_umem *umem, u64 addr)
+{
+}
+
 #endif /* CONFIG_XDP_SOCKETS */
 
 #endif /* _LINUX_XDP_SOCK_H */
diff --git a/net/xdp/xdp_umem.c b/net/xdp/xdp_umem.c
index b3b632c5aeae..555427b3e0fe 100644
--- a/net/xdp/xdp_umem.c
+++ b/net/xdp/xdp_umem.c
@@ -165,6 +165,8 @@ static void xdp_umem_release(struct xdp_umem *umem)
umem->cq = NULL;
}
 
+   xsk_reuseq_destroy(umem);
+
xdp_umem_unpin_pages(umem);
 
task = get_pid_task(umem->pid, PIDTYPE_PID);
diff --git a/net/xdp/xsk_queue.c b/net/xdp/xsk_queue.c
index 2dc1384d9f27..b66504592d9b 100644
--- a/net/xdp/xsk_queue.c
+++ b/net/xdp/xsk_queue.c
@@ -3,7 +3,9 @@
  * Copyright(c) 2018 Intel Corporation.
  */
 
+#include 
 #include 
+#include 
 
 #include "xsk_queue.h"
 
@@ -62,3 +64,56 @@ void xskq_destroy(struct xsk_queue *q)
page_frag_free(q->ring);
kfree(q);
 }
+
+struct xdp_umem_fq_reuse *xsk_reuseq_prepare(u32 nentries)
+{
+   struct xdp_umem_fq_reuse *newq;
+
+   /* Check for overflow */
+   if (nentries > (u32)roundup_pow_of_two(nentries))
+   return NULL;
+   nentries = roundup_pow_of_two(nentries);
+
+   newq = kvmalloc(struct_size(newq, handles, nentries), GFP_KERNEL);
+   

[PATCH v2 1/4] i40e: clean zero-copy XDP Tx ring on shutdown/reset

2018-09-07 Thread Björn Töpel
From: Björn Töpel 

When the zero-copy enabled XDP Tx ring is torn down, due to
configuration changes, outstandning frames on the hardware descriptor
ring are queued on the completion ring.

The completion ring has a back-pressure mechanism that will guarantee
that there is sufficient space on the ring.

Signed-off-by: Björn Töpel 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 17 +++
 .../ethernet/intel/i40e/i40e_txrx_common.h|  2 ++
 drivers/net/ethernet/intel/i40e/i40e_xsk.c| 30 +++
 3 files changed, 43 insertions(+), 6 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 37bd4e50ccde..7f85d4ba8b54 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -636,13 +636,18 @@ void i40e_clean_tx_ring(struct i40e_ring *tx_ring)
unsigned long bi_size;
u16 i;
 
-   /* ring already cleared, nothing to do */
-   if (!tx_ring->tx_bi)
-   return;
+   if (ring_is_xdp(tx_ring) && tx_ring->xsk_umem) {
+   i40e_xsk_clean_tx_ring(tx_ring);
+   } else {
+   /* ring already cleared, nothing to do */
+   if (!tx_ring->tx_bi)
+   return;
 
-   /* Free all the Tx ring sk_buffs */
-   for (i = 0; i < tx_ring->count; i++)
-   i40e_unmap_and_free_tx_resource(tx_ring, &tx_ring->tx_bi[i]);
+   /* Free all the Tx ring sk_buffs */
+   for (i = 0; i < tx_ring->count; i++)
+   i40e_unmap_and_free_tx_resource(tx_ring,
+   &tx_ring->tx_bi[i]);
+   }
 
bi_size = sizeof(struct i40e_tx_buffer) * tx_ring->count;
memset(tx_ring->tx_bi, 0, bi_size);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h 
b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index b5afd479a9c5..29c68b29d36f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -87,4 +87,6 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
}
 }
 
+void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
+
 #endif /* I40E_TXRX_COMMON_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c 
b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 2ebfc78bbd09..99116277c4d2 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -830,3 +830,33 @@ int i40e_xsk_async_xmit(struct net_device *dev, u32 
queue_id)
 
return 0;
 }
+
+/**
+ * i40e_xsk_clean_xdp_ring - Clean the XDP Tx ring on shutdown
+ * @xdp_ring: XDP Tx ring
+ **/
+void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring)
+{
+   u16 ntc = tx_ring->next_to_clean, ntu = tx_ring->next_to_use;
+   struct xdp_umem *umem = tx_ring->xsk_umem;
+   struct i40e_tx_buffer *tx_bi;
+   u32 xsk_frames = 0;
+
+   while (ntc != ntu) {
+   tx_bi = &tx_ring->tx_bi[ntc];
+
+   if (tx_bi->xdpf)
+   i40e_clean_xdp_tx_buffer(tx_ring, tx_bi);
+   else
+   xsk_frames++;
+
+   tx_bi->xdpf = NULL;
+
+   ntc++;
+   if (ntc > tx_ring->count)
+   ntc = 0;
+   }
+
+   if (xsk_frames)
+   xsk_umem_complete_tx(umem, xsk_frames);
+}
-- 
2.17.1



[PATCH v2 4/4] i40e: disallow changing the number of descriptors when AF_XDP is on

2018-09-07 Thread Björn Töpel
From: Björn Töpel 

When an AF_XDP UMEM is attached to any of the Rx rings, we disallow a
user to change the number of descriptors via e.g. "ethtool -G IFNAME".

Otherwise, the size of the stash/reuse queue can grow unbounded, which
would result in OOM or leaking userspace buffers.

Signed-off-by: Björn Töpel 
---
 .../net/ethernet/intel/i40e/i40e_ethtool.c|  9 +++-
 .../ethernet/intel/i40e/i40e_txrx_common.h|  1 +
 drivers/net/ethernet/intel/i40e/i40e_xsk.c| 22 +++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index d7d3974beca2..3cd2c88c72f8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -5,7 +5,7 @@
 
 #include "i40e.h"
 #include "i40e_diag.h"
-
+#include "i40e_txrx_common.h"
 #include "i40e_ethtool_stats.h"
 
 #define I40E_PF_STAT(_name, _stat) \
@@ -1493,6 +1493,13 @@ static int i40e_set_ringparam(struct net_device *netdev,
(new_rx_count == vsi->rx_rings[0]->count))
return 0;
 
+   /* If there is a AF_XDP UMEM attached to any of Rx rings,
+* disallow changing the number of descriptors -- regardless
+* if the netdev is running or not.
+*/
+   if (i40e_xsk_any_rx_ring_enabled(vsi))
+   return -EBUSY;
+
while (test_and_set_bit(__I40E_CONFIG_BUSY, pf->state)) {
timeout--;
if (!timeout)
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h 
b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index 8d46acff6f2e..09809dffe399 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -89,5 +89,6 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
 
 void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
 void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
+bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi);
 
 #endif /* I40E_TXRX_COMMON_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c 
b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index e4b62e871afc..119f59ec7cc0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -944,3 +944,25 @@ void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring)
if (xsk_frames)
xsk_umem_complete_tx(umem, xsk_frames);
 }
+
+/**
+ * i40e_xsk_any_rx_ring_enabled - Checks whether any of the Rx rings
+ * has AF_XDP UMEM attached
+ * @vsi: vsi
+ *
+ * Returns true if any of the Rx rings has an AF_XDP UMEM attached
+ **/
+bool i40e_xsk_any_rx_ring_enabled(struct i40e_vsi *vsi)
+{
+   int i;
+
+   if (!vsi->xsk_umems)
+   return false;
+
+   for (i = 0; i < vsi->num_queue_pairs; i++) {
+   if (vsi->xsk_umems[i])
+   return true;
+   }
+
+   return false;
+}
-- 
2.17.1



[PATCH v2 0/4] i40e AF_XDP zero-copy buffer leak fixes

2018-09-07 Thread Björn Töpel
From: Björn Töpel 

NB! The v1 was sent via the bpf-next tree. This time the series is
routed via JeffK's Intel Wired tree to minimize the risk for i40e
merge conflicts.

This series addresses an AF_XDP zero-copy issue that buffers passed
from userspace to the kernel was leaked when the hardware descriptor
ring was torn down.

The patches fixes the i40e AF_XDP zero-copy implementation.

Thanks to Jakub Kicinski for pointing this out!

Some background for folks that don't know the details: A zero-copy
capable driver picks buffers off the fill ring and places them on the
hardware Rx ring to be completed at a later point when DMA is
complete. Similar on the Tx side; The driver picks buffers off the Tx
ring and places them on the Tx hardware ring.

In the typical flow, the Rx buffer will be placed onto an Rx ring
(completed to the user), and the Tx buffer will be placed on the
completion ring to notify the user that the transfer is done.

However, if the driver needs to tear down the hardware rings for some
reason (interface goes down, reconfiguration and such), the userspace
buffers cannot be leaked. They have to be reused or completed back to
userspace.

The implementation does the following:

* Outstanding Tx descriptors will be passed to the completion
  ring. The Tx code has back-pressure mechanism in place, so that
  enough empty space in the completion ring is guaranteed.

* Outstanding Rx descriptors are temporarily stored on a stash/reuse
  queue. The reuse queue is based on Jakub's RFC. When/if the HW rings
  comes up again, entries from the stash are used to re-populate the
  ring.

* When AF_XDP ZC is enabled, disallow changing the number of hardware
  descriptors via ethtool. Otherwise, the size of the stash/reuse
  queue can grow unbounded.

Going forward, introducing a "zero-copy allocator" analogous to Jesper
Brouer's page pool would be a more robust and reuseable solution.

v1->v2: Address kbuild "undefined symbols" error when building with
!CONFIG_XDP_SOCKETS.

Thanks!
Björn


Björn Töpel (3):
  i40e: clean zero-copy XDP Tx ring on shutdown/reset
  i40e: clean zero-copy XDP Rx ring on shutdown/reset
  i40e: disallow changing the number of descriptors when AF_XDP is on

Jakub Kicinski (1):
  net: xsk: add a simple buffer reuse queue

 .../net/ethernet/intel/i40e/i40e_ethtool.c|   9 +-
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |  21 ++-
 .../ethernet/intel/i40e/i40e_txrx_common.h|   4 +
 drivers/net/ethernet/intel/i40e/i40e_xsk.c| 152 +-
 include/net/xdp_sock.h|  69 
 net/xdp/xdp_umem.c|   2 +
 net/xdp/xsk_queue.c   |  55 +++
 net/xdp/xsk_queue.h   |   3 +
 8 files changed, 299 insertions(+), 16 deletions(-)

-- 
2.17.1



[PATCH v2 3/4] i40e: clean zero-copy XDP Rx ring on shutdown/reset

2018-09-07 Thread Björn Töpel
From: Björn Töpel 

Outstanding Rx descriptors are temporarily stored on a stash/reuse
queue. When/if the HW rings comes up again, entries from the stash are
used to re-populate the ring.

The latter required some restructuring of the allocation scheme for
the AF_XDP zero-copy implementation. There is now a fast, and a slow
allocation. The "fast allocation" is used from the fast-path and
obtains free buffers from the fill ring and the internal recycle
mechanism. The "slow allocation" is only used in ring setup, and
obtains buffers from the fill ring and the stash (if any).

Signed-off-by: Björn Töpel 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   |   4 +-
 .../ethernet/intel/i40e/i40e_txrx_common.h|   1 +
 drivers/net/ethernet/intel/i40e/i40e_xsk.c| 100 --
 3 files changed, 96 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 7f85d4ba8b54..740ea58ba938 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1355,8 +1355,10 @@ void i40e_clean_rx_ring(struct i40e_ring *rx_ring)
rx_ring->skb = NULL;
}
 
-   if (rx_ring->xsk_umem)
+   if (rx_ring->xsk_umem) {
+   i40e_xsk_clean_rx_ring(rx_ring);
goto skip_free;
+   }
 
/* Free all the Rx ring sk_buffs */
for (i = 0; i < rx_ring->count; i++) {
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h 
b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
index 29c68b29d36f..8d46acff6f2e 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx_common.h
@@ -87,6 +87,7 @@ static inline void i40e_arm_wb(struct i40e_ring *tx_ring,
}
 }
 
+void i40e_xsk_clean_rx_ring(struct i40e_ring *rx_ring);
 void i40e_xsk_clean_tx_ring(struct i40e_ring *tx_ring);
 
 #endif /* I40E_TXRX_COMMON_ */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c 
b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
index 99116277c4d2..e4b62e871afc 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
@@ -140,6 +140,7 @@ static void i40e_xsk_umem_dma_unmap(struct i40e_vsi *vsi, 
struct xdp_umem *umem)
 static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, struct xdp_umem *umem,
u16 qid)
 {
+   struct xdp_umem_fq_reuse *reuseq;
bool if_running;
int err;
 
@@ -156,6 +157,12 @@ static int i40e_xsk_umem_enable(struct i40e_vsi *vsi, 
struct xdp_umem *umem,
return -EBUSY;
}
 
+   reuseq = xsk_reuseq_prepare(vsi->rx_rings[0]->count);
+   if (!reuseq)
+   return -ENOMEM;
+
+   xsk_reuseq_free(xsk_reuseq_swap(umem, reuseq));
+
err = i40e_xsk_umem_dma_map(vsi, umem);
if (err)
return err;
@@ -353,16 +360,46 @@ static bool i40e_alloc_buffer_zc(struct i40e_ring 
*rx_ring,
 }
 
 /**
- * i40e_alloc_rx_buffers_zc - Allocates a number of Rx buffers
+ * i40e_alloc_buffer_slow_zc - Allocates an i40e_rx_buffer
  * @rx_ring: Rx ring
- * @count: The number of buffers to allocate
+ * @bi: Rx buffer to populate
  *
- * This function allocates a number of Rx buffers and places them on
- * the Rx ring.
+ * This function allocates an Rx buffer. The buffer can come from fill
+ * queue, or via the reuse queue.
  *
  * Returns true for a successful allocation, false otherwise
  **/
-bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, u16 count)
+static bool i40e_alloc_buffer_slow_zc(struct i40e_ring *rx_ring,
+ struct i40e_rx_buffer *bi)
+{
+   struct xdp_umem *umem = rx_ring->xsk_umem;
+   u64 handle, hr;
+
+   if (!xsk_umem_peek_addr_rq(umem, &handle)) {
+   rx_ring->rx_stats.alloc_page_failed++;
+   return false;
+   }
+
+   handle &= rx_ring->xsk_umem->chunk_mask;
+
+   hr = umem->headroom + XDP_PACKET_HEADROOM;
+
+   bi->dma = xdp_umem_get_dma(umem, handle);
+   bi->dma += hr;
+
+   bi->addr = xdp_umem_get_data(umem, handle);
+   bi->addr += hr;
+
+   bi->handle = handle + umem->headroom;
+
+   xsk_umem_discard_addr_rq(umem);
+   return true;
+}
+
+static __always_inline bool __i40e_alloc_rx_buffers_zc(
+   struct i40e_ring *rx_ring, u16 count,
+   bool alloc(struct i40e_ring *rx_ring,
+  struct i40e_rx_buffer *bi))
 {
u16 ntu = rx_ring->next_to_use;
union i40e_rx_desc *rx_desc;
@@ -372,7 +409,7 @@ bool i40e_alloc_rx_buffers_zc(struct i40e_ring *rx_ring, 
u16 count)
rx_desc = I40E_RX_DESC(rx_ring, ntu);
bi = &rx_ring->rx_bi[ntu];
do {
-   if (!i40e_alloc_buffer_zc(rx_ring, bi)) {
+   if (!alloc(rx_ring, bi)) {
ok = false;
goto no_buffers;
}
@@ -404,6 +441,

Re: [PATCH net-next] net: sched: cls_flower: dump offload count value

2018-09-07 Thread Jakub Kicinski
On Thu,  6 Sep 2018 18:37:23 +0300, Vlad Buslov wrote:
> Change flower in_hw_count type to fixed-size u32 and dump it as
> TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared
> blocks and re-offload functionality.
> 
> Signed-off-by: Vlad Buslov 
> Acked-by: Jiri Pirko 
> ---
>  include/net/sch_generic.h| 2 +-
>  include/uapi/linux/pkt_cls.h | 2 ++
>  net/sched/cls_flower.c   | 5 -
>  3 files changed, 7 insertions(+), 2 deletions(-)
> 
> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
> index a6d00093f35e..d68ac55539a5 100644
> --- a/include/net/sch_generic.h
> +++ b/include/net/sch_generic.h
> @@ -362,7 +362,7 @@ static inline void tcf_block_offload_dec(struct tcf_block 
> *block, u32 *flags)
>  }
>  
>  static inline void
> -tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt,
> +tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt,
> u32 *flags, bool add)
>  {
>   if (add) {
> diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
> index be382fb0592d..2824fb7ed1c9 100644
> --- a/include/uapi/linux/pkt_cls.h
> +++ b/include/uapi/linux/pkt_cls.h
> @@ -483,6 +483,8 @@ enum {
>   TCA_FLOWER_KEY_ENC_OPTS,
>   TCA_FLOWER_KEY_ENC_OPTS_MASK,
>  
> + TCA_FLOWER_IN_HW_COUNT, /* be32 */

Why be32?

I wish there was a good way to share this attribute between
classifiers :( 

>   __TCA_FLOWER_MAX,
>  };
>  
> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
> index 6fd9bdd93796..4b8dd37dd4f8 100644
> --- a/net/sched/cls_flower.c
> +++ b/net/sched/cls_flower.c
> @@ -98,7 +98,7 @@ struct cls_fl_filter {
>   struct list_head list;
>   u32 handle;
>   u32 flags;
> - unsigned int in_hw_count;
> + u32 in_hw_count;
>   struct rcu_work rwork;
>   struct net_device *hw_dev;
>  };
> @@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto 
> *tp, void *fh,
>   if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
>   goto nla_put_failure;
>  
> + if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
> + goto nla_put_failure;
> +
>   if (tcf_exts_dump(skb, &f->exts))
>   goto nla_put_failure;
>  



Re: [PATCH net-next] net: sched: cls_flower: dump offload count value

2018-09-07 Thread Vlad Buslov


On Fri 07 Sep 2018 at 09:11, Jakub Kicinski  
wrote:
> On Thu,  6 Sep 2018 18:37:23 +0300, Vlad Buslov wrote:
>> Change flower in_hw_count type to fixed-size u32 and dump it as
>> TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared
>> blocks and re-offload functionality.
>> 
>> Signed-off-by: Vlad Buslov 
>> Acked-by: Jiri Pirko 
>> ---
>>  include/net/sch_generic.h| 2 +-
>>  include/uapi/linux/pkt_cls.h | 2 ++
>>  net/sched/cls_flower.c   | 5 -
>>  3 files changed, 7 insertions(+), 2 deletions(-)
>> 
>> diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
>> index a6d00093f35e..d68ac55539a5 100644
>> --- a/include/net/sch_generic.h
>> +++ b/include/net/sch_generic.h
>> @@ -362,7 +362,7 @@ static inline void tcf_block_offload_dec(struct 
>> tcf_block *block, u32 *flags)
>>  }
>>  
>>  static inline void
>> -tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt,
>> +tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt,
>>u32 *flags, bool add)
>>  {
>>  if (add) {
>> diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
>> index be382fb0592d..2824fb7ed1c9 100644
>> --- a/include/uapi/linux/pkt_cls.h
>> +++ b/include/uapi/linux/pkt_cls.h
>> @@ -483,6 +483,8 @@ enum {
>>  TCA_FLOWER_KEY_ENC_OPTS,
>>  TCA_FLOWER_KEY_ENC_OPTS_MASK,
>>  
>> +TCA_FLOWER_IN_HW_COUNT, /* be32 */
>
> Why be32?

This comment is wrong.
Thanks for catching this.

>
> I wish there was a good way to share this attribute between
> classifiers :( 
>
>>  __TCA_FLOWER_MAX,
>>  };
>>  
>> diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
>> index 6fd9bdd93796..4b8dd37dd4f8 100644
>> --- a/net/sched/cls_flower.c
>> +++ b/net/sched/cls_flower.c
>> @@ -98,7 +98,7 @@ struct cls_fl_filter {
>>  struct list_head list;
>>  u32 handle;
>>  u32 flags;
>> -unsigned int in_hw_count;
>> +u32 in_hw_count;
>>  struct rcu_work rwork;
>>  struct net_device *hw_dev;
>>  };
>> @@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto 
>> *tp, void *fh,
>>  if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
>>  goto nla_put_failure;
>>  
>> +if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
>> +goto nla_put_failure;
>> +
>>  if (tcf_exts_dump(skb, &f->exts))
>>  goto nla_put_failure;
>>  



Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()

2018-09-07 Thread Wolfgang Walter
Hello,

didn't respond as I've been on vacation.

Am Freitag, 31. August 2018, 08:50:24 schrieb Steffen Klassert:
> On Thu, Aug 30, 2018 at 08:53:50PM +0200, Wolfgang Walter wrote:
> > Hello,
> > 
> > kernels > 4.12 do not work on one of our main routers. They crash as soon
> > as ipsec-tunnels are configured and ipsec-traffic actually flows.
> 
> Can you please send the backtrace of this crash?
> 

I'll try today. The oops quickly disappears because other problems arising 
from it pop up. The machine crashes and no logs are logged. I try to make foto 
or try to log to the serial console.

At the moment I only see that there is xfrm_ stuff in the call trace as 
xfrm_lookup, xfrm_route_, and it is while routing a packet.

With later kernels (4.18.5) the machine seems to crash without a call trace on 
console.

Regards,
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts


[PATCH net-next] cxgb4: impose mandatory VLAN usage when non-zero TAG ID

2018-09-07 Thread Ganesh Goudar
From: Casey Leedom 

When a non-zero VLAN Tag ID is passed to t4_set_vlan_acl()
then impose mandatory VLAN Usage with that VLAN ID.
I.e any other VLAN ID should result in packets getting
dropped.

Signed-off-by: Casey Leedom 
Signed-off-by: Ganesh Goudar 
---
 drivers/net/ethernet/chelsio/cxgb4/t4_hw.c| 3 +++
 drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h | 1 +
 2 files changed, 4 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c 
b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
index 5fe5d16..6f1bd7b 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4_hw.c
@@ -10210,6 +10210,9 @@ int t4_set_vlan_acl(struct adapter *adap, unsigned int 
mbox, unsigned int vf,
vlan_cmd.en_to_len16 = cpu_to_be32(enable | FW_LEN16(vlan_cmd));
/* Drop all packets that donot match vlan id */
vlan_cmd.dropnovlan_fm = FW_ACL_VLAN_CMD_FM_F;
+   vlan_cmd.dropnovlan_fm = (enable
+ ? (FW_ACL_VLAN_CMD_DROPNOVLAN_F |
+FW_ACL_VLAN_CMD_FM_F) : 0);
if (enable != 0) {
vlan_cmd.nvlan = 1;
vlan_cmd.vlanid[0] = cpu_to_be16(vlan);
diff --git a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h 
b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
index 5dc6c41..6d2bc87 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/t4fw_api.h
@@ -2464,6 +2464,7 @@ struct fw_acl_vlan_cmd {
 
 #define FW_ACL_VLAN_CMD_DROPNOVLAN_S   7
 #define FW_ACL_VLAN_CMD_DROPNOVLAN_V(x)((x) << 
FW_ACL_VLAN_CMD_DROPNOVLAN_S)
+#define FW_ACL_VLAN_CMD_DROPNOVLAN_FFW_ACL_VLAN_CMD_DROPNOVLAN_V(1U)
 
 #define FW_ACL_VLAN_CMD_FM_S   6
 #define FW_ACL_VLAN_CMD_FM_M   0x1
-- 
2.1.0



Re: [PATCH v2 net-next 3/4] net: make listified RX functions return number of good packets

2018-09-07 Thread Edward Cree
On 07/09/18 03:27, Eric Dumazet wrote:
> On 09/06/2018 07:26 AM, Edward Cree wrote:
>> Signed-off-by: Edward Cree 
> Lack of changelog here ?
>
> I do not know what is a good packet.
The comment on netif_receive_skb_list() defines this as "skbs for which
 netif_receive_skb() would have returned %NET_RX_SUCCESS".  But I shall put
 that into the changelog as well.

> You are adding a lot of conditional expressions, that cpu
> will mispredict quite often.
I don't see an alternative, since this is needed by patch #4 and I daresay
 there are other drivers that will also want to get NET_RX_SUCCESS-like
 information (possibly from regular receives as well as GRO; I'm not quite
 sure why sfc only cares about the latter).

> Typical micro benchmarks wont really notice.

Any suggestions on how to construct a test that will?



Re: [PATCH v2 net-next 0/4] net: batched receive in GRO path

2018-09-07 Thread Edward Cree
On 07/09/18 03:32, Eric Dumazet wrote:
> Your performance numbers are not convincing, since TCP stream test should
> get nominal GRO gains.
I'm not quite sure what you mean here, could you explain a bit more?

> Adding this complexity and icache pressure needs more experimental results.
>
> What about RPC workloads  (eg 100 concurrent netperf -t TCP_RR -- -r 
> 8000,8000 )
I'll try that.  Any other tests that would be worthwhile?

-Ed


Re: [PATCH v2 net-next 3/4] net: make listified RX functions return number of good packets

2018-09-07 Thread Eric Dumazet



On 09/07/2018 03:44 AM, Edward Cree wrote:

> 
> Any suggestions on how to construct a test that will?
> 

Say 50 concurrent netperf -t TCP_RR -- -r 8000,8000

This way you have a mix of GRO-candidates, and non GRO ones (pure acks)

GRO sizes would be reasonable (not full size GRO packets).



Network device suspend/resume

2018-09-07 Thread Lakshmi

Hi,

I am bringing kernel bugzilla bug here
https://bugzilla.kernel.org/show_bug.cgi?id=196399

This issue occured 2 months ago and we didn't see this again. Wondering 
if that appears again. Can you confirm if there is any bug in network 
suspend/resume in the case.


One more instance here
https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_4376/fi-skl-6600u/igt@gem_exec_susp...@basic-s4-devices.html

Network device is
0b95:7720 ASIX Electronics Corp. AX88772
USB network card
driver seems to be usbnet
=
Bug 196399:

We have found out that since at least 4.11-rc1, some machines in the 
Intel GFX CI lab have been generating the following warning when 
suspending to s4 (suspend to disk):


[  287.212825] [ cut here ]
[  287.212829] WARNING: CPU: 0 PID: 3165 at net/sched/sch_generic.c:316 
dev_watchdog+0x218/0x220
[  287.212830] Modules linked in: mcs7830 usbnet mii snd_hda_codec_hdmi 
snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal 
intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul snd_hda_intel 
snd_hda_codec snd_hwdep ghash_clmulni_intel snd_hda_core snd_pcm 
i2c_designware_platform i2c_designware_core mei_me mei prime_numbers 
i2c_hid pinctrl_sunrisepoint pinctrl_intel
[  287.212864] CPU: 0 PID: 3165 Comm: gem_exec_suspen Tainted: G U 
   4.12.0-CI-CI_DRM_2829+ #1
[  287.212865] Hardware name: Dell Inc. XPS 13 9360/093TW6, BIOS 1.3.2 
01/18/2017

[  287.212867] task: 8801b4084f40 task.stack: c91d8000
[  287.212869] RIP: 0010:dev_watchdog+0x218/0x220
[  287.212870] RSP: 0018:88027e403e38 EFLAGS: 00010292
[  287.212872] RAX: 005a RBX:  RCX: 

[  287.212874] RDX: 0002 RSI: 81cbcf89 RDI: 
81c9c627
[  287.212875] RBP: 88027e403e68 R08:  R09: 
0001
[  287.212876] R10: 28e9c215 R11:  R12: 
88026e08a848
[  287.212877] R13:  R14: 88026e050020 R15: 
0001
[  287.212878] FS:  7f345056a8c0() GS:88027e40() 
knlGS:

[  287.212880] CS:  0010 DS:  ES:  CR0: 80050033
[  287.212881] CR2: 008d7008 CR3: 0001b4314000 CR4: 
003406f0

[  287.212882] Call Trace:
[  287.212883]  
[  287.212886]  ? qdisc_rcu_free+0x40/0x40
[  287.212888]  ? qdisc_rcu_free+0x40/0x40
[  287.212891]  call_timer_fn+0x8e/0x370
[  287.212894]  ? qdisc_rcu_free+0x40/0x40
[  287.212896]  expire_timers+0x150/0x1f0
[  287.212899]  run_timer_softirq+0x7c/0x160
[  287.212903]  __do_softirq+0x116/0x4a0
[  287.212906]  irq_exit+0xa9/0xc0
[  287.212909]  smp_apic_timer_interrupt+0x38/0x50
[  287.212912]  apic_timer_interrupt+0x90/0xa0
[  287.212914] RIP: 0010:delay_tsc+0x33/0xc0
[  287.212916] RSP: 0018:c91dbcd8 EFLAGS: 0286 ORIG_RAX: 
ff10
[  287.212918] RAX: 8000 RBX: 0005964f23a0 RCX: 
0001
[  287.212919] RDX: 8001 RSI: 81c8e23a RDI: 

[  287.212920] RBP: c91dbcf8 R08:  R09: 
0001
[  287.212921] R10:  R11:  R12: 
00059633478e
[  287.212922] R13: 00249f13 R14:  R15: 
880272eac008

[  287.212924]  
[  287.212929]  ? delay_tsc+0x6b/0xc0
[  287.212932]  __delay+0xa/0x10
[  287.212934]  __const_udelay+0x31/0x40
[  287.212936]  hibernation_debug_sleep+0x20/0x30
[  287.212938]  hibernation_snapshot+0x2bc/0x5f0
[  287.212940]  hibernate+0x159/0x2f0
[  287.212943]  state_store+0xe0/0xf0
[  287.212947]  kobj_attr_store+0xf/0x20
[  287.212949]  sysfs_kf_write+0x40/0x50
[  287.212951]  kernfs_fop_write+0x130/0x1b0
[  287.212955]  __vfs_write+0x23/0x120
[  287.212957]  ? rcu_read_lock_sched_held+0x75/0x80
[  287.212959]  ? rcu_sync_lockdep_assert+0x2a/0x50
[  287.212961]  ? __sb_start_write+0xfa/0x1f0
[  287.212964]  vfs_write+0xc5/0x1d0
[  287.212966]  ? trace_hardirqs_on_caller+0xe7/0x1c0
[  287.212969]  SyS_write+0x44/0xb0
[  287.212972]  entry_SYSCALL_64_fastpath+0x1c/0xb1
[  287.212973] RIP: 0033:0x7f344ed4a4a0
[  287.212974] RSP: 002b:7ffef50dfaa8 EFLAGS: 0246 ORIG_RAX: 
0001
[  287.212977] RAX: ffda RBX: 81470683 RCX: 
7f344ed4a4a0
[  287.212978] RDX: 0004 RSI: 0041d211 RDI: 
0006
[  287.212979] RBP: c91dbf88 R08: 008d6a50 R09: 

[  287.212980] R10:  R11: 0246 R12: 
0041d211
[  287.212981] R13: 0006 R14:  R15: 


[  287.212984]  ? __this_cpu_preempt_check+0x13/0x20
[  287.212988] Code: 63 8e 18 04 00 00 eb 93 4c 89 f7 c6 05 77 5c 77 00 
01 e8 dc 7f fd ff 89 d9 48 89 c2 4c 89 f6 48 c7 c7 18 f4 cf 81 e8 f1 c4 
9d ff <0f> ff eb c3 0f 1f 40

network device suspend/resume

2018-09-07 Thread Lakshmi

Hi,

I am bringing kernel bugzilla bug here 196399
https://bugzilla.kernel.org/show_bug.cgi?id=196399
This issue occurred last time two months ago and I wonder if it appears 
again. Can you confirm if there is any issue related to network device 
suspend/resume.


last instance is here
https://intel-gfx-ci.01.org/tree/drm-tip/CI_DRM_4376/fi-skl-6600u/igt@gem_exec_susp...@basic-s4-devices.html

==
0b95:7720 ASIX Electronics Corp. AX88772
USB network card
driver seems to be usbnet
==
Bug Report: 196399
We have found out that since at least 4.11-rc1, some machines in the 
Intel GFX CI lab have been generating the following warning when 
suspending to s4 (suspend to disk):


[  287.212825] [ cut here ]
[  287.212829] WARNING: CPU: 0 PID: 3165 at net/sched/sch_generic.c:316 
dev_watchdog+0x218/0x220
[  287.212830] Modules linked in: mcs7830 usbnet mii snd_hda_codec_hdmi 
snd_hda_codec_realtek snd_hda_codec_generic i915 x86_pkg_temp_thermal 
intel_powerclamp coretemp crct10dif_pclmul crc32_pclmul snd_hda_intel 
snd_hda_codec snd_hwdep ghash_clmulni_intel snd_hda_core snd_pcm 
i2c_designware_platform i2c_designware_core mei_me mei prime_numbers 
i2c_hid pinctrl_sunrisepoint pinctrl_intel
[  287.212864] CPU: 0 PID: 3165 Comm: gem_exec_suspen Tainted: G U 
   4.12.0-CI-CI_DRM_2829+ #1
[  287.212865] Hardware name: Dell Inc. XPS 13 9360/093TW6, BIOS 1.3.2 
01/18/2017

[  287.212867] task: 8801b4084f40 task.stack: c91d8000
[  287.212869] RIP: 0010:dev_watchdog+0x218/0x220
[  287.212870] RSP: 0018:88027e403e38 EFLAGS: 00010292
[  287.212872] RAX: 005a RBX:  RCX: 

[  287.212874] RDX: 0002 RSI: 81cbcf89 RDI: 
81c9c627
[  287.212875] RBP: 88027e403e68 R08:  R09: 
0001
[  287.212876] R10: 28e9c215 R11:  R12: 
88026e08a848
[  287.212877] R13:  R14: 88026e050020 R15: 
0001
[  287.212878] FS:  7f345056a8c0() GS:88027e40() 
knlGS:

[  287.212880] CS:  0010 DS:  ES:  CR0: 80050033
[  287.212881] CR2: 008d7008 CR3: 0001b4314000 CR4: 
003406f0

[  287.212882] Call Trace:
[  287.212883]  
[  287.212886]  ? qdisc_rcu_free+0x40/0x40
[  287.212888]  ? qdisc_rcu_free+0x40/0x40
[  287.212891]  call_timer_fn+0x8e/0x370
[  287.212894]  ? qdisc_rcu_free+0x40/0x40
[  287.212896]  expire_timers+0x150/0x1f0
[  287.212899]  run_timer_softirq+0x7c/0x160
[  287.212903]  __do_softirq+0x116/0x4a0
[  287.212906]  irq_exit+0xa9/0xc0
[  287.212909]  smp_apic_timer_interrupt+0x38/0x50
[  287.212912]  apic_timer_interrupt+0x90/0xa0
[  287.212914] RIP: 0010:delay_tsc+0x33/0xc0
[  287.212916] RSP: 0018:c91dbcd8 EFLAGS: 0286 ORIG_RAX: 
ff10
[  287.212918] RAX: 8000 RBX: 0005964f23a0 RCX: 
0001
[  287.212919] RDX: 8001 RSI: 81c8e23a RDI: 

[  287.212920] RBP: c91dbcf8 R08:  R09: 
0001
[  287.212921] R10:  R11:  R12: 
00059633478e
[  287.212922] R13: 00249f13 R14:  R15: 
880272eac008

[  287.212924]  
[  287.212929]  ? delay_tsc+0x6b/0xc0
[  287.212932]  __delay+0xa/0x10
[  287.212934]  __const_udelay+0x31/0x40
[  287.212936]  hibernation_debug_sleep+0x20/0x30
[  287.212938]  hibernation_snapshot+0x2bc/0x5f0
[  287.212940]  hibernate+0x159/0x2f0
[  287.212943]  state_store+0xe0/0xf0
[  287.212947]  kobj_attr_store+0xf/0x20
[  287.212949]  sysfs_kf_write+0x40/0x50
[  287.212951]  kernfs_fop_write+0x130/0x1b0
[  287.212955]  __vfs_write+0x23/0x120
[  287.212957]  ? rcu_read_lock_sched_held+0x75/0x80
[  287.212959]  ? rcu_sync_lockdep_assert+0x2a/0x50
[  287.212961]  ? __sb_start_write+0xfa/0x1f0
[  287.212964]  vfs_write+0xc5/0x1d0
[  287.212966]  ? trace_hardirqs_on_caller+0xe7/0x1c0
[  287.212969]  SyS_write+0x44/0xb0
[  287.212972]  entry_SYSCALL_64_fastpath+0x1c/0xb1
[  287.212973] RIP: 0033:0x7f344ed4a4a0
[  287.212974] RSP: 002b:7ffef50dfaa8 EFLAGS: 0246 ORIG_RAX: 
0001
[  287.212977] RAX: ffda RBX: 81470683 RCX: 
7f344ed4a4a0
[  287.212978] RDX: 0004 RSI: 0041d211 RDI: 
0006
[  287.212979] RBP: c91dbf88 R08: 008d6a50 R09: 

[  287.212980] R10:  R11: 0246 R12: 
0041d211
[  287.212981] R13: 0006 R14:  R15: 


[  287.212984]  ? __this_cpu_preempt_check+0x13/0x20
[  287.212988] Code: 63 8e 18 04 00 00 eb 93 4c 89 f7 c6 05 77 5c 77 00 
01 e8 dc 7f fd ff 89 d9 48 89 c2 4c 89 f6 48 c7 c7 18 f4 cf 81 e8 f1 c4 
9d ff <0f> ff eb c3 0f 1f 40 00 48 c7 47 08 00 00 00 00 55 48 c7 07 00

[ 

Re: [PATCH 1/7] fix hnode refcounting

2018-09-07 Thread Jamal Hadi Salim

On 2018-09-06 10:35 p.m., Al Viro wrote:

On Thu, Sep 06, 2018 at 06:21:09AM -0400, Jamal Hadi Salim wrote:

[..]


Argh...  Unfortunately, there's this: in u32_delete() we have
 if (root_ht) {
 if (root_ht->refcnt > 1) {
 *last = false;
 goto ret;
 }
 if (root_ht->refcnt == 1) {
 if (!ht_empty(root_ht)) {
 *last = false;
 goto ret;
 }
 }
 }
and that would need to be updated.  


It is not detrimental as you have it right now but
you are right an adjustment is needed...

Deleting of a root directly should not be allowed. But
you can flush a whole tp. Consider this:
--
sudo tc qdisc add dev $P ingress
sudo tc filter add dev $P parent : protocol ip prio 10 \
u32 match ip protocol 1 0xff

Which creates root ht 800

You shouldnt be allowed to do this:
--
tc filter delete dev $P parent : protocol ip prio 10 handle 800: u32
---

But you can delete the tp entirely as such:
---
tc filter delete dev $P parent : protocol ip prio 10 u32
--

The later will go via the destroy() path and flush all filters.

You should also be able to delete individual filters. ex:
$tc filter del dev $P parent : prio 10 handle 800:0:800 u32

Where that code you are referring to is important is when
the last filter deleted - we need the caller to know
and it destroys root.

i.e you should return last=true when the last filter is
deleted so root gets auto deleted (just like it was autocreated)


However, that logics is bloody odd
to start with.  First of all, root_ht has come from
struct tc_u_hnode *root_ht = rtnl_dereference(tp->root);
and the only place where it's ever modified is
 rcu_assign_pointer(tp->root, root_ht);
in u32_init(), where we'd bloody well checked that root_ht is non-NULL
(see
 if (root_ht == NULL)
 return -ENOBUFS;
upstream of that place) and where that assignment is inevitable on the
way to returning 0.  No matter what, if tp has passed u32_init() it
will have non-NULL ->root, forever.  And there is no way for tcf_proto
to be seen outside of tcf_proto_create() without ->init() having returned
0 - it gets freed before anyone sees it.



Yes, the check for root_ht is not necessary - but the check for the
last filter (and testing for last) is needed.


So this 'if (root_ht)' can't be false.  What's more, what the hell is the
whole thing checking?  We are in u32_delete().  It's called (as ->delete())
from tfilter_del_notify(), which is called from tc_del_tfilter().  If we
return 0 with *last true, we follow up calling tcf_proto_destroy().
OK, let's look at the logics in there:
* if there are links to root hnode => false
* if there's no links to root hnode and it has knodes => false
(BTW, if we ever get there with root_ht->refcnt < 1, we are obviously screwed)
* if there is a tcf_proto sharing tp->data => false (i.e. any filters
with different prio - don't bother)
* if tp is the only one with reference to tp->data and there are *any*
knodes => false.

Any extra links can come only from knodes in a non-empty hnode.  And it's not
a common case.  Shouldn't thIe whole thing be
* shared tp->data => false
* any non-empty hnode => false
instead?  Perhaps even with the knode counter in tp->data, avoiding any loops
in there, as well as the entire ht_empty()...

Now, in the very beginning of u32_delete() we have this:
 struct tc_u_hnode *ht = arg;

 if (ht == NULL)
 goto out;
OK, but the call of ->delete() is
 err = tp->ops->delete(tp, fh, last, extack);
and arg == NULL seen in u32_delete() means fh == NULL in tfilter_del_notify().
Which is called in
 if (!fh) {
...
} else {
 bool last;

 err = tfilter_del_notify(net, skb, n, tp, block,
  q, parent, fh, false, &last,
  extack);
How can we ever get there with NULL fh?



Try:
tc filter delete dev $P parent : protocol ip prio 10 u32
tcm handle is 0, so will hit that code path.


The whole thing makes very little sense; looks like it used to live in
u32_destroy() prior to commit 763dbf6328e41 ("net_sched: move the empty tp
check from ->destroy() to ->delete()"), but looking at the rationale in
that commit...  I don't see how it fixes anything - sure, now we remove
tcf_proto from the list before calling ->destroy().  Without any RCU delays
in between.  How could it possibly solve any issues with ->classify()
called in parallel with ->destroy()?  cls_u32 (at least these days)
does try to survive u32_destroy() in parallel with u32_classify();
if any other classifiers do not, they are still broken and that commit
has not done anything for them.

Anyway, adjus

Re: [PATCH 1/7] fix hnode refcounting

2018-09-07 Thread Jamal Hadi Salim

To clarify with an example i used to test
your patches:

#0 add ingress filter
$TC qdisc add dev $P ingress
#1 add filter
$TC filter add dev $P parent : protocol ip prio 10 \
u32 match ip protocol 1 0xff
#2 display
$TC filter ls dev $P parent :

#3 try to delete root
$TC filter delete dev $P parent : protocol ip prio 10 \
handle 800: u32

#4 nothing changes..
$TC filter ls dev $P parent :
#5 delete filter
$TC filter delete dev $P parent : protocol ip prio 10 \
handle 800:0:800 u32
#6 filter gone but hash table still there..
$TC filter ls dev $P parent :
#7 delete tp
$TC filter delete dev $P parent : protocol ip prio 10 \
u32
#8 now it is gone..
$TC filter ls dev $P parent :

your patches show #6 filter as still active.
We want it to look like #8

Hope this helps.

cheers,
jamal


[PATCH net-next v2] net: sched: change tcf_del_walker() to take idrinfo->lock

2018-09-07 Thread Vlad Buslov
Action API was changed to work with actions and action_idr in concurrency
safe manner, however tcf_del_walker() still uses actions without taking a
reference or idrinfo->lock first, and deletes them directly, disregarding
possible concurrent delete.

Add tc_action_wq workqueue to action API. Implement
tcf_idr_release_unsafe() that assumes external synchronization by caller
and delays blocking action cleanup part to tc_action_wq workqueue. Extend
tcf_action_cleanup() with 'async' argument to indicate that function should
free action asynchronously.

Change tcf_del_walker() to take idrinfo->lock while iterating over actions
and use new tcf_idr_release_unsafe() to release them while holding the
lock.

Signed-off-by: Vlad Buslov 
---
 include/net/act_api.h |  1 +
 net/sched/act_api.c   | 73 ---
 2 files changed, 65 insertions(+), 9 deletions(-)

diff --git a/include/net/act_api.h b/include/net/act_api.h
index c6f195b3c706..4c5117bc4afb 100644
--- a/include/net/act_api.h
+++ b/include/net/act_api.h
@@ -38,6 +38,7 @@ struct tc_action {
struct gnet_stats_queue __percpu *cpu_qstats;
struct tc_cookie__rcu *act_cookie;
struct tcf_chain*goto_chain;
+   struct work_struct  work;
 };
 #define tcf_index  common.tcfa_index
 #define tcf_refcnt common.tcfa_refcnt
diff --git a/net/sched/act_api.c b/net/sched/act_api.c
index 6f118d62c731..4ad9062c34b3 100644
--- a/net/sched/act_api.c
+++ b/net/sched/act_api.c
@@ -90,13 +90,38 @@ static void free_tcf(struct tc_action *p)
kfree(p);
 }
 
-static void tcf_action_cleanup(struct tc_action *p)
+static void tcf_action_free(struct tc_action *p)
+{
+   gen_kill_estimator(&p->tcfa_rate_est);
+   free_tcf(p);
+}
+
+static void tcf_action_free_work(struct work_struct *work)
+{
+   struct tc_action *p = container_of(work,
+  struct tc_action,
+  work);
+
+   tcf_action_free(p);
+}
+
+static struct workqueue_struct *tc_action_wq;
+
+static bool tcf_action_queue_work(struct work_struct *work, work_func_t func)
+{
+   INIT_WORK(work, func);
+   return queue_work(tc_action_wq, work);
+}
+
+static void tcf_action_cleanup(struct tc_action *p, bool async)
 {
if (p->ops->cleanup)
p->ops->cleanup(p);
 
-   gen_kill_estimator(&p->tcfa_rate_est);
-   free_tcf(p);
+   if (async)
+   tcf_action_queue_work(&p->work, tcf_action_free_work);
+   else
+   tcf_action_free(p);
 }
 
 static int __tcf_action_put(struct tc_action *p, bool bind)
@@ -109,7 +134,7 @@ static int __tcf_action_put(struct tc_action *p, bool bind)
idr_remove(&idrinfo->action_idr, p->tcfa_index);
spin_unlock(&idrinfo->lock);
 
-   tcf_action_cleanup(p);
+   tcf_action_cleanup(p, false);
return 1;
}
 
@@ -147,6 +172,24 @@ int __tcf_idr_release(struct tc_action *p, bool bind, bool 
strict)
 }
 EXPORT_SYMBOL(__tcf_idr_release);
 
+/* Release idr without obtaining idrinfo->lock. Caller must prevent any
+ * concurrent modifications of idrinfo->action_idr!
+ */
+
+static int tcf_idr_release_unsafe(struct tc_action *p)
+{
+   if (atomic_read(&p->tcfa_bindcnt) > 0)
+   return -EPERM;
+
+   if (refcount_dec_and_test(&p->tcfa_refcnt)) {
+   idr_remove(&p->idrinfo->action_idr, p->tcfa_index);
+   tcf_action_cleanup(p, true);
+   return ACT_P_DELETED;
+   }
+
+   return 0;
+}
+
 static size_t tcf_action_shared_attrs_size(const struct tc_action *act)
 {
struct tc_cookie *act_cookie;
@@ -262,20 +305,25 @@ static int tcf_del_walker(struct tcf_idrinfo *idrinfo, 
struct sk_buff *skb,
if (nla_put_string(skb, TCA_KIND, ops->kind))
goto nla_put_failure;
 
+   spin_lock(&idrinfo->lock);
idr_for_each_entry_ul(idr, p, id) {
-   ret = __tcf_idr_release(p, false, true);
+   ret = tcf_idr_release_unsafe(p);
if (ret == ACT_P_DELETED) {
module_put(ops->owner);
n_i++;
} else if (ret < 0) {
-   goto nla_put_failure;
+   goto nla_put_failure_locked;
}
}
+   spin_unlock(&idrinfo->lock);
+
if (nla_put_u32(skb, TCA_FCNT, n_i))
goto nla_put_failure;
nla_nest_end(skb, nest);
 
return n_i;
+nla_put_failure_locked:
+   spin_unlock(&idrinfo->lock);
 nla_put_failure:
nla_nest_cancel(skb, nest);
return ret;
@@ -341,7 +389,7 @@ static int tcf_idr_delete_index(struct tcf_idrinfo 
*idrinfo, u32 index)
p->tcfa_index));
spin_unlock(&idrinfo->lock);
 
-   tcf_action_cleanup(p);
+   tcf_

[iproute2 PATCH] bridge: Correct json output

2018-09-07 Thread Tobias Jungel
The current implementation adds configured vlans as "vlan": [ ... ] into
an array. This is malformed json and fails to be parsed. This patch
creates an object to include this key value pair.

Test with:

ip l a type bridge
./bridge/bridge -j vlan | jq

fixes c7c1a1ef5

Signed-off-by: Tobias Jungel 
---
 bridge/vlan.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/bridge/vlan.c b/bridge/vlan.c
index 19a36b80..9376b985 100644
--- a/bridge/vlan.c
+++ b/bridge/vlan.c
@@ -632,6 +632,7 @@ void print_vlan_info(FILE *fp, struct rtattr *tb)
if (!is_json_context())
fprintf(fp, "%s", _SL_);
 
+   open_json_object(NULL);
open_json_array(PRINT_JSON, "vlan");
 
for (i = RTA_DATA(list); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) {
@@ -659,6 +660,7 @@ void print_vlan_info(FILE *fp, struct rtattr *tb)
}
 
close_json_array(PRINT_ANY, "\n");
+   close_json_object();
 }
 
 int do_vlan(int argc, char **argv)



[PATCH net-next v2] net: sched: cls_flower: dump offload count value

2018-09-07 Thread Vlad Buslov
Change flower in_hw_count type to fixed-size u32 and dump it as
TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared
blocks and re-offload functionality.

Signed-off-by: Vlad Buslov 
Acked-by: Jiri Pirko 
---
 include/net/sch_generic.h| 2 +-
 include/uapi/linux/pkt_cls.h | 2 ++
 net/sched/cls_flower.c   | 5 -
 3 files changed, 7 insertions(+), 2 deletions(-)

diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a6d00093f35e..d68ac55539a5 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -362,7 +362,7 @@ static inline void tcf_block_offload_dec(struct tcf_block 
*block, u32 *flags)
 }
 
 static inline void
-tc_cls_offload_cnt_update(struct tcf_block *block, unsigned int *cnt,
+tc_cls_offload_cnt_update(struct tcf_block *block, u32 *cnt,
  u32 *flags, bool add)
 {
if (add) {
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index be382fb0592d..401d0c1e612d 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -483,6 +483,8 @@ enum {
TCA_FLOWER_KEY_ENC_OPTS,
TCA_FLOWER_KEY_ENC_OPTS_MASK,
 
+   TCA_FLOWER_IN_HW_COUNT,
+
__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 6fd9bdd93796..4b8dd37dd4f8 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -98,7 +98,7 @@ struct cls_fl_filter {
struct list_head list;
u32 handle;
u32 flags;
-   unsigned int in_hw_count;
+   u32 in_hw_count;
struct rcu_work rwork;
struct net_device *hw_dev;
 };
@@ -1880,6 +1880,9 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, 
void *fh,
if (f->flags && nla_put_u32(skb, TCA_FLOWER_FLAGS, f->flags))
goto nla_put_failure;
 
+   if (nla_put_u32(skb, TCA_FLOWER_IN_HW_COUNT, f->in_hw_count))
+   goto nla_put_failure;
+
if (tcf_exts_dump(skb, &f->exts))
goto nla_put_failure;
 
-- 
2.7.5



Re: [PATCH net-next v2] net: sched: cls_flower: dump offload count value

2018-09-07 Thread Jakub Kicinski
On Fri,  7 Sep 2018 17:22:21 +0300, Vlad Buslov wrote:
> Change flower in_hw_count type to fixed-size u32 and dump it as
> TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared
> blocks and re-offload functionality.
> 
> Signed-off-by: Vlad Buslov 
> Acked-by: Jiri Pirko 

LGTM, thanks for the respin!


Allow bpf_perf_event_output to access packet data

2018-09-07 Thread Lorenz Bauer
Re-sent due to HTML e-mail mess up, apologies.

-- Forwarded message --
From: Lorenz Bauer 
Date: 7 September 2018 at 15:53
Subject: Allow bpf_perf_event_output to access packet data
To: netdev@vger.kernel.org, Alexei Starovoitov ,
Daniel Borkmann 


Hello list,

I'm attempting to use bpf_perf_event_output to do packet sampling from XDP.

The code basically runs before our other XDP code, does a
perf_event_output with the full packet (for now) and then tail calls
into DDoS mitigation, etc.

I've just discovered that perf_event_output isn't allowed to access
packet data by the verifier. Is this something that could be allowed?

Best
Lorenz

-- 
Lorenz Bauer  |  Systems Engineer
25 Lavington St., London SE1 0NZ

www.cloudflare.com


for_each_netdev_feature() broken on big endian

2018-09-07 Thread Mehrtens, Hauke
Hi,

On a MIPS 32 Big endian system the netdev_sync_upper_features() function does 
not work correctly.
It does not disbale bit 15 (NETIF_F_LRO, 0x8000), but 47 
(NETIF_F_HW_TC, 0x8000).

The for_each_netdev_feature() macro is used to go over all netdev feature flags 
and calls for_each_set_bit() with a u64.
This is the code:
#define for_each_netdev_feature(mask_addr, bit) \
for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
https://elixir.bootlin.com/linux/v4.19-rc2/source/include/linux/netdev_features.h#L157

The for_each_set_bit() macro calls the find_first_bit() macro and works 
directly on the bits in the memory
organized in 32 bit values in the generic implementation, see here:
https://elixir.bootlin.com/linux/v4.19-rc2/source/lib/find_bit.c#L102

On big endian systems the u64 value is stored in a different order in memory 
compared to little endian systems.

When I execute this code: 
netdev_features_t upper_disables = NETIF_F_UPPER_DISABLES;
printk("%s:%i: addr: 0x%llx, size: %i\n", __func__, __LINE__, 
upper_disables, NETDEV_FEATURE_COUNT);
ret = find_first_bit((unsigned long *)&upper_disables, 
NETDEV_FEATURE_COUNT);
printk("%s:%i: addr: 0x%llx, size: %i, ret: %li\n", __func__, __LINE__, 
upper_disables, NETDEV_FEATURE_COUNT, ret);
I get this result:
[   35.227140] netdev_sync_upper_features:6912: addr: 0x8000, size: 48
[   35.27] netdev_sync_upper_features:6914: addr: 0x8000, size: 48, 
ret: 47
Expected would be a ret: 15.

As I understood the documentation for for_each_set_bit() this should find the 
first bit in memory and not in a integer,
so for_each_netdev_feature() should not use for_each_set_bit().
I do not really know what a good fix would be, I could convert the feature 
flags to little endian before calling for_each_netdev_feature(), would this be 
ok?

Hauke


Re: for_each_netdev_feature() broken on big endian

2018-09-07 Thread David Miller
From: "Mehrtens, Hauke" 
Date: Fri, 7 Sep 2018 15:10:53 +

> On a MIPS 32 Big endian system the netdev_sync_upper_features() function does 
> not work correctly.
> It does not disbale bit 15 (NETIF_F_LRO, 0x8000), but 47 
> (NETIF_F_HW_TC, 0x8000).
> 
> The for_each_netdev_feature() macro is used to go over all netdev feature 
> flags and calls for_each_set_bit() with a u64.
> This is the code:
> #define for_each_netdev_feature(mask_addr, bit)   \
>   for_each_set_bit(bit, (unsigned long *)mask_addr, NETDEV_FEATURE_COUNT)
> https://elixir.bootlin.com/linux/v4.19-rc2/source/include/linux/netdev_features.h#L157

Good catch, yes we cannot use the the generic bit handling macros on the netdev
feature flags because the feature flags are a u64 operated on as a unit whereas
"long" may be 32-bit or 64-bit depending upon the architecture.


Re: [PATCH v4 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace

2018-09-07 Thread Jason Gunthorpe
On Thu, Sep 06, 2018 at 05:51:12PM +0300, Arseny Maslennikov wrote:
> Some tools may currently be using only the deprecated attribute;
> let's print an elaborate and clear deprecation notice to kmsg.
> 
> To do that, we have to replace the whole sysfs file, since we inherit
> the original one from netdev.
> 
> Signed-off-by: Arseny Maslennikov 
>  drivers/infiniband/ulp/ipoib/ipoib_main.c | 31 +++
>  1 file changed, 31 insertions(+)
> 
> diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
> b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> index 30f840f874b3..74732726ec6f 100644
> +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> @@ -2386,6 +2386,35 @@ int ipoib_add_pkey_attr(struct net_device *dev)
>   return device_create_file(&dev->dev, &dev_attr_pkey);
>  }
>  
> +/*
> + * We erroneously exposed the iface's port number in the dev_id
> + * sysfs field long after dev_port was introduced for that purpose[1],
> + * and we need to stop everyone from relying on that.
> + * Let's overload the shower routine for the dev_id file here
> + * to gently bring the issue up.
> + *
> + * [1] https://www.spinics.net/lists/netdev/msg272123.html
> + */
> +static ssize_t dev_id_show(struct device *dev,
> +struct device_attribute *attr, char *buf)
> +{
> + struct net_device *ndev = to_net_dev(dev);
> +
> + if (ndev->dev_id == ndev->dev_port)
> + netdev_info_once(ndev,
> + "\"%s\" wants to know my dev_id. Should it look at 
> dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more 
> info.\n",
> + current->comm);
> +
> + return sprintf(buf, "%#x\n", ndev->dev_id);
> +}
> +static DEVICE_ATTR_RO(dev_id);
> +
> +int ipoib_intercept_dev_id_attr(struct net_device *dev)
> +{
> + device_remove_file(&dev->dev, &dev_attr_dev_id);
> + return device_create_file(&dev->dev, &dev_attr_dev_id);
> +}

Isn't this racey with userspace? Ie what happens if udev is querying
the dev_id right here?

Do we know there is no userspace doing this?

>  static struct net_device *ipoib_add_port(const char *format,
>struct ib_device *hca, u8 port)
>  {
> @@ -2427,6 +2456,8 @@ static struct net_device *ipoib_add_port(const char 
> *format,
>*/
>   ndev->priv_destructor = ipoib_intf_free;
>  
> + if (ipoib_intercept_dev_id_attr(ndev))
> + goto sysfs_failed;

No device_remove_file needed?

Jason


Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge

2018-09-07 Thread David Ahern
On 9/7/18 9:56 AM, D'Souza, Nelson wrote:

> 
> *From:* David Ahern 
> *Sent:* Thursday, September 6, 2018 5:27 PM
> *To:* D'Souza, Nelson; netdev@vger.kernel.org
> *Subject:* Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge
>  
> On 9/5/18 12:00 PM, D'Souza, Nelson wrote:
>> Just following up would you be able to confirm that this is a
> Linux VRF issue?
> 
> I can confirm that I can reproduce the problem. Need to find time to dig
> into it.

bridge's netfilter hook is dropping the packet.

bridge's netfilter code registers hook operations that are invoked when
nh_hook is called. It then sees all subsequent calls to nf_hook.

Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing
allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for
NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish,
then resets in_prerouting flag to 0. Any subsequent calls to nf_hook
invoke ip_sabotage_in. That function sees in_prerouting is not
set and steals (drops) the packet.

The simplest change is to have ip_sabotage_in recognize that the bridge
can be enslaved to a VRF (L3 master device) and allow the packet to
continue.

Thanks to Ido for the hint on ip_sabotage_in.

This patch works for me:

diff --git a/net/bridge/br_netfilter_hooks.c
b/net/bridge/br_netfilter_hooks.c
index 6e0dc6bcd32a..37278dc280eb 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv,
   struct sk_buff *skb,
   const struct nf_hook_state *state)
 {
-   if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) {
+   if (skb->nf_bridge && !skb->nf_bridge->in_prerouting &&
+   !netif_is_l3_master(skb->dev)) {
state->okfn(state->net, state->sk, skb);
return NF_STOLEN;
}


Re: [PATCH v3 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace

2018-09-07 Thread Doug Ledford
On Thu, 2018-09-06 at 16:03 +0300, Leon Romanovsky wrote:
> On Thu, Sep 06, 2018 at 10:04:33AM +0300, Arseny Maslennikov wrote:
> > On Wed, Sep 05, 2018 at 04:50:35PM +0300, Leon Romanovsky wrote:
> > > On Mon, Sep 03, 2018 at 07:13:16PM +0300, Arseny Maslennikov wrote:
> > > > Signed-off-by: Arseny Maslennikov 
> > > > ---
> > > >  drivers/infiniband/ulp/ipoib/ipoib_main.c | 38 +++
> > > >  1 file changed, 38 insertions(+)
> > > > 
> > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
> > > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > index 30f840f874b3..7386e5bde3d3 100644
> > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > @@ -2386,6 +2386,42 @@ int ipoib_add_pkey_attr(struct net_device *dev)
> > > > return device_create_file(&dev->dev, &dev_attr_pkey);
> > > >  }
> > > > 
> > > > +/*
> > > > + * We erroneously exposed the iface's port number in the dev_id
> > > > + * sysfs field long after dev_port was introduced for that purpose[1],
> > > > + * and we need to stop everyone from relying on that.
> > > > + * Let's overload the shower routine for the dev_id file here
> > > > + * to gently bring the issue up.
> > > > + *
> > > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html
> > > > + */
> > > > +static ssize_t dev_id_show(struct device *dev,
> > > > +  struct device_attribute *attr, char *buf)
> > > > +{
> > > > +   struct net_device *ndev = to_net_dev(dev);
> > > > +   ssize_t ret = -EINVAL;
> > > > +
> > > > +   if (ndev->dev_id == ndev->dev_port) {
> > > > +   netdev_info_once(ndev,
> > > > +   "\"%s\" wants to know my dev_id. "
> > > > +   "Should it look at dev_port instead?\n",
> > > > +   current->comm);
> > > > +   netdev_info_once(ndev,
> > > > +   "See Documentation/ABI/testing/sysfs-class-net 
> > > > for more info.\n");
> > > > +   }
> > > > +
> > > > +   ret = sprintf(buf, "%#x\n", ndev->dev_id);
> > > > +
> > > > +   return ret;
> > > > +}
> > > > +static DEVICE_ATTR_RO(dev_id);
> > > > +
> > > 
> > > I don't see this field among exposed by IPoIB, why should we expose it 
> > > now?
> > > 
> > 
> > To deviate from standard netdev behaviour, which only prints the
> > field out. Doug wanted this to also print a deprecation message, and
> > netdev (obviously) does not do that. See below.
> > 
> > > > +int ipoib_intercept_dev_id_attr(struct net_device *dev)
> > > > +{
> > > > +   device_remove_file(&dev->dev, &dev_attr_dev_id);
> > > > +   return device_create_file(&dev->dev, &dev_attr_dev_id);
> > > 
> > > Why isn't enough to rely on netdev code?
> > > 
> > 
> > Netdev code relies on macros around a *static* function 'netdev_show',
> > which is defined in net/core/net-sysfs.c; it is not listed in any header
> > files, and the macros aren't as well. This all leads me to believe it
> > was not really meant to be used from outside net/core/net-sysfs.
> > 
> > The only way we could use any netdev code here is to set up our own
> > handler (again), printk() a message, then call netdev_show — but we have
> > no access to it.
> > 
> > Of course, it also may be that I'm terribly missing a clue.
> 
> Thanks,
> 
> IMHO, the end result of adequate Doug's request is a little bit too much.
> I don't think that it justifies such remove->create construction.
> 
> Personal opinion.

I agree with you that the end result is kinda bulky, *but*, we will need
to know if there are things using the old dev_id before we can remove
it.  In particular, I'm concerned the IPoIB handling code of
NetworkManager uses it.  It's worth the cost I think.

-- 
Doug Ledford 
GPG KeyID: B826A3330E572FDD
Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD


signature.asc
Description: This is a digitally signed message part


Re: [PATCH v4 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace

2018-09-07 Thread Doug Ledford
On Fri, 2018-09-07 at 09:43 -0600, Jason Gunthorpe wrote:
> On Thu, Sep 06, 2018 at 05:51:12PM +0300, Arseny Maslennikov wrote:
> > Some tools may currently be using only the deprecated attribute;
> > let's print an elaborate and clear deprecation notice to kmsg.
> > 
> > To do that, we have to replace the whole sysfs file, since we inherit
> > the original one from netdev.
> > 
> > Signed-off-by: Arseny Maslennikov 
> >  drivers/infiniband/ulp/ipoib/ipoib_main.c | 31 +++
> >  1 file changed, 31 insertions(+)
> > 
> > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
> > b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > index 30f840f874b3..74732726ec6f 100644
> > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > @@ -2386,6 +2386,35 @@ int ipoib_add_pkey_attr(struct net_device *dev)
> > return device_create_file(&dev->dev, &dev_attr_pkey);
> >  }
> >  
> > +/*
> > + * We erroneously exposed the iface's port number in the dev_id
> > + * sysfs field long after dev_port was introduced for that purpose[1],
> > + * and we need to stop everyone from relying on that.
> > + * Let's overload the shower routine for the dev_id file here
> > + * to gently bring the issue up.
> > + *
> > + * [1] https://www.spinics.net/lists/netdev/msg272123.html
> > + */
> > +static ssize_t dev_id_show(struct device *dev,
> > +  struct device_attribute *attr, char *buf)
> > +{
> > +   struct net_device *ndev = to_net_dev(dev);
> > +
> > +   if (ndev->dev_id == ndev->dev_port)
> > +   netdev_info_once(ndev,
> > +   "\"%s\" wants to know my dev_id. Should it look at 
> > dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more 
> > info.\n",
> > +   current->comm);
> > +
> > +   return sprintf(buf, "%#x\n", ndev->dev_id);
> > +}
> > +static DEVICE_ATTR_RO(dev_id);
> > +
> > +int ipoib_intercept_dev_id_attr(struct net_device *dev)
> > +{
> > +   device_remove_file(&dev->dev, &dev_attr_dev_id);
> > +   return device_create_file(&dev->dev, &dev_attr_dev_id);
> > +}
> 
> Isn't this racey with userspace? Ie what happens if udev is querying
> the dev_id right here?
> 
> Do we know there is no userspace doing this?

I don't think that it can race (or reasonably can).  The intercept
function is done as part of ipoib_add_port() so the port itself isn't
live yet.  Things like udev shouldn't be scanning it until after we've
finished bringing it up and added it to the system, so any race here is
unimportant IMO.

> >  static struct net_device *ipoib_add_port(const char *format,
> >  struct ib_device *hca, u8 port)
> >  {
> > @@ -2427,6 +2456,8 @@ static struct net_device *ipoib_add_port(const char 
> > *format,
> >  */
> > ndev->priv_destructor = ipoib_intf_free;
> >  
> > +   if (ipoib_intercept_dev_id_attr(ndev))
> > +   goto sysfs_failed;
> 
> No device_remove_file needed?
> 
> Jason

-- 
Doug Ledford 
GPG KeyID: B826A3330E572FDD
Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD


signature.asc
Description: This is a digitally signed message part


Re: [PATCH v4 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace

2018-09-07 Thread Jason Gunthorpe
On Fri, Sep 07, 2018 at 01:14:51PM -0400, Doug Ledford wrote:
> On Fri, 2018-09-07 at 09:43 -0600, Jason Gunthorpe wrote:
> > On Thu, Sep 06, 2018 at 05:51:12PM +0300, Arseny Maslennikov wrote:
> > > Some tools may currently be using only the deprecated attribute;
> > > let's print an elaborate and clear deprecation notice to kmsg.
> > > 
> > > To do that, we have to replace the whole sysfs file, since we inherit
> > > the original one from netdev.
> > > 
> > > Signed-off-by: Arseny Maslennikov 
> > >  drivers/infiniband/ulp/ipoib/ipoib_main.c | 31 +++
> > >  1 file changed, 31 insertions(+)
> > > 
> > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
> > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > index 30f840f874b3..74732726ec6f 100644
> > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > @@ -2386,6 +2386,35 @@ int ipoib_add_pkey_attr(struct net_device *dev)
> > >   return device_create_file(&dev->dev, &dev_attr_pkey);
> > >  }
> > >  
> > > +/*
> > > + * We erroneously exposed the iface's port number in the dev_id
> > > + * sysfs field long after dev_port was introduced for that purpose[1],
> > > + * and we need to stop everyone from relying on that.
> > > + * Let's overload the shower routine for the dev_id file here
> > > + * to gently bring the issue up.
> > > + *
> > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html
> > > + */
> > > +static ssize_t dev_id_show(struct device *dev,
> > > +struct device_attribute *attr, char *buf)
> > > +{
> > > + struct net_device *ndev = to_net_dev(dev);
> > > +
> > > + if (ndev->dev_id == ndev->dev_port)
> > > + netdev_info_once(ndev,
> > > + "\"%s\" wants to know my dev_id. Should it look at 
> > > dev_port instead? See Documentation/ABI/testing/sysfs-class-net for more 
> > > info.\n",
> > > + current->comm);
> > > +
> > > + return sprintf(buf, "%#x\n", ndev->dev_id);
> > > +}
> > > +static DEVICE_ATTR_RO(dev_id);
> > > +
> > > +int ipoib_intercept_dev_id_attr(struct net_device *dev)
> > > +{
> > > + device_remove_file(&dev->dev, &dev_attr_dev_id);
> > > + return device_create_file(&dev->dev, &dev_attr_dev_id);
> > > +}
> > 
> > Isn't this racey with userspace? Ie what happens if udev is querying
> > the dev_id right here?
> > 
> > Do we know there is no userspace doing this?
> 
> I don't think that it can race (or reasonably can).  The intercept
> function is done as part of ipoib_add_port() so the port itself isn't
> live yet.

The above code is after register_netdev, so the port itself is
certainly live as far as userspace is concerned..

All the other sysfs stuff in add_port is already wrong/racy.. See
Parav's recent series fixing this for the main devices..

Jason


Re: [PATCH v3 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace

2018-09-07 Thread Leon Romanovsky
On Fri, Sep 07, 2018 at 01:14:37PM -0400, Doug Ledford wrote:
> On Thu, 2018-09-06 at 16:03 +0300, Leon Romanovsky wrote:
> > On Thu, Sep 06, 2018 at 10:04:33AM +0300, Arseny Maslennikov wrote:
> > > On Wed, Sep 05, 2018 at 04:50:35PM +0300, Leon Romanovsky wrote:
> > > > On Mon, Sep 03, 2018 at 07:13:16PM +0300, Arseny Maslennikov wrote:
> > > > > Signed-off-by: Arseny Maslennikov 
> > > > > ---
> > > > >  drivers/infiniband/ulp/ipoib/ipoib_main.c | 38 
> > > > > +++
> > > > >  1 file changed, 38 insertions(+)
> > > > >
> > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
> > > > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > > index 30f840f874b3..7386e5bde3d3 100644
> > > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > > @@ -2386,6 +2386,42 @@ int ipoib_add_pkey_attr(struct net_device *dev)
> > > > >   return device_create_file(&dev->dev, &dev_attr_pkey);
> > > > >  }
> > > > >
> > > > > +/*
> > > > > + * We erroneously exposed the iface's port number in the dev_id
> > > > > + * sysfs field long after dev_port was introduced for that 
> > > > > purpose[1],
> > > > > + * and we need to stop everyone from relying on that.
> > > > > + * Let's overload the shower routine for the dev_id file here
> > > > > + * to gently bring the issue up.
> > > > > + *
> > > > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html
> > > > > + */
> > > > > +static ssize_t dev_id_show(struct device *dev,
> > > > > +struct device_attribute *attr, char *buf)
> > > > > +{
> > > > > + struct net_device *ndev = to_net_dev(dev);
> > > > > + ssize_t ret = -EINVAL;
> > > > > +
> > > > > + if (ndev->dev_id == ndev->dev_port) {
> > > > > + netdev_info_once(ndev,
> > > > > + "\"%s\" wants to know my dev_id. "
> > > > > + "Should it look at dev_port instead?\n",
> > > > > + current->comm);
> > > > > + netdev_info_once(ndev,
> > > > > + "See Documentation/ABI/testing/sysfs-class-net 
> > > > > for more info.\n");
> > > > > + }
> > > > > +
> > > > > + ret = sprintf(buf, "%#x\n", ndev->dev_id);
> > > > > +
> > > > > + return ret;
> > > > > +}
> > > > > +static DEVICE_ATTR_RO(dev_id);
> > > > > +
> > > >
> > > > I don't see this field among exposed by IPoIB, why should we expose it 
> > > > now?
> > > >
> > >
> > > To deviate from standard netdev behaviour, which only prints the
> > > field out. Doug wanted this to also print a deprecation message, and
> > > netdev (obviously) does not do that. See below.
> > >
> > > > > +int ipoib_intercept_dev_id_attr(struct net_device *dev)
> > > > > +{
> > > > > + device_remove_file(&dev->dev, &dev_attr_dev_id);
> > > > > + return device_create_file(&dev->dev, &dev_attr_dev_id);
> > > >
> > > > Why isn't enough to rely on netdev code?
> > > >
> > >
> > > Netdev code relies on macros around a *static* function 'netdev_show',
> > > which is defined in net/core/net-sysfs.c; it is not listed in any header
> > > files, and the macros aren't as well. This all leads me to believe it
> > > was not really meant to be used from outside net/core/net-sysfs.
> > >
> > > The only way we could use any netdev code here is to set up our own
> > > handler (again), printk() a message, then call netdev_show — but we have
> > > no access to it.
> > >
> > > Of course, it also may be that I'm terribly missing a clue.
> >
> > Thanks,
> >
> > IMHO, the end result of adequate Doug's request is a little bit too much.
> > I don't think that it justifies such remove->create construction.
> >
> > Personal opinion.
>
> I agree with you that the end result is kinda bulky, *but*, we will need
> to know if there are things using the old dev_id before we can remove
> it.  In particular, I'm concerned the IPoIB handling code of
> NetworkManager uses it.  It's worth the cost I think.

I did my checks now and saw that ibdev2netdev uses the value provided
from this dev_id, so every invocation of that script will generate the
warning.

See this line in Parav's repo
https://github.com/Mellanox/container_scripts/blob/master/ibdev2netdev#L112

Thanks

>
> --
> Doug Ledford 
> GPG KeyID: B826A3330E572FDD
> Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD




signature.asc
Description: PGP signature


Re: [PATCH net-next] veth: report NEWLINK event when moving the peer device in a new namespace

2018-09-07 Thread Thomas Haller
Hi David,


On Mon, 2018-09-03 at 20:54 -0600, David Ahern wrote:

> From init_net:
> $ ip monitor all-nsid

I thought the concern of the patch is the overhead of sending one
additional RTM_NEWLINK message. This workaround has likely higher
overhead. More importantly, it's so cumbersome, that I doubt anybody
would implementing such a solution.

When the events of one namespace are not sufficient to get all relevant
information (local to the namespace itself), the solution is not
monitor all-nsid.

You might save complexity and performance overhead in kernel. But what
you save here is just moved to user-space, which faces higher
complexity (at multiple places/projects, where developers are not
experts in netlink) and higher overhead.


RTM_GETLINK/NLM_F_DUMP allows to fetch information. The same
information is usually also emitted on changes via RTM_NEWLINK
notifications.
Yes, the information may be avilable somehow, but how cumbersome:

 a) receive RTM_DELLINK, recognize that the message belongs to a 
   veth that was moved to another netns, and recognize that the peer's 
   IFLA_LINK changed. This approach only works when the veth is moved
   away from the current namespace.
 b) or, enable monitor all-nsid, receive RTM_NEWLINK, recognize that 
   the event is for moving a veth between netns, find the peer in the 
   other netns and recognize that the peer's IFLA_LINK changed.


best,
THomas


signature.asc
Description: This is a digitally signed message part


Re: [PATCH net-next v2] net: sched: change tcf_del_walker() to take idrinfo->lock

2018-09-07 Thread Cong Wang
On Fri, Sep 7, 2018 at 6:52 AM Vlad Buslov  wrote:
>
> Action API was changed to work with actions and action_idr in concurrency
> safe manner, however tcf_del_walker() still uses actions without taking a
> reference or idrinfo->lock first, and deletes them directly, disregarding
> possible concurrent delete.
>
> Add tc_action_wq workqueue to action API. Implement
> tcf_idr_release_unsafe() that assumes external synchronization by caller
> and delays blocking action cleanup part to tc_action_wq workqueue. Extend
> tcf_action_cleanup() with 'async' argument to indicate that function should
> free action asynchronously.

Where exactly is blocking in tcf_action_cleanup()?

>From your code, it looks like free_tcf(), but from my observation,
the only blocking function inside is tcf_action_goto_chain_fini()
which calls __tcf_chain_put(). But, __tcf_chain_put() is blocking
_ONLY_ when tc_chain_notify() is called, for tc action it is never
called.

So, what else is blocking?


Re: [PATCH net-next 13/13] net: sched: add flags to Qdisc class ops struct

2018-09-07 Thread Cong Wang
On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov  wrote:
>
> Extend Qdisc_class_ops with flags. Create enum to hold possible class ops
> flag values. Add first class ops flags value QDISC_CLASS_OPS_DOIT_UNLOCKED
> to indicate that class ops functions can be called without taking rtnl
> lock.

We don't add anything that is not used.

This is the last patch in this series, so I am pretty sure you split
it in a wrong way, it certainly belongs to next series, not this series.


Re: [PATCH net-next 09/13] net: sched: extend tcf_block with rcu

2018-09-07 Thread Cong Wang
On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov  wrote:
>
> Extend tcf_block with rcu to allow safe deallocation when it is accessed
> concurrently.

This sucks, please fold this patch into where you call rcu_read_lock()
on tcf block.

This patch _alone_ is apparently not complete. This is not how
we split patches.


Re: [PATCH v3 3/3] IB/ipoib: Log sysfs 'dev_id' accesses from userspace

2018-09-07 Thread Doug Ledford
On Fri, 2018-09-07 at 20:28 +0300, Leon Romanovsky wrote:
> On Fri, Sep 07, 2018 at 01:14:37PM -0400, Doug Ledford wrote:
> > On Thu, 2018-09-06 at 16:03 +0300, Leon Romanovsky wrote:
> > > On Thu, Sep 06, 2018 at 10:04:33AM +0300, Arseny Maslennikov wrote:
> > > > On Wed, Sep 05, 2018 at 04:50:35PM +0300, Leon Romanovsky wrote:
> > > > > On Mon, Sep 03, 2018 at 07:13:16PM +0300, Arseny Maslennikov wrote:
> > > > > > Signed-off-by: Arseny Maslennikov 
> > > > > > ---
> > > > > >  drivers/infiniband/ulp/ipoib/ipoib_main.c | 38 
> > > > > > +++
> > > > > >  1 file changed, 38 insertions(+)
> > > > > > 
> > > > > > diff --git a/drivers/infiniband/ulp/ipoib/ipoib_main.c 
> > > > > > b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > > > index 30f840f874b3..7386e5bde3d3 100644
> > > > > > --- a/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > > > +++ b/drivers/infiniband/ulp/ipoib/ipoib_main.c
> > > > > > @@ -2386,6 +2386,42 @@ int ipoib_add_pkey_attr(struct net_device 
> > > > > > *dev)
> > > > > > return device_create_file(&dev->dev, &dev_attr_pkey);
> > > > > >  }
> > > > > > 
> > > > > > +/*
> > > > > > + * We erroneously exposed the iface's port number in the dev_id
> > > > > > + * sysfs field long after dev_port was introduced for that 
> > > > > > purpose[1],
> > > > > > + * and we need to stop everyone from relying on that.
> > > > > > + * Let's overload the shower routine for the dev_id file here
> > > > > > + * to gently bring the issue up.
> > > > > > + *
> > > > > > + * [1] https://www.spinics.net/lists/netdev/msg272123.html
> > > > > > + */
> > > > > > +static ssize_t dev_id_show(struct device *dev,
> > > > > > +  struct device_attribute *attr, char *buf)
> > > > > > +{
> > > > > > +   struct net_device *ndev = to_net_dev(dev);
> > > > > > +   ssize_t ret = -EINVAL;
> > > > > > +
> > > > > > +   if (ndev->dev_id == ndev->dev_port) {
> > > > > > +   netdev_info_once(ndev,
> > > > > > +   "\"%s\" wants to know my dev_id. "
> > > > > > +   "Should it look at dev_port instead?\n",
> > > > > > +   current->comm);
> > > > > > +   netdev_info_once(ndev,
> > > > > > +   "See Documentation/ABI/testing/sysfs-class-net 
> > > > > > for more info.\n");
> > > > > > +   }
> > > > > > +
> > > > > > +   ret = sprintf(buf, "%#x\n", ndev->dev_id);
> > > > > > +
> > > > > > +   return ret;
> > > > > > +}
> > > > > > +static DEVICE_ATTR_RO(dev_id);
> > > > > > +
> > > > > 
> > > > > I don't see this field among exposed by IPoIB, why should we expose 
> > > > > it now?
> > > > > 
> > > > 
> > > > To deviate from standard netdev behaviour, which only prints the
> > > > field out. Doug wanted this to also print a deprecation message, and
> > > > netdev (obviously) does not do that. See below.
> > > > 
> > > > > > +int ipoib_intercept_dev_id_attr(struct net_device *dev)
> > > > > > +{
> > > > > > +   device_remove_file(&dev->dev, &dev_attr_dev_id);
> > > > > > +   return device_create_file(&dev->dev, &dev_attr_dev_id);
> > > > > 
> > > > > Why isn't enough to rely on netdev code?
> > > > > 
> > > > 
> > > > Netdev code relies on macros around a *static* function 'netdev_show',
> > > > which is defined in net/core/net-sysfs.c; it is not listed in any header
> > > > files, and the macros aren't as well. This all leads me to believe it
> > > > was not really meant to be used from outside net/core/net-sysfs.
> > > > 
> > > > The only way we could use any netdev code here is to set up our own
> > > > handler (again), printk() a message, then call netdev_show — but we have
> > > > no access to it.
> > > > 
> > > > Of course, it also may be that I'm terribly missing a clue.
> > > 
> > > Thanks,
> > > 
> > > IMHO, the end result of adequate Doug's request is a little bit too much.
> > > I don't think that it justifies such remove->create construction.
> > > 
> > > Personal opinion.
> > 
> > I agree with you that the end result is kinda bulky, *but*, we will need
> > to know if there are things using the old dev_id before we can remove
> > it.  In particular, I'm concerned the IPoIB handling code of
> > NetworkManager uses it.  It's worth the cost I think.
> 
> I did my checks now and saw that ibdev2netdev uses the value provided
> from this dev_id, so every invocation of that script will generate the
> warning.
> 
> See this line in Parav's repo
> https://github.com/Mellanox/container_scripts/blob/master/ibdev2netdev#L112

I'm pretty sure libvirt + qemu will trigger it too.

-- 
Doug Ledford 
GPG KeyID: B826A3330E572FDD
Key fingerprint = AE6B 1BDA 122B 23B4 265B  1274 B826 A333 0E57 2FDD


signature.asc
Description: This is a digitally signed message part


Re: [PATCH net-next 08/13] net: sched: rename tcf_block_get{_ext}() and tcf_block_put{_ext}()

2018-09-07 Thread Cong Wang
On Thu, Sep 6, 2018 at 12:59 AM Vlad Buslov  wrote:
>
> Functions tcf_block_get{_ext}() and tcf_block_put{_ext}() actually
> attach/detach block to specific Qdisc besides just taking/putting
> reference. Rename them according to their purpose.

Where exactly does it attach to?

Each qdisc provides a pointer to a pointer of a block, like
&cl->block. It is where the result is saved to. It takes a parameter
of Qdisc* merely for read-only purpose.

So, renaming it to *attach() is even confusing, at least not
any better. Please find other names or leave them as they are.


Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()

2018-09-07 Thread Wolfgang Walter
Am Freitag, 31. August 2018, 08:50:24 schrieb Steffen Klassert:
> On Thu, Aug 30, 2018 at 08:53:50PM +0200, Wolfgang Walter wrote:
> > Hello,
> > 
> > kernels > 4.12 do not work on one of our main routers. They crash as soon
> > as ipsec-tunnels are configured and ipsec-traffic actually flows.
> 
> Can you please send the backtrace of this crash?
> 

I bootet the b838d5e1c5b6e57b10ec8af2268824041e3ea911 several times but I 
could not record the complete trace. I think I have to log to the serial 
console but I can't do that before next week.


What I could record ist:

There is a always 
 ... 
the callrace.

This is the part I could see:


irq_exit+0x71/0x80
do_IRQ+0x4d/0xd0
common_interrup+07a/0x7a

RIP: 010:cpuidle_enter_state+0x11d/0x200
RSP: 0018:c9000321bee0 EFLAGS: 0282 ORIG_RAX: ffc4
RAX: 88085efde450 RBX: 0004 RCX: 0003c9e63c13
RDX: 0003c9e63c13 RSI: ffb03103fe35ac43 RDI: 
RBP: e87cf600 R08: 000c R09: 0004
R10: 0400 R11: 0003c99e56fc R12: 0003c9e63c13
R13: 0003c9da9567 R14: 0004 R15: 822763e0
do_idle+0xd3/0x160
cpu_startup_entry+0x14/0x20
secondary_startup_64+0xa5/0xb0
Code: 00 0f b7 83 c0 00 00 00 80 7c 02 08 01 0f 86 d3 02 00 00 41
8b 8c 24 3c 10 00 00 48 8b 6b 58 85 c9 0f 84 2f 01 00 00 48 83 e5 fe  45 
60
02 0f 84 4e 01 00 00 f6 43 38 01 74 0d 80 00 bd ab 00 00
RIP: ip_forward+0xd4/0x470 RSP: 88085efc3cb0
CR2: 0060
[ end trace 7205b53c25b7b35a ]---
Kernel panic - not syncing: Fatal exception in interrupt
Kernel Offset: disabled
Rebooting in 60 seconds..


I got an email from Tobias Hommel and I think it is the same problem.

It is very clear that it is the difference from

ipv4: call dst_hold_safe() properly

to

ipv4: mark DST_NOGC and remove the operation of dst_free()

which triggers this bug.

Regards,
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts


[Patch net-next] net_sched: remove redundant qdisc lock classes

2018-09-07 Thread Cong Wang
We no longer take any spinlock on RX path for ingress qdisc,
so this lockdep annotation is no longer needed.

Cc: Jamal Hadi Salim 
Signed-off-by: Cong Wang 
---
 net/sched/sch_api.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 98541c6399db..411c40344b77 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -27,7 +27,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 
@@ -1053,10 +1052,6 @@ static int qdisc_block_indexes_set(struct Qdisc *sch, 
struct nlattr **tca,
return 0;
 }
 
-/* lockdep annotation is needed for ingress; egress gets it only for name */
-static struct lock_class_key qdisc_tx_lock;
-static struct lock_class_key qdisc_rx_lock;
-
 /*
Allocate and initialize new qdisc.
 
@@ -1121,7 +1116,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
if (handle == TC_H_INGRESS) {
sch->flags |= TCQ_F_INGRESS;
handle = TC_H_MAKE(TC_H_INGRESS, 0);
-   lockdep_set_class(qdisc_lock(sch), &qdisc_rx_lock);
} else {
if (handle == 0) {
handle = qdisc_alloc_handle(dev);
@@ -1129,7 +1123,6 @@ static struct Qdisc *qdisc_create(struct net_device *dev,
if (handle == 0)
goto err_out3;
}
-   lockdep_set_class(qdisc_lock(sch), &qdisc_tx_lock);
if (!netif_is_multiqueue(dev))
sch->flags |= TCQ_F_ONETXQUEUE;
}
-- 
2.14.4



[Patch net-next] htb: use anonymous union for simplicity

2018-09-07 Thread Cong Wang
cl->leaf.q is slightly more readable than cl->un.leaf.q.

Cc: Jamal Hadi Salim 
Signed-off-by: Cong Wang 
---
 net/sched/sch_htb.c | 98 ++---
 1 file changed, 49 insertions(+), 49 deletions(-)

diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 43c4bfe625a9..2c45e97b7b87 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -132,7 +132,7 @@ struct htb_class {
struct htb_class_inner {
struct htb_prio clprio[TC_HTB_NUMPRIO];
} inner;
-   } un;
+   };
s64 pq_key;
 
int prio_activity;  /* for which prios are we 
active */
@@ -411,13 +411,13 @@ static void htb_activate_prios(struct htb_sched *q, 
struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
 
-   if (p->un.inner.clprio[prio].feed.rb_node)
+   if (p->inner.clprio[prio].feed.rb_node)
/* parent already has its feed in use so that
 * reset bit in mask as parent is already ok
 */
mask &= ~(1 << prio);
 
-   htb_add_to_id_tree(&p->un.inner.clprio[prio].feed, cl, 
prio);
+   htb_add_to_id_tree(&p->inner.clprio[prio].feed, cl, 
prio);
}
p->prio_activity |= mask;
cl = p;
@@ -447,19 +447,19 @@ static void htb_deactivate_prios(struct htb_sched *q, 
struct htb_class *cl)
int prio = ffz(~m);
m &= ~(1 << prio);
 
-   if (p->un.inner.clprio[prio].ptr == cl->node + prio) {
+   if (p->inner.clprio[prio].ptr == cl->node + prio) {
/* we are removing child which is pointed to 
from
 * parent feed - forget the pointer but remember
 * classid
 */
-   p->un.inner.clprio[prio].last_ptr_id = 
cl->common.classid;
-   p->un.inner.clprio[prio].ptr = NULL;
+   p->inner.clprio[prio].last_ptr_id = 
cl->common.classid;
+   p->inner.clprio[prio].ptr = NULL;
}
 
htb_safe_rb_erase(cl->node + prio,
- &p->un.inner.clprio[prio].feed);
+ &p->inner.clprio[prio].feed);
 
-   if (!p->un.inner.clprio[prio].feed.rb_node)
+   if (!p->inner.clprio[prio].feed.rb_node)
mask |= 1 << prio;
}
 
@@ -555,7 +555,7 @@ htb_change_class_mode(struct htb_sched *q, struct htb_class 
*cl, s64 *diff)
  */
 static inline void htb_activate(struct htb_sched *q, struct htb_class *cl)
 {
-   WARN_ON(cl->level || !cl->un.leaf.q || !cl->un.leaf.q->q.qlen);
+   WARN_ON(cl->level || !cl->leaf.q || !cl->leaf.q->q.qlen);
 
if (!cl->prio_activity) {
cl->prio_activity = 1 << cl->prio;
@@ -615,7 +615,7 @@ static int htb_enqueue(struct sk_buff *skb, struct Qdisc 
*sch,
__qdisc_drop(skb, to_free);
return ret;
 #endif
-   } else if ((ret = qdisc_enqueue(skb, cl->un.leaf.q,
+   } else if ((ret = qdisc_enqueue(skb, cl->leaf.q,
to_free)) != NET_XMIT_SUCCESS) {
if (net_xmit_drop_count(ret)) {
qdisc_qstats_drop(sch);
@@ -823,7 +823,7 @@ static struct htb_class *htb_lookup_leaf(struct htb_prio 
*hprio, const int prio)
cl = rb_entry(*sp->pptr, struct htb_class, node[prio]);
if (!cl->level)
return cl;
-   clp = &cl->un.inner.clprio[prio];
+   clp = &cl->inner.clprio[prio];
(++sp)->root = clp->feed.rb_node;
sp->pptr = &clp->ptr;
sp->pid = &clp->last_ptr_id;
@@ -857,7 +857,7 @@ static struct sk_buff *htb_dequeue_tree(struct htb_sched 
*q, const int prio,
 * graft operation on the leaf since last dequeue;
 * simply deactivate and skip such class
 */
-   if (unlikely(cl->un.leaf.q->q.qlen == 0)) {
+   if (unlikely(cl->leaf.q->q.qlen == 0)) {
struct htb_class *next;
htb_deactivate(q, cl);
 
@@ -873,12 +873,12 @@ static struct sk_buff *htb_dequeue_tree(struct htb_sched 
*q, const int prio,
goto next;
}
 
-   skb = cl->un.leaf.q->dequeue(cl->un.leaf.q);
+   skb = cl->leaf.q->dequeue(cl->leaf.q);
  

Re: [RFC PATCH bpf-next v2 0/4] Implement bpf queue/stack maps

2018-09-07 Thread Mauricio Vasquez




On 09/06/2018 07:13 PM, Alexei Starovoitov wrote:

On Fri, Aug 31, 2018 at 11:25:48PM +0200, Mauricio Vasquez B wrote:

In some applications this is needed have a pool of free elements, like for
example the list of free L4 ports in a SNAT.  None of the current maps allow
to do it as it is not possibleto get an any element without having they key
it is associated to.

This patchset implements two new kind of eBPF maps: queue and stack.
Those maps provide to eBPF programs the peek, push and pop operations, and for
userspace applications a new bpf_map_lookup_and_delete_elem() is added.

Signed-off-by: Mauricio Vasquez B 

---

I am sending this as an RFC because there is still an issue I am not sure how
to solve.

The queue/stack maps have a linked list for saving the nodes, and a
preallocation schema based on the pcpu_freelist already implemented and used
in the htabmap.  Each time an element is pushed into the map, a node from the
pcpu_freelist is taken and then added to the linked list.

The pop operation takes and *removes* the first node from the linked list, then
it uses *call_rcu* to postpose freeing the node, i.e, the node is only returned
to the pcpu_freelist when the rcu callback is executed.  This is needed because
an element returned by the pop() operation should remain valid for the whole
duration of the eBPF program.

The problem is that elements are not immediately returned to the free list, so
in some cases the push operation could fail because there are not free nodes
in the pcpu_freelist.

The following code snippet exposes that problem.

...
/* Push MAP_SIZE elements */
for (i = 0; i < MAP_SIZE; i++)
assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0);

/* Pop all elements */
for (i = 0; i < MAP_SIZE; i++)
assert(bpf_map_lookup_and_delete_elem(fd, NULL, &val) == 0 &&
   val == vals[i]);

   // sleep(1) <-- If I put this sleep, everything works.
/* Push MAP_SIZE elements */
for (i = 0; i < MAP_SIZE; i++)
assert(bpf_map_update_elem(fd, NULL, &vals[i], 0) == 0);
^^^
This fails because there are not available elements in pcpu_freelist
...

I think a possible solution is to oversize the pcpu_freelist (no idea by how
much, maybe double or, or make it 1.5 time the max elements in the map?)
I also have concerns about it, it would waste that memory in many cases and
this is also probably that it doesn't solve the issue because that code snippet
is puhsing and popping elements too fast, so even if the pcpu_freelist is much
large a certain time instant all the elements could be used.

Is this really an important issue?
Any idea of how to solve it?

It is important issue indeed and a difficult one to solve.
We have the same issue with hash map.
If the prog is doing:
value = lookup(key);
delete(key);
// here the prog shouldn't be accessing the value anymore, since the memory
// could have been reused, but value pointer is still valid and points to
// allocated memory
Just to notice that for the queue map it is a little bit worse because 
there isn't a way to mark an element to be reused, hence in some cases 
the pool of free elements could be exhausted.

bpf_map_pop_elem() is trying to do lookup_and_delete and preserve
validity of value without races.
With pcpu_freelist I don't think there is a solution.
We can have this queue/stack map without prealloc and use kmalloc/kfree
back and forth. Performance will not be as great, but for your use case,
I suspect, it will be good enough.
I agree, for our use case we are not that worried about the performance, 
it is still in the dataplane but let's say it is not in the "hot" path.

The key issue with kmalloc/kfree is unbounded time of rcu callbacks.
If somebody starts doing push/pop for every packet, the rcu subsystem
will struggle and nothing we can do about it.

The only way I could think of to resolve this problem is to reuse
the logic that Joe is working on for socket lookups inside the program.
Joe,
how is that going? Could you repost the latest patches?

In such case the api for stack map will look like:

elem = bpf_map_pop_elem(stack);
// access elem
bpf_map_free_elem(elem);
// here prog is not allowed to access elem and verifier will catch that

elem = bpf_map_alloc_elem(stack);
// populate elem
bpf_map_push_elem(elem);
// here prog is not allowed to access elem and verifier will catch that

Then both pre-allocated elems and kmalloc/kfree will work fine
and no unbounded rcu issues in both cases.




I read the Joe's proposal and using that for this problem looks like a 
nice solution.


I think a good trade-off for now would be to go ahead with a queue/stack 
map without preallocating support (or maybe include it having always in 
mind that this issue has to be solved in the near future) and then, as a 
separated work, try to use Joe's proposal in the map helpers.


What do you think?

Thanks,
Mauricio.


Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()

2018-09-07 Thread Wolfgang Walter
Hello Steffen,

in one of your emails to Thomas you wrote:
> xfrm_lookup+0x2a is at the very beginning of xfrm_lookup(), here we
> find:
> 
> u16 family = dst_orig->ops->family;
> 
> ops has an offset of 32 bytes (20 hex) in dst_orig, so looks like
> dst_orig is NULL.
> 
> In the forwarding case, we get dst_orig from the skb and dst_orig
> can't be NULL here unless the skb itself is already fishy.

Is this really true?

If xfrm_lookup is called from 

__xfrm_route_forward():

int __xfrm_route_forward(struct sk_buff *skb, unsigned short family)
{
struct net *net = dev_net(skb->dev);
struct flowi fl;
struct dst_entry *dst;
int res = 1;

if (xfrm_decode_session(skb, &fl, family) < 0) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMFWDHDRERROR);
return 0;
}

skb_dst_force(skb);

dst = xfrm_lookup(net, skb_dst(skb), &fl, NULL, XFRM_LOOKUP_QUEUE);
if (IS_ERR(dst)) {
res = 0;
dst = NULL;
}
skb_dst_set(skb, dst);
return res;
}

couldn't it be possible that skb_dst_force(skb) actually sets dst to NULL if 
it cannot safely lock it? If it is absolutely sure that skb_dst_force() never 
can set dst to NULL I wonder why it is called at all?


Here is  skb_dst_force()

static inline void skb_dst_force(struct sk_buff *skb)
{
if (skb_dst_is_noref(skb)) {
struct dst_entry *dst = skb_dst(skb);

WARN_ON(!rcu_read_lock_held());
if (!dst_hold_safe(dst))
dst = NULL;

skb->_skb_refdst = (unsigned long)dst;
}
}

and dst_hold_safe() is

static inline bool dst_hold_safe(struct dst_entry *dst)
{
return atomic_inc_not_zero(&dst->__refcnt);
}



Am Freitag, 7. September 2018, 22:22:39 schrieb Wolfgang Walter:
> Am Freitag, 31. August 2018, 08:50:24 schrieb Steffen Klassert:
> > On Thu, Aug 30, 2018 at 08:53:50PM +0200, Wolfgang Walter wrote:
> > > Hello,
> > > 
> > > kernels > 4.12 do not work on one of our main routers. They crash as
> > > soon
> > > as ipsec-tunnels are configured and ipsec-traffic actually flows.
> > 
> > Can you please send the backtrace of this crash?
> 
> I bootet the b838d5e1c5b6e57b10ec8af2268824041e3ea911 several times but I
> could not record the complete trace. I think I have to log to the serial
> console but I can't do that before next week.
> 
> 
> What I could record ist:
> 
> There is a always
>... 
> the callrace.
> 
> This is the part I could see:
> 
> 
> irq_exit+0x71/0x80
> do_IRQ+0x4d/0xd0
> common_interrup+07a/0x7a
> 
> RIP: 010:cpuidle_enter_state+0x11d/0x200
> RSP: 0018:c9000321bee0 EFLAGS: 0282 ORIG_RAX: ffc4
> RAX: 88085efde450 RBX: 0004 RCX: 0003c9e63c13
> RDX: 0003c9e63c13 RSI: ffb03103fe35ac43 RDI: 
> RBP: e87cf600 R08: 000c R09: 0004
> R10: 0400 R11: 0003c99e56fc R12: 0003c9e63c13
> R13: 0003c9da9567 R14: 0004 R15: 822763e0
> do_idle+0xd3/0x160
> cpu_startup_entry+0x14/0x20
> secondary_startup_64+0xa5/0xb0
> Code: 00 0f b7 83 c0 00 00 00 80 7c 02 08 01 0f 86 d3 02 00 00 41
> 8b 8c 24 3c 10 00 00 48 8b 6b 58 85 c9 0f 84 2f 01 00 00 48 83 e5 fe  45
> 60
> 02 0f 84 4e 01 00 00 f6 43 38 01 74 0d 80 00 bd ab 00 00
> RIP: ip_forward+0xd4/0x470 RSP: 88085efc3cb0
> CR2: 0060
> [ end trace 7205b53c25b7b35a ]---
> Kernel panic - not syncing: Fatal exception in interrupt
> Kernel Offset: disabled
> Rebooting in 60 seconds..
> 
> 
> I got an email from Tobias Hommel and I think it is the same problem.
> 
> It is very clear that it is the difference from
> 
>   ipv4: call dst_hold_safe() properly
> 
> to
> 
>   ipv4: mark DST_NOGC and remove the operation of dst_free()
> 
> which triggers this bug.
> 
> Regards,

Regards
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts


[net-next] i40e(vf): remove i40e_ethtool_stats.h header file

2018-09-07 Thread Jacob Keller
Essentially reverts commit 8fd75c58a09a ("i40e: move ethtool
stats boiler plate code to i40e_ethtool_stats.h", 2018-08-30), and
additionally moves the similar code in i40evf into i40evf_ethtool.c.

The code was intially moved from i40e_ethtool.c into i40e_ethtool_stats.h
as a way of better logically organizing the code. This has two problems.
First, we can't have an inline function with variadic arguments on all
platforms. Second, it gave the appearance that we had plans to share
code between the i40e and i40evf drivers, due to having a near copy of
the contents in the i40evf/i40e_ethtool_stats.h file.

Patches which actually attempt to combine or share code between the i40e
and i40evf drivers have not materialized, and are likely a ways off.

Rather than fixing the one function which causes build issues, just move
this code back into the i40e_ethtool.c and i40evf_ethtool.c files. Note
that we also change these functions back from static inlines to just
statics, since they're no longer in a header file.

We can revisit this if/when work is done to actually attempt to share
code between drivers. Alternatively, this stats code could be made more
generic so that it can be shared across drivers as part of ethtool
kernel work.

Signed-off-by: Jacob Keller 
---
Sorry aboutthe delay, our iwl queue maintainer Jeff is on vacation, and
discussion about how best to resolve this issue was/is ongoing in the
IWL list.

I opted to just move the contents of these files back into
i40e_ethtool.c and i40evf_ethtool.c respectively, as I think it's better
than only moving the single offending function back into the .c file.
This is different from the approach suggested by Wang, Dongsheng
 on Intel Wired LAN, but I think it's a
better approach for now.

I've also kindly asked if Aaron Brown could test this out.

 .../net/ethernet/intel/i40e/i40e_ethtool.c| 219 -
 .../ethernet/intel/i40e/i40e_ethtool_stats.h  | 221 --
 .../intel/i40evf/i40e_ethtool_stats.h | 221 --
 .../ethernet/intel/i40evf/i40evf_ethtool.c| 219 -
 4 files changed, 436 insertions(+), 444 deletions(-)
 delete mode 100644 drivers/net/ethernet/intel/i40e/i40e_ethtool_stats.h
 delete mode 100644 drivers/net/ethernet/intel/i40evf/i40e_ethtool_stats.h

diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index d7d3974beca2..87fe2e60602f 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -6,7 +6,224 @@
 #include "i40e.h"
 #include "i40e_diag.h"
 
-#include "i40e_ethtool_stats.h"
+/* ethtool statistics helpers */
+
+/**
+ * struct i40e_stats - definition for an ethtool statistic
+ * @stat_string: statistic name to display in ethtool -S output
+ * @sizeof_stat: the sizeof() the stat, must be no greater than sizeof(u64)
+ * @stat_offset: offsetof() the stat from a base pointer
+ *
+ * This structure defines a statistic to be added to the ethtool stats buffer.
+ * It defines a statistic as offset from a common base pointer. Stats should
+ * be defined in constant arrays using the I40E_STAT macro, with every element
+ * of the array using the same _type for calculating the sizeof_stat and
+ * stat_offset.
+ *
+ * The @sizeof_stat is expected to be sizeof(u8), sizeof(u16), sizeof(u32) or
+ * sizeof(u64). Other sizes are not expected and will produce a WARN_ONCE from
+ * the i40e_add_ethtool_stat() helper function.
+ *
+ * The @stat_string is interpreted as a format string, allowing formatted
+ * values to be inserted while looping over multiple structures for a given
+ * statistics array. Thus, every statistic string in an array should have the
+ * same type and number of format specifiers, to be formatted by variadic
+ * arguments to the i40e_add_stat_string() helper function.
+ **/
+struct i40e_stats {
+   char stat_string[ETH_GSTRING_LEN];
+   int sizeof_stat;
+   int stat_offset;
+};
+
+/* Helper macro to define an i40e_stat structure with proper size and type.
+ * Use this when defining constant statistics arrays. Note that @_type expects
+ * only a type name and is used multiple times.
+ */
+#define I40E_STAT(_type, _name, _stat) { \
+   .stat_string = _name, \
+   .sizeof_stat = FIELD_SIZEOF(_type, _stat), \
+   .stat_offset = offsetof(_type, _stat) \
+}
+
+/* Helper macro for defining some statistics directly copied from the netdev
+ * stats structure.
+ */
+#define I40E_NETDEV_STAT(_net_stat) \
+   I40E_STAT(struct rtnl_link_stats64, #_net_stat, _net_stat)
+
+/* Helper macro for defining some statistics related to queues */
+#define I40E_QUEUE_STAT(_name, _stat) \
+   I40E_STAT(struct i40e_ring, _name, _stat)
+
+/* Stats associated with a Tx or Rx ring */
+static const struct i40e_stats i40e_gstrings_queue_stats[] = {
+   I40E_QUEUE_STAT("%s-%u.packets", stats.packets),
+   I40E_QUEUE_STAT("%s-%u.bytes", stats.bytes),
+};
+
+/*

Re: [net-next] i40e(vf): remove i40e_ethtool_stats.h header file

2018-09-07 Thread David Miller
From: Jacob Keller 
Date: Fri,  7 Sep 2018 14:55:44 -0700

> Essentially reverts commit 8fd75c58a09a ("i40e: move ethtool
> stats boiler plate code to i40e_ethtool_stats.h", 2018-08-30), and
> additionally moves the similar code in i40evf into i40evf_ethtool.c.
> 
> The code was intially moved from i40e_ethtool.c into i40e_ethtool_stats.h
> as a way of better logically organizing the code. This has two problems.
> First, we can't have an inline function with variadic arguments on all
> platforms. Second, it gave the appearance that we had plans to share
> code between the i40e and i40evf drivers, due to having a near copy of
> the contents in the i40evf/i40e_ethtool_stats.h file.
> 
> Patches which actually attempt to combine or share code between the i40e
> and i40evf drivers have not materialized, and are likely a ways off.
> 
> Rather than fixing the one function which causes build issues, just move
> this code back into the i40e_ethtool.c and i40evf_ethtool.c files. Note
> that we also change these functions back from static inlines to just
> statics, since they're no longer in a header file.
> 
> We can revisit this if/when work is done to actually attempt to share
> code between drivers. Alternatively, this stats code could be made more
> generic so that it can be shared across drivers as part of ethtool
> kernel work.
> 
> Signed-off-by: Jacob Keller 

Applied and after the build checks out will be pushed to net-next.

Thanks.


[PATCH net] netfilter: bridge: Don't sabotage nf_hook calls from an l3mdev

2018-09-07 Thread dsahern
From: David Ahern 

For starters, the bridge netfilter code registers operations that
are invoked any time nh_hook is called. Specifically, ip_sabotage_in
watches for nested calls for NF_INET_PRE_ROUTING when a bridge is in
the stack.

Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing
allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for
NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish,
then resets in_prerouting flag to 0 and the packet continues up the
stack. The packet eventually makes it to the VRF driver and it invokes
nf_hook for NF_INET_PRE_ROUTING in case any rules have been added against
the vrf device.

Because of the registered operations the call to nf_hook causes
ip_sabotage_in to be invoked. That function sees the nf_bridge on the
skb and that in_prerouting is not set. Thinking it is an invalid nested
call it steals (drops) the packet.

Update ip_sabotage_in to recognize that the bridge or one of its upper
devices (e.g., vlan) can be enslaved to a VRF (L3 master device) and
allow the packet to go through the nf_hook a second time.

Fixes: 73e20b761acf ("net: vrf: Add support for PREROUTING rules on vrf device")
Reported-by: D'Souza, Nelson 
Signed-off-by: David Ahern 
---
 net/bridge/br_netfilter_hooks.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/bridge/br_netfilter_hooks.c b/net/bridge/br_netfilter_hooks.c
index 6e0dc6bcd32a..37278dc280eb 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv,
   struct sk_buff *skb,
   const struct nf_hook_state *state)
 {
-   if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) {
+   if (skb->nf_bridge && !skb->nf_bridge->in_prerouting &&
+   !netif_is_l3_master(skb->dev)) {
state->okfn(state->net, state->sk, skb);
return NF_STOLEN;
}
-- 
2.11.0



Re: GPL compliance issue with liquidio/lio_23xx_vsw.bin firmware

2018-09-07 Thread Felix Manlunas
On Wed, Aug 29, 2018 at 09:26:35PM +0100, Ben Hutchings wrote:
> Date: Wed, 29 Aug 2018 21:26:35 +0100
> From: Ben Hutchings 
> To: Felix Manlunas , Florian Weimer
>  
> CC: linux-firmw...@kernel.org, "netdev@vger.kernel.org"
>  , Derek Chickles
>  , Satanand Burla
>  , Felix Manlunas
>  , Raghu Vatsavayi
>  , Manish Awasthi
>  , manojkumar.panic...@cavium.com
> Subject: Re: GPL compliance issue with liquidio/lio_23xx_vsw.bin firmware
> User-Agent: Evolution 3.29.92-1 
> 
> On Mon, 2018-08-27 at 17:04 -0700, Felix Manlunas wrote:
> > On Mon, Aug 27, 2018 at 05:01:10PM +0200, Florian Weimer wrote:
> > > liquidio/lio_23xx_vsw.bin contains a compiled MIPS Linux kernel:
> > > 
> > > $ tail --bytes=+1313 liquidio/lio_23xx_vsw.bin > elf
> > > $ readelf -aW elf
> > > […]
> > >   [ 6] __ksymtab PROGBITS80e495f8 64a5f8 00d130
> > > 00   A  0   0  8
> > >   [ 7] __ksymtab_gpl PROGBITS80e56728 657728 008400
> > > 00   A  0   0  8
> > >   [ 8] __ksymtab_strings PROGBITS80e5eb28 65fb28 018868
> > > 00   A  0   0  1
> > > […]
> > > Symbol table '.symtab' contains 1349 entries:
> > >Num:Value  Size TypeBind   Vis  Ndx Name
> > >  0:  0 NOTYPE  LOCAL  DEFAULT  UND
> > >  1:  0 FILELOCAL  DEFAULT  ABS
> > > arch/mips/kernel/head.o
> > >  2:  0 FILELOCAL  DEFAULT  ABS init/main.c
> > >  3:  0 FILELOCAL  DEFAULT  ABS
> > > include/linux/types.h
> > > […]
> > > 
> > > Yet there is no corresponding source provided, and LICENCE.cavium lacks
> > > the required notices.
> > > 
> > > Thanks,
> > > Florian
> > 
> > Cavium apologizes for the oversight.  Cavium has been advertising the
> > appropriate license terms including the existence of GPL in the firmware
> > in our outbox releases. We will update the license terms in LICENCE.cavium
> > in our upstream contribution in collaboration with our legal team.
> 
> Everything added to linux-firmware.git needs to be safe for Linux
> distributions to redistribute.  (There are some ancient firmware images
> with unclear licensing, but we don't want to add any more.)
> 
> My understanding is that GPL v2 requires that commercial distributors
> either provide the complete and corresponding source alongside the
> binaries, or include a written offer to provide it later.  They
> *cannot* simply point to your offer to do this (only non-commercial
> distributors can do that).  So the source needs to be published too.
> 
> Adding the complete Linux kernel source code to linux-firmware.git
> doesn't seem like a sensible step, so maybe this particular firmware
> needs to live elsewhere.
> 
> Ben.

Apologies for the delay. We're committed to provide sources for the GPL
components of LiquidIO firmware.

After Cavium's recent merger with Marvell, we are revisiting our Licensing
and expect to share the updated license soon. Since keeping Linux kernel
source code in linux-firmware.git doesn't make sense to us too, we're also
working internally on identifying the appropriate mechanism to share those
sources when requested.  That said, it is our opinion that for all
practical purposes (for example, distribution of driver + firmware) the
firmware binary for LiquidIO adapters should continue to reside in the
linux-firmware.git as long as all the prerequisites for sharing the
firmware are met.  In the case of liquidio_23xx_vsw.bin, this would mean an
additional license type in the LICENCE.cavium file for this firmware
including a reference to the accompanying sources for the Linux kernel
bundled in the firmware.

As we understand, there is no precedent in the linux-firmware.git for
device firmware that include a Linux kernel so we also request comments
from community on the best ways to support the LiquidIO firmware and
similar firmwares that may appear in future.

Cavium LiquidIO Team


Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge

2018-09-07 Thread D'Souza, Nelson
Thanks David and Ido, for finding the root-cause for bridge Rx packets getting 
dropped, also for coming up with a patch.

Regards,
Nelson

On 9/7/18, 9:09 AM, "David Ahern"  wrote:

On 9/7/18 9:56 AM, D'Souza, Nelson wrote:

> 
> *From:* David Ahern 
> *Sent:* Thursday, September 6, 2018 5:27 PM
> *To:* D'Souza, Nelson; netdev@vger.kernel.org
> *Subject:* Re: [**EXTERNAL**] Re: VRF with enslaved L3 enabled bridge
>  
> On 9/5/18 12:00 PM, D'Souza, Nelson wrote:
>> Just following up would you be able to confirm that this is a
> Linux VRF issue?
> 
> I can confirm that I can reproduce the problem. Need to find time to dig
> into it.

bridge's netfilter hook is dropping the packet.

bridge's netfilter code registers hook operations that are invoked when
nh_hook is called. It then sees all subsequent calls to nf_hook.

Packet wise, the bridge netfilter hook runs first. br_nf_pre_routing
allocates nf_bridge, sets in_prerouting to 1 and calls NF_HOOK for
NF_INET_PRE_ROUTING. It's finish function, br_nf_pre_routing_finish,
then resets in_prerouting flag to 0. Any subsequent calls to nf_hook
invoke ip_sabotage_in. That function sees in_prerouting is not
set and steals (drops) the packet.

The simplest change is to have ip_sabotage_in recognize that the bridge
can be enslaved to a VRF (L3 master device) and allow the packet to
continue.

Thanks to Ido for the hint on ip_sabotage_in.

This patch works for me:

diff --git a/net/bridge/br_netfilter_hooks.c
b/net/bridge/br_netfilter_hooks.c
index 6e0dc6bcd32a..37278dc280eb 100644
--- a/net/bridge/br_netfilter_hooks.c
+++ b/net/bridge/br_netfilter_hooks.c
@@ -835,7 +835,8 @@ static unsigned int ip_sabotage_in(void *priv,
   struct sk_buff *skb,
   const struct nf_hook_state *state)
 {
-   if (skb->nf_bridge && !skb->nf_bridge->in_prerouting) {
+   if (skb->nf_bridge && !skb->nf_bridge->in_prerouting &&
+   !netif_is_l3_master(skb->dev)) {
state->okfn(state->net, state->sk, skb);
return NF_STOLEN;
}




[bpf-next, v2 1/3] flow_dissector: implements flow dissector BPF hook

2018-09-07 Thread Petar Penkov
From: Petar Penkov 

Adds a hook for programs of type BPF_PROG_TYPE_FLOW_DISSECTOR and
attach type BPF_FLOW_DISSECTOR that is executed in the flow dissector
path. The BPF program is per-network namespace.

Signed-off-by: Petar Penkov 
Signed-off-by: Willem de Bruijn 
---
 include/linux/bpf.h|   1 +
 include/linux/bpf_types.h  |   1 +
 include/linux/skbuff.h |   7 ++
 include/net/net_namespace.h|   3 +
 include/net/sch_generic.h  |  12 ++-
 include/uapi/linux/bpf.h   |  25 ++
 kernel/bpf/syscall.c   |   8 ++
 kernel/bpf/verifier.c  |  32 
 net/core/filter.c  |  67 
 net/core/flow_dissector.c  | 136 +
 tools/bpf/bpftool/prog.c   |   1 +
 tools/include/uapi/linux/bpf.h |  25 ++
 tools/lib/bpf/libbpf.c |   2 +
 13 files changed, 317 insertions(+), 3 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index 523481a3471b..988a00797bcd 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -212,6 +212,7 @@ enum bpf_reg_type {
PTR_TO_PACKET_META,  /* skb->data - meta_len */
PTR_TO_PACKET,   /* reg points to skb->data */
PTR_TO_PACKET_END,   /* skb->data + headlen */
+   PTR_TO_FLOW_KEYS,/* reg points to bpf_flow_keys */
 };
 
 /* The information passed from prog-specific *_is_valid_access
diff --git a/include/linux/bpf_types.h b/include/linux/bpf_types.h
index cd26c090e7c0..22083712dd18 100644
--- a/include/linux/bpf_types.h
+++ b/include/linux/bpf_types.h
@@ -32,6 +32,7 @@ BPF_PROG_TYPE(BPF_PROG_TYPE_LIRC_MODE2, lirc_mode2)
 #ifdef CONFIG_INET
 BPF_PROG_TYPE(BPF_PROG_TYPE_SK_REUSEPORT, sk_reuseport)
 #endif
+BPF_PROG_TYPE(BPF_PROG_TYPE_FLOW_DISSECTOR, flow_dissector)
 
 BPF_MAP_TYPE(BPF_MAP_TYPE_ARRAY, array_map_ops)
 BPF_MAP_TYPE(BPF_MAP_TYPE_PERCPU_ARRAY, percpu_array_map_ops)
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 17a13e4785fc..ce0e863f02a2 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -243,6 +243,8 @@ struct scatterlist;
 struct pipe_inode_info;
 struct iov_iter;
 struct napi_struct;
+struct bpf_prog;
+union bpf_attr;
 
 #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
 struct nf_conntrack {
@@ -1192,6 +1194,11 @@ void skb_flow_dissector_init(struct flow_dissector 
*flow_dissector,
 const struct flow_dissector_key *key,
 unsigned int key_count);
 
+int skb_flow_dissector_bpf_prog_attach(const union bpf_attr *attr,
+  struct bpf_prog *prog);
+
+int skb_flow_dissector_bpf_prog_detach(const union bpf_attr *attr);
+
 bool __skb_flow_dissect(const struct sk_buff *skb,
struct flow_dissector *flow_dissector,
void *target_container,
diff --git a/include/net/net_namespace.h b/include/net/net_namespace.h
index 9b5fdc50519a..99d4148e0f90 100644
--- a/include/net/net_namespace.h
+++ b/include/net/net_namespace.h
@@ -43,6 +43,7 @@ struct ctl_table_header;
 struct net_generic;
 struct uevent_sock;
 struct netns_ipvs;
+struct bpf_prog;
 
 
 #define NETDEV_HASHBITS8
@@ -145,6 +146,8 @@ struct net {
 #endif
struct net_generic __rcu*gen;
 
+   struct bpf_prog __rcu   *flow_dissector_prog;
+
/* Note : following structs are cache line aligned */
 #ifdef CONFIG_XFRM
struct netns_xfrm   xfrm;
diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
index a6d00093f35e..1b81ba85fd2d 100644
--- a/include/net/sch_generic.h
+++ b/include/net/sch_generic.h
@@ -19,6 +19,7 @@ struct Qdisc_ops;
 struct qdisc_walker;
 struct tcf_walker;
 struct module;
+struct bpf_flow_keys;
 
 typedef int tc_setup_cb_t(enum tc_setup_type type,
  void *type_data, void *cb_priv);
@@ -307,9 +308,14 @@ struct tcf_proto {
 };
 
 struct qdisc_skb_cb {
-   unsigned intpkt_len;
-   u16 slave_dev_queue_mapping;
-   u16 tc_classid;
+   union {
+   struct {
+   unsigned intpkt_len;
+   u16 slave_dev_queue_mapping;
+   u16 tc_classid;
+   };
+   struct bpf_flow_keys *flow_keys;
+   };
 #define QDISC_CB_PRIV_LEN 20
unsigned char   data[QDISC_CB_PRIV_LEN];
 };
diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h
index 66917a4eba27..3064706fcaaa 100644
--- a/include/uapi/linux/bpf.h
+++ b/include/uapi/linux/bpf.h
@@ -152,6 +152,7 @@ enum bpf_prog_type {
BPF_PROG_TYPE_LWT_SEG6LOCAL,
BPF_PROG_TYPE_LIRC_MODE2,
BPF_PROG_TYPE_SK_REUSEPORT,
+   BPF_PROG_TYPE_FLOW_DISSECTOR,
 };
 
 enum bpf_attach_type {
@@ -172,6 +173,7 @@ enum bpf_attach_type {
BPF_CGROUP_UDP4_SENDMSG,
BPF_CGROUP_UDP6_SEN

[bpf-next, v2 0/3] Introduce eBPF flow dissector

2018-09-07 Thread Petar Penkov
From: Petar Penkov 

This patch series hardens the RX stack by allowing flow dissection in BPF,
as previously discussed [1]. Because of the rigorous checks of the BPF
verifier, this provides significant security guarantees. In particular, the
BPF flow dissector cannot get inside of an infinite loop, as with
CVE-2013-4348, because BPF programs are guaranteed to terminate. It cannot
read outside of packet bounds, because all memory accesses are checked.
Also, with BPF the administrator can decide which protocols to support,
reducing potential attack surface. Rarely encountered protocols can be
excluded from dissection and the program can be updated without kernel
recompile or reboot if a bug is discovered.

Patch 1 adds infrastructure to execute a BPF program in __skb_flow_dissect.
This includes a new BPF program and attach type.

Patch 2 adds a flow dissector program in BPF. This parses most protocols in
__skb_flow_dissect in BPF for a subset of flow keys (basic, control, ports,
and address types).

Patch 3 adds a selftest that attaches the BPF program to the flow dissector
and sends traffic with different levels of encapsulation.

Performance Evaluation:
The in-kernel implementation was compared against the demo program from
patch 2 using the test in patch 3 with IPv4/UDP traffic over 10 seconds.
$perf record -a -C 4 taskset -c 4 ./test_flow_dissector -i 4 -f 8 \
-t 10

In-kernel Dissector:
__skb_flow_dissect overhead: 2.12%
Total Packets: 3,272,597 (from output of ./test_flow_dissector)

BPF Dissector:
__skb_flow_dissect overhead: 1.63% 
Total Packets: 3,232,356 (from output of ./test_flow_dissector)

No-op BPF Dissector:
__skb_flow_dissect overhead: 1.52% 
Total Packets: 3,330,635 (from output of ./test_flow_dissector)

Changes since v1:
1/ (Patch 1) LD_ABS instructions now disallowed for the new BPF prog type 
2/ (Patch 1) now checks if skb is NULL in __skb_flow_dissect()
3/ (Patch 1) fixed incorrect accesses in flow_dissector_is_valid_access()
- writes to the flow_keys field now disallowed
- reads/writes to tc_classid and data_meta now disallowed 
4/ (Patch 2) headers pulled with bpf_skb_load_data if direct access fails 

Changes since RFC:
1/ (Patch 1) Flow dissector hook changed from global to per-netns
2/ (Patch 1) Defined struct bpf_flow_keys to be used in BPF flow dissector
programs instead of exposing the internal flow keys layout. Added a
function to translate from bpf_flow_keys to the internal layout after BPF
dissection is complete. The pointer to this struct is stored in
qdisc_skb_cb rather than inside of the 20 byte control block which
simplifies verification and allows access to all 20 bytes of the cb.
3/ (Patch 2) Removed GUE parsing as it relied on a hardcoded port
4/ (Patch 2) MPLS parsing now stops at the first label which is consistent
with the in-kernel flow dissector
5/ (Patch 2) Refactored to use direct packet access and to write out to
struct bpf_flow_keys

[1] http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf

Petar Penkov (3):
  flow_dissector: implements flow dissector BPF hook
  flow_dissector: implements eBPF parser
  selftests/bpf: test bpf flow dissection

 include/linux/bpf.h   |   1 +
 include/linux/bpf_types.h |   1 +
 include/linux/skbuff.h|   7 +
 include/net/net_namespace.h   |   3 +
 include/net/sch_generic.h |  12 +-
 include/uapi/linux/bpf.h  |  25 +
 kernel/bpf/syscall.c  |   8 +
 kernel/bpf/verifier.c |  32 +
 net/core/filter.c |  67 ++
 net/core/flow_dissector.c | 136 +++
 tools/bpf/bpftool/prog.c  |   1 +
 tools/include/uapi/linux/bpf.h|  25 +
 tools/lib/bpf/libbpf.c|   2 +
 tools/testing/selftests/bpf/.gitignore|   2 +
 tools/testing/selftests/bpf/Makefile  |   8 +-
 tools/testing/selftests/bpf/bpf_flow.c| 386 +
 tools/testing/selftests/bpf/config|   1 +
 .../selftests/bpf/flow_dissector_load.c   | 140 
 .../selftests/bpf/test_flow_dissector.c   | 782 ++
 .../selftests/bpf/test_flow_dissector.sh  | 115 +++
 tools/testing/selftests/bpf/with_addr.sh  |  54 ++
 tools/testing/selftests/bpf/with_tunnels.sh   |  36 +
 22 files changed, 1838 insertions(+), 6 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/bpf_flow.c
 create mode 100644 tools/testing/selftests/bpf/flow_dissector_load.c
 create mode 100644 tools/testing/selftests/bpf/test_flow_dissector.c
 create mode 100755 tools/testing/selftests/bpf/test_flow_dissector.sh
 create mode 100755 tools/testing/selftests/bpf/with_addr.sh
 create mode 100755 tools/testing/selftests/bpf/with_tunnels.sh

-- 
2.19.0.rc2.392.g5ba43deb5a-goog



[bpf-next, v2 2/3] flow_dissector: implements eBPF parser

2018-09-07 Thread Petar Penkov
From: Petar Penkov 

This eBPF program extracts basic/control/ip address/ports keys from
incoming packets. It supports recursive parsing for IP encapsulation,
and VLAN, along with IPv4/IPv6 and extension headers.  This program is
meant to show how flow dissection and key extraction can be done in
eBPF.

Link: http://vger.kernel.org/netconf2017_files/rx_hardening_and_udp_gso.pdf
Signed-off-by: Petar Penkov 
Signed-off-by: Willem de Bruijn 
---
 tools/testing/selftests/bpf/Makefile   |   2 +-
 tools/testing/selftests/bpf/bpf_flow.c | 386 +
 2 files changed, 387 insertions(+), 1 deletion(-)
 create mode 100644 tools/testing/selftests/bpf/bpf_flow.c

diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index fff7fb1285fc..e65f50f9185e 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -35,7 +35,7 @@ TEST_GEN_FILES = test_pkt_access.o test_xdp.o test_l4lb.o 
test_tcp_estats.o test
test_get_stack_rawtp.o test_sockmap_kern.o test_sockhash_kern.o \
test_lwt_seg6local.o sendmsg4_prog.o sendmsg6_prog.o 
test_lirc_mode2_kern.o \
get_cgroup_id_kern.o socket_cookie_prog.o test_select_reuseport_kern.o \
-   test_skb_cgroup_id_kern.o
+   test_skb_cgroup_id_kern.o bpf_flow.o
 
 # Order correspond to 'make run_tests' order
 TEST_PROGS := test_kmod.sh \
diff --git a/tools/testing/selftests/bpf/bpf_flow.c 
b/tools/testing/selftests/bpf/bpf_flow.c
new file mode 100644
index ..f1e33a574841
--- /dev/null
+++ b/tools/testing/selftests/bpf/bpf_flow.c
@@ -0,0 +1,386 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "bpf_helpers.h"
+#include "bpf_endian.h"
+
+int _version SEC("version") = 1;
+#define PROG(F) SEC(#F) int bpf_func_##F
+
+/* These are the identifiers of the BPF programs that will be used in tail
+ * calls. Name is limited to 16 characters, with the terminating character and
+ * bpf_func_ above, we have only 6 to work with, anything after will be 
cropped.
+ */
+enum {
+   IP,
+   IPV6,
+   IPV6OP, /* Destination/Hop-by-Hop Options IPv6 Extension header */
+   IPV6FR, /* Fragmentation IPv6 Extension Header */
+   MPLS,
+   VLAN,
+};
+
+#define IP_MF  0x2000
+#define IP_OFFSET  0x1FFF
+#define IP6_MF 0x0001
+#define IP6_OFFSET 0xFFF8
+
+struct vlan_hdr {
+   __be16 h_vlan_TCI;
+   __be16 h_vlan_encapsulated_proto;
+};
+
+struct gre_hdr {
+   __be16 flags;
+   __be16 proto;
+};
+
+struct frag_hdr {
+   __u8 nexthdr;
+   __u8 reserved;
+   __be16 frag_off;
+   __be32 identification;
+};
+
+struct bpf_map_def SEC("maps") jmp_table = {
+   .type = BPF_MAP_TYPE_PROG_ARRAY,
+   .key_size = sizeof(__u32),
+   .value_size = sizeof(__u32),
+   .max_entries = 8
+};
+
+struct bpf_dissect_cb {
+   __u16 nhoff;
+};
+
+static __always_inline void *bpf_flow_dissect_get_header(struct __sk_buff *skb,
+__u16 hdr_size,
+void *buffer)
+{
+   struct bpf_dissect_cb *cb = (struct bpf_dissect_cb *)(skb->cb);
+   void *data_end = (__u8 *)(long)skb->data_end;
+   void *data = (__u8 *)(long)skb->data;
+   __u8 *hdr;
+
+   /* Verifies this variable offset does not overflow */
+   if (cb->nhoff > (USHRT_MAX - hdr_size))
+   return NULL;
+
+   hdr = data + cb->nhoff;
+   if (hdr + hdr_size <= data_end)
+   return hdr;
+
+   if (bpf_skb_load_bytes(skb, cb->nhoff, buffer, hdr_size))
+   return NULL;
+
+   return buffer;
+}
+
+/* Dispatches on ETHERTYPE */
+static __always_inline int parse_eth_proto(struct __sk_buff *skb, __be16 proto)
+{
+   struct bpf_flow_keys *keys = (struct bpf_flow_keys 
*)(long)(skb->flow_keys);
+
+   keys->n_proto = proto;
+   switch (proto) {
+   case bpf_htons(ETH_P_IP):
+   bpf_tail_call(skb, &jmp_table, IP);
+   break;
+   case bpf_htons(ETH_P_IPV6):
+   bpf_tail_call(skb, &jmp_table, IPV6);
+   break;
+   case bpf_htons(ETH_P_MPLS_MC):
+   case bpf_htons(ETH_P_MPLS_UC):
+   bpf_tail_call(skb, &jmp_table, MPLS);
+   break;
+   case bpf_htons(ETH_P_8021Q):
+   case bpf_htons(ETH_P_8021AD):
+   bpf_tail_call(skb, &jmp_table, VLAN);
+   break;
+   default:
+   /* Protocol not supported */
+   return BPF_DROP;
+   }
+
+   return BPF_DROP;
+}
+
+SEC("dissect")
+int dissect(struct __sk_buff *skb)
+{
+   if (!skb->vlan_present)
+   return parse_eth_proto(skb, skb->protocol);
+   else
+   return parse_eth_proto(skb, skb->v

[bpf-next, v2 3/3] selftests/bpf: test bpf flow dissection

2018-09-07 Thread Petar Penkov
From: Petar Penkov 

Adds a test that sends different types of packets over multiple
tunnels and verifies that valid packets are dissected correctly.  To do
so, a tc-flower rule is added to drop packets on UDP src port 9, and
packets are sent from ports 8, 9, and 10. Only the packets on port 9
should be dropped. Because tc-flower relies on the flow dissector to
match flows, correct classification demonstrates correct dissection.

Also add support logic to load the BPF program and to inject the test
packets.

Signed-off-by: Petar Penkov 
Signed-off-by: Willem de Bruijn 
---
 tools/testing/selftests/bpf/.gitignore|   2 +
 tools/testing/selftests/bpf/Makefile  |   6 +-
 tools/testing/selftests/bpf/config|   1 +
 .../selftests/bpf/flow_dissector_load.c   | 140 
 .../selftests/bpf/test_flow_dissector.c   | 782 ++
 .../selftests/bpf/test_flow_dissector.sh  | 115 +++
 tools/testing/selftests/bpf/with_addr.sh  |  54 ++
 tools/testing/selftests/bpf/with_tunnels.sh   |  36 +
 8 files changed, 1134 insertions(+), 2 deletions(-)
 create mode 100644 tools/testing/selftests/bpf/flow_dissector_load.c
 create mode 100644 tools/testing/selftests/bpf/test_flow_dissector.c
 create mode 100755 tools/testing/selftests/bpf/test_flow_dissector.sh
 create mode 100755 tools/testing/selftests/bpf/with_addr.sh
 create mode 100755 tools/testing/selftests/bpf/with_tunnels.sh

diff --git a/tools/testing/selftests/bpf/.gitignore 
b/tools/testing/selftests/bpf/.gitignore
index 4d789c1e5167..8a60c9b9892d 100644
--- a/tools/testing/selftests/bpf/.gitignore
+++ b/tools/testing/selftests/bpf/.gitignore
@@ -23,3 +23,5 @@ test_skb_cgroup_id_user
 test_socket_cookie
 test_cgroup_storage
 test_select_reuseport
+test_flow_dissector
+flow_dissector_load
diff --git a/tools/testing/selftests/bpf/Makefile 
b/tools/testing/selftests/bpf/Makefile
index e65f50f9185e..fd3851d5c079 100644
--- a/tools/testing/selftests/bpf/Makefile
+++ b/tools/testing/selftests/bpf/Makefile
@@ -47,10 +47,12 @@ TEST_PROGS := test_kmod.sh \
test_tunnel.sh \
test_lwt_seg6local.sh \
test_lirc_mode2.sh \
-   test_skb_cgroup_id.sh
+   test_skb_cgroup_id.sh \
+   test_flow_dissector.sh
 
 # Compile but not part of 'make run_tests'
-TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr 
test_skb_cgroup_id_user
+TEST_GEN_PROGS_EXTENDED = test_libbpf_open test_sock_addr 
test_skb_cgroup_id_user \
+   flow_dissector_load test_flow_dissector
 
 include ../lib.mk
 
diff --git a/tools/testing/selftests/bpf/config 
b/tools/testing/selftests/bpf/config
index b4994a94968b..3655508f95fd 100644
--- a/tools/testing/selftests/bpf/config
+++ b/tools/testing/selftests/bpf/config
@@ -18,3 +18,4 @@ CONFIG_CRYPTO_HMAC=m
 CONFIG_CRYPTO_SHA256=m
 CONFIG_VXLAN=y
 CONFIG_GENEVE=y
+CONFIG_NET_CLS_FLOWER=m
diff --git a/tools/testing/selftests/bpf/flow_dissector_load.c 
b/tools/testing/selftests/bpf/flow_dissector_load.c
new file mode 100644
index ..d3273b5b3173
--- /dev/null
+++ b/tools/testing/selftests/bpf/flow_dissector_load.c
@@ -0,0 +1,140 @@
+// SPDX-License-Identifier: GPL-2.0
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+const char *cfg_pin_path = "/sys/fs/bpf/flow_dissector";
+const char *cfg_map_name = "jmp_table";
+bool cfg_attach = true;
+char *cfg_section_name;
+char *cfg_path_name;
+
+static void load_and_attach_program(void)
+{
+   struct bpf_program *prog, *main_prog;
+   struct bpf_map *prog_array;
+   int i, fd, prog_fd, ret;
+   struct bpf_object *obj;
+   int prog_array_fd;
+
+   ret = bpf_prog_load(cfg_path_name, BPF_PROG_TYPE_FLOW_DISSECTOR, &obj,
+   &prog_fd);
+   if (ret)
+   error(1, 0, "bpf_prog_load %s", cfg_path_name);
+
+   main_prog = bpf_object__find_program_by_title(obj, cfg_section_name);
+   if (!main_prog)
+   error(1, 0, "bpf_object__find_program_by_title %s",
+ cfg_section_name);
+
+   prog_fd = bpf_program__fd(main_prog);
+   if (prog_fd < 0)
+   error(1, 0, "bpf_program__fd");
+
+   prog_array = bpf_object__find_map_by_name(obj, cfg_map_name);
+   if (!prog_array)
+   error(1, 0, "bpf_object__find_map_by_name %s", cfg_map_name);
+
+   prog_array_fd = bpf_map__fd(prog_array);
+   if (prog_array_fd < 0)
+   error(1, 0, "bpf_map__fd %s", cfg_map_name);
+
+   i = 0;
+   bpf_object__for_each_program(prog, obj) {
+   fd = bpf_program__fd(prog);
+   if (fd < 0)
+   error(1, 0, "bpf_program__fd");
+
+   if (fd != prog_fd) {
+   printf("%d: %s\n", i, bpf_program__title(prog, false));
+   bpf_map_update_elem(prog_array_fd, &i, &fd, BPF_ANY);
+   ++i;
+   }
+   }
+
+   ret = bpf_

Re: RE packet: fix reserve calculation - net/packet/af_packet.c

2018-09-07 Thread James Sakalaukus
On Fri, Sep 7, 2018 at 11:50 AM, Willem de Bruijn  wrote:
> Hi James,
>
> Thanks for the report. In the future please always include
> netdev@vger.kernel.org in technical discussions.
>
> On Fri, Sep 7, 2018 at 1:00 AM James Sakalaukus  wrote:
>>
>> Hello Willem and David,
>>
>> I have an unpolished Ethernet driver for a PCIe FPGA subsystem, and
>> the following commit has the side effect of moving the data alignment
>> for SOCK_RAW packets.
>>
>>
>> commit b84bbaf7a6c8cca24f8acf25a2c8e46913a947ba
>> net/packet/af_packet.c
>>
>> These changes to packet_snd() moves the data and tail pointers
>> backwards by net_device->hard_header_len, which is nominally ETH_HLEN.
>> The .ndo_start_xmit driver function now gets a struct sk_buff with
>> data alignment on a 2-byte boundary.  My DMA core is not happy about
>> it.
>>
>>
>> commit 9aad13b087ab0a588cd68259de618f100053360e
>>
>> This commit changed the previous fix from a skb_push to skb_reserve.
>> The functionality from my end did not change though.  .ndo_start_xmit
>> still gets a struct sk_buff with 2 byte alignment.
>>
>>
>> This may be causing problems for other network drivers with DMA
>> alignment requirements, but maybe its just me.
>
> This is the crux of the question.
>
>
> Did the PACKET_TX_RING variant work with your device?
>
> A quick scan seems to indicate that it is common to allocate a linear
> buffer and then reserve hard_header_len aligned up to 16B
> (HH_MOD_LEN). The actual alignment of both link layer and network
> header then depends on the alignment with which kmalloc returned. It
> is probably safe to assume that the buddy allocator returns a multiple
> of 4B at least for allocations of this size. Then the network layer
> header is 4B aligned. But for Ethernet, the skb_push in eth_header()
> would make the link layer header 2B aligned.
>
> If you are not seeing these problems with other protocols, I must be
> misreading that code.
>
> I will take a closer look.
>

I actually have not tested the device with any protocols other than
SOCK_RAW.  This device is on a real time network that does not use
standard network layer protocols.
PACKET_TX_RING is something I haven't had time to delve into.

It may be the case that other protocols always hand over a 2-byte
aligned buffer.  All I know for sure is that I was getting a 4-byte
aligned buffer before the update.


>> I've only been working
>> on the driver for about 6 months, so its not released, though I would
>> like to put it out there for others.  I updated my distro kernel this
>> week, and then proceeded to beat my head against the wall trying to
>> figure out why my driver mysteriously stopped working.
>>
>>
>> Thanks for your time,
>>
>> James Sakalaukus
>> ja...@sakalaukus.com


Re: RE packet: fix reserve calculation - net/packet/af_packet.c

2018-09-07 Thread James Sakalaukus
On Fri, Sep 7, 2018 at 1:34 PM, David Miller  wrote:
> From: Willem de Bruijn 
> Date: Fri, 7 Sep 2018 11:50:02 -0400
>
>> If you are not seeing these problems with other protocols, I must be
>> misreading that code.
>
> Right, the ethernet header is only guaranteed to be 2 byte aligned on
> transmit.
>
> Nothing ever gave larger alignment guarantees.
>
> This is the _only_ reasonable situation.
>
> If we made the ethernet header to be 4 byte aligned, that means the
> ipv4 addresses in the ipv4 header are not 4 byte aligned so we would
> do unaligned memory accesses when building the headers which are
> either expensive or trap depending upon the architecture.

Understood.

I thought SOCK_RAW made no assumptions with respect to the network
layer headers.  Previously, the buffer was allocated and then data was
copied into it without giving thought to alignment.  Sounds like I got
lucky with the alignment and I should have expected 2-bytes from day
1.

I've been using this device with only SOCK_RAW on a real time network,
and I have not even tried to use SOCK_STREAM or SOCK_DGRAM.
So, I just tried to use TCP on the device, and the buffer alignment is
in fact 2-bytes inside my transmit function, and my DMA core does not
work.  So...I guess I need to bring my hardware into the 21st century
or do a copy in the driver transmit function.

Thanks for your time


Re: [Patch net] net_sched: properly cancel netlink dump on failure

2018-09-07 Thread David Miller
From: Cong Wang 
Date: Thu,  6 Sep 2018 14:50:16 -0700

> When nla_put*() fails after nla_nest_start(), we need
> to call nla_nest_cancel() to cancel the message, otherwise
> we end up calling nla_nest_end() like a success.
> 
> Fixes: 0ed5269f9e41 ("net/sched: add tunnel option support to act_tunnel_key")
> Cc: Davide Caratti 
> Cc: Simon Horman 
> Signed-off-by: Cong Wang 

Applied, thanks.


Re: [PATCH net-next] cxgb4: impose mandatory VLAN usage when non-zero TAG ID

2018-09-07 Thread David Miller
From: Ganesh Goudar 
Date: Fri,  7 Sep 2018 15:59:33 +0530

> From: Casey Leedom 
> 
> When a non-zero VLAN Tag ID is passed to t4_set_vlan_acl()
> then impose mandatory VLAN Usage with that VLAN ID.
> I.e any other VLAN ID should result in packets getting
> dropped.
> 
> Signed-off-by: Casey Leedom 
> Signed-off-by: Ganesh Goudar 

Applied.


Re: [PATCH] tcp: really ignore MSG_ZEROCOPY if no SO_ZEROCOPY

2018-09-07 Thread David Miller
From: Vincent Whitchurch 
Date: Thu,  6 Sep 2018 15:54:59 +0200

> According to the documentation in msg_zerocopy.rst, the SO_ZEROCOPY
> flag was introduced because send(2) ignores unknown message flags and
> any legacy application which was accidentally passing the equivalent of
> MSG_ZEROCOPY earlier should not see any new behaviour.
> 
> Before commit f214f915e7db ("tcp: enable MSG_ZEROCOPY"), a send(2) call
> which passed the equivalent of MSG_ZEROCOPY without setting SO_ZEROCOPY
> would succeed.  However, after that commit, it fails with -ENOBUFS.  So
> it appears that the SO_ZEROCOPY flag fails to fulfill its intended
> purpose.  Fix it.
> 
> Fixes: f214f915e7db ("tcp: enable MSG_ZEROCOPY")
> Signed-off-by: Vincent Whitchurch 

Applied and queued up for -stable, thanks.