[net-next 01/14] ethtool: Add helper routines to pass vf to rx_flow_spec

2015-05-28 Thread Jeff Kirsher
From: John Fastabend 

The ring_cookie is 64 bits wide which is much larger than can be used
for actual queue index values. So provide some helper routines to
pack a VF index into the cookie. This is useful to steer packets to
a VF ring without having to know the queue layout of the device.

CC: Alex Duyck 
Signed-off-by: John Fastabend 
Signed-off-by: Jeff Kirsher 
---
 include/uapi/linux/ethtool.h | 25 +
 1 file changed, 25 insertions(+)

diff --git a/include/uapi/linux/ethtool.h b/include/uapi/linux/ethtool.h
index ae832b4..0594933 100644
--- a/include/uapi/linux/ethtool.h
+++ b/include/uapi/linux/ethtool.h
@@ -796,6 +796,31 @@ struct ethtool_rx_flow_spec {
__u32   location;
 };
 
+/* How rings are laid out when accessing virtual functions or
+ * offloaded queues is device specific. To allow users to do flow
+ * steering and specify these queues the ring cookie is partitioned
+ * into a 32bit queue index with an 8 bit virtual function id.
+ * This also leaves the 3bytes for further specifiers. It is possible
+ * future devices may support more than 256 virtual functions if
+ * devices start supporting PCIe w/ARI. However at the moment I
+ * do not know of any devices that support this so I do not reserve
+ * space for this at this time. If a future patch consumes the next
+ * byte it should be aware of this possibility.
+ */
+#define ETHTOOL_RX_FLOW_SPEC_RING  0x00000000FFFFFFFFLL
+#define ETHTOOL_RX_FLOW_SPEC_RING_VF   0x000000FF00000000LL
+#define ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF 32
+static inline __u64 ethtool_get_flow_spec_ring(__u64 ring_cookie)
+{
+   return ETHTOOL_RX_FLOW_SPEC_RING & ring_cookie;
+};
+
+static inline __u64 ethtool_get_flow_spec_ring_vf(__u64 ring_cookie)
+{
+   return (ETHTOOL_RX_FLOW_SPEC_RING_VF & ring_cookie) >>
+   ETHTOOL_RX_FLOW_SPEC_RING_VF_OFF;
+};
+
 /**
  * struct ethtool_rxnfc - command to get or set RX flow classification rules
  * @cmd: Specific command number - %ETHTOOL_GRXFH, %ETHTOOL_SRXFH,
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 06/14] i40e/i40evf: Add stats to count Tunnel ATR hits

2015-05-28 Thread Jeff Kirsher
From: Anjali Singhai Jain 

Add a 3rd dynamic filter counter to track Tunneled ATR hits separately.
Ethtool port stat "fdir_atr_tunnel_match"

Change-ID: Idd978b6db2a462b5722397cd2ffd04ef055f8655
Signed-off-by: Anjali Singhai Jain 
Tested-by: Jim Young 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e.h |  3 +++
 drivers/net/ethernet/intel/i40e/i40e_ethtool.c |  1 +
 drivers/net/ethernet/intel/i40e/i40e_main.c|  4 
 drivers/net/ethernet/intel/i40e/i40e_txrx.c| 13 ++---
 drivers/net/ethernet/intel/i40e/i40e_type.h|  1 +
 drivers/net/ethernet/intel/i40evf/i40e_type.h  |  1 +
 6 files changed, 20 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 33c35d3..0bfa5a0 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -182,6 +182,7 @@ struct i40e_lump_tracking {
 enum i40e_fd_stat_idx {
I40E_FD_STAT_ATR,
I40E_FD_STAT_SB,
+   I40E_FD_STAT_ATR_TUNNEL,
I40E_FD_STAT_PF_COUNT
 };
 #define I40E_FD_STAT_PF_IDX(pf_id) ((pf_id) * I40E_FD_STAT_PF_COUNT)
@@ -189,6 +190,8 @@ enum i40e_fd_stat_idx {
(I40E_FD_STAT_PF_IDX(pf_id) + I40E_FD_STAT_ATR)
 #define I40E_FD_SB_STAT_IDX(pf_id)  \
(I40E_FD_STAT_PF_IDX(pf_id) + I40E_FD_STAT_SB)
+#define I40E_FD_ATR_TUNNEL_STAT_IDX(pf_id) \
+   (I40E_FD_STAT_PF_IDX(pf_id) + I40E_FD_STAT_ATR_TUNNEL)
 
 struct i40e_fdir_filter {
struct hlist_node fdir_node;
diff --git a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c 
b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
index e77b6bd..c568c90 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_ethtool.c
@@ -147,6 +147,7 @@ static struct i40e_stats i40e_gstrings_stats[] = {
I40E_PF_STAT("rx_hwtstamp_cleared", rx_hwtstamp_cleared),
I40E_PF_STAT("fdir_flush_cnt", fd_flush_cnt),
I40E_PF_STAT("fdir_atr_match", stats.fd_atr_match),
+   I40E_PF_STAT("fdir_atr_tunnel_match", stats.fd_atr_tunnel_match),
I40E_PF_STAT("fdir_sb_match", stats.fd_sb_match),
 
/* LPI stats */
diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index f1a8c4c..e70a616 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -1102,6 +1102,10 @@ static void i40e_update_pf_stats(struct i40e_pf *pf)
i40e_stat_update32(hw, I40E_GLQF_PCNT(pf->fd_sb_cnt_idx),
   pf->stat_offsets_loaded,
   &osd->fd_sb_match, &nsd->fd_sb_match);
+   i40e_stat_update32(hw,
+ I40E_GLQF_PCNT(I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id)),
+ pf->stat_offsets_loaded,
+ &osd->fd_atr_tunnel_match, &nsd->fd_atr_tunnel_match);
 
val = rd32(hw, I40E_PRTPM_EEE_STAT);
nsd->tx_lpi_status =
diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 8565495..fc4ec82 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -2033,9 +2033,16 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct 
sk_buff *skb,
 I40E_TXD_FLTR_QW1_FD_STATUS_SHIFT;
 
dtype_cmd |= I40E_TXD_FLTR_QW1_CNT_ENA_MASK;
-   dtype_cmd |=
-   ((u32)pf->fd_atr_cnt_idx << I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
-   I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
+   if (!(tx_flags & I40E_TX_FLAGS_VXLAN_TUNNEL))
+   dtype_cmd |=
+   ((u32)I40E_FD_ATR_STAT_IDX(pf->hw.pf_id) <<
+   I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
+   I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
+   else
+   dtype_cmd |=
+   ((u32)I40E_FD_ATR_TUNNEL_STAT_IDX(pf->hw.pf_id) <<
+   I40E_TXD_FLTR_QW1_CNTINDEX_SHIFT) &
+   I40E_TXD_FLTR_QW1_CNTINDEX_MASK;
 
fdir_desc->qindex_flex_ptype_vsi = cpu_to_le32(flex_ptype);
fdir_desc->rsvd = cpu_to_le32(0);
diff --git a/drivers/net/ethernet/intel/i40e/i40e_type.h 
b/drivers/net/ethernet/intel/i40e/i40e_type.h
index 568e855..9a5a75b 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40e/i40e_type.h
@@ -1133,6 +1133,7 @@ struct i40e_hw_port_stats {
/* flow director stats */
u64 fd_atr_match;
u64 fd_sb_match;
+   u64 fd_atr_tunnel_match;
/* EEE LPI */
u32 tx_lpi_status;
u32 rx_lpi_status;
diff --git a/drivers/net/ethernet/intel/i40evf/i40e_type.h 
b/drivers/net/ethernet/intel/i40evf/i40e_type.h
index ec9d83a..c463ec4 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_type.h
+++ b/drivers/net/ethernet/intel/i40evf/i40e_type.h
@@ -1108,6 +1108,7 @@ struct i40e_hw_p

[net-next 05/14] i40e/i40evf: Add ATR support for tunneled TCP/IPv4/IPv6 packets.

2015-05-28 Thread Jeff Kirsher
From: Anjali Singhai Jain 

Without this, RSS would have done inner header load balancing. Now we can
get the benefits of ATR for tunneled packets to better align TX and RX
queues with the right core/interrupt.

Change-ID: I07d0e0a192faf28fdd33b2f04c32b2a82ff97ddd
Signed-off-by: Anjali Singhai Jain 
Signed-off-by: Jesse Brandeburg 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40e/i40e_txrx.c   | 77 +++
 drivers/net/ethernet/intel/i40e/i40e_txrx.h   |  1 +
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 34 ++--
 drivers/net/ethernet/intel/i40evf/i40e_txrx.h |  1 +
 4 files changed, 62 insertions(+), 51 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
index 0b4a7be..8565495 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
@@ -1923,11 +1923,11 @@ int i40e_napi_poll(struct napi_struct *napi, int budget)
  * i40e_atr - Add a Flow Director ATR filter
  * @tx_ring:  ring to add programming descriptor to
  * @skb:  send buffer
- * @flags:send flags
+ * @tx_flags: send tx flags
  * @protocol: wire protocol
  **/
 static void i40e_atr(struct i40e_ring *tx_ring, struct sk_buff *skb,
-u32 flags, __be16 protocol)
+u32 tx_flags, __be16 protocol)
 {
struct i40e_filter_program_desc *fdir_desc;
struct i40e_pf *pf = tx_ring->vsi->back;
@@ -1952,25 +1952,38 @@ static void i40e_atr(struct i40e_ring *tx_ring, struct 
sk_buff *skb,
if (!tx_ring->atr_sample_rate)
return;
 
-   /* snag network header to get L4 type and address */
-   hdr.network = skb_network_header(skb);
+   if (!(tx_flags & (I40E_TX_FLAGS_IPV4 | I40E_TX_FLAGS_IPV6)))
+   return;
 
-   /* Currently only IPv4/IPv6 with TCP is supported */
-   if (protocol == htons(ETH_P_IP)) {
-   if (hdr.ipv4->protocol != IPPROTO_TCP)
-   return;
+   if (!(tx_flags & I40E_TX_FLAGS_VXLAN_TUNNEL)) {
+   /* snag network header to get L4 type and address */
+   hdr.network = skb_network_header(skb);
 
-   /* access ihl as a u8 to avoid unaligned access on ia64 */
-   hlen = (hdr.network[0] & 0x0F) << 2;
-   } else if (protocol == htons(ETH_P_IPV6)) {
-   if (hdr.ipv6->nexthdr != IPPROTO_TCP)
+   /* Currently only IPv4/IPv6 with TCP is supported
+* access ihl as u8 to avoid unaligned access on ia64
+*/
+   if (tx_flags & I40E_TX_FLAGS_IPV4)
+   hlen = (hdr.network[0] & 0x0F) << 2;
+   else if (protocol == htons(ETH_P_IPV6))
+   hlen = sizeof(struct ipv6hdr);
+   else
return;
-
-   hlen = sizeof(struct ipv6hdr);
} else {
-   return;
+   hdr.network = skb_inner_network_header(skb);
+   hlen = skb_inner_network_header_len(skb);
}
 
+   /* Currently only IPv4/IPv6 with TCP is supported
+* Note: tx_flags gets modified to reflect inner protocols in
+* tx_enable_csum function if encap is enabled.
+*/
+   if ((tx_flags & I40E_TX_FLAGS_IPV4) &&
+   (hdr.ipv4->protocol != IPPROTO_TCP))
+   return;
+   else if ((tx_flags & I40E_TX_FLAGS_IPV6) &&
+(hdr.ipv6->nexthdr != IPPROTO_TCP))
+   return;
+
th = (struct tcphdr *)(hdr.network + hlen);
 
/* Due to lack of space, no more new filters can be programmed */
@@ -2117,16 +2130,14 @@ out:
  * i40e_tso - set up the tso context descriptor
  * @tx_ring:  ptr to the ring to send
  * @skb:  ptr to the skb we're sending
- * @tx_flags: the collected send information
- * @protocol: the send protocol
  * @hdr_len:  ptr to the size of the packet header
  * @cd_tunneling: ptr to context descriptor bits
  *
  * Returns 0 if no TSO can happen, 1 if tso is going, or error
  **/
 static int i40e_tso(struct i40e_ring *tx_ring, struct sk_buff *skb,
-   u32 tx_flags, __be16 protocol, u8 *hdr_len,
-   u64 *cd_type_cmd_tso_mss, u32 *cd_tunneling)
+   u8 *hdr_len, u64 *cd_type_cmd_tso_mss,
+   u32 *cd_tunneling)
 {
u32 cd_cmd, cd_tso_len, cd_mss;
struct ipv6hdr *ipv6h;
@@ -2218,12 +2229,12 @@ static int i40e_tsyn(struct i40e_ring *tx_ring, struct 
sk_buff *skb,
 /**
  * i40e_tx_enable_csum - Enable Tx checksum offloads
  * @skb: send buffer
- * @tx_flags: Tx flags currently set
+ * @tx_flags: pointer to Tx flags currently set
  * @td_cmd: Tx descriptor command bits to set
  * @td_offset: Tx descriptor header offsets to set
  * @cd_tunneling: ptr to context desc bits
  **/
-static void i40e_tx_enable_csum(struct sk_buff *skb, u32 tx_flags,
+static void i40e_tx_enable_csum(struct sk_buff *skb, u32 

[net-next 11/14] i40evf: skb->xmit_more support

2015-05-28 Thread Jeff Kirsher
From: Jesse Brandeburg 

Eric added support for skb->xmit_more in i40e, this ports that into
i40evf as well.

Supporting skb->xmit_more in i40evf is straightforward; we need to move
around i40e_maybe_stop_tx() call to correctly test netif_xmit_stopped()
before taking the decision to not kick the NIC.

Change-ID: Ia6a2e4a7ab335631c91ced51f55b25eb8468
Signed-off-by: Eric Dumazet 
Signed-off-by: Daniel Borkmann 
Signed-off-by: Jesse Brandeburg 
Tested-by: Jim Young 
Signed-off-by: Jeff Kirsher 
---
 drivers/net/ethernet/intel/i40evf/i40e_txrx.c | 88 ++-
 1 file changed, 47 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c 
b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
index 1c79a08..6450663 100644
--- a/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
+++ b/drivers/net/ethernet/intel/i40evf/i40e_txrx.c
@@ -1670,6 +1670,47 @@ linearize_chk_done:
 }
 
 /**
+ * __i40evf_maybe_stop_tx - 2nd level check for tx stop conditions
+ * @tx_ring: the ring to be checked
+ * @size:the size buffer we want to assure is available
+ *
+ * Returns -EBUSY if a stop is needed, else 0
+ **/
+static inline int __i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
+{
+   netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
+   /* Memory barrier before checking head and tail */
+   smp_mb();
+
+   /* Check again in a case another CPU has just made room available. */
+   if (likely(I40E_DESC_UNUSED(tx_ring) < size))
+   return -EBUSY;
+
+   /* A reprieve! - use start_queue because it doesn't call schedule */
+   netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
+   ++tx_ring->tx_stats.restart_queue;
+   return 0;
+}
+
+/**
+ * i40evf_maybe_stop_tx - 1st level check for tx stop conditions
+ * @tx_ring: the ring to be checked
+ * @size:the size buffer we want to assure is available
+ *
+ * Returns 0 if stop is not needed
+ **/
+#ifdef I40E_FCOE
+int i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
+#else
+static int i40evf_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
+#endif
+{
+   if (likely(I40E_DESC_UNUSED(tx_ring) >= size))
+   return 0;
+   return __i40evf_maybe_stop_tx(tx_ring, size);
+}
+
+/**
  * i40e_tx_map - Build the Tx descriptor
  * @tx_ring:  ring to send buffer on
  * @skb:  send buffer
@@ -1806,8 +1847,12 @@ static void i40e_tx_map(struct i40e_ring *tx_ring, 
struct sk_buff *skb,
 
tx_ring->next_to_use = i;
 
+   i40evf_maybe_stop_tx(tx_ring, DESC_NEEDED);
/* notify HW of packet */
-   writel(i, tx_ring->tail);
+   if (!skb->xmit_more ||
+   netif_xmit_stopped(netdev_get_tx_queue(tx_ring->netdev,
+  tx_ring->queue_index)))
+   writel(i, tx_ring->tail);
 
return;
 
@@ -1829,43 +1874,6 @@ dma_error:
 }
 
 /**
- * __i40e_maybe_stop_tx - 2nd level check for tx stop conditions
- * @tx_ring: the ring to be checked
- * @size:the size buffer we want to assure is available
- *
- * Returns -EBUSY if a stop is needed, else 0
- **/
-static inline int __i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
-{
-   netif_stop_subqueue(tx_ring->netdev, tx_ring->queue_index);
-   /* Memory barrier before checking head and tail */
-   smp_mb();
-
-   /* Check again in a case another CPU has just made room available. */
-   if (likely(I40E_DESC_UNUSED(tx_ring) < size))
-   return -EBUSY;
-
-   /* A reprieve! - use start_queue because it doesn't call schedule */
-   netif_start_subqueue(tx_ring->netdev, tx_ring->queue_index);
-   ++tx_ring->tx_stats.restart_queue;
-   return 0;
-}
-
-/**
- * i40e_maybe_stop_tx - 1st level check for tx stop conditions
- * @tx_ring: the ring to be checked
- * @size:the size buffer we want to assure is available
- *
- * Returns 0 if stop is not needed
- **/
-static int i40e_maybe_stop_tx(struct i40e_ring *tx_ring, int size)
-{
-   if (likely(I40E_DESC_UNUSED(tx_ring) >= size))
-   return 0;
-   return __i40e_maybe_stop_tx(tx_ring, size);
-}
-
-/**
  * i40e_xmit_descriptor_count - calculate number of tx descriptors needed
  * @skb: send buffer
  * @tx_ring: ring to send buffer on
@@ -1890,7 +1898,7 @@ static int i40e_xmit_descriptor_count(struct sk_buff *skb,
count += TXD_USE_COUNT(skb_shinfo(skb)->frags[f].size);
 
count += TXD_USE_COUNT(skb_headlen(skb));
-   if (i40e_maybe_stop_tx(tx_ring, count + 4 + 1)) {
+   if (i40evf_maybe_stop_tx(tx_ring, count + 4 + 1)) {
tx_ring->tx_stats.tx_busy++;
return 0;
}
@@ -1966,8 +1974,6 @@ static netdev_tx_t i40e_xmit_frame_ring(struct sk_buff 
*skb,
i40e_tx_map(tx_ring, skb, first, tx_flags, hdr_len,
td_cmd, td_offset);
 
-   i40e_maybe_stop_tx(tx_ring, DESC_NEEDED);
-
return NETDEV_TX_OK;
 
 out_dro

[PATCH net-next] vlan: Add GRO support for non hardware accelerated vlan

2015-05-28 Thread Toshiaki Makita
Currently packets with non-hardware-accelerated vlan cannot be handled
by GRO. This causes low performance for 802.1ad and stacked vlan, as their
vlan tags are currently not stripped by hardware.

This patch adds GRO support for non-hardware-accelerated vlan and
improves receive performance of them.

Test Environment:
 vlan device (.1Q) on vlan device (.1ad) on ixgbe (82599)

Result:

- Before

$ netperf -t TCP_STREAM -H 192.168.20.2 -l 60
Recv   SendSend
Socket Socket  Message  Elapsed
Size   SizeSize Time Throughput
bytes  bytes   bytessecs.10^6bits/sec

 87380  16384  1638460.005233.17

Rx side CPU usage:
  %usr  %sys  %irq %soft %idle
  0.27 58.03  0.00 41.70  0.00

- After

$ netperf -t TCP_STREAM -H 192.168.20.2 -l 60
Recv   SendSend
Socket Socket  Message  Elapsed
Size   SizeSize Time Throughput
bytes  bytes   bytessecs.10^6bits/sec

 87380  16384  1638460.007586.85

Rx side CPU usage:
  %usr  %sys  %irq %soft %idle
  0.50 25.83  0.00 59.53 14.14

Signed-off-by: Toshiaki Makita 
---
 net/8021q/vlan.c | 94 
 1 file changed, 94 insertions(+)

diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index 59555f0..0a9e8e1 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -618,6 +618,90 @@ out:
return err;
 }
 
+static struct sk_buff **vlan_gro_receive(struct sk_buff **head,
+struct sk_buff *skb)
+{
+   struct sk_buff *p, **pp = NULL;
+   struct vlan_hdr *vhdr;
+   unsigned int hlen, off_vlan;
+   const struct packet_offload *ptype;
+   __be16 type;
+   int flush = 1;
+
+   off_vlan = skb_gro_offset(skb);
+   hlen = off_vlan + sizeof(*vhdr);
+   vhdr = skb_gro_header_fast(skb, off_vlan);
+   if (skb_gro_header_hard(skb, hlen)) {
+   vhdr = skb_gro_header_slow(skb, hlen, off_vlan);
+   if (unlikely(!vhdr))
+   goto out;
+   }
+
+   type = vhdr->h_vlan_encapsulated_proto;
+
+   rcu_read_lock();
+   ptype = gro_find_receive_by_type(type);
+   if (!ptype)
+   goto out_unlock;
+
+   flush = 0;
+
+   for (p = *head; p; p = p->next) {
+   struct vlan_hdr *vhdr2;
+
+   if (!NAPI_GRO_CB(p)->same_flow)
+   continue;
+
+   vhdr2 = (struct vlan_hdr *)(p->data + off_vlan);
+   if (memcmp(vhdr, vhdr2, VLAN_HLEN))
+   NAPI_GRO_CB(p)->same_flow = 0;
+   }
+
+   skb_gro_pull(skb, sizeof(*vhdr));
+   skb_gro_postpull_rcsum(skb, vhdr, sizeof(*vhdr));
+   pp = ptype->callbacks.gro_receive(head, skb);
+
+out_unlock:
+   rcu_read_unlock();
+out:
+   NAPI_GRO_CB(skb)->flush |= flush;
+
+   return pp;
+}
+
+static int vlan_gro_complete(struct sk_buff *skb, int nhoff)
+{
+   struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + nhoff);
+   __be16 type = vhdr->h_vlan_encapsulated_proto;
+   struct packet_offload *ptype;
+   int err = -ENOENT;
+
+   rcu_read_lock();
+   ptype = gro_find_complete_by_type(type);
+   if (ptype)
+   err = ptype->callbacks.gro_complete(skb, nhoff + sizeof(*vhdr));
+
+   rcu_read_unlock();
+   return err;
+}
+
+static struct packet_offload vlan_packet_offloads[] __read_mostly = {
+   {
+   .type = cpu_to_be16(ETH_P_8021Q),
+   .callbacks = {
+   .gro_receive = vlan_gro_receive,
+   .gro_complete = vlan_gro_complete,
+   },
+   },
+   {
+   .type = cpu_to_be16(ETH_P_8021AD),
+   .callbacks = {
+   .gro_receive = vlan_gro_receive,
+   .gro_complete = vlan_gro_complete,
+   },
+   },
+};
+
 static int __net_init vlan_init_net(struct net *net)
 {
struct vlan_net *vn = net_generic(net, vlan_net_id);
@@ -645,6 +729,7 @@ static struct pernet_operations vlan_net_ops = {
 static int __init vlan_proto_init(void)
 {
int err;
+   unsigned int i;
 
pr_info("%s v%s\n", vlan_fullname, vlan_version);
 
@@ -668,6 +753,9 @@ static int __init vlan_proto_init(void)
if (err < 0)
goto err5;
 
+   for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+   dev_add_offload(&vlan_packet_offloads[i]);
+
vlan_ioctl_set(vlan_ioctl_handler);
return 0;
 
@@ -685,7 +773,13 @@ err0:
 
 static void __exit vlan_cleanup_module(void)
 {
+   unsigned int i;
+
vlan_ioctl_set(NULL);
+
+   for (i = 0; i < ARRAY_SIZE(vlan_packet_offloads); i++)
+   dev_remove_offload(&vlan_packet_offloads[i]);
+
vlan_netlink_fini();
 
unregister_netdevice_notifier(&vlan_notifier_block);
-- 
1.8.1.2


--
To unsubscribe from this list: send the line "unsubscribe

Re: [PATCH] sctp: fix ASCONF list handling

2015-05-28 Thread Marcelo Ricardo Leitner
On Thu, May 28, 2015 at 06:15:11AM -0400, Neil Horman wrote:
> On Wed, May 27, 2015 at 09:52:17PM -0300, mleit...@redhat.com wrote:
> > From: Marcelo Ricardo Leitner 
> > 
> > ->auto_asconf_splist is per namespace and mangled by functions like
> > sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> > 
> > Also, the call to inet_sk_copy_descendant() was backuping
> > ->auto_asconf_list through the copy but was not honoring
> > ->do_auto_asconf, which could lead to list corruption if it was
> > different between both sockets.
> > 
> > This commit thus fixes the list handling by adding a spinlock to protect
> > against multiple writers and converts the list to be protected by RCU
> > too, so that we don't have a lock inversion issue at
> > sctp_addr_wq_timeout_handler().
> > 
> > And as this list now uses RCU, we cannot do such backup and restore
> > while copying descendant data anymore as readers may be traversing the
> > list meanwhile. We fix this by simply ignoring/not copying those fields,
> > placed at the end of struct sctp_sock, so we can just ignore it together
> > with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> > so we don't clutter inet_sk_copy_descendant() with SCTP info.
> > 
> > Issue was found with a test application that kept flipping sysctl
> > default_auto_asconf on and off.
> > 
> > Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> > Signed-off-by: Marcelo Ricardo Leitner 
> > ---
> >  include/net/netns/sctp.h   |  6 +-
> >  include/net/sctp/structs.h |  2 ++
> >  net/sctp/protocol.c|  6 +-
> >  net/sctp/socket.c  | 39 ++-
> >  4 files changed, 38 insertions(+), 15 deletions(-)
> > 
> > diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> > index 
> > 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0
> >  100644
> > --- a/include/net/netns/sctp.h
> > +++ b/include/net/netns/sctp.h
> > @@ -30,12 +30,15 @@ struct netns_sctp {
> > struct list_head local_addr_list;
> > struct list_head addr_waitq;
> > struct timer_list addr_wq_timer;
> > -   struct list_head auto_asconf_splist;
> > +   struct list_head __rcu auto_asconf_splist;
> You should use the addr_wq_lock here instead of creating a new lock, as thats
> already used to protect most accesses to the list you are concerned about.

Ok, that works too.

> Though truthfully, that shouldn't be necessary.  The list in question is 
> only
> read in one location and only written in one location.  You can likely just
> rcu-ify, as the write side is in process context and protected by lock_sock.

It should, it's not protected by lock_sock as this list resides in
netns_sctp structure, which lock_sock doesn't cover. Write side is in
process context yes, but this list is written in sctp_init_sock(),
sctp_destroy_sock() and sctp_setsockopt_auto_asconf(), so one could
trigger this by either creating/destroying sockets if
default_auto_asconf=1 or just by creating a bunch of sockets and
flipping asconf via setsockopt (or a combination of these operations).
(I'll point this out in the changelog)

Btw I have two nits on the patch kindly brought to my attention already,
on adding blank newline and bad comment block, will fix it in v2.

  Marcelo

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] sctp: fix ASCONF list handling

2015-05-28 Thread Neil Horman
On Wed, May 27, 2015 at 09:52:17PM -0300, mleit...@redhat.com wrote:
> From: Marcelo Ricardo Leitner 
> 
> ->auto_asconf_splist is per namespace and mangled by functions like
> sctp_setsockopt_auto_asconf() which doesn't guarantee any serialization.
> 
> Also, the call to inet_sk_copy_descendant() was backuping
> ->auto_asconf_list through the copy but was not honoring
> ->do_auto_asconf, which could lead to list corruption if it was
> different between both sockets.
> 
> This commit thus fixes the list handling by adding a spinlock to protect
> against multiple writers and converts the list to be protected by RCU
> too, so that we don't have a lock inversion issue at
> sctp_addr_wq_timeout_handler().
> 
> And as this list now uses RCU, we cannot do such backup and restore
> while copying descendant data anymore as readers may be traversing the
> list meanwhile. We fix this by simply ignoring/not copying those fields,
> placed at the end of struct sctp_sock, so we can just ignore it together
> with struct ipv6_pinfo data. For that we create sctp_copy_descendant()
> so we don't clutter inet_sk_copy_descendant() with SCTP info.
> 
> Issue was found with a test application that kept flipping sysctl
> default_auto_asconf on and off.
> 
> Fixes: 9f7d653b67ae ("sctp: Add Auto-ASCONF support (core).")
> Signed-off-by: Marcelo Ricardo Leitner 
> ---
>  include/net/netns/sctp.h   |  6 +-
>  include/net/sctp/structs.h |  2 ++
>  net/sctp/protocol.c|  6 +-
>  net/sctp/socket.c  | 39 ++-
>  4 files changed, 38 insertions(+), 15 deletions(-)
> 
> diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
> index 
> 3573a81815ad9e0efb6ceb721eb066d3726419f0..e080bebb3147af39c8275261f57018eb01e917b0
>  100644
> --- a/include/net/netns/sctp.h
> +++ b/include/net/netns/sctp.h
> @@ -30,12 +30,15 @@ struct netns_sctp {
>   struct list_head local_addr_list;
>   struct list_head addr_waitq;
>   struct timer_list addr_wq_timer;
> - struct list_head auto_asconf_splist;
> + struct list_head __rcu auto_asconf_splist;
You should use the addr_wq_lock here instead of creating a new lock, as thats
already used to protect most accesses to the list you are concerned about.
Though truthfully, that shouldn't be necessary.  The list in question is only
read in one location and only written in one location.  You can likely just
rcu-ify, as the write side is in process context and protected by lock_sock.

Neil

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] neigh: Add missing rcu_assign_pointer

2015-05-28 Thread Eric Dumazet
On Thu, 2015-05-28 at 16:28 +0800, Ying Xue wrote:
> Commit e4c4e448cf55 ("neigh: Convert garbage collection from softirq
> to workqueue") misses to use rcu_assign_pointer() macro to assign a
> RCU-protected pointer.
> 
> Signed-off-by: Ying Xue 
> ---
>  net/core/neighbour.c |3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/net/core/neighbour.c b/net/core/neighbour.c
> index 3a74df7..aaad3a5 100644
> --- a/net/core/neighbour.c
> +++ b/net/core/neighbour.c
> @@ -783,7 +783,8 @@ static void neigh_periodic_work(struct work_struct *work)
>   if (atomic_read(&n->refcnt) == 1 &&
>   (state == NUD_FAILED ||
>time_after(jiffies, n->used + NEIGH_VAR(n->parms, 
> GC_STALETIME)))) {
> - *np = n->next;
> + rcu_assign_pointer(*np, 
> rcu_dereference_protected(n->next,
> + 
> lockdep_is_held(&tbl->lock)));
>   n->dead = 1;
>   write_unlock(&n->lock);
>   neigh_cleanup_and_release(n);


This patch is not needed.

You really should read Documentation/RCU , because it looks like you are
quite confused.

When we remove an element from a RCU protected list, all the objects in
the chain are already ready to be caught by rcu readers.

Therefore, no additional memory barrier is needed before doing *np =
n->next;

Please do not add spurious memory barriers. Like atomic operations, we
want all of them being required and possibly documented.



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCHv3] pktgen: Convert return type of process_ipsec to bool

2015-05-28 Thread Jesper Dangaard Brouer
On Thu, 28 May 2015 00:11:05 -0400
Nicholas Krause  wrote:

> This converts the function, process_ipsec to the 
> return type of bool due to only returning either
> one or zero.
> 
> Signed-off-by: Nicholas Krause 
> ---
> v3
> Move the v2 changes below the sign off line for this patch.
> v2
> Change incorrect patch subject and make commit message
> clearer
>  net/core/pktgen.c | 6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/net/core/pktgen.c b/net/core/pktgen.c
> index 508155b..33bdb76 100644
> --- a/net/core/pktgen.c
> +++ b/net/core/pktgen.c
> @@ -2587,7 +2587,7 @@ static void free_SAs(struct pktgen_dev *pkt_dev)
>   }
>  }
>  
> -static int process_ipsec(struct pktgen_dev *pkt_dev,
> +static bool process_ipsec(struct pktgen_dev *pkt_dev,
> struct sk_buff *skb, __be16 protocol)

When doing this change, could you please align the above line to the
open parenthesis of process_ipsec (even-though it was also misaligned
before).

scripts/checkpatch.pl will tell you:
 CHECK: Alignment should match open parenthesis 

Did anyone tell you that kernel developers nitpick? ;-)

And usually you don't need to Cc the "main" Linux Kernel Mailing List
(linux-ker...@vger.kernel.org) with a trivial patch like this.  Sending
it to the network developers should be enough (netdev@vger.kernel.org).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Sr. Network Kernel Developer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] netevent: remove automatic variable in register_netevent_notifier()

2015-05-28 Thread Wang Long
Remove automatic variable 'err' in register_netevent_notifier() and
return the return value of atomic_notifier_chain_register() directly.

Signed-off-by: Wang Long 
---
 net/core/netevent.c | 5 +
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/net/core/netevent.c b/net/core/netevent.c
index f17ccd2..8b3bc4f 100644
--- a/net/core/netevent.c
+++ b/net/core/netevent.c
@@ -31,10 +31,7 @@ static ATOMIC_NOTIFIER_HEAD(netevent_notif_chain);
  */
 int register_netevent_notifier(struct notifier_block *nb)
 {
-   int err;
-
-   err = atomic_notifier_chain_register(&netevent_notif_chain, nb);
-   return err;
+   return atomic_notifier_chain_register(&netevent_notif_chain, nb);
 }
 EXPORT_SYMBOL_GPL(register_netevent_notifier);
 
-- 
1.8.3.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net v2] switchdev: don't abort hardware ipv4 fib offload on failure to program fib entry in hardware

2015-05-28 Thread Jiri Pirko
Mon, May 18, 2015 at 10:19:16PM CEST, da...@davemloft.net wrote:
>From: Roopa Prabhu 
>Date: Sun, 17 May 2015 16:42:05 -0700
>
>> On most systems where you can offload routes to hardware,
>> doing routing in software is not an option (the cpu limitations
>> make routing impossible in software).
>
>You absolutely do not get to determine this policy, none of us
>do.
>
>What matters is that by default the damn switch device being there
>is %100 transparent to the user.
>
>And the way to achieve that default is to do software routes as
>a fallback.
>
>I am not going to entertain changes of this nature which fail
>route loading by default just because we've exceeded a device's
>HW capacity to offload.
>
>I thought I was _really_ clear about this at netdev 0.1

I certainly agree that by default, transparency 1:1 sw:hw mapping is
what we need for fib. The current code is a good start!

I see couple of issues regarding switchdev_fib_ipv4_abort:
1) If user adds an entry, switchdev_fib_ipv4_add fails, abort is
   executed -> and, error returned. I would expect that route entry should
   be added in this case. The next attempt of adding the same entry will
   be successful.
   The current behaviour breaks the transparency you are referring to.
2) When switchdev_fib_ipv4_abort happens to be executed, the offload is
   disabled for good (until reboot). That is certainly not nice, although
   I understand that is the easiest solution for now.

I believe that we all agree that the 1:1 transparency, although it is a
default, may not be optimal for real-life usage. HW resources are
limited and user does not know them. The danger of hitting _abort and
screwing-up the whole system is huge, unacceptable.

So here, there are couple of more or less simple things that I suggest to
do in order to move a little bit forward:
1) Introduce system-wide option to switch _abort to just plain fail.
   When HW does not have capacity, do not flush and fallback to sw, but
   rather just fail to add the entry. This would not break anything.
   Userspace has to be prepared that entry add could fail.
2) Introduce a way to propagate resources to userspace. Driver knows about
   resources used/available/potentially_available. Switchdev infra could
   be extended in order to propagate the info to the user.
3) Introduce couple of flags for entry add that would alter the default
   behaviour. Something like:
NLM_F_SKIP_KERNEL
NLM_F_SKIP_OFFLOAD
   Again, this does not break the current users. On the other hand, this
   gives new users a leverage to instruct kernel where the entry should
   be added to (or not added to).

Any thoughts? Objections?

Thanks!

Jiri
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


pull request: bluetooth-next 2015-05-28

2015-05-28 Thread Johan Hedberg
Hi Dave,

Here's a set of patches intended for 4.2. The majority of the changes
are on the 802.15.4 side of things rather than Bluetooth related:

 - All sorts of cleanups & fixes to ieee802154 and related drivers
 - Rework of tx power support in ieee802154 and its drivers
 - Support for setting ieee802154 tx power through nl802154
 - New IDs for the btusb driver
 - Various cleanups & smaller fixes to btusb
 - New btrtl driver for Realtek devices
 - Fix suspend/resume for Realtek devices

Please let me know if there are any issues pulling. Thanks.

Johan

---
The following changes since commit f0b5e8a42f37a880b8467e59dc814f4f21581d3d:

  net: kill useless net_*_ingress_queue() definitions when NET_CLS_ACT is unset 
(2015-05-13 15:44:28 -0400)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/bluetooth/bluetooth-next.git 
for-upstream

for you to fetch changes up to b5a61c306b0dddb28e3a3ab5d782c73e5f665497:

  atusb: add support for at86rf230 (2015-05-27 19:29:54 +0200)


Alexander Aring (42):
  nl802154: cleanup invalid argument handling
  ieee802154: move validation check out of softmac
  ieee802154: change transmit power to s32
  ieee802154: change transmit power to mbm
  ieee802154: change cca ed level to mbm
  ieee802154: introduce wpan_phy_supported
  ieee802154: add several phy supported handling
  mac802154: check for really changes
  mac802154: remove check if operation is supported
  cfg802154: introduce wpan phy flags
  ieee802154: add iftypes capability
  at86rf230: set cca_modes supported flags
  at86rf230: rework tx power support
  at86rf230: rework tx cca energy detection level
  at86rf230: add cca ed level reset value
  at86rf230: add reset states of tx power level
  nl802154: add support for dump phy capabilities
  at86rf230: fix callback for aret handling
  mac802154: tx: allow xmit complete from hard irq
  ieee802154: add support for atusb transceiver
  fakelb: creating two virtual phys per default
  fakelb: use list_for_each_entry_safe
  fakelb: rename fakelb_dev_priv to fakelb_phy
  fakelb: don't deliver when one phy
  fakelb: declare rwlock static
  fakelb: declare fakelb list static
  fakelb: move lock out of iteration
  fakelb: introduce fakelb ifup phys list
  fakelb: use own channel and page attributes
  fakelb: add virtual phy reset defaults
  fakelb: remove fakelb_hw_deliver
  fakelb: add support for async xmit handling
  fakelb: cleanup code
  at86rf230: add missing cca ed level values
  mac802154: fix hold rtnl while ioctl
  mac802154: remove pib lock
  mac802154: use atomic ops for sequence incrementation
  mac802154: remove mib lock
  nl802154: fix cca mode wpan phy flag
  nl802154: add support for cca ed level info
  nl802154: add support to set cca ed level
  atusb: add support for at86rf230

Arnd Bergmann (1):
  mac802154: select CRYPTO when needed

Carlo Caione (1):
  Bluetooth: btrtl: Create separate module for Realtek BT driver

Chan-yeol Park (1):
  Bluetooth: btusb: Support QCA61x4 ROME v2.0

Daniel Drake (1):
  Bluetooth: btusb: fix Realtek suspend/resume

Florian Grandel (1):
  Bluetooth: mgmt: fix typos

Frederic Danis (4):
  Bluetooth: Fix calls to __hci_cmd_sync()
  Bluetooth: btusb: Fix calls to __hci_cmd_sync()
  Bluetooth: btintel: Fix calls to __hci_cmd_sync()
  Bluetooth: btbcm: Fix calls to __hci_cmd_sync()

Johan Hedberg (1):
  Bluetooth: Add debug logs for legacy SMP crypto functions

Lennert Buytenhek (7):
  mac802154: Avoid rtnl deadlock in mac802154_wpan_ioctl().
  ieee802154 socket: Return EMSGSIZE from raw_sendmsg() if packet too big.
  Documentation/networking/ieee802154.txt: fix various inaccuracies.
  ieee802154: Remove ieee802154_reduced_mlme_ops references.
  ieee802154: Remove 802.15.4/6LoWPAN checks for interface MTU.
  ieee802154 socket: No need to check for ARPHRD_IEEE802154 in raw_bind().
  mac802154: mac802154_mlme_start_req() optimisation.

Leo Yan (1):
  Bluetooth: btwilink: remove DEBUG define

Martin Townsend (1):
  mac802154: fakelb: Fix potential NULL pointer dereference.

Shailendra Verma (2):
  Bluetooth: btusb: Change 1 to true in bool type variable assignment
  Bluetooth: hci_uart: Change 1 to true for bool type variables assignments

Stefan Schmidt (3):
  ieee802154/atusb: Warn about outdated device firmware.
  ieee802154/atusb: Mark driver as AACK enabled in hardware.
  ieee802154/atusb: Set default ed level to 0xbe like the rest of these 
drivers

Varka Bhadram (2):
  ieee802154: add set transmit power support
  ieee802154: fix typo for file name

Xinming Hu (1):
  Bluetooth: btmrvl: fix compilation warning

 Documentation/networking/ieee802154.txt |  32

Re: [PATCH net-next V4 00/12] net/mlx5: ConnectX-4 100G Ethernet driver

2015-05-28 Thread Amir Vadai
On Thu, May 28, 2015 at 11:52 AM, David Miller  wrote:
> From: Ben Hutchings 
> Date: Wed, 27 May 2015 20:57:37 +0100
>
>> How would an application tell the difference between an IRQ handler
>> being renamed, or being unregistered and re-registered under a different
>> name?  I'm fairly sure it can't tell.
>
> What do things like the userland IRQ balancer do?

Thanks to Neil Horman, userland scripts can get the irq number from
sysfs (/sys/bus/pci/devices/<device>/msi_irqs) which is not based on
the irq naming [1].
He also fixed irq_balancer [2] to use this API instead of being based
on those strings.

I will drop the irq renaming from the patchset. mlx5_core driver will
set generic irq names (since same irq's might service both Ethernet
and Infiniband), for example: mlx5_comp0@pci::00:04.0.

Thanks,
Amir


[1] - kernel: da8d1c8 PCI/sysfs: add per pci device msi[x] irq listing (v5)
[2] - irq_balancer: 32a7757 Complete rework of how we detect and classify irqs
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH/RFC net-next] rocker: remove rocker parameter from functions that have rocker_port parameter

2015-05-28 Thread David Laight
From: Simon Horman
> Sent: 28 May 2015 04:23
> The rocker (switch) of a rocker_port may be trivially obtained from
> the latter it seems cleaner not to pass the former to a function when
> the latter is being passed anyway.

If the arguments are passed in registers (they almost certainly are)
or the function is inlined (possible since they are static) and
the calling code already has both values in registers then
passing both values saves a memory read inside the called code.

So on 'hot paths' it probably makes sense to pass both values.

David

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net iproute2 v2 1/2] mpls: always set type as RTN_UNICAST for route add/deletes

2015-05-28 Thread Robert Shearman

On 27/05/15 19:37, Roopa Prabhu wrote:

From: Roopa Prabhu 

Kernel expects type RTN_UNICAST for mpls route/dels

Signed-off-by: Vivek Venkataraman 
Signed-off-by: Roopa Prabhu 


Reviewed-by: Robert Shearman 


---
  ip/iproute.c |5 +
  1 file changed, 5 insertions(+)

diff --git a/ip/iproute.c b/ip/iproute.c
index 670a4c6..71c088b 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -803,6 +803,7 @@ static int iproute_modify(int cmd, unsigned flags, int 
argc, char **argv)
int scope_ok = 0;
int table_ok = 0;
int raw = 0;
+   int type_ok = 0;

memset(&req, 0, sizeof(req));

@@ -1095,6 +1096,7 @@ static int iproute_modify(int cmd, unsigned flags, int 
argc, char **argv)
rtnl_rtntype_a2n(&type, *argv) == 0) {
NEXT_ARG();
req.r.rtm_type = type;
+   type_ok = 1;
}

if (matches(*argv, "help") == 0)
@@ -1160,6 +1162,9 @@ static int iproute_modify(int cmd, unsigned flags, int 
argc, char **argv)
}
}

+   if (!type_ok && req.r.rtm_family == AF_MPLS)
+   req.r.rtm_type = RTN_UNICAST;
+
if (req.r.rtm_family == AF_UNSPEC)
req.r.rtm_family = AF_INET;



--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net iproute2 v2 1/2] mpls: always set type as RTN_UNICAST for route add/deletes

2015-05-28 Thread Robert Shearman

On 28/05/15 01:06, roopa wrote:

On 5/27/15, 1:08 PM, roopa wrote:

On 5/27/15, 12:59 PM, Robert Shearman wrote:

On 27/05/15 19:37, Roopa Prabhu wrote:

From: Roopa Prabhu 

Kernel expects type RTN_UNICAST for mpls route/dels

Signed-off-by: Vivek Venkataraman 
Signed-off-by: Roopa Prabhu 
---
  ip/iproute.c |5 +
  1 file changed, 5 insertions(+)

diff --git a/ip/iproute.c b/ip/iproute.c
index 670a4c6..71c088b 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -803,6 +803,7 @@ static int iproute_modify(int cmd, unsigned
flags, int argc, char **argv)
  int scope_ok = 0;
  int table_ok = 0;
  int raw = 0;
+int type_ok = 0;

  memset(&req, 0, sizeof(req));

@@ -1095,6 +1096,7 @@ static int iproute_modify(int cmd, unsigned
flags, int argc, char **argv)
  rtnl_rtntype_a2n(&type, *argv) == 0) {
  NEXT_ARG();
  req.r.rtm_type = type;
+type_ok = 1;
  }

  if (matches(*argv, "help") == 0)
@@ -1160,6 +1162,9 @@ static int iproute_modify(int cmd, unsigned
flags, int argc, char **argv)
  }
  }

+if (!type_ok && req.r.rtm_family == AF_MPLS)
+req.r.rtm_type = RTN_UNICAST;
+
  if (req.r.rtm_family == AF_UNSPEC)
  req.r.rtm_family = AF_INET;




There is this block of code near the start of iproute_modify that
sets req.r.rtm_type in the add/modify cases:

if (cmd != RTM_DELROUTE) {
req.r.rtm_protocol = RTPROT_BOOT;
req.r.rtm_scope = RT_SCOPE_UNIVERSE;
req.r.rtm_type = RTN_UNICAST;
}

How about doing similar for the mpls delete case? This would avoid
the need to track if the type has been set and would also make the
way rtm_type is set in the delete case as close as possible to that
in the add/modify cases.

sure that works too. There was already *_ok checks for the rest of the
attributes, ..so added it there.

v3 ...coming...


looking at the code again..now i remember why i have it this way. I will
have to add a check for family around
the code you point out above. And in some cases if the user has not
specified the family explicitly, we derive the msg family
in the while loop that parses the args...based on the other arguments
given by the user.
In the particular mpls case though, user explicitly specifies the family
and moving the patch to the code you point above should be ok.

But to be consistent with the rest of the code, it seems better to do
the check and set the defaults at the end after parsing all the args.

So, now i am inclined to keep the v2 patch as is...unless you have
strong reasons.


Ah, yes, of course.

In that case, LGTM.

Thanks,
Rob
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kmalloc panic

2015-05-28 Thread Johannes Berg
On Wed, 2015-05-27 at 22:15 -0700, Cong Wang wrote:

> > rsi_client: module license 'Proprietary' taints kernel.
> > Disabling lock debugging due to kernel taint
> > RSI_Init called and registering the client driver

If this is what I think it is - the redpine signals wifi driver, then I
have no interest in this bug report whatsoever.

Please tell us
 * the exact kernel version, best with local patches
 * the wifi driver used

> > INFO: Allocated in 0x10100100 age=2669517238 cpu=16842820 pid=335563024

and what function this really was (0x10100100 isn't really useful).

johannes

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] sctp: Fix mangled IPv4 addresses on a IPv6 listening socket

2015-05-28 Thread David Laight
From: Jason Gunthorpe
> Sent: 27 May 2015 18:05
> On Wed, May 27, 2015 at 04:41:18PM +, David Laight wrote:
> 
> > The code will be sleeping in kernel_accept() and later calls
> > kernel_getpeername().
> > The code is used for both TCP and SCTP and this part is common (using
> > the TCP semantics).
> 
> getpeername uses a different flow, it calls into inet6_getname which
> will always return the AF_INET6 version.

Ok, that explains why I hadn't seen the problem.
It also means I don't have to worry about it.

David

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH] net: tcp: Fix a PTO timing granularity issue

2015-05-28 Thread David Laight
From: Ido Yariv
> Sent: 28 May 2015 05:37
...
> +/* Convert msecs to jiffies, ensuring that the return value is at least 2
> + * jiffies.
> + * This can be used when setting tick-based timers to guarantee that they 
> won't
> + * expire right away.
> + */
> +static inline unsigned long tcp_safe_msecs_to_jiffies(const unsigned int m)

I don't like using 'safe' in function names, being 'safe' depends on what
the caller wants.
Maybe tcp_msecs_to_jiffies_min_2() would be better.

David

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] connector: add cgroup release event report to proc connector

2015-05-28 Thread Dimitri John Ledkov
On 28 May 2015 at 04:30, Zefan Li  wrote:
> On 2015/5/27 20:37, Dimitri John Ledkov wrote:
>> On 27 May 2015 at 12:22, Zefan Li  wrote:
>>> On 2015/5/27 6:07, Dimitri John Ledkov wrote:
 Add a kernel API to send a proc connector notification that a cgroup
 has become empty. A userspace daemon can then act upon such
 information, and usually clean-up and remove such a group as it's no
 longer needed.

 Currently there are two other ways (one for current & one for unified
 cgroups) to receive such notifications, but they either involve
 spawning userspace helper or monitoring a lot of files. This is a
 firehose of all such events instead from a single place.

 In the current cgroups structure the way to get notifications is by
 enabling `release_agent' and setting `notify_on_release' for a given
 cgroup hierarchy. This will then spawn userspace helper with removed
 cgroup as an argument. It has been acknowledged that this is
 expensive, especially in the exit-heavy workloads. In userspace this
 is currently used by systemd and CGmanager that I know of, both of
 agents establish connection to the long running daemon and pass the
 message to it. As a courtesy to other processes, such an event is
 sometimes forwarded further on, e.g. systemd forwards it to the system
 DBus.

 In the future/unified cgroups structure support for `release_agent' is
 removed, without a direct replacement. However, there is a new
 `cgroup.populated' file exposed that recursively reports if there are
 any tasks in a given cgroup hierarchy. It's a very good flag to
 quickly/lazily scan for empty things, however one would need to
 establish inotify watch on each and every cgroup.populated file at
 cgroup setup time (ideally before any pids enter said cgroup). Thus
 again anybody else, but the original creator of a given cgroup, has a
 chance to reliably monitor cgroup becoming empty (since there is no
 reliable recursive inotify watch).

 Hence, the addition to the proc connector firehose. Multiple things,
 (albeit with a CAP_NET_ADMIN in the init pid/user namespace), could
 connect and monitor cgroups release notifications. In a way, this
 repeats udev history, at first it was a userspace helper, which later
 became a netlink socket. And I hope, that proc connector is a
 naturally good fit for this notification type.

 For precisely when cgroups should emit this event, see next patch
 against kernel/cgroup.c.

>>>
>>> We really don't want yet another way for cgroup notification.
>>>
>>
>> we do have multiple information sources for similar events in other
>> places... e.g. fork events can be tracked with ptrace and with
>> proc-connector, ditto other things.
>>
>>> Systemd is happy with this cgroup.populated interface. Do you have any
>>> real use case in mind that can't be satisfied with inotify watch?
>>>
>>
>> cgroup.populated is not implemented in systemd and would require a lot
>> of inotify watches.
>
> I believe systemd will use cgroup.populated, though I don't know its
> roadmap. Maybe it's waiting for the kernel to remove the experimental
> flag of unified hierarchy.
>

There is no code in master to support unified hierarchy in systemd
that I can see. And more and more things rely on the current
hierarchy, especially around container-like technologies.

>> Also it's only set on the unified structure and
>> not exposed on the current one.
>>
>> Also it will not allow anybody else to establish notify watch in a
>> timely manner. Thus anyone external to the cgroups creator will not be
>> able to monitor cgroup.populated at the right time.
>
> I guess this isn't a problem, as you can watch the IN_CREATE event, and
> then you'll get notified when a cgroup is created.
>

It is a problem, there is no effective way to establish race-free
inotify watches, which is well known. Having a watch on
/sys/fs/cgroup, one has to establish inotify watch on a directory
created there, and then another watch on cgroup.populated within
there. By which time a process could have already entered, run and
exited.

>> With
>> proc_connector I was thinking processes entering cgroups would be
>> useful events as well, but I don't have a use-case for them yet thus
>> I'm not sure how the event should look like.
>>
> >> Would cgroup.populated be exposed on the legacy cgroup hierarchy? At the
>> moment I see about ~20ms of my ~200ms boot wasted on spawning the
>> cgroups agent and I would like to get rid of that as soon as possible.
>> This patch solves it for me. ( i have a matching one to connect to
>> proc connector and then feed notifications to systemd via systemd's
>> private api end-point )
>>
>> Exposing cgroup.populated irrespective of the cgroup mount options
>> would be great, but would result in many watches being established
>> awaiting for a once in a lifecycle condition of a cgroup. 

Re: [PATCH net-next V4 00/12] net/mlx5: ConnectX-4 100G Ethernet driver

2015-05-28 Thread David Miller
From: Ben Hutchings 
Date: Wed, 27 May 2015 20:57:37 +0100

> How would an application tell the difference between an IRQ handler
> being renamed, or being unregistered and re-registered under a different
> name?  I'm fairly sure it can't tell.

What do things like the userland IRQ balancer do?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Problems with receiving packets

2015-05-28 Thread sssdrb
Hello,

I use Linux kernel 3.3.8 on Arm Xscale based embedded platform.
I noticed that sometimes some applications lost data from network.
To be more detailed - for example I'm using ping command between two
Arm boards. The communication goes through ethernet or wifi. Now from
time to time ping reports that it loses some packets. Additionally I
run tcpdump on the same side where ping was invoked and it reports
that all the packets arrived (!).

This is the strange part - ping reports loss of data, while tcpdump
shows all of the packets (icmp request and replies).

It seems like there is some problem with transferring network data
from kernel space to user space (in RX path).
I'm sure that all the icmp reply packets are received by ethernet
driver and kernel, but ping application received only part of them
(not all of them).

Is it a known problem and is there any solution for it?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xfrm6: Do not use xfrm_local_error for path MTU issues in tunnels

2015-05-28 Thread Steffen Klassert
On Thu, May 28, 2015 at 12:18:51AM -0700, Alexander Duyck wrote:
> On 05/27/2015 10:36 PM, Steffen Klassert wrote:
> >On Wed, May 27, 2015 at 10:40:32AM -0700, Alexander Duyck wrote:
> >>This change makes it so that we use icmpv6_send to report PMTU issues back
> >>into tunnels in the case that the resulting packet is larger than the MTU
> >>of the outgoing interface.  Previously xfrm_local_error was being used in
> >>this case, however this was resulting in no changes, I suspect due to the
> >>fact that the tunnel itself was being kept out of the loop.
> >>
> >>This patch fixes PMTU problems seen on ip6_vti tunnels and is based on the
> >>behavior seen if the socket was orphaned.  Instead of requiring the socket
> >>to be orphaned this patch simply defaults to using icmpv6_send in the case
> >>that the frame came through a tunnel.
> >We can use icmpv6_send() just in the case that the packet
> >was already transmitted by a tunnel device, otherwise we
> >get the bug back that I mentioned in my other mail.
> >
> >Not sure if we have something to know that the packet
> >traversed a tunnel device. That's what I asked in the
> >thread 'Looking for a lost patch'.
> 
> Okay I will try to do some more digging.  From what I can tell right
> now it looks like my ping attempts are getting hung up on the
> xfrm_local_error in __xfrm6_output.  I wonder if we couldn't somehow
> make use of the skb->cb to store a pointer to the tunnel that could
> be checked to determine if we are going through a VTI or not.

Maybe it is as easy as the patch below, could you please test it?

Subject: [PATCH RFC] vti6: Add pmtu handling to vti6_xmit.

We currently rely on the PMTU discovery of xfrm.
However if a packet is locally sent, the PMTU mechanism
of xfrm tries to do local socket notification, which
might not work for applications like ping that don't
check for this. So add pmtu handling to vti6_xmit to
report MTU changes immediately.

Signed-off-by: Steffen Klassert 
---
 net/ipv6/ip6_vti.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ff3bd86..13cb771 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -434,6 +434,7 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, 
struct flowi *fl)
struct dst_entry *dst = skb_dst(skb);
struct net_device *tdev;
struct xfrm_state *x;
+   int mtu;
int err = -1;
 
if (!dst)
@@ -468,6 +469,15 @@ vti6_xmit(struct sk_buff *skb, struct net_device *dev, 
struct flowi *fl)
skb_dst_set(skb, dst);
skb->dev = skb_dst(skb)->dev;
 
+   mtu = dst_mtu(dst);
+   if (!skb->ignore_df && skb->len > mtu) {
+   skb_dst(skb)->ops->update_pmtu(dst, NULL, skb, mtu);
+
+   icmpv6_send(skb, ICMPV6_PKT_TOOBIG, 0, mtu);
+
+   return -EMSGSIZE;
+   }
+
err = dst_output(skb);
if (net_xmit_eval(err) == 0) {
struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC net-next] rocker: remove rocker parameter from functions that have rocker_port parameter

2015-05-28 Thread Simon Horman
On Thu, May 28, 2015 at 08:15:42AM +0200, Jiri Pirko wrote:
> Thu, May 28, 2015 at 05:23:17AM CEST, simon.hor...@netronome.com wrote:
> >The rocker (switch) of a rocker_port may be trivially obtained from
> >the latter it seems cleaner not to pass the former to a function when
> >the latter is being passed anyway.
> 
> I don't understand reason for this patch. I like it the way it is I must
> say. + you introduce possible multiple dereference in a row in call-chain.

My main motivation is that it seems cleaner. I marked it as an RFC
as I wasn't sure if there was a particular reason that things are how they
are. I have no objection to leaving things as they are if that's the
consensus.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] neigh: Add missing rcu_assign_pointer

2015-05-28 Thread Ying Xue
Commit e4c4e448cf55 ("neigh: Convert garbage collection from softirq
to workqueue") fails to use the rcu_assign_pointer() macro to assign an
RCU-protected pointer.

Signed-off-by: Ying Xue 
---
 net/core/neighbour.c |3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/neighbour.c b/net/core/neighbour.c
index 3a74df7..aaad3a5 100644
--- a/net/core/neighbour.c
+++ b/net/core/neighbour.c
@@ -783,7 +783,8 @@ static void neigh_periodic_work(struct work_struct *work)
if (atomic_read(&n->refcnt) == 1 &&
(state == NUD_FAILED ||
 time_after(jiffies, n->used + NEIGH_VAR(n->parms, 
GC_STALETIME {
-   *np = n->next;
+   rcu_assign_pointer(*np, 
rcu_dereference_protected(n->next,
+   
lockdep_is_held(&tbl->lock)));
n->dead = 1;
write_unlock(&n->lock);
neigh_cleanup_and_release(n);
-- 
1.7.9.5

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kmalloc panic

2015-05-28 Thread Richard Weinberger
On Thu, May 28, 2015 at 9:21 AM, pavani
 wrote:
> Hi Cong ,
>
> Thanks for the response.
>
> Where we need to fix the bug ?I mean in the driver or kernel source code or
> hardware level.

The more interesting question is, is this a recent and pristine kernel
from kernel.org?

-- 
Thanks,
//richard
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


pull request (net): ipsec 2015-05-28

2015-05-28 Thread Steffen Klassert
1) Fix a race in xfrm_state_lookup_byspi, we need to take
   the refcount before we release xfrm_state_lock.
   From Li RongQing.

2) Fix IV generation on ESN state. We used just the
   low order sequence numbers for IV generation on
   ESN, as a result the IV can repeat on the same
   state. Fix this by using the  high order sequence
   number bits too and make sure to always initialize
   the high order bits with zero. These patches are
   serious stable candidates. Fixes from Herbert Xu.

3) Fix the skb->mark handling on vti. We don't
   reset skb->mark in skb_scrub_packet anymore,
   so vti must care to restore the original
   value back after it was used to lookup the
   vti policy and state. Fixes from Alexander Duyck.

Please pull or let me know if there are problems.

Thanks!

The following changes since commit 39376ccb1968ba9f83e2a880a8bf02ad5dea44e1:

  Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf (2015-04-27 
23:12:34 -0400)

are available in the git repository at:


  git://git.kernel.org/pub/scm/linux/kernel/git/klassert/ipsec.git master

for you to fetch changes up to d55c670cbc54b2270a465cdc382ce71adae45785:

  ip_vti/ip6_vti: Preserve skb->mark after rcv_cb call (2015-05-28 06:23:32 
+0200)


Alexander Duyck (3):
  ip_vti/ip6_vti: Do not touch skb->mark on xmit
  xfrm: Override skb->mark with tunnel->parm.i_key in xfrm_input
  ip_vti/ip6_vti: Preserve skb->mark after rcv_cb call

Herbert Xu (3):
  esp4: Use high-order sequence number bits for IV generation
  esp6: Use high-order sequence number bits for IV generation
  xfrm: Always zero high-order sequence number bits

Li RongQing (1):
  xfrm: fix a race in xfrm_state_lookup_byspi

 net/ipv4/esp4.c|  3 ++-
 net/ipv4/ip_vti.c  | 14 ++
 net/ipv6/esp6.c|  3 ++-
 net/ipv6/ip6_vti.c | 13 ++---
 net/xfrm/xfrm_input.c  | 17 -
 net/xfrm/xfrm_replay.c |  2 ++
 net/xfrm/xfrm_state.c  |  2 +-
 7 files changed, 43 insertions(+), 11 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/7] esp4: Use high-order sequence number bits for IV generation

2015-05-28 Thread Steffen Klassert
From: Herbert Xu 

I noticed we were only using the low-order bits for IV generation
when ESN is enabled.  This is very bad because it means that the
IV can repeat.  We must use the full 64 bits.

Signed-off-by: Herbert Xu 
Signed-off-by: Steffen Klassert 
---
 net/ipv4/esp4.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/esp4.c b/net/ipv4/esp4.c
index 421a80b..30b544f 100644
--- a/net/ipv4/esp4.c
+++ b/net/ipv4/esp4.c
@@ -256,7 +256,8 @@ static int esp_output(struct xfrm_state *x, struct sk_buff 
*skb)
aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
aead_givcrypt_set_assoc(req, asg, assoclen);
aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output.low);
+ XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
 
ESP_SKB_CB(skb)->tmp = tmp;
err = crypto_aead_givencrypt(req);
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/7] ip_vti/ip6_vti: Do not touch skb->mark on xmit

2015-05-28 Thread Steffen Klassert
From: Alexander Duyck 

Instead of modifying skb->mark we can simply modify the flowi_mark that is
generated as a result of the xfrm_decode_session.  By doing this we don't
need to actually touch the skb->mark and it can be preserved as it passes
out through the tunnel.

Signed-off-by: Alexander Duyck 
Signed-off-by: Steffen Klassert 
---
 net/ipv4/ip_vti.c  | 5 +++--
 net/ipv6/ip6_vti.c | 4 +++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 9f7269f..4c318e1 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -216,8 +216,6 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, 
struct net_device *dev)
 
memset(&fl, 0, sizeof(fl));
 
-   skb->mark = be32_to_cpu(tunnel->parms.o_key);
-
switch (skb->protocol) {
case htons(ETH_P_IP):
xfrm_decode_session(skb, &fl, AF_INET);
@@ -233,6 +231,9 @@ static netdev_tx_t vti_tunnel_xmit(struct sk_buff *skb, 
struct net_device *dev)
return NETDEV_TX_OK;
}
 
+   /* override mark with tunnel output key */
+   fl.flowi_mark = be32_to_cpu(tunnel->parms.o_key);
+
return vti_xmit(skb, dev, &fl);
 }
 
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index ed9d681..104de4d 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -495,7 +495,6 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
int ret;
 
memset(&fl, 0, sizeof(fl));
-   skb->mark = be32_to_cpu(t->parms.o_key);
 
switch (skb->protocol) {
case htons(ETH_P_IPV6):
@@ -516,6 +515,9 @@ vti6_tnl_xmit(struct sk_buff *skb, struct net_device *dev)
goto tx_err;
}
 
+   /* override mark with tunnel output key */
+   fl.flowi_mark = be32_to_cpu(t->parms.o_key);
+
ret = vti6_xmit(skb, dev, &fl);
if (ret < 0)
goto tx_err;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 7/7] ip_vti/ip6_vti: Preserve skb->mark after rcv_cb call

2015-05-28 Thread Steffen Klassert
From: Alexander Duyck 

The vti6_rcv_cb and vti_rcv_cb calls were leaving the skb->mark modified
after completing the function.  This resulted in the original skb->mark
value being lost.  Since we only need skb->mark to be set for
xfrm_policy_check we can pull the assignment into the rcv_cb calls and then
just restore the original mark after xfrm_policy_check has been completed.

Signed-off-by: Alexander Duyck 
Signed-off-by: Steffen Klassert 
---
 net/ipv4/ip_vti.c  | 9 +++--
 net/ipv6/ip6_vti.c | 9 +++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/ip_vti.c b/net/ipv4/ip_vti.c
index 4c318e1..0c15208 100644
--- a/net/ipv4/ip_vti.c
+++ b/net/ipv4/ip_vti.c
@@ -65,7 +65,6 @@ static int vti_input(struct sk_buff *skb, int nexthdr, __be32 
spi,
goto drop;
 
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4 = tunnel;
-   skb->mark = be32_to_cpu(tunnel->parms.i_key);
 
return xfrm_input(skb, nexthdr, spi, encap_type);
}
@@ -91,6 +90,8 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
struct ip_tunnel *tunnel = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4;
+   u32 orig_mark = skb->mark;
+   int ret;
 
if (!tunnel)
return 1;
@@ -107,7 +108,11 @@ static int vti_rcv_cb(struct sk_buff *skb, int err)
x = xfrm_input_state(skb);
family = x->inner_mode->afinfo->family;
 
-   if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+   skb->mark = be32_to_cpu(tunnel->parms.i_key);
+   ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
+   skb->mark = orig_mark;
+
+   if (!ret)
return -EPERM;
 
skb_scrub_packet(skb, !net_eq(tunnel->net, dev_net(skb->dev)));
diff --git a/net/ipv6/ip6_vti.c b/net/ipv6/ip6_vti.c
index 104de4d..ff3bd86 100644
--- a/net/ipv6/ip6_vti.c
+++ b/net/ipv6/ip6_vti.c
@@ -322,7 +322,6 @@ static int vti6_rcv(struct sk_buff *skb)
}
 
XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6 = t;
-   skb->mark = be32_to_cpu(t->parms.i_key);
 
rcu_read_unlock();
 
@@ -342,6 +341,8 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
struct pcpu_sw_netstats *tstats;
struct xfrm_state *x;
struct ip6_tnl *t = XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6;
+   u32 orig_mark = skb->mark;
+   int ret;
 
if (!t)
return 1;
@@ -358,7 +359,11 @@ static int vti6_rcv_cb(struct sk_buff *skb, int err)
x = xfrm_input_state(skb);
family = x->inner_mode->afinfo->family;
 
-   if (!xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family))
+   skb->mark = be32_to_cpu(t->parms.i_key);
+   ret = xfrm_policy_check(NULL, XFRM_POLICY_IN, skb, family);
+   skb->mark = orig_mark;
+
+   if (!ret)
return -EPERM;
 
skb_scrub_packet(skb, !net_eq(t->net, dev_net(skb->dev)));
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 4/7] xfrm: Always zero high-order sequence number bits

2015-05-28 Thread Steffen Klassert
From: Herbert Xu 

As we're now always including the high bits of the sequence number
in the IV generation process we need to ensure that they don't
contain crap.

This patch ensures that the high sequence bits are always zeroed
so that we don't leak random data into the IV.

Signed-off-by: Herbert Xu 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_replay.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/net/xfrm/xfrm_replay.c b/net/xfrm/xfrm_replay.c
index dab57da..4fd725a 100644
--- a/net/xfrm/xfrm_replay.c
+++ b/net/xfrm/xfrm_replay.c
@@ -99,6 +99,7 @@ static int xfrm_replay_overflow(struct xfrm_state *x, struct 
sk_buff *skb)
 
if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
XFRM_SKB_CB(skb)->seq.output.low = ++x->replay.oseq;
+   XFRM_SKB_CB(skb)->seq.output.hi = 0;
if (unlikely(x->replay.oseq == 0)) {
x->replay.oseq--;
xfrm_audit_state_replay_overflow(x, skb);
@@ -177,6 +178,7 @@ static int xfrm_replay_overflow_bmp(struct xfrm_state *x, 
struct sk_buff *skb)
 
if (x->type->flags & XFRM_TYPE_REPLAY_PROT) {
XFRM_SKB_CB(skb)->seq.output.low = ++replay_esn->oseq;
+   XFRM_SKB_CB(skb)->seq.output.hi = 0;
if (unlikely(replay_esn->oseq == 0)) {
replay_esn->oseq--;
xfrm_audit_state_replay_overflow(x, skb);
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 6/7] xfrm: Override skb->mark with tunnel->parm.i_key in xfrm_input

2015-05-28 Thread Steffen Klassert
From: Alexander Duyck 

This change makes it so that if a tunnel is defined we just use the mark
from the tunnel instead of the mark from the skb header.  By doing this we
can avoid the need to set skb->mark inside of the tunnel receive functions.

Signed-off-by: Alexander Duyck 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_input.c | 17 -
 1 file changed, 16 insertions(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index 526c4fe..b58286e 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -13,6 +13,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 
 static struct kmem_cache *secpath_cachep __read_mostly;
 
@@ -186,6 +188,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 
spi, int encap_type)
struct xfrm_state *x = NULL;
xfrm_address_t *daddr;
struct xfrm_mode *inner_mode;
+   u32 mark = skb->mark;
unsigned int family;
int decaps = 0;
int async = 0;
@@ -203,6 +206,18 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 
spi, int encap_type)
   XFRM_SPI_SKB_CB(skb)->daddroff);
family = XFRM_SPI_SKB_CB(skb)->family;
 
+   /* if tunnel is present override skb->mark value with tunnel i_key */
+   if (XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4) {
+   switch (family) {
+   case AF_INET:
+   mark = 
be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip4->parms.i_key);
+   break;
+   case AF_INET6:
+   mark = 
be32_to_cpu(XFRM_TUNNEL_SKB_CB(skb)->tunnel.ip6->parms.i_key);
+   break;
+   }
+   }
+
/* Allocate new secpath or COW existing one. */
if (!skb->sp || atomic_read(&skb->sp->refcnt) != 1) {
struct sec_path *sp;
@@ -229,7 +244,7 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 
spi, int encap_type)
goto drop;
}
 
-   x = xfrm_state_lookup(net, skb->mark, daddr, spi, nexthdr, 
family);
+   x = xfrm_state_lookup(net, mark, daddr, spi, nexthdr, family);
if (x == NULL) {
XFRM_INC_STATS(net, LINUX_MIB_XFRMINNOSTATES);
xfrm_audit_state_notfound(skb, family, spi, seq);
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/7] esp6: Use high-order sequence number bits for IV generation

2015-05-28 Thread Steffen Klassert
From: Herbert Xu 

I noticed we were only using the low-order bits for IV generation
when ESN is enabled.  This is very bad because it means that the
IV can repeat.  We must use the full 64 bits.

Signed-off-by: Herbert Xu 
Signed-off-by: Steffen Klassert 
---
 net/ipv6/esp6.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/esp6.c b/net/ipv6/esp6.c
index 31f1b5d..7c07ce3 100644
--- a/net/ipv6/esp6.c
+++ b/net/ipv6/esp6.c
@@ -248,7 +248,8 @@ static int esp6_output(struct xfrm_state *x, struct sk_buff 
*skb)
aead_givcrypt_set_crypt(req, sg, sg, clen, iv);
aead_givcrypt_set_assoc(req, asg, assoclen);
aead_givcrypt_set_giv(req, esph->enc_data,
- XFRM_SKB_CB(skb)->seq.output.low);
+ XFRM_SKB_CB(skb)->seq.output.low +
+ ((u64)XFRM_SKB_CB(skb)->seq.output.hi << 32));
 
ESP_SKB_CB(skb)->tmp = tmp;
err = crypto_aead_givencrypt(req);
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/7] xfrm: fix a race in xfrm_state_lookup_byspi

2015-05-28 Thread Steffen Klassert
From: Li RongQing 

The returned xfrm_state should be held before unlocking xfrm_state_lock,
otherwise the returned xfrm_state may be released.

Fixes: c454997e6[{pktgen, xfrm} Introduce xfrm_state_lookup_byspi..]
Cc: Fan Du 
Signed-off-by: Li RongQing 
Acked-by: Fan Du 
Signed-off-by: Steffen Klassert 
---
 net/xfrm/xfrm_state.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/xfrm/xfrm_state.c b/net/xfrm/xfrm_state.c
index f5e39e3..96688cd 100644
--- a/net/xfrm/xfrm_state.c
+++ b/net/xfrm/xfrm_state.c
@@ -927,8 +927,8 @@ struct xfrm_state *xfrm_state_lookup_byspi(struct net *net, 
__be32 spi,
x->id.spi != spi)
continue;
 
-   spin_unlock_bh(&net->xfrm.xfrm_state_lock);
xfrm_state_hold(x);
+   spin_unlock_bh(&net->xfrm.xfrm_state_lock);
return x;
}
spin_unlock_bh(&net->xfrm.xfrm_state_lock);
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kmalloc panic

2015-05-28 Thread pavani

Hi Cong ,

Thanks for the response.

Where do we need to fix the bug? I mean in the driver, in the kernel source
code, or at the hardware level?


Is there any possible way to fix this issue in the driver?

Please reply as soon as possible.

Thanks
pavani




On 05/28/2015 10:45 AM, Cong Wang wrote:

(Cc'ing netdev and wireless... Looks like a bug in wireless ext.)

On Wed, May 27, 2015 at 6:46 AM, pavani
 wrote:

Hi,

I connected to an AP with the help of wpa_supplicant in Linux. After connecting
to the AP I am facing an issue like "kmalloc panic". Can you help me
solve this issue? The logs are as follows:






CPU: ARM926EJ-S [41069265] revision 5 (ARMv5TEJ), cr=00053177
CPU: VIVT data cache, VIVT instruction cache
Machine: SpaceCom-Lite
Memory policy: ECC disabled, Data cache writeback
On node 0 totalpages: 16384
free_area_init_node: node 0, pgdat c03b7ff8, node_mem_map c03e9000
   Normal zone: 128 pages used for memmap
   Normal zone: 0 pages reserved
   Normal zone: 16256 pages, LIFO batch:3
Built 1 zonelists in Zone order, mobility grouping on.  Total pages: 16256
Kernel command line: console=ttyNS1,115200 root=/dev/mtdblock2
rootfstype=jffs2 quiet ro
PID hash table entries: 256 (order: -2, 1024 bytes)
Dentry cache hash table entries: 8192 (order: 3, 32768 bytes)
Inode-cache hash table entries: 4096 (order: 2, 16384 bytes)
Memory: 64MB = 64MB total
Memory: 60964k/60964k available, 4572k reserved, 0K highmem
Virtual kernel memory layout:
 vector  : 0x - 0x1000   (   4 kB)
 fixmap  : 0xfff0 - 0xfffe   ( 896 kB)
 DMA : 0xffc0 - 0xffe0   (   2 MB)
 vmalloc : 0xc480 - 0xf000   ( 696 MB)
 lowmem  : 0xc000 - 0xc400   (  64 MB)
 modules : 0xbf00 - 0xc000   (  16 MB)
   .init : 0xc0008000 - 0xc00e5000   ( 884 kB)
   .text : 0xc00e5000 - 0xc0387000   (2696 kB)
   .data : 0xc039a000 - 0xc03b8600   ( 122 kB)
SLUB: Genslabs=11, HWalign=32, Order=0-3, MinObjects=0, CPUs=1, Nodes=1
Hierarchical RCU implementation.
RCU-based detection of stalled CPUs is enabled.
NR_IRQS:82
Console: colour dummy device 80x30
Calibrating delay loop... 74.13 BogoMIPS (lpj=370688)
Mount-cache hash table entries: 512
CPU: Testing write buffer coherency: ok
Synthetic TSC timer will fire each 131104 jiffies.
NET: Registered protocol family 16
bio: create slab  at 0
Switching to clocksource ns921x-timer0
Switched to NOHz mode on CPU #0
NET: Registered protocol family 2
IP route cache hash table entries: 1024 (order: 0, 4096 bytes)
TCP established hash table entries: 2048 (order: 2, 16384 bytes)
TCP bind hash table entries: 2048 (order: 1, 8192 bytes)
TCP: Hash tables configured (established 2048 bind 2048)
TCP reno registered
UDP hash table entries: 256 (order: 0, 4096 bytes)
UDP-Lite hash table entries: 256 (order: 0, 4096 bytes)
NET: Registered protocol family 1
JFFS2 version 2.2. (NAND) © 2001-2006 Red Hat, Inc.
msgmni has been set to 119
alg: No test for stdrng (krng)
Block layer SCSI generic (bsg) driver version 0.4 loaded (major 253)
io scheduler noop registered
io scheduler deadline registered
io scheduler cfq registered (default)
ns921x-serial.1: ttyNS1 at MMIO 0x90018000 (irq = 8) is a NS921X
console [ttyNS1] enabled
ns921x-serial.2: ttyNS2 at MMIO 0x9002 (irq = 9) is a NS921X
ns921x-serial.3: ttyNS3 at MMIO 0x90028000 (irq = 10) is a NS921X
Digi NS921x UART driver
physmap platform flash device: 1000 at 5000
physmap-flash.0: Found 1 x16 devices at 0x0 in 16-bit bank
physmap-flash.0: Found an alias at 0x200 for the chip at 0x0
physmap-flash.0: Found an alias at 0x400 for the chip at 0x0
physmap-flash.0: Found an alias at 0x600 for the chip at 0x0
physmap-flash.0: Found an alias at 0x800 for the chip at 0x0
physmap-flash.0: Found an alias at 0xa00 for the chip at 0x0
physmap-flash.0: Found an alias at 0xc00 for the chip at 0x0
physmap-flash.0: Found an alias at 0xe00 for the chip at 0x0
  Amd/Fujitsu Extended Query Table at 0x0040
physmap-flash.0: CFI does not contain boot bank location. Assuming top.
number of CFI chips: 1
RedBoot partition parsing not available
Using physmap partition information
Creating 4 MTD partitions on "physmap-flash.0":
0x-0x0008 : "Bootloader"
0x0008-0x0088 : "Fallback"
0x0088-0x01e8 : "Normal"
0x01e8-0x0200 : "Data"
serialcan: serial line CAN interface driver
serialcan: 1 dynamic interface channels.
Digi NS9XXX Ethernet driver
rtc-ns9xxx rtc-ns9xxx.0: rtc core: registered rtc-ns9xxx as rtc0
ns9xxx-wdt ns9xxx-wdt: NS9xxx watchdog timer at 0xc48aa174
fims: Starting to register the FIMs module.
FIMs driver v0.2
fim-sdio: FIM SDIO driver v0.4
TCP cubic registered
NET: Registered protocol family 10
IPv6 over IPv4 tunneling driver
NET: Registered protocol family 17
can: controller area network core (rev 20090105 abi 8)
NET: Registered protocol family 29
can: raw protocol (rev 20090105)
determining board revision
board_rev = 0x0
kmemle

RE: [PATCH v4] bnx2x: Alloc 4k fragment for each rx ring buffer element

2015-05-28 Thread Yuval Mintz
>> +struct bnx2x_alloc_pool {
>> + struct page *page;
>> + dma_addr_t  dma;
>> + u8  offset;
>> + u8  frag_count;
>> +};
>...
>>  static int bnx2x_alloc_rx_sge(struct bnx2x *bp, struct bnx2x_fastpath *fp,
>> u16 index, gfp_t gfp_mask)
>>  {
>...
>> + pool->offset += SGE_PAGE_SIZE;
>> + pool->frag_count--;
>> +
>>   return 0;
>>  }

> One SGE_PAGE_SIZE is already bigger than representable by u8, so offset
> will overflow.

Thanks for the catch Michal.

Actually, this upsets me greatly. We didn't see it on a system with 4KB
pages, but this means you've actually tried to 'sell' us a fastpath fix that
was never tested on machines for which it was meant as an improvement.

Dave - if possible, please wait with accepting any further fixes for this
issue until we [qlogic] manage to prepare a test environment
where we can properly test this with 64KB page size architecture.

Thanks,
Yuval
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] xfrm6: Do not use xfrm_local_error for path MTU issues in tunnels

2015-05-28 Thread Alexander Duyck

On 05/27/2015 10:36 PM, Steffen Klassert wrote:

On Wed, May 27, 2015 at 10:40:32AM -0700, Alexander Duyck wrote:

This change makes it so that we use icmpv6_send to report PMTU issues back
into tunnels in the case that the resulting packet is larger than the MTU
of the outgoing interface.  Previously xfrm_local_error was being used in
this case, however this was resulting in no changes, I suspect due to the
fact that the tunnel itself was being kept out of the loop.

This patch fixes PMTU problems seen on ip6_vti tunnels and is based on the
behavior seen if the socket was orphaned.  Instead of requiring the socket
to be orphaned this patch simply defaults to using icmpv6_send in the case
that the frame came through a tunnel.

We can use icmpv6_send() just in the case that the packet
was already transmitted by a tunnel device, otherwise we
get the bug back that I mentioned in my other mail.

Not sure if we have something to know that the packet
traversed a tunnel device. That's what I asked in the
thread 'Looking for a lost patch'.


Okay I will try to do some more digging.  From what I can tell right now 
it looks like my ping attempts are getting hung up on the 
xfrm_local_error in __xfrm6_output.  I wonder if we couldn't somehow 
make use of the skb->cb to store a pointer to the tunnel that could be 
checked to determine if we are going through a VTI or not.


- Alex
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/1] hv_netvsc: Properly size the vrss queues

2015-05-28 Thread Dan Carpenter
Since you're redoing this anyway.

On Tue, May 26, 2015 at 04:21:09PM -0700, K. Y. Srinivasan wrote:
> diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
> index ddcc7f8..dd45440 100644
> --- a/drivers/net/hyperv/hyperv_net.h
> +++ b/drivers/net/hyperv/hyperv_net.h
> @@ -161,6 +161,7 @@ struct netvsc_device_info {
>   unsigned char mac_adr[ETH_ALEN];
>   bool link_state;/* 0 - link up, 1 - link down */
>   int  ring_size;
> + u32  max_num_vrss_chns;

We (Joe and I) have commented before that long names don't mix well with
the 80 character limit.  You could just leave the "num_" out.  Almost
all variables are numbers in C so it doesn't add anything.

regards,
dan carpenter
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


<    1   2   3