[PATCH v8 0/3] Introduce VF trust capability and xcast_mode in VF

2015-08-28 Thread Hiroshi Shimamoto
From: Hiroshi Shimamoto 

There is a limitation in the number of multicast L2 addresses in ixgbe
and ixgbevf driver. The number of multicast addresses in VF is 30 in the
current implementation. That means that we can use up to 30 IPv6
addresses only. On the other hand there is a functionality to set VF
multicast promiscuous mode in the NIC.

This patchset addresses the issue.

First, it introduces a VF trust capability. Features such as VF multicast
promiscuous mode may hurt security and performance, so we would like to
enable such functionality only on trusted VFs.
Next, it introduces the VF xcast_mode, which represents the multicast mode
of the VF and is requested from the PF. If ALLMULTI is set on the VF network
device, the VF requests multicast promiscuous mode from the PF. If the VF is
trusted, the PF enables VF multicast promiscuous mode.

Short history
v5->v6
Reorganize patchsets, make it with VF trust and MC promisc mode.

v6->v7
Change to introduce xcast_mode instead of dedicated VF multicast
promisc mode API.

v7->v8
Fix to use EOPNOTSUPP in ixgbe_update_vf_xcast_mode() on error,
instead of -1.

Hiroshi Shimamoto (3):
  if_link: Add control trust VF
  ixgbe: Add new ndo to trust VF
  ixgbe, ixgbevf: Add new mbox API xcast mode

 drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  8 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h  |  2 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c| 96 +++
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h|  1 +
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  6 ++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  8 ++
 drivers/net/ethernet/intel/ixgbevf/mbx.h  |  2 +
 drivers/net/ethernet/intel/ixgbevf/vf.c   | 41 ++
 drivers/net/ethernet/intel/ixgbevf/vf.h   |  1 +
 include/linux/if_link.h   |  1 +
 include/linux/netdevice.h |  3 +
 include/uapi/linux/if_link.h  |  6 ++
 net/core/rtnetlink.c  | 24 +-
 14 files changed, 197 insertions(+), 3 deletions(-)

-- 
1.8.3.1



[PATCH v8 1/3] if_link: Add control trust VF

2015-08-28 Thread Hiroshi Shimamoto
From: Hiroshi Shimamoto 

Add netlink directives and an ndo entry to trust a VF user.

This controls the special permission of the VF user.
The administrator explicitly trusts a VF user to use some features
which impact security and/or performance.

The administrator should never turn it on unless the VF user is fully trusted.

Signed-off-by: Hiroshi Shimamoto 
CC: Choi, Sy Jong 
---
 include/linux/if_link.h  |  1 +
 include/linux/netdevice.h|  3 +++
 include/uapi/linux/if_link.h |  6 ++
 net/core/rtnetlink.c | 24 +---
 4 files changed, 31 insertions(+), 3 deletions(-)

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index ae5d0d2..f923d15 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -24,5 +24,6 @@ struct ifla_vf_info {
__u32 min_tx_rate;
__u32 max_tx_rate;
__u32 rss_query_en;
+   __u32 trusted;
 };
 #endif /* _LINUX_IF_LINK_H */
diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 6163ecb..7db19e7 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -880,6 +880,7 @@ typedef u16 (*select_queue_fallback_t)(struct net_device 
*dev,
  * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int min_tx_rate,
  *   int max_tx_rate);
  * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool setting);
+ * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool setting);
  * int (*ndo_get_vf_config)(struct net_device *dev,
  * int vf, struct ifla_vf_info *ivf);
  * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int 
link_state);
@@ -1121,6 +1122,8 @@ struct net_device_ops {
   int max_tx_rate);
int (*ndo_set_vf_spoofchk)(struct net_device *dev,
   int vf, bool setting);
+   int (*ndo_set_vf_trust)(struct net_device *dev,
+   int vf, bool setting);
int (*ndo_get_vf_config)(struct net_device *dev,
 int vf,
 struct ifla_vf_info *ivf);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 313c305..2d6abd4 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -498,6 +498,7 @@ enum {
 * on/off switch
 */
IFLA_VF_STATS,  /* network device statistics */
+   IFLA_VF_TRUST,  /* Trust VF */
__IFLA_VF_MAX,
 };
 
@@ -559,6 +560,11 @@ enum {
 
 #define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1)
 
+struct ifla_vf_trust {
+   __u32 vf;
+   __u32 setting;
+};
+
 /* VF ports management section
  *
  * Nested layout of set/get msg is:
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 788ceed..2836bf1 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -831,7 +831,8 @@ static inline int rtnl_vfinfo_size(const struct net_device 
*dev,
 /* IFLA_VF_STATS_BROADCAST */
 nla_total_size(sizeof(__u64)) +
 /* IFLA_VF_STATS_MULTICAST */
-nla_total_size(sizeof(__u64)));
+nla_total_size(sizeof(__u64)) +
+nla_total_size(sizeof(struct ifla_vf_trust)));
return size;
} else
return 0;
@@ -1154,6 +1155,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct 
net_device *dev,
struct ifla_vf_link_state vf_linkstate;
struct ifla_vf_rss_query_en vf_rss_query_en;
struct ifla_vf_stats vf_stats;
+   struct ifla_vf_trust vf_trust;
 
/*
 * Not all SR-IOV capable drivers support the
@@ -1163,6 +1165,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct 
net_device *dev,
 */
ivi.spoofchk = -1;
ivi.rss_query_en = -1;
+   ivi.trusted = -1;
memset(ivi.mac, 0, sizeof(ivi.mac));
/* The default value for VF link state is "auto"
 * IFLA_VF_LINK_STATE_AUTO which equals zero
@@ -1176,7 +1179,8 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct 
net_device *dev,
vf_tx_rate.vf =
vf_spoofchk.vf =
vf_linkstate.vf =
-   vf_rss_query_en.vf = ivi.vf;
+   vf_rss_query_en.vf =
+   vf_trust.vf = ivi.vf;
 
memcpy(vf_mac.mac, ivi.mac, sizeof(ivi.mac));
   

[PATCH v8 2/3] ixgbe: Add new ndo to trust VF

2015-08-28 Thread Hiroshi Shimamoto
From: Hiroshi Shimamoto 

Implements the new netdev op to trust VF in ixgbe.

The administrator can turn VF trust on and off with an ip command which
supports the trust message.
 # ip link set dev eth0 vf 1 trust on
or
 # ip link set dev eth0 vf 1 trust off

Send a ping to reset VF on changing the status of trusting.
VF driver will reconfigure its features on reset.

Signed-off-by: Hiroshi Shimamoto 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h   |  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c  |  1 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c | 37 ++
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h |  1 +
 4 files changed, 40 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index 3b9b911..f147a5a 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -152,6 +152,7 @@ struct vf_data_storage {
u16 vlan_count;
u8 spoofchk_enabled;
bool rss_query_enabled;
+   u8 trusted;
unsigned int vf_api;
 };
 
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index 626ed01..914c1b0 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -8399,6 +8399,7 @@ static const struct net_device_ops ixgbe_netdev_ops = {
.ndo_set_vf_rate= ixgbe_ndo_set_vf_bw,
.ndo_set_vf_spoofchk= ixgbe_ndo_set_vf_spoofchk,
.ndo_set_vf_rss_query_en = ixgbe_ndo_set_vf_rss_query_en,
+   .ndo_set_vf_trust   = ixgbe_ndo_set_vf_trust,
.ndo_get_vf_config  = ixgbe_ndo_get_vf_config,
.ndo_get_stats64= ixgbe_get_stats64,
 #ifdef CONFIG_IXGBE_DCB
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 1d17b58..65aeb58 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -116,6 +116,9 @@ static int __ixgbe_enable_sriov(struct ixgbe_adapter 
*adapter)
 * we want to disable the querying by default.
 */
adapter->vfinfo[i].rss_query_enabled = 0;
+
+   /* Untrust all VFs */
+   adapter->vfinfo[i].trusted = false;
}
 
return 0;
@@ -1124,6 +1127,17 @@ void ixgbe_disable_tx_rx(struct ixgbe_adapter *adapter)
IXGBE_WRITE_REG(hw, IXGBE_VFRE(1), 0);
 }
 
+static inline void ixgbe_ping_vf(struct ixgbe_adapter *adapter, int vf)
+{
+   struct ixgbe_hw *hw = &adapter->hw;
+   u32 ping;
+
+   ping = IXGBE_PF_CONTROL_MSG;
+   if (adapter->vfinfo[vf].clear_to_send)
+   ping |= IXGBE_VT_MSGTYPE_CTS;
+   ixgbe_write_mbx(hw, &ping, 1, vf);
+}
+
 void ixgbe_ping_all_vfs(struct ixgbe_adapter *adapter)
 {
struct ixgbe_hw *hw = &adapter->hw;
@@ -1416,6 +1430,28 @@ int ixgbe_ndo_set_vf_rss_query_en(struct net_device 
*netdev, int vf,
return 0;
 }
 
+int ixgbe_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting)
+{
+   struct ixgbe_adapter *adapter = netdev_priv(netdev);
+
+   if (vf >= adapter->num_vfs)
+   return -EINVAL;
+
+   /* nothing to do */
+   if (adapter->vfinfo[vf].trusted == setting)
+   return 0;
+
+   adapter->vfinfo[vf].trusted = setting;
+
+   /* reset VF to reconfigure features */
+   adapter->vfinfo[vf].clear_to_send = false;
+   ixgbe_ping_vf(adapter, vf);
+
+   e_info(drv, "VF %u is %strusted\n", vf, setting ? "" : "not ");
+
+   return 0;
+}
+
 int ixgbe_ndo_get_vf_config(struct net_device *netdev,
int vf, struct ifla_vf_info *ivi)
 {
@@ -1430,5 +1466,6 @@ int ixgbe_ndo_get_vf_config(struct net_device *netdev,
ivi->qos = adapter->vfinfo[vf].pf_qos;
ivi->spoofchk = adapter->vfinfo[vf].spoofchk_enabled;
ivi->rss_query_en = adapter->vfinfo[vf].rss_query_enabled;
+   ivi->trusted = adapter->vfinfo[vf].trusted;
return 0;
 }
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
index 2c197e6..dad9257 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.h
@@ -49,6 +49,7 @@ int ixgbe_ndo_set_vf_bw(struct net_device *netdev, int vf, 
int min_tx_rate,
 int ixgbe_ndo_set_vf_spoofchk(struct net_device *netdev, int vf, bool setting);
 int ixgbe_ndo_set_vf_rss_query_en(struct net_device *netdev, int vf,
  bool setting);
+int ixgbe_ndo_set_vf_trust(struct net_device *netdev, int vf, bool setting);
 int ixgbe_ndo_get_vf_config(struct net_device *netdev,
int vf, struct ifla_vf_info *ivi);
 void ixgbe_check_vf_rate_limit(struct ixgbe_adapter *adapter);
-- 
1.8.3.1



Re: [PATCH net] netlink: rx mmap: fix POLLIN condition

2015-08-28 Thread Ken-ichirou MATSUZAWA
 Thank you for the reply.
 
On Tue, Aug 25, 2015 at 08:17:12PM -0700, David Miller wrote:
> So if netlink_forward_ring() _actually_ sees an entry that we should
> advance past, it will cycle through the whole ring, advancing ring->head
> until it equals the "ring->head != head" loop test fails.
> 
> We should definitely fix this bug first.

I should have realized it, sorry. I think the following patch will
fix it, would you review it?

> As per your patch, I wonder if a backwards scan would be faster.

I think so, thanks. I will resend it after netlink_forward_ring()
fix is applied.

Thanks,
Ken
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v8 3/3] ixgbe, ixgbevf: Add new mbox API xcast mode

2015-08-28 Thread Hiroshi Shimamoto
From: Hiroshi Shimamoto 

The limit on the number of multicast addresses for a VF is too low
for large scale servers with the SR-IOV feature. IPv6 requires a multicast
MAC address for each IP address to handle the Neighbor Solicitation
message. We couldn't assign over 30 IPv6 addresses to a single VF.

This patch introduces the new mailbox API, IXGBE_VF_UPDATE_XCAST_MODE,
to update the multicast mode of a VF. This adds 3 modes:
  - NONE     only L2 exact match addresses or Flow Director enabled
  - MULTI    BAM and ROMPE set
  - ALLMULTI BAM, ROMPE and MPE set

If a guest VF user wants over 30 MAC multicast addresses, set IFF_ALLMULTI
to request PF to update xcast mode to enable VF multicast promiscuous mode.

On the other hand, enabling VF multicast promiscuous mode may affect
security and performance in the network of the NIC. Only trusted VF can
enable multicast promiscuous mode. The behavior of untrusted VF is the
same as previous version.

Signed-off-by: Hiroshi Shimamoto 
---
 drivers/net/ethernet/intel/ixgbe/ixgbe.h  |  7 +++
 drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h  |  2 +
 drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c| 59 +++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf.h  |  6 +++
 drivers/net/ethernet/intel/ixgbevf/ixgbevf_main.c |  8 +++
 drivers/net/ethernet/intel/ixgbevf/mbx.h  |  2 +
 drivers/net/ethernet/intel/ixgbevf/vf.c   | 41 
 drivers/net/ethernet/intel/ixgbevf/vf.h   |  1 +
 8 files changed, 126 insertions(+)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
index f147a5a..838284c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe.h
@@ -153,9 +153,16 @@ struct vf_data_storage {
u8 spoofchk_enabled;
bool rss_query_enabled;
u8 trusted;
+   int xcast_mode;
unsigned int vf_api;
 };
 
+enum ixgbevf_xcast_modes {
+   IXGBEVF_XCAST_MODE_NONE = 0,
+   IXGBEVF_XCAST_MODE_MULTI,
+   IXGBEVF_XCAST_MODE_ALLMULTI,
+};
+
 struct vf_macvlans {
struct list_head l;
int vf;
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
index b1e4703..8daa95f 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_mbx.h
@@ -102,6 +102,8 @@ enum ixgbe_pfvf_api_rev {
 #define IXGBE_VF_GET_RETA  0x0a/* VF request for RETA */
 #define IXGBE_VF_GET_RSS_KEY   0x0b/* get RSS key */
 
+#define IXGBE_VF_UPDATE_XCAST_MODE 0x0c
+
 /* length of permanent address message returned from PF */
 #define IXGBE_VF_PERMADDR_MSG_LEN 4
 /* word in permanent address message with the current multicast type */
diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
index 65aeb58..fcd8b27 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_sriov.c
@@ -119,6 +119,9 @@ static int __ixgbe_enable_sriov(struct ixgbe_adapter 
*adapter)
 
/* Untrust all VFs */
adapter->vfinfo[i].trusted = false;
+
+   /* set the default xcast mode */
+   adapter->vfinfo[i].xcast_mode = IXGBEVF_XCAST_MODE_NONE;
}
 
return 0;
@@ -1004,6 +1007,59 @@ static int ixgbe_get_vf_rss_key(struct ixgbe_adapter 
*adapter,
return 0;
 }
 
+static int ixgbe_update_vf_xcast_mode(struct ixgbe_adapter *adapter,
+ u32 *msgbuf, u32 vf)
+{
+   struct ixgbe_hw *hw = &adapter->hw;
+   int xcast_mode = msgbuf[1];
+   u32 vmolr, disable, enable;
+
+   /* verify the PF is supporting the correct APIs */
+   switch (adapter->vfinfo[vf].vf_api) {
+   case ixgbe_mbox_api_12:
+   break;
+   default:
+   return -EOPNOTSUPP;
+   }
+
+   if (xcast_mode > IXGBEVF_XCAST_MODE_MULTI &&
+   !adapter->vfinfo[vf].trusted) {
+   xcast_mode = IXGBEVF_XCAST_MODE_MULTI;
+   }
+
+   if (adapter->vfinfo[vf].xcast_mode == xcast_mode)
+   goto out;
+
+   switch (xcast_mode) {
+   case IXGBEVF_XCAST_MODE_NONE:
+   disable = IXGBE_VMOLR_BAM | IXGBE_VMOLR_ROMPE | IXGBE_VMOLR_MPE;
+   enable = 0;
+   break;
+   case IXGBEVF_XCAST_MODE_MULTI:
+   disable = IXGBE_VMOLR_MPE;
+   enable = IXGBE_VMOLR_BAM | IXGBE_VMOLR_ROMPE;
+   break;
+   case IXGBEVF_XCAST_MODE_ALLMULTI:
+   disable = 0;
+   enable = IXGBE_VMOLR_BAM | IXGBE_VMOLR_ROMPE | IXGBE_VMOLR_MPE;
+   break;
+   default:
+   return -EOPNOTSUPP;
+   }
+
+   vmolr = IXGBE_READ_REG(hw, IXGBE_VMOLR(vf));
+   vmolr &= ~disable;
+   vmolr |= enable;
+   IXGBE_WRITE_REG(hw, IXGBE_VMOLR(vf), v

[PATCH net] netlink: mmap: fix lookup frame position

2015-08-28 Thread Ken-ichirou MATSUZAWA
__netlink_lookup_frame() was always called with the same "pos"
value in netlink_forward_ring(). It looked at the same ring entry
header over and over again, every time through this loop, and then cycled
through the whole ring, advancing ring->head, not "pos", until the
"ring->head != head" loop test failed.

Signed-off-by: Ken-ichirou MATSUZAWA 
---
 net/netlink/af_netlink.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index a774985..39fa91f 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -610,11 +610,11 @@ static void netlink_increment_head(struct netlink_ring 
*ring)
 
 static void netlink_forward_ring(struct netlink_ring *ring)
 {
-   unsigned int head = ring->head, pos = head;
+   unsigned int head = ring->head;
const struct nl_mmap_hdr *hdr;
 
do {
-   hdr = __netlink_lookup_frame(ring, pos);
+   hdr = __netlink_lookup_frame(ring, ring->head);
if (hdr->nm_status == NL_MMAP_STATUS_UNUSED)
break;
if (hdr->nm_status != NL_MMAP_STATUS_SKIP)
-- 
1.7.10.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v5 00/11] ipv6: Only create RTF_CACHE route after encountering pmtu exception

2015-08-28 Thread Martin KaFai Lau
On Mon, Aug 17, 2015 at 11:43:20AM +0200, Alexander Holler wrote:
> That's why I vote to check out if it's possible/reasonable to backport this
> series to the stable kernels.
I have backported to 4.0.y without major issue, so possible.

I did try on 3.1x and gave up.

It is a lot of changes,  so I don't think it is a good idea for -stable.

Thanks,
Martin
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] netlink: mmap: fix status setting in skb destructor

2015-08-28 Thread Ken-ichirou MATSUZAWA
On Tue, Aug 25, 2015 at 08:22:03PM -0700, David Miller wrote:
> From: Ken-ichirou MATSUZAWA 
> > I don't know the intention of setting VALID status in the skb
> > destructor. But I think it needs to be set to UNUSED status in case of
> 
> I think the idea is to have the user process this "zero length" frame
> and advance the status itself.
> 
> I think it is probably racy and problematic to have the kernel set a
> frame's state to UNUSED.  It is not a valid state transition for the
> kernel side of RX ring processing.
> 
> Only the user can safely release ring entries back to the kernel.

I will just update the frame status to UNUSED and advance ring
position in user space in case of nm_len == 0. Thank you.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] usbnet: Fix a race between usbnet_stop() and the BH

2015-08-28 Thread Eugene Shatokhin

25.08.2015 00:01, Bjørn Mork пишет:

Eugene Shatokhin  writes:


The race may happen when a device (e.g. YOTA 4G LTE Modem) is
unplugged while the system is downloading a large file from the Net.

Hardware breakpoints and Kprobes with delays were used to confirm that
the race does actually happen.

The race is on skb_queue ('next' pointer) between usbnet_stop()
and rx_complete(), which, in turn, calls usbnet_bh().

Here is a part of the call stack with the code where the changes to the
queue happen. The line numbers are for the kernel 4.1.0:

*0 __skb_unlink (skbuff.h:1517)
 prev->next = next;
*1 defer_bh (usbnet.c:430)
 spin_lock_irqsave(&list->lock, flags);
 old_state = entry->state;
 entry->state = state;
 __skb_unlink(skb, list);
 spin_unlock(&list->lock);
 spin_lock(&dev->done.lock);
 __skb_queue_tail(&dev->done, skb);
 if (dev->done.qlen == 1)
 tasklet_schedule(&dev->bh);
 spin_unlock_irqrestore(&dev->done.lock, flags);
*2 rx_complete (usbnet.c:640)
 state = defer_bh(dev, skb, &dev->rxq, state);

At the same time, the following code repeatedly checks if the queue is
empty and reads these values concurrently with the above changes:

*0  usbnet_terminate_urbs (usbnet.c:765)
 /* maybe wait for deletions to finish. */
 while (!skb_queue_empty(&dev->rxq)
 && !skb_queue_empty(&dev->txq)
 && !skb_queue_empty(&dev->done)) {
 schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
 set_current_state(TASK_UNINTERRUPTIBLE);
 netif_dbg(dev, ifdown, dev->net,
   "waited for %d urb completions\n", temp);
 }
*1  usbnet_stop (usbnet.c:806)
 if (!(info->flags & FLAG_AVOID_UNLINK_URBS))
 usbnet_terminate_urbs(dev);

As a result, it is possible, for example, that the skb is removed from
dev->rxq by __skb_unlink() before the check
"!skb_queue_empty(&dev->rxq)" in usbnet_terminate_urbs() is made. It is
also possible in this case that the skb is added to dev->done queue
after "!skb_queue_empty(&dev->done)" is checked. So
usbnet_terminate_urbs() may stop waiting and return while dev->done
queue still has an item.


Exactly what problem will that result in?  The tasklet_kill() will wait
for the processing of the single element done queue, and everything will
be fine.  Or?


Given enough time, what prevents defer_bh() from calling 
tasklet_schedule(&dev->bh) *after* usbnet_stop() calls tasklet_kill()?


Consider the following situation (assuming '&&' are changed to '||' in 
that while loop in usbnet_terminate_urbs() as they should be):


CPU0                            CPU1
usbnet_stop()                   defer_bh() with list == dev->rxq
  usbnet_terminate_urbs()
                                __skb_unlink() removes the last
                                skb from dev->rxq.
                                dev->rxq, dev->txq and dev->done
                                are now empty.
  while (!skb_queue_empty()...)
    The loop ends because all 3
    queues are now empty.

  usbnet_terminate_urbs() ends.

usbnet_stop() continues:
  usbnet_status_stop(dev);
  ...
  del_timer_sync (&dev->delay);
  tasklet_kill (&dev->bh);
                                __skb_queue_tail(&dev->done, skb);
                                if (dev->done.qlen == 1)
                                  tasklet_schedule(&dev->bh);

The BH is scheduled at this point, which is not what was intended. The 
race window is small, but still.


Regards,
Eugene

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] route: fix breakage after moving lwtunnel state

2015-08-28 Thread Jiri Benc
On Thu, 27 Aug 2015 00:13:30 +0200, Thomas Graf wrote:
> Did you test with a card that features UDP encapsulation offloads?

No, I did not. I do have access to NICs supporting it but I think in
this case, the numbers without vxlan offloading were more interesting.

 Jiri

-- 
Jiri Benc
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] openswitch: fix typo CONFIG_NF_CONNTRACK_LABEL

2015-08-28 Thread Valentin Rothberg
Fix typo in conntrack.c
s/CONFIG_NF_CONNTRACK_LABEL/CONFIG_NF_CONNTRACK_LABELS/

Signed-off-by: Valentin Rothberg 
---
The typo was added by commit c2ac66735870 ("openvswitch: Allow matching
on conntrack label").
I detected the issue with scripts/checkkconfigsymbols.py


 net/openvswitch/conntrack.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/openvswitch/conntrack.c b/net/openvswitch/conntrack.c
index 890d3eedb447..886bd2758502 100644
--- a/net/openvswitch/conntrack.c
+++ b/net/openvswitch/conntrack.c
@@ -169,7 +169,7 @@ int ovs_ct_put_key(const struct sw_flow_key *key, struct 
sk_buff *skb)
nla_put_u32(skb, OVS_KEY_ATTR_CT_MARK, key->ct.mark))
return -EMSGSIZE;
 
-   if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABEL) &&
+   if (IS_ENABLED(CONFIG_NF_CONNTRACK_LABELS) &&
nla_put(skb, OVS_KEY_ATTR_CT_LABEL, sizeof(key->ct.label),
&key->ct.label))
return -EMSGSIZE;
-- 
1.9.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] usbnet: Fix a race between usbnet_stop() and the BH

2015-08-28 Thread Bjørn Mork
Eugene Shatokhin  writes:

> 25.08.2015 00:01, Bjørn Mork пишет:
>> Eugene Shatokhin  writes:
>>
>>> The race may happen when a device (e.g. YOTA 4G LTE Modem) is
>>> unplugged while the system is downloading a large file from the Net.
>>>
>>> Hardware breakpoints and Kprobes with delays were used to confirm that
>>> the race does actually happen.
>>>
>>> The race is on skb_queue ('next' pointer) between usbnet_stop()
>>> and rx_complete(), which, in turn, calls usbnet_bh().
>>>
>>> Here is a part of the call stack with the code where the changes to the
>>> queue happen. The line numbers are for the kernel 4.1.0:
>>>
>>> *0 __skb_unlink (skbuff.h:1517)
>>>  prev->next = next;
>>> *1 defer_bh (usbnet.c:430)
>>>  spin_lock_irqsave(&list->lock, flags);
>>>  old_state = entry->state;
>>>  entry->state = state;
>>>  __skb_unlink(skb, list);
>>>  spin_unlock(&list->lock);
>>>  spin_lock(&dev->done.lock);
>>>  __skb_queue_tail(&dev->done, skb);
>>>  if (dev->done.qlen == 1)
>>>  tasklet_schedule(&dev->bh);
>>>  spin_unlock_irqrestore(&dev->done.lock, flags);
>>> *2 rx_complete (usbnet.c:640)
>>>  state = defer_bh(dev, skb, &dev->rxq, state);
>>>
>>> At the same time, the following code repeatedly checks if the queue is
>>> empty and reads these values concurrently with the above changes:
>>>
>>> *0  usbnet_terminate_urbs (usbnet.c:765)
>>>  /* maybe wait for deletions to finish. */
>>>  while (!skb_queue_empty(&dev->rxq)
>>>  && !skb_queue_empty(&dev->txq)
>>>  && !skb_queue_empty(&dev->done)) {
>>>  schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
>>>  set_current_state(TASK_UNINTERRUPTIBLE);
>>>  netif_dbg(dev, ifdown, dev->net,
>>>"waited for %d urb completions\n", temp);
>>>  }
>>> *1  usbnet_stop (usbnet.c:806)
>>>  if (!(info->flags & FLAG_AVOID_UNLINK_URBS))
>>>  usbnet_terminate_urbs(dev);
>>>
>>> As a result, it is possible, for example, that the skb is removed from
>>> dev->rxq by __skb_unlink() before the check
>>> "!skb_queue_empty(&dev->rxq)" in usbnet_terminate_urbs() is made. It is
>>> also possible in this case that the skb is added to dev->done queue
>>> after "!skb_queue_empty(&dev->done)" is checked. So
>>> usbnet_terminate_urbs() may stop waiting and return while dev->done
>>> queue still has an item.
>>
>> Exactly what problem will that result in?  The tasklet_kill() will wait
>> for the processing of the single element done queue, and everything will
>> be fine.  Or?
>
> Given enough time, what prevents defer_bh() from calling
> tasklet_schedule(&dev->bh) *after* usbnet_stop() calls tasklet_kill()?
>
> Consider the following situation (assuming '&&' are changed to '||' in
> that while loop in usbnet_terminate_urbs() as they should be):
>
> CPU0                            CPU1
> usbnet_stop()                   defer_bh() with list == dev->rxq
>   usbnet_terminate_urbs()
>                                 __skb_unlink() removes the last
>                                 skb from dev->rxq.
>                                 dev->rxq, dev->txq and dev->done
>                                 are now empty.
>   while (!skb_queue_empty()...)
>     The loop ends because all 3
>     queues are now empty.
>
>   usbnet_terminate_urbs() ends.
>
> usbnet_stop() continues:
>   usbnet_status_stop(dev);
>   ...
>   del_timer_sync (&dev->delay);
>   tasklet_kill (&dev->bh);
>                                 __skb_queue_tail(&dev->done, skb);
>                                 if (dev->done.qlen == 1)
>                                   tasklet_schedule(&dev->bh);
>
> The BH is scheduled at this point, which is not what was intended. The
> race window is small, but still.

I guess you are right.  At least I cannot prove that you are not :)

There is a bit too much complexity involved here for me...



Bjørn
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


pull request: batman-adv 20150828

2015-08-28 Thread Antonio Quartulli
Hello David,

this is our very last batch of patches intended for net-next/linux-4.3

It includes again 2 non-critical fixes which we couldn't send to net
and then the rest is just code restyling and beautification. No
major behavioural change was introduced.

Please pull or let me know of any problem!

Thanks a lot,
Antonio


The following changes since commit 1dd34b5ad8aebaff17b625fc0126e18243008a3f:

  bpf: fix bpf_skb_set_tunnel_key() helper (2015-08-26 17:38:13 -0700)

are available in the git repository at:

  git://git.open-mesh.org/linux-merge.git tags/batman-adv-for-davem

for you to fetch changes up to ed29266347025a19ee689807b07d121f0a7441f1:

  batman-adv: turn batadv_neigh_node_get() into local function (2015-08-27 
20:15:34 +0200)


Included changes:
- code beautification
- remove obsolete 'deleted' attribute for bat-gw node
- increase internal version number
- prevent potential access to netdev object after deregistration
- set needed_head/tail_room for batman virtual interface


Antonio Quartulli (1):
  batman-adv: don't access unregistered net_device object

Marek Lindner (5):
  batman-adv: move hardif refcount inc to batadv_neigh_node_new()
  batman-adv: remove redundant hard_iface assignment
  batman-adv: move neigh_node list add into batadv_neigh_node_new()
  batman-adv: rearrange batadv_neigh_node_new() arguments to follow 
convention
  batman-adv: turn batadv_neigh_node_get() into local function

Simon Wunderlich (3):
  batman-adv: remove obsolete deleted attribute for gateway node
  batman-adv: fix gateway client style issues
  batman-adv: Start new development cycle

Sven Eckelmann (1):
  batman-adv: Add lower layer needed_(head|tail)room to own ones

 net/batman-adv/bat_iv_ogm.c | 30 +-
 net/batman-adv/gateway_client.c | 50 ++-
 net/batman-adv/gateway_client.h |  2 +-
 net/batman-adv/hard-interface.c | 44 +++-
 net/batman-adv/main.c   |  2 +-
 net/batman-adv/main.h   |  2 +-
 net/batman-adv/originator.c | 90 -
 net/batman-adv/originator.h |  9 ++---
 net/batman-adv/soft-interface.c |  2 -
 net/batman-adv/types.h  |  2 -
 10 files changed, 117 insertions(+), 116 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 05/10] batman-adv: rearrange batadv_neigh_node_new() arguments to follow convention

2015-08-28 Thread Antonio Quartulli
From: Marek Lindner 

Signed-off-by: Marek Lindner 
Acked-by: Simon Wunderlich 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/bat_iv_ogm.c | 2 +-
 net/batman-adv/originator.c | 7 ---
 net/batman-adv/originator.h | 5 +++--
 3 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 5e93af4..912d9c3 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -298,7 +298,7 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface 
*hard_iface,
 {
struct batadv_neigh_node *neigh_node;
 
-   neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node);
+   neigh_node = batadv_neigh_node_new(orig_node, hard_iface, neigh_addr);
if (!neigh_node)
goto out;
 
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index d6d9809..099a84a 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -444,16 +444,17 @@ out:
 
 /**
  * batadv_neigh_node_new - create and init a new neigh_node object
+ * @orig_node: originator object representing the neighbour
  * @hard_iface: the interface where the neighbour is connected to
  * @neigh_addr: the mac address of the neighbour interface
- * @orig_node: originator object representing the neighbour
  *
  * Allocates a new neigh_node object and initialises all the generic fields.
  * Returns the new object or NULL on failure.
  */
 struct batadv_neigh_node *
-batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
- const u8 *neigh_addr, struct batadv_orig_node *orig_node)
+batadv_neigh_node_new(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr)
 {
struct batadv_neigh_node *neigh_node;
 
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index 3fc76f6..fde3438 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -46,8 +46,9 @@ batadv_neigh_node_get(const struct batadv_orig_node 
*orig_node,
  const struct batadv_hard_iface *hard_iface,
  const u8 *addr);
 struct batadv_neigh_node *
-batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
- const u8 *neigh_addr, struct batadv_orig_node *orig_node);
+batadv_neigh_node_new(struct batadv_orig_node *orig_node,
+ struct batadv_hard_iface *hard_iface,
+ const u8 *neigh_addr);
 void batadv_neigh_node_free_ref(struct batadv_neigh_node *neigh_node);
 struct batadv_neigh_node *
 batadv_orig_router_get(struct batadv_orig_node *orig_node,
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 07/10] batman-adv: Start new development cycle

2015-08-28 Thread Antonio Quartulli
From: Simon Wunderlich 

Signed-off-by: Simon Wunderlich 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/main.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/batman-adv/main.h b/net/batman-adv/main.h
index 78500ac..ebd8af0 100644
--- a/net/batman-adv/main.h
+++ b/net/batman-adv/main.h
@@ -24,7 +24,7 @@
 #define BATADV_DRIVER_DEVICE "batman-adv"
 
 #ifndef BATADV_SOURCE_VERSION
-#define BATADV_SOURCE_VERSION "2015.1"
+#define BATADV_SOURCE_VERSION "2015.2"
 #endif
 
 /* B.A.T.M.A.N. parameters */
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 08/10] batman-adv: don't access unregistered net_device object

2015-08-28 Thread Antonio Quartulli
In batadv_hardif_disable_interface() there is a call to
batadv_softif_destroy_sysfs() which in turn invokes
unregister_netdevice() on the soft_iface.
After this point we cannot rely on the soft_iface object
anymore because it might get free'd by the netdev periodic
routine at any time.

For this reason the netdev_upper_dev_unlink(.., soft_iface) call
is moved before the invocation of batadv_softif_destroy_sysfs() so
that we can be sure that the soft_iface object is still valid.

Signed-off-by: Antonio Quartulli 
Signed-off-by: Marek Lindner 
---
 net/batman-adv/hard-interface.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index f4a15d2..0565b20 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -528,6 +528,8 @@ void batadv_hardif_disable_interface(struct 
batadv_hard_iface *hard_iface,
batadv_purge_outstanding_packets(bat_priv, hard_iface);
dev_put(hard_iface->soft_iface);
 
+   netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface);
+
/* nobody uses this interface anymore */
if (!bat_priv->num_ifaces) {
batadv_gw_check_client_stop(bat_priv);
@@ -536,7 +538,6 @@ void batadv_hardif_disable_interface(struct 
batadv_hard_iface *hard_iface,
batadv_softif_destroy_sysfs(hard_iface->soft_iface);
}
 
-   netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface);
hard_iface->soft_iface = NULL;
batadv_hardif_free_ref(hard_iface);
 
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 04/10] batman-adv: remove obsolete deleted attribute for gateway node

2015-08-28 Thread Antonio Quartulli
From: Simon Wunderlich 

With rcu, the gateway node deleted attribute is not needed anymore. In
fact, it may delay the free of the gateway node and its referenced
structures. Therefore remove it altogether and simplify purging as well.

Signed-off-by: Simon Wunderlich 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/gateway_client.c | 48 +++--
 net/batman-adv/gateway_client.h |  2 +-
 net/batman-adv/main.c   |  2 +-
 net/batman-adv/originator.c |  1 -
 net/batman-adv/types.h  |  2 --
 5 files changed, 14 insertions(+), 41 deletions(-)

diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index d7ca214..634c7e3 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -161,9 +161,6 @@ batadv_gw_get_best_gw_node(struct batadv_priv *bat_priv)
 
rcu_read_lock();
hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
-   if (gw_node->deleted)
-   continue;
-
orig_node = gw_node->orig_node;
router = batadv_orig_router_get(orig_node, BATADV_IF_DEFAULT);
if (!router)
@@ -473,9 +470,6 @@ batadv_gw_node_get(struct batadv_priv *bat_priv,
if (gw_node_tmp->orig_node != orig_node)
continue;
 
-   if (gw_node_tmp->deleted)
-   continue;
-
if (!atomic_inc_not_zero(&gw_node_tmp->refcount))
continue;
 
@@ -525,9 +519,7 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
gw_node->bandwidth_down = ntohl(gateway->bandwidth_down);
gw_node->bandwidth_up = ntohl(gateway->bandwidth_up);
 
-   gw_node->deleted = 0;
if (ntohl(gateway->bandwidth_down) == 0) {
-   gw_node->deleted = jiffies;
batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
   "Gateway %pM removed from gateway list\n",
   orig_node->orig);
@@ -535,14 +527,21 @@ void batadv_gw_node_update(struct batadv_priv *bat_priv,
/* Note: We don't need a NULL check here, since curr_gw never
 * gets dereferenced.
 */
+   spin_lock_bh(&bat_priv->gw.list_lock);
+   hlist_del_init_rcu(&gw_node->list);
+   spin_unlock_bh(&bat_priv->gw.list_lock);
+
+   batadv_gw_node_free_ref(gw_node);
+
curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
if (gw_node == curr_gw)
batadv_gw_reselect(bat_priv);
+
+   if (curr_gw)
+   batadv_gw_node_free_ref(curr_gw);
}
 
 out:
-   if (curr_gw)
-   batadv_gw_node_free_ref(curr_gw);
if (gw_node)
batadv_gw_node_free_ref(gw_node);
 }
@@ -558,39 +557,19 @@ void batadv_gw_node_delete(struct batadv_priv *bat_priv,
batadv_gw_node_update(bat_priv, orig_node, &gateway);
 }
 
-void batadv_gw_node_purge(struct batadv_priv *bat_priv)
+void batadv_gw_node_free(struct batadv_priv *bat_priv)
 {
-   struct batadv_gw_node *gw_node, *curr_gw;
+   struct batadv_gw_node *gw_node;
struct hlist_node *node_tmp;
-   unsigned long timeout = msecs_to_jiffies(2 * BATADV_PURGE_TIMEOUT);
-   int do_reselect = 0;
-
-   curr_gw = batadv_gw_get_selected_gw_node(bat_priv);
 
spin_lock_bh(&bat_priv->gw.list_lock);
-
hlist_for_each_entry_safe(gw_node, node_tmp,
  &bat_priv->gw.list, list) {
-   if (((!gw_node->deleted) ||
-(time_before(jiffies, gw_node->deleted + timeout))) &&
-   atomic_read(&bat_priv->mesh_state) == BATADV_MESH_ACTIVE)
-   continue;
 
-   if (curr_gw == gw_node)
-   do_reselect = 1;
-
-   hlist_del_rcu(&gw_node->list);
+   hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_free_ref(gw_node);
}
-
spin_unlock_bh(&bat_priv->gw.list_lock);
-
-   /* gw_reselect() needs to acquire the gw_list_lock */
-   if (do_reselect)
-   batadv_gw_reselect(bat_priv);
-
-   if (curr_gw)
-   batadv_gw_node_free_ref(curr_gw);
 }
 
 /* fails if orig_node has no router */
@@ -654,9 +633,6 @@ int batadv_gw_client_seq_print_text(struct seq_file *seq, 
void *offset)
 
rcu_read_lock();
hlist_for_each_entry_rcu(gw_node, &bat_priv->gw.list, list) {
-   if (gw_node->deleted)
-   continue;
-
/* fails if orig_node has no router */
if (batadv_write_buffer_text(bat_priv, seq, gw_node) < 0)
continue;
diff --git a/net/batman-adv/gateway_client.h b/net/batman-adv/gateway_client.h
index ef4d7e3..fa95277 100644
--- a/net/batman-adv/gateway_client.h
+++ b/ne

[PATCH 03/10] batman-adv: move neigh_node list add into batadv_neigh_node_new()

2015-08-28 Thread Antonio Quartulli
From: Marek Lindner 

All batadv_neigh_node_* functions expect the neigh_node list item to be part
of the orig_node->neigh_list, therefore the constructor of said list item
should be adding the newly created neigh_node to the respective list.

Signed-off-by: Marek Lindner 
Acked-by: Simon Wunderlich 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/bat_iv_ogm.c | 21 +
 net/batman-adv/originator.c | 12 
 2 files changed, 13 insertions(+), 20 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index b18184e..5e93af4 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -296,8 +296,7 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface 
*hard_iface,
struct batadv_orig_node *orig_node,
struct batadv_orig_node *orig_neigh)
 {
-   struct batadv_priv *bat_priv = netdev_priv(hard_iface->soft_iface);
-   struct batadv_neigh_node *neigh_node, *tmp_neigh_node;
+   struct batadv_neigh_node *neigh_node;
 
neigh_node = batadv_neigh_node_new(hard_iface, neigh_addr, orig_node);
if (!neigh_node)
@@ -305,24 +304,6 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface 
*hard_iface,
 
neigh_node->orig_node = orig_neigh;
 
-   spin_lock_bh(&orig_node->neigh_list_lock);
-   tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface,
-  neigh_addr);
-   if (!tmp_neigh_node) {
-   hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
-   } else {
-   kfree(neigh_node);
-   batadv_hardif_free_ref(hard_iface);
-   neigh_node = tmp_neigh_node;
-   }
-   spin_unlock_bh(&orig_node->neigh_list_lock);
-
-   if (!tmp_neigh_node)
-   batadv_dbg(BATADV_DBG_BATMAN, bat_priv,
-  "Creating new neighbor %pM for orig_node %pM on 
interface %s\n",
-  neigh_addr, orig_node->orig,
-  hard_iface->net_dev->name);
-
 out:
return neigh_node;
 }
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index f8317c1..f751775 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -457,6 +457,10 @@ batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
 {
struct batadv_neigh_node *neigh_node;
 
+   neigh_node = batadv_neigh_node_get(orig_node, hard_iface, neigh_addr);
+   if (neigh_node)
+   goto out;
+
neigh_node = kzalloc(sizeof(*neigh_node), GFP_ATOMIC);
if (!neigh_node)
goto out;
@@ -478,6 +482,14 @@ batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
/* extra reference for return */
atomic_set(&neigh_node->refcount, 2);
 
+   spin_lock_bh(&orig_node->neigh_list_lock);
+   hlist_add_head_rcu(&neigh_node->list, &orig_node->neigh_list);
+   spin_unlock_bh(&orig_node->neigh_list_lock);
+
+   batadv_dbg(BATADV_DBG_BATMAN, orig_node->bat_priv,
+  "Creating new neighbor %pM for orig_node %pM on interface 
%s\n",
+  neigh_addr, orig_node->orig, hard_iface->net_dev->name);
+
 out:
return neigh_node;
 }
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 06/10] batman-adv: fix gateway client style issues

2015-08-28 Thread Antonio Quartulli
From: Simon Wunderlich 

commit 0511575c4d03 ("batman-adv: remove obsolete deleted attribute for
gateway node") incorrectly added an empty line and forgot to remove an
include.

Signed-off-by: Simon Wunderlich 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/gateway_client.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/batman-adv/gateway_client.c b/net/batman-adv/gateway_client.c
index 634c7e3..e6c8382 100644
--- a/net/batman-adv/gateway_client.c
+++ b/net/batman-adv/gateway_client.c
@@ -27,7 +27,6 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
 #include 
@@ -565,7 +564,6 @@ void batadv_gw_node_free(struct batadv_priv *bat_priv)
spin_lock_bh(&bat_priv->gw.list_lock);
hlist_for_each_entry_safe(gw_node, node_tmp,
  &bat_priv->gw.list, list) {
-
hlist_del_init_rcu(&gw_node->list);
batadv_gw_node_free_ref(gw_node);
}
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 09/10] batman-adv: Add lower layer needed_(head|tail)room to own ones

2015-08-28 Thread Antonio Quartulli
From: Sven Eckelmann 

The maximum of hard_header_len and maximum of all needed_(head|tail)room of
all slave interfaces of a batman-adv device must be used to define the
batman-adv device needed_(head|tail)room. This is required to avoid too
small buffer problems when these slave devices try to send the encapsulated
packet in a tx path without the possibility to resize the skbuff.

Signed-off-by: Sven Eckelmann 
Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/hard-interface.c | 41 +
 net/batman-adv/soft-interface.c |  2 --
 2 files changed, 41 insertions(+), 2 deletions(-)

diff --git a/net/batman-adv/hard-interface.c b/net/batman-adv/hard-interface.c
index 0565b20..f11345e 100644
--- a/net/batman-adv/hard-interface.c
+++ b/net/batman-adv/hard-interface.c
@@ -252,6 +252,44 @@ static void batadv_check_known_mac_addr(const struct 
net_device *net_dev)
rcu_read_unlock();
 }
 
+/**
+ * batadv_hardif_recalc_extra_skbroom() - Recalculate skbuff extra 
head/tailroom
+ * @soft_iface: netdev struct of the mesh interface
+ */
+static void batadv_hardif_recalc_extra_skbroom(struct net_device *soft_iface)
+{
+   const struct batadv_hard_iface *hard_iface;
+   unsigned short lower_header_len = ETH_HLEN;
+   unsigned short lower_headroom = 0;
+   unsigned short lower_tailroom = 0;
+   unsigned short needed_headroom;
+
+   rcu_read_lock();
+   list_for_each_entry_rcu(hard_iface, &batadv_hardif_list, list) {
+   if (hard_iface->if_status == BATADV_IF_NOT_IN_USE)
+   continue;
+
+   if (hard_iface->soft_iface != soft_iface)
+   continue;
+
+   lower_header_len = max_t(unsigned short, lower_header_len,
+hard_iface->net_dev->hard_header_len);
+
+   lower_headroom = max_t(unsigned short, lower_headroom,
+  hard_iface->net_dev->needed_headroom);
+
+   lower_tailroom = max_t(unsigned short, lower_tailroom,
+  hard_iface->net_dev->needed_tailroom);
+   }
+   rcu_read_unlock();
+
+   needed_headroom = lower_headroom + (lower_header_len - ETH_HLEN);
+   needed_headroom += batadv_max_header_len();
+
+   soft_iface->needed_headroom = needed_headroom;
+   soft_iface->needed_tailroom = lower_tailroom;
+}
+
 int batadv_hardif_min_mtu(struct net_device *soft_iface)
 {
struct batadv_priv *bat_priv = netdev_priv(soft_iface);
@@ -474,6 +512,8 @@ int batadv_hardif_enable_interface(struct batadv_hard_iface 
*hard_iface,
   "Not using interface %s (retrying later): interface 
not active\n",
   hard_iface->net_dev->name);
 
+   batadv_hardif_recalc_extra_skbroom(soft_iface);
+
/* begin scheduling originator messages on that interface */
batadv_schedule_bat_ogm(hard_iface);
 
@@ -529,6 +569,7 @@ void batadv_hardif_disable_interface(struct 
batadv_hard_iface *hard_iface,
dev_put(hard_iface->soft_iface);
 
netdev_upper_dev_unlink(hard_iface->net_dev, hard_iface->soft_iface);
+   batadv_hardif_recalc_extra_skbroom(hard_iface->soft_iface);
 
/* nobody uses this interface anymore */
if (!bat_priv->num_ifaces) {
diff --git a/net/batman-adv/soft-interface.c b/net/batman-adv/soft-interface.c
index d5c5ad9..ac4d08d 100644
--- a/net/batman-adv/soft-interface.c
+++ b/net/batman-adv/soft-interface.c
@@ -947,8 +947,6 @@ static void batadv_softif_init_early(struct net_device *dev)
 * have not been initialized yet
 */
dev->mtu = ETH_DATA_LEN;
-   /* reserve more space in the skbuff for our header */
-   dev->hard_header_len = batadv_max_header_len();
 
/* generate random address */
eth_hw_addr_random(dev);
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 01/10] batman-adv: move hardif refcount inc to batadv_neigh_node_new()

2015-08-28 Thread Antonio Quartulli
From: Marek Lindner 

The batadv_neigh_node cleanup function 'batadv_neigh_node_free_rcu()'
takes care of reducing the hardif refcounter, hence it's only logical
to assume the creating function of that same object
'batadv_neigh_node_new()' takes care of increasing the same refcounter.

Signed-off-by: Marek Lindner 
Acked-by: Simon Wunderlich 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/bat_iv_ogm.c | 6 --
 net/batman-adv/originator.c | 6 ++
 2 files changed, 6 insertions(+), 6 deletions(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index 5c12200..b9b8b33 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -303,12 +303,6 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface 
*hard_iface,
if (!neigh_node)
goto out;
 
-   if (!atomic_inc_not_zero(&hard_iface->refcount)) {
-   kfree(neigh_node);
-   neigh_node = NULL;
-   goto out;
-   }
-
neigh_node->orig_node = orig_neigh;
neigh_node->if_incoming = hard_iface;
 
diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 610620a..f8317c1 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -461,6 +461,12 @@ batadv_neigh_node_new(struct batadv_hard_iface *hard_iface,
if (!neigh_node)
goto out;
 
+   if (!atomic_inc_not_zero(&hard_iface->refcount)) {
+   kfree(neigh_node);
+   neigh_node = NULL;
+   goto out;
+   }
+
INIT_HLIST_NODE(&neigh_node->list);
INIT_HLIST_HEAD(&neigh_node->ifinfo_list);
spin_lock_init(&neigh_node->ifinfo_lock);
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 02/10] batman-adv: remove redundant hard_iface assignment

2015-08-28 Thread Antonio Quartulli
From: Marek Lindner 

The batadv_neigh_node_new() function already sets the hard_iface pointer.

Signed-off-by: Marek Lindner 
Acked-by: Simon Wunderlich 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/bat_iv_ogm.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/batman-adv/bat_iv_ogm.c b/net/batman-adv/bat_iv_ogm.c
index b9b8b33..b18184e 100644
--- a/net/batman-adv/bat_iv_ogm.c
+++ b/net/batman-adv/bat_iv_ogm.c
@@ -304,7 +304,6 @@ batadv_iv_ogm_neigh_new(struct batadv_hard_iface 
*hard_iface,
goto out;
 
neigh_node->orig_node = orig_neigh;
-   neigh_node->if_incoming = hard_iface;
 
spin_lock_bh(&orig_node->neigh_list_lock);
tmp_neigh_node = batadv_neigh_node_get(orig_node, hard_iface,
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v5 00/11] ipv6: Only create RTF_CACHE route after encountering pmtu exception

2015-08-28 Thread Alexander Holler

Am 28.08.2015 um 09:36 schrieb Martin KaFai Lau:

On Mon, Aug 17, 2015 at 11:43:20AM +0200, Alexander Holler wrote:

That's why I vote to check out if it's possible/reasonable to backport this
series to the stable kernels.

I have backported to 4.0.y without major issue, so possible.


Sure, as this was likely one of the versions they've used to create the 
patch.



I did try on 3.1x and gave up.

It is a lot of changes,  so I don't think it is a good idea for -stable.


Depends on what you're expecting from a (stable) kernel.

The patch description mentions what happens when a system deals with a 
lot of other ipv6-systems and that problem is easy to exercise and to value.


Rating the information leak is harder, some people even won't understand 
that this might be a problem.


And now look at which kernel-versions are now used in new devices 
(likely something <= 3.10, which is more than two years old), how long 
they will be used, and make a guess about IPv6 usage in 5 years.


Anyway, I've no insights about all the politics happening in the 
background (e.g. stuff like the LTSI tree) and I've just wanted raise 
awareness about that (imho important) patch series.


Regards,

Alexander Holler
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v5 00/11] ipv6: Only create RTF_CACHE route after encountering pmtu exception

2015-08-28 Thread Alexander Holler

Am 28.08.2015 um 11:27 schrieb Alexander Holler:

Am 28.08.2015 um 09:36 schrieb Martin KaFai Lau:

On Mon, Aug 17, 2015 at 11:43:20AM +0200, Alexander Holler wrote:

That's why I vote to check out if it's possible/reasonable to
backport this
series to the stable kernels.

I have backported to 4.0.y without major issue, so possible.


Sure, as this was likely one of the versions they've used to create the
patch.


I did try on 3.1x and gave up.

It is a lot of changes,  so I don't think it is a good idea for -stable.


Depends on what you're expecting from a (stable) kernel.

The patch description mentions what happens when a system deals with a
lot of other ipv6-systems and that problem is easy to exercise and to
value.

Rating the information leak is harder, some people even won't understand
that this might be a problem.

And now look at which kernel-versions are now used in new devices
(likely something <= 3.10, which is more than two years old), how long
they will be used, and make a guess about IPv6 usage in 5 years.

Anyway, I've no insights about all the politics happening in the
background (e.g. stuff like the LTSI tree) and I've just wanted raise
awareness about that (imho important) patch series.


Not to speak about phones, but those are most likely a problem of one 
specific company  ;)


Regards,

Alexander Holler
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 10/10] batman-adv: turn batadv_neigh_node_get() into local function

2015-08-28 Thread Antonio Quartulli
From: Marek Lindner 

commit c214ebe1eb29 ("batman-adv: move neigh_node list add into
batadv_neigh_node_new()") removed external calls to
batadv_neigh_node_get().

Signed-off-by: Marek Lindner 
Signed-off-by: Antonio Quartulli 
---
 net/batman-adv/originator.c | 72 ++---
 net/batman-adv/originator.h |  4 ---
 2 files changed, 36 insertions(+), 40 deletions(-)

diff --git a/net/batman-adv/originator.c b/net/batman-adv/originator.c
index 099a84a..7486df9 100644
--- a/net/batman-adv/originator.c
+++ b/net/batman-adv/originator.c
@@ -443,6 +443,42 @@ out:
 }
 
 /**
+ * batadv_neigh_node_get - retrieve a neighbour from the list
+ * @orig_node: originator which the neighbour belongs to
+ * @hard_iface: the interface where this neighbour is connected to
+ * @addr: the address of the neighbour
+ *
+ * Looks for and possibly returns a neighbour belonging to this originator list
+ * which is connected through the provided hard interface.
+ * Returns NULL if the neighbour is not found.
+ */
+static struct batadv_neigh_node *
+batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
+ const struct batadv_hard_iface *hard_iface,
+ const u8 *addr)
+{
+   struct batadv_neigh_node *tmp_neigh_node, *res = NULL;
+
+   rcu_read_lock();
+   hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) {
+   if (!batadv_compare_eth(tmp_neigh_node->addr, addr))
+   continue;
+
+   if (tmp_neigh_node->if_incoming != hard_iface)
+   continue;
+
+   if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
+   continue;
+
+   res = tmp_neigh_node;
+   break;
+   }
+   rcu_read_unlock();
+
+   return res;
+}
+
+/**
  * batadv_neigh_node_new - create and init a new neigh_node object
  * @orig_node: originator object representing the neighbour
  * @hard_iface: the interface where the neighbour is connected to
@@ -496,42 +532,6 @@ out:
 }
 
 /**
- * batadv_neigh_node_get - retrieve a neighbour from the list
- * @orig_node: originator which the neighbour belongs to
- * @hard_iface: the interface where this neighbour is connected to
- * @addr: the address of the neighbour
- *
- * Looks for and possibly returns a neighbour belonging to this originator list
- * which is connected through the provided hard interface.
- * Returns NULL if the neighbour is not found.
- */
-struct batadv_neigh_node *
-batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
- const struct batadv_hard_iface *hard_iface,
- const u8 *addr)
-{
-   struct batadv_neigh_node *tmp_neigh_node, *res = NULL;
-
-   rcu_read_lock();
-   hlist_for_each_entry_rcu(tmp_neigh_node, &orig_node->neigh_list, list) {
-   if (!batadv_compare_eth(tmp_neigh_node->addr, addr))
-   continue;
-
-   if (tmp_neigh_node->if_incoming != hard_iface)
-   continue;
-
-   if (!atomic_inc_not_zero(&tmp_neigh_node->refcount))
-   continue;
-
-   res = tmp_neigh_node;
-   break;
-   }
-   rcu_read_unlock();
-
-   return res;
-}
-
-/**
  * batadv_orig_ifinfo_free_rcu - free the orig_ifinfo object
  * @rcu: rcu pointer of the orig_ifinfo object
  */
diff --git a/net/batman-adv/originator.h b/net/batman-adv/originator.h
index fde3438..fa18f9b 100644
--- a/net/batman-adv/originator.h
+++ b/net/batman-adv/originator.h
@@ -42,10 +42,6 @@ void batadv_orig_node_free_ref_now(struct batadv_orig_node 
*orig_node);
 struct batadv_orig_node *batadv_orig_node_new(struct batadv_priv *bat_priv,
  const u8 *addr);
 struct batadv_neigh_node *
-batadv_neigh_node_get(const struct batadv_orig_node *orig_node,
- const struct batadv_hard_iface *hard_iface,
- const u8 *addr);
-struct batadv_neigh_node *
 batadv_neigh_node_new(struct batadv_orig_node *orig_node,
  struct batadv_hard_iface *hard_iface,
  const u8 *neigh_addr);
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net v2] sctp: ASCONF-ACK with Unresolvable Address should be sent

2015-08-28 Thread Xin Long
RFC 5061:
This is an opaque integer assigned by the sender to identify each
request parameter.  The receiver of the ASCONF Chunk will copy this
32-bit value into the ASCONF Response Correlation ID field of the
ASCONF-ACK response parameter.  The sender of the ASCONF can use this
same value in the ASCONF-ACK to find which request the response is
for.  Note that the receiver MUST NOT change this 32-bit value.

Address Parameter: TLV

This field contains an IPv4 or IPv6 address parameter, as described
in Section 3.3.2.1 of [RFC4960].

An ASCONF-ACK chunk with an Error Cause Indication Parameter (Unresolvable
Address) should be sent if the address in the Delete IP Address parameter is
not part of the association.

  Endpoint A   Endpoint B
  (ESTABLISHED)(ESTABLISHED)

  ASCONF->
  (Delete IP Address)
<-  ASCONF-ACK
(Unresolvable Address)

Signed-off-by: Xin Long 
---
 net/sctp/sm_make_chunk.c | 15 +--
 1 file changed, 13 insertions(+), 2 deletions(-)

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 4068fe1..ce7f343 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct 
sctp_association *asoc,
sctp_assoc_set_primary(asoc, asconf->transport);
sctp_assoc_del_nonprimary_peers(asoc,
asconf->transport);
-   } else
-   sctp_assoc_del_peer(asoc, &addr);
+   return SCTP_ERROR_NO_ERROR;
+   }
+
+   /* If the address is not part of the association, the
+* ASCONF-ACK with Error Cause Indication Parameter
+* which including cause of Unresolvable Address should
+* be sent.
+*/
+   peer = sctp_assoc_lookup_paddr(asoc, &addr);
+   if (!peer)
+   return SCTP_ERROR_DNS_FAILED;
+
+   sctp_assoc_rm_peer(asoc, peer);
break;
case SCTP_PARAM_SET_PRIMARY:
/* ADDIP Section 4.2.4
-- 
2.1.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 1/1] sfc: Allow driver to cope with a lower number of VIs than it needs for RSS

2015-08-28 Thread Shradha Shah
Previously, the driver would refuse to load if it couldn't secure
enough VIs from the MC to fulfill its RSS requirements.
This was causing probe to fail on later functions in
configurations where we'd run out of VIs, such as having many
VFs.

This change allows the driver to load with fewer VIs, down to a
minimum of 2. A warning will be printed saying that RSS
requirements were not met, possibly affecting performance.

efx->max_tx_channels needs to be set to avoid going down the
failure path in efx_probe_nic() immediately in the loop after the
probe() NIC-type function.
Also, set rc=ENOSPC when bombing out of efx_probe_nic() due to lack
of VIs.

Signed-off-by: Shradha Shah 
---
 drivers/net/ethernet/sfc/ef10.c   | 39 ++--
 drivers/net/ethernet/sfc/efx.c| 57 ---
 drivers/net/ethernet/sfc/efx.h|  1 +
 drivers/net/ethernet/sfc/falcon.c |  1 +
 drivers/net/ethernet/sfc/net_driver.h |  1 +
 drivers/net/ethernet/sfc/siena.c  |  1 +
 6 files changed, 72 insertions(+), 28 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 06b8061..3245229 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -295,11 +295,11 @@ static int efx_ef10_probe(struct efx_nic *efx)
/* We can have one VI for each 8K region.  However, until we
 * use TX option descriptors we need two TX queues per channel.
 */
-   efx->max_channels =
-   min_t(unsigned int,
- EFX_MAX_CHANNELS,
- efx_ef10_mem_map_size(efx) /
- (EFX_VI_PAGE_SIZE * EFX_TXQ_TYPES));
+   efx->max_channels = min_t(unsigned int,
+ EFX_MAX_CHANNELS,
+ efx_ef10_mem_map_size(efx) /
+ (EFX_VI_PAGE_SIZE * EFX_TXQ_TYPES));
+   efx->max_tx_channels = efx->max_channels;
if (WARN_ON(efx->max_channels == 0))
return -EIO;
 
@@ -824,11 +824,13 @@ static int efx_ef10_dimension_resources(struct efx_nic 
*efx)
 {
struct efx_ef10_nic_data *nic_data = efx->nic_data;
unsigned int uc_mem_map_size, wc_mem_map_size;
-   unsigned int min_vis, pio_write_vi_base, max_vis;
+   unsigned int min_vis = max(EFX_TXQ_TYPES,
+  efx_separate_tx_channels ? 2 : 1);
+   unsigned int channel_vis, pio_write_vi_base, max_vis;
void __iomem *membase;
int rc;
 
-   min_vis = max(efx->n_channels, efx->n_tx_channels * EFX_TXQ_TYPES);
+   channel_vis = max(efx->n_channels, efx->n_tx_channels * EFX_TXQ_TYPES);
 
 #ifdef EFX_USE_PIO
/* Try to allocate PIO buffers if wanted and if the full
@@ -862,11 +864,11 @@ static int efx_ef10_dimension_resources(struct efx_nic 
*efx)
 * page size is >4K).  So we may allocate some extra VIs just
 * for writing PIO buffers through.
 *
-* The UC mapping contains (min_vis - 1) complete VIs and the
+* The UC mapping contains (channel_vis - 1) complete VIs and the
 * first half of the next VI.  Then the WC mapping begins with
 * the second half of this last VI.
 */
-   uc_mem_map_size = PAGE_ALIGN((min_vis - 1) * EFX_VI_PAGE_SIZE +
+   uc_mem_map_size = PAGE_ALIGN((channel_vis - 1) * EFX_VI_PAGE_SIZE +
 ER_DZ_TX_PIOBUF);
if (nic_data->n_piobufs) {
/* pio_write_vi_base rounds down to give the number of complete
@@ -881,7 +883,7 @@ static int efx_ef10_dimension_resources(struct efx_nic *efx)
} else {
pio_write_vi_base = 0;
wc_mem_map_size = 0;
-   max_vis = min_vis;
+   max_vis = channel_vis;
}
 
/* In case the last attached driver failed to free VIs, do it now */
@@ -893,6 +895,23 @@ static int efx_ef10_dimension_resources(struct efx_nic 
*efx)
if (rc != 0)
return rc;
 
+   if (nic_data->n_allocated_vis < channel_vis) {
+   netif_info(efx, drv, efx->net_dev,
+  "Could not allocate enough VIs to satisfy RSS"
+  " requirements. Performance may not be optimal.\n");
+   /* We didn't get the VIs to populate our channels.
+* We could keep what we got but then we'd have more
+* interrupts than we need.
+* Instead calculate new max_channels and restart
+*/
+   efx->max_channels = nic_data->n_allocated_vis;
+   efx->max_tx_channels =
+   nic_data->n_allocated_vis / EFX_TXQ_TYPES;
+
+   efx_ef10_free_vis(efx);
+   return -EAGAIN;
+   }
+
/* If we didn't get enough VIs to map all the PIO buffers, free the
 * PIO buffers
 */
diff --git a/drivers/net/ethernet/sfc/efx.c b/drivers

Re: [PATCH net-next v2] net: sched: consolidate tc_classify{,_compat}

2015-08-28 Thread Daniel Borkmann

On 08/28/2015 01:11 AM, David Miller wrote:
...

Sorry, I applied v1 before seeing this :-/


No problem.


If you could post a relative patch fixing the style issues, I'd
appreciate it.


Probably not worth the noise, but okay, will address it.

Thanks,
Daniel
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] route: fix breakage after moving lwtunnel state

2015-08-28 Thread Thomas Graf
On 08/27/15 at 02:20pm, Tom Herbert wrote:
> I'm doing:
> 
> ip route add :0:0:1::0:2:0/128 encap ila 2001:0:0:2 via
> 2401:db00:20:911a:face:0:27:0
> 
> so that 2401:db00:20:911a:face:0:27:0 is the next hop route for
> destination 2001:0:0:2::0:2:0. The dst_output for lwt just calls
> the original dest_output after transforming the packet without the use
> of any additional routes. So in this way ILA LWT is just acting as a
> "pass-through" packet transformation mechanism. Such a model might
> have additional utility: LWT occurs before iptables so that iptables
> sees the translated or encapsulated packet (davem mentioned this is
> probably what we want), we may want to defer translation until IP
> fragmentation (Roopa mentioned she needs this for MPLS).
> 
> > The IP metadata encap at FIB level is currently encap agnostic
> > and requires an intermediate encap device which then defines the
> > actual encap protocol:
> >
> > ip route overlay/prefix encap ip dst 10.1.1.1 dev vxlan0
> > ip route 10.1.1.1/prefix dev eth0
> >
> But then you're outputting through another device, multiple routes are
> involved, performance drops :-( Why not just set the route through
> VXLAN in that case?

The problem with having a single route is that it doesn't allow to
separate management of overlay and underlay. It is common to manage
the underlay with Quagga, bird or even static routes and defer the
overlay to Neutron or a fancy container orchestration system.

Caching of the 10.1.1.1 nexthop route in the overlay route would
essentially lead to the same behaviour without requiring to hardcode
the nexthop. I should have patches to demonstrate this in a bit.

> > I like it because we don't have to embed all the options as metadata
> > and can still set them through the device. An option would also be
> > to allow for both and add the following alternative:
> >
> > ip route overlay/prefix encap ip type vxlan dst 10.1.1.1 dev eth0
> 
> Better, we should be able to send encapsulated packets without needing a device.

Why is the device itself bad? I understand that we want to minimize
overhead but why is a single logical device to keep common config and
stats undesirable?
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v5 net-next 5/8] geneve: Add support to collect tunnel metadata.

2015-08-28 Thread Thomas Graf
On 08/27/15 at 02:29pm, Pravin Shelar wrote:
> On Thu, Aug 27, 2015 at 2:18 AM, Thomas Graf  wrote:
> > It is slightly non obvious that introducing an error condition above
> > this and before udp_tun_rx_dst() would introduce a memory leak. Other
> > than this looks great now.
> >
> I can not move this into if condition block since skb-scrub-packet
> drops skb dst entry.

I understand, VXLAN does the same. A comment would help though.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/3] rhashtable-test: add cond_resched() to thread test

2015-08-28 Thread Phil Sutter
This should fix soft lockup bugs triggered on slow systems.

Signed-off-by: Phil Sutter 
---
 lib/test_rhashtable.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 8c1ad1c..63654e3 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -236,6 +236,8 @@ static int thread_lookup_test(struct thread_data *tdata)
   obj->value, key);
err++;
}
+
+   cond_resched();
}
return err;
 }
@@ -251,6 +253,7 @@ static int threadfunc(void *data)
 
for (i = 0; i < entries; i++) {
tdata->objs[i].value = (tdata->id << 16) | i;
+   cond_resched();
err = rhashtable_insert_fast(&ht, &tdata->objs[i].node,
 test_rht_params);
if (err == -ENOMEM || err == -EBUSY) {
@@ -285,6 +288,8 @@ static int threadfunc(void *data)
goto out;
}
tdata->objs[i].value = TEST_INSERT_FAIL;
+
+   cond_resched();
}
err = thread_lookup_test(tdata);
if (err) {
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/3] rhashtable-test: retry insert operations in threads

2015-08-28 Thread Phil Sutter
After adding cond_resched() calls to threadfunc(), a surprisingly high
rate of insert failures occurred probably due to table resizes getting a
better chance to run in background. To not soften up the remaining
tests, retry inserts until they either succeed or fail permanently.

Signed-off-by: Phil Sutter 
---
 lib/test_rhashtable.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 63654e3..093cf84 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -244,7 +244,7 @@ static int thread_lookup_test(struct thread_data *tdata)
 
 static int threadfunc(void *data)
 {
-   int i, step, err = 0, insert_fails = 0;
+   int i, step, err = 0, retries = 0;
struct thread_data *tdata = data;
 
up(&prestart_sem);
@@ -253,21 +253,22 @@ static int threadfunc(void *data)
 
for (i = 0; i < entries; i++) {
tdata->objs[i].value = (tdata->id << 16) | i;
+insert_retry:
cond_resched();
err = rhashtable_insert_fast(&ht, &tdata->objs[i].node,
 test_rht_params);
if (err == -ENOMEM || err == -EBUSY) {
-   tdata->objs[i].value = TEST_INSERT_FAIL;
-   insert_fails++;
+   retries++;
+   goto insert_retry;
} else if (err) {
pr_err("  thread[%d]: rhashtable_insert_fast failed\n",
   tdata->id);
goto out;
}
}
-   if (insert_fails)
-   pr_info("  thread[%d]: %d insert failures\n",
-   tdata->id, insert_fails);
+   if (retries)
+   pr_info("  thread[%d]: retried insert operation %d times\n",
+   tdata->id, retries);
 
err = thread_lookup_test(tdata);
if (err) {
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/3] rhashtable-test: calculate max_entries value by default

2015-08-28 Thread Phil Sutter
A maximum table size of 64k entries is insufficient for the multiple
threads test even in default configuration (10 threads * 50000 objects =
500000 objects in total). Since we know how many objects will be
inserted, calculate the max size unless overridden by parameter.

Note that specifying the exact number of objects upon table init won't
suffice as that value is being rounded down to the next power of two -
anticipate this by rounding up to the next power of two beforehand.

Signed-off-by: Phil Sutter 
---
 lib/test_rhashtable.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
index 093cf84..73fcb8e 100644
--- a/lib/test_rhashtable.c
+++ b/lib/test_rhashtable.c
@@ -36,9 +36,9 @@ static int runs = 4;
 module_param(runs, int, 0);
 MODULE_PARM_DESC(runs, "Number of test runs per variant (default: 4)");
 
-static int max_size = 65536;
+static int max_size = 0;
 module_param(max_size, int, 0);
-MODULE_PARM_DESC(runs, "Maximum table size (default: 65536)");
+MODULE_PARM_DESC(runs, "Maximum table size (default: calculated)");
 
 static bool shrinking = false;
 module_param(shrinking, bool, 0);
@@ -317,7 +317,7 @@ static int __init test_rht_init(void)
entries = min(entries, MAX_ENTRIES);
 
test_rht_params.automatic_shrinking = shrinking;
-   test_rht_params.max_size = max_size;
+   test_rht_params.max_size = max_size ? : roundup_pow_of_two(entries);
test_rht_params.nelem_hint = size;
 
pr_info("Running rhashtable test nelem=%d, max_size=%d, shrinking=%d\n",
@@ -363,6 +363,8 @@ static int __init test_rht_init(void)
return -ENOMEM;
}
 
+   test_rht_params.max_size = max_size ? :
+  roundup_pow_of_two(tcount * entries);
err = rhashtable_init(&ht, &test_rht_params);
if (err < 0) {
pr_warn("Test failed: Unable to initialize hashtable: %d\n",
-- 
2.1.2

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] usbnet: Fix a race between usbnet_stop() and the BH

2015-08-28 Thread Eugene Shatokhin

28.08.2015 11:55, Bjørn Mork пишет:

Eugene Shatokhin  writes:


25.08.2015 00:01, Bjørn Mork пишет:

Eugene Shatokhin  writes:


The race may happen when a device (e.g. YOTA 4G LTE Modem) is
unplugged while the system is downloading a large file from the Net.

Hardware breakpoints and Kprobes with delays were used to confirm that
the race does actually happen.

The race is on skb_queue ('next' pointer) between usbnet_stop()
and rx_complete(), which, in turn, calls usbnet_bh().

Here is a part of the call stack with the code where the changes to the
queue happen. The line numbers are for the kernel 4.1.0:

*0 __skb_unlink (skbuff.h:1517)
  prev->next = next;
*1 defer_bh (usbnet.c:430)
  spin_lock_irqsave(&list->lock, flags);
  old_state = entry->state;
  entry->state = state;
  __skb_unlink(skb, list);
  spin_unlock(&list->lock);
  spin_lock(&dev->done.lock);
  __skb_queue_tail(&dev->done, skb);
  if (dev->done.qlen == 1)
  tasklet_schedule(&dev->bh);
  spin_unlock_irqrestore(&dev->done.lock, flags);
*2 rx_complete (usbnet.c:640)
  state = defer_bh(dev, skb, &dev->rxq, state);

At the same time, the following code repeatedly checks if the queue is
empty and reads these values concurrently with the above changes:

*0  usbnet_terminate_urbs (usbnet.c:765)
  /* maybe wait for deletions to finish. */
  while (!skb_queue_empty(&dev->rxq)
  && !skb_queue_empty(&dev->txq)
  && !skb_queue_empty(&dev->done)) {
  schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
  set_current_state(TASK_UNINTERRUPTIBLE);
  netif_dbg(dev, ifdown, dev->net,
"waited for %d urb completions\n", temp);
  }
*1  usbnet_stop (usbnet.c:806)
  if (!(info->flags & FLAG_AVOID_UNLINK_URBS))
  usbnet_terminate_urbs(dev);

As a result, it is possible, for example, that the skb is removed from
dev->rxq by __skb_unlink() before the check
"!skb_queue_empty(&dev->rxq)" in usbnet_terminate_urbs() is made. It is
also possible in this case that the skb is added to dev->done queue
after "!skb_queue_empty(&dev->done)" is checked. So
usbnet_terminate_urbs() may stop waiting and return while dev->done
queue still has an item.


Exactly what problem will that result in?  The tasklet_kill() will wait
for the processing of the single element done queue, and everything will
be fine.  Or?


Given enough time, what prevents defer_bh() from calling
tasklet_schedule(&dev->bh) *after* usbnet_stop() calls tasklet_kill()?

Consider the following situation (assuming '&&' are changed to '||' in
that while loop in usbnet_terminate_urbs() as they should be):

CPU0                                    CPU1
usbnet_stop()                           defer_bh() with list == dev->rxq
  usbnet_terminate_urbs()
                                          __skb_unlink() removes the last
                                          skb from dev->rxq.
                                          dev->rxq, dev->txq and dev->done
                                          are now empty.
    while (!skb_queue_empty()...)
      The loop ends because all 3
      queues are now empty.

  usbnet_terminate_urbs() ends.

usbnet_stop() continues:
  usbnet_status_stop(dev);
  ...
  del_timer_sync (&dev->delay);
  tasklet_kill (&dev->bh);
                                          __skb_queue_tail(&dev->done, skb);
                                          if (dev->done.qlen == 1)
                                            tasklet_schedule(&dev->bh);

The BH is scheduled at this point, which is not what was intended. The
race window is small, but still.


I guess you are right.  At least I cannot prove that you are not :)

There is a bit too much complexity involved here for me...


:-)

Yes, it is quite complex.

I admit, it was easier for me to find the races in usbnet (the tools 
like KernelStrider and RaceHound do the dirty work) than to analyze 
their consequences. The latter often requires some time and effort, and 
so it did this time.


Well, any objections to this patch?

Regards,

Eugene

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/3] rhashtable-test: add cond_resched() to thread test

2015-08-28 Thread Thomas Graf
On 08/28/15 at 12:28pm, Phil Sutter wrote:
> This should fix soft lockup bugs triggered on slow systems.
> 
> Signed-off-by: Phil Sutter 

Acked-by: Thomas Graf 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Bridge] [PATCH net-next] bridge: Add netlink support for vlan_protocol attribute

2015-08-28 Thread Toshiaki Makita

On 15/08/28 (金) 0:48, Nikolay Aleksandrov wrote:



On Aug 26, 2015, at 11:00 PM, Toshiaki Makita  
wrote:

This enables bridge vlan_protocol to be configured through netlink.

When CONFIG_BRIDGE_VLAN_FILTERING is disabled, the kernel behaves the
same way as if this feature were not implemented.

Signed-off-by: Toshiaki Makita 
---
include/uapi/linux/if_link.h |  1 +
net/bridge/br_netlink.c  | 34 ++
net/bridge/br_private.h  |  1 +
net/bridge/br_vlan.c | 35 +--
4 files changed, 57 insertions(+), 14 deletions(-)



Nice, looks good. I have a similar patch as well and was going to ask wouldn’t 
it be
better to make empty stubs which return an error when vlan filtering isn’t 
configured
and drop the ifdefs in the netlink handling code ?
Similar to how vlan_filtering netlink attribute is handled in commit:
a7854037da00 ("bridge: netlink: add support for vlan_filtering attribute")

Potential problem would be the return of the protocol, but I think if 0 is 
returned that
can be handled.


This is the exact reason why I didn't implement the stub.
I wanted to avoid burdening userspace with special-casing 0.
Also, this is consistent with the sysfs implementation, which doesn't expose
vlan_* entries when CONFIG_BRIDGE_VLAN_PROTOCOL is disabled.


Toshiaki Makita
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] rhashtable-test: retry insert operations in threads

2015-08-28 Thread Thomas Graf
On 08/28/15 at 12:28pm, Phil Sutter wrote:
> After adding cond_resched() calls to threadfunc(), a surprisingly high
> rate of insert failures occurred probably due to table resizes getting a
> better chance to run in background. To not soften up the remaining
> tests, retry inserts until they either succeed or fail permanently.
> 
> Signed-off-by: Phil Sutter 
> ---
>  lib/test_rhashtable.c | 13 +++--
>  1 file changed, 7 insertions(+), 6 deletions(-)
> 
> diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
> index 63654e3..093cf84 100644
> --- a/lib/test_rhashtable.c
> +++ b/lib/test_rhashtable.c
> @@ -244,7 +244,7 @@ static int thread_lookup_test(struct thread_data *tdata)
>  
>  static int threadfunc(void *data)
>  {
> - int i, step, err = 0, insert_fails = 0;
> + int i, step, err = 0, retries = 0;
>   struct thread_data *tdata = data;
>  
>   up(&prestart_sem);
> @@ -253,21 +253,22 @@ static int threadfunc(void *data)
>  
>   for (i = 0; i < entries; i++) {
>   tdata->objs[i].value = (tdata->id << 16) | i;
> +insert_retry:
>   cond_resched();
>   err = rhashtable_insert_fast(&ht, &tdata->objs[i].node,
>test_rht_params);
>   if (err == -ENOMEM || err == -EBUSY) {
> - tdata->objs[i].value = TEST_INSERT_FAIL;
> - insert_fails++;
> + retries++;
> + goto insert_retry;

Is it safe to retry indefinitely on ENOMEM? Retrying on EBUSY is
definitely an improvement and we should do the same in the non
threaded test as well.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/3] rhashtable-test: calculate max_entries value by default

2015-08-28 Thread Thomas Graf
On 08/28/15 at 12:28pm, Phil Sutter wrote:
> A maximum table size of 64k entries is insufficient for the multiple
> threads test even in default configuration (10 threads * 50000 objects =
> 500000 objects in total). Since we know how many objects will be
> inserted, calculate the max size unless overridden by parameter.
> 
> Note that specifying the exact number of objects upon table init won't
> suffice as that value is being rounded down to the next power of two -
> anticipate this by rounding up to the next power of two in beforehand.
> 
> Signed-off-by: Phil Sutter 

Acked-by: Thomas Graf 

Thanks for doing this work.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] rhashtable-test: retry insert operations in threads

2015-08-28 Thread Phil Sutter
On Fri, Aug 28, 2015 at 01:09:29PM +0200, Thomas Graf wrote:
> On 08/28/15 at 12:28pm, Phil Sutter wrote:
> > After adding cond_resched() calls to threadfunc(), a surprisingly high
> > rate of insert failures occurred probably due to table resizes getting a
> > better chance to run in background. To not soften up the remaining
> > tests, retry inserts until they either succeed or fail permanently.
> > 
> > Signed-off-by: Phil Sutter 
> > ---
> >  lib/test_rhashtable.c | 13 +++--
> >  1 file changed, 7 insertions(+), 6 deletions(-)
> > 
> > diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
> > index 63654e3..093cf84 100644
> > --- a/lib/test_rhashtable.c
> > +++ b/lib/test_rhashtable.c
> > @@ -244,7 +244,7 @@ static int thread_lookup_test(struct thread_data *tdata)
> >  
> >  static int threadfunc(void *data)
> >  {
> > -   int i, step, err = 0, insert_fails = 0;
> > +   int i, step, err = 0, retries = 0;
> > struct thread_data *tdata = data;
> >  
> > up(&prestart_sem);
> > @@ -253,21 +253,22 @@ static int threadfunc(void *data)
> >  
> > for (i = 0; i < entries; i++) {
> > tdata->objs[i].value = (tdata->id << 16) | i;
> > +insert_retry:
> > cond_resched();
> > err = rhashtable_insert_fast(&ht, &tdata->objs[i].node,
> >  test_rht_params);
> > if (err == -ENOMEM || err == -EBUSY) {
> > -   tdata->objs[i].value = TEST_INSERT_FAIL;
> > -   insert_fails++;
> > +   retries++;
> > +   goto insert_retry;
> 
> Is it safe to retry indefinitely on ENOMEM? Retrying on EBUSY is
> definitely an improvement and we should do the same in the non
> threaded test as well.

Oh yes, that is definitely a bug. I will respin and add the same for the
normal test, too.

Thanks, Phil
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH net-next 0/2] Add new switchdev device class

2015-08-28 Thread Andrew Lunn
> So with kobj, a device can have a parent.  So I experimented with my
> RFC patch and changed register_switchdev to take a parent switchdev
> arg, which is NULL for leaf switchdevs:
> 
> int register_switchdev(struct switchdev *sdev, const char *name,
>struct switchdev *parent)
> {
> struct device *dev = &sdev->dev;
> int err;
> 
> device_initialize(dev);
> 
> dev->class = &switchdev_class;
> if (parent)
> dev->parent = &parent->dev;
> 
> err = dev_set_name(dev, "%s", name);
> if (err)
> return err;
> 
> return device_add(dev);
> }
 
...

> With this, we can stack switchdevs, I guess as high as we want.  Does
> this look usable for DSA?   An attr set on the master would get pushed
> down to the leaves.  We can do it with the same style of recursive
> algos we use for switchdev port attrs.

Since this is a file system, we are limited to trees. But the hardware
is actually a graph. The interconnect points are ports on the switch,
and possible trunks/bonds of ports. I doubt there is a nice way to
represent this.

So it probably makes sense to have the switch with the port to the
host as the root of the tree, and all other switches are leaves of that
root.

How do you envisage addressing? I want to use netlink to read the
global registers from a specific switch for example.

   Andrew
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -next 2/3] net: fib6: reduce identation in ip6_convert_metrics

2015-08-28 Thread Florian Westphal
From: Daniel Borkmann 

Reduce the indentation a bit; there's no need to have it artificially
increased.

Signed-off-by: Daniel Borkmann 
---
 net/ipv6/route.c | 32 
 1 file changed, 16 insertions(+), 16 deletions(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index df3e353..56b2e71 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1711,26 +1711,26 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
 
nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
int type = nla_type(nla);
+   u32 val;
 
-   if (type) {
-   u32 val;
-
-   if (unlikely(type > RTAX_MAX))
-   goto err;
-   if (type == RTAX_CC_ALGO) {
-   char tmp[TCP_CA_NAME_MAX];
+   if (!type)
+   continue;
+   if (unlikely(type > RTAX_MAX))
+   goto err;
 
-   nla_strlcpy(tmp, nla, sizeof(tmp));
-   val = tcp_ca_get_key_by_name(tmp);
-   if (val == TCP_CA_UNSPEC)
-   goto err;
-   } else {
-   val = nla_get_u32(nla);
-   }
+   if (type == RTAX_CC_ALGO) {
+   char tmp[TCP_CA_NAME_MAX];
 
-   mp[type - 1] = val;
-   __set_bit(type - 1, mxc->mx_valid);
+   nla_strlcpy(tmp, nla, sizeof(tmp));
+   val = tcp_ca_get_key_by_name(tmp);
+   if (val == TCP_CA_UNSPEC)
+   goto err;
+   } else {
+   val = nla_get_u32(nla);
}
+
+   mp[type - 1] = val;
+   __set_bit(type - 1, mxc->mx_valid);
}
 
mxc->mx = mp;
-- 
2.0.5

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -next 3/3] tcp: use dctcp if enabled on the route to the initiator

2015-08-28 Thread Florian Westphal
From: Daniel Borkmann 

Currently, the following case doesn't use DCTCP, even if it should:
A responder has f.e. Cubic as system wide default, but for a specific
route to the initiating host, DCTCP is being set in RTAX_CC_ALGO. The
initiating host then uses DCTCP as congestion control, but since the
initiator sets ECT(0), tcp_ecn_create_request() doesn't set ecn_ok,
and we have to fall back to Reno after 3WHS completes.

We were thinking about how to solve this in a minimal, non-intrusive
way without bloating tcp_ecn_create_request() needlessly: let's cache
the CA ecn option flag in RTAX_FEATURES. In other words, when ECT(0)
is set on the SYN packet, set ecn_ok=1 iff route RTAX_FEATURES
contains RTAX_FEATURE_ECN_CA. This allows doing only a single metric
feature lookup inside tcp_ecn_create_request().

Joint work with Florian Westphal.

Signed-off-by: Daniel Borkmann 
Signed-off-by: Florian Westphal 
---
 include/net/tcp.h  |  2 +-
 include/uapi/linux/rtnetlink.h | 11 +++
 net/core/rtnetlink.c   |  4 
 net/ipv4/fib_semantics.c   |  9 -
 net/ipv4/tcp_cong.c|  9 ++---
 net/ipv4/tcp_input.c   |  7 +--
 net/ipv6/route.c   | 11 +--
 7 files changed, 40 insertions(+), 13 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 4a7b039..0cab28c 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -888,7 +888,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 
acked);
 extern struct tcp_congestion_ops tcp_reno;
 
 struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
-u32 tcp_ca_get_key_by_name(const char *name);
+u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca);
 #ifdef CONFIG_INET
 char *tcp_ca_get_name_by_key(u32 key, char *buffer);
 #else
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 0d3d3cc..a5eb242 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -418,10 +418,13 @@ enum {
 
 #define RTAX_MAX (__RTAX_MAX - 1)
 
-#define RTAX_FEATURE_ECN   0x0001
-#define RTAX_FEATURE_SACK  0x0002
-#define RTAX_FEATURE_TIMESTAMP 0x0004
-#define RTAX_FEATURE_ALLFRAG   0x0008
+#define RTAX_FEATURE_ECN   (1 << 0)
+#define RTAX_FEATURE_SACK  (1 << 1)
+#define RTAX_FEATURE_TIMESTAMP (1 << 2)
+#define RTAX_FEATURE_ALLFRAG   (1 << 3)
+#define RTAX_FEATURE_ECN_CA(1 << 4)
+
+#define RTAX_FEATURE_MASK_ECN  (RTAX_FEATURE_ECN | RTAX_FEATURE_ECN_CA)
 
 struct rta_session {
__u8proto;
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 788ceed..12a9b5a 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -678,6 +678,10 @@ int rtnetlink_put_metrics(struct sk_buff *skb, u32 
*metrics)
continue;
if (nla_put_string(skb, i + 1, name))
goto nla_put_failure;
+   } else if (i == RTAX_FEATURES - 1) {
+   u32 feat = metrics[i] & ~RTAX_FEATURE_ECN_CA;
+   if (nla_put_u32(skb, i + 1, feat))
+   goto nla_put_failure;
} else {
if (nla_put_u32(skb, i + 1, metrics[i]))
goto nla_put_failure;
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 88afbae..d7ecc92 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -879,6 +879,7 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, 
__be32 fib_prefsrc)
 static int
 fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
 {
+   bool ecn_ca = false;
struct nlattr *nla;
int remaining;
 
@@ -898,7 +899,7 @@ fib_convert_metrics(struct fib_info *fi, const struct 
fib_config *cfg)
char tmp[TCP_CA_NAME_MAX];
 
nla_strlcpy(tmp, nla, sizeof(tmp));
-   val = tcp_ca_get_key_by_name(tmp);
+   val = tcp_ca_get_key_by_name(tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC)
return -EINVAL;
} else {
@@ -908,9 +909,15 @@ fib_convert_metrics(struct fib_info *fi, const struct 
fib_config *cfg)
val = 65535 - 40;
if (type == RTAX_MTU && val > 65535 - 15)
val = 65535 - 15;
+   if (type == RTAX_FEATURES)
+   val &= ~RTAX_FEATURE_ECN_CA;
+
fi->fib_metrics[type - 1] = val;
}
 
+   if (ecn_ca)
+   fi->fib_metrics[RTAX_FEATURES - 1] |= RTAX_FEATURE_ECN_CA;
+
return 0;
 }
 
diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c
index a2ed23c..93c4dc3 100644
--- a/net/ipv4/tcp_cong.c
+++ b/net/ipv4/tcp_cong.c
@@ -114,16 +114,19 @@ void tcp_unregister_congestion_control(struct 
tcp_congestion_ops *ca)
 

[PATCH -next 0/3] tcp: receive-side per route dctcp handling

2015-08-28 Thread Florian Westphal
Currently, the following case doesn't use DCTCP, even if it should:

- responder has f.e. cubic as system wide default
- 'ip route congctl dctcp $src' was set

Then, DCTCP is NOT used if a DCTCP sender attempts to connect from a
host in the $src range: ECT(0) is set, but listen_sk is not dctcp, so we
fail the INET_ECN_is_not_ect sanity check.

We also have to examine the dst used for the SYN/ACK reply to make this
case work.

In order to minimize additional cost, store the 'ecn is must have'
information in the dst_features field.

The set targets -next instead of -net since this doesn't seem to be a
serious bug and to give the change more soak time until it hits linus tree.

Daniel Borkmann (2):
  net: fib6: reduce identation in ip6_convert_metrics
  tcp: use dctcp if enabled on the route to the initiator

Florian Westphal (1):
  net: fib: move metrics parsing to a helper

 include/net/tcp.h  |  2 +-
 include/uapi/linux/rtnetlink.h | 11 +++---
 net/core/rtnetlink.c   |  4 +++
 net/ipv4/fib_semantics.c   | 78 ++
 net/ipv4/tcp_cong.c|  9 +++--
 net/ipv4/tcp_input.c   |  7 ++--
 net/ipv6/route.c   | 39 -
 7 files changed, 94 insertions(+), 56 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH -next 1/3] net: fib: move metrics parsing to a helper

2015-08-28 Thread Florian Westphal
fib_create_info() is already quite large, so before adding more
code to the metrics section move that to a helper, similar to
ip6_convert_metrics.

Suggested-by: Daniel Borkmann 
Signed-off-by: Florian Westphal 
---
 net/ipv4/fib_semantics.c | 71 
 1 file changed, 41 insertions(+), 30 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 1b2d011..88afbae 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -876,6 +876,44 @@ static bool fib_valid_prefsrc(struct fib_config *cfg, 
__be32 fib_prefsrc)
return true;
 }
 
+static int
+fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
+{
+   struct nlattr *nla;
+   int remaining;
+
+   if (!cfg->fc_mx)
+   return 0;
+
+   nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
+   int type = nla_type(nla);
+   u32 val;
+
+   if (!type)
+   continue;
+   if (type > RTAX_MAX)
+   return -EINVAL;
+
+   if (type == RTAX_CC_ALGO) {
+   char tmp[TCP_CA_NAME_MAX];
+
+   nla_strlcpy(tmp, nla, sizeof(tmp));
+   val = tcp_ca_get_key_by_name(tmp);
+   if (val == TCP_CA_UNSPEC)
+   return -EINVAL;
+   } else {
+   val = nla_get_u32(nla);
+   }
+   if (type == RTAX_ADVMSS && val > 65535 - 40)
+   val = 65535 - 40;
+   if (type == RTAX_MTU && val > 65535 - 15)
+   val = 65535 - 15;
+   fi->fib_metrics[type - 1] = val;
+   }
+
+   return 0;
+}
+
 struct fib_info *fib_create_info(struct fib_config *cfg)
 {
int err;
@@ -948,36 +986,9 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
goto failure;
} endfor_nexthops(fi)
 
-   if (cfg->fc_mx) {
-   struct nlattr *nla;
-   int remaining;
-
-   nla_for_each_attr(nla, cfg->fc_mx, cfg->fc_mx_len, remaining) {
-   int type = nla_type(nla);
-
-   if (type) {
-   u32 val;
-
-   if (type > RTAX_MAX)
-   goto err_inval;
-   if (type == RTAX_CC_ALGO) {
-   char tmp[TCP_CA_NAME_MAX];
-
-   nla_strlcpy(tmp, nla, sizeof(tmp));
-   val = tcp_ca_get_key_by_name(tmp);
-   if (val == TCP_CA_UNSPEC)
-   goto err_inval;
-   } else {
-   val = nla_get_u32(nla);
-   }
-   if (type == RTAX_ADVMSS && val > 65535 - 40)
-   val = 65535 - 40;
-   if (type == RTAX_MTU && val > 65535 - 15)
-   val = 65535 - 15;
-   fi->fib_metrics[type - 1] = val;
-   }
-   }
-   }
+   err = fib_convert_metrics(fi, cfg);
+   if (err)
+   goto failure;
 
if (cfg->fc_mp) {
 #ifdef CONFIG_IP_ROUTE_MULTIPATH
-- 
2.0.5

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] macvtap/macvlan: use IFF_NO_QUEUE

2015-08-28 Thread Vlad Yasevich
On 08/27/2015 10:42 PM, Jason Wang wrote:
> 
> 
> On 08/27/2015 06:43 PM, Michael S. Tsirkin wrote:
>> On Wed, Aug 26, 2015 at 01:45:30PM +0800, Jason Wang wrote:
>>>
>>> On 08/26/2015 12:32 AM, Vlad Yasevich wrote:
 On 08/25/2015 07:30 AM, Jason Wang wrote:
> On 08/25/2015 06:17 PM, Michael S. Tsirkin wrote:
>> On Mon, Aug 24, 2015 at 04:33:12PM +0800, Jason Wang wrote:
 For macvlan, switch to use IFF_NO_QUEUE instead of tx_queue_len = 0.

 For macvtap, after commit 6acf54f1cf0a6747bac9fea26f34cfc5a9029523
 ("macvtap: Add support of packet capture on macvtap
 device."). Multiqueue macvtap suffers from single qdisc lock
 contention. This is because macvtap claims a non zero tx_queue_len and
 it reuses this value as its socket receive queue size. Thanks to
 IFF_NO_QUEUE, we can remove the lock contention without breaking
 existing socket receive queue length logic.

 Cc: Patrick McHardy 
 Cc: Vladislav Yasevich 
 Cc: Michael S. Tsirkin 
 Signed-off-by: Jason Wang 
>> Seems to make sense. Give me a day or two to get over the jet lag
>> (and get out from under the pile of mail accumulated while I was 
>> traveling),
>> I'll review properly and ack.
>>
> A note on this patch: only default qdisc were removed but we don't lose
> the ability to attach a qdisc to macvtap (though it may cause lock
> contention on multiqueue case).
>
 Wouldn't that lock contention be solved if we really had multiple queues
 for multi-queue macvtaps?

 -vlad
>>> Yes, but this introduce another layer of txq locks contention?
>> I don't follow - why does it? Could you clarify please?
> 
> I believe Vlad wants to remove NETIF_F_LLTX. If yes, core will do an
> extra tx lock at macvlan layer.

No, I don't want to remove it.  In a sense, it would function similar to
how it works when fwd_priv is populated.  I am still testing the code
as it's showing some strange artifacts...  could be due to keeping LLTX.

-vlad

> 
>>
>>> And it
>>> also needs macvlan multiqueue support. We used to do something like this
>>> but switch to NETIF_F_LLTX finally. You may refer:
>>>
>>> 2c11455321f37da6fe6cc36353149f9ac9183334 macvlan: add multiqueue capability
>>> 8ffab51b3dfc54876f145f15b351c41f3f703195 macvlan: lockless tx path
>> My concern is that the moment someone configures a non-standard qdisc
>> scalability suddenly disappears. That would also be tricky to debug in the
>> field as not a lot of developers use non-standard qdiscs.
>> What do you think?
> 
> Probably not an issue. Non-standard qdisc may need be attached manually
> after device creation, and we don't lose this ability with this patch.
> (Unless somebody changes default_qdisc). Actually, user before
> 6acf54f1cf0a6747bac9fea26f34cfc5a9029523 does not expect any qdisc work
> for macvtap like other stacked devices. This patch also restore this.
> 
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Patch net-next 4/5] net_sched: forbid setting default qdisc to inappropriate ones

2015-08-28 Thread Jamal Hadi Salim

On 08/28/15 00:23, David Miller wrote:


If a default qdisc like HTB is chosen, we invoke the ->init() function
and we change the HTB ->init() function to do something reasonable
if a NULL set of configuration attributes is given.  ie. make HTB use
some defaults.



That may work. Or to reduce ambiguity introduce qdisc->set_default().

cheers,
jamal
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2] bridge: vlan: allow to suppress local mac install for all vlans

2015-08-28 Thread Vlad Yasevich
On 08/27/2015 10:17 PM, Nikolay Aleksandrov wrote:
> 
>> On Aug 27, 2015, at 4:47 PM, Vlad Yasevich  wrote:
>>
>> On 08/27/2015 05:02 PM, Nikolay Aleksandrov wrote:
>>>
 On Aug 26, 2015, at 9:57 PM, roopa  wrote:

 On 8/26/15, 4:33 AM, Nikolay Aleksandrov wrote:
>> On Aug 25, 2015, at 11:06 PM, David Miller  wrote:
>>
>> From: Nikolay Aleksandrov 
>> Date: Tue, 25 Aug 2015 22:28:16 -0700
>>
>>> Certainly, that should be done and I will look into it, but the
>>> essence of this patch is a bit different. The problem here is not
>>> the size of the fdb entries, it’s more the number of them - having
>>> 96000 entries (even if they were 1 byte ones) is just way too much
>>> especially when the fdb hash size is small and static. We could work
>>> on making it dynamic though, but still these type of local entries
>>> per vlan per port can easily be avoided with this option.
>> 96000 bits can be stored in 12k.  Get where I'm going with this?
>>
>> Look at the problem sideways.
> Oh okay, I misunderstood your previous comment. I’ll look into that.
>
 I just wanted to add the other problems we have had with keeping these 
 macs (mostly from userspace POV):
 - add/del netlink notification storms
 - and large netlink dumps

 In addition to in-kernel optimizations, will be nice to have a solution 
 that reduces the burden on userspace. That will need a newer netlink dump 
 format for fdbs. Considering all the changes needed, Nikolays patch seems 
 less intrusive.
>>>
>>> Right, we need to take these into account as well. I’ll continue the 
>>> discussion on this (or restart it) because
>>> I looked into using a bitmap for the local entries only and while it fixes 
>>> the scalability issue, it presents
>>> a few new ones which are mostly related to the fact that these entries now 
>>> exist only without a vlan
>>> and if a new mac comes along which matches one of these but is in a vlan, 
>>> the entry will get created
>>> in br_fdb_update() unless we add a second lookup, but that will slow down 
>>> the learning path.
>>> Also this change requires an update of every fdb function that uses the vid 
>>> as a key (every fdb function?!)
>>> because now we can have the mac in two places instead of one which is a 
>>> pretty big churn with lots
>>> of conditionals all over the place and I don’t like it. Adding this 
>>> complexity for the local addresses only
>>> seems like an overkill, so I think to drop this issue for now.
>>
>> I seem to recall Roopa and I and maybe a few others have discussed this a 
>> few
>> years ago at plumbers, I can't remember the details any more.  All these 
>> local
>> addresses add a ton of confusion.  Does anyone (Stephen?) remember what the
>> original reason was for all these local addresses? I wonder if we can have
>> a knob to disable all of them (not just per vlan)?  That might be cleaner and
>> easier to swallow.
>>
> 
> Right, this would be the easiest way and if the others agree - I’ll post a 
> patch for it so we can
> have some way to resolve it today and even if we fix the scalability issue, 
> this is still a valid case
> that some people don’t want local fdbs installed automatically.
> Any objections to this ?
> 
>>> This patch (that works around the initial problem) also has these issues.
>>> Note that one way to take care of this in a more straight-forward way would 
>>> be to have each entry
>>> with some sort of a bitmap (like Vlad has tried earlier) and then we can 
>>> combine the paths so most
>>> of these issues disappear, but that will not be easy as was already 
>>> commented earlier. I’ve looked
>>> briefly into doing this with rhashtable so we can keep the memory footprint 
>>> for each entry relatively
>>> small but it still affects the performance and we can have thousands of 
>>> resizes happening. 
>>>
>>
>> So, one of the earlier approaches that I've tried (before rhashtable was
>> in the kernel) was to have a hash of vlan ids each with a data structure
>> pointing to a list of ports for a given vlan as well as a list of fdbs for
>> a given vlan.  As far as scalability goes, that's really the best approach.
>> It would also allow us to do packet accounting per vlan.  The only concern
>> at the time was performance of ingress lookup.   I think rhashtables might
>> help with this as well as ability to grow the footprint of the vlan hash
>> table dynamically.
>>
>> -vlad
>>
> I’ll look into it but I’m guessing the learning will become a more 
> complicated process with additional 
> allocations and some hash handling.

I don't remember learning being all that complicated.  The hash only changed 
under
rtnl when vlans were added/removed.  The nice thing is that we wouldn't need
to rebalance, because if the vlan is removed all fdb links get removed too.  
They
don't move to another bucket (But that was with static hash.  Need to look at 
rhash in
more deta

Re: [PATCH net v2] sctp: ASCONF-ACK with Unresolvable Address should be sent

2015-08-28 Thread Vlad Yasevich
On 08/28/2015 05:45 AM, Xin Long wrote:
> RFC 5061:
> This is an opaque integer assigned by the sender to identify each
> request parameter.  The receiver of the ASCONF Chunk will copy this
> 32-bit value into the ASCONF Response Correlation ID field of the
> ASCONF-ACK response parameter.  The sender of the ASCONF can use this
> same value in the ASCONF-ACK to find which request the response is
> for.  Note that the receiver MUST NOT change this 32-bit value.
> 
> Address Parameter: TLV
> 
> This field contains an IPv4 or IPv6 address parameter, as described
> in Section 3.3.2.1 of [RFC4960].
> 
> ASCONF chunk with Error Cause Indication Parameter (Unresolvable Address)
> should be sent if the Delete IP Address is not part of the association.
> 
>   Endpoint A   Endpoint B
>   (ESTABLISHED)(ESTABLISHED)
> 
>   ASCONF->
>   (Delete IP Address)
> <-  ASCONF-ACK
> (Unresolvable Address)
> 
> Signed-off-by: Xin Long 

Acked-by: Vlad Yasevich 

-vlad

> ---
>  net/sctp/sm_make_chunk.c | 15 +--
>  1 file changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
> index 4068fe1..ce7f343 100644
> --- a/net/sctp/sm_make_chunk.c
> +++ b/net/sctp/sm_make_chunk.c
> @@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct 
> sctp_association *asoc,
>   sctp_assoc_set_primary(asoc, asconf->transport);
>   sctp_assoc_del_nonprimary_peers(asoc,
>   asconf->transport);
> - } else
> - sctp_assoc_del_peer(asoc, &addr);
> + return SCTP_ERROR_NO_ERROR;
> + }
> +
> + /* If the address is not part of the association, the
> +  * ASCONF-ACK with Error Cause Indication Parameter
> +  * which including cause of Unresolvable Address should
> +  * be sent.
> +  */
> + peer = sctp_assoc_lookup_paddr(asoc, &addr);
> + if (!peer)
> + return SCTP_ERROR_DNS_FAILED;
> +
> + sctp_assoc_rm_peer(asoc, peer);
>   break;
>   case SCTP_PARAM_SET_PRIMARY:
>   /* ADDIP Section 4.2.4
> 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] rhashtable-test: retry insert operations in threads

2015-08-28 Thread Phil Sutter
On Fri, Aug 28, 2015 at 01:13:20PM +0200, Phil Sutter wrote:
> On Fri, Aug 28, 2015 at 01:09:29PM +0200, Thomas Graf wrote:
> > On 08/28/15 at 12:28pm, Phil Sutter wrote:
> > > After adding cond_resched() calls to threadfunc(), a surprisingly high
> > > rate of insert failures occurred probably due to table resizes getting a
> > > better chance to run in background. To not soften up the remaining
> > > tests, retry inserts until they either succeed or fail permanently.
> > > 
> > > Signed-off-by: Phil Sutter 
> > > ---
> > >  lib/test_rhashtable.c | 13 +++--
> > >  1 file changed, 7 insertions(+), 6 deletions(-)
> > > 
> > > diff --git a/lib/test_rhashtable.c b/lib/test_rhashtable.c
> > > index 63654e3..093cf84 100644
> > > --- a/lib/test_rhashtable.c
> > > +++ b/lib/test_rhashtable.c
> > > @@ -244,7 +244,7 @@ static int thread_lookup_test(struct thread_data 
> > > *tdata)
> > >  
> > >  static int threadfunc(void *data)
> > >  {
> > > - int i, step, err = 0, insert_fails = 0;
> > > + int i, step, err = 0, retries = 0;
> > >   struct thread_data *tdata = data;
> > >  
> > >   up(&prestart_sem);
> > > @@ -253,21 +253,22 @@ static int threadfunc(void *data)
> > >  
> > >   for (i = 0; i < entries; i++) {
> > >   tdata->objs[i].value = (tdata->id << 16) | i;
> > > +insert_retry:
> > >   cond_resched();
> > >   err = rhashtable_insert_fast(&ht, &tdata->objs[i].node,
> > >test_rht_params);
> > >   if (err == -ENOMEM || err == -EBUSY) {
> > > - tdata->objs[i].value = TEST_INSERT_FAIL;
> > > - insert_fails++;
> > > + retries++;
> > > + goto insert_retry;
> > 
> > Is it safe to retry indefinitely on ENOMEM? Retrying on EBUSY is
> > definitely an improvement and we should do the same in the non
> > threaded test as well.
> 
> Oh yes, that is definitely a bug. I will respin and add the same for the
> normal test, too.

Quite ugly, IMHO: rhashtable_insert_fast() may return -ENOMEM as
non-permanent error, if allocation in GFP_ATOMIC failed. In this case,
allocation in GFP_KERNEL is retried by rht_deferred_worker(). Sadly,
there is no way to determine if that has already been tried and failed.

The thread test triggers GFP_ATOMIC allocation failure quite easily, so
I can't really just ignore this issue. :)

Cheers, Phil
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net v2] sctp: ASCONF-ACK with Unresolvable Address should be sent

2015-08-28 Thread Marcelo Ricardo Leitner
On Fri, Aug 28, 2015 at 05:45:58PM +0800, Xin Long wrote:
> RFC 5061:
> This is an opaque integer assigned by the sender to identify each
> request parameter.  The receiver of the ASCONF Chunk will copy this
> 32-bit value into the ASCONF Response Correlation ID field of the
> ASCONF-ACK response parameter.  The sender of the ASCONF can use this
> same value in the ASCONF-ACK to find which request the response is
> for.  Note that the receiver MUST NOT change this 32-bit value.
> 
> Address Parameter: TLV
> 
> This field contains an IPv4 or IPv6 address parameter, as described
> in Section 3.3.2.1 of [RFC4960].
> 
> ASCONF chunk with Error Cause Indication Parameter (Unresolvable Address)
> should be sent if the Delete IP Address is not part of the association.
> 
>   Endpoint A   Endpoint B
>   (ESTABLISHED)(ESTABLISHED)
> 
>   ASCONF->
>   (Delete IP Address)
> <-  ASCONF-ACK
> (Unresolvable Address)
> 
> Signed-off-by: Xin Long 

Acked-by: Marcelo Ricardo Leitner 

> ---
>  net/sctp/sm_make_chunk.c | 15 +--
>  1 file changed, 13 insertions(+), 2 deletions(-)
> 
> diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
> index 4068fe1..ce7f343 100644
> --- a/net/sctp/sm_make_chunk.c
> +++ b/net/sctp/sm_make_chunk.c
> @@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct 
> sctp_association *asoc,
>   sctp_assoc_set_primary(asoc, asconf->transport);
>   sctp_assoc_del_nonprimary_peers(asoc,
>   asconf->transport);
> - } else
> - sctp_assoc_del_peer(asoc, &addr);
> + return SCTP_ERROR_NO_ERROR;
> + }
> +
> + /* If the address is not part of the association, the
> +  * ASCONF-ACK with Error Cause Indication Parameter
> +  * which including cause of Unresolvable Address should
> +  * be sent.
> +  */
> + peer = sctp_assoc_lookup_paddr(asoc, &addr);
> + if (!peer)
> + return SCTP_ERROR_DNS_FAILED;
> +
> + sctp_assoc_rm_peer(asoc, peer);
>   break;
>   case SCTP_PARAM_SET_PRIMARY:
>   /* ADDIP Section 4.2.4
> -- 
> 2.1.0
> 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] Renesas: propagate platform_get_irq() error upstream

2015-08-28 Thread Sergei Shtylyov
Hello.

   Here's 2 patches against DaveM's 'net.git' repo. We fix error handling for
platform_get_irq() that overrides the error code with -ENODEV which e.g.
prevents the deferred probing from working.

[1/2] ravb: propagate platform_get_irq() error upstream
[2/2] sh_eth: propagate platform_get_irq() error upstream

MBR, Sergei

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] ravb: propagate platform_get_irq() error upstream

2015-08-28 Thread Sergei Shtylyov
The driver overrides the error returned by platform_get_irq() with -ENODEV
which e.g. precludes the deferred probing from working. Propagate the real
error code to the driver core instead.

Signed-off-by: Sergei Shtylyov 

---
 drivers/net/ethernet/renesas/ravb_main.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: net/drivers/net/ethernet/renesas/ravb_main.c
===
--- net.orig/drivers/net/ethernet/renesas/ravb_main.c
+++ net/drivers/net/ethernet/renesas/ravb_main.c
@@ -1643,7 +1643,7 @@ static int ravb_probe(struct platform_de
ndev->dma = -1;
irq = platform_get_irq(pdev, 0);
if (irq < 0) {
-   error = -ENODEV;
+   error = irq;
goto out_release;
}
ndev->irq = irq;

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] sh_eth: propagate platform_get_irq() error upstream

2015-08-28 Thread Sergei Shtylyov
The driver overrides the error returned by platform_get_irq() with -ENODEV
which e.g. precludes the deferred probing from working. Propagate the real
error code to the driver core instead.

Signed-off-by: Sergei Shtylyov 

---
 drivers/net/ethernet/renesas/sh_eth.c |4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

Index: net/drivers/net/ethernet/renesas/sh_eth.c
===
--- net.orig/drivers/net/ethernet/renesas/sh_eth.c
+++ net/drivers/net/ethernet/renesas/sh_eth.c
@@ -3089,10 +3089,8 @@ static int sh_eth_drv_probe(struct platf
 
ndev->dma = -1;
ret = platform_get_irq(pdev, 0);
-   if (ret < 0) {
-   ret = -ENODEV;
+   if (ret < 0)
goto out_release;
-   }
ndev->irq = ret;
 
SET_NETDEV_DEV(ndev, &pdev->dev);

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH net-next 3/5] net: Add helper function to compare inetpeer addresses

2015-08-28 Thread David Laight
From: David Ahern
> Sent: 27 August 2015 22:17
> Subject: [PATCH net-next 3/5] net: Add helper function to compare inetpeer addresses
> 
> tcp_metrics and inetpeer both have functions to compare inetpeer
> addresses. Consolidate into 1 version.
> 
> Signed-off-by: David Ahern 
> ---
...
> diff --git a/include/net/inetpeer.h b/include/net/inetpeer.h
> index f75b9e7036a2..9d9b3446731d 100644
> --- a/include/net/inetpeer.h
> +++ b/include/net/inetpeer.h
> @@ -121,6 +121,22 @@ static inline struct inet_peer *inet_getpeer_v6(struct 
> inet_peer_base *base,
>   return inet_getpeer(base, &daddr, create);
>  }
> 
> +static inline int inetpeer_addr_cmp(const struct inetpeer_addr *a,
> + const struct inetpeer_addr *b)
> +{
> + int i, n = (a->family == AF_INET ? 1 : 4);
> +
> + for (i = 0; i < n; i++) {
> + if (a->addr.a6[i] == b->addr.a6[i])
> + continue;
> + if ((__force u32)a->addr.a6[i] < (__force u32)b->addr.a6[i])
> + return -1;
> + return 1;
> + }
> +
> + return 0;
> +}

If the performance of this matters then I'd not use the loop for IPv4
and use u64 comparisons (esp. on 64 bit systems) in an unrolled loop for IPv6.
(Might need to worry about the alignment.)

I presume nothing cares that the ordering relation is endian-dependent?
With either byteswapping memory reads or a byteswapping instruction
then an endian-independent ordering should be almost as quick.

David

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [linux-nfc] [PATCH 1/8] NFC: NCI: Allow connection close with dev down

2015-08-28 Thread Robert Dolca
On Sun, May 24, 2015 at 8:07 PM, Samuel Ortiz  wrote:
> Hi Robert,
>
> On Tue, Mar 31, 2015 at 05:03:42PM +0300, Robert Dolca wrote:
>> On Thu, Mar 26, 2015 at 2:29 AM, Samuel Ortiz  wrote:
>> > Hi Robert,
>> >
>> > On Tue, Feb 24, 2015 at 12:01:45PM +0200, Robert Dolca wrote:
>> >> By calling __nci_request instead of nci_request allows the driver to use
>> >> the function while initializing the device (setup stage)
>> >>
>> >> Signed-off-by: Robert Dolca 
>> >> ---
>> >>  net/nfc/nci/core.c | 2 +-
>> >>  1 file changed, 1 insertion(+), 1 deletion(-)
>> >>
>> >> diff --git a/net/nfc/nci/core.c b/net/nfc/nci/core.c
>> >> index 9575a18..c4dd5d8 100644
>> >> --- a/net/nfc/nci/core.c
>> >> +++ b/net/nfc/nci/core.c
>> >> @@ -558,7 +558,7 @@ static void nci_core_conn_close_req(struct nci_dev 
>> >> *ndev, unsigned long opt)
>> >>
>> >>  int nci_core_conn_close(struct nci_dev *ndev, u8 conn_id)
>> >>  {
>> >> - return nci_request(ndev, nci_core_conn_close_req, conn_id,
>> >> + return __nci_request(ndev, nci_core_conn_close_req, conn_id,
>> >>   msecs_to_jiffies(NCI_CMD_TIMEOUT));
>> > You're fixing your problem by removing the NCI request serialization and
>> > removing the check for your device being UP.
>> > I assume you need to open and close a proprietary connection from your
>> > setup hook ? Then please extend nci_request() to check for both NCI_UP
>> > and NCI_INIT.
>>
>> You are right, I am opening and closing a connection from the setup
>> function. The setup is called by nci_open_device. At the beginning of
>> nci_open_device, req_lock is being acquired and it is released at the
>> end of the function. That means that when setup is being called
>> req_lock is acquired. As you said I can modify nci_request to check for
>> NCI_INIT but it tries to acquire req_lock and it can not succeed.
> I see, I thought the issue was only about checking the NCI_* flags.
>
> As a short term solution, I propose you do the following:
>
> a) Export nci_core_conn_create_req, nci_core_conn_close_req and
> __nci_request.
> b) Call __nci_request() directly from your fdp_nci_close_conn() and
> fdp_nci_create_conn() routines.
>
> The long term, scalable fix would be to implement and export an
> __nci_send_cmd_sync() routine, that would transparently build an NCI
> request and tail it to the ndev req skb queue, and put the caller on a
> wait queue. The created request's response callback would then wake the
> caller up.

If nci_open_device would use another mutex instead of req_lock this
wouldn't be necessary.
I don't see any reason why nci_open_device should block the send
queue. Of course, in nci_open_device all calls to __nci_request would
have to be replaced with nci_request.

Samuel, would that be an acceptable solution?

Regards,
Robert
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[RFC PATCH kernel] Revert "net/mlx4_core: Add port attribute when tracking counters"

2015-08-28 Thread Alexey Kardashevskiy
68230242cdb breaks SRIOV on POWER8 system. I am not really suggesting
reverting the patch, rather asking for a fix.

To reproduce it:

1. boot latest upstream kernel (v4.2-rc8 sha1 4941b8f, ppc64le)

2. Run:
sudo rmmod mlx4_en mlx4_ib mlx4_core
sudo modprobe mlx4_core num_vfs=4 probe_vf=4 port_type_array=2,2 debug_level=1

3. Run QEMU (just to give a complete picture):
/home/aik/qemu-system-ppc64 -enable-kvm -m 2048 -machine pseries \
-nodefaults \
-chardev stdio,id=id0,signal=off,mux=on \
-device spapr-vty,id=id1,chardev=id0,reg=0x71000100 \
-mon id=id2,chardev=id0,mode=readline -nographic -vga none \
-initrd dhclient.cpio -kernel vml400bedbg \
-device vfio-pci,id=id3,host=0003:03:00.1
What guest is used does not matter at all.

4. Wait till guest boots and then run:
dhclient
This assigns IPs to both interfaces just fine. This is essential -
if interface was not brought up since guest started, the bug does not appear.
If interface was up and then down, this still causes the problem
(less likely though).

5. Run in the guest: shutdown -h 0
Guest prints:
mlx4_en: eth0: Close port called
mlx4_en: eth1: Close port called
mlx4_core :00:00.0: mlx4_shutdown was called
And then the host hangs. After 10-30 seconds the host console prints:
NMI watchdog: BUG: soft lockup - CPU#0 stuck for 22s! [qemu-system-ppc:5095]
OR
INFO: rcu_sched detected stalls on CPUs/tasks:
or some other random stuff but always related to some sort of lockup.
Backtraces are like these:

[c01e492a7ac0] [c0135b84] smp_call_function_many+0x2f4/0x3fable)
[c01e492a7b40] [c0135db8] kick_all_cpus_sync+0x38/0x50
[c01e492a7b60] [c0048f38] pmdp_huge_get_and_clear+0x48/0x70
[c01e492a7b90] [c023181c] change_huge_pmd+0xac/0x210
[c01e492a7bf0] [c01fb9e8] change_protection+0x678/0x720
[c01e492a7d00] [c0217d38] change_prot_numa+0x28/0xa0
[c01e492a7d30] [c00e0e40] task_numa_work+0x2a0/0x370
[c01e492a7db0] [c00c5fb4] task_work_run+0xe4/0x160
[c01e492a7e00] [c00169a4] do_notify_resume+0x84/0x90
[c01e492a7e30] [c00098b8] ret_from_except_lite+0x64/0x68

OR

[c01def1b7280] [c00ff941d368] 0xc00ff941d368 (unreliable)
[c01def1b7450] [c001512c] __switch_to+0x1fc/0x350
[c01def1b7490] [c01def1b74e0] 0xc01def1b74e0
[c01def1b74e0] [c011a50c] try_to_del_timer_sync+0x5c/0x90
[c01def1b7520] [c011a590] del_timer_sync+0x50/0x70
[c01def1b7550] [c09136fc] schedule_timeout+0x15c/0x2b0
[c01def1b7620] [c0910e6c] wait_for_common+0x12c/0x230
[c01def1b7660] [c00fa22c] up+0x4c/0x80
[c01def1b76a0] [d00016323e60] __mlx4_cmd+0x320/0x940 [mlx4_core]
[c01def1b7760] [c01def1b77a0] 0xc01def1b77a0
[c01def1b77f0] [d000163528b4] mlx4_2RST_QP_wrapper+0x154/0x1e0 
[mlx4_core]
[c01def1b7860] [d00016324934] mlx4_master_process_vhcr+0x1b4/0x6c0 
[mlx4_core]
[c01def1b7930] [d00016324170] __mlx4_cmd+0x630/0x940 [mlx4_core]
[c01def1b79f0] [d00016346fec] __mlx4_qp_modify.constprop.8+0x1ec/0x350 
[mlx4_core]
[c01def1b7ac0] [d00016292228] mlx4_ib_destroy_qp+0xd8/0x5d0 [mlx4_ib]
[c01def1b7b60] [d00013c7305c] ib_destroy_qp+0x1cc/0x290 [ib_core]
[c01def1b7bb0] [d00016284548] 
destroy_pv_resources.isra.14.part.15+0x48/0xf0 [mlx4_ib]
[c01def1b7be0] [d00016284d28] mlx4_ib_tunnels_update+0x168/0x170 
[mlx4_ib]
[c01def1b7c20] [d000162876e0] mlx4_ib_tunnels_update_work+0x30/0x50 
[mlx4_ib]
[c01def1b7c50] [c00c0d34] process_one_work+0x194/0x490
[c01def1b7ce0] [c00c11b0] worker_thread+0x180/0x5a0
[c01def1b7d80] [c00c8a0c] kthread+0x10c/0x130
[c01def1b7e30] [c00095a8] ret_from_kernel_thread+0x5c/0xb4

i.e. may or may not mention mlx4.
The issue may not happen on a first try but maximum on the second.


This is the function I am passing to the guest:
0003:03:00.1 Ethernet controller: Mellanox Technologies MT27500/MT27520 Family 
[ConnectX-3/ConnectX-3 Pro Virtual Function]
Subsystem: IBM Device 61b0
Flags: bus master, fast devsel, latency 0
[virtual] Memory at 3c108000 (64-bit, prefetchable) [size=128M]
Capabilities: 
Kernel driver in use: mlx4_core

Any ideas? Some patches to try? Thanks!



---
 .../net/ethernet/mellanox/mlx4/resource_tracker.c  | 90 +-
 1 file changed, 3 insertions(+), 87 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c 
b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
index 73db584..802eb2a 100644
--- a/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
+++ b/drivers/net/ethernet/mellanox/mlx4/resource_tracker.c
@@ -723,9 +723,6 @@ static void update_gid(struct mlx4_dev *dev, struct 
mlx4_cmd_mailbox *inbox,
}
 }
 
-static int handle_counter(struct mlx4_dev *dev, struct mlx4_qp_context *qpc,
- u8 slave, int p

[PATCH net-next 4/4] vxlan: do not receive IPv4 packets on IPv6 socket

2015-08-28 Thread Jiri Benc
By default (subject to the sysctl settings), IPv6 sockets listen also for
IPv4 traffic. Vxlan is not prepared for that and expects IPv6 header in
packets received through an IPv6 socket.

In addition, it's currently not possible to have both IPv4 and IPv6 vxlan
tunnel on the same port (unless bindv6only sysctl is enabled), as it's not
possible to create and bind both IPv4 and IPv6 vxlan interfaces and there's
no way to specify both IPv4 and IPv6 remote/group IP addresses.

Set IPV6_V6ONLY on vxlan sockets to fix both of these issues. This is not
done globally in udp_tunnel, as l2tp and tipc seem to work okay when
receiving IPv4 packets on IPv6 socket and people may rely on this behavior.
The other tunnels (geneve and fou) do not support IPv6.

Signed-off-by: Jiri Benc 
---
 drivers/net/vxlan.c   | 1 +
 include/net/udp_tunnel.h  | 3 ++-
 net/ipv6/ip6_udp_tunnel.c | 9 +
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e3adfe0ef66b..6c5269aea544 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2530,6 +2530,7 @@ static struct socket *vxlan_create_sock(struct net *net, 
bool ipv6,
udp_conf.family = AF_INET6;
udp_conf.use_udp6_rx_checksums =
!(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
+   udp_conf.ipv6_v6only = 1;
} else {
udp_conf.family = AF_INET;
}
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 35041d0fc21e..cb2f89f20f5c 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -31,7 +31,8 @@ struct udp_port_cfg {
__be16  peer_udp_port;
unsigned intuse_udp_checksums:1,
use_udp6_tx_checksums:1,
-   use_udp6_rx_checksums:1;
+   use_udp6_rx_checksums:1,
+   ipv6_v6only:1;
 };
 
 int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index e1a1136bda7c..14dacf1df529 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -23,6 +23,15 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg 
*cfg,
if (err < 0)
goto error;
 
+   if (cfg->ipv6_v6only) {
+   int val = 1;
+
+   err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+   (char *) &val, sizeof(val));
+   if (err < 0)
+   goto error;
+   }
+
udp6_addr.sin6_family = AF_INET6;
memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
   sizeof(udp6_addr.sin6_addr));
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 3/4] fou: reject IPv6 config

2015-08-28 Thread Jiri Benc
fou does not really support IPv6 encapsulation. After an UDP socket is
created in fou_create, the encap_rcv callback is set either to fou_udp_recv
or to gue_udp_recv. Both of those unconditionally assume that the received
packet has an IPv4 header and access the data at network_header as if it were an
IPv4 header. This leads to IPv6 flow label being interpreted as IP packet
length, etc.

Disallow fou tunnel to be configured as IPv6 until real IPv6 support is
added to fou.

Signed-off-by: Jiri Benc 
---
 net/ipv4/fou.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 2d1646cff057..e0fcbbbcfe54 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -566,7 +566,7 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_AF]) {
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
 
-   if (family != AF_INET && family != AF_INET6)
+   if (family != AF_INET)
return -EINVAL;
 
cfg->udp_config.family = family;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 0/4] tunnels: fix incorrect IPv4/v6 headers interpretation

2015-08-28 Thread Jiri Benc
With tunneling, it is currently possible to get an IPv6 header and interpret
it as an IPv4 header, or to interpret an IPv6 address as an IPv4 address
(and vice versa). This leads to things like sending packets to an incorrect
address, the IPv6 flow label being interpreted as the IP packet length, etc.

Fix several places where this can happen.

Most of this is net-next only. The third patch affects net, too, but it
doesn't seem there's anything in user space that sets the attribute at all
currently, thus net-next is fine.

Jiri Benc (4):
  ip_tunnels: convert the mode field of ip_tunnel_info to flags
  ip_tunnels: record IP version in tunnel info
  fou: reject IPv6 config
  vxlan: do not receive IPv4 packets on IPv6 socket

 drivers/net/geneve.c   |  3 +++
 drivers/net/vxlan.c|  5 -
 include/net/dst_metadata.h |  2 +-
 include/net/ip_tunnels.h   | 19 ---
 include/net/udp_tunnel.h   |  3 ++-
 net/core/filter.c  |  2 ++
 net/ipv4/fou.c |  2 +-
 net/ipv4/ip_gre.c  |  3 ++-
 net/ipv4/ip_tunnel_core.c  |  2 +-
 net/ipv4/route.c   |  2 +-
 net/ipv6/ip6_udp_tunnel.c  |  9 +
 net/ipv6/route.c   |  2 +-
 net/openvswitch/flow.c |  2 ++
 net/openvswitch/vport.c|  2 ++
 14 files changed, 43 insertions(+), 15 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 1/4] ip_tunnels: convert the mode field of ip_tunnel_info to flags

2015-08-28 Thread Jiri Benc
The mode field holds a single bit of information only (whether the
ip_tunnel_info struct is for rx or tx). Change the mode field to bit flags.
This allows more mode flags to be added.

Signed-off-by: Jiri Benc 
---
 drivers/net/vxlan.c| 2 +-
 include/net/dst_metadata.h | 1 -
 include/net/ip_tunnels.h   | 9 ++---
 net/ipv4/ip_gre.c  | 2 +-
 net/ipv4/route.c   | 2 +-
 net/ipv6/route.c   | 2 +-
 6 files changed, 6 insertions(+), 12 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 30e56cb58884..bd1b8cdf2bf6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2113,7 +2113,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct 
net_device *dev)
}
 
if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
-   info && info->mode == IP_TUNNEL_INFO_TX) {
+   info && info->mode & IP_TUNNEL_INFO_TX) {
vxlan_xmit_one(skb, dev, NULL, false);
return NETDEV_TX_OK;
}
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 60c03326c087..2b83f0d232e0 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -59,7 +59,6 @@ static inline struct metadata_dst *tun_rx_dst(__be16 flags,
return NULL;
 
info = &tun_dst->u.tun_info;
-   info->mode = IP_TUNNEL_INFO_RX;
info->key.tun_flags = flags;
info->key.tun_id = tunnel_id;
info->key.tp_src = 0;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 224e4ecec91b..9bdb3948798f 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -50,13 +50,8 @@ struct ip_tunnel_key {
__be16  tp_dst;
 };
 
-/* Indicates whether the tunnel info structure represents receive
- * or transmit tunnel parameters.
- */
-enum {
-   IP_TUNNEL_INFO_RX,
-   IP_TUNNEL_INFO_TX,
-};
+/* Flags for ip_tunnel_info mode. */
+#define IP_TUNNEL_INFO_TX  0x01/* represents tx tunnel parameters */
 
 struct ip_tunnel_info {
struct ip_tunnel_keykey;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index faf1cde6f8da..1e813a9f9378 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -511,7 +511,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct 
net_device *dev)
int err;
 
tun_info = skb_tunnel_info(skb);
-   if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX))
+   if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
goto err_free_skb;
 
key = &tun_info->key;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3087aaa6dd8..3d9e70c804a9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1693,7 +1693,7 @@ static int ip_route_input_slow(struct sk_buff *skb, 
__be32 daddr, __be32 saddr,
 */
 
tun_info = skb_tunnel_info(skb);
-   if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+   if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
else
fl4.flowi4_tun_key.tun_id = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index df3e353a012d..308dd5f9158f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1174,7 +1174,7 @@ void ip6_route_input(struct sk_buff *skb)
};
 
tun_info = skb_tunnel_info(skb);
-   if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+   if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 2/4] ip_tunnels: record IP version in tunnel info

2015-08-28 Thread Jiri Benc
There's currently nothing preventing directing packets with IPv6
encapsulation data to IPv4 tunnels (and vice versa). If this happens,
IPv6 addresses are incorrectly interpreted as IPv4 ones.

Track whether the given ip_tunnel_key contains IPv4 or IPv6 data. Store this
in ip_tunnel_info. Reject packets at appropriate places if they are supposed
to be encapsulated into an incompatible protocol.

Signed-off-by: Jiri Benc 
---
 drivers/net/geneve.c   |  3 +++
 drivers/net/vxlan.c|  2 ++
 include/net/dst_metadata.h |  1 +
 include/net/ip_tunnels.h   | 10 ++
 net/core/filter.c  |  2 ++
 net/ipv4/ip_gre.c  |  3 ++-
 net/ipv4/ip_tunnel_core.c  |  2 +-
 net/openvswitch/flow.c |  2 ++
 net/openvswitch/vport.c|  2 ++
 9 files changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4357bae732d7..d4882d7904f6 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -644,6 +644,9 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct 
net_device *dev)
u8 *opts = NULL;
u8 vni[3];
 
+   if (ip_tunnel_info_af(info) != AF_INET)
+   goto err;
+
tunnel_id_to_vni(key->tun_id, vni);
if (key->tun_flags & TUNNEL_GENEVE_OPT)
opts = ip_tunnel_info_opts(info, info->options_len);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index bd1b8cdf2bf6..e3adfe0ef66b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1903,6 +1903,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
net_device *dev,
  dev->name);
goto drop;
}
+   if (family != ip_tunnel_info_af(info))
+   goto drop;
 
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
vni = be64_to_cpu(info->key.tun_id);
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 2b83f0d232e0..d32f49cc621d 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -105,6 +105,7 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct 
sk_buff *skb,
info->key.u.ipv6.dst = ip6h->daddr;
info->key.tos = ipv6_get_dsfield(ip6h);
info->key.ttl = ip6h->hop_limit;
+   info->mode = IP_TUNNEL_INFO_IPV6;
return tun_dst;
 }
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 9bdb3948798f..2b4fa06e91bd 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -52,6 +53,7 @@ struct ip_tunnel_key {
 
 /* Flags for ip_tunnel_info mode. */
 #define IP_TUNNEL_INFO_TX  0x01/* represents tx tunnel parameters */
+#define IP_TUNNEL_INFO_IPV60x02/* key contains IPv6 addresses */
 
 struct ip_tunnel_info {
struct ip_tunnel_keykey;
@@ -208,6 +210,8 @@ static inline void __ip_tunnel_info_init(struct 
ip_tunnel_info *tun_info,
 
tun_info->options = opts;
tun_info->options_len = opts_len;
+
+   tun_info->mode = 0;
 }
 
 static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
@@ -221,6 +225,12 @@ static inline void ip_tunnel_info_init(struct 
ip_tunnel_info *tun_info,
  tun_id, tun_flags, opts, opts_len);
 }
 
+static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
+  *tun_info)
+{
+   return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET;
+}
+
 #ifdef CONFIG_INET
 
 int ip_tunnel_init(struct net_device *dev);
diff --git a/net/core/filter.c b/net/core/filter.c
index 66500d490995..13079f03902e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1493,6 +1493,8 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 
size, u64 flags, u64 r5)
 
if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
return -EINVAL;
+   if (ip_tunnel_info_af(info) != AF_INET)
+   return -EINVAL;
 
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1e813a9f9378..bd0679d90519 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -511,7 +511,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct 
net_device *dev)
int err;
 
tun_info = skb_tunnel_info(skb);
-   if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
+   if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ip_tunnel_info_af(tun_info) != AF_INET))
goto err_free_skb;
 
key = &tun_info->key;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 934f2ac8ad61..0c756ade1cf7 100644
--- a/net/ipv4/ip_tunnel_core.c
+++ b/net/ipv4/ip_tunnel_cor

RE: [PATCH v8 1/3] if_link: Add control trust VF

2015-08-28 Thread Rose, Gregory V

> -Original Message-
> From: Hiroshi Shimamoto [mailto:h-shimam...@ct.jp.nec.com]
> Sent: Thursday, August 27, 2015 11:58 PM
> To: Or Gerlitz; Alexander Duyck; Skidmore, Donald C; Rose, Gregory V;
> Kirsher, Jeffrey T; intel-wired-...@lists.osuosl.org; nhor...@redhat.com;
> jogre...@redhat.com; Linux Netdev List; Choi, Sy Jong; Rony Efraim; Edward
> Cree; David Miller; sassm...@redhat.com
> Subject: [PATCH v8 1/3] if_link: Add control trust VF
> 
> From: Hiroshi Shimamoto 
> 
> Add netlink directives and ndo entry to trust VF user.
> 
> This controls the special permission of VF user.
> The administrator will dedicatedly trust VF user to use some features
> which impacts security and/or performance.
> 
> The administrator never turn it on unless VF user is fully trusted.
> 
> Signed-off-by: Hiroshi Shimamoto 
> CC: Choi, Sy Jong > ---

Thank you for persisting in this!

Acked-By: Greg Rose 

I'll leave the patches for ixgbe to Don Skidmore to review. 

>  include/linux/if_link.h  |  1 +
>  include/linux/netdevice.h|  3 +++
>  include/uapi/linux/if_link.h |  6 ++
>  net/core/rtnetlink.c | 24 +---
>  4 files changed, 31 insertions(+), 3 deletions(-)
> 
> diff --git a/include/linux/if_link.h b/include/linux/if_link.h index
> ae5d0d2..f923d15 100644
> --- a/include/linux/if_link.h
> +++ b/include/linux/if_link.h
> @@ -24,5 +24,6 @@ struct ifla_vf_info {
>   __u32 min_tx_rate;
>   __u32 max_tx_rate;
>   __u32 rss_query_en;
> + __u32 trusted;
>  };
>  #endif /* _LINUX_IF_LINK_H */
> diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index
> 6163ecb..7db19e7 100644
> --- a/include/linux/netdevice.h
> +++ b/include/linux/netdevice.h
> @@ -880,6 +880,7 @@ typedef u16 (*select_queue_fallback_t)(struct
> net_device *dev,
>   * int (*ndo_set_vf_rate)(struct net_device *dev, int vf, int
> min_tx_rate,
>   * int max_tx_rate);
>   * int (*ndo_set_vf_spoofchk)(struct net_device *dev, int vf, bool
> setting);
> + * int (*ndo_set_vf_trust)(struct net_device *dev, int vf, bool
> + setting);
>   * int (*ndo_get_vf_config)(struct net_device *dev,
>   *   int vf, struct ifla_vf_info *ivf);
>   * int (*ndo_set_vf_link_state)(struct net_device *dev, int vf, int
> link_state); @@ -1121,6 +1122,8 @@ struct net_device_ops {
>  int max_tx_rate);
>   int (*ndo_set_vf_spoofchk)(struct net_device *dev,
>  int vf, bool setting);
> + int (*ndo_set_vf_trust)(struct net_device *dev,
> + int vf, bool setting);
>   int (*ndo_get_vf_config)(struct net_device *dev,
>int vf,
>struct ifla_vf_info *ivf);
> diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
> index 313c305..2d6abd4 100644
> --- a/include/uapi/linux/if_link.h
> +++ b/include/uapi/linux/if_link.h
> @@ -498,6 +498,7 @@ enum {
>* on/off switch
>*/
>   IFLA_VF_STATS,  /* network device statistics */
> + IFLA_VF_TRUST,  /* Trust VF */
>   __IFLA_VF_MAX,
>  };
> 
> @@ -559,6 +560,11 @@ enum {
> 
>  #define IFLA_VF_STATS_MAX (__IFLA_VF_STATS_MAX - 1)
> 
> +struct ifla_vf_trust {
> + __u32 vf;
> + __u32 setting;
> +};
> +
>  /* VF ports management section
>   *
>   *   Nested layout of set/get msg is:
> diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c index
> 788ceed..2836bf1 100644
> --- a/net/core/rtnetlink.c
> +++ b/net/core/rtnetlink.c
> @@ -831,7 +831,8 @@ static inline int rtnl_vfinfo_size(const struct
> net_device *dev,
>/* IFLA_VF_STATS_BROADCAST */
>nla_total_size(sizeof(__u64)) +
>/* IFLA_VF_STATS_MULTICAST */
> -  nla_total_size(sizeof(__u64)));
> +  nla_total_size(sizeof(__u64)) +
> +  nla_total_size(sizeof(struct ifla_vf_trust)));
>   return size;
>   } else
>   return 0;
> @@ -1154,6 +1155,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb,
> struct net_device *dev,
>   struct ifla_vf_link_state vf_linkstate;
>   struct ifla_vf_rss_query_en vf_rss_query_en;
>   struct ifla_vf_stats vf_stats;
> + struct ifla_vf_trust vf_trust;
> 
>   /*
>* Not all SR-IOV capable drivers support the @@ -1163,6
> +1165,7 @@ static int rtnl_fill_ifinfo(struct sk_buff *skb, struct
> net_device *dev,
>*/
>   ivi.spoofchk = -1;
>   ivi.rss_query_en = -1;
> + ivi.trusted = -

[PATCH net-next 8/8] net: thunderx: Support for internal loopback mode

2015-08-28 Thread Aleksey Makarov
From: Sunil Goutham 

Add support for setting a VF's corresponding BGX LMAC in internal
loopback mode. This mode can be used for verifying basic HW
functionality such as packet I/O, RX checksum validation,
CQ/RBDR interrupts, stats, etc. It is useful when the DUT has no
external network connectivity.

'loopback' mode can be enabled or disabled via ethtool.

Note: This feature is not supported when the number of enabled VFs is
greater than the number of physical interfaces, i.e. active BGX LMACs.

Signed-off-by: Sunil Goutham 
Signed-off-by: Aleksey Makarov 
---
 drivers/net/ethernet/cavium/thunder/nic.h | 11 
 drivers/net/ethernet/cavium/thunder/nic_main.c| 21 +++
 drivers/net/ethernet/cavium/thunder/nicvf_main.c  | 30 ++
 drivers/net/ethernet/cavium/thunder/thunder_bgx.c | 31 +++
 drivers/net/ethernet/cavium/thunder/thunder_bgx.h |  4 +++
 5 files changed, 97 insertions(+)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h 
b/drivers/net/ethernet/cavium/thunder/nic.h
index 35b2ee1..d3950b2 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -265,6 +265,7 @@ struct nicvf {
u8  node;
u8  tns_mode:1;
u8  sqs_mode:1;
+   u8  loopback_supported:1;
u16 mtu;
struct queue_set*qs;
 #defineMAX_SQS_PER_VF_SINGLE_NODE  5
@@ -344,6 +345,7 @@ struct nicvf {
 #defineNIC_MBOX_MSG_NICVF_PTR  0x13/* Send nicvf ptr to PF 
*/
 #defineNIC_MBOX_MSG_PNICVF_PTR 0x14/* Get primary qset 
nicvf ptr */
 #defineNIC_MBOX_MSG_SNICVF_PTR 0x15/* Send sqet nicvf ptr 
to PVF */
+#defineNIC_MBOX_MSG_LOOPBACK   0x16/* Set interface in 
loopback */
 #defineNIC_MBOX_MSG_CFG_DONE   0xF0/* VF configuration 
done */
 #defineNIC_MBOX_MSG_SHUTDOWN   0xF1/* VF is being shutdown 
*/
 
@@ -353,6 +355,7 @@ struct nic_cfg_msg {
u8node_id;
u8tns_mode:1;
u8sqs_mode:1;
+   u8loopback_supported:1;
u8mac_addr[ETH_ALEN];
 };
 
@@ -452,6 +455,13 @@ struct nicvf_ptr {
u64   nicvf;
 };
 
+/* Set interface in loopback mode */
+struct set_loopback {
+   u8msg;
+   u8vf_id;
+   bool  enable;
+};
+
 /* 128 bit shared memory between PF and each VF */
 union nic_mbx {
struct { u8 msg; }  msg;
@@ -468,6 +478,7 @@ union nic_mbx {
struct bgx_link_status  link_status;
struct sqs_allocsqs_alloc;
struct nicvf_ptrnicvf;
+   struct set_loopback lbk;
 };
 
 #define NIC_NODE_ID_MASK   0x03
diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c 
b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 51f3048..fd36820 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -154,6 +154,9 @@ static void nic_mbx_send_ready(struct nicpf *nic, int vf)
}
mbx.nic_cfg.sqs_mode = (vf >= nic->num_vf_en) ? true : false;
mbx.nic_cfg.node_id = nic->node;
+
+   mbx.nic_cfg.loopback_supported = vf < MAX_LMAC;
+
nic_send_msg_to_vf(nic, vf, &mbx);
 }
 
@@ -579,6 +582,21 @@ send_mbox:
nic_send_msg_to_vf(nic, sqs->vf_id, &mbx);
 }
 
+static int nic_config_loopback(struct nicpf *nic, struct set_loopback *lbk)
+{
+   int bgx_idx, lmac_idx;
+
+   if (lbk->vf_id > MAX_LMAC)
+   return -1;
+
+   bgx_idx = NIC_GET_BGX_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lbk->vf_id]);
+   lmac_idx = NIC_GET_LMAC_FROM_VF_LMAC_MAP(nic->vf_lmac_map[lbk->vf_id]);
+
+   bgx_lmac_internal_loopback(nic->node, bgx_idx, lmac_idx, lbk->enable);
+
+   return 0;
+}
+
 /* Interrupt handler to handle mailbox messages from VFs */
 static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
 {
@@ -702,6 +720,9 @@ static void nic_handle_mbx_intr(struct nicpf *nic, int vf)
case NIC_MBOX_MSG_BGX_STATS:
nic_get_bgx_stats(nic, &mbx.bgx_stats);
goto unlock;
+   case NIC_MBOX_MSG_LOOPBACK:
+   ret = nic_config_loopback(nic, &mbx.lbk);
+   break;
default:
dev_err(&nic->pdev->dev,
"Invalid msg from VF%d, msg 0x%x\n", vf, mbx.msg.msg);
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 9a1091a..49228b6 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -202,6 +202,7 @@ static void  nicvf_handle_mbx_intr(struct nicvf *nic)
ether_addr_copy(nic->netdev->dev_addr,
mbx.nic_cfg.mac_addr);
nic->sqs_mode = mbx.nic_cfg.sqs_mode;
+   nic->loopback_supporte

[PATCH net-next 7/8] net: thunderx: Support for upto 96 queues for a VF

2015-08-28 Thread Aleksey Makarov
From: Sunil Goutham 

This patch adds support for handling multiple qsets assigned to a
single VF. There by increasing no of queues from earlier 8 to max
no of CPUs in the system i.e 48 queues on a single node and 96 on
dual node system. User doesn't have option to assign which Qsets/VFs
 to be merged. Upon request from VF, PF assigns next free Qsets as
secondary qsets. To maintain current behavior no of queues is kept
to 8 by default which can be increased via ethtool.

If the user wants to unbind the NICVF driver from a secondary Qset, it
should be done after tearing down the primary VF's interface.

Signed-off-by: Sunil Goutham 
Signed-off-by: Aleksey Makarov 
Signed-off-by: Robert Richter 
---
 drivers/net/ethernet/cavium/thunder/nic.h  |  42 -
 drivers/net/ethernet/cavium/thunder/nic_main.c | 173 +++--
 .../net/ethernet/cavium/thunder/nicvf_ethtool.c| 136 +
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 210 +++--
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c |  32 +++-
 5 files changed, 507 insertions(+), 86 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h 
b/drivers/net/ethernet/cavium/thunder/nic.h
index 89b997e..35b2ee1 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -258,13 +258,23 @@ struct nicvf_drv_stats {
 };
 
 struct nicvf {
+   struct nicvf*pnicvf;
struct net_device   *netdev;
struct pci_dev  *pdev;
u8  vf_id;
u8  node;
-   u8  tns_mode;
+   u8  tns_mode:1;
+   u8  sqs_mode:1;
u16 mtu;
struct queue_set*qs;
+#defineMAX_SQS_PER_VF_SINGLE_NODE  5
+#defineMAX_SQS_PER_VF  11
+   u8  sqs_id;
+   u8  sqs_count; /* Secondary Qset count */
+   struct nicvf*snicvf[MAX_SQS_PER_VF];
+   u8  rx_queues;
+   u8  tx_queues;
+   u8  max_queues;
void __iomem*reg_base;
boollink_up;
u8  duplex;
@@ -330,14 +340,19 @@ struct nicvf {
 #defineNIC_MBOX_MSG_RQ_SW_SYNC 0x0F/* Flush inflight pkts 
to RQ */
 #defineNIC_MBOX_MSG_BGX_STATS  0x10/* Get stats from BGX */
 #defineNIC_MBOX_MSG_BGX_LINK_CHANGE0x11/* BGX:LMAC link status 
*/
-#define NIC_MBOX_MSG_CFG_DONE  0x12/* VF configuration done */
-#define NIC_MBOX_MSG_SHUTDOWN  0x13/* VF is being shutdown */
+#defineNIC_MBOX_MSG_ALLOC_SQS  0x12/* Allocate secondary 
Qset */
+#defineNIC_MBOX_MSG_NICVF_PTR  0x13/* Send nicvf ptr to PF 
*/
+#defineNIC_MBOX_MSG_PNICVF_PTR 0x14/* Get primary qset 
nicvf ptr */
+#defineNIC_MBOX_MSG_SNICVF_PTR 0x15/* Send sqet nicvf ptr 
to PVF */
+#defineNIC_MBOX_MSG_CFG_DONE   0xF0/* VF configuration 
done */
+#defineNIC_MBOX_MSG_SHUTDOWN   0xF1/* VF is being shutdown 
*/
 
 struct nic_cfg_msg {
u8msg;
u8vf_id;
-   u8tns_mode;
u8node_id;
+   u8tns_mode:1;
+   u8sqs_mode:1;
u8mac_addr[ETH_ALEN];
 };
 
@@ -345,6 +360,7 @@ struct nic_cfg_msg {
 struct qs_cfg_msg {
u8msg;
u8num;
+   u8sqs_count;
u64   cfg;
 };
 
@@ -361,6 +377,7 @@ struct sq_cfg_msg {
u8msg;
u8qs_num;
u8sq_num;
+   bool  sqs_mode;
u64   cfg;
 };
 
@@ -420,6 +437,21 @@ struct bgx_link_status {
u32   speed;
 };
 
+/* Get Extra Qset IDs */
+struct sqs_alloc {
+   u8msg;
+   u8vf_id;
+   u8qs_count;
+};
+
+struct nicvf_ptr {
+   u8msg;
+   u8vf_id;
+   bool  sqs_mode;
+   u8sqs_id;
+   u64   nicvf;
+};
+
 /* 128 bit shared memory between PF and each VF */
 union nic_mbx {
struct { u8 msg; }  msg;
@@ -434,6 +466,8 @@ union nic_mbx {
struct rss_cfg_msg  rss_cfg;
struct bgx_stats_msgbgx_stats;
struct bgx_link_status  link_status;
+   struct sqs_allocsqs_alloc;
+   struct nicvf_ptrnicvf;
 };
 
 #define NIC_NODE_ID_MASK   0x03
diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c 
b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 7dfec4a..51f3048 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -28,6 +28,11 @@ struct nicpf {
u8  num_vf_en;  /* No of VF enabled */
boolvf_enabled[MAX_NUM_VFS_SUPPORTED];
void __iomem*reg_base;   /* R

[PATCH net-next 3/8] net: thunderx: mailboxes: remove code duplication

2015-08-28 Thread Aleksey Makarov
From: Sunil Goutham 

Use the nicvf_send_msg_to_pf() function in the mailbox code.

Signed-off-by: Sunil Goutham 
Signed-off-by: Robert Richter 
Signed-off-by: Aleksey Makarov 
---
 drivers/net/ethernet/cavium/thunder/nic.h|  3 +-
 drivers/net/ethernet/cavium/thunder/nicvf_main.c | 44 ++--
 2 files changed, 11 insertions(+), 36 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h 
b/drivers/net/ethernet/cavium/thunder/nic.h
index 58adfd6..a83f567 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -295,10 +295,9 @@ struct nicvf {
charirq_name[NIC_VF_MSIX_VECTORS][20];
boolirq_allocated[NIC_VF_MSIX_VECTORS];
 
-   boolpf_ready_to_rcv_msg;
+   /* VF <-> PF mailbox communication */
boolpf_acked;
boolpf_nacked;
-   boolbgx_stats_acked;
boolset_mac_pending;
 } cacheline_aligned_in_smp;
 
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index 670ff9b..d4ad36e 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -105,7 +105,6 @@ u64 nicvf_queue_reg_read(struct nicvf *nic, u64 offset, u64 
qidx)
 }
 
 /* VF -> PF mailbox communication */
-
 static void nicvf_write_to_mbx(struct nicvf *nic, union nic_mbx *mbx)
 {
u64 *msg = (u64 *)mbx;
@@ -147,26 +146,15 @@ int nicvf_send_msg_to_pf(struct nicvf *nic, union nic_mbx 
*mbx)
 */
 static int nicvf_check_pf_ready(struct nicvf *nic)
 {
-   int timeout = 5000, sleep = 20;
union nic_mbx mbx = {};
 
mbx.msg.msg = NIC_MBOX_MSG_READY;
-
-   nic->pf_ready_to_rcv_msg = false;
-
-   nicvf_write_to_mbx(nic, &mbx);
-
-   while (!nic->pf_ready_to_rcv_msg) {
-   msleep(sleep);
-   if (nic->pf_ready_to_rcv_msg)
-   break;
-   timeout -= sleep;
-   if (!timeout) {
-   netdev_err(nic->netdev,
-  "PF didn't respond to READY msg\n");
-   return 0;
-   }
+   if (nicvf_send_msg_to_pf(nic, &mbx)) {
+   netdev_err(nic->netdev,
+  "PF didn't respond to READY msg\n");
+   return 0;
}
+
return 1;
 }
 
@@ -197,7 +185,7 @@ static void  nicvf_handle_mbx_intr(struct nicvf *nic)
netdev_dbg(nic->netdev, "Mbox message: msg: 0x%x\n", mbx.msg.msg);
switch (mbx.msg.msg) {
case NIC_MBOX_MSG_READY:
-   nic->pf_ready_to_rcv_msg = true;
+   nic->pf_acked = true;
nic->vf_id = mbx.nic_cfg.vf_id & 0x7F;
nic->tns_mode = mbx.nic_cfg.tns_mode & 0x7F;
nic->node = mbx.nic_cfg.node_id;
@@ -221,7 +209,6 @@ static void  nicvf_handle_mbx_intr(struct nicvf *nic)
case NIC_MBOX_MSG_BGX_STATS:
nicvf_read_bgx_stats(nic, &mbx.bgx_stats);
nic->pf_acked = true;
-   nic->bgx_stats_acked = true;
break;
case NIC_MBOX_MSG_BGX_LINK_CHANGE:
nic->pf_acked = true;
@@ -1083,7 +1070,6 @@ void nicvf_update_lmac_stats(struct nicvf *nic)
 {
int stat = 0;
union nic_mbx mbx = {};
-   int timeout;
 
if (!netif_running(nic->netdev))
return;
@@ -1093,14 +1079,9 @@ void nicvf_update_lmac_stats(struct nicvf *nic)
/* Rx stats */
mbx.bgx_stats.rx = 1;
while (stat < BGX_RX_STATS_COUNT) {
-   nic->bgx_stats_acked = 0;
mbx.bgx_stats.idx = stat;
-   nicvf_send_msg_to_pf(nic, &mbx);
-   timeout = 0;
-   while ((!nic->bgx_stats_acked) && (timeout < 10)) {
-   msleep(2);
-   timeout++;
-   }
+   if (nicvf_send_msg_to_pf(nic, &mbx))
+   return;
stat++;
}
 
@@ -1109,14 +1090,9 @@ void nicvf_update_lmac_stats(struct nicvf *nic)
/* Tx stats */
mbx.bgx_stats.rx = 0;
while (stat < BGX_TX_STATS_COUNT) {
-   nic->bgx_stats_acked = 0;
mbx.bgx_stats.idx = stat;
-   nicvf_send_msg_to_pf(nic, &mbx);
-   timeout = 0;
-   while ((!nic->bgx_stats_acked) && (timeout < 10)) {
-   msleep(2);
-   timeout++;
-   }
+   if (nicvf_send_msg_to_pf(nic, &mbx))
+   return;
stat++;
}
 }
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 6/8] net: thunderx: Rework interrupt handler

2015-08-28 Thread Aleksey Makarov
From: Sunil Goutham 

Rework the interrupt handler to avoid checking the IRQ affinity of
CQ interrupts. Separate handlers are now registered for each IRQ,
including RBDR. Also, register interrupt handlers only for those
IRQs which are actually in use.

Signed-off-by: Sunil Goutham 
Signed-off-by: Aleksey Makarov 
---
 drivers/net/ethernet/cavium/thunder/nic.h  |   1 +
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 172 -
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h |   2 +
 3 files changed, 103 insertions(+), 72 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h 
b/drivers/net/ethernet/cavium/thunder/nic.h
index a83f567..89b997e 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -135,6 +135,7 @@
 #defineNICVF_TX_TIMEOUT(50 * HZ)
 
 struct nicvf_cq_poll {
+   struct  nicvf *nicvf;
u8  cq_idx; /* Completion queue index */
struct  napi_struct napi;
 };
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index de51828..2198f61 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -653,11 +653,20 @@ static void nicvf_handle_qs_err(unsigned long data)
nicvf_enable_intr(nic, NICVF_INTR_QS_ERR, 0);
 }
 
+static inline void nicvf_dump_intr_status(struct nicvf *nic)
+{
+   if (netif_msg_intr(nic))
+   netdev_info(nic->netdev, "%s: interrupt status 0x%llx\n",
+   nic->netdev->name, nicvf_reg_read(nic, NIC_VF_INT));
+}
+
 static irqreturn_t nicvf_misc_intr_handler(int irq, void *nicvf_irq)
 {
struct nicvf *nic = (struct nicvf *)nicvf_irq;
u64 intr;
 
+   nicvf_dump_intr_status(nic);
+
intr = nicvf_reg_read(nic, NIC_VF_INT);
/* Check for spurious interrupt */
if (!(intr & NICVF_INTR_MBOX_MASK))
@@ -668,59 +677,58 @@ static irqreturn_t nicvf_misc_intr_handler(int irq, void 
*nicvf_irq)
return IRQ_HANDLED;
 }
 
-static irqreturn_t nicvf_intr_handler(int irq, void *nicvf_irq)
+static irqreturn_t nicvf_intr_handler(int irq, void *cq_irq)
+{
+   struct nicvf_cq_poll *cq_poll = (struct nicvf_cq_poll *)cq_irq;
+   struct nicvf *nic = cq_poll->nicvf;
+   int qidx = cq_poll->cq_idx;
+
+   nicvf_dump_intr_status(nic);
+
+   /* Disable interrupts */
+   nicvf_disable_intr(nic, NICVF_INTR_CQ, qidx);
+
+   /* Schedule NAPI */
+   napi_schedule(&cq_poll->napi);
+
+   /* Clear interrupt */
+   nicvf_clear_intr(nic, NICVF_INTR_CQ, qidx);
+
+   return IRQ_HANDLED;
+}
+
+static irqreturn_t nicvf_rbdr_intr_handler(int irq, void *nicvf_irq)
 {
-   u64 qidx, intr, clear_intr = 0;
-   u64 cq_intr, rbdr_intr, qs_err_intr;
struct nicvf *nic = (struct nicvf *)nicvf_irq;
-   struct queue_set *qs = nic->qs;
-   struct nicvf_cq_poll *cq_poll = NULL;
+   u8 qidx;
 
-   intr = nicvf_reg_read(nic, NIC_VF_INT);
-   if (netif_msg_intr(nic))
-   netdev_info(nic->netdev, "%s: interrupt status 0x%llx\n",
-   nic->netdev->name, intr);
-
-   qs_err_intr = intr & NICVF_INTR_QS_ERR_MASK;
-   if (qs_err_intr) {
-   /* Disable Qset err interrupt and schedule softirq */
-   nicvf_disable_intr(nic, NICVF_INTR_QS_ERR, 0);
-   tasklet_hi_schedule(&nic->qs_err_task);
-   clear_intr |= qs_err_intr;
-   }
 
-   /* Disable interrupts and start polling */
-   cq_intr = (intr & NICVF_INTR_CQ_MASK) >> NICVF_INTR_CQ_SHIFT;
-   for (qidx = 0; qidx < qs->cq_cnt; qidx++) {
-   if (!(cq_intr & (1 << qidx)))
-   continue;
-   if (!nicvf_is_intr_enabled(nic, NICVF_INTR_CQ, qidx))
+   nicvf_dump_intr_status(nic);
+
+   /* Disable RBDR interrupt and schedule softirq */
+   for (qidx = 0; qidx < nic->qs->rbdr_cnt; qidx++) {
+   if (!nicvf_is_intr_enabled(nic, NICVF_INTR_RBDR, qidx))
continue;
+   nicvf_disable_intr(nic, NICVF_INTR_RBDR, qidx);
+   tasklet_hi_schedule(&nic->rbdr_task);
+   /* Clear interrupt */
+   nicvf_clear_intr(nic, NICVF_INTR_RBDR, qidx);
+   }
 
-   nicvf_disable_intr(nic, NICVF_INTR_CQ, qidx);
-   clear_intr |= ((1 << qidx) << NICVF_INTR_CQ_SHIFT);
+   return IRQ_HANDLED;
+}
 
-   cq_poll = nic->napi[qidx];
-   /* Schedule NAPI */
-   if (cq_poll)
-   napi_schedule(&cq_poll->napi);
-   }
+static irqreturn_t nicvf_qs_err_intr_handler(int irq, void *nicvf_irq)
+{
+   struct nicvf *nic = (struct nicvf *)nicvf_irq;
 
-   /* Handle RBDR interrupts */
-   rbdr_intr = (intr & NICVF_INTR_RBDR_MASK) >> NICVF_INTR_RBDR_SHIFT;
-   if (rbdr_intr) {
-   /* Disa

[PATCH net-next 5/8] net: thunderx: Support for HW VLAN stripping

2015-08-28 Thread Aleksey Makarov
From: Sunil Goutham 

This patch configures HW to strip 802.1Q header if found in a
receiving packet. The stripped VLAN ID and TCI information is
passed on to software via CQE_RX. Also sets netdev's 'vlan_features'
so that other HW offload features can be used for tagged packets.

This offload feature can be enabled or disabled via ethtool.

The network stack normally ignores RPS for 802.1Q packets, which results
in low throughput. With this offload enabled, throughput for tagged
packets will be almost the same as for normal packets.

Note: This patch doesn't enable HW VLAN insertion for transmit packets.

Signed-off-by: Sunil Goutham 
Signed-off-by: Aleksey Makarov 
---
 drivers/net/ethernet/cavium/thunder/nic_main.c |  4 
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 28 +++---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 28 ++
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h |  2 ++
 4 files changed, 55 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic_main.c 
b/drivers/net/ethernet/cavium/thunder/nic_main.c
index 6e0c031..7dfec4a 100644
--- a/drivers/net/ethernet/cavium/thunder/nic_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nic_main.c
@@ -329,6 +329,10 @@ static void nic_init_hw(struct nicpf *nic)
 
/* Timer config */
nic_reg_write(nic, NIC_PF_INTR_TIMER_CFG, NICPF_CLK_PER_INT_TICK);
+
+   /* Enable VLAN ethertype matching and stripping */
+   nic_reg_write(nic, NIC_PF_RX_ETYPE_0_7,
+ (2 << 19) | (ETYPE_ALG_VLAN_STRIP << 16) | ETH_P_8021Q);
 }
 
 /* Channel parse index configuration */
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index afd8ad4..de51828 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -491,6 +492,11 @@ static void nicvf_rcv_pkt_handler(struct net_device 
*netdev,
 
skb->protocol = eth_type_trans(skb, netdev);
 
+   /* Check for stripped VLAN */
+   if (cqe_rx->vlan_found && cqe_rx->vlan_stripped)
+   __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q),
+  ntohs((__force __be16)cqe_rx->vlan_tci));
+
if (napi && (netdev->features & NETIF_F_GRO))
napi_gro_receive(napi, skb);
else
@@ -1220,6 +1226,18 @@ static void nicvf_reset_task(struct work_struct *work)
nic->netdev->trans_start = jiffies;
 }
 
+static int nicvf_set_features(struct net_device *netdev,
+ netdev_features_t features)
+{
+   struct nicvf *nic = netdev_priv(netdev);
+   netdev_features_t changed = features ^ netdev->features;
+
+   if (changed & NETIF_F_HW_VLAN_CTAG_RX)
+   nicvf_config_vlan_stripping(nic, features);
+
+   return 0;
+}
+
 static const struct net_device_ops nicvf_netdev_ops = {
.ndo_open   = nicvf_open,
.ndo_stop   = nicvf_stop,
@@ -1228,6 +1246,7 @@ static const struct net_device_ops nicvf_netdev_ops = {
.ndo_set_mac_address= nicvf_set_mac_address,
.ndo_get_stats64= nicvf_get_stats64,
.ndo_tx_timeout = nicvf_tx_timeout,
+   .ndo_set_features   = nicvf_set_features,
 };
 
 static int nicvf_probe(struct pci_dev *pdev, const struct pci_device_id *ent)
@@ -1301,10 +1320,13 @@ static int nicvf_probe(struct pci_dev *pdev, const 
struct pci_device_id *ent)
if (err)
goto err_free_netdev;
 
-   netdev->features |= (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_SG |
-NETIF_F_TSO | NETIF_F_GRO | NETIF_F_RXHASH);
+   netdev->hw_features = (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_SG |
+  NETIF_F_TSO | NETIF_F_GRO |
+  NETIF_F_HW_VLAN_CTAG_RX | NETIF_F_RXHASH);
+
+   netdev->features |= netdev->hw_features;
 
-   netdev->hw_features = netdev->features;
+   netdev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO;
 
netdev->netdev_ops = &nicvf_netdev_ops;
netdev->watchdog_timeo = NICVF_TX_TIMEOUT;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
index 4fc40d83..b294d67 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_queues.c
@@ -475,6 +475,27 @@ static void nicvf_reclaim_rbdr(struct nicvf *nic,
return;
 }
 
+void nicvf_config_vlan_stripping(struct nicvf *nic, netdev_features_t features)
+{
+   u64 rq_cfg;
+   int sqs;
+
+   rq_cfg = nicvf_queue_reg_read(nic, NIC_QSET_RQ_GEN_CFG, 0);
+
+   /* Enable first VLAN stripping */
+   if (features & NETIF_F_HW_VLAN_CTAG_RX)
+   rq_cfg |= (1ULL << 25);
+ 

[PATCH net-next 4/8] net: thunderx: Receive hashing HW offload support

2015-08-28 Thread Aleksey Makarov
From: Sunil Goutham 

Adding support for receive hashing HW offload by using RSS_ALG
and RSS_TAG fields of CQE_RX descriptor. Also removed dependency
on minimum receive queue count to configure RSS so that hash is
always generated.

This hash is used by RPS logic to distribute flows across multiple
CPUs. Offload can be disabled via ethtool.

Signed-off-by: Robert Richter 
Signed-off-by: Sunil Goutham 
Signed-off-by: Aleksey Makarov 
---
 .../net/ethernet/cavium/thunder/nicvf_ethtool.c| 14 -
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 35 --
 2 files changed, 39 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
index a961aa3..1eec2cd 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
@@ -525,17 +525,15 @@ static int nicvf_set_rxfh(struct net_device *dev, const 
u32 *indir,
struct nicvf_rss_info *rss = &nic->rss_info;
int idx;
 
-   if ((nic->qs->rq_cnt <= 1) || (nic->cpi_alg != CPI_ALG_NONE)) {
-   rss->enable = false;
-   rss->hash_bits = 0;
-   return -EIO;
-   }
-
-   /* We do not allow change in unsupported parameters */
if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP)
return -EOPNOTSUPP;
 
-   rss->enable = true;
+   if (!rss->enable) {
+   netdev_err(nic->netdev,
+  "RSS is disabled, cannot change settings\n");
+   return -EIO;
+   }
+
if (indir) {
for (idx = 0; idx < rss->rss_size; idx++)
rss->ind_tbl[idx] = indir[idx];
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_main.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
index d4ad36e..afd8ad4 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_main.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_main.c
@@ -313,7 +313,7 @@ static int nicvf_rss_init(struct nicvf *nic)
 
nicvf_get_rss_size(nic);
 
-   if ((nic->qs->rq_cnt <= 1) || (cpi_alg != CPI_ALG_NONE)) {
+   if (cpi_alg != CPI_ALG_NONE) {
rss->enable = false;
rss->hash_bits = 0;
return 0;
@@ -416,6 +416,34 @@ static void nicvf_snd_pkt_handler(struct net_device 
*netdev,
}
 }
 
+static inline void nicvf_set_rxhash(struct net_device *netdev,
+   struct cqe_rx_t *cqe_rx,
+   struct sk_buff *skb)
+{
+   u8 hash_type;
+   u32 hash;
+
+   if (!(netdev->features & NETIF_F_RXHASH))
+   return;
+
+   switch (cqe_rx->rss_alg) {
+   case RSS_ALG_TCP_IP:
+   case RSS_ALG_UDP_IP:
+   hash_type = PKT_HASH_TYPE_L4;
+   hash = cqe_rx->rss_tag;
+   break;
+   case RSS_ALG_IP:
+   hash_type = PKT_HASH_TYPE_L3;
+   hash = cqe_rx->rss_tag;
+   break;
+   default:
+   hash_type = PKT_HASH_TYPE_NONE;
+   hash = 0;
+   }
+
+   skb_set_hash(skb, hash, hash_type);
+}
+
 static void nicvf_rcv_pkt_handler(struct net_device *netdev,
  struct napi_struct *napi,
  struct cmp_queue *cq,
@@ -451,6 +479,8 @@ static void nicvf_rcv_pkt_handler(struct net_device *netdev,
 
nicvf_set_rx_frame_cnt(nic, skb);
 
+   nicvf_set_rxhash(netdev, cqe_rx, skb);
+
skb_record_rx_queue(skb, cqe_rx->rq_idx);
if (netdev->hw_features & NETIF_F_RXCSUM) {
/* HW by default verifies TCP/UDP/SCTP checksums */
@@ -1272,7 +1302,8 @@ static int nicvf_probe(struct pci_dev *pdev, const struct 
pci_device_id *ent)
goto err_free_netdev;
 
netdev->features |= (NETIF_F_RXCSUM | NETIF_F_IP_CSUM | NETIF_F_SG |
-NETIF_F_TSO | NETIF_F_GRO);
+NETIF_F_TSO | NETIF_F_GRO | NETIF_F_RXHASH);
+
netdev->hw_features = netdev->features;
 
netdev->netdev_ops = &nicvf_netdev_ops;
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 2/8] net: thunderx: Add receive error stats reporting via ethtool

2015-08-28 Thread Aleksey Makarov
From: Sunil Goutham 

Added ethtool support to dump receive packet error statistics reported
in CQE. Also made some small fixes.

Signed-off-by: Sunil Goutham 
Signed-off-by: Aleksey Makarov 
---
 drivers/net/ethernet/cavium/thunder/nic.h  | 36 +++--
 .../net/ethernet/cavium/thunder/nicvf_ethtool.c| 34 +++--
 drivers/net/ethernet/cavium/thunder/nicvf_main.c   | 26 ---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.c | 86 +++---
 drivers/net/ethernet/cavium/thunder/nicvf_queues.h | 41 ---
 5 files changed, 103 insertions(+), 120 deletions(-)

diff --git a/drivers/net/ethernet/cavium/thunder/nic.h 
b/drivers/net/ethernet/cavium/thunder/nic.h
index 8aee250..58adfd6 100644
--- a/drivers/net/ethernet/cavium/thunder/nic.h
+++ b/drivers/net/ethernet/cavium/thunder/nic.h
@@ -190,10 +190,10 @@ enum tx_stats_reg_offset {
 };
 
 struct nicvf_hw_stats {
-   u64 rx_bytes_ok;
-   u64 rx_ucast_frames_ok;
-   u64 rx_bcast_frames_ok;
-   u64 rx_mcast_frames_ok;
+   u64 rx_bytes;
+   u64 rx_ucast_frames;
+   u64 rx_bcast_frames;
+   u64 rx_mcast_frames;
u64 rx_fcs_errors;
u64 rx_l2_errors;
u64 rx_drop_red;
@@ -204,6 +204,31 @@ struct nicvf_hw_stats {
u64 rx_drop_mcast;
u64 rx_drop_l3_bcast;
u64 rx_drop_l3_mcast;
+   u64 rx_bgx_truncated_pkts;
+   u64 rx_jabber_errs;
+   u64 rx_fcs_errs;
+   u64 rx_bgx_errs;
+   u64 rx_prel2_errs;
+   u64 rx_l2_hdr_malformed;
+   u64 rx_oversize;
+   u64 rx_undersize;
+   u64 rx_l2_len_mismatch;
+   u64 rx_l2_pclp;
+   u64 rx_ip_ver_errs;
+   u64 rx_ip_csum_errs;
+   u64 rx_ip_hdr_malformed;
+   u64 rx_ip_payload_malformed;
+   u64 rx_ip_ttl_errs;
+   u64 rx_l3_pclp;
+   u64 rx_l4_malformed;
+   u64 rx_l4_csum_errs;
+   u64 rx_udp_len_errs;
+   u64 rx_l4_port_errs;
+   u64 rx_tcp_flag_errs;
+   u64 rx_tcp_offset_errs;
+   u64 rx_l4_pclp;
+   u64 rx_truncated_pkts;
+
u64 tx_bytes_ok;
u64 tx_ucast_frames_ok;
u64 tx_bcast_frames_ok;
@@ -222,6 +247,7 @@ struct nicvf_drv_stats {
u64 rx_frames_1518;
u64 rx_frames_jumbo;
u64 rx_drops;
+
/* Tx */
u64 tx_frames_ok;
u64 tx_drops;
@@ -257,7 +283,7 @@ struct nicvf {
u32 cq_coalesce_usecs;
 
u32 msg_enable;
-   struct nicvf_hw_stats   stats;
+   struct nicvf_hw_stats   hw_stats;
struct nicvf_drv_stats  drv_stats;
struct bgx_statsbgx_stats;
struct work_struct  reset_task;
diff --git a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c 
b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
index a4228e6..a961aa3 100644
--- a/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
+++ b/drivers/net/ethernet/cavium/thunder/nicvf_ethtool.c
@@ -35,10 +35,10 @@ struct nicvf_stat {
 }
 
 static const struct nicvf_stat nicvf_hw_stats[] = {
-   NICVF_HW_STAT(rx_bytes_ok),
-   NICVF_HW_STAT(rx_ucast_frames_ok),
-   NICVF_HW_STAT(rx_bcast_frames_ok),
-   NICVF_HW_STAT(rx_mcast_frames_ok),
+   NICVF_HW_STAT(rx_bytes),
+   NICVF_HW_STAT(rx_ucast_frames),
+   NICVF_HW_STAT(rx_bcast_frames),
+   NICVF_HW_STAT(rx_mcast_frames),
NICVF_HW_STAT(rx_fcs_errors),
NICVF_HW_STAT(rx_l2_errors),
NICVF_HW_STAT(rx_drop_red),
@@ -49,6 +49,30 @@ static const struct nicvf_stat nicvf_hw_stats[] = {
NICVF_HW_STAT(rx_drop_mcast),
NICVF_HW_STAT(rx_drop_l3_bcast),
NICVF_HW_STAT(rx_drop_l3_mcast),
+   NICVF_HW_STAT(rx_bgx_truncated_pkts),
+   NICVF_HW_STAT(rx_jabber_errs),
+   NICVF_HW_STAT(rx_fcs_errs),
+   NICVF_HW_STAT(rx_bgx_errs),
+   NICVF_HW_STAT(rx_prel2_errs),
+   NICVF_HW_STAT(rx_l2_hdr_malformed),
+   NICVF_HW_STAT(rx_oversize),
+   NICVF_HW_STAT(rx_undersize),
+   NICVF_HW_STAT(rx_l2_len_mismatch),
+   NICVF_HW_STAT(rx_l2_pclp),
+   NICVF_HW_STAT(rx_ip_ver_errs),
+   NICVF_HW_STAT(rx_ip_csum_errs),
+   NICVF_HW_STAT(rx_ip_hdr_malformed),
+   NICVF_HW_STAT(rx_ip_payload_malformed),
+   NICVF_HW_STAT(rx_ip_ttl_errs),
+   NICVF_HW_STAT(rx_l3_pclp),
+   NICVF_HW_STAT(rx_l4_malformed),
+   NICVF_HW_STAT(rx_l4_csum_errs),
+   NICVF_HW_STAT(rx_udp_len_errs),
+   NICVF_HW_STAT(rx_l4_port_errs),
+   NICVF_HW_STAT(rx_tcp_flag_errs),
+   NICVF_HW_STAT(rx_tcp_offset_errs),
+   NICVF_HW_STAT(rx_l4_pclp),
+   NICVF_HW_STAT(rx_truncated_pkts),
NICVF_HW_STAT(tx_bytes_ok),
NICVF_HW_STAT(tx_ucast_frames_ok),
NICVF_HW_STAT(tx_bcast_frames_ok),
@@ -195,7 +219,7 @@ static void nicvf_get_ethtool_stats(struct net_device 
*netdev,
nicvf_update_lmac_stats(nic);
 
for (stat = 0; stat < nicvf_n_hw_stats; stat++)
-   *(data++) = ((u64 *)&nic->stats)
+   *(data++) = ((u64 *)&nic->hw_st

[PATCH net-next 1/8] net: thunderx: fix MAINTAINERS

2015-08-28 Thread Aleksey Makarov
From: Aleksey Makarov 

The liquidio and thunder drivers have different maintainers.

Signed-off-by: Aleksey Makarov 
---
 MAINTAINERS | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 4e6dcb6..43cf79e 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -928,7 +928,7 @@ M:  Sunil Goutham 
 M: Robert Richter 
 L: linux-arm-ker...@lists.infradead.org (moderated for non-subscribers)
 S: Supported
-F: drivers/net/ethernet/cavium/
+F: drivers/net/ethernet/cavium/thunder/
 
 ARM/CIRRUS LOGIC CLPS711X ARM ARCHITECTURE
 M: Alexander Shiyan 
@@ -2543,7 +2543,6 @@ M: Raghu Vatsavayi 

 L: netdev@vger.kernel.org
 W: http://www.cavium.com
 S: Supported
-F: drivers/net/ethernet/cavium/
 F: drivers/net/ethernet/cavium/liquidio/
 
 CC2520 IEEE-802.15.4 RADIO DRIVER
-- 
2.5.0

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [Bridge] [PATCH net-next] bridge: Add netlink support for vlan_protocol attribute

2015-08-28 Thread Nikolay Aleksandrov

> On Aug 28, 2015, at 4:06 AM, Toshiaki Makita  
> wrote:
> 
> On 15/08/28 (金) 0:48, Nikolay Aleksandrov wrote:
>> 
>>> On Aug 26, 2015, at 11:00 PM, Toshiaki Makita 
>>>  wrote:
>>> 
>>> This enables bridge vlan_protocol to be configured through netlink.
>>> 
>>> When CONFIG_BRIDGE_VLAN_FILTERING is disabled, the kernel behaves the
>>> same way as if this feature were not implemented.
>>> 
>>> Signed-off-by: Toshiaki Makita 
>>> ---
>>> include/uapi/linux/if_link.h |  1 +
>>> net/bridge/br_netlink.c  | 34 ++
>>> net/bridge/br_private.h  |  1 +
>>> net/bridge/br_vlan.c | 35 +--
>>> 4 files changed, 57 insertions(+), 14 deletions(-)
>>> 
>> 
>> Nice, looks good. I have a similar patch as well and was going to ask 
>> wouldn’t it be
>> better to make empty stubs which return an error when vlan filtering isn’t 
>> configured
>> and drop the ifdefs in the netlink handling code ?
>> Similar to how vlan_filtering netlink attribute is handled in commit:
>> a7854037da00 ("bridge: netlink: add support for vlan_filtering attribute")
>> 
>> Potential problem would be the return of the protocol, but I think if 0 is 
>> returned that
>> can be handled.
> 
> This is the exact reason why I didn't implement the stub.
> I wanted to avoid burdening userspace with special-casing 0.
> Also, this is consistent with sysfs implementation, which doesn't expose 
> vlan_* entries when CONFIG_BRIDGE_VLAN_PROTOCOL is disabled.
> 
> Toshiaki Makita

Okay, fair enough.

Thanks,
 Nik--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2] bridge: vlan: allow to suppress local mac install for all vlans

2015-08-28 Thread Nikolay Aleksandrov

> On Aug 28, 2015, at 5:31 AM, Vlad Yasevich  wrote:
> 
> On 08/27/2015 10:17 PM, Nikolay Aleksandrov wrote:
>> 
>>> On Aug 27, 2015, at 4:47 PM, Vlad Yasevich  wrote:
>>> 
>>> On 08/27/2015 05:02 PM, Nikolay Aleksandrov wrote:
 
> On Aug 26, 2015, at 9:57 PM, roopa  wrote:
> 
> On 8/26/15, 4:33 AM, Nikolay Aleksandrov wrote:
>>> On Aug 25, 2015, at 11:06 PM, David Miller  wrote:
>>> 
>>> From: Nikolay Aleksandrov 
>>> Date: Tue, 25 Aug 2015 22:28:16 -0700
>>> 
 Certainly, that should be done and I will look into it, but the
 essence of this patch is a bit different. The problem here is not
 the size of the fdb entries, it’s more the number of them - having
 96000 entries (even if they were 1 byte ones) is just way too much
 especially when the fdb hash size is small and static. We could work
 on making it dynamic though, but still these type of local entries
 per vlan per port can easily be avoided with this option.
>>> 96000 bits can be stored in 12k.  Get where I'm going with this?
>>> 
>>> Look at the problem sideways.
>> Oh okay, I misunderstood your previous comment. I’ll look into that.
>> 
> I just wanted to add the other problems we have had with keeping these 
> macs (mostly from userspace POV):
> - add/del netlink notification storms
> - and large netlink dumps
> 
> In addition to in-kernel optimizations, will be nice to have a solution 
> that reduces the burden on userspace. That will need a newer netlink dump 
> format for fdbs. Considering all the changes needed, Nikolays patch seems 
> less intrusive.
 
 Right, we need to take these into account as well. I’ll continue the 
 discussion on this (or restart it) because
 I looked into using a bitmap for the local entries only and while it fixes 
 the scalability issue, it presents
 a few new ones which are mostly related to the fact that these entries now 
 exist only without a vlan
 and if a new mac comes along which matches one of these but is in a vlan, 
 the entry will get created
 in br_fdb_update() unless we add a second lookup, but that will slow down 
 the learning path.
 Also this change requires an update of every fdb function that uses the 
 vid as a key (every fdb function?!)
 because now we can have the mac in two places instead of one which is a 
 pretty big churn with lots
 of conditionals all over the place and I don’t like it. Adding this 
 complexity for the local addresses only
 seems like an overkill, so I think to drop this issue for now.
>>> 
>>> I seem to recall Roopa and I and maybe a few others discussing this a 
>>> few
>>> years ago at plumbers, I can't remember the details any more.  All these 
>>> local
>>> addresses add a ton of confusion.  Does anyone (Stephen?) remember what the
>>> original reason was for all these local addresses? I wonder if we can have
>>> a knob to disable all of them (not just per vlan)?  That might be cleaner and
>>> easier to swallow.
>>> 
>> 
>> Right, this would be the easiest way and if the others agree - I’ll post a 
>> patch for it so we can
>> have some way to resolve it today and even if we fix the scalability issue, 
>> this is still a valid case
>> that some people don’t want local fdbs installed automatically.
>> Any objections to this ?
>> 
 This patch (that works around the initial problem) also has these issues.
 Note that one way to take care of this in a more straight-forward way 
 would be to have each entry
 with some sort of a bitmap (like Vlad has tried earlier) and then we can 
 combine the paths so most
 of these issues disappear, but that will not be easy as was already 
 commented earlier. I’ve looked
 briefly into doing this with rhashtable so we can keep the memory 
 footprint for each entry relatively
 small but it still affects the performance and we can have thousands of 
 resizes happening. 
 
>>> 
>>> So, one of the earlier approaches that I've tried (before rhashtable was
>>> in the kernel) was to have a hash of vlan ids each with a data structure
>>> pointing to a list of ports for a given vlan as well as a list of fdbs for
>>> a given vlan.  As far as scalability goes, that's really the best approach.
>>> It would also allow us to do packet accounting per vlan.  The only concern
>>> at the time was performance of ingress lookup.   I think rhashtables might
>>> help with this as well as ability to grow the footprint of the vlan hash
>>> table dynamically.
>>> 
>>> -vlad
>>> 
>> I’ll look into it but I’m guessing the learning will become a more 
>> complicated process with additional 
>> allocations and some hash handling.
> 
> I don't remember learning being all that complicated.  The hash only changed 
> under
> rtnl when vlans were added/removed.  The nice thing is that we wouldn't need
>

Re: [PATCH net] bonding: fix bond_poll_controller bh_enable warning

2015-08-28 Thread Nikolay Aleksandrov

> On Aug 27, 2015, at 6:54 PM, Nikolay Aleksandrov  wrote:
> 
> From: Nikolay Aleksandrov 
> 
> The problem is rcu_read_unlock_bh() which triggers a warning.
> ndo_poll_controller is supposed to be running with either irqs disabled
> or bh disabled already, so we don't need to take rcu_read_lock_bh.
> Use the standard rcu_read_lock/unlock to make the non-bh rcu_dereference
> happy.
> 

Actually I was wrong here, e.g. netpoll_send_udp(). It is currently only used 
by netconsole
with irqs disabled but that doesn’t have to be true for future users.
I wanted to avoid conditional lock acquiring but we may have to go that way.
I’ll post a v2 in a few hours.

Please drop this patch.

Thanks,
 Nik

> This patch fixes https://bugzilla.kernel.org/show_bug.cgi?id=102181
> 
> [   98.502922] bond0: making interface eth1 the new active one
> [   98.503039] [ cut here ]
> [   98.503039] WARNING: CPU: 0 PID: 1744 at kernel/softirq.c:150 
> __local_bh_enable_ip+0x96/0xc0()
> [   98.503039] Modules linked in: bonding(OE) rpcsec_gss_krb5 nfsv4 
> dns_resolver nfs fscache netconsole ppdev joydev parport_pc serio_raw parport 
> i2c_piix4 video acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc 
> virtio_net e1000 ata_generic pcnet32 mii virtio_pci virtio_ring virtio 
> pata_acpi
> [   98.503039] CPU: 0 PID: 1744 Comm: ifenslave Tainted: G   OE   
> 4.2.0-rc7+ #56
> [   98.503039] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS 
> VirtualBox 12/01/2006
> [   98.503039]   e96ba230 880020c236b8 
> 8183f105
> [   98.503039]    880020c236f8 
> 810a9496
> [   98.503039]  88002ea99e08 0200 a02a8e06 
> 88002ea99e08
> [   98.503039] Call Trace:
> [   98.503039]  [] dump_stack+0x4c/0x65
> [   98.503039]  [] warn_slowpath_common+0x86/0xc0
> [   98.503039]  [] ? bond_poll_controller+0x146/0x250 
> [bonding]
> [   98.503039]  [] warn_slowpath_null+0x1a/0x20
> [   98.503039]  [] __local_bh_enable_ip+0x96/0xc0
> [   98.503039]  [] bond_poll_controller+0x16f/0x250 
> [bonding]
> [   98.503039]  [] ? bond_poll_controller+0x33/0x250 
> [bonding]
> [   98.503039]  [] ? trace_hardirqs_off+0xd/0x10
> [   98.503039]  [] ? _raw_spin_unlock_irqrestore+0x5b/0x60
> [   98.503039]  [] netpoll_poll_dev+0x6e/0x350
> [   98.503039]  [] ? netpoll_start_xmit+0x137/0x1d0
> [   98.503039]  [] ? __alloc_skb+0x5b/0x210
> [   98.503039]  [] netpoll_send_skb_on_dev+0x12d/0x2a0
> [   98.503039]  [] netpoll_send_udp+0x2ce/0x430
> [   98.503039]  [] write_msg+0xb0/0xf0 [netconsole]
> [   98.503039]  [] 
> call_console_drivers.constprop.25+0x133/0x260
> [   98.503039]  [] console_unlock+0x2f4/0x580
> [   98.503039]  [] ? vprintk_emit+0x2e5/0x630
> [   98.503039]  [] vprintk_emit+0x325/0x630
> [   98.503039]  [] vprintk_default+0x29/0x40
> [   98.503039]  [] printk+0x55/0x6b
> [   98.503039]  [] __netdev_printk+0x16c/0x260
> [   98.503039]  [] netdev_info+0x62/0x80
> [   98.503039]  [] bond_change_active_slave+0x134/0x6a0 
> [bonding]
> [   98.503039]  [] bond_select_active_slave+0xc5/0x310 
> [bonding]
> [   98.503039]  [] bond_enslave+0x1088/0x10c0 [bonding]
> [   98.503039]  [] bond_do_ioctl+0x37b/0x400 [bonding]
> [   98.503039]  [] ? trace_hardirqs_on+0xd/0x10
> [   98.503039]  [] ? rtnl_lock+0x17/0x20
> [   98.503039]  [] dev_ifsioc+0x331/0x3e0
> [   98.503039]  [] dev_ioctl+0xec/0x6c0
> [   98.503039]  [] sock_do_ioctl+0x4a/0x60
> [   98.503039]  [] sock_ioctl+0x1c0/0x250
> [   98.503039]  [] do_vfs_ioctl+0x2ee/0x540
> [   98.503039]  [] ? up_read+0x23/0x40
> [   98.503039]  [] ? __do_page_fault+0x1d3/0x420
> [   98.503039]  [] ? __fget_light+0x66/0x90
> [   98.503039]  [] SyS_ioctl+0x79/0x90
> [   98.503039]  [] entry_SYSCALL_64_fastpath+0x12/0x76
> [   98.503039] ---[ end trace 00cfa804b0670051 ]---
> 
> Fixes: 616f45416ca0 ("bonding: implement bond_poll_controller()")
> Signed-off-by: Nikolay Aleksandrov 
> ---
> drivers/net/bonding/bond_main.c | 4 ++--
> 1 file changed, 2 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
> index a98dd4f1b0e3..1b4b24218807 100644
> --- a/drivers/net/bonding/bond_main.c
> +++ b/drivers/net/bonding/bond_main.c
> @@ -979,7 +979,7 @@ static void bond_poll_controller(struct net_device 
> *bond_dev)
>   if (bond_3ad_get_active_agg_info(bond, &ad_info))
>   return;
> 
> - rcu_read_lock_bh();
> + rcu_read_lock();
>   bond_for_each_slave_rcu(bond, slave, iter) {
>   ops = slave->dev->netdev_ops;
>   if (!bond_slave_is_up(slave) || !ops->ndo_poll_controller)
> @@ -1000,7 +1000,7 @@ static void bond_poll_controller(struct net_device 
> *bond_dev)
>   ops->ndo_poll_controller(slave->dev);
>   up(&ni->dev_lock);
>   }
> - rcu_read_unlock_bh();
> + rcu_read_unlock();
> }
> 
> static void bond_netpoll_cleanup(struct net_dev

Re: [PATCH 1/2] ravb: propagate platform_get_irq() error upstream

2015-08-28 Thread Geert Uytterhoeven
On Fri, Aug 28, 2015 at 3:55 PM, Sergei Shtylyov
 wrote:
> The driver overrides the error returned by platform_get_irq() with -ENODEV
> which e.g. precludes the deferred probing from working. Propagate the real
> error code to the driver core instead.
>
> Signed-off-by: Sergei Shtylyov 

Acked-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/2] sh_eth: propagate platform_get_irq() error upstream

2015-08-28 Thread Geert Uytterhoeven
On Fri, Aug 28, 2015 at 3:56 PM, Sergei Shtylyov
 wrote:
> The driver overrides the error returned by platform_get_irq() with -ENODEV
> which e.g. precludes the deferred probing from working. Propagate the real
> error code to the driver core instead.
>
> Signed-off-by: Sergei Shtylyov 

Acked-by: Geert Uytterhoeven 

Gr{oetje,eeting}s,

Geert

--
Geert Uytterhoeven -- There's lots of Linux beyond ia32 -- ge...@linux-m68k.org

In personal conversations with technical people, I call myself a hacker. But
when I'm talking to journalists I just say "programmer" or something like that.
-- Linus Torvalds
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next v3] net: FIB tracepoints

2015-08-28 Thread David Ahern
A few useful tracepoints for developing the VRF driver.

Signed-off-by: David Ahern 
---
I realize the sensitivity around adding tracepoints, but these have been
invaluable in developing the VRF device driver along with a return probe:
  perf probe -a 'fib_table_lookup_ret=fib_table_lookup%return ret=%ax'

v3
- removed memcpy for addresses per Dave's comment

v2
- added tos, scope, flags per Dave's comment

 include/trace/events/fib.h | 111 +
 net/core/net-traces.c  |   1 +
 net/ipv4/fib_frontend.c|   3 ++
 net/ipv4/fib_trie.c|   5 ++
 4 files changed, 120 insertions(+)
 create mode 100644 include/trace/events/fib.h

diff --git a/include/trace/events/fib.h b/include/trace/events/fib.h
new file mode 100644
index ..4030f75410d7
--- /dev/null
+++ b/include/trace/events/fib.h
@@ -0,0 +1,111 @@
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM fib
+
+#if !defined(_TRACE_FIB_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _TRACE_FIB_H
+
+#include 
+#include 
+#include 
+#include 
+
+TRACE_EVENT(fib_table_lookup,
+
+   TP_PROTO(int tb_id, const struct flowi4 *flp),
+
+   TP_ARGS(tb_id, flp),
+
+   TP_STRUCT__entry(
+   __field(int,tb_id   )
+   __field(int,oif )
+   __field(int,iif )
+   __field(__u8,   tos )
+   __field(__u8,   scope   )
+   __field(__u8,   flags   )
+   __array(__u8,   src,4   )
+   __array(__u8,   dst,4   )
+   ),
+
+   TP_fast_assign(
+   __be32 *p32;
+
+   __entry->tb_id = tb_id;
+   __entry->oif = flp->flowi4_oif;
+   __entry->iif = flp->flowi4_iif;
+   __entry->tos = flp->flowi4_tos;
+   __entry->scope = flp->flowi4_scope;
+   __entry->flags = flp->flowi4_flags;
+
+   p32 = (__be32 *) __entry->src;
+   *p32 = flp->saddr;
+
+   p32 = (__be32 *) __entry->dst;
+   *p32 = flp->daddr;
+   ),
+
+   TP_printk("table %d oif %d iif %d src %pI4 dst %pI4 tos %d scope %d 
flags %x",
+ __entry->tb_id, __entry->oif, __entry->iif,
+ __entry->src, __entry->dst, __entry->tos, __entry->scope,
+ __entry->flags)
+);
+
+TRACE_EVENT(fib_table_lookup_nh,
+
+   TP_PROTO(const struct fib_nh *nh),
+
+   TP_ARGS(nh),
+
+   TP_STRUCT__entry(
+   __string(   name,   nh->nh_dev->name)
+   __field(int,oif )
+   __array(__u8,   src,4   )
+   ),
+
+   TP_fast_assign(
+   __be32 *p32 = (__be32 *) __entry->src;
+
+   __assign_str(name, nh->nh_dev ? nh->nh_dev->name : "not set");
+   __entry->oif = nh->nh_oif;
+   *p32 = nh->nh_saddr;
+   ),
+
+   TP_printk("nexthop dev %s oif %d src %pI4",
+ __get_str(name), __entry->oif, __entry->src)
+);
+
+TRACE_EVENT(fib_validate_source,
+
+   TP_PROTO(const struct net_device *dev, const struct flowi4 *flp),
+
+   TP_ARGS(dev, flp),
+
+   TP_STRUCT__entry(
+   __string(   name,   dev->name   )
+   __field(int,oif )
+   __field(int,iif )
+   __array(__u8,   src,4   )
+   __array(__u8,   dst,4   )
+   ),
+
+   TP_fast_assign(
+   __be32 *p32;
+
+   __assign_str(name, dev ? dev->name : "not set");
+   __entry->oif = flp->flowi4_oif;
+   __entry->iif = flp->flowi4_iif;
+
+   p32 = (__be32 *) __entry->src;
+   *p32 = flp->saddr;
+
+   p32 = (__be32 *) __entry->dst;
+   *p32 = flp->daddr;
+   ),
+
+   TP_printk("dev %s oif %d iif %d src %pI4 dst %pI4",
+ __get_str(name), __entry->oif, __entry->iif,
+ __entry->src, __entry->dst)
+);
+#endif /* _TRACE_FIB_H */
+
+/* This part must be outside protection */
+#include <trace/define_trace.h>
diff --git a/net/core/net-traces.c b/net/core/net-traces.c
index ba3c0120786c..adef015b2f41 100644
--- a/net/core/net-traces.c
+++ b/net/core/net-traces.c
@@ -31,6 +31,7 @@
 #include 
 #include 
 #include 
+#include <trace/events/fib.h>
 
 EXPORT_TRACEPOINT_SYMBOL_GPL(kfree_skb);
 
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 7fa277176c33..4036c94dfbe1 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -46,6 +46,7 @@
 #include 
 #include 
 #include 
+#include <trace/events/fib.h>
 
 #ifndef CONFIG_IP_MULTIPLE_TABLES
 
@@ -344,6 +345,8 @@ static int __fib_validate_source(struct sk_buff *skb, 
__be32 src, __be32 dst,
 
fl4.flowi4_mark = IN_DEV_SRC_VMARK(idev) ? skb->mark : 0;
 
+   trace_fib_validate_so

Re: [PATCH net-next 1/4] ip_tunnels: convert the mode field of ip_tunnel_info to flags

2015-08-28 Thread Alexei Starovoitov
On Fri, Aug 28, 2015 at 04:27:25PM +0200, Jiri Benc wrote:
> The mode field holds a single bit of information only (whether the
> ip_tunnel_info struct is for rx or tx). Change the mode field to bit flags.
> This allows more mode flags to be added.
> 
> Signed-off-by: Jiri Benc 

Acked-by: Alexei Starovoitov 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 2/4] ip_tunnels: record IP version in tunnel info

2015-08-28 Thread Alexei Starovoitov
On Fri, Aug 28, 2015 at 04:27:26PM +0200, Jiri Benc wrote:
> There's currently nothing preventing directing packets with IPv6
> encapsulation data to IPv4 tunnels (and vice versa). If this happens,
> IPv6 addresses are incorrectly interpreted as IPv4 ones.
> 
> Track whether the given ip_tunnel_key contains IPv4 or IPv6 data. Store this
> in ip_tunnel_info. Reject packets at appropriate places if they are supposed
> to be encapsulated into an incompatible protocol.
> 
> Signed-off-by: Jiri Benc 

Acked-by: Alexei Starovoitov 
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next] net: sched: don't break line in tc_classify loop notification

2015-08-28 Thread Daniel Borkmann
Just some minor noise follow-up to address some stylistic issues of
commit 3b3ae880266d ("net: sched: consolidate tc_classify{,_compat}").
Accidentally v1 instead of v2 of that commit got applied, so this
patch adds the relative diff.

Suggested-by: Alexei Starovoitov 
Signed-off-by: Daniel Borkmann 
---
 net/sched/sch_api.c | 9 -
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index a3c70a1..f43c8f3 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -1825,8 +1825,7 @@ reclassify:
 
err = tp->classify(skb, tp, res);
 #ifdef CONFIG_NET_CLS_ACT
-   if (unlikely(err == TC_ACT_RECLASSIFY &&
-!compat_mode))
+   if (unlikely(err == TC_ACT_RECLASSIFY && !compat_mode))
goto reset;
 #endif
if (err >= 0)
@@ -1837,9 +1836,9 @@ reclassify:
 #ifdef CONFIG_NET_CLS_ACT
 reset:
if (unlikely(limit++ >= MAX_REC_LOOP)) {
-   net_notice_ratelimited("%s: reclassify loop, rule prio %u, "
-  "protocol %02x\n", tp->q->ops->id,
-  tp->prio & 0xffff, ntohs(tp->protocol));
+   net_notice_ratelimited("%s: reclassify loop, rule prio %u, protocol %02x\n",
+  tp->q->ops->id, tp->prio & 0xffff,
+  ntohs(tp->protocol));
return TC_ACT_SHOT;
}
 
-- 
1.9.3

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH net-next 0/2] Add new switchdev device class

2015-08-28 Thread Florian Fainelli
On 27/08/15 19:13, Arad, Ronen wrote:
> 
> 
>> -Original Message-
>> From: netdev-ow...@vger.kernel.org [mailto:netdev-ow...@vger.kernel.org] On
>> Behalf Of sfel...@gmail.com
>> Sent: Thursday, August 27, 2015 12:17 AM
>> To: netdev@vger.kernel.org
>> Cc: j...@resnulli.us; da...@davemloft.net; f.faine...@gmail.com;
>> ro...@cumulusnetworks.com
>> Subject: [RFC PATCH net-next 0/2] Add new switchdev device class
>>
>> From: Scott Feldman 
>>
>>
>> So what next?  I'd rather not build APIs around sysfs, so we need a netlink
>> API
>> we can build on top of this.  It's not really rtnl.  Maybe genl would work?
>> Whatever it is, we'd need to teach iproute2 about a new 'switch' command.
>>
> [@Ronen] I developed PoC code based on genl which allows access for what I
> call device options. It generalizes libteam/team driver option handling.
> It allows for fields of all Netlink scalar or string types as well as arrays.
> It differentiates between port-specific and device options.

OpenWrt has had a similar scheme, called swconfig:

Kernel code:

https://dev.openwrt.org/browser/trunk/target/linux/generic/files/drivers/net/phy/swconfig.c

User-space tools:

https://dev.openwrt.org/browser/trunk/package/network/config/swconfig/src

This was initially proposed but was rejected in favor of what became
switchdev because it did not focus exclusively on the switch device (as
a whole piece of hardware), but rather allowed configuration of things
that were already available through bridge/vlan etc..

Having a netlink interface to interface with a global (as in, not
per-port) switch device sounds like a good idea, especially if there are
events that need to be sent back to user-space, my only major concern
with this approach is to make sure there is careful review of what goes
into this interface such that:

- it is strongly defined, not just allow sending custom
u8/u16/u32/u64/blobs attributes back and forth, but rather have a
properly defined set of commands and associated data-structures

- this always covers something that is not, by nature, a switch
port/physical interface attribute, and for which there is not an
existing interface

Bottom line being that it is very easy for this interface to be a
catch-all, dumping ground of things that did not fit within existing
facilities...

> (It was not limited to read-only but this could be changed to address the
>  concerns raised on this thread)
> Extending to Tables from just a list of named options is welcomed.
> 
> The diagram below shows possible architecture.
> 
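> +---------------------------------+
> |  tool (e.g. swdevnl, iproute2)  |
> +---------------------------------+
>                  |
> +---------------------------------+
> |            libswdev             |
> +---------------------------------+
>                  |
>           +-------------+
>           |   libnl3    |
>           +-------------+
>                  |
> User             |
> -----------------------------------
> Kernel           |
>                  |
>     +-----------+ +-----------+
>     | genetlink | | rtnetlink |
>     +-----------+ +-----------+
>                  |
>            +-----------+
>            |   swdev   |
>            +-----------+
>                  |
>           +-------------+
>           |             |
>           |  SOMEswitch |
>           |             |
>           +-------------+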
> 
> Libswdev in the diagram is a user space library which should abstract the
> netlink interaction and encoding details from user-space tools.
> 
> Swdev is a kernel module which provides similar abstraction to drivers. It 
> saves drivers from most of the low level code.
> 
> Drivers register their supported options (or Table/Fields) with this module
> and provide getter functions. The Swdev kernel module provides the genl API
> for exporting device specific information.
> 
> This architecture allows for a generic tool to discover the information
> available from each driver/port. The tool could extract sufficient
> information which allows it to present user-friendly interface to users for
> drilling down and retrieving specific details.  
> 
>> Netlink API would allow us to represent switch-wide objects such as 
>> registers,
>> tables, stats, firmware, and maybe even control.  I think with netlink
>> TLVs, we can create a framework for these objects but still allow the switch
>> driver provide switch-specific info.  For example, a table object:
>>
>> [TABLES]
>>  [TABLE]
>>  [FIELDS]
>>  [FIELD]
>>  [ID, TYPE]
>>  [DATA]
>>  [ID, VALUE]
>>
> [@Ronen] Some additional information could be useful. TABLE name, FIELD name,
> (possible also short names for CLI commands or pretty printing of table
> header), FIELD value ra

Re: [PATCH net-next] net: sched: don't break line in tc_classify loop notification

2015-08-28 Thread Alexei Starovoitov

On 8/28/15 9:46 AM, Daniel Borkmann wrote:

Just some minor noise follow-up to address some stylistic issues of
commit 3b3ae880266d ("net: sched: consolidate tc_classify{,_compat}").
Accidentally v1 instead of v2 of that commit got applied, so this
patch adds the relative diff.

Suggested-by: Alexei Starovoitov
Signed-off-by: Daniel Borkmann


Acked-by: Alexei Starovoitov 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 1/4] ip_tunnels: convert the mode field of ip_tunnel_info to flags

2015-08-28 Thread Pravin Shelar
On Fri, Aug 28, 2015 at 7:27 AM, Jiri Benc  wrote:
> The mode field holds a single bit of information only (whether the
> ip_tunnel_info struct is for rx or tx). Change the mode field to bit flags.
> This allows more mode flags to be added.
>
> Signed-off-by: Jiri Benc 
> ---
>  drivers/net/vxlan.c| 2 +-
>  include/net/dst_metadata.h | 1 -
>  include/net/ip_tunnels.h   | 9 ++---
>  net/ipv4/ip_gre.c  | 2 +-
>  net/ipv4/route.c   | 2 +-
>  net/ipv6/route.c   | 2 +-

geneve module also needs to be updated.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net v2] bonding: fix bond_poll_controller bh_enable warning

2015-08-28 Thread Nikolay Aleksandrov
From: Nikolay Aleksandrov 

The problem is rcu_read_unlock_bh() which triggers a warning when irqs are
disabled.
ndo_poll_controller can run with bh enabled, disabled or irqs disabled
so check if that is the case and acquire rcu_read_lock_bh only when not
running with disabled irqs. The only potential problem is with
netpoll_send_udp() currently because it can call find_skb() which may
invoke ndo_poll_controller.
We're okay w.r.t. rcu_bh when irqs are disabled so no need to acquire it.
Use the standard rcu_read_lock/unlock to make the non-bh rcu_dereference
happy.
To clarify, currently the only user of netpoll_send_udp() is netconsole, which
calls it with irqs disabled, so we're fine.

[   98.502922] bond0: making interface eth1 the new active one
[   98.503039] ------------[ cut here ]------------
[   98.503039] WARNING: CPU: 0 PID: 1744 at kernel/softirq.c:150 
__local_bh_enable_ip+0x96/0xc0()
[   98.503039] Modules linked in: bonding(OE) rpcsec_gss_krb5 nfsv4 
dns_resolver nfs fscache netconsole ppdev joydev parport_pc serio_raw parport 
i2c_piix4 video acpi_cpufreq nfsd auth_rpcgss nfs_acl lockd grace sunrpc 
virtio_net e1000 ata_generic pcnet32 mii virtio_pci virtio_ring virtio pata_acpi
[   98.503039] CPU: 0 PID: 1744 Comm: ifenslave Tainted: G   OE   
4.2.0-rc7+ #56
[   98.503039] Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS 
VirtualBox 12/01/2006
[   98.503039]   e96ba230 880020c236b8 
8183f105
[   98.503039]    880020c236f8 
810a9496
[   98.503039]  88002ea99e08 0200 a02a8e06 
88002ea99e08
[   98.503039] Call Trace:
[   98.503039]  [] dump_stack+0x4c/0x65
[   98.503039]  [] warn_slowpath_common+0x86/0xc0
[   98.503039]  [] ? bond_poll_controller+0x146/0x250 
[bonding]
[   98.503039]  [] warn_slowpath_null+0x1a/0x20
[   98.503039]  [] __local_bh_enable_ip+0x96/0xc0
[   98.503039]  [] bond_poll_controller+0x16f/0x250 [bonding]
[   98.503039]  [] ? bond_poll_controller+0x33/0x250 [bonding]
[   98.503039]  [] ? trace_hardirqs_off+0xd/0x10
[   98.503039]  [] ? _raw_spin_unlock_irqrestore+0x5b/0x60
[   98.503039]  [] netpoll_poll_dev+0x6e/0x350
[   98.503039]  [] ? netpoll_start_xmit+0x137/0x1d0
[   98.503039]  [] ? __alloc_skb+0x5b/0x210
[   98.503039]  [] netpoll_send_skb_on_dev+0x12d/0x2a0
[   98.503039]  [] netpoll_send_udp+0x2ce/0x430
[   98.503039]  [] write_msg+0xb0/0xf0 [netconsole]
[   98.503039]  [] 
call_console_drivers.constprop.25+0x133/0x260
[   98.503039]  [] console_unlock+0x2f4/0x580
[   98.503039]  [] ? vprintk_emit+0x2e5/0x630
[   98.503039]  [] vprintk_emit+0x325/0x630
[   98.503039]  [] vprintk_default+0x29/0x40
[   98.503039]  [] printk+0x55/0x6b
[   98.503039]  [] __netdev_printk+0x16c/0x260
[   98.503039]  [] netdev_info+0x62/0x80
[   98.503039]  [] bond_change_active_slave+0x134/0x6a0 
[bonding]
[   98.503039]  [] bond_select_active_slave+0xc5/0x310 
[bonding]
[   98.503039]  [] bond_enslave+0x1088/0x10c0 [bonding]
[   98.503039]  [] bond_do_ioctl+0x37b/0x400 [bonding]
[   98.503039]  [] ? trace_hardirqs_on+0xd/0x10
[   98.503039]  [] ? rtnl_lock+0x17/0x20
[   98.503039]  [] dev_ifsioc+0x331/0x3e0
[   98.503039]  [] dev_ioctl+0xec/0x6c0
[   98.503039]  [] sock_do_ioctl+0x4a/0x60
[   98.503039]  [] sock_ioctl+0x1c0/0x250
[   98.503039]  [] do_vfs_ioctl+0x2ee/0x540
[   98.503039]  [] ? up_read+0x23/0x40
[   98.503039]  [] ? __do_page_fault+0x1d3/0x420
[   98.503039]  [] ? __fget_light+0x66/0x90
[   98.503039]  [] SyS_ioctl+0x79/0x90
[   98.503039]  [] entry_SYSCALL_64_fastpath+0x12/0x76
[   98.503039] ---[ end trace 00cfa804b0670051 ]---

Fixes: 616f45416ca0 ("bonding: implement bond_poll_controller()")
Signed-off-by: Nikolay Aleksandrov 
---
v2: make sure we're either running with irqs disabled or have rcu_bh
Making it this way to protect against future potential users of
netpoll_send_udp() which may not disable interrupts, if we agree that
it can't be called without disabling interrupts then I can resubmit this
patch without the conditional rcu_bh and possibly add a warn to catch any
future offenders that use it without disabling interrupts.

 drivers/net/bonding/bond_main.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index a98dd4f1b0e3..3197a2180978 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -974,12 +974,17 @@ static void bond_poll_controller(struct net_device 
*bond_dev)
struct ad_info ad_info;
struct netpoll_info *ni;
const struct net_device_ops *ops;
+   bool rcubh_taken = false;
 
if (BOND_MODE(bond) == BOND_MODE_8023AD)
if (bond_3ad_get_active_agg_info(bond, &ad_info))
return;
 
-   rcu_read_lock_bh();
+   if (!in_irq() && !irqs_disabled()) {
+   rcu_read_lock_bh();
+   rcubh_taken = true;
+   }
+  

Re: [PATCH net-next 2/4] ip_tunnels: record IP version in tunnel info

2015-08-28 Thread Pravin Shelar
On Fri, Aug 28, 2015 at 7:27 AM, Jiri Benc  wrote:
> There's currently nothing preventing directing packets with IPv6
> encapsulation data to IPv4 tunnels (and vice versa). If this happens,
> IPv6 addresses are incorrectly interpreted as IPv4 ones.
>
> Track whether the given ip_tunnel_key contains IPv4 or IPv6 data. Store this
> in ip_tunnel_info. Reject packets at appropriate places if they are supposed
> to be encapsulated into an incompatible protocol.
>
> Signed-off-by: Jiri Benc 
> ---
>  drivers/net/geneve.c   |  3 +++
>  drivers/net/vxlan.c|  2 ++
>  include/net/dst_metadata.h |  1 +
>  include/net/ip_tunnels.h   | 10 ++
>  net/core/filter.c  |  2 ++
>  net/ipv4/ip_gre.c  |  3 ++-
>  net/ipv4/ip_tunnel_core.c  |  2 +-
>  net/openvswitch/flow.c |  2 ++
>  net/openvswitch/vport.c|  2 ++
>  9 files changed, 25 insertions(+), 2 deletions(-)
>
> diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
> index 4357bae732d7..d4882d7904f6 100644
> --- a/drivers/net/geneve.c
> +++ b/drivers/net/geneve.c
> @@ -644,6 +644,9 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, 
> struct net_device *dev)
> u8 *opts = NULL;
> u8 vni[3];
>
> +   if (ip_tunnel_info_af(info) != AF_INET)
> +   goto err;
> +
geneve_get_rt() already interpreted the info as ipv4 tunnel info.

We can avoid such bugs by introducing separate API to retrieve ipv4
and ipv6 tunnel info. Something like
skb_tunnel_info_v4()/skb_tunnel_info_v6() for ipv4 and ipv6.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net] bonding: fix bond dev flags after convert to arphrd_ether

2015-08-28 Thread Nikolay Aleksandrov

> On Aug 27, 2015, at 8:39 PM, Jay Vosburgh  wrote:
> 
> Nikolay Aleksandrov  wrote:
> [...]
>> Restarting this thread because there’s actually a bug here, what you 
>> described with
>> the bonding destruction is true when the slaves are all destroyed but it 
>> isn’t true if they’re
>> just released, if you take a look at bond_slave_netdev_event() the bond 
>> destruction happens
>> only on NETDEV_UNREGISTER and I just hit this bug by enslaving a 
>> non-ARPHRD_ETHER
>> device, releasing it and enslaving a ARPHRD_ETHER device so ether_setup() 
>> path in bond_enslave
>> is hit and IFF_MASTER gets dropped:
>> 17: bond0:  mtu 1500 qdisc noqueue 
>> state UP mode DEFAULT group default qlen 1000
>>   link/fddi 9a:33:c5:30:ff:a6 brd ff:ff:ff:ff:ff:ff
>> (release non-ARPHRD_ETHER slave)
>> (enslave ARPHRD_ETHER device)
>> 17: bond0:  mtu 1500 qdisc noqueue state UP 
>> mode DEFAULT group default qlen 1000
>>   link/ether 08:00:27:3c:13:57 brd ff:ff:ff:ff:ff:ff
>> 
>> Notice the master flag is gone and of course on unload we get:
>> [57981.545547] ------------[ cut here ]------------
>> [57981.545567] WARNING: CPU: 0 PID: 13792 at fs/proc/generic.c:575 
>> remove_proc_entry+0x17e/0x190()
>> [57981.545572] remove_proc_entry: removing non-empty directory 
>> 'net/bonding', leaking at least 'bond0'
> [...]
>> We need to convert it back to ARPHRD_ETHER if releasing the last slave, 
>> because
>> we can’t destroy it (in some paths bond->dev is used after bond_release()).
>> Basically we should make the case that if the bonding doesn’t have any 
>> slaves then it’s
>> always an ARPHRD_ETHER device.
>> 
>> Thoughts ?
> 
>   I agree that it would be cleaner for bond_dev->type to switch
> back on release of last slave.  The options code (caller of
> bond_option_slaves_set) and bond_uninit() both reference the bond or dev
> after calling bond_release(), and would need changing if any release
> could destroy the bond itself.
> 
>   However, for the type change, there's the potentially tricky
> case of a nested non-ARPHRD_ETHER bond, e.g., bond0 -> bond1 -> ib0.
> This isn't a typical use case that I'm aware of, but I believe it's
> supported by the code.
> 
>   If ib0, the last slave, is released, bond1 will want to change
> to ARPHRD_ETHER, but bond0 is ARPHRD_INFINIBAND.  I suspect bonding will
> have to notice the NETDEV_PRE_TYPE_CHANGE and _POST_ notifiers and take
> appropriate action (i.e., cascade the type change upwards).
> 
>   There might be similar issues with other devices stacked on top
> of the IB -> Ether type-changing bond; I'm not sure how many of those
> there may be, though, since many things won't stack over IB devices (or
> an IB-flavor bond).
> 
Ugh right, this would be a problem. I’ll see if it can be handled well.

>   If the type change works, then I don't think we would still need
> the "release and destroy" logic.
> 
Right, that was my intention.

>   -J
> 
> ---
>   -Jay Vosburgh, jay.vosbu...@canonical.com

I’ll look into this some more and if it works out I’ll post the patch.

Thanks,
 Nik
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH iproute2 v3] add support for brief output for link and addresses

2015-08-28 Thread Andy Gospodarek
This adds support for slightly less output than is normally provided by
'ip link show' and 'ip addr show'.  This is a bit better when you have a
host with lots of interfaces.  Sample output:

$ ip -br link show
lo   UNKNOWN00:00:00:00:00:00 
p2p1 UP 08:00:27:ee:0b:3b 

p7p1 UP 08:00:27:9d:62:9f 

p8p1 DOWN   08:00:27:dc:d8:ca 

p9p1 UP 08:00:27:76:d9:75 

p7p1.100@p7p1UP 08:00:27:9d:62:9f 


$ ip -br -4 addr show
lo   UNKNOWN127.0.0.1/8
p2p1 UP 192.168.56.2/24
p7p1 UP 70.0.0.1/24
p8p1 DOWN   80.0.0.1/24
p9p1 UP 10.0.5.15/24
p7p1.100@p7p1UP 200.0.0.1/24

$ ip -br -6 addr show
lo   UNKNOWN::1/128
p2p1 UP fe80::a00:27ff:feee:b3b/64
p7p1 UP 7000::1/8 fe80::a00:27ff:fe9d:629f/64
p8p1 DOWN   8000::1/8
p9p1 UP fe80::a00:27ff:fe76:d975/64
p7p1.100@p7p1UP fe80::a00:27ff:fe9d:629f/64

$ ip -br addr show p7p1
p7p1 UP 70.0.0.1/24 7000::1/8 
fe80::a00:27ff:fe9d:629f/64

v2: Now with color support!
v3: Better field width estimation (except netdev names to keep output at a
decent width) and whitespace fixup.

Signed-off-by: Andy Gospodarek 
---
 include/utils.h   |   1 +
 ip/ip.c   |   5 +-
 ip/ip_common.h|   3 +
 ip/ipaddress.c| 150 +++---
 ip/iplink.c   |   5 +-
 man/man8/ip-link.8.in |   3 +-
 6 files changed, 143 insertions(+), 24 deletions(-)

diff --git a/include/utils.h b/include/utils.h
index 0c57ccd..f77edeb 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -19,6 +19,7 @@ extern int show_details;
 extern int show_raw;
 extern int resolve_hosts;
 extern int oneline;
+extern int brief;
 extern int timestamp;
 extern int timestamp_short;
 extern const char * _SL_;
diff --git a/ip/ip.c b/ip/ip.c
index e75447e..eea00b8 100644
--- a/ip/ip.c
+++ b/ip/ip.c
@@ -32,6 +32,7 @@ int show_stats;
 int show_details;
 int resolve_hosts;
 int oneline;
+int brief;
 int timestamp;
 const char *_SL_;
 int force;
@@ -55,7 +56,7 @@ static void usage(void)
 "-h[uman-readable] | -iec |\n"
 "-f[amily] { inet | inet6 | ipx | dnet | mpls | bridge | 
link } |\n"
 "-4 | -6 | -I | -D | -B | -0 |\n"
-"-l[oops] { maximum-addr-flush-attempts } |\n"
+"-l[oops] { maximum-addr-flush-attempts } | -br[ief] |\n"
 "-o[neline] | -t[imestamp] | -ts[hort] | -b[atch] 
[filename] |\n"
 "-rc[vbuf] [size] | -n[etns] name | -a[ll] | -c[olor]}\n");
exit(-1);
@@ -250,6 +251,8 @@ int main(int argc, char **argv)
if (argc <= 1)
usage();
batch_file = argv[1];
+   } else if (matches(opt, "-brief") == 0) {
+   ++brief;
} else if (matches(opt, "-rcvbuf") == 0) {
unsigned int size;
 
diff --git a/ip/ip_common.h b/ip/ip_common.h
index f120f5b..f74face 100644
--- a/ip/ip_common.h
+++ b/ip/ip_common.h
@@ -2,6 +2,9 @@ extern int get_operstate(const char *name);
 extern int print_linkinfo(const struct sockaddr_nl *who,
  struct nlmsghdr *n,
  void *arg);
+extern int print_linkinfo_brief(const struct sockaddr_nl *who,
+   struct nlmsghdr *n,
+   void *arg);
 extern int print_addrinfo(const struct sockaddr_nl *who,
  struct nlmsghdr *n,
  void *arg);
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 13d9c46..2aa5fbf 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -138,13 +138,22 @@ static void print_operstate(FILE *f, __u8 state)
if (state >= sizeof(oper_states)/sizeof(oper_states[0]))
fprintf(f, "state %#x ", state);
else {
-   fprintf(f, "state ");
-   if (strcmp(oper_states[state], "UP") == 0)
-   color_fprintf(f, COLOR_OPERSTATE_UP, "%s ", 
oper_states[state]);
-   else if (strcmp(oper_states[state], "DOWN") == 0)
-   color_fprintf(f, COLOR_OPERSTATE_DOWN, "%s ", 
oper_states[state]);
-   else
-   fprintf(f, "%s ", oper_states[state]);
+   if (brief) {
+   if (strcmp(oper_states[state], "UP") == 0)
+   color_fprintf(f, COLOR_OPERSTATE_UP, "%-14s ", 
oper_states[state]);
+   else if (strcmp(oper_states[state], "DOWN") == 0)
+   color_fprintf(f, COLOR_OPERSTATE_DOWN, "%-14s 
", oper_states[state])

Re: [PATCH net-next 1/4] ip_tunnels: convert the mode field of ip_tunnel_info to flags

2015-08-28 Thread Jiri Benc
On Fri, 28 Aug 2015 10:29:55 -0700, Pravin Shelar wrote:
> geneve module also needs to be updated.

Right, the rebase on top of your patches is not complete. I'll send v2.

Thanks,

 Jiri

-- 
Jiri Benc
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH/RFC] openvswitch: Retain parsed IPv6 header fields in flow on error skipping extension headers

2015-08-28 Thread Pravin Shelar
On Thu, Aug 27, 2015 at 12:10 AM, Simon Horman
 wrote:
> Hi Pravin,
>
> On Mon, Aug 17, 2015 at 11:33:59AM -0700, Pravin Shelar wrote:
>> On Thu, Aug 13, 2015 at 6:30 PM, Simon Horman
>>  wrote:
>> > When an error occurs skipping IPv6 extension headers retain the already
>> > parsed IP protocol and IPv6 addresses in the flow. Also assume that the
>> > packet is not a fragment in the absence of information to the contrary;

...
> -- >8 --
> Subject: [PATCH v1.1] openvswitch: Retain parsed IPv6 header fields in flow 
> on error skipping extension headers
>
> When an error occurs skipping IPv6 extension headers retain the already
> parsed IP protocol and IPv6 addresses in the flow. Also assume that the
> packet is not a fragment in the absence of information to the contrary;
> that is always use the frag_off value set by ipv6_skip_exthdr().
>
> This allows matching on the IP protocol and IPv6 addresses of packets
> with malformed extension headers.
>
> Signed-off-by: Simon Horman 
>
> ---
>
> * Some consideration should be given to unwanted side effects of this patch
>   as it affects the handling of malformed packets.
>
> Signed-off-by: Simon Horman 
> ---
>  net/openvswitch/flow.c | 21 +++--
>  1 file changed, 15 insertions(+), 6 deletions(-)
>
> diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
> index 8db22ef73626..de4366f81b11 100644
> --- a/net/openvswitch/flow.c
> +++ b/net/openvswitch/flow.c
> @@ -271,8 +271,6 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct 
> sw_flow_key *key)
> key->ipv6.addr.dst = nh->daddr;
>
> payload_ofs = ipv6_skip_exthdr(skb, payload_ofs, &nexthdr, &frag_off);
> -   if (unlikely(payload_ofs < 0))
> -   return -EINVAL;
>
> if (frag_off) {
> if (frag_off & htons(~0x7))
> @@ -283,6 +281,13 @@ static int parse_ipv6hdr(struct sk_buff *skb, struct 
> sw_flow_key *key)
> key->ip.frag = OVS_FRAG_TYPE_NONE;
> }
>
> +   /* Delayed handling of error in ipv6_skip_exthdr() as it
> +* always sets frag_off to a valid value which may be
> +* used to set key->ip.frag above.
> +*/
> +   if (unlikely(payload_ofs < 0))
> +   return -EPROTO;
> +
> nh_len = payload_ofs - nh_ofs;
> skb_set_transport_header(skb, nh_ofs + nh_len);
> key->ip.proto = nexthdr;
> @@ -622,12 +627,16 @@ static int key_extract(struct sk_buff *skb, struct 
> sw_flow_key *key)
>
> nh_len = parse_ipv6hdr(skb, key);
> if (unlikely(nh_len < 0)) {
> -   memset(&key->ip, 0, sizeof(key->ip));
> -   memset(&key->ipv6.addr, 0, sizeof(key->ipv6.addr));
> -   if (nh_len == -EINVAL) {
> +   switch (nh_len) {
> +   case -EINVAL:
> +   memset(&key->ip, 0, sizeof(key->ip));
> +   memset(&key->ipv6.addr, 0, 
> sizeof(key->ipv6.addr));
> +   /* fall-through */
> +   case -EPROTO:
> skb->transport_header = skb->network_header;
> error = 0;
> -   } else {
> +   break;
> +   default:
> error = nh_len;
> }
> return error;
Looks good to me.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 2/4] ip_tunnels: record IP version in tunnel info

2015-08-28 Thread Jiri Benc
On Fri, 28 Aug 2015 10:32:15 -0700, Pravin Shelar wrote:
> > --- a/drivers/net/geneve.c
> > +++ b/drivers/net/geneve.c
> > @@ -644,6 +644,9 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, 
> > struct net_device *dev)
> > u8 *opts = NULL;
> > u8 vni[3];
> >
> > +   if (ip_tunnel_info_af(info) != AF_INET)
> > +   goto err;
> > +
> geneve_get_rt() already interpreted the info as ipv4 tunnel info.

Hmm, okay. I'll move the check. The geneve module changed more than
I thought.

Thanks for noticing this.

> We can avoid such bugs by introducing separate API to retrieve ipv4
> and ipv6 tunnel info. Something like
> skb_tunnel_info_v4()/skb_tunnel_info_v6() for ipv4 and ipv6.

I don't think we want that. Ideally, the xmit function should work with
both and use the protocol information to choose the correct output
path. I intend to try this with the metadata based vxlan which would
use the correct socket (IPv4 or IPv6) appropriately. That way, we won't
need a separate vxlan interface for IPv4 and IPv6 traffic. Will be much
more user friendly and most likely easier to use from ovs, too.

 Jiri

-- 
Jiri Benc
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH RFC V2 2/2] net: Optimize snmp stat aggregation by walking all the percpu data at once

2015-08-28 Thread David Miller
From: Raghavendra K T 
Date: Fri, 28 Aug 2015 12:09:52 +0530

> On 08/28/2015 12:08 AM, David Miller wrote:
>> From: Raghavendra K T 
>> Date: Wed, 26 Aug 2015 23:07:33 +0530
>>
>>> @@ -4641,10 +4647,12 @@ static inline void __snmp6_fill_stats64(u64
>>> *stats, void __percpu *mib,
>>>   static void snmp6_fill_stats(u64 *stats, struct inet6_dev *idev, int
>>>   attrtype,
>>>  int bytes)
>>>   {
>>> +   u64 buff[IPSTATS_MIB_MAX] = {0,};
>>> +
 ...
> hope you wanted to know the overhead than to change the current
> patch. please let me know..

I want you to change that variable initializer to an explicit memset().

The compiler is emitting a memset() or similar _anyways_.

Not because it will have any impact at all upon performance, but because
of how it looks to people trying to read and understand the code.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v5 00/11] ipv6: Only create RTF_CACHE route after encountering pmtu exception

2015-08-28 Thread David Miller
From: Martin KaFai Lau 
Date: Fri, 28 Aug 2015 00:36:38 -0700

> On Mon, Aug 17, 2015 at 11:43:20AM +0200, Alexander Holler wrote:
>> That's why I vote to check out if it's possible/reasonable to backport this
>> series to the stable kernels.
> I have backported to 4.0.y without major issue, so possible.
> 
> I did try on 3.1x and gave up.
> 
> It is a lot of changes,  so I don't think it is a good idea for -stable.

I am absolutely, firmly, against any of this work going into -stable.

It is completely inappropriate, the potential for regressions is
enormous.
--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] Some phylib simplifications

2015-08-28 Thread Sergei Shtylyov
Hello.

   Here's 2 patches against DaveM's 'net-next.git' repo. We simplify a bogus
string of type casts in the 1st patch and make the code respect some coding
standards of the networking code in the 2nd one. I may follow with fixing of
checkpatch.pl's complaints, if I have time...

[1/2] phylib: simplify bogus phy_device_create() result
[2/2] phylib: simplify NULL checks

MBR, Sergei

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 1/2] phylib: simplify bogus phy_device_create() result

2015-08-28 Thread Sergei Shtylyov
Get rid of the bogus string of type casts where ERR_PTR() is enough.

Signed-off-by: Sergei Shtylyov 

---
 drivers/net/phy/phy_device.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

Index: net-next/drivers/net/phy/phy_device.c
===
--- net-next.orig/drivers/net/phy/phy_device.c
+++ net-next/drivers/net/phy/phy_device.c
@@ -157,7 +157,7 @@ struct phy_device *phy_device_create(str
/* We allocate the device, and initialize the default values */
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
if (NULL == dev)
-   return (struct phy_device *)PTR_ERR((void *)-ENOMEM);
+   return ERR_PTR(-ENOMEM);
 
dev->dev.release = phy_device_release;
 

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] phylib: simplify NULL checks

2015-08-28 Thread Sergei Shtylyov
Fix scripts/checkpatch.pl's messages like:

CHECK: Comparison to NULL could be written "!phydrv->read_mmd_indirect"

BTW, it doesn't detect the reversed comparisons (which I've fixed as well).

Signed-off-by: Sergei Shtylyov 

---
 drivers/net/phy/phy.c|4 ++--
 drivers/net/phy/phy_device.c |6 +++---
 2 files changed, 5 insertions(+), 5 deletions(-)

Index: net-next/drivers/net/phy/phy.c
===
--- net-next.orig/drivers/net/phy/phy.c
+++ net-next/drivers/net/phy/phy.c
@@ -1040,7 +1040,7 @@ int phy_read_mmd_indirect(struct phy_dev
struct phy_driver *phydrv = phydev->drv;
int value = -1;
 
-   if (phydrv->read_mmd_indirect == NULL) {
+   if (!phydrv->read_mmd_indirect) {
struct mii_bus *bus = phydev->bus;
 
mutex_lock(&bus->mdio_lock);
@@ -1077,7 +1077,7 @@ void phy_write_mmd_indirect(struct phy_d
 {
struct phy_driver *phydrv = phydev->drv;
 
-   if (phydrv->write_mmd_indirect == NULL) {
+   if (!phydrv->write_mmd_indirect) {
struct mii_bus *bus = phydev->bus;
 
mutex_lock(&bus->mdio_lock);
Index: net-next/drivers/net/phy/phy_device.c
===
--- net-next.orig/drivers/net/phy/phy_device.c
+++ net-next/drivers/net/phy/phy_device.c
@@ -156,7 +156,7 @@ struct phy_device *phy_device_create(str
 
/* We allocate the device, and initialize the default values */
dev = kzalloc(sizeof(*dev), GFP_KERNEL);
-   if (NULL == dev)
+   if (!dev)
return ERR_PTR(-ENOMEM);
 
dev->dev.release = phy_device_release;
@@ -178,7 +178,7 @@ struct phy_device *phy_device_create(str
dev->bus = bus;
dev->dev.parent = &bus->dev;
dev->dev.bus = &mdio_bus_type;
-   dev->irq = bus->irq != NULL ? bus->irq[addr] : PHY_POLL;
+   dev->irq = bus->irq ? bus->irq[addr] : PHY_POLL;
dev_set_name(&dev->dev, PHY_ID_FMT, bus->id, addr);
 
dev->state = PHY_DOWN;
@@ -589,7 +589,7 @@ int phy_attach_direct(struct net_device
/* Assume that if there is no driver, that it doesn't
 * exist, and we should use the genphy driver.
 */
-   if (NULL == d->driver) {
+   if (!d->driver) {
if (phydev->is_c45)
d->driver = &genphy_driver[GENPHY_DRV_10G].driver;
else

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 3/4] fou: reject IPv6 config

2015-08-28 Thread Jiri Benc
fou does not really support IPv6 encapsulation. After a UDP socket is
created in fou_create, the encap_rcv callback is set either to fou_udp_recv
or to gue_udp_recv. Both of those unconditionally assume that the received
packet has an IPv4 header and access the data at network_header as if it were an
IPv4 header. This leads to IPv6 flow label being interpreted as IP packet
length, etc.

Disallow fou tunnel to be configured as IPv6 until real IPv6 support is
added to fou.

CC: Tom Herbert 
Signed-off-by: Jiri Benc 
---
v2: no change
---
 net/ipv4/fou.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/ipv4/fou.c b/net/ipv4/fou.c
index 2d1646cff057..e0fcbbbcfe54 100644
--- a/net/ipv4/fou.c
+++ b/net/ipv4/fou.c
@@ -566,7 +566,7 @@ static int parse_nl_config(struct genl_info *info,
if (info->attrs[FOU_ATTR_AF]) {
u8 family = nla_get_u8(info->attrs[FOU_ATTR_AF]);
 
-   if (family != AF_INET && family != AF_INET6)
+   if (family != AF_INET)
return -EINVAL;
 
cfg->udp_config.family = family;
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 2/4] ip_tunnels: record IP version in tunnel info

2015-08-28 Thread Jiri Benc
There's currently nothing preventing directing packets with IPv6
encapsulation data to IPv4 tunnels (and vice versa). If this happens,
IPv6 addresses are incorrectly interpreted as IPv4 ones.

Track whether the given ip_tunnel_key contains IPv4 or IPv6 data. Store this
in ip_tunnel_info. Reject packets at appropriate places if they are supposed
to be encapsulated into an incompatible protocol.

Signed-off-by: Jiri Benc 
Acked-by: Alexei Starovoitov 
---
v2: moved the af check in geneve before the first usage of tunnel info data
---
 drivers/net/geneve.c   |  2 ++
 drivers/net/vxlan.c|  2 ++
 include/net/dst_metadata.h |  1 +
 include/net/ip_tunnels.h   | 10 ++
 net/core/filter.c  |  2 ++
 net/ipv4/ip_gre.c  |  3 ++-
 net/ipv4/ip_tunnel_core.c  |  2 +-
 net/openvswitch/flow.c |  2 ++
 net/openvswitch/vport.c|  2 ++
 9 files changed, 24 insertions(+), 2 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4a39c09f144c..3908a22f23d1 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -627,6 +627,8 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct 
net_device *dev)
netdev_dbg(dev, "no tunnel metadata\n");
goto tx_error;
}
+   if (info && ip_tunnel_info_af(info) != AF_INET)
+   goto tx_error;
}
 
rt = geneve_get_rt(skb, dev, &fl4, info);
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index bd1b8cdf2bf6..e3adfe0ef66b 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -1903,6 +1903,8 @@ static void vxlan_xmit_one(struct sk_buff *skb, struct 
net_device *dev,
  dev->name);
goto drop;
}
+   if (family != ip_tunnel_info_af(info))
+   goto drop;
 
dst_port = info->key.tp_dst ? : vxlan->cfg.dst_port;
vni = be64_to_cpu(info->key.tun_id);
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 2b83f0d232e0..d32f49cc621d 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -105,6 +105,7 @@ static inline struct metadata_dst *ipv6_tun_rx_dst(struct 
sk_buff *skb,
info->key.u.ipv6.dst = ip6h->daddr;
info->key.tos = ipv6_get_dsfield(ip6h);
info->key.ttl = ip6h->hop_limit;
+   info->mode = IP_TUNNEL_INFO_IPV6;
return tun_dst;
 }
 
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 9bdb3948798f..2b4fa06e91bd 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -4,6 +4,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -52,6 +53,7 @@ struct ip_tunnel_key {
 
 /* Flags for ip_tunnel_info mode. */
 #define IP_TUNNEL_INFO_TX  0x01/* represents tx tunnel parameters */
+#define IP_TUNNEL_INFO_IPV60x02/* key contains IPv6 addresses */
 
 struct ip_tunnel_info {
struct ip_tunnel_keykey;
@@ -208,6 +210,8 @@ static inline void __ip_tunnel_info_init(struct 
ip_tunnel_info *tun_info,
 
tun_info->options = opts;
tun_info->options_len = opts_len;
+
+   tun_info->mode = 0;
 }
 
 static inline void ip_tunnel_info_init(struct ip_tunnel_info *tun_info,
@@ -221,6 +225,12 @@ static inline void ip_tunnel_info_init(struct 
ip_tunnel_info *tun_info,
  tun_id, tun_flags, opts, opts_len);
 }
 
+static inline unsigned short ip_tunnel_info_af(const struct ip_tunnel_info
+  *tun_info)
+{
+   return tun_info->mode & IP_TUNNEL_INFO_IPV6 ? AF_INET6 : AF_INET;
+}
+
 #ifdef CONFIG_INET
 
 int ip_tunnel_init(struct net_device *dev);
diff --git a/net/core/filter.c b/net/core/filter.c
index 66500d490995..13079f03902e 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1493,6 +1493,8 @@ static u64 bpf_skb_get_tunnel_key(u64 r1, u64 r2, u64 
size, u64 flags, u64 r5)
 
if (unlikely(size != sizeof(struct bpf_tunnel_key) || flags || !info))
return -EINVAL;
+   if (ip_tunnel_info_af(info) != AF_INET)
+   return -EINVAL;
 
to->tunnel_id = be64_to_cpu(info->key.tun_id);
to->remote_ipv4 = be32_to_cpu(info->key.u.ipv4.src);
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 1e813a9f9378..bd0679d90519 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -511,7 +511,8 @@ static void gre_fb_xmit(struct sk_buff *skb, struct 
net_device *dev)
int err;
 
tun_info = skb_tunnel_info(skb);
-   if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
+   if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX) ||
+ip_tunnel_info_af(tun_info) != AF_INET))
goto err_free_skb;
 
key = &tun_info->key;
diff --git a/net/ipv4/ip_tunnel_core.c b/net/ipv4/ip_tunnel_core.c
index 934f2ac8ad61..0c756ade1cf7 100644
---

[PATCH v2 net-next 1/4] ip_tunnels: convert the mode field of ip_tunnel_info to flags

2015-08-28 Thread Jiri Benc
The mode field holds a single bit of information only (whether the
ip_tunnel_info struct is for rx or tx). Change the mode field to bit flags.
This allows more mode flags to be added.

Signed-off-by: Jiri Benc 
Acked-by: Alexei Starovoitov 
---
v2: converted geneve fully, too
---
 drivers/net/geneve.c   | 2 +-
 drivers/net/vxlan.c| 2 +-
 include/net/dst_metadata.h | 1 -
 include/net/ip_tunnels.h   | 9 ++---
 net/ipv4/ip_gre.c  | 2 +-
 net/ipv4/route.c   | 2 +-
 net/ipv6/route.c   | 2 +-
 7 files changed, 7 insertions(+), 13 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 4357bae732d7..4a39c09f144c 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -623,7 +623,7 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, struct 
net_device *dev)
 
if (geneve->collect_md) {
info = skb_tunnel_info(skb);
-   if (unlikely(info && info->mode != IP_TUNNEL_INFO_TX)) {
+   if (unlikely(info && !(info->mode & IP_TUNNEL_INFO_TX))) {
netdev_dbg(dev, "no tunnel metadata\n");
goto tx_error;
}
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 30e56cb58884..bd1b8cdf2bf6 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2113,7 +2113,7 @@ static netdev_tx_t vxlan_xmit(struct sk_buff *skb, struct 
net_device *dev)
}
 
if (vxlan->flags & VXLAN_F_COLLECT_METADATA &&
-   info && info->mode == IP_TUNNEL_INFO_TX) {
+   info && info->mode & IP_TUNNEL_INFO_TX) {
vxlan_xmit_one(skb, dev, NULL, false);
return NETDEV_TX_OK;
}
diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 60c03326c087..2b83f0d232e0 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -59,7 +59,6 @@ static inline struct metadata_dst *tun_rx_dst(__be16 flags,
return NULL;
 
info = &tun_dst->u.tun_info;
-   info->mode = IP_TUNNEL_INFO_RX;
info->key.tun_flags = flags;
info->key.tun_id = tunnel_id;
info->key.tp_src = 0;
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 224e4ecec91b..9bdb3948798f 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -50,13 +50,8 @@ struct ip_tunnel_key {
__be16  tp_dst;
 };
 
-/* Indicates whether the tunnel info structure represents receive
- * or transmit tunnel parameters.
- */
-enum {
-   IP_TUNNEL_INFO_RX,
-   IP_TUNNEL_INFO_TX,
-};
+/* Flags for ip_tunnel_info mode. */
+#define IP_TUNNEL_INFO_TX  0x01/* represents tx tunnel parameters */
 
 struct ip_tunnel_info {
struct ip_tunnel_keykey;
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index faf1cde6f8da..1e813a9f9378 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -511,7 +511,7 @@ static void gre_fb_xmit(struct sk_buff *skb, struct 
net_device *dev)
int err;
 
tun_info = skb_tunnel_info(skb);
-   if (unlikely(!tun_info || tun_info->mode != IP_TUNNEL_INFO_TX))
+   if (unlikely(!tun_info || !(tun_info->mode & IP_TUNNEL_INFO_TX)))
goto err_free_skb;
 
key = &tun_info->key;
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index f3087aaa6dd8..3d9e70c804a9 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1693,7 +1693,7 @@ static int ip_route_input_slow(struct sk_buff *skb, 
__be32 daddr, __be32 saddr,
 */
 
tun_info = skb_tunnel_info(skb);
-   if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+   if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl4.flowi4_tun_key.tun_id = tun_info->key.tun_id;
else
fl4.flowi4_tun_key.tun_id = 0;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index df3e353a012d..308dd5f9158f 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1174,7 +1174,7 @@ void ip6_route_input(struct sk_buff *skb)
};
 
tun_info = skb_tunnel_info(skb);
-   if (tun_info && tun_info->mode == IP_TUNNEL_INFO_RX)
+   if (tun_info && !(tun_info->mode & IP_TUNNEL_INFO_TX))
fl6.flowi6_tun_key.tun_id = tun_info->key.tun_id;
skb_dst_drop(skb);
skb_dst_set(skb, ip6_route_input_lookup(net, skb->dev, &fl6, flags));
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 0/4] tunnels: fix incorrect IPv4/v6 headers interpretation

2015-08-28 Thread Jiri Benc
With tunneling, it is currently possible to get an IPv6 header and interpret
it as an IPv4 header, or to interpret an IPv6 address as an IPv4 address
(and vice versa). This leads to things like sending packets to an incorrect
address, the IPv6 flow label being interpreted as the IP packet length, etc.

Fix several places where this can happen.

Most of this is net-next only. The third patch affects net, too, but it
doesn't seem there's anything in user space that sets the attribute at all
currently, thus net-next is fine.

Changelog:
v2: fixed geneve after incorrect rebase on top of Pravin's patches

Jiri Benc (4):
  ip_tunnels: convert the mode field of ip_tunnel_info to flags
  ip_tunnels: record IP version in tunnel info
  fou: reject IPv6 config
  vxlan: do not receive IPv4 packets on IPv6 socket

 drivers/net/geneve.c   |  4 +++-
 drivers/net/vxlan.c|  5 -
 include/net/dst_metadata.h |  2 +-
 include/net/ip_tunnels.h   | 19 ---
 include/net/udp_tunnel.h   |  3 ++-
 net/core/filter.c  |  2 ++
 net/ipv4/fou.c |  2 +-
 net/ipv4/ip_gre.c  |  3 ++-
 net/ipv4/ip_tunnel_core.c  |  2 +-
 net/ipv4/route.c   |  2 +-
 net/ipv6/ip6_udp_tunnel.c  |  9 +
 net/ipv6/route.c   |  2 +-
 net/openvswitch/flow.c |  2 ++
 net/openvswitch/vport.c|  2 ++
 14 files changed, 43 insertions(+), 16 deletions(-)

-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v2 net-next 4/4] vxlan: do not receive IPv4 packets on IPv6 socket

2015-08-28 Thread Jiri Benc
By default (subject to the sysctl settings), IPv6 sockets listen also for
IPv4 traffic. Vxlan is not prepared for that and expects an IPv6 header in
packets received through an IPv6 socket.

In addition, it's currently not possible to have both IPv4 and IPv6 vxlan
tunnels on the same port (unless bindv6only sysctl is enabled), as it's not
possible to create and bind both IPv4 and IPv6 vxlan interfaces and there's
no way to specify both IPv4 and IPv6 remote/group IP addresses.

Set IPV6_V6ONLY on vxlan sockets to fix both of these issues. This is not
done globally in udp_tunnel, as l2tp and tipc seem to work okay when
receiving IPv4 packets on an IPv6 socket and people may rely on this behavior.
The other tunnels (geneve and fou) do not support IPv6.

Signed-off-by: Jiri Benc 
---
v2: no change
---
 drivers/net/vxlan.c   | 1 +
 include/net/udp_tunnel.h  | 3 ++-
 net/ipv6/ip6_udp_tunnel.c | 9 +
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e3adfe0ef66b..6c5269aea544 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2530,6 +2530,7 @@ static struct socket *vxlan_create_sock(struct net *net, 
bool ipv6,
udp_conf.family = AF_INET6;
udp_conf.use_udp6_rx_checksums =
!(flags & VXLAN_F_UDP_ZERO_CSUM6_RX);
+   udp_conf.ipv6_v6only = 1;
} else {
udp_conf.family = AF_INET;
}
diff --git a/include/net/udp_tunnel.h b/include/net/udp_tunnel.h
index 35041d0fc21e..cb2f89f20f5c 100644
--- a/include/net/udp_tunnel.h
+++ b/include/net/udp_tunnel.h
@@ -31,7 +31,8 @@ struct udp_port_cfg {
__be16  peer_udp_port;
unsigned intuse_udp_checksums:1,
use_udp6_tx_checksums:1,
-   use_udp6_rx_checksums:1;
+   use_udp6_rx_checksums:1,
+   ipv6_v6only:1;
 };
 
 int udp_sock_create4(struct net *net, struct udp_port_cfg *cfg,
diff --git a/net/ipv6/ip6_udp_tunnel.c b/net/ipv6/ip6_udp_tunnel.c
index e1a1136bda7c..14dacf1df529 100644
--- a/net/ipv6/ip6_udp_tunnel.c
+++ b/net/ipv6/ip6_udp_tunnel.c
@@ -23,6 +23,15 @@ int udp_sock_create6(struct net *net, struct udp_port_cfg 
*cfg,
if (err < 0)
goto error;
 
+   if (cfg->ipv6_v6only) {
+   int val = 1;
+
+   err = kernel_setsockopt(sock, IPPROTO_IPV6, IPV6_V6ONLY,
+   (char *) &val, sizeof(val));
+   if (err < 0)
+   goto error;
+   }
+
udp6_addr.sin6_family = AF_INET6;
memcpy(&udp6_addr.sin6_addr, &cfg->local_ip6,
   sizeof(udp6_addr.sin6_addr));
-- 
1.8.3.1

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] net/smsc911x: Fix deferred probe for interrupt

2015-08-28 Thread Tony Lindgren
The interrupt handler may not be available when smsc911x probes if the
interrupt handler is, for example, a GPIO controller. Let's fix that
by adding handling for -EPROBE_DEFER.

Cc: Steve Glendinning 
Signed-off-by: Tony Lindgren 
---
 drivers/net/ethernet/smsc/smsc911x.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/smsc/smsc911x.c 
b/drivers/net/ethernet/smsc/smsc911x.c
index 959aeea..cb9f166f 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -2435,7 +2435,10 @@ static int smsc911x_drv_probe(struct platform_device 
*pdev)
res_size = resource_size(res);
 
irq = platform_get_irq(pdev, 0);
-   if (irq <= 0) {
+   if (irq == -EPROBE_DEFER) {
+   retval = -EPROBE_DEFER;
+   goto out_0;
+   } else if (irq <= 0) {
pr_warn("Could not allocate irq resource\n");
retval = -ENODEV;
goto out_0;
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netdev" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   3   >