Re: [PATCH net-next 2/2] mpls: Packet stats

2017-01-13 Thread kbuild test robot
Hi Robert,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Robert-Shearman/net-AF-specific-RTM_GETSTATS-attributes/20170114-095819
config: i386-randconfig-sb0-01141243 (attached as .config)
compiler: gcc-5 (Debian 5.4.1-2) 5.4.1 20160904
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All errors (new ones prefixed by >>):

   In file included from include/net/netns/mib.h:4:0,
from include/net/net_namespace.h:14,
from include/linux/netdevice.h:43,
from include/uapi/linux/if_arp.h:26,
from include/linux/if_arp.h:27,
from net/mpls/af_mpls.c:7:
   net/mpls/af_mpls.c: In function 'mpls_stats_inc_outucastpkts':
>> include/net/ipv6.h:163:35: error: 'struct netns_mib' has no member named 
>> 'ipv6_statistics'
 mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
  ^
   include/net/snmp.h:145:15: note: in definition of macro 'SNMP_UPD_PO_STATS'
  __typeof__((mib->mibs) + 0) ptr = mib->mibs; \
  ^
   include/net/ipv6.h:177:3: note: in expansion of macro '_DEVUPD'
  _DEVUPD(net, ipv6, , idev, field, val)
  ^
   net/mpls/af_mpls.c:114:4: note: in expansion of macro 'IP6_UPD_PO_STATS'
   IP6_UPD_PO_STATS(dev_net(dev), in6dev,
   ^
>> include/net/ipv6.h:163:35: error: 'struct netns_mib' has no member named 
>> 'ipv6_statistics'
 mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
  ^
   include/net/snmp.h:145:37: note: in definition of macro 'SNMP_UPD_PO_STATS'
  __typeof__((mib->mibs) + 0) ptr = mib->mibs; \
^
   include/net/ipv6.h:177:3: note: in expansion of macro '_DEVUPD'
  _DEVUPD(net, ipv6, , idev, field, val)
  ^
   net/mpls/af_mpls.c:114:4: note: in expansion of macro 'IP6_UPD_PO_STATS'
   IP6_UPD_PO_STATS(dev_net(dev), in6dev,
   ^
   In file included from include/asm-generic/percpu.h:6:0,
from arch/x86/include/asm/percpu.h:542,
from arch/x86/include/asm/preempt.h:5,
from include/linux/preempt.h:59,
from include/linux/spinlock.h:50,
from include/linux/mm_types.h:8,
from include/linux/kmemcheck.h:4,
from include/linux/skbuff.h:18,
from net/mpls/af_mpls.c:2:
   include/net/snmp.h:146:19: error: subscripted value is neither array nor 
pointer nor vector
  this_cpu_inc(ptr[basefield##PKTS]);  \
  ^
   include/linux/percpu-defs.h:206:47: note: in definition of macro 
'__verify_pcpu_ptr'
 const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
  ^
   include/linux/percpu-defs.h:496:33: note: in expansion of macro 
'__pcpu_size_call'
#define this_cpu_add(pcp, val)  __pcpu_size_call(this_cpu_add_, pcp, val)
^
   include/linux/percpu-defs.h:507:28: note: in expansion of macro 
'this_cpu_add'
#define this_cpu_inc(pcp)  this_cpu_add(pcp, 1)
   ^
   include/net/snmp.h:146:3: note: in expansion of macro 'this_cpu_inc'
  this_cpu_inc(ptr[basefield##PKTS]);  \
  ^
   include/net/ipv6.h:163:7: note: in expansion of macro 'SNMP_UPD_PO_STATS'
 mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
  ^
   include/net/ipv6.h:177:3: note: in expansion of macro '_DEVUPD'
  _DEVUPD(net, ipv6, , idev, field, val)
  ^
   net/mpls/af_mpls.c:114:4: note: in expansion of macro 'IP6_UPD_PO_STATS'
   IP6_UPD_PO_STATS(dev_net(dev), in6dev,
   ^
   include/net/snmp.h:146:19: error: subscripted value is neither array nor 
pointer nor vector
  this_cpu_inc(ptr[basefield##PKTS]);  \
  ^
   include/linux/percpu-defs.h:363:16: note: in definition of macro 
'__pcpu_size_call'
 switch(sizeof(variable)) { \
   ^
   include/linux/percpu-defs.h:507:28: note: in expansion of macro 
'this_cpu_add'
#define this_cpu_inc(pcp)  this_cpu_add(pcp, 1)
   ^
   include/net/snmp.h:146:3: note: in expansion of macro 'this_cpu_inc'
  this_cpu_inc(ptr[basefield##PKTS]);  \
  ^
   include/net/ipv6.h:163:7: note: in expansion of macro 'SNMP_UPD_PO_STATS'
 mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
  ^
   include/net/ipv6.h:177:3: note: in expansion of macro '_DEVUPD'
  _DEVUPD(net, ipv6, , idev, field, val)
  ^
   net/mpls/af_mpls.c:114:4: note: in expansion of macro 'IP6_UPD_PO_STATS'
   IP6_UPD_PO_STATS(dev_net(dev), in6dev,
   ^
   In file included from arch/x86/include/asm/preempt.h:5:0,
from include/linux/preempt.h:59,
   

Re: Commit 1fe8e0... (include more headers in if_tunnel.h) breaks my user-space build.

2017-01-13 Thread Mikko Rapeli
On Fri, Jan 13, 2017 at 02:11:41PM -0800, Ben Greear wrote:
> On 01/13/2017 02:08 PM, Stephen Hemminger wrote:
> >On Fri, 13 Jan 2017 11:50:32 -0800
> >Ben Greear  wrote:
> >
> >>On 01/13/2017 11:41 AM, Stephen Hemminger wrote:
> >>>On Fri, 13 Jan 2017 11:12:32 -0800
> >>>Ben Greear  wrote:
> >>>
> I am including netinet/ip.h, and also linux/if_tunnel.h, and the 
> linux/ip.h conflicts with
> netinet/ip.h.
> 
> Maybe my build environment is screwed up, but maybe also it would be 
> better to
> just let the user include appropriate headers before including if_tunnel.h
> and revert this patch?
> 
> 
> include/uapi/linux/if_tunnel.h: include linux/if.h, linux/ip.h and 
> linux/in6.h
> 
>  Fixes userspace compilation errors like:
> 
>  error: field ‘iph’ has incomplete type
>  error: field ‘prefix’ has incomplete type
> 
>  Signed-off-by: Mikko Rapeli 
>  Signed-off-by: David S. Miller 
> 
> Thanks,
> Ben
> 
> >>>
> >>>What I ended up doing for iproute2 was including all headers used by the 
> >>>source
> >>>based on sanitized kernel headers.  Basically
> >>>  $ git grep '^#include  >>>   awk -F: '{print $2}' | \
> >>>   sed -e 's/^#include .*$//' | \
> >>>   sort -u >linux.headers
> >>>   $ for f in $(cat linux.headers)
> >>> do cp ~/kernel/net-next/usr/include/$f include/$f
> >>> done
> >>>
> >>>You can't take only some of the headers, once you decide to diverge from 
> >>>glibc provided
> >>>headers, you got to take them all.
> >>>
> >>
> >>I do grab a copy of the linux kernel headers and compile against that, but 
> >>netinet/ip.h is
> >>coming from the OS.  Do you mean I should not include netinet/ip.h and 
> >>instead use linux/ip.h?
> >
> >I don't think you can mix netinet/ip.h and linux/ip.h, yes that is a mess.
> >
> 
> Well, I still like the idea of reverting this patch..that way user-space does 
> not have to use
> linux/ip.h, and that lets them use netinet/ip.h and if_tunnel.h.

I might have patches for glibc compat for your case in
https://github.com/mcfrisk/linux/commits/headers_test_v06
if you include glibc headers first and then kernel uapi ones.

> Anyway, I'll let Dave and/or the original committer decideI've reverted 
> it in my local tree
> so I am able to build again...

My changes make uapi headers compile as they are in userspace. That exposes
problems like this for which user space has had workarounds for decades.
Sorry for that. The glibc compat fixes should help.

-Mikko

> Thanks,
> Ben
> 
> -- 
> Ben Greear 
> Candela Technologies Inc  http://www.candelatech.com
> 


Re: [PATCH net-next 2/2] mpls: Packet stats

2017-01-13 Thread kbuild test robot
Hi Robert,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Robert-Shearman/net-AF-specific-RTM_GETSTATS-attributes/20170114-095819
config: x86_64-randconfig-u0-01141334 (attached as .config)
compiler: gcc-6 (Debian 6.2.0-3) 6.2.0 20160901
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All error/warnings (new ones prefixed by >>):

   In file included from include/net/netns/mib.h:4:0,
from include/net/net_namespace.h:14,
from include/linux/netdevice.h:43,
from include/uapi/linux/if_arp.h:26,
from include/linux/if_arp.h:27,
from net/mpls/af_mpls.c:7:
   net/mpls/af_mpls.c: In function 'mpls_stats_inc_outucastpkts':
>> include/net/ipv6.h:163:35: error: 'struct netns_mib' has no member named 
>> 'ipv6_statistics'; did you mean 'ip_statistics'?
 mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
  ^
   include/net/snmp.h:145:15: note: in definition of macro 'SNMP_UPD_PO_STATS'
  __typeof__((mib->mibs) + 0) ptr = mib->mibs; \
  ^~~
>> include/net/ipv6.h:177:3: note: in expansion of macro '_DEVUPD'
  _DEVUPD(net, ipv6, , idev, field, val)
  ^~~
>> net/mpls/af_mpls.c:114:4: note: in expansion of macro 'IP6_UPD_PO_STATS'
   IP6_UPD_PO_STATS(dev_net(dev), in6dev,
   ^~~~
>> include/net/ipv6.h:163:35: error: 'struct netns_mib' has no member named 
>> 'ipv6_statistics'; did you mean 'ip_statistics'?
 mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
  ^
   include/net/snmp.h:145:37: note: in definition of macro 'SNMP_UPD_PO_STATS'
  __typeof__((mib->mibs) + 0) ptr = mib->mibs; \
^~~
>> include/net/ipv6.h:177:3: note: in expansion of macro '_DEVUPD'
  _DEVUPD(net, ipv6, , idev, field, val)
  ^~~
>> net/mpls/af_mpls.c:114:4: note: in expansion of macro 'IP6_UPD_PO_STATS'
   IP6_UPD_PO_STATS(dev_net(dev), in6dev,
   ^~~~
   In file included from include/asm-generic/percpu.h:6:0,
from arch/x86/include/asm/percpu.h:542,
from arch/x86/include/asm/preempt.h:5,
from include/linux/preempt.h:59,
from include/linux/spinlock.h:50,
from include/linux/mm_types.h:8,
from include/linux/kmemcheck.h:4,
from include/linux/skbuff.h:18,
from net/mpls/af_mpls.c:2:
>> include/net/snmp.h:146:19: error: subscripted value is neither array nor 
>> pointer nor vector
  this_cpu_inc(ptr[basefield##PKTS]);  \
  ^
   include/linux/percpu-defs.h:206:47: note: in definition of macro 
'__verify_pcpu_ptr'
 const void __percpu *__vpp_verify = (typeof((ptr) + 0))NULL; \
  ^~~
   include/linux/percpu-defs.h:496:33: note: in expansion of macro 
'__pcpu_size_call'
#define this_cpu_add(pcp, val)  __pcpu_size_call(this_cpu_add_, pcp, val)
^~~~
   include/linux/percpu-defs.h:507:28: note: in expansion of macro 
'this_cpu_add'
#define this_cpu_inc(pcp)  this_cpu_add(pcp, 1)
   ^~~~
   include/net/snmp.h:146:3: note: in expansion of macro 'this_cpu_inc'
  this_cpu_inc(ptr[basefield##PKTS]);  \
  ^~~~
>> include/net/ipv6.h:163:7: note: in expansion of macro 'SNMP_UPD_PO_STATS'
 mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
  ^
>> include/net/ipv6.h:177:3: note: in expansion of macro '_DEVUPD'
  _DEVUPD(net, ipv6, , idev, field, val)
  ^~~
>> net/mpls/af_mpls.c:114:4: note: in expansion of macro 'IP6_UPD_PO_STATS'
   IP6_UPD_PO_STATS(dev_net(dev), in6dev,
   ^~~~
>> include/net/snmp.h:146:19: error: subscripted value is neither array nor 
>> pointer nor vector
  this_cpu_inc(ptr[basefield##PKTS]);  \
  ^
   include/linux/percpu-defs.h:363:16: note: in definition of macro 
'__pcpu_size_call'
 switch(sizeof(variable)) { \
   ^~~~
   include/linux/percpu-defs.h:507:28: note: in expansion of macro 
'this_cpu_add'
#define this_cpu_inc(pcp)  this_cpu_add(pcp, 1)
   ^~~~
   include/net/snmp.h:146:3: note: in expansion of macro 'this_cpu_inc'
  this_cpu_inc(ptr[basefield##PKTS]);  \
  ^~~~
>> include/net/ipv6.h:163:7: note: in expansion of macro 'SNMP_UPD_PO_STATS'
 mod##SNMP_UPD_PO_STATS((net)->mib.statname##_statistics, field, (val));\
  ^
>> include/net/ipv6.h:177:3: note: in expansion of macro '_DEVUPD'
  _DEVUPD(net, ipv6, , idev, field, val)
  ^~~
>> 

Re: [PATCH net-next] cxgb4: Fix misleading packet/frame count stats.

2017-01-13 Thread David Miller
From: Ganesh Goudar 
Date: Fri, 13 Jan 2017 14:46:40 +0530

> Do not count pause frames as part of general TX/RX frame
> counters.
> 
> Based on the original work of Casey Leedom 
> Signed-off-by: Ganesh Goudar 

Applied.


Re: [PATCH net-next v2 0/5] bnxt_en: Misc. updates for net-next.

2017-01-13 Thread David Miller
From: Michael Chan 
Date: Fri, 13 Jan 2017 01:31:59 -0500

> Miscellaneous updates including firmware spec update, ethtool -p blinking
> LED support, RDMA SRIOV config callback, and minor fixes.
> 
> v2: Dropped the DCBX RoCE app TLV patch until the ETH_P_IBOE RDMA patch
> is merged.

Series applied, thanks.


Re: [PATCH net-next v2 00/13] tcp: RACK fast recovery

2017-01-13 Thread David Miller
From: Yuchung Cheng 
Date: Thu, 12 Jan 2017 22:11:29 -0800

> The patch set enables RACK loss detection (draft-ietf-tcpm-rack-01)
> to trigger fast recovery with a reordering timer.
> 
> Previously RACK has been running in auxiliary mode where it is
> used to detect packet losses once the recovery has triggered by
> other algorithms (e.g., FACK). By inspecting packet timestamps,
> RACK can start ACK-driven repairs timely. A few similar heuristics
> are no longer needed and are either removed or disabled to reduce
> the complexity of the Linux TCP loss recovery engine:
> 
>   1. FACK (Forward Acknowledgement)
>   2. Early Retransmit (RFC5827)
>   3. thin_dupack (fast recovery on single DUPACK for thin-streams)
>   4. NCR (Non-Congestion Robustness RFC4653) (RFC4653)
>   5. Forward Retransmit
> 
> After this change, Linux's loss recovery algorithms consist of
>   1. Conventional DUPACK threshold approach (RFC6675)
>   2. RACK and Tail Loss Probe (draft-ietf-tcpm-rack-01)
>   3. RTO plus F-RTO extension (RFC5682)
> 
> The patch set has been tested on Google servers extensively and
> presented in several IETF meetings. The data suggests that RACK
> successfully improves recovery performance:
> https://www.ietf.org/proceedings/97/slides/slides-97-tcpm-draft-ietf-tcpm-rack-01.pdf
> https://www.ietf.org/proceedings/96/slides/slides-96-tcpm-3.pdf

Series applied, thanks for all of your hard work.


Re: [PATCH net-next] ipv6: sr: add missing Kbuild export for header files

2017-01-13 Thread kbuild test robot
Hi David,

[auto build test WARNING on net-next/master]

url:
https://github.com/0day-ci/linux/commits/David-Lebrun/ipv6-sr-add-missing-Kbuild-export-for-header-files/20170114-095347
config: x86_64-randconfig-n0-01141038 (attached as .config)
compiler: gcc-4.8 (Debian 4.8.4-1) 4.8.4
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

All warnings (new ones prefixed by >>):

>> ./usr/include/linux/seg6.h:21: found __[us]{8,16,32,64} type without 
>> #include 
>> ./usr/include/linux/seg6_hmac.h:11: found __[us]{8,16,32,64} type without 
>> #include 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


Re: [PATCH v5 00/13] net: ethernet: aquantia: Add AQtion 2.5/5 GB NIC driver

2017-01-13 Thread David VomLehn

On 01/13/2017 05:38 PM, Florian Fainelli wrote:

On 01/12/2017 09:02 PM, Alexander Loktionov wrote:

From: David VomLehn 

v1: Initial version
v2: o Make necessary drivers/net/ethernet changes to integrate software
 o Drop intermediate atlantic directory
 o Remove Makefile things only appropriate to out of tree module
   building
v3: o Move changes to drivers/net/ethernet/{Kconfig,Makefile} to the last
   patch to ensure clean bisection.
 o Removed inline attribute aq_hw_write_req() as it was defined in
   only one .c file.
 o #included pci.h in aq_common.h to get struct pci definition.
 o Modified code to unlock based execution flow rather than using a
   flag.
 o Made a number of functions that were only used in a single file
   static.
 o Cleaned up error and return code handling in various places.
 o Remove AQ_CFG_IP_ALIGN definition.
 o Other minor code clean up.
v4: o Using do_div for 64 bit division.
 o Modified NIC statistics code.
 o Using build_skb instead netdev_alloc_skb for single fragment
   packets.
 o Removed extra aq_nic.o from Makefile
v5: o Removed extra newline at the end of the files.
 o Wrapped cover letter lines.

Have not looked at the driver yet, but the threading of your emails is
weird, each patch is in reply to the previous one. It would be more
natural to have all numbered patches be in reply to the cover letter,
which according to the version of git you seem to have used (2.7.4)
should already be the default. In graphical terms what we see right now is:

[PATCH 00/13]
[PATCH 01/13]
[PATCH 02/13]


While we should see:


[PATCH 00/13]
[PATCH 01/13]
[PATCH 02/13]


Can you fix that for future submissions, this may sound like a cosmetic
thing, but it really helps with threading/reading etc.

Thanks!
This looks like it is a consequence of setting the git configuration 
item format.thread to deep. I've switched it to shallow, which should 
give the threading you suggest.


--
David VL



Re: [PATCH] can: Fix kernel panic at security_sock_rcv_skb

2017-01-13 Thread Liu Shuo

On Thu 12.Jan'17 at 17:33:38 +0100, Oliver Hartkopp wrote:

On 01/12/2017 02:01 PM, Eric Dumazet wrote:

On Thu, 2017-01-12 at 09:22 +0100, Oliver Hartkopp wrote:



But my main concern is:

The reason why can_rx_delete_receiver() was introduced was the need to
remove a huge number of receivers with can_rx_unregister().

When you call synchronize_rcu() after each receiver removal this would
potentially lead to a big performance issue when e.g. closing CAN_RAW
sockets with a high number of receivers.

So the idea was to remove/unlink the receiver hlist_del_rcu(>list)
and also kmem_cache_free(rcv_cache, r) by some rcu mechanism - so that
all elements are cleaned up by rcu at a later point.

Is it possible that the problems emerge due to hlist_del_rcu(>list)
and you accidently fix it with your introduced synchronize_rcu()?


I agree this patch does not fix the root cause.

The main problem seems that the sockets themselves are not RCU
protected.

If CAN uses RCU for delivery, then sockets should be freed only after
one RCU grace period.

On recent kernels, following patch could help :



Thanks Eric!

@Liu ShuoX: Can you check if Eric's suggestion fixes the issue in your 
setup?

Sorry for late reply. I was OOO yesterday.
With Eric's hint, i just found his patch that "net: add SOCK_RCU_FREE
socket flag" in the latest kernel. With backporting this one plus Eric's
following patch, it fixs my failure.

Thanks Eric and Oliver!

Shuo


Best regards,
Oliver


[PATCH] net: add regs attribute to phy device for user diagnose

2017-01-13 Thread yuan linyu
From: yuan linyu 

if phy device have register(s) configuration problem,
user can use this attribute to diagnose.
this feature need phy driver maintainer implement.

Signed-off-by: yuan linyu 
---
 drivers/net/phy/phy_device.c | 26 ++
 include/linux/phy.h  |  4 
 2 files changed, 30 insertions(+)

diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 92b0838..a400748 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -617,10 +617,36 @@ phy_has_fixups_show(struct device *dev, struct 
device_attribute *attr,
 }
 static DEVICE_ATTR_RO(phy_has_fixups);
 
+static ssize_t
+phy_regs_show(struct device *dev, struct device_attribute *attr,
+   char *buf)
+{
+   struct phy_device *phydev = to_phy_device(dev);
+
+   if (!phydev->drv || !phydev->drv->read_regs)
+   return 0;
+
+   return phydev->drv->read_regs(phydev, buf, PAGE_SIZE);
+}
+
+static ssize_t
+phy_regs_store(struct device *dev, struct device_attribute *attr,
+   const char *buf, size_t count)
+{
+   struct phy_device *phydev = to_phy_device(dev);
+
+   if (!phydev->drv || !phydev->drv->write_regs)
+   return 0;
+
+   return phydev->drv->write_regs(phydev, buf, count);
+}
+static DEVICE_ATTR_RW(phy_regs);
+
 static struct attribute *phy_dev_attrs[] = {
_attr_phy_id.attr,
_attr_phy_interface.attr,
_attr_phy_has_fixups.attr,
+   _attr_phy_regs.attr,
NULL,
 };
 ATTRIBUTE_GROUPS(phy_dev);
diff --git a/include/linux/phy.h b/include/linux/phy.h
index f7d95f6..c9c4ab3 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -622,6 +622,10 @@ struct phy_driver {
int (*set_tunable)(struct phy_device *dev,
struct ethtool_tunable *tuna,
const void *data);
+
+   /* Diagnose PHY register configuration issue from user space */
+   ssize_t (*read_regs)(struct phy_device *dev, char *buf, size_t size);
+   int (*write_regs)(struct phy_device *dev, const char *buf, size_t size);
 };
 #define to_phy_driver(d) container_of(to_mdio_common_driver(d),
\
  struct phy_driver, mdiodrv)
-- 
2.7.4




Re: [PATCH v5 00/13] net: ethernet: aquantia: Add AQtion 2.5/5 GB NIC driver

2017-01-13 Thread Florian Fainelli
On 01/12/2017 09:02 PM, Alexander Loktionov wrote:
> From: David VomLehn 
> 
> v1: Initial version
> v2: o Make necessary drivers/net/ethernet changes to integrate software
> o Drop intermediate atlantic directory
> o Remove Makefile things only appropriate to out of tree module
>   building
> v3: o Move changes to drivers/net/ethernet/{Kconfig,Makefile} to the last
>   patch to ensure clean bisection.
> o Removed inline attribute aq_hw_write_req() as it was defined in
>   only one .c file.
> o #included pci.h in aq_common.h to get struct pci definition.
> o Modified code to unlock based execution flow rather than using a
>   flag.
> o Made a number of functions that were only used in a single file
>   static.
> o Cleaned up error and return code handling in various places.
> o Remove AQ_CFG_IP_ALIGN definition.
> o Other minor code clean up.
> v4: o Using do_div for 64 bit division.
> o Modified NIC statistics code.
> o Using build_skb instead netdev_alloc_skb for single fragment
>   packets.
> o Removed extra aq_nic.o from Makefile
> v5: o Removed extra newline at the end of the files.
> o Wrapped cover letter lines.

Few build warnings with W=1 (W=2 is just too verbose) that you may want
to fix, this was with GCC 4.8, newer compilers may flag new warnings as
well:

drivers/net/ethernet/aquantia/aq_main.c: In function
'aq_pci_probe_get_hw_ops_by_id':
drivers/net/ethernet/aquantia/aq_main.c:41:6: warning: variable 'err'
set but not used [-Wunused-but-set-variable]
  int err = 0;
  ^
drivers/net/ethernet/aquantia/aq_main.c: In function 'aq_pci_remove':
drivers/net/ethernet/aquantia/aq_main.c:237:6: warning: variable 'err'
set but not used [-Wunused-but-set-variable]
  int err = 0;
  ^

-- 
Florian


Re: [PATCH v5 01/13] net: ethernet: aquantia: Make and configuration files.

2017-01-13 Thread Florian Fainelli
On 01/12/2017 09:02 PM, Alexander Loktionov wrote:
> From: David VomLehn 
> 
> Patches to create the make and configuration files.
> 
> Signed-off-by: Alexander Loktionov 
> Signed-off-by: Dmitrii Tarakanov 
> Signed-off-by: Pavel Belous 
> Signed-off-by: Dmitry Bezrukov 
> Signed-off-by: David M. VomLehn 
> ---
>  drivers/net/ethernet/aquantia/Kconfig  | 24 +++
>  drivers/net/ethernet/aquantia/Makefile | 42 
> ++
>  drivers/net/ethernet/aquantia/ver.h| 18 +++
>  3 files changed, 84 insertions(+)
>  create mode 100644 drivers/net/ethernet/aquantia/Kconfig
>  create mode 100644 drivers/net/ethernet/aquantia/Makefile
>  create mode 100644 drivers/net/ethernet/aquantia/ver.h
> 
> diff --git a/drivers/net/ethernet/aquantia/Kconfig 
> b/drivers/net/ethernet/aquantia/Kconfig
> new file mode 100644
> index 000..a74a4c0
> --- /dev/null
> +++ b/drivers/net/ethernet/aquantia/Kconfig
> @@ -0,0 +1,24 @@
> +#
> +# aQuantia device configuration
> +#
> +
> +config NET_VENDOR_AQUANTIA
> + bool "aQuantia devices"
> + default y
> + ---help---
> +   Set this to y if you have an Ethernet network cards that uses the 
> aQuantia
> +   chipset.

Could be interesting specify which specific commercial products available.

> +
> +   This option does not build any drivers; it casues the aQuantia
> +   drivers that can be built to appear in the list of Ethernet drivers.
> +
> +
> +if NET_VENDOR_AQUANTIA
> +
> +config AQTION
> + tristate "aQuantia AQtion Support"
> + depends on PCI
> + ---help---
> +   This enables the support for the aQuantia AQtion Ethernet card.
> +
> +endif # NET_VENDOR_AQUANTIA
> diff --git a/drivers/net/ethernet/aquantia/Makefile 
> b/drivers/net/ethernet/aquantia/Makefile
> new file mode 100644
> index 000..e4ae696
> --- /dev/null
> +++ b/drivers/net/ethernet/aquantia/Makefile
> @@ -0,0 +1,42 @@
> +
> +#
> +# aQuantia Ethernet Controller AQtion Linux Driver
> +# Copyright(c) 2014-2017 aQuantia Corporation.
> +#
> +# This program is free software; you can redistribute it and/or modify it
> +# under the terms and conditions of the GNU General Public License,
> +# version 2, as published by the Free Software Foundation.
> +#
> +# This program is distributed in the hope it will be useful, but WITHOUT
> +# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
> +# FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
> +# more details.
> +#
> +# You should have received a copy of the GNU General Public License along
> +# with this program. If not, see .
> +#
> +# The full GNU General Public License is included in this distribution in
> +# the file called "COPYING".
> +#
> +# Contact Information: 
> +# aQuantia Corporation, 105 E. Tasman Dr. San Jose, CA 95134, USA
> +#
> +
> +
> +#
> +# Makefile for the AQtion(tm) Ethernet driver
> +#
> +
> +obj-$(CONFIG_AQTION) += atlantic.o
> +
> +atlantic-objs := aq_main.o \
> + aq_nic.o \
> + aq_pci_func.o \
> + aq_vec.o \
> + aq_ring.o \
> + aq_hw_utils.o \
> + aq_ethtool.o \
> + hw_atl/hw_atl_a0.o \
> + hw_atl/hw_atl_b0.o \
> + hw_atl/hw_atl_utils.o \
> + hw_atl/hw_atl_llh.o
> diff --git a/drivers/net/ethernet/aquantia/ver.h 
> b/drivers/net/ethernet/aquantia/ver.h
> new file mode 100644
> index 000..636d646
> --- /dev/null
> +++ b/drivers/net/ethernet/aquantia/ver.h
> @@ -0,0 +1,18 @@
> +/*
> + * aQuantia Corporation Network Driver
> + * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + */
> +
> +#ifndef VER_H
> +#define VER_H
> +
> +#define NIC_MAJOR_DRIVER_VERSION   1
> +#define NIC_MINOR_DRIVER_VERSION   5
> +#define NIC_BUILD_DRIVER_VERSION   339
> +#define NIC_REVISION_DRIVER_VERSION0

That probably means nothing since it's the first time for the upstream
kernel that such a driver exists. Defining versions of software is such
an interesting subject anyway!
-- 
Florian


Re: [PATCH v5 05/13] net: ethernet: aquantia: Support for NIC-specific code

2017-01-13 Thread Florian Fainelli
On 01/12/2017 09:02 PM, Alexander Loktionov wrote:
> From: David VomLehn 
> 
> Add support for code specific to the Atlantic NIC.
> 
> Signed-off-by: Alexander Loktionov 
> Signed-off-by: Dmitrii Tarakanov 
> Signed-off-by: Pavel Belous 
> Signed-off-by: Dmitry Bezrukov 
> Signed-off-by: David M. VomLehn 
> ---
>  drivers/net/ethernet/aquantia/aq_main.c | 291 
>  drivers/net/ethernet/aquantia/aq_main.h |  17 +
>  drivers/net/ethernet/aquantia/aq_nic.c  | 910 
> 
>  drivers/net/ethernet/aquantia/aq_nic.h  | 108 +++
>  drivers/net/ethernet/aquantia/aq_nic_internal.h |  46 ++
>  5 files changed, 1372 insertions(+)
>  create mode 100644 drivers/net/ethernet/aquantia/aq_main.c
>  create mode 100644 drivers/net/ethernet/aquantia/aq_main.h
>  create mode 100644 drivers/net/ethernet/aquantia/aq_nic.c
>  create mode 100644 drivers/net/ethernet/aquantia/aq_nic.h
>  create mode 100644 drivers/net/ethernet/aquantia/aq_nic_internal.h
> 
> diff --git a/drivers/net/ethernet/aquantia/aq_main.c 
> b/drivers/net/ethernet/aquantia/aq_main.c
> new file mode 100644
> index 000..18a6012
> --- /dev/null
> +++ b/drivers/net/ethernet/aquantia/aq_main.c
> @@ -0,0 +1,291 @@
> +/*
> + * aQuantia Corporation Network Driver
> + * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + */
> +
> +/* File aq_main.c: Main file for aQuantia Linux driver. */
> +
> +#include "aq_main.h"
> +#include "aq_nic.h"
> +#include "aq_pci_func.h"
> +#include "aq_ethtool.h"
> +#include "hw_atl/hw_atl_a0.h"
> +#include "hw_atl/hw_atl_b0.h"
> +
> +#include 
> +#include 
> +
> +static const struct pci_device_id aq_pci_tbl[] = {
> + { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_0001), },
> + { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_D100), },
> + { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_D107), },
> + { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_D108), },
> + { PCI_VDEVICE(AQUANTIA, HW_ATL_DEVICE_ID_D109), },
> + {}
> +};
> +
> +MODULE_DEVICE_TABLE(pci, aq_pci_tbl);
> +
> +MODULE_LICENSE("GPL v2");
> +MODULE_VERSION(AQ_CFG_DRV_VERSION);
> +MODULE_AUTHOR(AQ_CFG_DRV_AUTHOR);
> +MODULE_DESCRIPTION(AQ_CFG_DRV_DESC);

Inline the defines directly into that file, and these typically go at
the end of the file.

> +
> +static struct aq_hw_ops *aq_pci_probe_get_hw_ops_by_id(struct pci_dev *pdev)
> +{
> + struct aq_hw_ops *ops = NULL;
> + int err = 0;
> +
> + ops = hw_atl_a0_get_ops_by_id(pdev);
> + if (ops) {
> + err = 0;
> + goto err_exit;
> + }

Should not that come from the pci_device_id .driver_data structure?

> +
> + ops = hw_atl_b0_get_ops_by_id(pdev);
> + if (ops) {
> + err = 0;
> + goto err_exit;
> + }
> +
> +/* the H/W was not recognized */
> + err = -EFAULT;
> +
> +err_exit:
> + return ops;
> +}
> +
> +static int aq_ndev_open(struct net_device *ndev)
> +{
> + struct aq_nic_s *aq_nic = NULL;
> + int err = 0;
> +
> + aq_nic = aq_nic_alloc_hot(ndev);
> + if (!aq_nic) {
> + err = -ENOMEM;
> + goto err_exit;
> + }
> + err = aq_nic_init(aq_nic);
> + if (err < 0)
> + goto err_exit;
> + err = aq_nic_start(aq_nic);
> + if (err < 0)
> + goto err_exit;
> +
> +err_exit:
> + if (err < 0) {

Just make the error label be taken only if there is an error instead of
checking here again for a negative error code.

> + if (aq_nic)
> + aq_nic_deinit(aq_nic);
> + }
> + return err;
> +}
> +
> +static int aq_ndev_close(struct net_device *ndev)
> +{
> + struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);

No casting needed.

> +
> + aq_nic_stop(aq_nic);
> + aq_nic_deinit(aq_nic);
> + aq_nic_free_hot_resources(aq_nic);
> +
> + return 0;
> +}
> +
> +static int aq_ndev_start_xmit(struct sk_buff *skb, struct net_device *ndev)
> +{
> + struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
> + int err = 0;
> +
> + err = aq_nic_xmit(aq_nic, skb);
> + if (err < 0)
> + goto err_exit;
> +
> +err_exit:
> + return err;

Just inline the contents of aq_nic_xmit(), this function serves nothing
useful here.

> +}
> +
> +static int aq_ndev_change_mtu(struct net_device *ndev, int new_mtu)
> +{
> + struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
> + int err = 0;
> +
> + if (new_mtu == ndev->mtu) {
> + err = 0;
> + goto err_exit;
> + }
> + if (new_mtu < 68) {
> + err = -EINVAL;

Re: [PATCH v5 08/13] net: ethernet: aquantia: PCI operations

2017-01-13 Thread Florian Fainelli
On 01/12/2017 09:02 PM, Alexander Loktionov wrote:
> From: David VomLehn 
> 
> Add functions that handle the PCI bus interface.
> 
> Signed-off-by: Alexander Loktionov 
> Signed-off-by: Dmitrii Tarakanov 
> Signed-off-by: Pavel Belous 
> Signed-off-by: Dmitry Bezrukov 
> Signed-off-by: David M. VomLehn 
> ---

> + /*enable interrupts */
> +#if AQ_CFG_FORCE_LEGACY_INT
> + self->irq_type = AQ_IRQ_LEGACY;
> +#else
> + err = pci_enable_msix(self->pdev, self->msix_entry,
> +   self->aq_hw_caps.msix_irqs);

You should really try MSI(x) first and fallback to leagcy PCI INT# if
you cannot succeed, asking for a driver rebuild to switch between these
behaviors may not be an option for most people (a module parameter may
be an option but is usually not such a great idea).
-- 
Florian


Re: [PATCH v5 11/13] net: ethernet: aquantia: Ethtool support

2017-01-13 Thread Florian Fainelli
On 01/12/2017 09:02 PM, Alexander Loktionov wrote:
> From: David VomLehn 
> 
> Add the driver interfaces required for support by the ethtool utility.
> 
> Signed-off-by: Alexander Loktionov 
> Signed-off-by: Dmitrii Tarakanov 
> Signed-off-by: Pavel Belous 
> Signed-off-by: Dmitry Bezrukov 
> Signed-off-by: David M. VomLehn 
> ---
>  drivers/net/ethernet/aquantia/aq_ethtool.c | 250 
> +
>  drivers/net/ethernet/aquantia/aq_ethtool.h |  19 +++
>  2 files changed, 269 insertions(+)
>  create mode 100644 drivers/net/ethernet/aquantia/aq_ethtool.c
>  create mode 100644 drivers/net/ethernet/aquantia/aq_ethtool.h
> 
> diff --git a/drivers/net/ethernet/aquantia/aq_ethtool.c 
> b/drivers/net/ethernet/aquantia/aq_ethtool.c
> new file mode 100644
> index 000..f11bdb1
> --- /dev/null
> +++ b/drivers/net/ethernet/aquantia/aq_ethtool.c
> @@ -0,0 +1,250 @@
> +/*
> + * aQuantia Corporation Network Driver
> + * Copyright (C) 2014-2017 aQuantia Corporation. All rights reserved
> + *
> + * This program is free software; you can redistribute it and/or modify it
> + * under the terms and conditions of the GNU General Public License,
> + * version 2, as published by the Free Software Foundation.
> + */
> +
> +/* File aq_ethtool.c: Definition of ethertool related functions. */
> +
> +#include "aq_ethtool.h"
> +#include "aq_nic.h"
> +
> +static void aq_ethtool_get_regs(struct net_device *ndev,
> + struct ethtool_regs *regs, void *p)
> +{
> + struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);

netdev_priv() returns a void * which requires no casting, please fix
this through the entire 13 patches.

> + u32 regs_count = aq_nic_get_regs_count(aq_nic);
> +
> + memset(p, 0, regs_count * sizeof(u32));
> + aq_nic_get_regs(aq_nic, regs, p);
> +}
> +
> +static int aq_ethtool_get_regs_len(struct net_device *ndev)
> +{
> + struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
> + u32 regs_count = aq_nic_get_regs_count(aq_nic);
> +
> + return regs_count * sizeof(u32);
> +}
> +
> +static u32 aq_ethtool_get_link(struct net_device *ndev)
> +{
> + struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
> +
> + return aq_nic_get_link_speed(aq_nic) ? 1U : 0U;

Can you either use PHYLIB to interface to your PHY (which does all the
nice state machine management) or at the very least ethtool_op_get_link()?

> +}
> +
> +static int aq_ethtool_get_settings(struct net_device *ndev,
> +struct ethtool_cmd *cmd)
> +{
> + struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
> +
> + cmd->port = PORT_TP;
> + cmd->transceiver = XCVR_EXTERNAL;
> +
> + ethtool_cmd_speed_set(cmd, netif_carrier_ok(ndev) ?
> + aq_nic_get_link_speed(aq_nic) : 0U);
> +
> + cmd->duplex = DUPLEX_FULL;
> + aq_nic_get_link_settings(aq_nic, cmd);
> + return 0;

Consider switching to the new ksettings API and filling in a bit more
information like cmd->autoneg?

> +}
> +
> +static int aq_ethtool_set_settings(struct net_device *ndev,
> +struct ethtool_cmd *cmd)
> +{
> + struct aq_nic_s *aq_nic = (struct aq_nic_s *)netdev_priv(ndev);
> +
> + return aq_nic_set_link_settings(aq_nic, cmd);
> +}
> +
> +static const char aq_ethtool_stat_names[][ETH_GSTRING_LEN] = {
> + "InPackets",
> + "InUCast",
> + "InMCast",
> + "InBCast",
> + "InErrors",
> + "OutPackets",
> + "OutUCast",
> + "OutMCast",
> + "OutBCast",
> + "InUCastOctects",
> + "OutUCastOctects",
> + "InMCastOctects",
> + "OutMCastOctects",
> + "InBCastOctects",
> + "OutBCastOctects",
> + "InOctects",
> + "OutOctects",
> + "InPacketsDma",
> + "OutPacketsDma",
> + "InOctetsDma",
> + "OutOctetsDma",
> + "InDroppedDma",
> + "Queue[0] InPackets",
> + "Queue[0] OutPackets",
> + "Queue[0] InJumboPackets",
> + "Queue[0] InLroPackets",
> + "Queue[0] InErrors",
> +#if 1 < AQ_CFG_VECS_DEF

Yoda notations are usually frowned upon. Instead of making this decision
here, can you push that down to the actual function reading the statistics?

> +static void aq_ethtool_get_strings(struct net_device *ndev,
> +u32 stringset, u8 *data)
> +{

You need to check that stringset == ETH_SS_STATS here since that's the
only thing you support. ethtool usually does not do it if the ioctl()
returning the data does not exist, but other bogus applications might.

> + memcpy(data, *aq_ethtool_stat_names, sizeof(aq_ethtool_stat_names));
> +}
> +
> +static int aq_ethtool_get_sset_count(struct net_device *ndev, int stringset)
> +{

Same here.

> + return ARRAY_SIZE(aq_ethtool_stat_names);

> +#ifndef AQ_ETHTOOL_H
> 

Re: [PATCH v5 00/13] net: ethernet: aquantia: Add AQtion 2.5/5 GB NIC driver

2017-01-13 Thread Florian Fainelli
On 01/12/2017 09:02 PM, Alexander Loktionov wrote:
> From: David VomLehn 
> 
> v1: Initial version
> v2: o Make necessary drivers/net/ethernet changes to integrate software
> o Drop intermediate atlantic directory
> o Remove Makefile things only appropriate to out of tree module
>   building
> v3: o Move changes to drivers/net/ethernet/{Kconfig,Makefile} to the last
>   patch to ensure clean bisection.
> o Removed inline attribute aq_hw_write_req() as it was defined in
>   only one .c file.
> o #included pci.h in aq_common.h to get struct pci definition.
> o Modified code to unlock based execution flow rather than using a
>   flag.
> o Made a number of functions that were only used in a single file
>   static.
> o Cleaned up error and return code handling in various places.
> o Remove AQ_CFG_IP_ALIGN definition.
> o Other minor code clean up.
> v4: o Using do_div for 64 bit division.
> o Modified NIC statistics code.
> o Using build_skb instead netdev_alloc_skb for single fragment
>   packets.
> o Removed extra aq_nic.o from Makefile
> v5: o Removed extra newline at the end of the files.
> o Wrapped cover letter lines.

Have not looked at the driver yet, but the threading of your emails is
weird, each patch is in reply to the previous one. It would be more
natural to have all numbered patches be in reply to the cover letter,
which according to the version of git you seem to have used (2.7.4)
should already be the default. In graphical terms what we see right now is:

[PATCH 00/13]
[PATCH 01/13]
[PATCH 02/13]


While we should see:


[PATCH 00/13]
[PATCH 01/13]
[PATCH 02/13]


Can you fix that for future submissions, this may sound like a cosmetic
thing, but it really helps with threading/reading etc.

Thanks!
-- 
Florian


Re: [PATCH v5 02/13] net: ethernet: aquantia: Common functions and definitions

2017-01-13 Thread David Miller
From: Alexander Loktionov 
Date: Thu, 12 Jan 2017 21:02:18 -0800

> +#define AQ_OBJ_HEADER spinlock_t lock; atomic_t flags; atomic_t busy_count
> +
> +struct aq_obj_s {
> + AQ_OBJ_HEADER;
> +};

Please don't hide multiple declarations and types inside of a macro,
that makes the code harder to understand.

Use a sub-structure or similar, and pass that sub-structure to the
handlers.

> +#define AQ_OBJ_TST(_OBJ_, _FLAG_)  ((_FLAG_) & atomic_read(&(_OBJ_)->flags))
> +
> +#define AQ_OBJ_SET(_OBJ_, _F_) \
 ...
> +#define AQ_OBJ_CLR(_OBJ_, _F_) \

Please don't reinvent the wheel.

Use test_bit, set_bit, clear_bit, test_and_set_bit, and
test_and_clear_bit.  Using an atomic_t for flag bits is completely
inappropriate, that type is primarily meant for atomic counters.

The appropriate type for *_bit() operations is "unsigned long".


Re: [PATCH] i40e: Invoke softirqs after napi_reschedule

2017-01-13 Thread David Miller
From: Benjamin Poirier 
Date: Thu, 12 Jan 2017 17:04:14 -0800

> The following message is logged from time to time when using i40e:
> NOHZ: local_softirq_pending 08
> 
> i40e may schedule napi from a workqueue. Afterwards, softirqs are not run
> in a deterministic time frame. The problem is the same as what was
> described in commit ec13ee80145c ("virtio_net: invoke softirqs after
> __napi_schedule") and this patch applies the same fix to i40e.
> 
> Signed-off-by: Benjamin Poirier 

I hope to see this from one of Jeff's pull requests to me in the
near future.


Re: [PATCH net-next] liquidio: use fallback for selecting txq

2017-01-13 Thread David Miller
From: Felix Manlunas 
Date: Thu, 12 Jan 2017 16:18:22 -0800

> From: Satanand Burla 
> 
> Remove assignment to ndo_select_queue so that fallback is used for
> selecting txq.  Also remove the now-useless function that used to be
> assigned to ndo_select_queue.
> 
> Signed-off-by: Satanand Burla 
> Signed-off-by: Felix Manlunas 
> Signed-off-by: Derek Chickles 

Applied, thank you.


Re: [PATCH net-next] net: dsa: mv88e6xxx: add EEPROM support to 6390

2017-01-13 Thread David Miller
From: Vivien Didelot 
Date: Thu, 12 Jan 2017 18:07:16 -0500

> The Marvell 6352 chip has a 8-bit address/16-bit data EEPROM access.
> The Marvell 6390 chip has a 16-bit address/8-bit data EEPROM access.
> 
> This patch implements the 8-bit data EEPROM access in the mv88e6xxx
> driver and adds its support to chips of the 6390 family.
> 
> Signed-off-by: Vivien Didelot 

Applied.


[PATCH] cxgb4: Remove redundant memset before memcpy

2017-01-13 Thread Shyam Saini
The region set by the call to memset, immediately overwritten by
the subsequent call to memcpy and thus makes the  memset redundant.

Also remove the memset((, 0, sizeof(info)) on line 398 because
info is memcpy()'ed to before being used in the loop and it isn't
used outside of the loop.

Signed-off-by: Shyam Saini 
---
 drivers/net/ethernet/chelsio/cxgb4/sched.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/sched.c 
b/drivers/net/ethernet/chelsio/cxgb4/sched.c
index cbd68a8..c902635 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/sched.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/sched.c
@@ -397,9 +397,6 @@ static struct sched_class *t4_sched_class_lookup(struct 
port_info *pi,
struct ch_sched_params info;
struct ch_sched_params tp;
 
-   memset(, 0, sizeof(info));
-   memset(, 0, sizeof(tp));
-
memcpy(, p, sizeof(tp));
/* Don't try to match class parameter */
tp.u.params.class = SCHED_CLS_NONE;
@@ -409,7 +406,6 @@ static struct sched_class *t4_sched_class_lookup(struct 
port_info *pi,
if (e->state == SCHED_STATE_UNUSED)
continue;
 
-   memset(, 0, sizeof(info));
memcpy(, >info, sizeof(info));
/* Don't try to match class parameter */
info.u.params.class = SCHED_CLS_NONE;
@@ -458,7 +454,6 @@ static struct sched_class *t4_sched_class_alloc(struct 
port_info *pi,
if (!e)
goto out;
 
-   memset(, 0, sizeof(np));
memcpy(, p, sizeof(np));
np.u.params.class = e->idx;
 
-- 
2.7.4



Re: [net PATCH v3 5/5] virtio_net: XDP support for adjust_head

2017-01-13 Thread John Fastabend
On 17-01-13 12:08 PM, John Fastabend wrote:
> On 17-01-12 11:41 PM, Jason Wang wrote:
>>
>>
>> On 2017年01月13日 10:52, John Fastabend wrote:
>>> Add support for XDP adjust head by allocating a 256B header region
>>> that XDP programs can grow into. This is only enabled when a XDP
>>> program is loaded.
>>>
>>> In order to ensure that we do not have to unwind queue headroom push
>>> queue setup below bpf_prog_add. It reads better to do a prog ref
>>> unwind vs another queue setup call.
>>>
>>> At the moment this code must do a full reset to ensure old buffers
>>> without headroom on program add or with headroom on program removal
>>> are not used incorrectly in the datapath. Ideally we would only
>>> have to disable/enable the RX queues being updated but there is no
>>> API to do this at the moment in virtio so use the big hammer. In
>>> practice it is likely not that big of a problem as this will only
>>> happen when XDP is enabled/disabled changing programs does not
>>> require the reset. There is some risk that the driver may either
>>> have an allocation failure or for some reason fail to correctly
>>> negotiate with the underlying backend in this case the driver will
>>> be left uninitialized. I have not seen this ever happen on my test
>>> systems and for what its worth this same failure case can occur
>>> from probe and other contexts in virtio framework.
>>>
>>> Signed-off-by: John Fastabend 
>>> ---
>>>   drivers/net/virtio_net.c |  155 
>>> --
>>>   drivers/virtio/virtio.c  |9 ++-
>>>   include/linux/virtio.h   |3 +
>>>   3 files changed, 144 insertions(+), 23 deletions(-)
>>>
>>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>>> index 6041828..8b897e7 100644
>>> --- a/drivers/net/virtio_net.c
>>> +++ b/drivers/net/virtio_net.c
>>> @@ -28,6 +28,7 @@
>>>   #include 
>>>   #include 
>>>   #include 
>>> +#include 
>>>   #include 
>>> static int napi_weight = NAPI_POLL_WEIGHT;
>>> @@ -159,6 +160,9 @@ struct virtnet_info {
>>>   /* Ethtool settings */
>>>   u8 duplex;
>>>   u32 speed;
>>> +
>>> +/* Headroom allocated in RX Queue */
>>> +unsigned int headroom;
>>
>> If this could not be changed in anyway, better use a macro instead of a filed
>> here. And there's even no need to add an extra parameter to
>> add_recvbuf_mergeable().
> 
> OK originally I thought this might be dynamic but I agree no need
> for it here.
> 

Well there is a bit of an order of operation issue that means we need at
least some bit here to tell us an enablement is pending.

The problem is when we do the reset we need to know that headroom for XDP
is needed. But we can't use the xdp_prog values because xdp_prog can not
be added on an device that is up without headroom otherwise the program
could fail. Plus reset via freeze/restore tears these structures down and
rebuilds them.

How about a boolean bit here instead of an unsigned int,

'bool xdp_headroom_needed'

seems better than an int.

Thanks,
John





Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Cong Wang
On Fri, Jan 13, 2017 at 4:15 PM, Francois Romieu  wrote:
> Cong Wang  :
>> On Fri, Jan 13, 2017 at 5:23 AM, Francois Romieu  
>> wrote:
> [...]
>> > alloc_skb() does not need to be in the "while" loop.
>>
>> This is exactly what I describe in my changelog, don't know
>> why you want to repeat it...
>
> Because it is still hidden in a while loop.
>
> You turned the alloc from a two level deep "while" loop to a one level
> one. I want it at zero level. alloc_skb(..., GFP_KERNEL) fails ?
> So let it be done (see patch in other message).
>

Why I didn't remove all the loops is already stated in the later patch,
you said you read it? I doubt. ;)


> [...]
>> Please don't expect me to fix many things in one patch, let's
>> fix each of them separately, agreed?
>
> I am not convinced that several patches are needed to get the whole
> picture right.
>

My guideline for stable fixes is one patch fixes one problem, maybe
not suitable to you I think. Let's agree to disagree. ;)


Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Cong Wang
On Fri, Jan 13, 2017 at 4:14 PM, Francois Romieu  wrote:
> Cong Wang  :
> [...]
>> If you can justify API is not broken by doing that, I am more than happy
>> to do it, as I already stated in the latter patch:
>>
>> "Of course, the logic itself is suspicious, other sendmsg()
>> could handle skb allocation failure very well, not sure
>> why ATM has to wait for a successful one here. But probably
>> it is too late to change since the errno and behavior is
>> visible to user-space. So just leave the logic as it is."
>>
>> For some reason, no one reads that patch. :-/
>
> Believe it or not but I actually read it.
>
> It changes the logic : the original code would have been unable to
> escape the while loop on memory failure. Fine, I don't mind the change.
> Actually I believe that these two patches are too shy (and backport
> unefficient). Instead of trying to reformulate why, here's what I have
> in mind. Uncompiled, caveat emptor, etc.

I just don't want to break things, that is it. If you can convince me your
change will not break any user-space application, again I am more
than just happy about it. My ATM knowledge is close to zero. ;)


Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Cong Wang
On Fri, Jan 13, 2017 at 3:54 PM, Chas Williams <3ch...@gmail.com> wrote:
> On Fri, 2017-01-13 at 10:20 -0800, Cong Wang wrote:
>> On Fri, Jan 13, 2017 at 9:10 AM, David Miller  wrote:
>> > From: Francois Romieu 
>> > Date: Fri, 13 Jan 2017 01:07:00 +0100
>> >
>> >> Were alloc_skb moved one level up in the call stack, there would be
>> >> no need to use the new wait api in the subsequent page, thus easing
>> >> pre 3.19 longterm kernel maintenance (at least those on korg page).
>> >>
>> >> But it tastes a tad bit too masochistic.
>> >
>> > Lack of error handling of allocation failure is always a huge red
>> > flag.  We even long ago tried to do something like this for TCP FIN
>> > handling.
>> >
>> > It's dumb, it doesn't work.
>> >
>> > Therefore I agree that the correct fix is to move the SKB allocation
>> > up one level to vcc_sendmsg() and make it handle errors properly.
>>
>> If you can justify API is not broken by doing that, I am more than happy
>> to do it, as I already stated in the latter patch:
>
> The man page for sendmsg() allows for ENOMEM.  See below.
>

Errno is just one part, you miss the behavior behind the logic.

>>
>> "Of course, the logic itself is suspicious, other sendmsg()
>> could handle skb allocation failure very well, not sure
>> why ATM has to wait for a successful one here. But probably
>> it is too late to change since the errno and behavior is
>> visible to user-space. So just leave the logic as it is."
>>
>> For some reason, no one reads that patch. :-/
>
> I read it and I agree.  I think it should be moved up/conflated with
> vcc_sendmsg().  vcc_sendmsg() can already return an errno for other
> conditions so if so has written something where they are explicitly
> not expecting a ENOMEM, we really can't help them.

Nope, the reason is never ENOMEM is expected or not. The current
_behavior_ behind this logic might be relied on by user-space.
The behavior here is, when allocation fails, kernel will retry under
certain circumstances, for example, if any fatal signal pending,
returns ERESTARTSYS, etc.. This is what I worry, not just ENOMEM
or not, which is too obvious.

Of course, I could be too conservative, I'd rather not to break things
for -stable at least.

Thanks.


Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Francois Romieu
Francois Romieu  :
[...]

Now with a proper error code. Have a nice night.

diff --git a/net/atm/common.c b/net/atm/common.c
index a3ca922..e20d040 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -62,21 +62,16 @@ static void vcc_remove_socket(struct sock *sk)
write_unlock_irq(_sklist_lock);
 }
 
-static struct sk_buff *alloc_tx(struct atm_vcc *vcc, unsigned int size)
+static bool vcc_tx_ready(struct atm_vcc *vcc, unsigned int size)
 {
-   struct sk_buff *skb;
struct sock *sk = sk_atm(vcc);
 
if (sk_wmem_alloc_get(sk) && !atm_may_send(vcc, size)) {
pr_debug("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n",
 sk_wmem_alloc_get(sk), size, sk->sk_sndbuf);
-   return NULL;
+   return false;
}
-   while (!(skb = alloc_skb(size, GFP_KERNEL)))
-   schedule();
-   pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
-   atomic_add(skb->truesize, >sk_wmem_alloc);
-   return skb;
+   return true;
 }
 
 static void vcc_sock_destruct(struct sock *sk)
@@ -606,7 +601,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, 
size_t size)
eff = (size+3) & ~3; /* align to word boundary */
prepare_to_wait(sk_sleep(sk), , TASK_INTERRUPTIBLE);
error = 0;
-   while (!(skb = alloc_tx(vcc, eff))) {
+   while (!vcc_tx_ready(vcc, eff)) {
if (m->msg_flags & MSG_DONTWAIT) {
error = -EAGAIN;
break;
@@ -628,6 +623,15 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, 
size_t size)
finish_wait(sk_sleep(sk), );
if (error)
goto out;
+
+   skb = alloc_skb(eff, GFP_KERNEL);
+   if (!skb) {
+   error = -ENOMEM;
+   goto out;
+   }
+   pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
+   atomic_add(skb->truesize, >sk_wmem_alloc);
+
skb->dev = NULL; /* for paths shared with net_device interfaces */
ATM_SKB(skb)->atm_options = vcc->atm_options;
if (!copy_from_iter_full(skb_put(skb, size), size, >msg_iter)) {


Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Francois Romieu
Cong Wang  :
> On Fri, Jan 13, 2017 at 5:23 AM, Francois Romieu  wrote:
[...]
> > alloc_skb() does not need to be in the "while" loop.
> 
> This is exactly what I describe in my changelog, don't know
> why you want to repeat it...

Because it is still hidden in a while loop. 

You turned the alloc from a two level deep "while" loop to a one level
one. I want it at zero level. alloc_skb(..., GFP_KERNEL) fails ?
So let it be done (see patch in other message).

[...]
> Please don't expect me to fix many things in one patch, let's
> fix each of them separately, agreed?

I am not convinced that several patches are needed to get the whole
picture right.

-- 
Ueimor


Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Francois Romieu
Cong Wang  :
[...]
> If you can justify API is not broken by doing that, I am more than happy
> to do it, as I already stated in the latter patch:
> 
> "Of course, the logic itself is suspicious, other sendmsg()
> could handle skb allocation failure very well, not sure
> why ATM has to wait for a successful one here. But probably
> it is too late to change since the errno and behavior is
> visible to user-space. So just leave the logic as it is."
> 
> For some reason, no one reads that patch. :-/

Believe it or not but I actually read it.

It changes the logic : the original code would have been unable to
escape the while loop on memory failure. Fine, I don't mind the change.
Actually I believe that these two patches are too shy (and backport
unefficient). Instead of trying to reformulate why, here's what I have
in mind. Uncompiled, caveat emptor, etc.

I'll do a (slow) build and test on saturday's night with a pair of
iphase 5575.

diff --git a/net/atm/common.c b/net/atm/common.c
index a3ca922..67f76f3 100644
--- a/net/atm/common.c
+++ b/net/atm/common.c
@@ -62,21 +62,16 @@ static void vcc_remove_socket(struct sock *sk)
write_unlock_irq(_sklist_lock);
 }
 
-static struct sk_buff *alloc_tx(struct atm_vcc *vcc, unsigned int size)
+static bool vcc_tx_ready(struct atm_vcc *vcc, unsigned int size)
 {
-   struct sk_buff *skb;
struct sock *sk = sk_atm(vcc);
 
if (sk_wmem_alloc_get(sk) && !atm_may_send(vcc, size)) {
pr_debug("Sorry: wmem_alloc = %d, size = %d, sndbuf = %d\n",
 sk_wmem_alloc_get(sk), size, sk->sk_sndbuf);
-   return NULL;
+   return false;
}
-   while (!(skb = alloc_skb(size, GFP_KERNEL)))
-   schedule();
-   pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
-   atomic_add(skb->truesize, >sk_wmem_alloc);
-   return skb;
+   return true;
 }
 
 static void vcc_sock_destruct(struct sock *sk)
@@ -606,7 +601,7 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, 
size_t size)
eff = (size+3) & ~3; /* align to word boundary */
prepare_to_wait(sk_sleep(sk), , TASK_INTERRUPTIBLE);
error = 0;
-   while (!(skb = alloc_tx(vcc, eff))) {
+   while (!vcc_tx_ready(vcc, eff)) {
if (m->msg_flags & MSG_DONTWAIT) {
error = -EAGAIN;
break;
@@ -628,6 +623,13 @@ int vcc_sendmsg(struct socket *sock, struct msghdr *m, 
size_t size)
finish_wait(sk_sleep(sk), );
if (error)
goto out;
+
+   skb = alloc_skb(eff, GFP_KERNEL);
+   if (!skb)
+   goto out;
+   pr_debug("%d += %d\n", sk_wmem_alloc_get(sk), skb->truesize);
+   atomic_add(skb->truesize, >sk_wmem_alloc);
+
skb->dev = NULL; /* for paths shared with net_device interfaces */
ATM_SKB(skb)->atm_options = vcc->atm_options;
if (!copy_from_iter_full(skb_put(skb, size), size, >msg_iter)) {

-- 
Ueimor


Re: [PATCH v2 net-next] Introduce a sysctl that modifies the value of PROT_SOCK.

2017-01-13 Thread Krister Johansen
On Thu, Jan 12, 2017 at 09:22:13AM -0500, David Miller wrote:
> From: Krister Johansen 
> > The use case for this change is to allow containerized processes to bind
> > to priviliged ports, but prevent them from ever being allowed to modify
> > their container's network configuration.  The latter is accomplished by
> > ensuring that the network namespace is not a child of the user
> > namespace.  This modification was needed to allow the container manager
> > to disable a namespace's priviliged port restrictions without exposing
> > control of the network namespace to processes in the user namespace.
> 
> This is what CAP_NET_BIND_SERVICE is for, and why it is a separate
> network privilege, please use it.

It sounds like I may have done an inadequate job of explaining why I
took this approach instead of going the CAP_NET_BIND_SERVICE route.

In this scenario, the network namespace is created and configured first.
Then the containerized processed get placed into a separate user
namespace.  This is so that the processes in the container, even if they
somehow manage to obtain extra privilege in the userns, can never modify
the network namespace.

The check in ns_capable() is looking at the priviliges of the user
namespace that created the netns and its parents.  Even if I were to
grant a process in the container CAP_NET_BIND_SERVICE, ns_capable()
wouldn't recognize that as being a valid privilige for the netns.

If I were to invert the order of operations and create the userns before
the netns, then the capability would be recognized.  However, that also
allows any potential privilege escalation in the userns to bring with it
the potential that an attacker can modify the container's network
configuration.

I'd much rather run the containers without privs, and without the userns
having rights to the netns, to mitigate the risk of an attacker being
able to alter the container's networking configuration.

-K


Re: [PATCH v2 net-next] Introduce a sysctl that modifies the value of PROT_SOCK.

2017-01-13 Thread Krister Johansen
On Thu, Jan 12, 2017 at 06:39:57AM -0800, Eric Dumazet wrote:
> On Wed, 2017-01-11 at 22:52 -0800, Krister Johansen wrote:
> > Add net.ipv4.ip_unprotected_port_start, which is a per namespace sysctl
> > that denotes the first unprotected inet port in the namespace.  To
> > disable all protected ports set this to zero.  It also checks for
> > overlap with the local port range.  The protected and local range may
> > not overlap.
> > 
> > The use case for this change is to allow containerized processes to bind
> > to priviliged ports, but prevent them from ever being allowed to modify
> > their container's network configuration.  The latter is accomplished by
> > ensuring that the network namespace is not a child of the user
> > namespace.  This modification was needed to allow the container manager
> > to disable a namespace's priviliged port restrictions without exposing
> > control of the network namespace to processes in the user namespace.
> > 
> > Signed-off-by: Krister Johansen 
> > ---
> >  include/net/ip.h   | 10 +
> >  include/net/netns/ipv4.h   |  1 +
> >  net/ipv4/af_inet.c |  5 -
> >  net/ipv4/sysctl_net_ipv4.c | 50 
> > +-
> >  net/ipv6/af_inet6.c|  3 ++-
> >  net/netfilter/ipvs/ip_vs_ctl.c |  7 +++---
> >  net/sctp/socket.c  | 10 +
> >  security/selinux/hooks.c   |  3 ++-
> 
> Adding a new sysctl without documentation is generally not accepted.
> 
> Please take a look at Documentation/networking/ip-sysctl.txt

Thanks for catching this.  I'll add an entry to the documentation.

> BTW, sticking to 'unprivileged' ports might be better than 'unprotected'
> which is vague.

I don't have a strong preference about the naming.  I'd be happy to
change it to 'unprivileged' instead.

-K


Re: [PATCH net] bpf: rework prog_digest into prog_tag

2017-01-13 Thread Daniel Borkmann

On 01/14/2017 12:49 AM, Andy Lutomirski wrote:

On Fri, Jan 13, 2017 at 3:41 PM, Daniel Borkmann  wrote:

On 01/14/2017 12:16 AM, Andy Lutomirski wrote:

On Fri, Jan 13, 2017 at 2:38 PM, Daniel Borkmann 
wrote:


Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via
fdinfo/netlink") was recently discussed, partially due to
admittedly suboptimal name of "prog_digest" in combination
with sha1 hash usage, thus inevitably and rightfully concerns
about its security in terms of collision resistance were
raised with regards to use-cases.


Seems reasonable.  My only question is whether you'd still want to
switch to SHA-256 just from a code cleanliness perspective.  With
SHA-256 you can use the easy streaming API I wrote, but with SHA-1
you're still stuck with the crappy API in lib/, and I'm not
volunteering to fix up the SHA-1 API.


We'd need to truncate that in kernel anyway to not get a too long
tag, so given that I'm actually fine with it as-is. I was planning
to submit the code for testing to bpf selftests for net-next once
it's merged back, too.


Unless you want to kill off that vmalloc()+vfree() pair...


That is really just in slow-path, and should that become a bottleneck
compared to the rest of the verification steps or allocs we do there,
then we can always clean it up in net-next.

Thanks,
Daniel


Re: [net PATCH v3 2/5] net: virtio: wrap rtnl_lock in test for calling with lock already held

2017-01-13 Thread John Fastabend
On 17-01-13 09:31 AM, John Fastabend wrote:
> On 17-01-13 08:34 AM, Stephen Hemminger wrote:
>> On Thu, 12 Jan 2017 18:51:00 -0800
>> John Fastabend  wrote:
>>
>>>  
>>> -static void free_receive_bufs(struct virtnet_info *vi)
>>> +static void free_receive_bufs(struct virtnet_info *vi, bool need_lock)
>>>  {
>>> struct bpf_prog *old_prog;
>>> int i;
>>>  
>>> -   rtnl_lock();
>>> +   if (need_lock)
>>> +   rtnl_lock();
>>> for (i = 0; i < vi->max_queue_pairs; i++) {
>>> while (vi->rq[i].pages)
>>> __free_pages(get_a_page(>rq[i], GFP_KERNEL), 0);
>>> @@ -1879,7 +1880,8 @@ static void free_receive_bufs(struct virtnet_info *vi)
>>> if (old_prog)
>>> bpf_prog_put(old_prog);
>>> }
>>> -   rtnl_unlock();
>>> +   if (need_lock)
>>> +   rtnl_unlock();
>>>  }
>>
>> Conditional locking is bad idea; sparse complains about it and is later 
>> source
>> of bugs. The more typical way of doing this in kernel is:
> 
> OK I'll use the normal form.
> 
>>
>> void _foo(some args)
>> {
>>  ASSERT_RTNL();
>>
>>  ...
>> }
>>
>> void foo(some args)
>> {
>>  rtnl_lock();
>>  _foo(some args)
>>  rtnl_unlock();
>> }
>>
>>
> 

Actually doing this without a rtnl_try_lock() is going to create two more
callbacks in virtio core just for virtio_net. All the other users do not
appear to have locking restrictions. How about the following it at least
helps in that there is no argument passing and if/else on the locks itself
but does use the if around rtnl_try_lock().

--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1864,12 +1864,11 @@ static void virtnet_free_queues(struct virtnet_info *vi)
kfree(vi->sq);
 }

-static void free_receive_bufs(struct virtnet_info *vi)
+static void _free_receive_bufs(struct virtnet_info *vi)
 {
struct bpf_prog *old_prog;
int i;

-   rtnl_lock();
for (i = 0; i < vi->max_queue_pairs; i++) {
while (vi->rq[i].pages)
__free_pages(get_a_page(>rq[i], GFP_KERNEL), 0);
@@ -1879,6 +1878,12 @@ static void free_receive_bufs(struct virtnet_info *vi)
if (old_prog)
bpf_prog_put(old_prog);
}
+}
+
+static void free_receive_bufs(struct virtnet_info *vi)
+{
+   rtnl_lock();
+   _free_receive_bufs(vi);
rtnl_unlock();
 }

@@ -2358,7 +2363,10 @@ static void remove_vq_common(struct virtnet_info *vi)
/* Free unused buffers in both send and recv, if any. */
free_unused_bufs(vi);

-   free_receive_bufs(vi);
+   if (rtnl_is_locked());
+   _free_receive_bufs(vi);
+   else
+   free_receive_bufs(vi);

free_receive_page_frags(vi);



Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Chas Williams
On Fri, 2017-01-13 at 10:20 -0800, Cong Wang wrote:
> On Fri, Jan 13, 2017 at 9:10 AM, David Miller  wrote:
> > From: Francois Romieu 
> > Date: Fri, 13 Jan 2017 01:07:00 +0100
> >
> >> Were alloc_skb moved one level up in the call stack, there would be
> >> no need to use the new wait api in the subsequent page, thus easing
> >> pre 3.19 longterm kernel maintenance (at least those on korg page).
> >>
> >> But it tastes a tad bit too masochistic.
> >
> > Lack of error handling of allocation failure is always a huge red
> > flag.  We even long ago tried to do something like this for TCP FIN
> > handling.
> >
> > It's dumb, it doesn't work.
> >
> > Therefore I agree that the correct fix is to move the SKB allocation
> > up one level to vcc_sendmsg() and make it handle errors properly.
> 
> If you can justify API is not broken by doing that, I am more than happy
> to do it, as I already stated in the latter patch:

The man page for sendmsg() allows for ENOMEM.  See below.

> 
> "Of course, the logic itself is suspicious, other sendmsg()
> could handle skb allocation failure very well, not sure
> why ATM has to wait for a successful one here. But probably
> it is too late to change since the errno and behavior is
> visible to user-space. So just leave the logic as it is."
> 
> For some reason, no one reads that patch. :-/

I read it and I agree.  I think it should be moved up/conflated with
vcc_sendmsg().  vcc_sendmsg() can already return an errno for other
conditions so if so has written something where they are explicitly
not expecting a ENOMEM, we really can't help them.

I would certainly prefer to not have to resort to an atomic allocation.
That's just going to make matters worse as far as similarity to the
existing API.

So, as Francois has suggested, just wait for the atm socket to
drain, and then do the allocation after the wait is finished.


[PATCH net-next] net: ipv6: Allow shorthand delete of all nexthops in multipath route

2017-01-13 Thread David Ahern
IPv4 allows multipath routes to be deleted using just the prefix and
length. For example:
$ ip ro ls vrf red
unreachable default metric 8192
1.1.1.0/24
nexthop via 10.100.1.254  dev eth1 weight 1
nexthop via 10.11.200.2  dev eth11.200 weight 1
10.11.200.0/24 dev eth11.200 proto kernel scope link src 10.11.200.3
10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.3

$ ip ro del 1.1.1.0/24 vrf red

$ ip ro ls vrf red
unreachable default metric 8192
10.11.200.0/24 dev eth11.200 proto kernel scope link src 10.11.200.3
10.100.1.0/24 dev eth1 proto kernel scope link src 10.100.1.3

The same notation does not work with IPv6 because of how multipath routes
are implemented for IPv6. For IPv6 only the first nexthop of a multipath
route is deleted if the request contains only a prefix and length. This
leads to unnecessary complexity in userspace dealing with IPv6 multipath
routes.

This patch allows all nexthops to be deleted without specifying each one
in the delete request by passing a new flag, RTM_F_ALL_NEXTHOPS, in
rtm_flags.

With this patch (and an updated iproute2 command):
$  ip -6 ro ls vrf red
::/120 via 2100:1::62 dev eth1 metric 1024  pref medium
::/120 via 2100:1::61 dev eth1 metric 1024  pref medium
::/120 via 2100:1::60 dev eth1 metric 1024  pref medium
2100:1::/120 dev eth1 proto kernel metric 256  pref medium
2100:1::/64 dev eth1 proto kernel metric 256  expires 86386sec pref medium
...

$ ip -6 ro del vrf red ::1/120
$ ip -6 ro ls vrf red
2100:1::/120 dev eth1 proto kernel metric 256  pref medium
2100:1::/64 dev eth1 proto kernel metric 256  expires 86382sec pref medium
...

The flag is added to fib6_config by converting fc_type to a u16 (as
noted fc_type only uses 8 bits). The new u16 hole is a bitmap with
fc_delete_all_nexthop as the first bit.

Suggested-by: Dinesh Dutt 
Signed-off-by: David Ahern 
---
 include/net/ip6_fib.h  |  4 +++-
 include/uapi/linux/rtnetlink.h |  1 +
 net/ipv6/route.c   | 10 +-
 3 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index a74e2aa40ef4..11ab99e87c5f 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -37,7 +37,9 @@ struct fib6_config {
int fc_ifindex;
u32 fc_flags;
u32 fc_protocol;
-   u32 fc_type;/* only 8 bits are used */
+   u16 fc_type;/* only 8 bits are used */
+   u16 fc_delete_all_nexthop : 1,
+   __unused : 15;
 
struct in6_addr fc_dst;
struct in6_addr fc_src;
diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h
index 8c93ad1ef9ab..7fb206bc42f9 100644
--- a/include/uapi/linux/rtnetlink.h
+++ b/include/uapi/linux/rtnetlink.h
@@ -276,6 +276,7 @@ enum rt_scope_t {
 #define RTM_F_EQUALIZE 0x400   /* Multipath equalizer: NI  */
 #define RTM_F_PREFIX   0x800   /* Prefix addresses */
 #define RTM_F_LOOKUP_TABLE 0x1000  /* set rtm_table to FIB lookup result */
+#define RTM_F_ALL_NEXTHOPS 0x2000  /* delete all nexthops (IPv6) */
 
 /* Reserved table identifiers */
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ce5aaf448c54..8bb5f6a35ba8 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -2154,6 +2154,7 @@ static int ip6_route_del(struct fib6_config *cfg)
if (!table)
return err;
 
+again:
read_lock_bh(>tb6_lock);
 
fn = fib6_locate(>tb6_root,
@@ -2179,7 +2180,11 @@ static int ip6_route_del(struct fib6_config *cfg)
dst_hold(>dst);
read_unlock_bh(>tb6_lock);
 
-   return __ip6_del_rt(rt, >fc_nlinfo);
+   err = __ip6_del_rt(rt, >fc_nlinfo);
+   if (err || !cfg->fc_delete_all_nexthop)
+   return err;
+
+   goto again;
}
}
read_unlock_bh(>tb6_lock);
@@ -2849,6 +2854,9 @@ static int rtm_to_fib6_config(struct sk_buff *skb, struct 
nlmsghdr *nlh,
if (rtm->rtm_flags & RTM_F_CLONED)
cfg->fc_flags |= RTF_CACHE;
 
+   if (rtm->rtm_flags & RTM_F_ALL_NEXTHOPS)
+   cfg->fc_delete_all_nexthop = 1;
+
cfg->fc_nlinfo.portid = NETLINK_CB(skb).portid;
cfg->fc_nlinfo.nlh = nlh;
cfg->fc_nlinfo.nl_net = sock_net(skb->sk);
-- 
2.1.4



Re: [PATCH net] bpf: rework prog_digest into prog_tag

2017-01-13 Thread Andy Lutomirski
On Fri, Jan 13, 2017 at 3:41 PM, Daniel Borkmann  wrote:
> On 01/14/2017 12:16 AM, Andy Lutomirski wrote:
>>
>> On Fri, Jan 13, 2017 at 2:38 PM, Daniel Borkmann 
>> wrote:
>>>
>>> Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via
>>> fdinfo/netlink") was recently discussed, partially due to
>>> admittedly suboptimal name of "prog_digest" in combination
>>> with sha1 hash usage, thus inevitably and rightfully concerns
>>> about its security in terms of collision resistance were
>>> raised with regards to use-cases.
>>
>>
>> Seems reasonable.  My only question is whether you'd still want to
>> switch to SHA-256 just from a code cleanliness perspective.  With
>> SHA-256 you can use the easy streaming API I wrote, but with SHA-1
>> you're still stuck with the crappy API in lib/, and I'm not
>> volunteering to fix up the SHA-1 API.
>
>
> We'd need to truncate that in kernel anyway to not get a too long
> tag, so given that I'm actually fine with it as-is. I was planning
> to submit the code for testing to bpf selftests for net-next once
> it's merged back, too.

Unless you want to kill off that vmalloc()+vfree() pair...

--Andy


[PATCH next] ipvlan: fix dev_id creation corner case.

2017-01-13 Thread Mahesh Bandewar
From: Mahesh Bandewar 

In the last patch da36e13cf65 ("ipvlan: improvise dev_id generation
logic in IPvlan") I missed some part of Dave's suggestion and because
of that the dev_id creation could fail in a corner case scenario. This
would happen when more or less 64k devices have been already created and
several have been deleted. If the devices that are still sticking around
are the last n bits from the bitmap. So in this scenario even if lower
bits are available, the dev_id search is so narrow that it always fails.

Fixes: da36e13cf65 ("ipvlan: improvise dev_id generation logic in IPvlan")
CC: David Miller 
CC: Eric Dumazet 
Signed-off-by: Mahesh Bandewar 
---
 drivers/net/ipvlan/ipvlan_main.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 92b221a03350..b5c390f0f2b3 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -550,6 +550,9 @@ static int ipvlan_link_new(struct net *src_net, struct 
net_device *dev,
err = ida_simple_get(>ida, port->dev_id_start, 0xFFFE,
 GFP_KERNEL);
if (err < 0)
+   err = ida_simple_get(>ida, 0x1, port->dev_id_start,
+GFP_KERNEL);
+   if (err < 0)
goto destroy_ipvlan_port;
dev->dev_id = err;
/* Increment id-base to the next slot for the future assignment */
-- 
2.11.0.483.g087da7b7c-goog



Re: [PATCH net] bpf: rework prog_digest into prog_tag

2017-01-13 Thread Daniel Borkmann

On 01/14/2017 12:16 AM, Andy Lutomirski wrote:

On Fri, Jan 13, 2017 at 2:38 PM, Daniel Borkmann  wrote:

Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via
fdinfo/netlink") was recently discussed, partially due to
admittedly suboptimal name of "prog_digest" in combination
with sha1 hash usage, thus inevitably and rightfully concerns
about its security in terms of collision resistance were
raised with regards to use-cases.


Seems reasonable.  My only question is whether you'd still want to
switch to SHA-256 just from a code cleanliness perspective.  With
SHA-256 you can use the easy streaming API I wrote, but with SHA-1
you're still stuck with the crappy API in lib/, and I'm not
volunteering to fix up the SHA-1 API.


We'd need to truncate that in kernel anyway to not get a too long
tag, so given that I'm actually fine with it as-is. I was planning
to submit the code for testing to bpf selftests for net-next once
it's merged back, too.

Thanks,
Daniel


Re: [PATCH net] mlx4: do not call napi_schedule() without care

2017-01-13 Thread Eric Dumazet
On Fri, 2017-01-13 at 15:07 -0800, Alexander Duyck wrote:
> On Fri, Jan 13, 2017 at 8:39 AM, Eric Dumazet  wrote:
> > From: Eric Dumazet 
> >
> > Disable BH around the call to napi_schedule() to avoid following warning
> >
> > [   52.095499] NOHZ: local_softirq_pending 08
> > [   52.421291] NOHZ: local_softirq_pending 08
> > [   52.608313] NOHZ: local_softirq_pending 08
> >
> > Fixes: 8d59de8f7bb3 ("net/mlx4_en: Process all completions in RX rings 
> > after port goes up")
> > Signed-off-by: Eric Dumazet 
> > Cc: Erez Shitrit 
> > Cc: Eugenia Emantayev 
> > Cc: Tariq Toukan 
> > ---
> >  drivers/net/ethernet/mellanox/mlx4/en_netdev.c |5 -
> >  1 file changed, 4 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
> > b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> > index 
> > 4910d9af19335d4b97d39760c163b41eecc26242..761f8b12399cab245abccc0f7d7f84fde742c14d
> >  100644
> > --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> > +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> > @@ -1748,8 +1748,11 @@ int mlx4_en_start_port(struct net_device *dev)
> > /* Process all completions if exist to prevent
> >  * the queues freezing if they are full
> >  */
> > -   for (i = 0; i < priv->rx_ring_num; i++)
> > +   for (i = 0; i < priv->rx_ring_num; i++) {
> > +   local_bh_disable();
> > napi_schedule(>rx_cq[i]->napi);
> > +   local_bh_enable();
> > +   }
> 
> Couldn't you save yourself a ton of trouble by wrapping the loop
> inside of the local_bh_disable/enable instead of wrapping them up
> inside the loop?  It just seems like it might be more efficient to
> schedule them and then process them as a block instead of doing it one
> at a time.

What kind of troubles ?

Given the problem might be happening under flood, I believe it is much
safer to do as I did.

Otherwise, we will have to process a ton of messages at the
local_bh_enable() time and lock the {softirq}IRQ on one cpu.

I chose to do this on purpose.

Batching can be dangerous, and this is exactly the point we do not want
batching, with say 64 queues.

This code is driver starts, hardly fast path.




Re: [PATCH net] bpf: rework prog_digest into prog_tag

2017-01-13 Thread Alexei Starovoitov
On Fri, Jan 13, 2017 at 03:16:44PM -0800, Andy Lutomirski wrote:
> On Fri, Jan 13, 2017 at 2:38 PM, Daniel Borkmann  wrote:
> > Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via
> > fdinfo/netlink") was recently discussed, partially due to
> > admittedly suboptimal name of "prog_digest" in combination
> > with sha1 hash usage, thus inevitably and rightfully concerns
> > about its security in terms of collision resistance were
> > raised with regards to use-cases.
> >
> 
> Seems reasonable.  My only question is whether you'd still want to
> switch to SHA-256 just from a code cleanliness perspective.  With
> SHA-256 you can use the easy streaming API I wrote, but with SHA-1
> you're still stuck with the crappy API in lib/, and I'm not
> volunteering to fix up the SHA-1 API.

No. As was stated many times before there are only negatives
in switching to sha256.



Re: [PATCH net] bpf: rework prog_digest into prog_tag

2017-01-13 Thread Andy Lutomirski
On Fri, Jan 13, 2017 at 2:38 PM, Daniel Borkmann  wrote:
> Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via
> fdinfo/netlink") was recently discussed, partially due to
> admittedly suboptimal name of "prog_digest" in combination
> with sha1 hash usage, thus inevitably and rightfully concerns
> about its security in terms of collision resistance were
> raised with regards to use-cases.
>

Seems reasonable.  My only question is whether you'd still want to
switch to SHA-256 just from a code cleanliness perspective.  With
SHA-256 you can use the easy streaming API I wrote, but with SHA-1
you're still stuck with the crappy API in lib/, and I'm not
volunteering to fix up the SHA-1 API.

--Andy


Re: [PATCH v2 7/8] net: Rename TCA*BPF_DIGEST to ..._SHA256

2017-01-13 Thread Daniel Borkmann

On 01/11/2017 07:19 PM, Andy Lutomirski wrote:

On Wed, Jan 11, 2017 at 1:09 AM, Daniel Borkmann  wrote:

[...]

Ok. Sleeping over this a bit, how about a general rename into
"prog_tag" for fdinfo and TCA_BPF_TAG resp. TCA_ACT_BPF_TAG for
the netlink attributes, fwiw, it might reduce any assumptions on
this being made? If this would be preferable, I could cook that
patch against -net for renaming it?


That would be fine with me.

I think there are two reasonable approaches to computing the actual tag.

1. Use a standard, modern cryptographic hash.  SHA-256, SHA-512,
Blake2b, whatever.  SHA-1 is a bad choice in part because it's partly
broken and in part because the implementation in lib/ is a real mess
to use (as you noticed while writing the code).

2. Use whatever algorithm you like but make the tag so short that it's
obviously not collision-free.  48 or 64 bits is probably reasonable.

The intermediate versions are just asking for trouble.


Yeah agree, I've just sent a patch to rework this a bit and it got
also reasonably small for net. Cleanups, if needed, can be done in
net-next once that's pulled into it.

Thanks,
Daniel


Re: [pull request][for-next] Mellanox mlx5 Reorganize core driver directory layout

2017-01-13 Thread Tom Herbert
On Fri, Jan 13, 2017 at 2:59 PM, Rick Jones  wrote:
> On 01/13/2017 02:56 PM, Tom Herbert wrote:
>>
>> On Fri, Jan 13, 2017 at 2:45 PM, Saeed Mahameed
>>>
>>> what configuration are you running ? what traffic ?
>>>
>> Nothing fancy. 8 queues and 20 concurrent netperf TCP_STREAMs trips
>> it. Not a lot of them, but I don't think we really should ever see
>> these errors.
>
>
> Straight-up defaults with netperf, or do you use specific -s/S or -m/M
> options?
>
./super_netperf_tput 20 -H test001 -l 100 -t TCP_STREAM

> happy benchmarking,
>
> rick jones
>


Re: [PATCH net] mlx4: do not call napi_schedule() without care

2017-01-13 Thread Alexander Duyck
On Fri, Jan 13, 2017 at 8:39 AM, Eric Dumazet  wrote:
> From: Eric Dumazet 
>
> Disable BH around the call to napi_schedule() to avoid following warning
>
> [   52.095499] NOHZ: local_softirq_pending 08
> [   52.421291] NOHZ: local_softirq_pending 08
> [   52.608313] NOHZ: local_softirq_pending 08
>
> Fixes: 8d59de8f7bb3 ("net/mlx4_en: Process all completions in RX rings after 
> port goes up")
> Signed-off-by: Eric Dumazet 
> Cc: Erez Shitrit 
> Cc: Eugenia Emantayev 
> Cc: Tariq Toukan 
> ---
>  drivers/net/ethernet/mellanox/mlx4/en_netdev.c |5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
> b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> index 
> 4910d9af19335d4b97d39760c163b41eecc26242..761f8b12399cab245abccc0f7d7f84fde742c14d
>  100644
> --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
> @@ -1748,8 +1748,11 @@ int mlx4_en_start_port(struct net_device *dev)
> /* Process all completions if exist to prevent
>  * the queues freezing if they are full
>  */
> -   for (i = 0; i < priv->rx_ring_num; i++)
> +   for (i = 0; i < priv->rx_ring_num; i++) {
> +   local_bh_disable();
> napi_schedule(>rx_cq[i]->napi);
> +   local_bh_enable();
> +   }

Couldn't you save yourself a ton of trouble by wrapping the loop
inside of the local_bh_disable/enable instead of wrapping them up
inside the loop?  It just seems like it might be more efficient to
schedule them and then process them as a block instead of doing it one
at a time.

- Alex


Re: [pull request][for-next] Mellanox mlx5 Reorganize core driver directory layout

2017-01-13 Thread Rick Jones

On 01/13/2017 02:56 PM, Tom Herbert wrote:

On Fri, Jan 13, 2017 at 2:45 PM, Saeed Mahameed

what configuration are you running ? what traffic ?


Nothing fancy. 8 queues and 20 concurrent netperf TCP_STREAMs trips
it. Not a lot of them, but I don't think we really should ever see
these errors.


Straight-up defaults with netperf, or do you use specific -s/S or -m/M 
options?


happy benchmarking,

rick jones



Re: [pull request][for-next] Mellanox mlx5 Reorganize core driver directory layout

2017-01-13 Thread Tom Herbert
On Fri, Jan 13, 2017 at 2:45 PM, Saeed Mahameed
 wrote:
> On Sat, Jan 14, 2017 at 12:06 AM, Tom Herbert  wrote:
>> On Fri, Jan 13, 2017 at 12:29 PM, Leon Romanovsky  wrote:
>>> On Fri, Jan 13, 2017 at 12:14:07PM -0500, David Miller wrote:
 From: Saeed Mahameed 
 Date: Thu, 12 Jan 2017 19:22:34 +0200

 > This pull request includes one patch from Leon, this patch as described
 > below will change the driver directory structure and layout for better,
 > logical and modular driver files separation.
 >
 > This change is important to both rdma and net maintainers in order to
 > have smoother management of driver patches for different mlx5 sub modules
 > and smoother rdma-next vs. net-next features submissions.
 >
 > Please find more info below -in the tag commit message-,
 > review and let us know if there's any problem.
 >
 > This change doesn't introduce any conflicts with the current mlx5
 > fixes and cleanups posted on 2017-01-10 to net branch, and merge tests
 > worked flawlessly with no issues.
 >
 > This is the last pull request meant for both rdma-next and net-next.
 > Once pulled, this will be the base shared code for both trees.

 This is pretty crazy, it will make all bug fix backporting to -stable
 a complete nightmare for myself, Doug, various distribution maintainers
 and many other people who quietly have to maintain their own trees and
 do backporting.
>>>
>>> Hi Dave,
>>>
>>> I understand your worries, but our case is similar to various other
>>> drivers, for example hfi1 which was in staging for years while
>>> supported in RedHat and moved from there to IB. The Chelsio drivers did
>>> similar reorg in 2016 (drivers/net/ethernet/chelsio/libcxgb) while their
>>> drivers were in the tree for long time before.
>>>
>>> Additionally, Doug doesn't need to maintain -stable queue and it is done
>>> by relevant submaintainers who are adding stable tags by themselves. In
>>> the IB case, the burden will continue to be on me and not on Doug.
>>>
>> Recently I had to backport the mlx5 driver from 4.9 to 4.6 in order to
>> get support for XDP. The biggest issue I faced was the lack of
>> modularity in the many driver features that are now supported. The
>> problem with backporting these new features is the spider web of
>> dependencies that they bring in from the rest of the kernel. I ended
>> up taking out en_rep, vxlan, en_tc, eswitch, and dcbnl. The result was
>> ~340 patches which is still a lot but at least this was constrained to
>> patches in the mlx5 directories and are relevant to what we want to
>> do.
>>
>> In lieu of restructuring the directories, I would much rather see more
>> config options so that we can build drivers that don't unnecessarily
>> complicate our lives with features we don't use. This is not just true
>> for Mellanox, but I would say it would be true of any driver that
>> someone is trying to deploy and maintain at large scale.
>>
>
> I think we should have both, if the restructuring made right,
> new whole features (e.g eswitch and eswitch offlaods or any independent 
> module),
> can sit in their own directory and keep their own logic concentrated
> in one place, and only touch the
> main driver code with simple entry points in the main flow,  this way
> you can simply compile their whole directories
> out with a config flag directly from the Makefile.
>
>> Btw, we did hit one issue in the backport. We started to get rx csum
>> faults (checksum complete value indicates TCP checksum is bad, but
>> host computation says checksum is good). I ran against 4.9 upstream
>> kernel and do see these, however don't see them in 4.10. I haven't
>> bisected yet. Is this a known issue?
>>
>
> Not to me, I don't recall any csum related fixes or feature submitted
> lately to mlx5,
> Maybe something changed in the stack ?
>
> what configuration are you running ? what traffic ?
>
Nothing fancy. 8 queues and 20 concurrent netperf TCP_STREAMs trips
it. Not a lot of them, but I don't think we really should ever see
these errors.

Tom

>> Thanks,
>> Tom
>>

 I really don't think you can justify this rearrangement based upon the
 consequences and how much activity happens in this driver.

 You should have thought long and hard about the layout a long time ago
 rather than after the driver has been in the tree for many years.

 Sorry.
>> --
>> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
>> the body of a message to majord...@vger.kernel.org
>> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 2/3] powerpc: bpf: flush the entire JIT buffer

2017-01-13 Thread Daniel Borkmann

On 01/13/2017 06:10 PM, Naveen N. Rao wrote:

With bpf_jit_binary_alloc(), we allocate at a page granularity and fill
the rest of the space with illegal instructions to mitigate BPF spraying
attacks, while having the actual JIT'ed BPF program at a random location
within the allocated space. Under this scenario, it would be better to
flush the entire allocated buffer rather than just the part containing
the actual program. We already flush the buffer from start to the end of
the BPF program. Extend this to include the illegal instructions after
the BPF program.

Signed-off-by: Naveen N. Rao 


Acked-by: Daniel Borkmann 


Re: [PATCH net-next 2/2] mpls: Packet stats

2017-01-13 Thread Roopa Prabhu
On 1/13/17, 10:14 AM, Robert Shearman wrote:
> Having MPLS packet stats is useful for observing network operation and
> for diagnosing network problems. In the absence of anything better,
> RFC2863 and RFC3813 are used for guidance for which stats to expose
> and the semantics of them. In particular rx_noroutes maps to in
> unknown protos in RFC2863. The stats are exposed to userspace via
> AF_MPLS attributes embedded in the IFLA_STATS_AF_SPEC attribute of
> RTM_GETSTATS messages.
>
> All the introduced fields are 64-bit, even error ones, to ensure no
> overflow with long uptimes. Per-CPU counters are used to avoid
> cache-line contention on the commonly used fields. The other fields
> have also been made per-CPU for code to avoid performance problems in
> error conditions on the assumption that on some platforms the cost of
> atomic operations could be more expensive than sending the packet
> (which is what would be done in the success case). If that's not the
> case, we could instead not use per-CPU counters for these fields.
>
> Only unicast and non-fragment are exposed at the moment, but other
> counters can be exposed in the future either by adding to the end of
> struct mpls_link_stats or by additional netlink attributes in the
> AF_MPLS IFLA_STATS_AF_SPEC nested attribute.
>
> Signed-off-by: Robert Shearman 
> ---
>
Acked-by: Roopa Prabhu 


Re: [PATCH net-next 1/2] net: AF-specific RTM_GETSTATS attributes

2017-01-13 Thread Roopa Prabhu
On 1/13/17, 10:14 AM, Robert Shearman wrote:
> Add the functionality for including address-family-specific per-link
> stats in RTM_GETSTATS messages. This is done through adding a new
> IFLA_STATS_AF_SPEC attribute under which address family attributes are
> nested and then the AF-specific attributes can be further nested. This
> follows the model of IFLA_AF_SPEC on RTM_*LINK messages and it has the
> advantage of presenting an easily extended hierarchy. The rtnl_af_ops
> structure is extended to provide AFs with the opportunity to fill and
> provide the size of their stats attributes.
>
> One alternative would have been to provide AFs with the ability to add
> attributes directly into the RTM_GETSTATS message without a nested
> hierarchy. I discounted this approach as it increases the rate at
> which the 32 attribute number space is used up and it makes
> implementation a little more tricky for stats dump resuming (at the
> moment the order in which attributes are added to the message has to
> match the numeric order of the attributes).
>
> Another alternative would have been to register per-AF RTM_GETSTATS
> handlers. I discounted this approach as I perceived a common use-case
> to be getting all the stats for an interface and this approach would
> necessitate multiple requests/dumps to retrieve them all.
>
> Signed-off-by: Robert Shearman 
>
Acked-by: Roopa Prabhu 


Re: [PATCH net-next 0/2] mpls: Packet stats

2017-01-13 Thread Roopa Prabhu
On 1/13/17, 10:14 AM, Robert Shearman wrote:
> This patchset records per-interface packet stats in the MPLS
> forwarding path and exports them using a nest of attributes root at a
> new IFLA_STATS_AF_SPEC attribute as part of RTM_GETSTATS messages:
>
> [IFLA_STATS_AF_SPEC]
>  -> [AF_MPLS]
>   -> [MPLS_STATS_LINK]
>-> struct mpls_link_stats
>
> The first patch adds the rtnl infrastructure for this, including a new
> callbacks to per-AF ops of fill_stats_af and get_stats_af_size. The
> second patch records MPLS stats and makes use of the infrastructure to
> export them. The rtnl infrastructure could also be used to export IPv6
> stats in the future.
>
> Robert Shearman (2):
>   net: AF-specific RTM_GETSTATS attributes
>   mpls: Packet stats
>
LGTM, thanks Robert



Re: [pull request][for-next] Mellanox mlx5 Reorganize core driver directory layout

2017-01-13 Thread Saeed Mahameed
On Sat, Jan 14, 2017 at 12:06 AM, Tom Herbert  wrote:
> On Fri, Jan 13, 2017 at 12:29 PM, Leon Romanovsky  wrote:
>> On Fri, Jan 13, 2017 at 12:14:07PM -0500, David Miller wrote:
>>> From: Saeed Mahameed 
>>> Date: Thu, 12 Jan 2017 19:22:34 +0200
>>>
>>> > This pull request includes one patch from Leon, this patch as described
>>> > below will change the driver directory structure and layout for better,
>>> > logical and modular driver files separation.
>>> >
>>> > This change is important to both rdma and net maintainers in order to
>>> > have smoother management of driver patches for different mlx5 sub modules
>>> > and smoother rdma-next vs. net-next features submissions.
>>> >
>>> > Please find more info below -in the tag commit message-,
>>> > review and let us know if there's any problem.
>>> >
>>> > This change doesn't introduce any conflicts with the current mlx5
>>> > fixes and cleanups posted on 2017-01-10 to net branch, and merge tests
>>> > worked flawlessly with no issues.
>>> >
>>> > This is the last pull request meant for both rdma-next and net-next.
>>> > Once pulled, this will be the base shared code for both trees.
>>>
>>> This is pretty crazy, it will make all bug fix backporting to -stable
>>> a complete nightmare for myself, Doug, various distribution maintainers
>>> and many other people who quietly have to maintain their own trees and
>>> do backporting.
>>
>> Hi Dave,
>>
>> I understand your worries, but our case is similar to various other
>> drivers, for example hfi1 which was in staging for years while
>> supported in RedHat and moved from there to IB. The Chelsio drivers did
>> similar reorg in 2016 (drivers/net/ethernet/chelsio/libcxgb) while their
>> drivers were in the tree for long time before.
>>
>> Additionally, Doug doesn't need to maintain -stable queue and it is done
>> by relevant submaintainers who are adding stable tags by themselves. In
>> the IB case, the burden will continue to be on me and not on Doug.
>>
> Recently I had to backport the mlx5 driver from 4.9 to 4.6 in order to
> get support for XDP. The biggest issue I faced was the lack of
> modularity in the many driver features that are now supported. The
> problem with backporting these new features is the spider web of
> dependencies that they bring in from the rest of the kernel. I ended
> up taking out en_rep, vxlan, en_tc, eswitch, and dcbnl. The result was
> ~340 patches which is still a lot but at least this was constrained to
> patches in the mlx5 directories and are relevant to what we want to
> do.
>
> In lieu of restructuring the directories, I would much rather see more
> config options so that we can build drivers that don't unnecessarily
> complicate our lives with features we don't use. This is not just true
> for Mellanox, but I would say it would be true of any driver that
> someone is trying to deploy and maintain at large scale.
>

I think we should have both, if the restructuring made right,
new whole features (e.g eswitch and eswitch offlaods or any independent module),
can sit in their own directory and keep their own logic concentrated
in one place, and only touch the
main driver code with simple entry points in the main flow,  this way
you can simply compile their whole directories
out with a config flag directly from the Makefile.

> Btw, we did hit one issue in the backport. We started to get rx csum
> faults (checksum complete value indicates TCP checksum is bad, but
> host computation says checksum is good). I ran against 4.9 upstream
> kernel and do see these, however don't see them in 4.10. I haven't
> bisected yet. Is this a known issue?
>

Not to me, I don't recall any csum related fixes or feature submitted
lately to mlx5,
Maybe something changed in the stack ?

what configuration are you running ? what traffic ?

> Thanks,
> Tom
>
>>>
>>> I really don't think you can justify this rearrangement based upon the
>>> consequences and how much activity happens in this driver.
>>>
>>> You should have thought long and hard about the layout a long time ago
>>> rather than after the driver has been in the tree for many years.
>>>
>>> Sorry.
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2 08/10] net: dsa: Add support for platform data

2017-01-13 Thread Florian Fainelli
On 01/13/2017 06:11 AM, Andrew Lunn wrote:
>>  static int _dsa_register_switch(struct dsa_switch *ds, struct device *dev)
>>  {
>> +struct dsa_chip_data *pdata = dev->platform_data;
>>  struct device_node *np = dev->of_node;
>>  struct dsa_switch_tree *dst;
>>  struct device_node *ports;
>>  u32 tree, index;
>>  int i, err;
>>  
>> -err = dsa_parse_member_dn(np, , );
>> -if (err)
>> -return err;
>> +if (np) {
>> +err = dsa_parse_member_dn(np, , );
>> +if (err)
>> +return err;
>>  
>> -ports = dsa_get_ports(ds, np);
>> -if (IS_ERR(ports))
>> -return PTR_ERR(ports);
>> +ports = dsa_get_ports(ds, np);
>> +if (IS_ERR(ports))
>> +return PTR_ERR(ports);
>>  
>> -err = dsa_parse_ports_dn(ports, ds);
>> -if (err)
>> -return err;
>> +err = dsa_parse_ports_dn(ports, ds);
>> +if (err)
>> +return err;
>> +} else {
>> +err = dsa_parse_member(pdata, , );
> 

Hello Andrew,

> Hi Florian
> 
> Maybe it is hiding, but i don't see anywhere you check that pdata !=
> NULL.

You are right, there is not such a check, it should probably be added
early on.

> 
> At least for x86 platforms, i don't expect we are booting using
> platform data like ARM systems used to do. I think it is more likely a
> glue module will be loaded. It looks up the MDIO bus and appends a
> platform data to an MDIO device. The switch driver then needs to load
> and use the platform data. But if things happen in a different order,
> it could be the switch driver probes before the glue driver, meaning
> pdata is NULL.

That's very valid, I will fix this, thanks!

> 
> Do we even want to return -EPROBE_DEFERED?

I was trying to exercise that code path a little bit, but could not
quite make sense of what I was seeing, let me try again with more tracing.
-- 
Florian


[PATCH net] bpf: rework prog_digest into prog_tag

2017-01-13 Thread Daniel Borkmann
Commit 7bd509e311f4 ("bpf: add prog_digest and expose it via
fdinfo/netlink") was recently discussed, partially due to
admittedly suboptimal name of "prog_digest" in combination
with sha1 hash usage, thus inevitably and rightfully concerns
about its security in terms of collision resistance were
raised with regards to use-cases.

The intended use cases are for debugging resp. introspection
only for providing a stable "tag" over the instruction sequence
that both kernel and user space can calculate independently.
It's not usable at all for making a security relevant decision.
So collisions where two different instruction sequences generate
the same tag can happen, but ideally at a rather low rate. The
"tag" will be dumped in hex and is short enough to introspect
in tracepoints or kallsyms output along with other data such
as stack trace, etc. Thus, this patch performs a rename into
prog_tag and truncates the tag to a short output (64 bits) to
make it obvious it's not collision-free.

Should in future a hash or facility be needed with a security
relevant focus, then we can think about requirements, constraints,
etc that would fit to that situation. For now, rework the exposed
parts for the current use cases as long as nothing has been
released yet. Tested on x86_64 and s390x.

Fixes: 7bd509e311f4 ("bpf: add prog_digest and expose it via fdinfo/netlink")
Signed-off-by: Daniel Borkmann 
Acked-by: Alexei Starovoitov 
Cc: Andy Lutomirski 
---
 include/linux/bpf.h|  2 +-
 include/linux/filter.h |  6 --
 include/uapi/linux/pkt_cls.h   |  2 +-
 include/uapi/linux/tc_act/tc_bpf.h |  2 +-
 kernel/bpf/core.c  | 14 --
 kernel/bpf/syscall.c   |  8 
 kernel/bpf/verifier.c  |  2 +-
 net/sched/act_bpf.c|  5 ++---
 net/sched/cls_bpf.c|  4 ++--
 9 files changed, 24 insertions(+), 21 deletions(-)

diff --git a/include/linux/bpf.h b/include/linux/bpf.h
index f74ae68..05cf951 100644
--- a/include/linux/bpf.h
+++ b/include/linux/bpf.h
@@ -216,7 +216,7 @@ struct bpf_event_entry {
 u64 bpf_get_stackid(u64 r1, u64 r2, u64 r3, u64 r4, u64 r5);
 
 bool bpf_prog_array_compatible(struct bpf_array *array, const struct bpf_prog 
*fp);
-int bpf_prog_calc_digest(struct bpf_prog *fp);
+int bpf_prog_calc_tag(struct bpf_prog *fp);
 
 const struct bpf_func_proto *bpf_get_trace_printk_proto(void);
 
diff --git a/include/linux/filter.h b/include/linux/filter.h
index a0934e6..e4eb254 100644
--- a/include/linux/filter.h
+++ b/include/linux/filter.h
@@ -57,6 +57,8 @@
 /* BPF program can access up to 512 bytes of stack space. */
 #define MAX_BPF_STACK  512
 
+#define BPF_TAG_SIZE   8
+
 /* Helper macros for filter block array initializers. */
 
 /* ALU ops on registers, bpf_add|sub|...: dst_reg += src_reg */
@@ -408,7 +410,7 @@ struct bpf_prog {
kmemcheck_bitfield_end(meta);
enum bpf_prog_type  type;   /* Type of BPF program */
u32 len;/* Number of filter blocks */
-   u32 digest[SHA_DIGEST_WORDS]; /* Program digest */
+   u8  tag[BPF_TAG_SIZE];
struct bpf_prog_aux *aux;   /* Auxiliary fields */
struct sock_fprog_kern  *orig_prog; /* Original BPF program */
unsigned int(*bpf_func)(const void *ctx,
@@ -519,7 +521,7 @@ static inline u32 bpf_prog_insn_size(const struct bpf_prog 
*prog)
return prog->len * sizeof(struct bpf_insn);
 }
 
-static inline u32 bpf_prog_digest_scratch_size(const struct bpf_prog *prog)
+static inline u32 bpf_prog_tag_scratch_size(const struct bpf_prog *prog)
 {
return round_up(bpf_prog_insn_size(prog) +
sizeof(__be64) + 1, SHA_MESSAGE_BYTES);
diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index cb4bcdc..a4dcd88 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -397,7 +397,7 @@ enum {
TCA_BPF_NAME,
TCA_BPF_FLAGS,
TCA_BPF_FLAGS_GEN,
-   TCA_BPF_DIGEST,
+   TCA_BPF_TAG,
__TCA_BPF_MAX,
 };
 
diff --git a/include/uapi/linux/tc_act/tc_bpf.h 
b/include/uapi/linux/tc_act/tc_bpf.h
index a6b88a6..975b50d 100644
--- a/include/uapi/linux/tc_act/tc_bpf.h
+++ b/include/uapi/linux/tc_act/tc_bpf.h
@@ -27,7 +27,7 @@ enum {
TCA_ACT_BPF_FD,
TCA_ACT_BPF_NAME,
TCA_ACT_BPF_PAD,
-   TCA_ACT_BPF_DIGEST,
+   TCA_ACT_BPF_TAG,
__TCA_ACT_BPF_MAX,
 };
 #define TCA_ACT_BPF_MAX (__TCA_ACT_BPF_MAX - 1)
diff --git a/kernel/bpf/core.c b/kernel/bpf/core.c
index 1eb4f13..503d421 100644
--- a/kernel/bpf/core.c
+++ b/kernel/bpf/core.c
@@ -146,10 +146,11 @@ void __bpf_prog_free(struct bpf_prog *fp)
vfree(fp);
 }
 
-int bpf_prog_calc_digest(struct bpf_prog *fp)
+int bpf_prog_calc_tag(struct bpf_prog *fp)
 {
const u32 bits_offset = 

Re: [PATCH net-next v2 08/10] net: dsa: Add support for platform data

2017-01-13 Thread Florian Fainelli
On 01/13/2017 06:04 AM, Andrew Lunn wrote:
>> index cd91070b5467..d326fc4afad7 100644
>> --- a/net/dsa/dsa2.c
>> +++ b/net/dsa/dsa2.c
>> @@ -81,17 +81,23 @@ static void dsa_dst_del_ds(struct dsa_switch_tree *dst,
>>  
>>  static bool dsa_port_is_valid(struct dsa_port *port)
>>  {
>> -return !!port->dn;
>> +return !!(port->dn || port->name);
>>  }
>   
> Does this clash with Viviens recent change to make names optional and
> have the kernel assign it?

So there were two ways to look at this, one was that could check here
that ds->pd is assigned and port->name is assigned, which means that
platform data has to provide valid port name. We can also eliminate this
check entirely because we now support NULL names just fines.

> 
> I suppose you could use an name of "eth%d"? Is it worth adding a
> comment to the platform data structure?

Humm, that could be done, maybe for simplicity we can just let
net/dsa/dsa2.c assign names either based on what platform data provided,
or by falling back to eth%d.

Thanks!
-- 
Florian


Re: [PATCH net-next] net/mlx5e: Support bpf_xdp_adjust_head()

2017-01-13 Thread Martin KaFai Lau
On Fri, Jan 13, 2017 at 03:58:46PM +0200, Saeed Mahameed wrote:
> >> > @@ -680,7 +687,7 @@ static inline void mlx5e_xmit_xdp_frame(struct 
> >> > mlx5e_rq *rq,
> >> > memset(wqe, 0, sizeof(*wqe));
> >> >
> >> > /* copy the inline part */
> >> > -   memcpy(eseg->inline_hdr_start, data, MLX5E_XDP_MIN_INLINE);
> >> > +   memcpy(eseg->inline_hdr_start, xdp->data, MLX5E_XDP_MIN_INLINE);
> >> > eseg->inline_hdr_sz = cpu_to_be16(MLX5E_XDP_MIN_INLINE);
> >> >
> >> > dseg = (struct mlx5_wqe_data_seg *)cseg + (MLX5E_XDP_TX_DS_COUNT 
> >> > - 1);
> >> > @@ -706,22 +713,16 @@ static inline void mlx5e_xmit_xdp_frame(struct 
> >> > mlx5e_rq *rq,
> >> >  static inline bool mlx5e_xdp_handle(struct mlx5e_rq *rq,
> >> > const struct bpf_prog *prog,
> >> > struct mlx5e_dma_info *di,
> >> > -   void *data, u16 len)
> >> > +   struct xdp_buff *xdp)
> >> >  {
> >> > -   struct xdp_buff xdp;
> >> > u32 act;
> >> >
> >> > -   if (!prog)
> >> > -   return false;
> >> > -
> >> > -   xdp.data = data;
> >> > -   xdp.data_end = xdp.data + len;
> >> > -   act = bpf_prog_run_xdp(prog, );
> >> > +   act = bpf_prog_run_xdp(prog, xdp);
> >> > switch (act) {
> >> > case XDP_PASS:
> >> > return false;
> >> > case XDP_TX:
> >> > -   mlx5e_xmit_xdp_frame(rq, di, MLX5_RX_HEADROOM, len);
> >> > +   mlx5e_xmit_xdp_frame(rq, di, xdp);
> >> > return true;
> >> > default:
> >> > bpf_warn_invalid_xdp_action(act);
> >> > @@ -737,18 +738,19 @@ static inline
> >> >  struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, struct mlx5_cqe64 
> >> > *cqe,
> >> >  u16 wqe_counter, u32 cqe_bcnt)
> >> >  {
> >> > +   const struct bpf_prog *xdp_prog;
> >> > struct mlx5e_dma_info *di;
> >> > struct sk_buff *skb;
> >> > void *va, *data;
> >> > -   bool consumed;
> >> > +   u16 rx_headroom = rq->rx_headroom;
> >> >
> >> > di = >dma_info[wqe_counter];
> >> > va = page_address(di->page);
> >> > -   data   = va + MLX5_RX_HEADROOM;
> >> > +   data   = va + rx_headroom;
> >> >
> >> > dma_sync_single_range_for_cpu(rq->pdev,
> >> >   di->addr,
> >> > - MLX5_RX_HEADROOM,
> >> > + rx_headroom,
> >> >   rq->buff.wqe_sz,
> >> >   DMA_FROM_DEVICE);
> >> > prefetch(data);
> >> > @@ -760,11 +762,26 @@ struct sk_buff *skb_from_cqe(struct mlx5e_rq *rq, 
> >> > struct mlx5_cqe64 *cqe,
> >> > }
> >> >
> >> > rcu_read_lock();
> >> > -   consumed = mlx5e_xdp_handle(rq, READ_ONCE(rq->xdp_prog), di, 
> >> > data,
> >> > -   cqe_bcnt);
> >> > +   xdp_prog = READ_ONCE(rq->xdp_prog);
> >> > +   if (xdp_prog) {
> >> > +   struct xdp_buff xdp;
> >> > +   bool consumed;
> >> > +
> >> > +   xdp.data = data;
> >> > +   xdp.data_end = xdp.data + cqe_bcnt;
> >> > +   xdp.data_hard_start = va;
> >> > +
> >> > +   consumed = mlx5e_xdp_handle(rq, xdp_prog, di, );
> >> > +
> >> > +   if (consumed) {
> >> > +   rcu_read_unlock();
> >> > +   return NULL; /* page/packet was consumed by XDP 
> >> > */
> >> > +   }
> >> > +
> >> > +   rx_headroom = xdp.data - xdp.data_hard_start;
> >> > +   cqe_bcnt = xdp.data_end - xdp.data;
> >> > +   }
> >>
> >> This whole new logic belongs to mlx5e_xdp_handle, I would like to keep
> >> xdp related code in one place.
> >>
> >> move the xdp_buff initialization back to there and keep the xdp_prog
> >> check in mlx5e_xdp_handle;
> >> +  xdp_prog = READ_ONCE(rq->xdp_prog);
> >> +   if (!xdp_prog)
> >> +return false
> >>
> >> you can remove "const struct bpf_prog *prog" parameter from
> >> mlx5e_xdp_handle and take it directly from rq.
> >>
> >> if you need va for xdp_buff you can pass it as a paramter to
> >> mlx5e_xdp_handle  as well:
> >> mlx5e_xdp_handle(rq, di, va, data, cqe_bcnt);
> >> Make sense ?
> > I moved them because xdp.data could be adjusted which then
> > rx_headroom and cqe_bcnt have to be adjusted accordingly
> > in skb_from_cqe() also.
> >
> > I understand your point.  After another quick thought,
> > the adjusted xdp.data is the only one that we want in skb_from_cqe().
> > I will try to make mlx5e_xdp_handle() to return the adjusted xdp.data
> > instead of bool.
> >
>
> hmm, You also need the adjusted cqe_bcnt! this will make
> mlx5e_xdp_handle stuffed with parameters,
>
> 

Re: Commit 1fe8e0... (include more headers in if_tunnel.h) breaks my user-space build.

2017-01-13 Thread Ben Greear

On 01/13/2017 02:08 PM, Stephen Hemminger wrote:

On Fri, 13 Jan 2017 11:50:32 -0800
Ben Greear  wrote:


On 01/13/2017 11:41 AM, Stephen Hemminger wrote:

On Fri, 13 Jan 2017 11:12:32 -0800
Ben Greear  wrote:


I am including netinet/ip.h, and also linux/if_tunnel.h, and the linux/ip.h 
conflicts with
netinet/ip.h.

Maybe my build environment is screwed up, but maybe also it would be better to
just let the user include appropriate headers before including if_tunnel.h
and revert this patch?


include/uapi/linux/if_tunnel.h: include linux/if.h, linux/ip.h and linux/in6.h

 Fixes userspace compilation errors like:

 error: field ‘iph’ has incomplete type
 error: field ‘prefix’ has incomplete type

 Signed-off-by: Mikko Rapeli 
 Signed-off-by: David S. Miller 

Thanks,
Ben



What I ended up doing for iproute2 was including all headers used by the source
based on sanitized kernel headers.  Basically
  $ git grep '^#include .*$//' | \
sort -u >linux.headers
   $ for f in $(cat linux.headers)
 do cp ~/kernel/net-next/usr/include/$f include/$f
 done

You can't take only some of the headers, once you decide to diverge from glibc 
provided
headers, you got to take them all.



I do grab a copy of the linux kernel headers and compile against that, but 
netinet/ip.h is
coming from the OS.  Do you mean I should not include netinet/ip.h and instead 
use linux/ip.h?


I don't think you can mix netinet/ip.h and linux/ip.h, yes that is a mess.



Well, I still like the idea of reverting this patch..that way user-space does 
not have to use
linux/ip.h, and that lets them use netinet/ip.h and if_tunnel.h.

Anyway, I'll let Dave and/or the original committer decideI've reverted it 
in my local tree
so I am able to build again...

Thanks,
Ben

--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com



Re: Commit 1fe8e0... (include more headers in if_tunnel.h) breaks my user-space build.

2017-01-13 Thread Stephen Hemminger
On Fri, 13 Jan 2017 11:50:32 -0800
Ben Greear  wrote:

> On 01/13/2017 11:41 AM, Stephen Hemminger wrote:
> > On Fri, 13 Jan 2017 11:12:32 -0800
> > Ben Greear  wrote:
> >  
> >> I am including netinet/ip.h, and also linux/if_tunnel.h, and the 
> >> linux/ip.h conflicts with
> >> netinet/ip.h.
> >>
> >> Maybe my build environment is screwed up, but maybe also it would be 
> >> better to
> >> just let the user include appropriate headers before including if_tunnel.h
> >> and revert this patch?
> >>
> >>
> >> include/uapi/linux/if_tunnel.h: include linux/if.h, linux/ip.h and 
> >> linux/in6.h
> >>
> >>  Fixes userspace compilation errors like:
> >>
> >>  error: field ‘iph’ has incomplete type
> >>  error: field ‘prefix’ has incomplete type
> >>
> >>  Signed-off-by: Mikko Rapeli 
> >>  Signed-off-by: David S. Miller 
> >>
> >> Thanks,
> >> Ben
> >>  
> >
> > What I ended up doing for iproute2 was including all headers used by the 
> > source
> > based on sanitized kernel headers.  Basically
> >   $ git grep '^#include  > awk -F: '{print $2}' | \
> > sed -e 's/^#include .*$//' | \
> > sort -u >linux.headers
> >$ for f in $(cat linux.headers)
> >  do cp ~/kernel/net-next/usr/include/$f include/$f
> >  done
> >
> > You can't take only some of the headers, once you decide to diverge from 
> > glibc provided
> > headers, you got to take them all.
> >  
> 
> I do grab a copy of the linux kernel headers and compile against that, but 
> netinet/ip.h is
> coming from the OS.  Do you mean I should not include netinet/ip.h and 
> instead use linux/ip.h?

I don't think you can mix netinet/ip.h and linux/ip.h, yes that is a mess.



Re: [pull request][for-next] Mellanox mlx5 Reorganize core driver directory layout

2017-01-13 Thread Saeed Mahameed
On Fri, Jan 13, 2017 at 7:14 PM, David Miller  wrote:
> From: Saeed Mahameed 
> Date: Thu, 12 Jan 2017 19:22:34 +0200
>
>> This pull request includes one patch from Leon, this patch as described
>> below will change the driver directory structure and layout for better,
>> logical and modular driver files separation.
>>
>> This change is important to both rdma and net maintainers in order to
>> have smoother management of driver patches for different mlx5 sub modules
>> and smoother rdma-next vs. net-next features submissions.
>>
>> Please find more info below -in the tag commit message-,
>> review and let us know if there's any problem.
>>
>> This change doesn't introduce any conflicts with the current mlx5
>> fixes and cleanups posted on 2017-01-10 to net branch, and merge tests
>> worked flawlessly with no issues.
>>
>> This is the last pull request meant for both rdma-next and net-next.
>> Once pulled, this will be the base shared code for both trees.
>
> This is pretty crazy, it will make all bug fix backporting to -stable
> a complete nightmare for myself, Doug, various distribution maintainers
> and many other people who quietly have to maintain their own trees and
> do backporting.
>

I hear you,
But please bear with me here, what if we queue this patch up to -stable ? and we
(Mellanox) and specifically our dedicated inbox team, will make sure
that this patch
will land on the various distributions and for those maintaining their
own trees.
This patch is really straight forward (rename files) and I already
tried to cherry-pick it
on older kernels, I only got a couple of conflicts on some of the
"#inlcude" lines we've
changed, and they are pretty straightforward to fix, we can even avoid
this if we decide to
not move mlx5 header files in this phase.

If this is possible then all trees will be aligned and it will be a
win win situation.

> I really don't think you can justify this rearrangement based upon the
> consequences and how much activity happens in this driver.
>

Right, but this is not the only justification, I can sum it up to that
we would like
to lay out the foundation for many years to come for a well designed
driver with a modular
sub modules break down and scalable infrastructure. We already plan to
submit more mlx5
independent  sub modules - just like mlx5e (en_*) files and mlx5_ib
driver- so this was also
a reason for us to consider this re-engagement at this stage.

> You should have thought long and hard about the layout a long time ago
> rather than after the driver has been in the tree for many years.
>

I had this Idea for the separation before the mlx5 Ethernet
submission, but I wasn't the maintainer
back then, and i have been itching to submit such patch for long as
well, still i don't think
it is too late, We (Me and Leon) will keep maintaining this driver for
only god knows how many years to come,
and the mlx5 drivers are meant to serve at least 3-4 more future HW generations.

Long story short, We would like to re-arrange the driver in a way that
would serve us (the maintainers) and serve those
who are going do develop the future Stack features and the future HW
generations over the well designed (Hopefully)
mlx5 infrastructure.
Keeping it as it is today, will only make the situation worst in the
future and it will be really hard to avoid having a spaghetti code
in the mlx5 driver. All i want to point out here is that maintaining
such a flat subtree is also nightmare.

So i am only asking you to reconsider this change and give my -stable
suggestion a thought.

Thank you.
Saeed.


Re: [pull request][for-next] Mellanox mlx5 Reorganize core driver directory layout

2017-01-13 Thread Tom Herbert
On Fri, Jan 13, 2017 at 12:29 PM, Leon Romanovsky  wrote:
> On Fri, Jan 13, 2017 at 12:14:07PM -0500, David Miller wrote:
>> From: Saeed Mahameed 
>> Date: Thu, 12 Jan 2017 19:22:34 +0200
>>
>> > This pull request includes one patch from Leon, this patch as described
>> > below will change the driver directory structure and layout for better,
>> > logical and modular driver files separation.
>> >
>> > This change is important to both rdma and net maintainers in order to
>> > have smoother management of driver patches for different mlx5 sub modules
>> > and smoother rdma-next vs. net-next features submissions.
>> >
>> > Please find more info below -in the tag commit message-,
>> > review and let us know if there's any problem.
>> >
>> > This change doesn't introduce any conflicts with the current mlx5
>> > fixes and cleanups posted on 2017-01-10 to net branch, and merge tests
>> > worked flawlessly with no issues.
>> >
>> > This is the last pull request meant for both rdma-next and net-next.
>> > Once pulled, this will be the base shared code for both trees.
>>
>> This is pretty crazy, it will make all bug fix backporting to -stable
>> a complete nightmare for myself, Doug, various distribution maintainers
>> and many other people who quietly have to maintain their own trees and
>> do backporting.
>
> Hi Dave,
>
> I understand your worries, but our case is similar to various other
> drivers, for example hfi1 which was in staging for years while
> supported in RedHat and moved from there to IB. The Chelsio drivers did
> similar reorg in 2016 (drivers/net/ethernet/chelsio/libcxgb) while their
> drivers were in the tree for long time before.
>
> Additionally, Doug doesn't need to maintain -stable queue and it is done
> by relevant submaintainers who are adding stable tags by themselves. In
> the IB case, the burden will continue to be on me and not on Doug.
>
Recently I had to backport the mlx5 driver from 4.9 to 4.6 in order to
get support for XDP. The biggest issue I faced was the lack of
modularity in the many driver features that are now supported. The
problem with backporting these new features is the spider web of
dependencies that they bring in from the rest of the kernel. I ended
up taking out en_rep, vxlan, en_tc, eswitch, and dcbnl. The result was
~340 patches which is still a lot but at least this was constrained to
patches in the mlx5 directories and are relevant to what we want to
do.

In lieu of restructuring the directories, I would much rather see more
config options so that we can build drivers that don't unnecessarily
complicate our lives with features we don't use. This is not just true
for Mellanox, but I would say it would be true of any driver that
someone is trying to deploy and maintain at large scale.

Btw, we did hit one issue in the backport. We started to get rx csum
faults (checksum complete value indicates TCP checksum is bad, but
host computation says checksum is good). I ran against 4.9 upstream
kernel and do see these, however don't see them in 4.10. I haven't
bisected yet. Is this a known issue?

Thanks,
Tom

>>
>> I really don't think you can justify this rearrangement based upon the
>> consequences and how much activity happens in this driver.
>>
>> You should have thought long and hard about the layout a long time ago
>> rather than after the driver has been in the tree for many years.
>>
>> Sorry.


[PATCH net 1/3] be2net: fix status check in be_cmd_pmac_add()

2017-01-13 Thread Ivan Vecera
Return value from be_mcc_notify_wait() contains a base completion status
together with an additional status. The base_status() macro need to be
used to access base status.

Fixes: e3a7ae2 be2net: Changing MAC Address of a VF was broken
Cc: Sathya Perla 
Cc: Ajit Khaparde 
Cc: Sriharsha Basavapatna 
Cc: Somnath Kotur 
Signed-off-by: Ivan Vecera 
---
 drivers/net/ethernet/emulex/benet/be_cmds.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_cmds.c 
b/drivers/net/ethernet/emulex/benet/be_cmds.c
index 0e74529..30e8550 100644
--- a/drivers/net/ethernet/emulex/benet/be_cmds.c
+++ b/drivers/net/ethernet/emulex/benet/be_cmds.c
@@ -1118,7 +1118,7 @@ int be_cmd_pmac_add(struct be_adapter *adapter, u8 
*mac_addr,
 err:
mutex_unlock(>mcc_lock);
 
-if (status == MCC_STATUS_UNAUTHORIZED_REQUEST)
+if (base_status(status) == MCC_STATUS_UNAUTHORIZED_REQUEST)
status = -EPERM;
 
return status;
-- 
2.10.2



[PATCH net 2/3] be2net: don't delete MAC on close on unprivileged BE3 VFs

2017-01-13 Thread Ivan Vecera
BE3 VFs without FILTMGMT privilege are not allowed to modify its MAC,
VLAN table and UC/MC lists. So don't try to delete MAC on such VFs.

Cc: Sathya Perla 
Cc: Ajit Khaparde 
Cc: Sriharsha Basavapatna 
Cc: Somnath Kotur 
Signed-off-by: Ivan Vecera 
---
 drivers/net/ethernet/emulex/benet/be_main.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index ec010ce..d606e20 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -3609,7 +3609,11 @@ static void be_rx_qs_destroy(struct be_adapter *adapter)
 
 static void be_disable_if_filters(struct be_adapter *adapter)
 {
-   be_dev_mac_del(adapter, adapter->pmac_id[0]);
+   /* Don't delete MAC on BE3 VFs without FILTMGMT privilege  */
+   if (!BEx_chip(adapter) || !be_virtfn(adapter) ||
+   check_privilege(adapter, BE_PRIV_FILTMGMT))
+   be_dev_mac_del(adapter, adapter->pmac_id[0]);
+
be_clear_uc_list(adapter);
be_clear_mc_list(adapter);
 
-- 
2.10.2



[PATCH net 3/3] be2net: fix MAC addr setting on privileged BE3 VFs

2017-01-13 Thread Ivan Vecera
During interface opening MAC address stored in netdev->dev_addr is
programmed in the HW with exception of BE3 VFs where the initial
MAC is programmed by parent PF. This is OK when MAC address is not
changed when an interfaces is down. In this case the requested MAC is
stored to netdev->dev_addr and later is stored into HW during opening.
But this is not done for all BE3 VFs so the NIC HW does not know
anything about this change and all traffic is filtered.

This is the case of bonding if fail_over_mac == 0 where the MACs of
the slaves are changed while they are down.

The be2net behavior is too restrictive because if a BE3 VF has
the FILTMGMT privilege then it is able to modify its MAC without
any restriction.

To solve the described problem the driver should take care about these
privileged BE3 VFs so the MAC is programmed during opening. And by
contrast unpriviled BE3 VFs should not be allowed to change its MAC
in any case.

Cc: Sathya Perla 
Cc: Ajit Khaparde 
Cc: Sriharsha Basavapatna 
Cc: Somnath Kotur 
Signed-off-by: Ivan Vecera 
---
 drivers/net/ethernet/emulex/benet/be_main.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/emulex/benet/be_main.c 
b/drivers/net/ethernet/emulex/benet/be_main.c
index d606e20..1a7f8ad 100644
--- a/drivers/net/ethernet/emulex/benet/be_main.c
+++ b/drivers/net/ethernet/emulex/benet/be_main.c
@@ -318,6 +318,13 @@ static int be_mac_addr_set(struct net_device *netdev, void 
*p)
if (ether_addr_equal(addr->sa_data, adapter->dev_mac))
return 0;
 
+   /* BE3 VFs without FILTMGMT privilege are not allowed to set its MAC
+* address
+*/
+   if (BEx_chip(adapter) && be_virtfn(adapter) &&
+   !check_privilege(adapter, BE_PRIV_FILTMGMT))
+   return -EPERM;
+
/* if device is not running, copy MAC to netdev->dev_addr */
if (!netif_running(netdev))
goto done;
@@ -3766,8 +3773,9 @@ static int be_enable_if_filters(struct be_adapter 
*adapter)
if (status)
return status;
 
-   /* For BE3 VFs, the PF programs the initial MAC address */
-   if (!(BEx_chip(adapter) && be_virtfn(adapter))) {
+   /* Don't add MAC on BE3 VFs without FILTMGMT privilege */
+   if (!BEx_chip(adapter) || !be_virtfn(adapter) ||
+   check_privilege(adapter, BE_PRIV_FILTMGMT)) {
status = be_dev_mac_add(adapter, adapter->netdev->dev_addr);
if (status)
return status;
-- 
2.10.2



Re: kill off pci_enable_msi_{exact,range}

2017-01-13 Thread Tom Lendacky
On 1/13/2017 11:15 AM, Christoph Hellwig wrote:
> On Fri, Jan 13, 2017 at 11:13:21AM -0600, Bjorn Helgaas wrote:
>> I dropped the empty commit and replaced the xgbe patch with the one below.
>> Can you take a look at [1] and make sure it's what you expected?
> 
> This looks great, thanks!
> 

Christoph and Bjorn, thanks for taking care of this!

Tom


[PATCH net-next 2/2] sfc: get PIO buffer size from the NIC

2017-01-13 Thread Edward Cree
The 8000 series SFC NICs have 4K PIO buffers, rather than the 2K of
 the 7000 series.  Rather than having a hard-coded PIO buffer size
 (ER_DZ_TX_PIOBUF_SIZE), read it from the GET_CAPABILITIES_V2 MCDI
 response.

Signed-off-by: Edward Cree 
---
 drivers/net/ethernet/sfc/ef10.c | 16 ++--
 drivers/net/ethernet/sfc/nic.h  |  2 ++
 drivers/net/ethernet/sfc/tx.c   |  1 -
 3 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index f6e29a9..2ce5769 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -197,11 +197,15 @@ static int efx_ef10_init_datapath_caps(struct efx_nic 
*efx)
nic_data->datapath_caps =
MCDI_DWORD(outbuf, GET_CAPABILITIES_OUT_FLAGS1);
 
-   if (outlen >= MC_CMD_GET_CAPABILITIES_V2_OUT_LEN)
+   if (outlen >= MC_CMD_GET_CAPABILITIES_V2_OUT_LEN) {
nic_data->datapath_caps2 = MCDI_DWORD(outbuf,
GET_CAPABILITIES_V2_OUT_FLAGS2);
-   else
+   nic_data->piobuf_size = MCDI_WORD(outbuf,
+   GET_CAPABILITIES_V2_OUT_SIZE_PIO_BUFF);
+   } else {
nic_data->datapath_caps2 = 0;
+   nic_data->piobuf_size = ER_DZ_TX_PIOBUF_SIZE;
+   }
 
/* record the DPCPU firmware IDs to determine VEB vswitching support.
 */
@@ -823,8 +827,8 @@ static int efx_ef10_link_piobufs(struct efx_nic *efx)
offset = ((efx->tx_channel_offset + efx->n_tx_channels -
   tx_queue->channel->channel - 1) *
  efx_piobuf_size);
-   index = offset / ER_DZ_TX_PIOBUF_SIZE;
-   offset = offset % ER_DZ_TX_PIOBUF_SIZE;
+   index = offset / nic_data->piobuf_size;
+   offset = offset % nic_data->piobuf_size;
 
/* When the host page size is 4K, the first
 * host page in the WC mapping may be within
@@ -1159,11 +1163,11 @@ static int efx_ef10_dimension_resources(struct efx_nic 
*efx)
 * functions of the controller.
 */
if (efx_piobuf_size != 0 &&
-   ER_DZ_TX_PIOBUF_SIZE / efx_piobuf_size * EF10_TX_PIOBUF_COUNT >=
+   nic_data->piobuf_size / efx_piobuf_size * EF10_TX_PIOBUF_COUNT >=
efx->n_tx_channels) {
unsigned int n_piobufs =
DIV_ROUND_UP(efx->n_tx_channels,
-ER_DZ_TX_PIOBUF_SIZE / efx_piobuf_size);
+nic_data->piobuf_size / efx_piobuf_size);
 
rc = efx_ef10_alloc_piobufs(efx, n_piobufs);
if (rc)
diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index 6a69aa3..383ff6e 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -343,6 +343,7 @@ enum {
  * @pio_write_base: Base address for writing PIO buffers
  * @pio_write_vi_base: Relative VI number for @pio_write_base
  * @piobuf_handle: Handle of each PIO buffer allocated
+ * @piobuf_size: size of a single PIO buffer
  * @must_restore_piobufs: Flag: PIO buffers have yet to be restored after MC
  * reboot
  * @rx_rss_context: Firmware handle for our RSS context
@@ -380,6 +381,7 @@ struct efx_ef10_nic_data {
void __iomem *wc_membase, *pio_write_base;
unsigned int pio_write_vi_base;
unsigned int piobuf_handle[EF10_TX_PIOBUF_COUNT];
+   u16 piobuf_size;
bool must_restore_piobufs;
u32 rx_rss_context;
bool rx_rss_context_exclusive;
diff --git a/drivers/net/ethernet/sfc/tx.c b/drivers/net/ethernet/sfc/tx.c
index beaf980..ff88d60 100644
--- a/drivers/net/ethernet/sfc/tx.c
+++ b/drivers/net/ethernet/sfc/tx.c
@@ -28,7 +28,6 @@
 
 #ifdef EFX_USE_PIO
 
-#define EFX_PIOBUF_SIZE_MAX ER_DZ_TX_PIOBUF_SIZE
 #define EFX_PIOBUF_SIZE_DEF ALIGN(256, L1_CACHE_BYTES)
 unsigned int efx_piobuf_size __read_mostly = EFX_PIOBUF_SIZE_DEF;
 


[PATCH net-next 1/2] sfc: allow PIO more often

2017-01-13 Thread Edward Cree
If an option descriptor has been sent on a queue but not followed by a
 packet, there will have been no completion event, so the read and write
 counts won't match and we'll think we can't do PIO.  This combines with
 the fact that we have two TX queues (for en/disable checksum offload),
 and that both must be empty for PIO to happen.
This patch adds a separate "packet_write_count" that tracks the most
 recent write_count we expect to see a completion event for; this excludes
 option descriptors but _includes_ PIO descriptors (even though they look
 like option descriptors).  This is then used, rather than write_count,
 in efx_nic_tx_is_empty().
We only bother to maintain packet_write_count on EF10, since on Siena
 (a) there are no option descriptors and it always equals write_count, and
 (b) there's no PIO, so we don't need it anyway.

Signed-off-by: Edward Cree 
---
 drivers/net/ethernet/sfc/ef10.c   |  5 +
 drivers/net/ethernet/sfc/net_driver.h |  9 +
 drivers/net/ethernet/sfc/nic.h| 17 ++---
 drivers/net/ethernet/sfc/siena.c  |  1 +
 drivers/net/ethernet/sfc/tx.c |  1 +
 5 files changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/sfc/ef10.c b/drivers/net/ethernet/sfc/ef10.c
index 208e004..f6e29a9 100644
--- a/drivers/net/ethernet/sfc/ef10.c
+++ b/drivers/net/ethernet/sfc/ef10.c
@@ -2358,7 +2358,11 @@ static void efx_ef10_tx_write(struct efx_tx_queue 
*tx_queue)
/* Create TX descriptor ring entry */
if (buffer->flags & EFX_TX_BUF_OPTION) {
*txd = buffer->option;
+   if (EFX_QWORD_FIELD(*txd, ESF_DZ_TX_OPTION_TYPE) == 1)
+   /* PIO descriptor */
+   tx_queue->packet_write_count = 
tx_queue->write_count;
} else {
+   tx_queue->packet_write_count = tx_queue->write_count;
BUILD_BUG_ON(EFX_TX_BUF_CONT != 1);
EFX_POPULATE_QWORD_3(
*txd,
@@ -5796,6 +5800,7 @@ const struct efx_nic_type efx_hunt_a0_nic_type = {
.rx_ts_offset = ES_DZ_RX_PREFIX_TSTAMP_OFST,
.can_rx_scatter = true,
.always_rx_scatter = true,
+   .option_descriptors = true,
.max_interrupt_mode = EFX_INT_MODE_MSIX,
.timer_period_max = 1 << ERF_DD_EVQ_IND_TIMER_VAL_WIDTH,
.offload_features = EF10_OFFLOAD_FEATURES,
diff --git a/drivers/net/ethernet/sfc/net_driver.h 
b/drivers/net/ethernet/sfc/net_driver.h
index 49db9e8..b20fe43 100644
--- a/drivers/net/ethernet/sfc/net_driver.h
+++ b/drivers/net/ethernet/sfc/net_driver.h
@@ -208,6 +208,12 @@ struct efx_tx_buffer {
  * @write_count: Current write pointer
  * This is the number of buffers that have been added to the
  * hardware ring.
+ * @packet_write_count: Completable write pointer
+ * This is the write pointer of the last packet written.
+ * Normally this will equal @write_count, but as option descriptors
+ * don't produce completion events, they won't update this.
+ * Filled in iff @efx->type->option_descriptors; only used for PIO.
+ * Thus, this is written and used on EF10, and neither on farch.
  * @old_read_count: The value of read_count when last checked.
  * This is here for performance reasons.  The xmit path will
  * only get the up-to-date value of read_count if this
@@ -255,6 +261,7 @@ struct efx_tx_queue {
/* Members used only on the xmit path */
unsigned int insert_count cacheline_aligned_in_smp;
unsigned int write_count;
+   unsigned int packet_write_count;
unsigned int old_read_count;
unsigned int tso_bursts;
unsigned int tso_long_headers;
@@ -1237,6 +1244,7 @@ struct efx_mtd_partition {
  * @rx_buffer_padding: Size of padding at end of RX packet
  * @can_rx_scatter: NIC is able to scatter packets to multiple buffers
  * @always_rx_scatter: NIC will always scatter packets to multiple buffers
+ * @option_descriptors: NIC supports TX option descriptors
  * @max_interrupt_mode: Highest capability interrupt mode supported
  * from  efx_init_mode.
  * @timer_period_max: Maximum period of interrupt timer (in ticks)
@@ -1395,6 +1403,7 @@ struct efx_nic_type {
unsigned int rx_buffer_padding;
bool can_rx_scatter;
bool always_rx_scatter;
+   bool option_descriptors;
unsigned int max_interrupt_mode;
unsigned int timer_period_max;
netdev_features_t offload_features;
diff --git a/drivers/net/ethernet/sfc/nic.h b/drivers/net/ethernet/sfc/nic.h
index 2237746..6a69aa3 100644
--- a/drivers/net/ethernet/sfc/nic.h
+++ b/drivers/net/ethernet/sfc/nic.h
@@ -85,6 +85,17 @@ static inline bool __efx_nic_tx_is_empty(struct efx_tx_queue 
*tx_queue,
return ((empty_read_count ^ write_count) & ~EFX_EMPTY_COUNT_VALID) == 0;
 }
 
+/* Report whether the NIC considers this TX 

[PATCH net-next 0/2] sfc: TX PIO fixes

2017-01-13 Thread Edward Cree
Edward Cree (2):
  sfc: allow PIO more often
  sfc: get PIO buffer size from the NIC

 drivers/net/ethernet/sfc/ef10.c   | 21 +++--
 drivers/net/ethernet/sfc/net_driver.h |  9 +
 drivers/net/ethernet/sfc/nic.h| 19 ---
 drivers/net/ethernet/sfc/siena.c  |  1 +
 drivers/net/ethernet/sfc/tx.c |  2 +-
 5 files changed, 42 insertions(+), 10 deletions(-)



[PATCH net-next] sctp: remove useless code from sctp_apply_peer_addr_params

2017-01-13 Thread Marcelo Ricardo Leitner
sctp_frag_point() doesn't store anything, and thus just calling it
cannot do anything useful.

sctp_apply_peer_addr_params is only called by
sctp_setsockopt_peer_addr_params. When operating on an asoc,
sctp_setsockopt_peer_addr_params will call sctp_apply_peer_addr_params
once for the asoc, and then once for each transport this asoc has,
meaning that the frag_point will be recomputed when updating the
transports and calling it when updating the asoc is not necessary.
IOW, no action is needed here and we can remove this call.

Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/socket.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 
318c6786d6539a301ac7b76d82a49a1af3818d10..635e0341269330187c78ba93a35689f5c5d6be02
 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2430,7 +2430,6 @@ static int sctp_apply_peer_addr_params(struct 
sctp_paddrparams *params,
sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
} else if (asoc) {
asoc->pathmtu = params->spp_pathmtu;
-   sctp_frag_point(asoc, params->spp_pathmtu);
} else {
sp->pathmtu = params->spp_pathmtu;
}
-- 
2.9.3



Re: [PATCH net-next] sctp: remove dead code from sctp_apply_peer_addr_params

2017-01-13 Thread Marcelo Ricardo Leitner
On Fri, Jan 13, 2017 at 06:27:32PM -0200, Marcelo Ricardo Leitner wrote:
> sctp_frag_point() doesn't store anything, and thus just calling it
> cannot do anything useful.

Please ignore this one. Will post another one with
s/dead code/useless code/ , as the code not really dead..

> 
> sctp_apply_peer_addr_params is only called by
> sctp_setsockopt_peer_addr_params. When operating on an asoc,
> sctp_setsockopt_peer_addr_params will call sctp_apply_peer_addr_params
> once for the asoc, and then once for each transport this asoc has,
> meaning that the frag_point will be recomputed when updating the
> transports and calling it when updating the asoc is not necessary.
> IOW, no action is needed here and we can remove this call.
> 
> Signed-off-by: Marcelo Ricardo Leitner 
> ---
>  net/sctp/socket.c | 1 -
>  1 file changed, 1 deletion(-)
> 
> diff --git a/net/sctp/socket.c b/net/sctp/socket.c
> index 
> 318c6786d6539a301ac7b76d82a49a1af3818d10..635e0341269330187c78ba93a35689f5c5d6be02
>  100644
> --- a/net/sctp/socket.c
> +++ b/net/sctp/socket.c
> @@ -2430,7 +2430,6 @@ static int sctp_apply_peer_addr_params(struct 
> sctp_paddrparams *params,
>   sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
>   } else if (asoc) {
>   asoc->pathmtu = params->spp_pathmtu;
> - sctp_frag_point(asoc, params->spp_pathmtu);
>   } else {
>   sp->pathmtu = params->spp_pathmtu;
>   }
> -- 
> 2.9.3
> 
> --
> To unsubscribe from this list: send the line "unsubscribe linux-sctp" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html
> 


Re: [pull request][for-next] Mellanox mlx5 Reorganize core driver directory layout

2017-01-13 Thread Leon Romanovsky
On Fri, Jan 13, 2017 at 12:14:07PM -0500, David Miller wrote:
> From: Saeed Mahameed 
> Date: Thu, 12 Jan 2017 19:22:34 +0200
>
> > This pull request includes one patch from Leon, this patch as described
> > below will change the driver directory structure and layout for better,
> > logical and modular driver files separation.
> >
> > This change is important to both rdma and net maintainers in order to
> > have smoother management of driver patches for different mlx5 sub modules
> > and smoother rdma-next vs. net-next features submissions.
> >
> > Please find more info below -in the tag commit message-,
> > review and let us know if there's any problem.
> >
> > This change doesn't introduce any conflicts with the current mlx5
> > fixes and cleanups posted on 2017-01-10 to net branch, and merge tests
> > worked flawlessly with no issues.
> >
> > This is the last pull request meant for both rdma-next and net-next.
> > Once pulled, this will be the base shared code for both trees.
>
> This is pretty crazy, it will make all bug fix backporting to -stable
> a complete nightmare for myself, Doug, various distribution maintainers
> and many other people who quietly have to maintain their own trees and
> do backporting.

Hi Dave,

I understand your worries, but our case is similar to various other
drivers, for example hfi1 which was in staging for years while
supported in RedHat and moved from there to IB. The Chelsio drivers did
similar reorg in 2016 (drivers/net/ethernet/chelsio/libcxgb) while their
drivers were in the tree for long time before.

Additionally, Doug doesn't need to maintain -stable queue and it is done
by relevant submaintainers who are adding stable tags by themselves. In
the IB case, the burden will continue to be on me and not on Doug.

>
> I really don't think you can justify this rearrangement based upon the
> consequences and how much activity happens in this driver.
>
> You should have thought long and hard about the layout a long time ago
> rather than after the driver has been in the tree for many years.
>
> Sorry.


signature.asc
Description: PGP signature


[PATCH net-next] sctp: remove unused var from sctp_process_asconf

2017-01-13 Thread Marcelo Ricardo Leitner
Assigned but not used.

Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/sm_make_chunk.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 
a15d824a313d310ed03ba77055d22b1c7c9d0662..80a9088084ac0d1116bf983fad9a0cdfddeefe44
 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3210,7 +3210,6 @@ struct sctp_chunk *sctp_process_asconf(struct 
sctp_association *asoc,
union sctp_params param;
sctp_addiphdr_t *hdr;
union sctp_addr_param   *addr_param;
-   sctp_addip_param_t  *asconf_param;
struct sctp_chunk   *asconf_ack;
__be16  err_code;
int length = 0;
@@ -3230,7 +3229,6 @@ struct sctp_chunk *sctp_process_asconf(struct 
sctp_association *asoc,
 * asconf parameter.
 */
length = ntohs(addr_param->p.length);
-   asconf_param = (void *)addr_param + length;
chunk_len -= length;
 
/* create an ASCONF_ACK chunk.
-- 
2.9.3



[PATCH net-next] sctp: remove dead code from sctp_apply_peer_addr_params

2017-01-13 Thread Marcelo Ricardo Leitner
sctp_frag_point() doesn't store anything, and thus just calling it
cannot do anything useful.

sctp_apply_peer_addr_params is only called by
sctp_setsockopt_peer_addr_params. When operating on an asoc,
sctp_setsockopt_peer_addr_params will call sctp_apply_peer_addr_params
once for the asoc, and then once for each transport this asoc has,
meaning that the frag_point will be recomputed when updating the
transports and calling it when updating the asoc is not necessary.
IOW, no action is needed here and we can remove this call.

Signed-off-by: Marcelo Ricardo Leitner 
---
 net/sctp/socket.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 
318c6786d6539a301ac7b76d82a49a1af3818d10..635e0341269330187c78ba93a35689f5c5d6be02
 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -2430,7 +2430,6 @@ static int sctp_apply_peer_addr_params(struct 
sctp_paddrparams *params,
sctp_assoc_sync_pmtu(sctp_opt2sk(sp), asoc);
} else if (asoc) {
asoc->pathmtu = params->spp_pathmtu;
-   sctp_frag_point(asoc, params->spp_pathmtu);
} else {
sp->pathmtu = params->spp_pathmtu;
}
-- 
2.9.3



Re: [PATCH 2/3] powerpc: bpf: flush the entire JIT buffer

2017-01-13 Thread Alexei Starovoitov
On Fri, Jan 13, 2017 at 10:40:01PM +0530, Naveen N. Rao wrote:
> With bpf_jit_binary_alloc(), we allocate at a page granularity and fill
> the rest of the space with illegal instructions to mitigate BPF spraying
> attacks, while having the actual JIT'ed BPF program at a random location
> within the allocated space. Under this scenario, it would be better to
> flush the entire allocated buffer rather than just the part containing
> the actual program. We already flush the buffer from start to the end of
> the BPF program. Extend this to include the illegal instructions after
> the BPF program.
> 
> Signed-off-by: Naveen N. Rao 

Acked-by: Alexei Starovoitov 



Re: [PATCH 1/3] powerpc: bpf: remove redundant check for non-null image

2017-01-13 Thread Alexei Starovoitov
On Fri, Jan 13, 2017 at 10:40:00PM +0530, Naveen N. Rao wrote:
> From: Daniel Borkmann 
> 
> We have a check earlier to ensure we don't proceed if image is NULL. As
> such, the redundant check can be removed.
> 
> Signed-off-by: Daniel Borkmann 
> [Added similar changes for classic BPF JIT]
> Signed-off-by: Naveen N. Rao 

Acked-by: Alexei Starovoitov 



Re: [net PATCH v3 5/5] virtio_net: XDP support for adjust_head

2017-01-13 Thread John Fastabend
On 17-01-12 11:41 PM, Jason Wang wrote:
> 
> 
> On 2017年01月13日 10:52, John Fastabend wrote:
>> Add support for XDP adjust head by allocating a 256B header region
>> that XDP programs can grow into. This is only enabled when a XDP
>> program is loaded.
>>
>> In order to ensure that we do not have to unwind queue headroom push
>> queue setup below bpf_prog_add. It reads better to do a prog ref
>> unwind vs another queue setup call.
>>
>> At the moment this code must do a full reset to ensure old buffers
>> without headroom on program add or with headroom on program removal
>> are not used incorrectly in the datapath. Ideally we would only
>> have to disable/enable the RX queues being updated but there is no
>> API to do this at the moment in virtio so use the big hammer. In
>> practice it is likely not that big of a problem as this will only
>> happen when XDP is enabled/disabled changing programs does not
>> require the reset. There is some risk that the driver may either
>> have an allocation failure or for some reason fail to correctly
>> negotiate with the underlying backend in this case the driver will
>> be left uninitialized. I have not seen this ever happen on my test
>> systems and for what its worth this same failure case can occur
>> from probe and other contexts in virtio framework.
>>
>> Signed-off-by: John Fastabend 
>> ---
>>   drivers/net/virtio_net.c |  155 
>> --
>>   drivers/virtio/virtio.c  |9 ++-
>>   include/linux/virtio.h   |3 +
>>   3 files changed, 144 insertions(+), 23 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 6041828..8b897e7 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -28,6 +28,7 @@
>>   #include 
>>   #include 
>>   #include 
>> +#include 
>>   #include 
>> static int napi_weight = NAPI_POLL_WEIGHT;
>> @@ -159,6 +160,9 @@ struct virtnet_info {
>>   /* Ethtool settings */
>>   u8 duplex;
>>   u32 speed;
>> +
>> +/* Headroom allocated in RX Queue */
>> +unsigned int headroom;
> 
> If this could not be changed in anyway, better use a macro instead of a filed
> here. And there's even no need to add an extra parameter to
> add_recvbuf_mergeable().

OK originally I thought this might be dynamic but I agree no need
for it here.

> 
>>   };
>> struct padded_vnet_hdr {
>> @@ -359,6 +363,7 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>>   }
>> if (vi->mergeable_rx_bufs) {
>> +xdp->data -= sizeof(struct virtio_net_hdr_mrg_rxbuf);
> 
> Fail to understand why this is needed. We should have excluded vnet header 
> from
> xdp->data even before bpf_prog_run_xdp().
> 
>>   /* Zero header and leave csum up to XDP layers */
>>   hdr = xdp->data;
>>   memset(hdr, 0, vi->hdr_len);
>> @@ -375,7 +380,9 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>>   num_sg = 2;
>>   sg_init_table(sq->sg, 2);
>>   sg_set_buf(sq->sg, hdr, vi->hdr_len);
>> -skb_to_sgvec(skb, sq->sg + 1, 0, skb->len);
>> +skb_to_sgvec(skb, sq->sg + 1,
>> + xdp->data - xdp->data_hard_start,
>> + xdp->data_end - xdp->data);
>>   }
>>   err = virtqueue_add_outbuf(sq->vq, sq->sg, num_sg,
>>  data, GFP_ATOMIC);
>> @@ -401,7 +408,6 @@ static struct sk_buff *receive_small(struct net_device 
>> *dev,
>>   struct bpf_prog *xdp_prog;
>> len -= vi->hdr_len;
>> -skb_trim(skb, len);
>> rcu_read_lock();
>>   xdp_prog = rcu_dereference(rq->xdp_prog);
>> @@ -413,11 +419,15 @@ static struct sk_buff *receive_small(struct net_device
>> *dev,
>>   if (unlikely(hdr->hdr.gso_type || hdr->hdr.flags))
>>   goto err_xdp;
>>   -xdp.data = skb->data;
>> +xdp.data_hard_start = skb->data;
>> +xdp.data = skb->data + vi->headroom;
>>   xdp.data_end = xdp.data + len;
>>   act = bpf_prog_run_xdp(xdp_prog, );
>>   switch (act) {
>>   case XDP_PASS:
>> +/* Recalculate length in case bpf program changed it */
>> +len = xdp.data_end - xdp.data;
>> +__skb_pull(skb, xdp.data - xdp.data_hard_start);
> 
> How about do this just after bpf_pro_run_xdp() for XDP_TX too? This is more
> readable and there's no need to change xmit path.

Agreed will do.

> 
>>   break;
>>   case XDP_TX:
>>   virtnet_xdp_xmit(vi, rq, , skb);
>> @@ -432,6 +442,7 @@ static struct sk_buff *receive_small(struct net_device 
>> *dev,
>>   }
>>   rcu_read_unlock();
>>   +skb_trim(skb, len);
>>   return skb;
>> err_xdp:
>> @@ -569,7 +580,11 @@ static struct sk_buff *receive_mergeable(struct
>> net_device *dev,
>>   if (unlikely(hdr->hdr.gso_type))
>>   goto err_xdp;
>>   +/* Allow consuming headroom but reserve enough space to push
>> 

ANNOUNCE: Netdev 2.1 in Montreal

2017-01-13 Thread Jamal Hadi Salim


Folks,

We are pleased to announce Netdev 2.1 (year 2, conference 1)
in the beautiful city of Montreal, Canada on the 6th to 8th of April.
The website is now online: http://www.netdevconf.org/2.1/
Netdev 2.1 will be held back to back with netconf2017.1
(http://vger.kernel.org/netconf2017.html)

Netdev 2.1 is a  community-driven conference geared towards Linux
netheads. Linux kernel networking and user space utilization of the
interfaces to the Linux kernel networking subsystem are the central
theme.
If you are using Linux as a boot system for proprietary networking,
then this conference _may not be for you_.

Registration costs
--
Very Cheap.
$CAN 300 if you register before Feb 27. $360 after.
$CAN 150 if you are a student. $180 after Feb 27

Why you should register
---
If you yearn for the old community tech driven conferences where
you mingle with fellow geeks (only these would be Linux networking
geeks) then this would be it. There will be no marketing flashing
light openings or loud bad music. Just a pure feed of Linux
networking. Gurus and magicians of all sorts will be there mingling,
juggling and giving talks.

While there will be heavy Linux kernel influence we expect a lot
of user space presence as well.

Exact Location:
-
Tentative: LE WESTIN MONTRÉAL

Sponsorship


If you can help us organize this event by sponsoring, please drop us a 
line to: spon...@netdevconf.org and we'll send you our sponsorship policy.


Important dates

January 18, 2017 Call for proposals open
January 23, 2017 Registration opens
February 20, 2017 Call for proposals close
February 27, 2017 Early Registration close
March 13, 2017 Conference schedule announced
March 27, 2017 Paper submission
March 31, 2017 Online Registration close
April 3, 2017 Slides submission
April 6-8, 2017 Conference days
May 1, 2017 Slides release
June 1, 2017 Paper release

cheers,
jamal


Re: [net PATCH v3 3/5] virtio_net: factor out xdp handler for readability

2017-01-13 Thread John Fastabend
On 17-01-12 11:40 PM, Jason Wang wrote:
> 
> 
> On 2017年01月13日 10:51, John Fastabend wrote:
>> At this point the do_xdp_prog is mostly if/else branches handling
>> the different modes of virtio_net. So remove it and handle running
>> the program in the per mode handlers.
>>
>> Signed-off-by: John Fastabend 
>> ---
>>   drivers/net/virtio_net.c |   76 
>> +-
>>   1 file changed, 28 insertions(+), 48 deletions(-)
>>
>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
>> index 43cb2e0..ec54644 100644
>> --- a/drivers/net/virtio_net.c
>> +++ b/drivers/net/virtio_net.c
>> @@ -388,49 +388,6 @@ static void virtnet_xdp_xmit(struct virtnet_info *vi,
>>   virtqueue_kick(sq->vq);
>>   }
>>   
> 
> [...]
> 
>> /* This happens when rx buffer size is underestimated */
>> @@ -598,8 +570,10 @@ static struct sk_buff *receive_mergeable(struct
>> net_device *dev,
>>   if (unlikely(hdr->hdr.gso_type))
>>   goto err_xdp;
>>   -act = do_xdp_prog(vi, rq, xdp_prog,
>> -  page_address(xdp_page) + offset, len);
>> +data = page_address(xdp_page) + offset;
>> +xdp.data = data + desc_room;
>> +xdp.data_end = xdp.data + (len - vi->hdr_len);
> 
> It looks desc_room is always vi->hdr_len.
> 

Seems to be the case I'll just use vi->hdr_len and remove the variable.

Thanks.

>> +act = bpf_prog_run_xdp(xdp_prog, );
>>   switch (act) {
>>   case XDP_PASS:
>>   /* We can only create skb based on xdp_page. */
>> @@ -613,13 +587,19 @@ static struct sk_buff *receive_mergeable(struct
>> net_device *dev,
>>   }
>>   break;
>>   case XDP_TX:
>> +qp = vi->curr_queue_pairs -
>> +vi->xdp_queue_pairs +
>> +smp_processor_id();
>> +virtnet_xdp_xmit(vi, rq, >sq[qp], , data);
>>   ewma_pkt_len_add(>mrg_avg_pkt_len, len);
>>   if (unlikely(xdp_page != page))
>>   goto err_xdp;
>>   rcu_read_unlock();
>>   goto xdp_xmit;
>> -case XDP_DROP:
>>   default:
>> +bpf_warn_invalid_xdp_action(act);
>> +case XDP_ABORTED:
>> +case XDP_DROP:
>>   if (unlikely(xdp_page != page))
>>   __free_pages(xdp_page, 0);
>>   ewma_pkt_len_add(>mrg_avg_pkt_len, len);
>>
> 



Re: [net PATCH 5/5] virtio_net: XDP support for adjust_head

2017-01-13 Thread John Fastabend
On 17-01-13 09:23 AM, Michael S. Tsirkin wrote:
> On Thu, Jan 12, 2017 at 01:45:19PM -0800, John Fastabend wrote:
>> Add support for XDP adjust head by allocating a 256B header region
>> that XDP programs can grow into. This is only enabled when a XDP
>> program is loaded.
>>
>> In order to ensure that we do not have to unwind queue headroom push
>> queue setup below bpf_prog_add. It reads better to do a prog ref
>> unwind vs another queue setup call.
>>
>> At the moment this code must do a full reset to ensure old buffers
>> without headroom on program add or with headroom on program removal
>> are not used incorrectly in the datapath. Ideally we would only
>> have to disable/enable the RX queues being updated but there is no
>> API to do this at the moment in virtio so use the big hammer. In
>> practice it is likely not that big of a problem as this will only
>> happen when XDP is enabled/disabled changing programs does not
>> require the reset. There is some risk that the driver may either
>> have an allocation failure or for some reason fail to correctly
>> negotiate with the underlying backend in this case the driver will
>> be left uninitialized. I have not seen this ever happen on my test
>> systems and for what its worth this same failure case can occur
>> from probe and other contexts in virtio framework.
>>
>> Signed-off-by: John Fastabend 
>> ---


[...]

>>  
>> +#define VIRTIO_XDP_HEADROOM 256
>> +
>> +static int init_vqs(struct virtnet_info *vi);
>> +static void remove_vq_common(struct virtnet_info *vi, bool lock);
>> +
>> +/* Reset virtio device with RTNL held this is very similar to the
>> + * freeze()/restore() logic except we need to ensure locking. It is
>> + * possible that this routine may fail and leave the driver in a
>> + * failed state. However assuming the driver negotiated correctly
>> + * at probe time we _should_ be able to (re)negotiate driver again.
>> + */
>> +static int virtnet_xdp_reset(struct virtnet_info *vi)
>> +{
>> +struct virtio_device *vdev = vi->vdev;
>> +unsigned int status;
>> +int i, ret;
>> +
>> +/* Disable and unwind rings */
>> +virtio_config_disable(vdev);
>> +vdev->failed = vdev->config->get_status(vdev) & VIRTIO_CONFIG_S_FAILED;
>> +
>> +netif_device_detach(vi->dev);
> 
> After this point, netif_device_present
> will return false, and then we have a bunch of code
> that does
> if (!netif_device_present(dev))
> return -ENODEV;
> 
> 
> so we need to audit this code to make sure it's
> all called under rtnl, correct?
> 

Correct. In the XDP case it is.

> We don't want it to fail because of timing.
> 
> Maybe add an assert there.
> 

I can add an assert here to ensure it doesn't ever get
refactored out or something.

> 
>> +cancel_delayed_work_sync(>refill);
>> +if (netif_running(vi->dev)) {
>> +for (i = 0; i < vi->max_queue_pairs; i++)
>> +napi_disable(>rq[i].napi);
>> +}
>> +
>> +remove_vq_common(vi, false);
>> +
>> +/* Do a reset per virtio spec recommendation */
>> +vdev->config->reset(vdev);
>> +
>> +/* Acknowledge that we've seen the device. */
>> +status = vdev->config->get_status(vdev);
>> +vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_ACKNOWLEDGE);
>> +
>> +/* Notify driver is up and finalize features per specification. The
>> + * error code from finalize features is checked here but should not
>> + * fail because we assume features were previously synced successfully.
>> + */
>> +status = vdev->config->get_status(vdev);
>> +vdev->config->set_status(vdev, status | VIRTIO_CONFIG_S_DRIVER);
>> +ret = virtio_finalize_features(vdev);
> 
> I'd rather we put all this in virtio core, maybe call it virtio_reset or
> something.

At first I started to do this but decided it was easier to open code it I
was on the fence though so if we think it would be cleaner then I will
do it.

The trick is needs to be broken down into two pieces, something like the
following,

virtio_reset() {
do_generic_down_part  -> pci pieces
vdev->config->down()  -> do down part of device specifics
do_generic_up_part
vdev->config->up()-> do up part of device specifics
do_finalize_part
}

Alternatively we could reuse the freeze/restore device callbacks but those
make assumptions about locking. So we could pass a context through but per
Stephen's comment that gets a bit fragile. And sparse doesn't like it either
apparently. I think making it an explicit down/up reset callback might
make it clean and reusable for any other devices.

Any thoughts? My preference outside of open coding it is the new down_reset
and up_reset callbacks.

> 
>> +if (ret) {
>> +netdev_warn(vi->dev, "virtio_finalize_features failed during 
>> reset aborting\n");
>> +goto err;
>> +}
>> +

Re: Commit 1fe8e0... (include more headers in if_tunnel.h) breaks my user-space build.

2017-01-13 Thread Ben Greear

On 01/13/2017 11:41 AM, Stephen Hemminger wrote:

On Fri, 13 Jan 2017 11:12:32 -0800
Ben Greear  wrote:


I am including netinet/ip.h, and also linux/if_tunnel.h, and the linux/ip.h 
conflicts with
netinet/ip.h.

Maybe my build environment is screwed up, but maybe also it would be better to
just let the user include appropriate headers before including if_tunnel.h
and revert this patch?


include/uapi/linux/if_tunnel.h: include linux/if.h, linux/ip.h and linux/in6.h

 Fixes userspace compilation errors like:

 error: field ‘iph’ has incomplete type
 error: field ‘prefix’ has incomplete type

 Signed-off-by: Mikko Rapeli 
 Signed-off-by: David S. Miller 

Thanks,
Ben



What I ended up doing for iproute2 was including all headers used by the source
based on sanitized kernel headers.  Basically
  $ git grep '^#include .*$//' | \
sort -u >linux.headers
   $ for f in $(cat linux.headers)
 do cp ~/kernel/net-next/usr/include/$f include/$f
 done

You can't take only some of the headers, once you decide to diverge from glibc 
provided
headers, you got to take them all.



I do grab a copy of the linux kernel headers and compile against that, but 
netinet/ip.h is
coming from the OS.  Do you mean I should not include netinet/ip.h and instead 
use linux/ip.h?

Thanks,
Ben

--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com



Re: Commit 1fe8e0... (include more headers in if_tunnel.h) breaks my user-space build.

2017-01-13 Thread Stephen Hemminger
On Fri, 13 Jan 2017 11:12:32 -0800
Ben Greear  wrote:

> I am including netinet/ip.h, and also linux/if_tunnel.h, and the linux/ip.h 
> conflicts with
> netinet/ip.h.
> 
> Maybe my build environment is screwed up, but maybe also it would be better to
> just let the user include appropriate headers before including if_tunnel.h
> and revert this patch?
> 
> 
> include/uapi/linux/if_tunnel.h: include linux/if.h, linux/ip.h and linux/in6.h
> 
>  Fixes userspace compilation errors like:
> 
>  error: field ‘iph’ has incomplete type
>  error: field ‘prefix’ has incomplete type
> 
>  Signed-off-by: Mikko Rapeli 
>  Signed-off-by: David S. Miller 
> 
> Thanks,
> Ben
> 

What I ended up doing for iproute2 was including all headers used by the source
based on sanitized kernel headers.  Basically
  $ git grep '^#include .*$//' | \
sort -u >linux.headers
   $ for f in $(cat linux.headers)
 do cp ~/kernel/net-next/usr/include/$f include/$f
 done

You can't take only some of the headers, once you decide to diverge from glibc 
provided
headers, you got to take them all.


Re: Commit 1fe8e0... (include more headers in if_tunnel.h) breaks my user-space build.

2017-01-13 Thread Ben Greear

On 01/13/2017 11:12 AM, Ben Greear wrote:

I am including netinet/ip.h, and also linux/if_tunnel.h, and the linux/ip.h 
conflicts with
netinet/ip.h.

Maybe my build environment is screwed up, but maybe also it would be better to
just let the user include appropriate headers before including if_tunnel.h
and revert this patch?


include/uapi/linux/if_tunnel.h: include linux/if.h, linux/ip.h and linux/in6.h

Fixes userspace compilation errors like:

error: field ‘iph’ has incomplete type
error: field ‘prefix’ has incomplete type

Signed-off-by: Mikko Rapeli 
Signed-off-by: David S. Miller 

Thanks,
Ben



I forgot the full commit ID, my abbreviation was not sufficient to be unique it 
seems:

1fe8e0f074c77aa41aaa579345a9e675acbebfa9

Thanks,
Ben

--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com



Commit 1fe8e0... (include more headers in if_tunnel.h) breaks my user-space build.

2017-01-13 Thread Ben Greear

I am including netinet/ip.h, and also linux/if_tunnel.h, and the linux/ip.h 
conflicts with
netinet/ip.h.

Maybe my build environment is screwed up, but maybe also it would be better to
just let the user include appropriate headers before including if_tunnel.h
and revert this patch?


include/uapi/linux/if_tunnel.h: include linux/if.h, linux/ip.h and linux/in6.h

Fixes userspace compilation errors like:

error: field ‘iph’ has incomplete type
error: field ‘prefix’ has incomplete type

Signed-off-by: Mikko Rapeli 
Signed-off-by: David S. Miller 

Thanks,
Ben

--
Ben Greear 
Candela Technologies Inc  http://www.candelatech.com



[PATCHv3 net-next 7/7] sctp: implement sender-side procedures for SSN Reset Request Parameter

2017-01-13 Thread Xin Long
This patch is to implement sender-side procedures for the Outgoing
and Incoming SSN Reset Request Parameter described in rfc6525 section
5.1.2 and 5.1.3.

It is also add sockopt SCTP_RESET_STREAMS in rfc6525 section 6.3.2
for users.

Note that the new asoc member strreset_outstanding is to make sure
only one reconf request chunk on the fly as rfc6525 section 5.1.1
demands.

Signed-off-by: Xin Long 
---
 include/net/sctp/sctp.h|  6 
 include/net/sctp/structs.h |  1 +
 include/uapi/linux/sctp.h  | 11 +++
 net/sctp/outqueue.c| 33 +--
 net/sctp/socket.c  | 29 +
 net/sctp/stream.c  | 79 ++
 6 files changed, 149 insertions(+), 10 deletions(-)

diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index bc0e049..3cfd365b 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -194,6 +194,12 @@ void sctp_remaddr_proc_exit(struct net *net);
 int sctp_offload_init(void);
 
 /*
+ * sctp/stream.c
+ */
+int sctp_send_reset_streams(struct sctp_association *asoc,
+   struct sctp_reset_streams *params);
+
+/*
  * Module global variables
  */
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index d99b76e..231fa9ac 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1875,6 +1875,7 @@ struct sctp_association {
 reconf_enable:1;
 
__u8 strreset_enable;
+   __u8 strreset_outstanding; /* request param count on the fly */
 
__u32 strreset_outseq; /* Update after receiving response */
__u32 strreset_inseq; /* Update after receiving request */
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index 867be0f..03c27ce 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -116,6 +116,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_DEFAULT_PRINFO114
 #define SCTP_PR_ASSOC_STATUS   115
 #define SCTP_ENABLE_STREAM_RESET   118
+#define SCTP_RESET_STREAMS 119
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE  0x
@@ -145,6 +146,9 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_ENABLE_CHANGE_ASSOC_REQ   0x04
 #define SCTP_ENABLE_STRRESET_MASK  0x07
 
+#define SCTP_STREAM_RESET_INCOMING 0x01
+#define SCTP_STREAM_RESET_OUTGOING 0x02
+
 /* These are bit fields for msghdr->msg_flags.  See section 5.1.  */
 /* On user space Linux, these live in  as an enum.  */
 enum sctp_msg_flags {
@@ -1015,4 +1019,11 @@ struct sctp_info {
__u32   __reserved3;
 };
 
+struct sctp_reset_streams {
+   sctp_assoc_t srs_assoc_id;
+   uint16_t srs_flags;
+   uint16_t srs_number_streams;/* 0 == ALL */
+   uint16_t srs_stream_list[]; /* list if srs_num_streams is not 0 */
+};
+
 #endif /* _UAPI_SCTP_H */
diff --git a/net/sctp/outqueue.c b/net/sctp/outqueue.c
index 34efaa4..65abe22 100644
--- a/net/sctp/outqueue.c
+++ b/net/sctp/outqueue.c
@@ -915,22 +915,28 @@ static void sctp_outq_flush(struct sctp_outq *q, int 
rtx_timeout, gfp_t gfp)
case SCTP_CID_ECN_ECNE:
case SCTP_CID_ASCONF:
case SCTP_CID_FWD_TSN:
+   case SCTP_CID_RECONF:
status = sctp_packet_transmit_chunk(packet, chunk,
one_packet, gfp);
if (status  != SCTP_XMIT_OK) {
/* put the chunk back */
list_add(>list, >control_chunk_list);
-   } else {
-   asoc->stats.octrlchunks++;
-   /* PR-SCTP C5) If a FORWARD TSN is sent, the
-* sender MUST assure that at least one T3-rtx
-* timer is running.
-*/
-   if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) 
{
-   sctp_transport_reset_t3_rtx(transport);
-   transport->last_time_sent = jiffies;
-   }
+   break;
+   }
+
+   asoc->stats.octrlchunks++;
+   /* PR-SCTP C5) If a FORWARD TSN is sent, the
+* sender MUST assure that at least one T3-rtx
+* timer is running.
+*/
+   if (chunk->chunk_hdr->type == SCTP_CID_FWD_TSN) {
+   sctp_transport_reset_t3_rtx(transport);
+   transport->last_time_sent = jiffies;
}
+
+   if (chunk == asoc->strreset_chunk)
+   sctp_transport_reset_reconf_timer(transport);
+
break;
 
default:
@@ -1016,6 +1022,8 @@ static void 

[PATCHv3 net-next 6/7] sctp: add sockopt SCTP_ENABLE_STREAM_RESET

2017-01-13 Thread Xin Long
This patch is to add sockopt SCTP_ENABLE_STREAM_RESET to get/set
strreset_enable to indicate which reconf request type it supports,
which is described in rfc6525 section 6.3.1.

Signed-off-by: Xin Long 
---
 include/net/sctp/structs.h |  4 +++
 include/uapi/linux/sctp.h  |  7 
 net/sctp/associola.c   |  1 +
 net/sctp/socket.c  | 84 ++
 4 files changed, 96 insertions(+)

diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index ee037ef..d99b76e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1257,6 +1257,8 @@ struct sctp_endpoint {
__u8  auth_enable:1,
  prsctp_enable:1,
  reconf_enable:1;
+
+   __u8  strreset_enable;
 };
 
 /* Recover the outter endpoint structure. */
@@ -1872,6 +1874,8 @@ struct sctp_association {
 prsctp_enable:1,
 reconf_enable:1;
 
+   __u8 strreset_enable;
+
__u32 strreset_outseq; /* Update after receiving response */
__u32 strreset_inseq; /* Update after receiving request */
 
diff --git a/include/uapi/linux/sctp.h b/include/uapi/linux/sctp.h
index a406adc..867be0f 100644
--- a/include/uapi/linux/sctp.h
+++ b/include/uapi/linux/sctp.h
@@ -115,6 +115,7 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_PR_SUPPORTED  113
 #define SCTP_DEFAULT_PRINFO114
 #define SCTP_PR_ASSOC_STATUS   115
+#define SCTP_ENABLE_STREAM_RESET   118
 
 /* PR-SCTP policies */
 #define SCTP_PR_SCTP_NONE  0x
@@ -138,6 +139,12 @@ typedef __s32 sctp_assoc_t;
 #define SCTP_PR_RTX_ENABLED(x) (SCTP_PR_POLICY(x) == SCTP_PR_SCTP_RTX)
 #define SCTP_PR_PRIO_ENABLED(x)(SCTP_PR_POLICY(x) == SCTP_PR_SCTP_PRIO)
 
+/* For enable stream reset */
+#define SCTP_ENABLE_RESET_STREAM_REQ   0x01
+#define SCTP_ENABLE_RESET_ASSOC_REQ0x02
+#define SCTP_ENABLE_CHANGE_ASSOC_REQ   0x04
+#define SCTP_ENABLE_STRRESET_MASK  0x07
+
 /* These are bit fields for msghdr->msg_flags.  See section 5.1.  */
 /* On user space Linux, these live in  as an enum.  */
 enum sctp_msg_flags {
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 68b99ad..e50dc6d 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -271,6 +271,7 @@ static struct sctp_association 
*sctp_association_init(struct sctp_association *a
asoc->active_key_id = ep->active_key_id;
asoc->prsctp_enable = ep->prsctp_enable;
asoc->reconf_enable = ep->reconf_enable;
+   asoc->strreset_enable = ep->strreset_enable;
 
/* Save the hmacs and chunks list into this association */
if (ep->auth_hmacs_list)
diff --git a/net/sctp/socket.c b/net/sctp/socket.c
index 318c678..ae07db4 100644
--- a/net/sctp/socket.c
+++ b/net/sctp/socket.c
@@ -3751,6 +3751,42 @@ static int sctp_setsockopt_default_prinfo(struct sock 
*sk,
return retval;
 }
 
+static int sctp_setsockopt_enable_strreset(struct sock *sk,
+  char __user *optval,
+  unsigned int optlen)
+{
+   struct sctp_assoc_value params;
+   struct sctp_association *asoc;
+   int retval = -EINVAL;
+
+   if (optlen != sizeof(params))
+   goto out;
+
+   if (copy_from_user(, optval, optlen)) {
+   retval = -EFAULT;
+   goto out;
+   }
+
+   if (params.assoc_value & (~SCTP_ENABLE_STRRESET_MASK))
+   goto out;
+
+   asoc = sctp_id2assoc(sk, params.assoc_id);
+   if (asoc) {
+   asoc->strreset_enable = params.assoc_value;
+   } else if (!params.assoc_id) {
+   struct sctp_sock *sp = sctp_sk(sk);
+
+   sp->ep->strreset_enable = params.assoc_value;
+   } else {
+   goto out;
+   }
+
+   retval = 0;
+
+out:
+   return retval;
+}
+
 /* API 6.2 setsockopt(), getsockopt()
  *
  * Applications use setsockopt() and getsockopt() to set or retrieve
@@ -3917,6 +3953,9 @@ static int sctp_setsockopt(struct sock *sk, int level, 
int optname,
case SCTP_DEFAULT_PRINFO:
retval = sctp_setsockopt_default_prinfo(sk, optval, optlen);
break;
+   case SCTP_ENABLE_STREAM_RESET:
+   retval = sctp_setsockopt_enable_strreset(sk, optval, optlen);
+   break;
default:
retval = -ENOPROTOOPT;
break;
@@ -6401,6 +6440,47 @@ static int sctp_getsockopt_pr_assocstatus(struct sock 
*sk, int len,
return retval;
 }
 
+static int sctp_getsockopt_enable_strreset(struct sock *sk, int len,
+  char __user *optval,
+  int __user *optlen)
+{
+   struct sctp_assoc_value params;
+   struct sctp_association *asoc;
+   int retval = -EFAULT;
+
+   if (len < sizeof(params)) {
+   retval = -EINVAL;
+   goto out;
+   }
+
+   len = 

[PATCHv3 net-next 4/7] sctp: add stream reconf primitive

2017-01-13 Thread Xin Long
This patch is to add a primitive based on sctp primitive frame for
sending stream reconf request. It works as the other primitives,
and create a SCTP_CMD_REPLY command to send the request chunk out.

sctp_primitive_RECONF would be the api to send a reconf request
chunk.

Signed-off-by: Xin Long 
---
 include/net/sctp/constants.h |  3 ++-
 include/net/sctp/sctp.h  |  2 ++
 include/net/sctp/sm.h|  1 +
 net/sctp/primitive.c |  3 +++
 net/sctp/sm_statefuns.c  | 13 +
 net/sctp/sm_statetable.c | 20 
 6 files changed, 41 insertions(+), 1 deletion(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 8307c86..3567c97 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -114,9 +114,10 @@ typedef enum {
SCTP_PRIMITIVE_SEND,
SCTP_PRIMITIVE_REQUESTHEARTBEAT,
SCTP_PRIMITIVE_ASCONF,
+   SCTP_PRIMITIVE_RECONF,
 } sctp_event_primitive_t;
 
-#define SCTP_EVENT_PRIMITIVE_MAX   SCTP_PRIMITIVE_ASCONF
+#define SCTP_EVENT_PRIMITIVE_MAX   SCTP_PRIMITIVE_RECONF
 #define SCTP_NUM_PRIMITIVE_TYPES   (SCTP_EVENT_PRIMITIVE_MAX + 1)
 
 /* We define here a utility type for manipulating subtypes.
diff --git a/include/net/sctp/sctp.h b/include/net/sctp/sctp.h
index 598d938..bc0e049 100644
--- a/include/net/sctp/sctp.h
+++ b/include/net/sctp/sctp.h
@@ -141,6 +141,8 @@ int sctp_primitive_ABORT(struct net *, struct 
sctp_association *, void *arg);
 int sctp_primitive_SEND(struct net *, struct sctp_association *, void *arg);
 int sctp_primitive_REQUESTHEARTBEAT(struct net *, struct sctp_association *, 
void *arg);
 int sctp_primitive_ASCONF(struct net *, struct sctp_association *, void *arg);
+int sctp_primitive_RECONF(struct net *net, struct sctp_association *asoc,
+ void *arg);
 
 /*
  * sctp/input.c
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index d2d9e28..430ed13 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -157,6 +157,7 @@ sctp_state_fn_t sctp_sf_error_shutdown;
 sctp_state_fn_t sctp_sf_ignore_primitive;
 sctp_state_fn_t sctp_sf_do_prm_requestheartbeat;
 sctp_state_fn_t sctp_sf_do_prm_asconf;
+sctp_state_fn_t sctp_sf_do_prm_reconf;
 
 /* Prototypes for other event state functions.  */
 sctp_state_fn_t sctp_sf_do_no_pending_tsn;
diff --git a/net/sctp/primitive.c b/net/sctp/primitive.c
index ab8d9f9..f0553a0 100644
--- a/net/sctp/primitive.c
+++ b/net/sctp/primitive.c
@@ -211,3 +211,6 @@ DECLARE_PRIMITIVE(REQUESTHEARTBEAT);
 */
 
 DECLARE_PRIMITIVE(ASCONF);
+
+/* RE-CONFIG 5.1 */
+DECLARE_PRIMITIVE(RECONF);
diff --git a/net/sctp/sm_statefuns.c b/net/sctp/sm_statefuns.c
index 2ae186a..782e579 100644
--- a/net/sctp/sm_statefuns.c
+++ b/net/sctp/sm_statefuns.c
@@ -5185,6 +5185,19 @@ sctp_disposition_t sctp_sf_do_prm_asconf(struct net *net,
return SCTP_DISPOSITION_CONSUME;
 }
 
+/* RE-CONFIG Section 5.1 RECONF Chunk Procedures */
+sctp_disposition_t sctp_sf_do_prm_reconf(struct net *net,
+const struct sctp_endpoint *ep,
+const struct sctp_association *asoc,
+const sctp_subtype_t type,
+void *arg, sctp_cmd_seq_t *commands)
+{
+   struct sctp_chunk *chunk = arg;
+
+   sctp_add_cmd_sf(commands, SCTP_CMD_REPLY, SCTP_CHUNK(chunk));
+   return SCTP_DISPOSITION_CONSUME;
+}
+
 /*
  * Ignore the primitive event
  *
diff --git a/net/sctp/sm_statetable.c b/net/sctp/sm_statetable.c
index 3da521a..b5438b4 100644
--- a/net/sctp/sm_statetable.c
+++ b/net/sctp/sm_statetable.c
@@ -643,6 +643,25 @@ chunk_event_table_unknown[SCTP_STATE_NUM_STATES] = {
TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
 } /* TYPE_SCTP_PRIMITIVE_ASCONF */
 
+#define TYPE_SCTP_PRIMITIVE_RECONF { \
+   /* SCTP_STATE_CLOSED */ \
+   TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+   /* SCTP_STATE_COOKIE_WAIT */ \
+   TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+   /* SCTP_STATE_COOKIE_ECHOED */ \
+   TYPE_SCTP_FUNC(sctp_sf_error_closed), \
+   /* SCTP_STATE_ESTABLISHED */ \
+   TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+   /* SCTP_STATE_SHUTDOWN_PENDING */ \
+   TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+   /* SCTP_STATE_SHUTDOWN_SENT */ \
+   TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+   /* SCTP_STATE_SHUTDOWN_RECEIVED */ \
+   TYPE_SCTP_FUNC(sctp_sf_do_prm_reconf), \
+   /* SCTP_STATE_SHUTDOWN_ACK_SENT */ \
+   TYPE_SCTP_FUNC(sctp_sf_error_shutdown), \
+} /* TYPE_SCTP_PRIMITIVE_RECONF */
+
 /* The primary index for this table is the primitive type.
  * The secondary index for this table is the state.
  */
@@ -653,6 +672,7 @@ static const sctp_sm_table_entry_t 
primitive_event_table[SCTP_NUM_PRIMITIVE_TYPE
TYPE_SCTP_PRIMITIVE_SEND,
TYPE_SCTP_PRIMITIVE_REQUESTHEARTBEAT,
TYPE_SCTP_PRIMITIVE_ASCONF,
+  

[PATCHv3 net-next 5/7] sctp: add reconf_enable in asoc ep and netns

2017-01-13 Thread Xin Long
This patch is to add reconf_enable field in all of asoc ep and netns
to indicate if they support stream reset.

When initializing, asoc reconf_enable get the default value from ep
reconf_enable which is from netns netns reconf_enable by default.

It is also to add reconf_capable in asoc peer part to know if peer
supports reconf_enable, the value is set if ext params have reconf
chunk support when processing init chunk, just as rfc6525 section
5.1.1 demands.

Signed-off-by: Xin Long 
---
 include/net/netns/sctp.h   |  3 +++
 include/net/sctp/structs.h |  7 +--
 net/sctp/associola.c   |  1 +
 net/sctp/endpointola.c |  1 +
 net/sctp/protocol.c|  3 +++
 net/sctp/sm_make_chunk.c   | 15 +++
 6 files changed, 28 insertions(+), 2 deletions(-)

diff --git a/include/net/netns/sctp.h b/include/net/netns/sctp.h
index c501d67..b7871d0 100644
--- a/include/net/netns/sctp.h
+++ b/include/net/netns/sctp.h
@@ -118,6 +118,9 @@ struct netns_sctp {
/* Flag to indicate if PR-SCTP is enabled. */
int prsctp_enable;
 
+   /* Flag to indicate if PR-CONFIG is enabled. */
+   int reconf_enable;
+
/* Flag to idicate if SCTP-AUTH is enabled */
int auth_enable;
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 463b4d6..ee037ef 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1255,7 +1255,8 @@ struct sctp_endpoint {
struct list_head endpoint_shared_keys;
__u16 active_key_id;
__u8  auth_enable:1,
- prsctp_enable:1;
+ prsctp_enable:1,
+ reconf_enable:1;
 };
 
 /* Recover the outter endpoint structure. */
@@ -1508,6 +1509,7 @@ struct sctp_association {
hostname_address:1, /* Peer understands DNS addresses? 
*/
asconf_capable:1,   /* Does peer support ADDIP? */
prsctp_capable:1,   /* Can peer do PR-SCTP? */
+   reconf_capable:1,   /* Can peer do RE-CONFIG? */
auth_capable:1; /* Is peer doing SCTP-AUTH? */
 
/* sack_needed : This flag indicates if the next received
@@ -1867,7 +1869,8 @@ struct sctp_association {
 
__u8 need_ecne:1,   /* Need to send an ECNE Chunk? */
 temp:1,/* Is it a temporary association? */
-prsctp_enable:1;
+prsctp_enable:1,
+reconf_enable:1;
 
__u32 strreset_outseq; /* Update after receiving response */
__u32 strreset_inseq; /* Update after receiving request */
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index fc33540..68b99ad 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -270,6 +270,7 @@ static struct sctp_association 
*sctp_association_init(struct sctp_association *a
 
asoc->active_key_id = ep->active_key_id;
asoc->prsctp_enable = ep->prsctp_enable;
+   asoc->reconf_enable = ep->reconf_enable;
 
/* Save the hmacs and chunks list into this association */
if (ep->auth_hmacs_list)
diff --git a/net/sctp/endpointola.c b/net/sctp/endpointola.c
index 410ddc1..8c58923 100644
--- a/net/sctp/endpointola.c
+++ b/net/sctp/endpointola.c
@@ -164,6 +164,7 @@ static struct sctp_endpoint *sctp_endpoint_init(struct 
sctp_endpoint *ep,
ep->auth_hmacs_list = auth_hmacs;
ep->auth_chunk_list = auth_chunks;
ep->prsctp_enable = net->sctp.prsctp_enable;
+   ep->reconf_enable = net->sctp.reconf_enable;
 
return ep;
 
diff --git a/net/sctp/protocol.c b/net/sctp/protocol.c
index f9c3c37..8227bbb 100644
--- a/net/sctp/protocol.c
+++ b/net/sctp/protocol.c
@@ -1258,6 +1258,9 @@ static int __net_init sctp_defaults_init(struct net *net)
/* Enable PR-SCTP by default. */
net->sctp.prsctp_enable = 1;
 
+   /* Disable RECONF by default. */
+   net->sctp.reconf_enable = 0;
+
/* Disable AUTH by default. */
net->sctp.auth_enable = 0;
 
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index 172385c..c4a0a9c 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -270,6 +270,11 @@ struct sctp_chunk *sctp_make_init(const struct 
sctp_association *asoc,
num_ext += 2;
}
 
+   if (asoc->reconf_enable) {
+   extensions[num_ext] = SCTP_CID_RECONF;
+   num_ext += 1;
+   }
+
if (sp->adaptation_ind)
chunksize += sizeof(aiparam);
 
@@ -434,6 +439,11 @@ struct sctp_chunk *sctp_make_init_ack(const struct 
sctp_association *asoc,
num_ext += 2;
}
 
+   if (asoc->peer.reconf_capable) {
+   extensions[num_ext] = SCTP_CID_RECONF;
+   num_ext += 1;
+   }
+
if (sp->adaptation_ind)
chunksize += sizeof(aiparam);
 
@@ -2012,6 +2022,11 @@ static void sctp_process_ext_param(struct 
sctp_association *asoc,
 
for (i = 0; 

[PATCHv3 net-next 2/7] sctp: add support for generating stream reconf ssn reset request chunk

2017-01-13 Thread Xin Long
This patch is to add asoc strreset_outseq and strreset_inseq for
saving the reconf request sequence, initialize them when create
assoc and process init, and also to define Incoming and Outgoing
SSN Reset Request Parameter described in rfc6525 section 4.1 and
4.2, As they can be in one same chunk as section rfc6525 3.1-3
describes, it makes them in one function.

Signed-off-by: Xin Long 
---
 include/linux/sctp.h   | 26 ++
 include/net/sctp/sm.h  |  5 ++-
 include/net/sctp/structs.h |  3 ++
 net/sctp/associola.c   |  1 +
 net/sctp/sm_make_chunk.c   | 88 ++
 5 files changed, 122 insertions(+), 1 deletion(-)

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index cdc3b05..d5da19c 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -200,6 +200,13 @@ typedef enum {
SCTP_PARAM_SUCCESS_REPORT   = cpu_to_be16(0xc005),
SCTP_PARAM_ADAPTATION_LAYER_IND = cpu_to_be16(0xc006),
 
+   /* RE-CONFIG. Section 4 */
+   SCTP_PARAM_RESET_OUT_REQUEST= cpu_to_be16(0x000d),
+   SCTP_PARAM_RESET_IN_REQUEST = cpu_to_be16(0x000e),
+   SCTP_PARAM_RESET_TSN_REQUEST= cpu_to_be16(0x000f),
+   SCTP_PARAM_RESET_RESPONSE   = cpu_to_be16(0x0010),
+   SCTP_PARAM_RESET_ADD_OUT_STREAMS= cpu_to_be16(0x0011),
+   SCTP_PARAM_RESET_ADD_IN_STREAMS = cpu_to_be16(0x0012),
 } sctp_param_t; /* enum */
 
 
@@ -716,4 +723,23 @@ struct sctp_reconf_chunk {
__u8 params[0];
 } __packed;
 
+struct sctp_strreset_req {
+   sctp_paramhdr_t param_hdr;
+   __u32 request_seq;
+} __packed;
+
+struct sctp_strreset_outreq {
+   sctp_paramhdr_t param_hdr;
+   __u32 request_seq;
+   __u32 response_seq;
+   __u32 send_reset_at_tsn;
+   __u16 list_of_streams[0];
+} __packed;
+
+struct sctp_strreset_inreq {
+   sctp_paramhdr_t param_hdr;
+   __u32 request_seq;
+   __u16 list_of_streams[0];
+} __packed;
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index ca6c971..3462cb0 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -259,7 +259,10 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct 
sctp_association *asoc,
__u32 new_cum_tsn, size_t nstreams,
struct sctp_fwdtsn_skip *skiplist);
 struct sctp_chunk *sctp_make_auth(const struct sctp_association *asoc);
-
+struct sctp_chunk *sctp_make_strreset_req(
+   const struct sctp_association *asoc,
+   __u16 stream_num, __u16 *stream_list,
+   bool out, bool in);
 void sctp_chunk_assign_tsn(struct sctp_chunk *);
 void sctp_chunk_assign_ssn(struct sctp_chunk *);
 
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 4741ec2..3dc983e 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -1865,6 +1865,9 @@ struct sctp_association {
 temp:1,/* Is it a temporary association? */
 prsctp_enable:1;
 
+   __u32 strreset_outseq; /* Update after receiving response */
+   __u32 strreset_inseq; /* Update after receiving request */
+
struct sctp_priv_assoc_stats stats;
 
int sent_cnt_removable;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 36294f7..42ece6f 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -207,6 +207,7 @@ static struct sctp_association 
*sctp_association_init(struct sctp_association *a
 * association to the same value as the initial TSN.
 */
asoc->addip_serial = asoc->c.initial_tsn;
+   asoc->strreset_outseq = asoc->c.initial_tsn;
 
INIT_LIST_HEAD(>addip_chunk_list);
INIT_LIST_HEAD(>asconf_ack_list);
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index fd58097..172385c 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -1844,6 +1844,7 @@ struct sctp_association *sctp_unpack_cookie(
retval->next_tsn = retval->c.initial_tsn;
retval->ctsn_ack_point = retval->next_tsn - 1;
retval->addip_serial = retval->c.initial_tsn;
+   retval->strreset_outseq = retval->c.initial_tsn;
retval->adv_peer_ack_point = retval->ctsn_ack_point;
retval->peer.prsctp_capable = retval->c.prsctp_capable;
retval->peer.adaptation_ind = retval->c.adaptation_ind;
@@ -2387,6 +2388,8 @@ int sctp_process_init(struct sctp_association *asoc, 
struct sctp_chunk *chunk,
asoc->peer.i.initial_tsn =
ntohl(peer_init->init_hdr.initial_tsn);
 
+   asoc->strreset_inseq = asoc->peer.i.initial_tsn;
+
/* Apply the upper bounds for output streams based on peer's
 * number of inbound streams.
 */
@@ -3559,3 +3562,88 @@ static struct sctp_chunk *sctp_make_reconf(
 
return retval;
 }

[PATCHv3 net-next 3/7] sctp: add stream reconf timer

2017-01-13 Thread Xin Long
This patch is to add a per transport timer based on sctp timer frame
for stream reconf chunk retransmission. It would start after sending
a reconf request chunk, and stop after receiving the response chunk.

If the timer expires, besides retransmitting the reconf request chunk,
it would also do the same thing with data RTO timer. like to increase
the appropriate error counts, and perform threshold management, possibly
destroying the asoc if sctp retransmission thresholds are exceeded, just
as section 5.1.1 describes.

This patch is also to add asoc strreset_chunk, it is used to save the
reconf request chunk, so that it can be retransmitted, and to check if
the response is really for this request by comparing the information
inside with the response chunk as well.

Signed-off-by: Xin Long 
---
 include/net/sctp/constants.h |  1 +
 include/net/sctp/sm.h|  2 ++
 include/net/sctp/structs.h   |  6 ++
 net/sctp/associola.c |  9 +
 net/sctp/sm_sideeffect.c | 32 
 net/sctp/sm_statefuns.c  | 28 
 net/sctp/sm_statetable.c | 20 
 net/sctp/transport.c | 17 +++--
 8 files changed, 113 insertions(+), 2 deletions(-)

diff --git a/include/net/sctp/constants.h b/include/net/sctp/constants.h
index 5b847e4..8307c86 100644
--- a/include/net/sctp/constants.h
+++ b/include/net/sctp/constants.h
@@ -90,6 +90,7 @@ typedef enum {
SCTP_EVENT_TIMEOUT_T4_RTO,
SCTP_EVENT_TIMEOUT_T5_SHUTDOWN_GUARD,
SCTP_EVENT_TIMEOUT_HEARTBEAT,
+   SCTP_EVENT_TIMEOUT_RECONF,
SCTP_EVENT_TIMEOUT_SACK,
SCTP_EVENT_TIMEOUT_AUTOCLOSE,
 } sctp_event_timeout_t;
diff --git a/include/net/sctp/sm.h b/include/net/sctp/sm.h
index 3462cb0..d2d9e28 100644
--- a/include/net/sctp/sm.h
+++ b/include/net/sctp/sm.h
@@ -167,6 +167,7 @@ sctp_state_fn_t sctp_sf_cookie_wait_icmp_abort;
 
 /* Prototypes for timeout event state functions.  */
 sctp_state_fn_t sctp_sf_do_6_3_3_rtx;
+sctp_state_fn_t sctp_sf_send_reconf;
 sctp_state_fn_t sctp_sf_do_6_2_sack;
 sctp_state_fn_t sctp_sf_autoclose_timer_expire;
 
@@ -278,6 +279,7 @@ int sctp_do_sm(struct net *net, sctp_event_t event_type, 
sctp_subtype_t subtype,
 /* 2nd level prototypes */
 void sctp_generate_t3_rtx_event(unsigned long peer);
 void sctp_generate_heartbeat_event(unsigned long peer);
+void sctp_generate_reconf_event(unsigned long peer);
 void sctp_generate_proto_unreach_event(unsigned long peer);
 
 void sctp_ootb_pkt_free(struct sctp_packet *);
diff --git a/include/net/sctp/structs.h b/include/net/sctp/structs.h
index 3dc983e..463b4d6 100644
--- a/include/net/sctp/structs.h
+++ b/include/net/sctp/structs.h
@@ -877,6 +877,9 @@ struct sctp_transport {
/* Timer to handle ICMP proto unreachable envets */
struct timer_list proto_unreach_timer;
 
+   /* Timer to handler reconf chunk rtx */
+   struct timer_list reconf_timer;
+
/* Since we're using per-destination retransmission timers
 * (see above), we're also using per-destination "transmitted"
 * queues.  This probably ought to be a private struct
@@ -935,6 +938,7 @@ void sctp_transport_pmtu(struct sctp_transport *, struct 
sock *sk);
 void sctp_transport_free(struct sctp_transport *);
 void sctp_transport_reset_t3_rtx(struct sctp_transport *);
 void sctp_transport_reset_hb_timer(struct sctp_transport *);
+void sctp_transport_reset_reconf_timer(struct sctp_transport *transport);
 int sctp_transport_hold(struct sctp_transport *);
 void sctp_transport_put(struct sctp_transport *);
 void sctp_transport_update_rto(struct sctp_transport *, __u32);
@@ -1868,6 +1872,8 @@ struct sctp_association {
__u32 strreset_outseq; /* Update after receiving response */
__u32 strreset_inseq; /* Update after receiving request */
 
+   struct sctp_chunk *strreset_chunk; /* save request chunk */
+
struct sctp_priv_assoc_stats stats;
 
int sent_cnt_removable;
diff --git a/net/sctp/associola.c b/net/sctp/associola.c
index 42ece6f..fc33540 100644
--- a/net/sctp/associola.c
+++ b/net/sctp/associola.c
@@ -362,6 +362,9 @@ void sctp_association_free(struct sctp_association *asoc)
/* Free stream information. */
sctp_stream_free(asoc->stream);
 
+   if (asoc->strreset_chunk)
+   sctp_chunk_free(asoc->strreset_chunk);
+
/* Clean up the bound address list. */
sctp_bind_addr_free(>base.bind_addr);
 
@@ -520,6 +523,12 @@ void sctp_assoc_rm_peer(struct sctp_association *asoc,
if (asoc->peer.last_data_from == peer)
asoc->peer.last_data_from = transport;
 
+   if (asoc->strreset_chunk &&
+   asoc->strreset_chunk->transport == peer) {
+   asoc->strreset_chunk->transport = transport;
+   sctp_transport_reset_reconf_timer(transport);
+   }
+
/* If we remove the transport an INIT was last sent to, set it to
  

[PATCHv3 net-next 1/7] sctp: add a common helper function to generate stream reconf chunk

2017-01-13 Thread Xin Long
This patch is to define a common api used to alloc memory and initialize
reconf chunk header that described in rfc6525 section 3.1.

All reconf chunks will be generated by calling this helper function.

Signed-off-by: Xin Long 
---
 include/linux/sctp.h |  6 ++
 net/sctp/sm_make_chunk.c | 33 +
 2 files changed, 39 insertions(+)

diff --git a/include/linux/sctp.h b/include/linux/sctp.h
index fcb4c36..cdc3b05 100644
--- a/include/linux/sctp.h
+++ b/include/linux/sctp.h
@@ -108,6 +108,7 @@ typedef enum {
/* Use hex, as defined in ADDIP sec. 3.1 */
SCTP_CID_ASCONF = 0xC1,
SCTP_CID_ASCONF_ACK = 0x80,
+   SCTP_CID_RECONF = 0x82,
 } sctp_cid_t; /* enum */
 
 
@@ -710,4 +711,9 @@ struct sctp_infox {
struct sctp_association *asoc;
 };
 
+struct sctp_reconf_chunk {
+   sctp_chunkhdr_t chunk_hdr;
+   __u8 params[0];
+} __packed;
+
 #endif /* __LINUX_SCTP_H__ */
diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
index a15d824..fd58097 100644
--- a/net/sctp/sm_make_chunk.c
+++ b/net/sctp/sm_make_chunk.c
@@ -3526,3 +3526,36 @@ struct sctp_chunk *sctp_make_fwdtsn(const struct 
sctp_association *asoc,
 
return retval;
 }
+
+/* RE-CONFIG 3.1 (RE-CONFIG chunk)
+ *   0   1   2   3
+ *   0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  | Type = 130|  Chunk Flags  |  Chunk Length |
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  \   \
+ *  /  Re-configuration Parameter   /
+ *  \   \
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ *  \   \
+ *  / Re-configuration Parameter (optional) /
+ *  \   \
+ *  +-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
+ */
+static struct sctp_chunk *sctp_make_reconf(
+   const struct sctp_association *asoc,
+   int length)
+{
+   struct sctp_reconf_chunk *reconf;
+   struct sctp_chunk *retval;
+
+   retval = sctp_make_control(asoc, SCTP_CID_RECONF, 0, length,
+  GFP_ATOMIC);
+   if (!retval)
+   return NULL;
+
+   reconf = (struct sctp_reconf_chunk *)retval->chunk_hdr;
+   retval->param_hdr.v = reconf->params;
+
+   return retval;
+}
-- 
2.1.0



[PATCHv3 net-next 0/7] sctp: add sender-side procedures for stream reconf ssn reset request chunk

2017-01-13 Thread Xin Long
Patch 7/7 is to implement sender-side procedures for the Outgoing
and Incoming SSN Reset Request Parameter described in rfc6525
section 5.1.2 and 5.1.3

Patches 1-6/7 are ahead of it to define some apis and asoc members
for it.

Note that with this patchset, asoc->reconf_enable has no chance yet to
be set, until the patch "sctp: add get and set sockopt for reconf_enable"
is applied in the future. As we can not just enable it when sctp is not
capable of processing reconf chunk yet.

v1->v2:
  - put these into a smaller group.
  - rename some temporary variables in the codes.
  - rename the titles of the commits and improve some changelogs.
v2->v3:
  - re-split the patchset and make sure it has no dead codes for review.

Xin Long (7):
  sctp: add a common helper function to generate stream reconf chunk
  sctp: add support for generating stream reconf ssn reset request chunk
  sctp: add stream reconf timer
  sctp: add stream reconf primitive
  sctp: add reconf_enable in asoc ep and netns
  sctp: add sockopt SCTP_ENABLE_STREAM_RESET
  sctp: implement sender-side procedures for SSN Reset Request Parameter

 include/linux/sctp.h |  32 ++
 include/net/netns/sctp.h |   3 +
 include/net/sctp/constants.h |   4 +-
 include/net/sctp/sctp.h  |   8 +++
 include/net/sctp/sm.h|   8 ++-
 include/net/sctp/structs.h   |  21 ++-
 include/uapi/linux/sctp.h|  18 ++
 net/sctp/associola.c |  12 
 net/sctp/endpointola.c   |   1 +
 net/sctp/outqueue.c  |  33 +++
 net/sctp/primitive.c |   3 +
 net/sctp/protocol.c  |   3 +
 net/sctp/sm_make_chunk.c | 136 +++
 net/sctp/sm_sideeffect.c |  32 ++
 net/sctp/sm_statefuns.c  |  41 +
 net/sctp/sm_statetable.c |  40 +
 net/sctp/socket.c| 113 +++
 net/sctp/stream.c|  79 +
 net/sctp/transport.c |  17 +-
 19 files changed, 588 insertions(+), 16 deletions(-)

-- 
2.1.0



Re: [PATCH][V2] flow dissector: check if arp_eth is null rather than arp

2017-01-13 Thread Eric Dumazet
On Fri, 2017-01-13 at 18:48 +, Colin King wrote:
> From: Colin Ian King 
> 
> arp is being checked instead of arp_eth to see if the call to
> __skb_header_pointer failed. Fix this by checking arp_eth is
> null instead of arp.   Also fix to use length hlen rather than
> hlen - sizeof(_arp); thanks to Eric Dumazet for spotting
> this latter issue.
> 
> CoverityScan CID#1396428 ("Logically dead code") on 2nd
> arp comparison (which should be arp_eth instead).
> 
> Fixes: commit 55733350e5e8b70c5 ("flow disector: ARP support")
> Signed-off-by: Colin Ian King 
> ---

Acked-by: Eric Dumazet 

Thanks !



Re: [PATCH net-next] mii_bus: increase MII_BUS_ID_SIZE to 61

2017-01-13 Thread Florian Fainelli
On 01/13/2017 08:31 AM, Andrew Lunn wrote:
> On Fri, Jan 13, 2017 at 04:19:12PM +0100, Volodymyr Bendiuga wrote:
>> From: Volodymyr Bendiuga 
>>
>> Some bus names are pretty long and do not fit into 20 chars.
>>
>> Signed-off-by: Volodymyr Bendiuga 
>> Signed-off-by: Magnus Öberg 
>> ---
>>  include/linux/phy.h | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/include/linux/phy.h b/include/linux/phy.h
>> index feb8a98..b67f94d 100644
>> --- a/include/linux/phy.h
>> +++ b/include/linux/phy.h
>> @@ -162,7 +162,7 @@ static inline const char *phy_modes(phy_interface_t 
>> interface)
>>   * Need to be a little smaller than phydev->dev.bus_id to leave room
>>   * for the ":%02x"
>>   */
>> -#define MII_BUS_ID_SIZE (20 - 3)
>> +#define MII_BUS_ID_SIZE (64 - 3)
> 
> Hi Volodymyr
> 
> Humm, i assume you looked at the comment? What is the size of 
> phydev->dev.bus_id?
> Is 61 still a little smaller?

Also it seems like you missed this one:

/* A Structure for boards to register fixups with the PHY Lib */
struct phy_fixup {
struct list_head list;
char bus_id[20];
^
u32 phy_uid;
u32 phy_uid_mask;
int (*run)(struct phy_device *phydev);
};

Did you really run into naming conflicts where increasing the bus ID
size became the only solution here?
-- 
Florian


[PATCH][V2] flow dissector: check if arp_eth is null rather than arp

2017-01-13 Thread Colin King
From: Colin Ian King 

arp is being checked instead of arp_eth to see if the call to
__skb_header_pointer failed. Fix this by checking arp_eth is
null instead of arp.   Also fix to use length hlen rather than
hlen - sizeof(_arp); thanks to Eric Dumazet for spotting
this latter issue.

CoverityScan CID#1396428 ("Logically dead code") on 2nd
arp comparison (which should be arp_eth instead).

Fixes: commit 55733350e5e8b70c5 ("flow disector: ARP support")
Signed-off-by: Colin Ian King 
---
 net/core/flow_dissector.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index e3dffc7..c35aae1 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -407,9 +407,9 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
 
arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
   sizeof(_arp_eth), data,
-  hlen - sizeof(_arp),
+  hlen,
   &_arp_eth);
-   if (!arp)
+   if (!arp_eth)
goto out_bad;
 
if (dissector_uses_key(flow_dissector,
-- 
2.10.2



Re: [PATCH] flow disector: check if arp_eth is null rather than arp

2017-01-13 Thread Eric Dumazet
On Fri, 2017-01-13 at 18:25 +, Colin Ian King wrote:
> On 13/01/17 18:24, Eric Dumazet wrote:

> > It looks that we try very hard to add critical bugs in flow dissector.
> > 
> > This is embarrassing really.
> > 
> > I am questioning if the __skb_header_pointer() is correct
> > 
> > Why using hlen - sizeof(_arp) ?
> > 
> >arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
> >   sizeof(_arp_eth), data,
> >   hlen - sizeof(_arp),
> >   &_arp_eth);
> > 
> 
> Yep, the sizeof maybe dubious too, I overlooked that one; if somebody
> can clarify that then I'll send a V2 if it needs fixing up too.

I am pretty sure we should use hlen instead of (hlen - sizeof(_arp))

A V2 would be nice ;)




Re: [PATCH] flow disector: check if arp_eth is null rather than arp

2017-01-13 Thread Eric Dumazet
On Fri, 2017-01-13 at 13:34 +, Colin King wrote:
> From: Colin Ian King 
> 
> arp is being checked instead of arp_eth to see if the call to
> __skb_header_pointer failed. Fix this by checking arp_eth is
> null instead of arp.
> 
> CoverityScan CID#1396428 ("Logically dead code") on 2nd
> arp comparison (which should be arp_eth instead).
> 
> Fixes: commit 55733350e5e8b70c5 ("flow disector: ARP support")
> Signed-off-by: Colin Ian King 
> ---
>  net/core/flow_dissector.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index e3dffc7..fec48e9 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -409,7 +409,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>  sizeof(_arp_eth), data,
>  hlen - sizeof(_arp),
>  &_arp_eth);
> - if (!arp)
> + if (!arp_eth)
>   goto out_bad;
>  
>   if (dissector_uses_key(flow_dissector,

It looks that we try very hard to add critical bugs in flow dissector.

This is embarrassing really.

I am questioning if the __skb_header_pointer() is correct

Why using hlen - sizeof(_arp) ?

   arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
  sizeof(_arp_eth), data,
  hlen - sizeof(_arp),
  &_arp_eth);




Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Cong Wang
On Fri, Jan 13, 2017 at 9:10 AM, David Miller  wrote:
> From: Francois Romieu 
> Date: Fri, 13 Jan 2017 01:07:00 +0100
>
>> Were alloc_skb moved one level up in the call stack, there would be
>> no need to use the new wait api in the subsequent page, thus easing
>> pre 3.19 longterm kernel maintenance (at least those on korg page).
>>
>> But it tastes a tad bit too masochistic.
>
> Lack of error handling of allocation failure is always a huge red
> flag.  We even long ago tried to do something like this for TCP FIN
> handling.
>
> It's dumb, it doesn't work.
>
> Therefore I agree that the correct fix is to move the SKB allocation
> up one level to vcc_sendmsg() and make it handle errors properly.

If you can justify API is not broken by doing that, I am more than happy
to do it, as I already stated in the latter patch:

"Of course, the logic itself is suspicious, other sendmsg()
could handle skb allocation failure very well, not sure
why ATM has to wait for a successful one here. But probably
it is too late to change since the errno and behavior is
visible to user-space. So just leave the logic as it is."

For some reason, no one reads that patch. :-/


Re: [PATCH] flow disector: check if arp_eth is null rather than arp

2017-01-13 Thread Colin Ian King
On 13/01/17 18:24, Eric Dumazet wrote:
> On Fri, 2017-01-13 at 13:34 +, Colin King wrote:
>> From: Colin Ian King 
>>
>> arp is being checked instead of arp_eth to see if the call to
>> __skb_header_pointer failed. Fix this by checking arp_eth is
>> null instead of arp.
>>
>> CoverityScan CID#1396428 ("Logically dead code") on 2nd
>> arp comparison (which should be arp_eth instead).
>>
>> Fixes: commit 55733350e5e8b70c5 ("flow disector: ARP support")
>> Signed-off-by: Colin Ian King 
>> ---
>>  net/core/flow_dissector.c | 2 +-
>>  1 file changed, 1 insertion(+), 1 deletion(-)
>>
>> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
>> index e3dffc7..fec48e9 100644
>> --- a/net/core/flow_dissector.c
>> +++ b/net/core/flow_dissector.c
>> @@ -409,7 +409,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
>> sizeof(_arp_eth), data,
>> hlen - sizeof(_arp),
>> &_arp_eth);
>> -if (!arp)
>> +if (!arp_eth)
>>  goto out_bad;
>>  
>>  if (dissector_uses_key(flow_dissector,
> 
> It looks that we try very hard to add critical bugs in flow dissector.
> 
> This is embarrassing really.
> 
> I am questioning if the __skb_header_pointer() is correct
> 
> Why using hlen - sizeof(_arp) ?
> 
>arp_eth = __skb_header_pointer(skb, nhoff + sizeof(_arp),
>   sizeof(_arp_eth), data,
>   hlen - sizeof(_arp),
>   &_arp_eth);
> 

Yep, the sizeof maybe dubious too, I overlooked that one; if somebody
can clarify that then I'll send a V2 if it needs fixing up too.

Colin



Re: [Patch net] atm: remove an unnecessary loop

2017-01-13 Thread Cong Wang
On Fri, Jan 13, 2017 at 5:23 AM, Francois Romieu  wrote:
> Cong Wang  :
> [...]
>> alloc_skb(GFP_KERNEL) itself is sleeping, so the new wait api is still
>> needed.
>
> The task state change warning is the symptom.
>
> The deeply nested alloc_skb is the problem.
>
> Diagnosis: nesting is wrong. It makes zero sense. Fix it and the
> implicit task state change problem automagically goes away.
>
> alloc_skb() does not need to be in the "while" loop.

This is exactly what I describe in my changelog, don't know
why you want to repeat it...


>
> alloc_skb() does not need to be in the {prepare_to_wait/add_wait_queue ...
> finish_wait/remove_wait_queue} block.
>

If you ever read the followup patch of this one, you will find:

"
Of course, the logic itself is suspicious, other sendmsg()
could handle skb allocation failure very well, not sure
why ATM has to wait for a successful one here. But probably
it is too late to change since the errno and behavior is
visible to user-space. So just leave the logic as it is.
"


> alloc_tx() is not correctly named: given its original content, it deserves
> to be called something like:

Please don't expect me to fix many things in one patch, let's
fix each of them separately, agreed?

>
> "wait_for_decent_tx_drain_and_alloc_by_hand_coz_i_dont_trust_the_mm_subsystem_and_i_dont_know_what_i_want"
>
> I claim that:
> - alloc_tx() should only perform the "wait_for_decent_tx_drain" part
> - alloc_skb() ought to be done directly in vcc_sendmsg
> - alloc_skb() failure can be handled gracefully in vcc_sendmsg
> - alloc_skb() may use a (m->msg_flags & MSG_DONTWAIT) dependant
>   GFP_{KERNEL / ATOMIC} flag
> - most of it can be done in a longterm maintenance pain minimizing
>   way. Call it a side-effect: I don't claim that it *must* be done
>   this way.

Never disagree, but again, please ensure there is no API brokeness
as I mentioned in the followup patch which you missed. Apparently
my ATM knowledge is not enough to justify the API/ABI.

Thanks.


Re: [RFC PATCH] tcp: accept RST for rcv_nxt - 1 after receiving a FIN

2017-01-13 Thread Eric Dumazet
On Fri, 2017-01-13 at 12:28 -0500, Jason Baron wrote:
> i,
> 
> (Re-sending - seems like my reply was lost)
> 
> I wanted to define this condition as narrowly as I could. I'm ok
> dropping it -
> I'm not sure its going to make much difference in practice. So to that end,
> dropping this extra check makes sense.
> 
> I posted this as RFC because RFC 5961, I don't think says anything about
> accepting rcv_nxt - 1 in this case, so I was wondering what people
> thought...

This seems a reasonable trade-off to me

( to the rescue : RFC 1122 1.2.2 )






[PATCH net-next 2/2] mpls: Packet stats

2017-01-13 Thread Robert Shearman
Having MPLS packet stats is useful for observing network operation and
for diagnosing network problems. In the absence of anything better,
RFC2863 and RFC3813 are used for guidance for which stats to expose
and the semantics of them. In particular rx_noroutes maps to in
unknown protos in RFC2863. The stats are exposed to userspace via
AF_MPLS attributes embedded in the IFLA_STATS_AF_SPEC attribute of
RTM_GETSTATS messages.

All the introduced fields are 64-bit, even error ones, to ensure no
overflow with long uptimes. Per-CPU counters are used to avoid
cache-line contention on the commonly used fields. The other fields
have also been made per-CPU for code to avoid performance problems in
error conditions on the assumption that on some platforms the cost of
atomic operations could be more expensive than sending the packet
(which is what would be done in the success case). If that's not the
case, we could instead not use per-CPU counters for these fields.

Only unicast and non-fragment are exposed at the moment, but other
counters can be exposed in the future either by adding to the end of
struct mpls_link_stats or by additional netlink attributes in the
AF_MPLS IFLA_STATS_AF_SPEC nested attribute.

Signed-off-by: Robert Shearman 
---
 include/uapi/linux/mpls.h |  30 
 net/mpls/af_mpls.c| 179 --
 net/mpls/internal.h   |  58 ++-
 net/mpls/mpls_iptunnel.c  |  11 ++-
 4 files changed, 250 insertions(+), 28 deletions(-)

diff --git a/include/uapi/linux/mpls.h b/include/uapi/linux/mpls.h
index 24a6cb1aec86..77a19dfe3990 100644
--- a/include/uapi/linux/mpls.h
+++ b/include/uapi/linux/mpls.h
@@ -43,4 +43,34 @@ struct mpls_label {
 
 #define MPLS_LABEL_FIRST_UNRESERVED16 /* RFC3032 */
 
+/* These are embedded into IFLA_STATS_AF_SPEC:
+ * [IFLA_STATS_AF_SPEC]
+ * -> [AF_MPLS]
+ *-> [MPLS_STATS_xxx]
+ *
+ * Attributes:
+ * [MPLS_STATS_LINK] = {
+ * struct mpls_link_stats
+ * }
+ */
+enum {
+   MPLS_STATS_UNSPEC, /* also used as 64bit pad attribute */
+   MPLS_STATS_LINK,
+   __MPLS_STATS_MAX,
+};
+
+#define MPLS_STATS_MAX (__MPLS_STATS_MAX - 1)
+
+struct mpls_link_stats {
+   __u64   rx_packets; /* total packets received   */
+   __u64   tx_packets; /* total packets transmitted*/
+   __u64   rx_bytes;   /* total bytes received */
+   __u64   tx_bytes;   /* total bytes transmitted  */
+   __u64   rx_errors;  /* bad packets received */
+   __u64   tx_errors;  /* packet transmit problems */
+   __u64   rx_dropped; /* packet dropped on receive*/
+   __u64   tx_dropped; /* packet dropped on transmit   */
+   __u64   rx_noroute; /* no route for packet dest */
+};
+
 #endif /* _UAPI_MPLS_H */
diff --git a/net/mpls/af_mpls.c b/net/mpls/af_mpls.c
index 15fe97644ffe..fb20941cdda2 100644
--- a/net/mpls/af_mpls.c
+++ b/net/mpls/af_mpls.c
@@ -8,6 +8,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -17,8 +18,8 @@
 #include 
 #if IS_ENABLED(CONFIG_IPV6)
 #include 
-#include 
 #endif
+#include 
 #include 
 #include "internal.h"
 
@@ -48,11 +49,6 @@ static struct mpls_route *mpls_route_input_rcu(struct net 
*net, unsigned index)
return rt;
 }
 
-static inline struct mpls_dev *mpls_dev_get(const struct net_device *dev)
-{
-   return rcu_dereference_rtnl(dev->mpls_ptr);
-}
-
 bool mpls_output_possible(const struct net_device *dev)
 {
return dev && (dev->flags & IFF_UP) && netif_carrier_ok(dev);
@@ -98,6 +94,29 @@ bool mpls_pkt_too_big(const struct sk_buff *skb, unsigned 
int mtu)
 }
 EXPORT_SYMBOL_GPL(mpls_pkt_too_big);
 
+void mpls_stats_inc_outucastpkts(struct net_device *dev,
+const struct sk_buff *skb)
+{
+   struct mpls_dev *mdev;
+   struct inet6_dev *in6dev;
+
+   if (skb->protocol == htons(ETH_P_MPLS_UC)) {
+   mdev = mpls_dev_get(dev);
+   if (mdev)
+   MPLS_INC_STATS_LEN(mdev, skb->len,
+  tx_packets,
+  tx_bytes);
+   } else if (skb->protocol == htons(ETH_P_IP)) {
+   IP_UPD_PO_STATS(dev_net(dev), IPSTATS_MIB_OUT, skb->len);
+   } else if (skb->protocol == htons(ETH_P_IPV6)) {
+   in6dev = __in6_dev_get(dev);
+   if (in6dev)
+   IP6_UPD_PO_STATS(dev_net(dev), in6dev,
+IPSTATS_MIB_OUT, skb->len);
+   }
+}
+EXPORT_SYMBOL_GPL(mpls_stats_inc_outucastpkts);
+
 static u32 mpls_multipath_hash(struct mpls_route *rt,
   struct sk_buff *skb, bool bos)
 {
@@ -253,6 +272,7 @@ static int mpls_forward(struct sk_buff *skb, struct 
net_device *dev,
struct mpls_nh *nh;

[PATCH net-next 1/2] net: AF-specific RTM_GETSTATS attributes

2017-01-13 Thread Robert Shearman
Add the functionality for including address-family-specific per-link
stats in RTM_GETSTATS messages. This is done through adding a new
IFLA_STATS_AF_SPEC attribute under which address family attributes are
nested and then the AF-specific attributes can be further nested. This
follows the model of IFLA_AF_SPEC on RTM_*LINK messages and it has the
advantage of presenting an easily extended hierarchy. The rtnl_af_ops
structure is extended to provide AFs with the opportunity to fill and
provide the size of their stats attributes.

One alternative would have been to provide AFs with the ability to add
attributes directly into the RTM_GETSTATS message without a nested
hierarchy. I discounted this approach as it increases the rate at
which the 32 attribute number space is used up and it makes
implementation a little more tricky for stats dump resuming (at the
moment the order in which attributes are added to the message has to
match the numeric order of the attributes).

Another alternative would have been to register per-AF RTM_GETSTATS
handlers. I discounted this approach as I perceived a common use-case
to be getting all the stats for an interface and this approach would
necessitate multiple requests/dumps to retrieve them all.

Signed-off-by: Robert Shearman 
---
 include/net/rtnetlink.h  |  4 
 include/uapi/linux/if_link.h |  1 +
 net/core/rtnetlink.c | 50 
 3 files changed, 55 insertions(+)

diff --git a/include/net/rtnetlink.h b/include/net/rtnetlink.h
index 4113916cc1bb..106de5f7bf06 100644
--- a/include/net/rtnetlink.h
+++ b/include/net/rtnetlink.h
@@ -139,6 +139,10 @@ struct rtnl_af_ops {
const struct nlattr *attr);
int (*set_link_af)(struct net_device *dev,
   const struct nlattr *attr);
+
+   int (*fill_stats_af)(struct sk_buff *skb,
+const struct net_device *dev);
+   size_t  (*get_stats_af_size)(const struct net_device 
*dev);
 };
 
 void __rtnl_af_unregister(struct rtnl_af_ops *ops);
diff --git a/include/uapi/linux/if_link.h b/include/uapi/linux/if_link.h
index 6b13e591abc9..184b16ed2b84 100644
--- a/include/uapi/linux/if_link.h
+++ b/include/uapi/linux/if_link.h
@@ -847,6 +847,7 @@ enum {
IFLA_STATS_LINK_XSTATS,
IFLA_STATS_LINK_XSTATS_SLAVE,
IFLA_STATS_LINK_OFFLOAD_XSTATS,
+   IFLA_STATS_AF_SPEC,
__IFLA_STATS_MAX,
 };
 
diff --git a/net/core/rtnetlink.c b/net/core/rtnetlink.c
index 18b5aae99bec..4edc1bd7a735 100644
--- a/net/core/rtnetlink.c
+++ b/net/core/rtnetlink.c
@@ -3829,6 +3829,39 @@ static int rtnl_fill_statsinfo(struct sk_buff *skb, 
struct net_device *dev,
*idxattr = 0;
}
 
+   if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, *idxattr)) {
+   struct rtnl_af_ops *af_ops;
+
+   *idxattr = IFLA_STATS_AF_SPEC;
+   attr = nla_nest_start(skb, IFLA_STATS_AF_SPEC);
+   if (!attr)
+   goto nla_put_failure;
+
+   list_for_each_entry(af_ops, _af_ops, list) {
+   if (af_ops->fill_stats_af) {
+   struct nlattr *af;
+   int err;
+
+   af = nla_nest_start(skb, af_ops->family);
+   if (!af)
+   goto nla_put_failure;
+
+   err = af_ops->fill_stats_af(skb, dev);
+
+   if (err == -ENODATA)
+   nla_nest_cancel(skb, af);
+   else if (err < 0)
+   goto nla_put_failure;
+
+   nla_nest_end(skb, af);
+   }
+   }
+
+   nla_nest_end(skb, attr);
+
+   *idxattr = 0;
+   }
+
nlmsg_end(skb, nlh);
 
return 0;
@@ -3885,6 +3918,23 @@ static size_t if_nlmsg_stats_size(const struct 
net_device *dev,
if (stats_attr_valid(filter_mask, IFLA_STATS_LINK_OFFLOAD_XSTATS, 0))
size += rtnl_get_offload_stats_size(dev);
 
+   if (stats_attr_valid(filter_mask, IFLA_STATS_AF_SPEC, 0)) {
+   struct rtnl_af_ops *af_ops;
+
+   /* for IFLA_STATS_AF_SPEC */
+   size += nla_total_size(0);
+
+   list_for_each_entry(af_ops, _af_ops, list) {
+   if (af_ops->get_stats_af_size) {
+   size += nla_total_size(
+   af_ops->get_stats_af_size(dev));
+
+   /* for AF_* */
+   size += nla_total_size(0);
+   }
+   }
+   }
+
return size;
 }
 

[PATCH net-next 0/2] mpls: Packet stats

2017-01-13 Thread Robert Shearman
This patchset records per-interface packet stats in the MPLS
forwarding path and exports them using a nest of attributes root at a
new IFLA_STATS_AF_SPEC attribute as part of RTM_GETSTATS messages:

[IFLA_STATS_AF_SPEC]
 -> [AF_MPLS]
  -> [MPLS_STATS_LINK]
   -> struct mpls_link_stats

The first patch adds the rtnl infrastructure for this, including a new
callbacks to per-AF ops of fill_stats_af and get_stats_af_size. The
second patch records MPLS stats and makes use of the infrastructure to
export them. The rtnl infrastructure could also be used to export IPv6
stats in the future.

Robert Shearman (2):
  net: AF-specific RTM_GETSTATS attributes
  mpls: Packet stats

 include/net/rtnetlink.h  |   4 +
 include/uapi/linux/if_link.h |   1 +
 include/uapi/linux/mpls.h|  30 
 net/core/rtnetlink.c |  50 
 net/mpls/af_mpls.c   | 179 +--
 net/mpls/internal.h  |  58 +-
 net/mpls/mpls_iptunnel.c |  11 ++-
 7 files changed, 305 insertions(+), 28 deletions(-)

-- 
2.1.4



  1   2   3   >