date:20170821

Re: Something hitting my total number of connections to the server

2017-08-21 Thread Akshat Kakkar

On Tue, Aug 22, 2017 at 11:12 AM, Akshat Kakkar  wrote:
> There are multiple hosts/clients. All are mainly windows based.
>
> Timestamps are not used, as my clients are mainly Windows based, and on
> Windows the TCP timestamp option is disabled by default.
>
> sysctl is as follows:
>
> kernel.shmmax = 68719476736
> kernel.shmall = 4294967296
> kernel.pid_max=4194303
> vm.max_map_count=131072
> kernel.sem=250 32000 32 250
>
> net.netfilter.nf_conntrack_generic_timeout = 300
> net.netfilter.nf_conntrack_tcp_timeout_syn_sent = 60
> net.netfilter.nf_conntrack_tcp_timeout_syn_recv = 30
> net.netfilter.nf_conntrack_tcp_timeout_established = 7200
> net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
> net.netfilter.nf_conntrack_tcp_timeout_close_wait = 30
> net.netfilter.nf_conntrack_tcp_timeout_last_ack = 30
> net.netfilter.nf_conntrack_tcp_timeout_time_wait = 60
> net.netfilter.nf_conntrack_tcp_timeout_close = 10
> net.netfilter.nf_conntrack_tcp_timeout_max_retrans = 300
> net.netfilter.nf_conntrack_tcp_timeout_unacknowledged = 300
> net.netfilter.nf_conntrack_udp_timeout = 30
> net.netfilter.nf_conntrack_udp_timeout_stream = 180
> net.netfilter.nf_conntrack_icmp_timeout = 30
> net.netfilter.nf_conntrack_events_retry_timeout = 15
> net.core.rmem_max = 8388608
> net.core.wmem_max = 8388608
>
> net.ipv4.tcp_tw_reuse=1
> net.ipv4.tcp_tw_recycle=1
> net.ipv4.tcp_fin_timeout=30
> net.ipv4.tcp_keepalive_time=1800
> net.ipv4.tcp_keepalive_intvl=60
> net.ipv4.tcp_keepalive_probes=20
> net.ipv4.tcp_max_syn_backlog=4096
> net.ipv4.tcp_syncookies=1
> net.ipv4.tcp_sack=1
> net.ipv4.tcp_dsack=1
> net.ipv4.tcp_window_scaling=1
> net.ipv4.tcp_syn_retries=3
> net.ipv4.tcp_synack_retries=3
> net.ipv4.tcp_retries1=3
> net.ipv4.tcp_retries2=15
> net.ipv4.ip_local_port_range=1024 65535
>
> net.ipv4.tcp_timestamps=0
>
> net.core.netdev_max_backlog=1
> net.core.somaxconn=10
> net.core.optmem_max=81920
>
> net.netfilter.nf_conntrack_max=524288
> net.nf_conntrack_max=524288
> net.ipv6.conf.all.disable_ipv6 = 1
> fs.file-max=100
>
> net.ipv4.tcp_no_metrics_save = 1
> net.ipv4.tcp_max_syn_backlog = 10240
> net.ipv4.tcp_congestion_control=htcp
>
> net.ipv4.tcp_rfc1337 = 1
> net.core.netdev_max_backlog = 65536
> net.ipv4.tcp_max_tw_buckets = 144
>
> net.core.rmem_max = 134217728
> net.core.wmem_max = 134217728
>
>
>
>
> On Mon, Aug 21, 2017 at 11:14 PM, Eric Dumazet  wrote:
>> On Mon, 2017-08-21 at 10:44 -0700, Eric Dumazet wrote:
>>
>>> - Why is timewait not being used ?
>>>
>>
>> s/timewait/timestamps/
>>
>>
>>
[Apologies for top post.]


There are multiple hosts/clients. All are mainly windows based.

Timestamps are not used, as my clients are mainly Windows based, and on
Windows the TCP timestamp option is disabled by default.

sysctl is as follows:

kernel.shmmax = 68719476736
kernel.shmall = 4294967296
kernel.pid_max=4194303
vm.max_map_count=131072
kernel.sem=250 32000 32 250

net.netfilter.nf_conntrack_generic_timeout = 300
net.netfilter.nf_conntrack_tcp_timeout_syn_sent = 60
net.netfilter.nf_conntrack_tcp_timeout_syn_recv = 30
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 30
net.netfilter.nf_conntrack_tcp_timeout_last_ack = 30
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_close = 10
net.netfilter.nf_conntrack_tcp_timeout_max_retrans = 300
net.netfilter.nf_conntrack_tcp_timeout_unacknowledged = 300
net.netfilter.nf_conntrack_udp_timeout = 30
net.netfilter.nf_conntrack_udp_timeout_stream = 180
net.netfilter.nf_conntrack_icmp_timeout = 30
net.netfilter.nf_conntrack_events_retry_timeout = 15
net.core.rmem_max = 8388608
net.core.wmem_max = 8388608

net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=1
net.ipv4.tcp_fin_timeout=30
net.ipv4.tcp_keepalive_time=1800
net.ipv4.tcp_keepalive_intvl=60
net.ipv4.tcp_keepalive_probes=20
net.ipv4.tcp_max_syn_backlog=4096
net.ipv4.tcp_syncookies=1
net.ipv4.tcp_sack=1
net.ipv4.tcp_dsack=1
net.ipv4.tcp_window_scaling=1
net.ipv4.tcp_syn_retries=3
net.ipv4.tcp_synack_retries=3
net.ipv4.tcp_retries1=3
net.ipv4.tcp_retries2=15
net.ipv4.ip_local_port_range=1024 65535

net.ipv4.tcp_timestamps=0

net.core.netdev_max_backlog=1
net.core.somaxconn=10
net.core.optmem_max=81920

net.netfilter.nf_conntrack_max=524288
net.nf_conntrack_max=524288
net.ipv6.conf.all.disable_ipv6 = 1
fs.file-max=100

net.ipv4.tcp_no_metrics_save = 1
net.ipv4.tcp_max_syn_backlog = 10240
net.ipv4.tcp_congestion_control=htcp

net.ipv4.tcp_rfc1337 = 1
net.core.netdev_max_backlog = 65536
net.ipv4.tcp_max_tw_buckets = 144

net.core.rmem_max = 134217728
net.core.wmem_max = 134217728

Re: Something hitting my total number of connections to the server

2017-08-21 Thread Akshat Kakkar

There are multiple hosts/clients. All are mainly windows based.

Timestamps are not used, as my clients are mainly Windows based, and on
Windows the TCP timestamp option is disabled by default.

sysctl is as follows:

kernel.shmmax = 68719476736
kernel.shmall = 4294967296
kernel.pid_max=4194303
vm.max_map_count=131072
kernel.sem=250 32000 32 250

net.netfilter.nf_conntrack_generic_timeout = 300
net.netfilter.nf_conntrack_tcp_timeout_syn_sent = 60
net.netfilter.nf_conntrack_tcp_timeout_syn_recv = 30
net.netfilter.nf_conntrack_tcp_timeout_established = 7200
net.netfilter.nf_conntrack_tcp_timeout_fin_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_close_wait = 30
net.netfilter.nf_conntrack_tcp_timeout_last_ack = 30
net.netfilter.nf_conntrack_tcp_timeout_time_wait = 60
net.netfilter.nf_conntrack_tcp_timeout_close = 10
net.netfilter.nf_conntrack_tcp_timeout_max_retrans = 300
net.netfilter.nf_conntrack_tcp_timeout_unacknowledged = 300
net.netfilter.nf_conntrack_udp_timeout = 30
net.netfilter.nf_conntrack_udp_timeout_stream = 180
net.netfilter.nf_conntrack_icmp_timeout = 30
net.netfilter.nf_conntrack_events_retry_timeout = 15
net.core.rmem_max = 8388608
net.core.wmem_max = 8388608

net.ipv4.tcp_tw_reuse=1
net.ipv4.tcp_tw_recycle=1
net.ipv4.tcp_fin_timeout=30
net.ipv4.tcp_keepalive_time=1800
net.ipv4.tcp_keepalive_intvl=60
net.ipv4.tcp_keepalive_probes=20
net.ipv4.tcp_max_syn_backlog=4096
net.ipv4.tcp_syncookies=1
net.ipv4.tcp_sack=1
net.ipv4.tcp_dsack=1
net.ipv4.tcp_window_scaling=1
net.ipv4.tcp_syn_retries=3
net.ipv4.tcp_synack_retries=3
net.ipv4.tcp_retries1=3
net.ipv4.tcp_retries2=15
net.ipv4.ip_local_port_range=1024 65535

net.ipv4.tcp_timestamps=0

net.core.netdev_max_backlog=1
net.core.somaxconn=10
net.core.optmem_max=81920

net.netfilter.nf_conntrack_max=524288
net.nf_conntrack_max=524288
net.ipv6.conf.all.disable_ipv6 = 1
fs.file-max=100

net.ipv4.tcp_no_metrics_save = 1
net.ipv4.tcp_max_syn_backlog = 10240
net.ipv4.tcp_congestion_control=htcp

net.ipv4.tcp_rfc1337 = 1
net.core.netdev_max_backlog = 65536
net.ipv4.tcp_max_tw_buckets = 144

net.core.rmem_max = 134217728
net.core.wmem_max = 134217728




On Mon, Aug 21, 2017 at 11:14 PM, Eric Dumazet  wrote:
> On Mon, 2017-08-21 at 10:44 -0700, Eric Dumazet wrote:
>
>> - Why is timewait not being used ?
>>
>
> s/timewait/timestamps/
>
>
>

Re: [PATCH net-next 3/3 v5] drivers: net: ethernet: qualcomm: rmnet: Initial implementation

2017-08-21 Thread Subash Abhinov Kasiviswanathan


+   priv = netdev_priv(rmnet_dev);
+   memset(priv, 0, sizeof(struct rmnet_priv));


Netdev private area is always zero on creation. See alloc_netdev().


+   if (!rmnet_dev)
+   return 0;


Do not confuse 0 with NULL. Did you run sparse?


+
+   priv = netdev_priv(rmnet_dev);
+   if (!priv)
+   return 0;



netdev_priv() always returns a non-NULL value.  The private area is 
just

a constant offset below the original network_device structure.


Hi Stephen

I haven't used sparse till now. I'll fix the errors reported by it as well
as your other comments and post again.


--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a 
Linux Foundation Collaborative Project

Re: skb_over_panic when sending esp traffic from a vmware guest

2017-08-21 Thread Steffen Klassert

On Thu, Aug 17, 2017 at 07:19:03PM +0000, Nick Huber wrote:
> I've been experiencing the following traceback since upgrading from the 4.9 
> kernel to the 4.11 branch. I've only seen this in a few VMWare guests and I 
> haven't been able to narrow down what exactly is causing it. I'm not familiar 
> with kernel debugging but any help in tracing this down would be appreciated.

Can you give some details about your setup?

From the backtrace I see that you use GRE tunnels.
How is the setup of the GRE tunnel and the SAs?
How is routing done? In particular I'm interested
if you use routing by netfilter or socket marks.

Re: [PATCH net-next 1/3 v7] net: ether: Add support for multiplexing and aggregation type

2017-08-21 Thread Subash Abhinov Kasiviswanathan

+#define ETH_P_MAP   0xDA1A  /* Multiplexing and 
Aggregation Protocol

+*  NOT AN OFFICIALLY REGISTERED ID ]
+*/


Hi Subash

This list is sorted. So this entry should go earlier.


+   netdev_info(real_dev, "Removed from rmnet\n");


I would probably turn all your netdev_info()s into netdev_dbg()s.  You
seem to be spamming the kernel log quite a bit.



Hi Andrew

I'll make these changes and post v8.

--
Qualcomm Innovation Center, Inc.
The Qualcomm Innovation Center, Inc. is a member of Code Aurora Forum, a 
Linux Foundation Collaborative Project

Re: [RFC PATCH] dt-binding: net: sfp binding documentation

2017-08-21 Thread Baruch Siach

Hi Rob,

On Mon, Aug 21, 2017 at 02:10:33PM -0500, Rob Herring wrote:
> On Sun, Aug 20, 2017 at 5:28 AM, Baruch Siach  wrote:
> > Add device-tree binding documentation SFP transceivers. Support for SFP
> > transceivers has been recently introduced (drivers/net/phy/sfp.c).
> >
> > Signed-off-by: Baruch Siach 
> > ---
> >
> > The SFP driver is on net-next.
> >
> > Not sure about the rate-select-gpio property name. The SFP+ standard
> > (not supported yet) uses two signals, RS0 and RS1. RS0 is compatible
> > with the SFP rate select signal, while RS1 controls the Tx rate.
> > ---
> >  Documentation/devicetree/bindings/net/sff-sfp.txt | 24 
> > +++
> >  1 file changed, 24 insertions(+)
> >  create mode 100644 Documentation/devicetree/bindings/net/sff-sfp.txt
> >
> > diff --git a/Documentation/devicetree/bindings/net/sff-sfp.txt 
> > b/Documentation/devicetree/bindings/net/sff-sfp.txt
> > new file mode 100644
> > index 000000000000..f0c27bc3925e
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/net/sff-sfp.txt
> > @@ -0,0 +1,24 @@
> > +Small Form Factor (SFF) Committee Small Form-factor Pluggable (SFP)
> > +Transceiver
> > +
> > +Required properties:
> > +
> > +- compatible : must be "sff,sfp"
> 
> Need to document "sff" vendor prefix.

"sff" stands for Small Form Factor Committee, now under Storage Networking 
Industry Association (SNIA). Not really a vendor, but a standards body. Does 
that count? I could not find any other example in vendor-prefixes.txt, other 
than "linux" which is kind of special.

> Kind of a short name, but I guess it is sufficient. Are there
> revisions of the standard (not SFP+) or more than one form factor (I
> don't recall any)?

I'm not aware of any other revisions.

baruch

-- 
 http://baruch.siach.name/blog/  ~. .~   Tk Open Systems
=}ooO--U--Ooo{=
   - bar...@tkos.co.il - tel: +972.2.679.5364, http://www.tkos.co.il -

Re: [RFC net-next v2] bridge lwtunnel, VPLS & NVGRE

2017-08-21 Thread Roopa Prabhu

On Mon, Aug 21, 2017 at 10:15 AM, David Lamparter  wrote:
> Hi all,
>
>
> this is an update on the earlier "[RFC net-next] VPLS support".  Note
> I've changed the subject lines on some of the patches to better reflect
> what they really do (tbh the earlier subject lines were crap.)
>
> As previously, iproute2 / FRR patches are at:
> - https://github.com/eqvinox/vpls-iproute2
> - https://github.com/opensourcerouting/frr/commits/vpls
> while this patchset is also available at:
> - https://github.com/eqvinox/vpls-linux-kernel
> (but please be aware that I'm amending and rebasing commits)
>
> The NVGRE implementation in the 3rd patch in this series is actually an
> accident - I was just wiring up gretap as a reference;  only after I was
> done I noticed that that sums up to NVGRE, more or less.  IMHO, it does
> serve well to demonstrate the bridge changes are not VPLS-specific.
>
> To refer some notes from the first announce mail:
>> I've tested some basic setups, the chain from LDP down into the kernel
>> works at least in these.  FRR has some testcases around from OpenBSD
>> VPLS support, I haven't wired that up to run against Linux / this
>> patchset yet.
>
> Same as before (API didn't change).
>
>> The patchset needs a lot of polishing (yes I left my TODO notes in the
>> commit messages), for now my primary concern is overall design
>> feedback.  Roopa has already provided a lot of input (Thanks!);  the
>> major topic I'm expecting to get discussion on is the bridge FDB
>> changes.
>
> Got some useful input;  but still need feedback on the bridge FDB
> changes (first 2 patches).  I don't believe it to have a significant
> impact on existing bridge operation, and I believe a multipoint tunnel
> driver without its own FDB (e.g. NVGRE in this set) should perform
> better than one with its own FDB (e.g. existing VXLAN).
>
>> P.S.: For a little context on the bridge FDB changes - I'm hoping to
>> find some time to extend this to the MDB to allow aggregating dst
>> metadata and handing down a list of dst metas on TX.  This isn't
>> specifically for VPLS but rather to give sufficient information to the
>> 802.11 stack to allow it to optimize selecting rates (or unicasting)
>> for multicast traffic by having the multicast subscriber list known.
>> This is done by major commercial wifi solutions (e.g. google "dynamic
>> multicast optimization".)
>
> You can find hacks at this on:
> https://github.com/eqvinox/vpls-linux-kernel/tree/mdb-hack
> Please note that the patches in that branch are not at an acceptable
> quality level, but you can see the semantic relation to 802.11.
>
> I would, however, like to point out that this branch has pseudo-working
> IGMP/MLD snooping for VPLS, and it'd be 20-ish lines to add it to NVGRE
> (I'll do that as soon as I get to it, it'll pop up on that branch too.)
>
> This is relevant to the discussion because it's a feature which is
> non-obvious (to me) on how to do with the VXLAN model of having an
> entirely separate FDB.  Meanwhile, with this architecture, the proof of
> concept / hack is coming in at a measly cost of:
> 8 files changed, 176 insertions(+), 15 deletions(-)

David, what is special about the vpls igmp/mld snooping code ?...do
you have to snoop vpls attrs ?.
in the vxlan model.., the vxlan driver can snoop its own attrs eg
vxlan-id, remote dst etc.
and the pkt is passed up to the bridge where it will hit the normal
bridge igmp/mld snooping code.
can you pls elaborate ?

keeping vpls specific code and api in a separate vpls driver allows
for cleanly extending it in the future.

[PATCH v2 1/4] net: stmmac: dwmac-sun8i: support RGMII modes with PHY internal delay

2017-08-21 Thread Icenowy Zheng

Some boards use a PHY with internal delay with an Allwinner SoC.

Support these PHY modes in the driver.

As the driver has no configuration registers for these modes, just treat
them as ordinary RGMII.

Signed-off-by: Icenowy Zheng 
---
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
index fffd6d5fc907..2af680cac497 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c
@@ -723,6 +723,9 @@ static int sun8i_dwmac_set_syscon(struct stmmac_priv *priv)
/* default */
break;
case PHY_INTERFACE_MODE_RGMII:
+   case PHY_INTERFACE_MODE_RGMII_ID:
+   case PHY_INTERFACE_MODE_RGMII_RXID:
+   case PHY_INTERFACE_MODE_RGMII_TXID:
reg |= SYSCON_EPIT | SYSCON_ETCS_INT_GMII;
break;
case PHY_INTERFACE_MODE_RMII:
-- 
2.13.5

[PATCH v2 0/4] Workaround broken RTL8211E on some Pine64+ boards

2017-08-21 Thread Icenowy Zheng

Some Pine64+ boards come with bad RTL8211E PHYs, which cannot work reliably
unless a hack is applied. According to Pine64 people, Realtek describes the hack
as totally disabling RX delay, and it's not documented at all.

This patchset introduces the workaround on Pine64+.

The first patch adds RGMII variants' support to the dwmac-sun8i driver.

The second patch renames some macros in RTL PHY driver, and the third
patch introduces the hack as the "RGMII-TXID" mode of the PHY.

The fourth patch enables the hack in the device tree.

Icenowy Zheng (4):
  net: stmmac: dwmac-sun8i: support RGMII modes with PHY internal delay
  net: phy: realtek: change macro name for page select register
  net: phy: realtek: add disable RX internal delay mode
  arm64: allwinner: a64: disable the RTL8211E internal RX delay on
Pine64+

 .../boot/dts/allwinner/sun50i-a64-pine64-plus.dts  |  2 +-
 drivers/net/ethernet/stmicro/stmmac/dwmac-sun8i.c  |  3 ++
 drivers/net/phy/realtek.c  | 43 +++---
 3 files changed, 42 insertions(+), 6 deletions(-)

-- 
2.13.5

[PATCH v2 2/4] net: phy: realtek: change macro name for page select register

2017-08-21 Thread Icenowy Zheng

From: Icenowy Zheng 

The page select register also exists on RTL8211E PHY (although it
behaves slightly differently).

Change the register macro name to remove the F.

Signed-off-by: Icenowy Zheng 
---
 drivers/net/phy/realtek.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index 9cbe645e3d89..d820d00addf6 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -22,11 +22,13 @@
 #define RTL821x_INER   0x12
 #define RTL821x_INER_INIT  0x6400
 #define RTL821x_INSR   0x13
+
+#define RTL8211_PAGE_SELECT0x1f
+
 #define RTL8211E_INER_LINK_STATUS 0x400
 
 #define RTL8211F_INER_LINK_STATUS 0x0010
 #define RTL8211F_INSR  0x1d
-#define RTL8211F_PAGE_SELECT   0x1f
 #define RTL8211F_TX_DELAY  0x100
 
 MODULE_DESCRIPTION("Realtek PHY driver");
@@ -46,10 +48,10 @@ static int rtl8211f_ack_interrupt(struct phy_device *phydev)
 {
int err;
 
-   phy_write(phydev, RTL8211F_PAGE_SELECT, 0xa43);
+   phy_write(phydev, RTL8211_PAGE_SELECT, 0xa43);
err = phy_read(phydev, RTL8211F_INSR);
/* restore to default page 0 */
-   phy_write(phydev, RTL8211F_PAGE_SELECT, 0x0);
+   phy_write(phydev, RTL8211_PAGE_SELECT, 0x0);
 
return (err < 0) ? err : 0;
 }
@@ -102,7 +104,7 @@ static int rtl8211f_config_init(struct phy_device *phydev)
if (ret < 0)
return ret;
 
-   phy_write(phydev, RTL8211F_PAGE_SELECT, 0xd08);
+   phy_write(phydev, RTL8211_PAGE_SELECT, 0xd08);
reg = phy_read(phydev, 0x11);
 
/* enable TX-delay for rgmii-id and rgmii-txid, otherwise disable it */
@@ -114,7 +116,7 @@ static int rtl8211f_config_init(struct phy_device *phydev)
 
phy_write(phydev, 0x11, reg);
/* restore to default page 0 */
-   phy_write(phydev, RTL8211F_PAGE_SELECT, 0x0);
+   phy_write(phydev, RTL8211_PAGE_SELECT, 0x0);
 
return 0;
 }
-- 
2.13.5

Re: [PATCHv3 iproute2 net-next] gre: add support for ERSPAN tunnel

2017-08-21 Thread Stephen Hemminger

On Fri, 18 Aug 2017 05:54:08 -0700
William Tu  wrote:

> The patch adds ERSPAN type II tunnel support.  The implementation
> is based on the draft at https://tools.ietf.org/html/draft-foschiano-erspan-01
> One of the purposes is for Linux box to be able to receive ERSPAN
> monitoring traffic sent from the Cisco switch, by creating a ERSPAN
> tunnel device.  In addition, the patch also adds ERSPAN TX, so traffic
> can also be encapsulated into ERSPAN and sent out.
> 
> The implementation reuses the key as ERSPAN session ID, and
> field 'erspan' as ERSPAN Index fields:
> ./ip link add dev ers11 type erspan seq key 100 erspan 123 \
>   local 172.16.1.200 remote 172.16.1.100
> 
> Signed-off-by: William Tu 
> Signed-off-by: Meenakshi Vohra 
> Cc: Stephen Hemminger 
> Cc: Alexey Kuznetsov 

Waiting until ERSPAN is upstream in kernel (net-next).

Re: [PATCH net-next 3/3 v5] drivers: net: ethernet: qualcomm: rmnet: Initial implementation

2017-08-21 Thread Stephen Hemminger

> +void rmnet_vnd_setup(struct net_device *rmnet_dev)
> +{
> + struct rmnet_priv *priv;
> +
> + /* Clear out private data */
> + priv = netdev_priv(rmnet_dev);
> + memset(priv, 0, sizeof(struct rmnet_priv));

Netdev private area is always zero on creation. See alloc_netdev().


> +struct rmnet_endpoint *rmnet_vnd_get_endpoint(struct net_device *rmnet_dev)
> +{
> + struct rmnet_priv *priv;
> +
> + if (!rmnet_dev)
> + return 0;

Do not confuse 0 with NULL. Did you run sparse?

> +
> + priv = netdev_priv(rmnet_dev);
> + if (!priv)
> + return 0;


netdev_priv() always returns a non-NULL value.  The private area is just
a constant offset below the original network_device structure.

[PATCH v2 3/4] net: phy: realtek: add disable RX internal delay mode

2017-08-21 Thread Icenowy Zheng

From: Icenowy Zheng 

Some RTL8211E chips have broken GbE function, which needs a hack to
fix. It's said that this fix will affect the performance on not-buggy
PHYs, so it should only be enabled on boards with the broken PHY.
Currently only some Pine64+ boards are known to have this issue.

This hack is said to disable RX delay for RTL8211E according to Realtek.
So implement it as RGMII-TXID mode.

As this hack is not documented on the datasheet at all, it contains
magic numbers, and could not be revealed. These magic numbers are
received from Realtek via Pine64.

Signed-off-by: Icenowy Zheng 
---
Changes in v2:
- Used RGMII_TXID phy mode.

 drivers/net/phy/realtek.c | 31 +++
 1 file changed, 31 insertions(+)

diff --git a/drivers/net/phy/realtek.c b/drivers/net/phy/realtek.c
index d820d00addf6..8306b6abaaa8 100644
--- a/drivers/net/phy/realtek.c
+++ b/drivers/net/phy/realtek.c
@@ -13,6 +13,7 @@
  * option) any later version.
  *
  */
+#include 
 #include 
 #include 
 
@@ -26,6 +27,8 @@
 #define RTL8211_PAGE_SELECT0x1f
 
 #define RTL8211E_INER_LINK_STATUS 0x400
+#define RTL8211E_EXT_PAGE_SELECT 0x1e
+#define RTL8211E_EXT_PAGE  0x7
 
 #define RTL8211F_INER_LINK_STATUS 0x0010
 #define RTL8211F_INSR  0x1d
@@ -121,6 +124,33 @@ static int rtl8211f_config_init(struct phy_device *phydev)
return 0;
 }
 
+static int rtl8211e_config_init(struct phy_device *phydev)
+{
+   struct device *dev = >mdio.dev;
+   struct device_node *of_node = dev->of_node;
+   int ret;
+
+   ret = genphy_config_init(phydev);
+   if (ret < 0)
+   return ret;
+
+   if (phydev->interface == PHY_INTERFACE_MODE_RGMII_TXID) {
+   /* Disable the RX internal delay here.
+*
+* All the magic numbers are not documented on RTL8211E
+* datasheet. They're said to be from Realtek by Pine64.
+*/
+   phy_write(phydev, RTL8211_PAGE_SELECT, RTL8211E_EXT_PAGE);
+   phy_write(phydev, RTL8211E_EXT_PAGE_SELECT, 0xa4);
+   phy_write(phydev, 0x1c, 0xb591);
+
+   /* Restore to default page 0 */
+   phy_write(phydev, RTL8211_PAGE_SELECT, 0);
+   }
+
+   return 0;
+}
+
 static struct phy_driver realtek_drvs[] = {
{
.phy_id = 0x8201,
@@ -159,6 +189,7 @@ static struct phy_driver realtek_drvs[] = {
.features   = PHY_GBIT_FEATURES,
.flags  = PHY_HAS_INTERRUPT,
.config_aneg= _config_aneg,
+   .config_init= rtl8211e_config_init,
.read_status= _read_status,
.ack_interrupt  = _ack_interrupt,
.config_intr= _config_intr,
-- 
2.13.5

[PATCH v2 4/4] arm64: allwinner: a64: disable the RTL8211E internal RX delay on Pine64+

2017-08-21 Thread Icenowy Zheng

Some Pine64+ boards have a broken RTL8211E PHY, which cannot work
reliably in 1000Base-T mode with default configuration.

A solution is passed to Pine64, which is said to be disabling the
internal RX delay of the PHY.

Enable the hack by setting the PHY mode to RGMII-TXID.

Signed-off-by: Icenowy Zheng 
---
Changes in v2:
- It can be merged now as dwmac-sun8i entered mainline.
- Use phy-mode rgmii-txid instead of custom property.

 arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts 
b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
index 24f1aac366d6..ed715426fffc 100644
--- a/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
+++ b/arch/arm64/boot/dts/allwinner/sun50i-a64-pine64-plus.dts
@@ -52,7 +52,7 @@
  {
pinctrl-names = "default";
pinctrl-0 = <_pins>;
-   phy-mode = "rgmii";
+   phy-mode = "rgmii-txid";
phy-handle = <_rgmii_phy>;
status = "okay";
 };
-- 
2.13.5

Re: [PATCH net-next] virtio-net: invoke zerocopy callback on xmit path if no tx napi

2017-08-21 Thread Willem de Bruijn

>>> Interesting, deadlock could be treated as a radical case of the
>>> discussion
>>> here https://patchwork.kernel.org/patch/3787671/.
>>>
>>> git grep tells more similar skb_orphan() cases. Do we need to change them
>>> all (or part)?
>>
>> Most skb_orphan calls are not relevant to the issue of transmit delay.
>
>
> Yes, but at least we should audit the ones in drivers/net.

Do you mean other virtual device driver transmit paths, like xen,
specifically?

>>> Actually, we may meet similar issues at many other places (e.g netem).
>>
>> Netem is an interesting case. Because it is intended to mimic network
>> delay, at least in the case where it calls skb_orphan, it may make
>> sense to release all references, including calling skb_zcopy_clear.
>>
>> In general, zerocopy reverts to copy on all paths that may cause
>> unbounded delay due to another process. Guarding against delay
>> induced by the administrator is infeasible. It is always possible to
>> just pause the nic. Netem is one instance of that, and not unbounded.
>
>
> The problem is, admin may only delay the traffic in e.g. one interface, but
> it actually delays or stalls all traffic inside a VM.

Understood. Ideally we can remove the HoL blocking cause of this,
itself.

>>> Need
>>> to consider a complete solution for this. Figuring out all places that
>>> could
>>> delay a packet is a method.
>>
>> The issue described in the referenced patch seems like head of line
>> blocking between two flows. If one flow delays zerocopy descriptor
>> release from the vhost-net pool, it blocks all subsequent descriptors
>> in that pool from being released, also delaying other flows that use
>> the same descriptor pool. If the pool is empty, all transmission stopped.
>>
>> Reverting to copy tx when the pool reaches a low watermark, as the
>> patch does, fixes this.
>
>
> An issue of the referenced patch is that sndbuf could be smaller than low
> watermark.
>
>> Perhaps the descriptor pool should also be
>> revised to allow out of order completions. Then there is no need to
>> copy zerocopy packets whenever they may experience delay.
>
>
> Yes, but as replied in the referenced thread, windows driver may treat out
> of order completion as a bug.

Interesting. I missed that. Perhaps the zerocopy optimization
could be gated on guest support for out of order completions.

>> On the point of counting copy vs zerocopy: the new msg_zerocopy
>> variant of ubuf_info has a field to record whether a deep copy was
>> made. This can be used with vhost-net zerocopy, too.
>
>
> Just to make sure I understand. It's still not clear to me how to reuse this
> for vhost-net, e.g zerocopy flag is in a union which is not used by
> vhost_net.

True, but that is not set in stone. I went back and forth on that when
preparing fix 0a4a060bb204 ("sock: fix zerocopy_success regression
with msg_zerocopy"). The field can be moved outside the union and
initialized in the other zerocopy paths.

Re: [PATCH net-next] virtio-net: invoke zerocopy callback on xmit path if no tx napi

2017-08-21 Thread Jason Wang




On 2017年08月21日 23:41, Willem de Bruijn wrote:

On Mon, Aug 21, 2017 at 8:33 AM, Jason Wang  wrote:


On 2017年08月19日 14:38, Koichiro Den wrote:

Facing the possible unbounded delay relying on freeing on xmit path,
we also better to invoke and clear the upper layer zerocopy callback
beforehand to keep them from waiting for unbounded duration in vain.
For instance, this removes the possible deadlock in the case that the
upper layer is a zerocopy-enabled vhost-net.
This does not apply if napi_tx is enabled since it will be called in
reasonable time.

Signed-off-by: Koichiro Den 
---
   drivers/net/virtio_net.c | 8 
   1 file changed, 8 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 4302f313d9a7..f7deaa5b7b50 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -1290,6 +1290,14 @@ static netdev_tx_t start_xmit(struct sk_buff *skb,
struct net_device *dev)
 /* Don't wait up for transmitted skbs to be freed. */
 if (!use_napi) {
+   if (skb_shinfo(skb)->tx_flags & SKBTX_DEV_ZEROCOPY) {
+   struct ubuf_info *uarg;
+   uarg = skb_shinfo(skb)->destructor_arg;
+   if (uarg->callback)
+   uarg->callback(uarg, true);
+   skb_shinfo(skb)->destructor_arg = NULL;
+   skb_shinfo(skb)->tx_flags &= ~SKBTX_DEV_ZEROCOPY;
+   }
 skb_orphan(skb);
 nf_reset(skb);
 }



Interesting, deadlock could be treated as a radical case of the discussion
here https://patchwork.kernel.org/patch/3787671/.

git grep tells more similar skb_orphan() cases. Do we need to change them
all (or part)?

Most skb_orphan calls are not relevant to the issue of transmit delay.


Yes, but at least we should audit the ones in drivers/net.




Actually, we may meet similar issues at many other places (e.g netem).

Netem is an interesting case. Because it is intended to mimic network
delay, at least in the case where it calls skb_orphan, it may make
sense to release all references, including calling skb_zcopy_clear.

In general, zerocopy reverts to copy on all paths that may cause
unbounded delay due to another process. Guarding against delay
induced by the administrator is infeasible. It is always possible to
just pause the nic. Netem is one instance of that, and not unbounded.


The problem is, admin may only delay the traffic in e.g. one interface, 
but it actually delays or stalls all traffic inside a VM.





Need
to consider a complete solution for this. Figuring out all places that could
delay a packet is a method.

The issue described in the referenced patch seems like head of line
blocking between two flows. If one flow delays zerocopy descriptor
release from the vhost-net pool, it blocks all subsequent descriptors
in that pool from being released, also delaying other flows that use
the same descriptor pool. If the pool is empty, all transmission stopped.

Reverting to copy tx when the pool reaches a low watermark, as the
patch does, fixes this.


An issue of the referenced patch is that sndbuf could be smaller than 
low watermark.



Perhaps the descriptor pool should also be
revised to allow out of order completions. Then there is no need to
copy zerocopy packets whenever they may experience delay.


Yes, but as replied in the referenced thread, windows driver may treat 
out of order completion as a bug.




On the point of counting copy vs zerocopy: the new msg_zerocopy
variant of ubuf_info has a field to record whether a deep copy was
made. This can be used with vhost-net zerocopy, too.


Just to make sure I understand. It's still not clear to me how to reuse 
this for vhost-net, e.g zerocopy flag is in a union which is not used by 
vhost_net.


Thanks

[PATCH net-next,2/4] hv_netvsc: Clean up unused parameter from netvsc_get_rss_hash_opts()

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

The parameter "nvdev" is not in use.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 4677d21..d8612b1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1228,8 +1228,7 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
 }
 
 static int
-netvsc_get_rss_hash_opts(struct netvsc_device *nvdev,
-struct ethtool_rxnfc *info)
+netvsc_get_rss_hash_opts(struct ethtool_rxnfc *info)
 {
info->data = RXH_IP_SRC | RXH_IP_DST;
 
@@ -1267,7 +1266,7 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
return 0;
 
case ETHTOOL_GRXFH:
-   return netvsc_get_rss_hash_opts(nvdev, info);
+   return netvsc_get_rss_hash_opts(info);
}
return -EOPNOTSUPP;
 }
-- 
1.7.1

[PATCH net-next,0/4] hv_netvsc: Ethtool handler to change UDP hash levels

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

The patch set adds the functions to switch UDP hash level between
L3 and L4 by ethtool command. UDP over IPv4 and v6 can be set
differently. The default hash level is L4. We currently only
allow switching TX hash level from within the guests.

The ethtool callback function is triggered from the command line, and
updates the per-device variables of the hash level.

On Azure, fragmented UDP packets are not yet supported with L4
hashing, and may have a high packet loss rate. Using L3 hashing is
recommended in this case. This ethtool option allows a user to
make this selection.

Haiyang Zhang (4):
  hv_netvsc: Clean up unused parameter from netvsc_get_hash()
  hv_netvsc: Clean up unused parameter from netvsc_get_rss_hash_opts()
  hv_netvsc: Add ethtool handler to set and get UDP hash levels
  hv_netvsc: Update netvsc Document for UDP hash level setting

 Documentation/networking/netvsc.txt |   22 --
 drivers/net/hyperv/hyperv_net.h |2 +
 drivers/net/hyperv/netvsc_drv.c |   77 +++
 3 files changed, 88 insertions(+), 13 deletions(-)

[PATCH net-next,4/4] hv_netvsc: Update netvsc Document for UDP hash level setting

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

Update Documentation/networking/netvsc.txt for UDP hash level setting
and related info.

Signed-off-by: Haiyang Zhang 
---
 Documentation/networking/netvsc.txt |   22 +-
 1 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/Documentation/networking/netvsc.txt 
b/Documentation/networking/netvsc.txt
index 4ddb4e4..fa8d863 100644
--- a/Documentation/networking/netvsc.txt
+++ b/Documentation/networking/netvsc.txt
@@ -21,11 +21,23 @@ Features
   
   Hyper-V supports receive side scaling. For TCP, packets are
   distributed among available queues based on IP address and port
-  number. Current versions of Hyper-V host, only distribute UDP
-  packets based on the IP source and destination address.
-  The port number is not used as part of the hash value for UDP.
-  Fragmented IP packets are not distributed between queues;
-  all fragmented packets arrive on the first channel.
+  number.
+
+  For UDP, we can switch UDP hash level between L3 and L4 by ethtool
+  command. UDP over IPv4 and v6 can be set differently. The default
+  hash level is L4. We currently only allow switching TX hash level
+  from within the guests.
+
+  On Azure, fragmented UDP packets have high loss rate with L4
+  hashing. Using L3 hashing is recommended in this case.
+
+  For example, for UDP over IPv4 on eth0:
+  To include UDP port numbers in hashing:
+ethtool -N eth0 rx-flow-hash udp4 sdfn
+  To exclude UDP port numbers in hashing:
+ethtool -N eth0 rx-flow-hash udp4 sd
+  To show UDP hash level:
+ethtool -n eth0 rx-flow-hash udp4
 
   Generic Receive Offload, aka GRO
   
-- 
1.7.1

[PATCH net-next,3/4] hv_netvsc: Add ethtool handler to set and get UDP hash levels

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

The patch adds the functions to switch UDP hash level between
L3 and L4 by ethtool command. UDP over IPv4 and v6 can be set
differently. The default hash level is L4. We currently only
allow switching TX hash level from within the guests.

On Azure, fragmented UDP packets have high loss rate with L4
hashing. Using L3 hashing is recommended in this case.

For example, for UDP over IPv4 on eth0:
To include UDP port numbers in hashing:
ethtool -N eth0 rx-flow-hash udp4 sdfn
To exclude UDP port numbers in hashing:
ethtool -N eth0 rx-flow-hash udp4 sd
To show UDP hash level:
ethtool -n eth0 rx-flow-hash udp4

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h |2 +
 drivers/net/hyperv/netvsc_drv.c |   78 +++
 2 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 9198dd1..ff1c0c8 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -720,6 +720,8 @@ struct net_device_context {
u32 tx_send_table[VRSS_SEND_TAB_SIZE];
 
/* Ethtool settings */
+   bool udp4_l4_hash;
+   bool udp6_l4_hash;
u8 duplex;
u32 speed;
struct netvsc_ethtool_stats eth_stats;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index d8612b1..c0c4c91 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -190,10 +190,12 @@ static int netvsc_close(struct net_device *net)
return ppi;
 }
 
-/* Azure hosts don't support non-TCP port numbers in hashing yet. We compute
- * hash for non-TCP traffic with only IP numbers.
+/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
+ * packets. We can use ethtool to change UDP hash level when necessary.
  */
-static inline u32 netvsc_get_hash(struct sk_buff *skb)
+static inline u32 netvsc_get_hash(
+   struct sk_buff *skb,
+   const struct net_device_context *ndc)
 {
struct flow_keys flow;
u32 hash;
@@ -204,7 +206,11 @@ static inline u32 netvsc_get_hash(struct sk_buff *skb)
if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
return 0;
 
-   if (flow.basic.ip_proto == IPPROTO_TCP) {
+   if (flow.basic.ip_proto == IPPROTO_TCP ||
+   (flow.basic.ip_proto == IPPROTO_UDP &&
+((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) ||
+ (flow.basic.n_proto == htons(ETH_P_IPV6) &&
+  ndc->udp6_l4_hash)))) {
return skb_get_hash(skb);
} else {
if (flow.basic.n_proto == htons(ETH_P_IP))
@@ -227,7 +233,7 @@ static inline int netvsc_get_tx_queue(struct net_device 
*ndev,
struct sock *sk = skb->sk;
int q_idx;
 
-   q_idx = ndc->tx_send_table[netvsc_get_hash(skb) &
+   q_idx = ndc->tx_send_table[netvsc_get_hash(skb, ndc) &
   (VRSS_SEND_TAB_SIZE - 1)];
 
/* If queue index changed record the new value */
@@ -891,6 +897,9 @@ static void netvsc_init_settings(struct net_device *dev)
 {
struct net_device_context *ndc = netdev_priv(dev);
 
+   ndc->udp4_l4_hash = true;
+   ndc->udp6_l4_hash = true;
+
ndc->speed = SPEED_UNKNOWN;
ndc->duplex = DUPLEX_FULL;
 }
@@ -1228,7 +1237,8 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
 }
 
 static int
-netvsc_get_rss_hash_opts(struct ethtool_rxnfc *info)
+netvsc_get_rss_hash_opts(struct net_device_context *ndc,
+struct ethtool_rxnfc *info)
 {
info->data = RXH_IP_SRC | RXH_IP_DST;
 
@@ -1236,9 +1246,20 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
case TCP_V4_FLOW:
case TCP_V6_FLOW:
info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
-   /* fallthrough */
+   break;
+
case UDP_V4_FLOW:
+   if (ndc->udp4_l4_hash)
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
+   break;
+
case UDP_V6_FLOW:
+   if (ndc->udp6_l4_hash)
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
+   break;
+
case IPV4_FLOW:
case IPV6_FLOW:
break;
@@ -1266,11 +1287,51 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
return 0;
 
case ETHTOOL_GRXFH:
-   return netvsc_get_rss_hash_opts(info);
+   return netvsc_get_rss_hash_opts(ndc, info);
}
return -EOPNOTSUPP;
 }
 
+static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
+   struct ethtool_rxnfc *info)
+{
+   if (info->data == (RXH_IP_SRC | RXH_IP_DST |
+  RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
+   if

[PATCH net-next,1/4] hv_netvsc: Clean up unused parameter from netvsc_get_hash()

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

The parameter "sk" is not in use.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index b33f050..4677d21 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -193,7 +193,7 @@ static int netvsc_close(struct net_device *net)
 /* Azure hosts don't support non-TCP port numbers in hashing yet. We compute
  * hash for non-TCP traffic with only IP numbers.
  */
-static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk)
+static inline u32 netvsc_get_hash(struct sk_buff *skb)
 {
struct flow_keys flow;
u32 hash;
@@ -227,7 +227,7 @@ static inline int netvsc_get_tx_queue(struct net_device 
*ndev,
struct sock *sk = skb->sk;
int q_idx;
 
-   q_idx = ndc->tx_send_table[netvsc_get_hash(skb, sk) &
+   q_idx = ndc->tx_send_table[netvsc_get_hash(skb) &
   (VRSS_SEND_TAB_SIZE - 1)];
 
/* If queue index changed record the new value */
-- 
1.7.1

Re: [PATCH net-next] liquidio: show NIC's U-Boot version in a dev_info() message

2017-08-21 Thread Andrew Lunn

On Mon, Aug 21, 2017 at 06:19:44PM -0700, Felix Manlunas wrote:
> From: Weilin Chang 
> 
> Signed-off-by: Weilin Chang 
> Signed-off-by: Felix Manlunas 
> ---
>  .../net/ethernet/cavium/liquidio/octeon_console.c  | 85 
> ++
>  .../net/ethernet/cavium/liquidio/octeon_device.h   |  5 ++
>  2 files changed, 90 insertions(+)
> 
> diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_console.c 
> b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
> index 19e5212..88ef12b 100644
> --- a/drivers/net/ethernet/cavium/liquidio/octeon_console.c
> +++ b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
> @@ -30,6 +30,7 @@
>  #include "liquidio_image.h"
>  #include "octeon_mem_ops.h"
>  
> +static void octeon_get_uboot_version(struct octeon_device *oct);

Hi Felix

Please can we avoid this forward declaration by putting the code in
the correct order.

>  static void octeon_remote_lock(void);
>  static void octeon_remote_unlock(void);
>  static u64 cvmx_bootmem_phy_named_block_find(struct octeon_device *oct,
> @@ -611,6 +612,9 @@ int octeon_add_console(struct octeon_device *oct, u32 
> console_num,
>  
>   work = >console_poll_work[console_num].work;
>  
> + if (oct->uboot_len == 0)
> + octeon_get_uboot_version(oct);
> +
>   INIT_DELAYED_WORK(work, check_console);
>   oct->console_poll_work[console_num].ctxptr = (void *)oct;
>   oct->console_poll_work[console_num].ctxul = console_num;
> @@ -724,6 +728,87 @@ static int octeon_console_read(struct octeon_device 
> *oct, u32 console_num,
>   return bytes_to_read;
>  }
>  
> +static void octeon_get_uboot_version(struct octeon_device *oct)
> +{
> + s32 bytes_read, tries, total_read;
> + struct octeon_console *console;
> + u32 console_num = 0;
> + int i;
> +
> + if (octeon_console_send_cmd(oct, "setenv stdout pci\n", 50))
> + return;
> +
> + console = >console[console_num];
> + tries = 0;
> + total_read = 0;
> +
> + if (octeon_console_send_cmd(oct, "version\n", 1))
> + return;
> +
> + do {
> + /* Take console output regardless of whether it will
> +  * be logged
> +  */
> + bytes_read =
> + octeon_console_read(oct,
> + console_num, oct->uboot_version +
> + total_read,
> + OCTEON_UBOOT_BUFFER_SIZE - 1 -
> + total_read);
> + if (bytes_read > 0) {
> + oct->uboot_version[bytes_read] = 0x0;
> +
> + total_read += bytes_read;
> + if (console->waiting)
> + octeon_console_handle_result(oct, console_num);
> + } else if (bytes_read < 0) {
> + dev_err(>pci_dev->dev, "Error reading console %u, 
> ret=%d\n",
> + console_num, bytes_read);
> + }
> +
> + tries++;
> + } while ((bytes_read > 0) && (tries < 16));
> +
> + /* If nothing is read after polling the console,
> +  * output any leftovers if any
> +  */
> + if ((total_read == 0) && (console->leftover[0])) {
> + dev_dbg(>pci_dev->dev, "%u: %s\n",
> + console_num, console->leftover);
> + console->leftover[0] = '\0';
> + }
> +
> + if (octeon_console_send_cmd(oct, "setenv stdout serial\n", 50))
> + return;
> +
> + /* U-Boot */
> + for (i = 0; i < (OCTEON_UBOOT_BUFFER_SIZE - 9); i++) {
> + if (oct->uboot_version[i] == 'U' &&
> + oct->uboot_version[i + 2] == 'B' &&
> + oct->uboot_version[i + 3] == 'o' &&
> + oct->uboot_version[i + 4] == 'o' &&
> + oct->uboot_version[i + 5] == 't') {
> + oct->uboot_sidx = i;
> + i++;
> + for (; oct->uboot_version[i] != 0x0; i++) {

I think you need a test in here to ensure i stays within the buffer.

> + if (oct->uboot_version[i] == 'm' &&
> + oct->uboot_version[i + 1] == 'i' &&
> + oct->uboot_version[i + 2] == 'p' &&
> + oct->uboot_version[i + 3] == 's') {
> + oct->uboot_eidx = i - 1;
> + oct->uboot_version[i - 1] = 0x0;
> + oct->uboot_len = oct->uboot_eidx -
> + oct->uboot_sidx + 1;
> + dev_info(>pci_dev->dev, "%s\n",
> +  >uboot_version
> +

linux-next: manual merge of the net-next tree with the rockchip tree

2017-08-21 Thread Stephen Rothwell

Hi all,

Today's linux-next merge of the net-next tree got a conflict in:

  arch/arm64/boot/dts/rockchip/rk3328-evb.dts

between commits:

  ab78718bda79 ("arm64: dts: rockchip: Enable tsadc module on RK3328 eavluation 
board")
  1e28037ec88e ("arm64: dts: rockchip: add rk805 node for rk3328-evb")

from the rockchip tree and commit:

  4b05bc6157eb ("ARM64: dts: rockchip: Enable gmac2phy for rk3328-evb")

from the net-next tree.

I fixed it up (see below) and can carry the fix as necessary. This
is now fixed as far as linux-next is concerned, but any non trivial
conflicts should be mentioned to your upstream maintainer when your tree
is submitted for merging.  You may also want to consider cooperating
with the maintainer of the conflicting tree to minimise any particularly
complex conflicts.

-- 
Cheers,
Stephen Rothwell

diff --cc arch/arm64/boot/dts/rockchip/rk3328-evb.dts
index 86605ae7b6f5,b9f36dad17e6..
--- a/arch/arm64/boot/dts/rockchip/rk3328-evb.dts
+++ b/arch/arm64/boot/dts/rockchip/rk3328-evb.dts
@@@ -51,147 -51,24 +51,164 @@@
stdout-path = "serial2:150n8";
};
  
 +  dc_12v: dc-12v {
 +  compatible = "regulator-fixed";
 +  regulator-name = "dc_12v";
 +  regulator-always-on;
 +  regulator-boot-on;
 +  regulator-min-microvolt = <1200>;
 +  regulator-max-microvolt = <1200>;
 +  };
 +
+   vcc_phy: vcc-phy-regulator {
+   compatible = "regulator-fixed";
+   regulator-name = "vcc_phy";
+   regulator-always-on;
+   regulator-boot-on;
+   };
++
 +  vcc_sys: vcc-sys {
 +  compatible = "regulator-fixed";
 +  regulator-name = "vcc_sys";
 +  regulator-always-on;
 +  regulator-boot-on;
 +  regulator-min-microvolt = <500>;
 +  regulator-max-microvolt = <500>;
 +  vin-supply = <_12v>;
 +  };
  };
  
+  {
+   phy-supply = <_phy>;
+   clock_in_out = "output";
+   assigned-clocks = < SCLK_MAC2PHY_SRC>;
+   assigned-clock-rate = <5000>;
+   assigned-clocks = < SCLK_MAC2PHY>;
+   assigned-clock-parents = < SCLK_MAC2PHY_SRC>;
+   status = "okay";
+ };
+ 
 + {
 +  status = "okay";
 +
 +  rk805: rk805@18 {
 +  compatible = "rockchip,rk805";
 +  reg = <0x18>;
 +  interrupt-parent = <>;
 +  interrupts = <6 IRQ_TYPE_LEVEL_LOW>;
 +  #clock-cells = <1>;
 +  clock-output-names = "xin32k", "rk805-clkout2";
 +  gpio-controller;
 +  #gpio-cells = <2>;
 +  pinctrl-names = "default";
 +  pinctrl-0 = <_int_l>;
 +  rockchip,system-power-controller;
 +  wakeup-source;
 +
 +  vcc1-supply = <_sys>;
 +  vcc2-supply = <_sys>;
 +  vcc3-supply = <_sys>;
 +  vcc4-supply = <_sys>;
 +  vcc5-supply = <_io>;
 +  vcc6-supply = <_io>;
 +
 +  regulators {
 +  vdd_logic: DCDC_REG1 {
 +  regulator-name = "vdd_logic";
 +  regulator-min-microvolt = <712500>;
 +  regulator-max-microvolt = <145>;
 +  regulator-always-on;
 +  regulator-boot-on;
 +  regulator-state-mem {
 +  regulator-on-in-suspend;
 +  regulator-suspend-microvolt = <100>;
 +  };
 +  };
 +
 +  vdd_arm: DCDC_REG2 {
 +  regulator-name = "vdd_arm";
 +  regulator-min-microvolt = <712500>;
 +  regulator-max-microvolt = <145>;
 +  regulator-always-on;
 +  regulator-boot-on;
 +  regulator-state-mem {
 +  regulator-on-in-suspend;
 +  regulator-suspend-microvolt = <95>;
 +  };
 +  };
 +
 +  vcc_ddr: DCDC_REG3 {
 +  regulator-name = "vcc_ddr";
 +  regulator-always-on;
 +  regulator-boot-on;
 +  regulator-state-mem {
 +  regulator-on-in-suspend;
 +  };
 +  };
 +
 +  vcc_io: DCDC_REG4 {
 +  regulator-name = "vcc_io";
 +  regulator-min-microvolt = <330>;
 +  regulator-max-microvolt = <330>;
 +

[PATCH net-next] liquidio: show NIC's U-Boot version in a dev_info() message

2017-08-21 Thread Felix Manlunas

From: Weilin Chang 

Signed-off-by: Weilin Chang 
Signed-off-by: Felix Manlunas 
---
 .../net/ethernet/cavium/liquidio/octeon_console.c  | 85 ++
 .../net/ethernet/cavium/liquidio/octeon_device.h   |  5 ++
 2 files changed, 90 insertions(+)

diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_console.c 
b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
index 19e5212..88ef12b 100644
--- a/drivers/net/ethernet/cavium/liquidio/octeon_console.c
+++ b/drivers/net/ethernet/cavium/liquidio/octeon_console.c
@@ -30,6 +30,7 @@
 #include "liquidio_image.h"
 #include "octeon_mem_ops.h"
 
+static void octeon_get_uboot_version(struct octeon_device *oct);
 static void octeon_remote_lock(void);
 static void octeon_remote_unlock(void);
 static u64 cvmx_bootmem_phy_named_block_find(struct octeon_device *oct,
@@ -611,6 +612,9 @@ int octeon_add_console(struct octeon_device *oct, u32 
console_num,
 
work = &oct->console_poll_work[console_num].work;
 
+   if (oct->uboot_len == 0)
+   octeon_get_uboot_version(oct);
+
INIT_DELAYED_WORK(work, check_console);
oct->console_poll_work[console_num].ctxptr = (void *)oct;
oct->console_poll_work[console_num].ctxul = console_num;
@@ -724,6 +728,87 @@ static int octeon_console_read(struct octeon_device *oct, 
u32 console_num,
return bytes_to_read;
 }
 
+static void octeon_get_uboot_version(struct octeon_device *oct)
+{
+   s32 bytes_read, tries, total_read;
+   struct octeon_console *console;
+   u32 console_num = 0;
+   int i;
+
+   if (octeon_console_send_cmd(oct, "setenv stdout pci\n", 50))
+   return;
+
+   console = &oct->console[console_num];
+   tries = 0;
+   total_read = 0;
+
+   if (octeon_console_send_cmd(oct, "version\n", 1))
+   return;
+
+   do {
+   /* Take console output regardless of whether it will
+* be logged
+*/
+   bytes_read =
+   octeon_console_read(oct,
+   console_num, oct->uboot_version +
+   total_read,
+   OCTEON_UBOOT_BUFFER_SIZE - 1 -
+   total_read);
+   if (bytes_read > 0) {
+   oct->uboot_version[bytes_read] = 0x0;
+
+   total_read += bytes_read;
+   if (console->waiting)
+   octeon_console_handle_result(oct, console_num);
+   } else if (bytes_read < 0) {
+   dev_err(&oct->pci_dev->dev, "Error reading console %u, 
ret=%d\n",
+   console_num, bytes_read);
+   }
+
+   tries++;
+   } while ((bytes_read > 0) && (tries < 16));
+
+   /* If nothing is read after polling the console,
+* output any leftovers if any
+*/
+   if ((total_read == 0) && (console->leftover[0])) {
+   dev_dbg(&oct->pci_dev->dev, "%u: %s\n",
+   console_num, console->leftover);
+   console->leftover[0] = '\0';
+   }
+
+   if (octeon_console_send_cmd(oct, "setenv stdout serial\n", 50))
+   return;
+
+   /* U-Boot */
+   for (i = 0; i < (OCTEON_UBOOT_BUFFER_SIZE - 9); i++) {
+   if (oct->uboot_version[i] == 'U' &&
+   oct->uboot_version[i + 2] == 'B' &&
+   oct->uboot_version[i + 3] == 'o' &&
+   oct->uboot_version[i + 4] == 'o' &&
+   oct->uboot_version[i + 5] == 't') {
+   oct->uboot_sidx = i;
+   i++;
+   for (; oct->uboot_version[i] != 0x0; i++) {
+   if (oct->uboot_version[i] == 'm' &&
+   oct->uboot_version[i + 1] == 'i' &&
+   oct->uboot_version[i + 2] == 'p' &&
+   oct->uboot_version[i + 3] == 's') {
+   oct->uboot_eidx = i - 1;
+   oct->uboot_version[i - 1] = 0x0;
+   oct->uboot_len = oct->uboot_eidx -
+   oct->uboot_sidx + 1;
+   dev_info(&oct->pci_dev->dev, "%s\n",
+&oct->uboot_version
+   [oct->uboot_sidx]);
+   return;
+   }
+   }
+   }
+   }
+}
+
 #define FBUF_SIZE  (4 * 1024 * 1024)
 
 int octeon_download_firmware(struct octeon_device *oct, const u8 *data,
diff --git

Re: [PATCH net-next 3/3 v7] drivers: net: ethernet: qualcomm: rmnet: Initial implementation

2017-08-21 Thread Andrew Lunn

> +static int rmnet_unregister_real_device(struct net_device *real_dev)
> +{
> + struct rmnet_real_dev_info *rdinfo;
> + struct list_head *iter;
> +
> + ASSERT_RTNL();
> +
> + if (!rmnet_is_real_dev_registered(real_dev) ||
> + netdev_lower_get_next(real_dev, &iter))
> + return -EINVAL;
> +
> + rdinfo = __rmnet_get_real_dev_info(real_dev);
> + kfree(rdinfo);
> +
> + netdev_rx_handler_unregister(real_dev);
> +
> + /* release reference on real_dev */
> + dev_put(real_dev);
> +
> + netdev_info(real_dev, "Removed from rmnet\n");

I would probably turn all your netdev_info()s into netdev_dbg()s.  You
seem to be spamming the kernel log quite a bit.

  Andrew

Re: [PATCH net] udp: on peeking bad csum, drop packets even if not at head

2017-08-21 Thread Willem de Bruijn

On Mon, Aug 21, 2017 at 8:12 PM, Willem de Bruijn
 wrote:
> On Mon, Aug 21, 2017 at 6:40 PM, Eric Dumazet  wrote:
>> On Mon, 2017-08-21 at 17:39 -0400, Willem de Bruijn wrote:
>>> From: Willem de Bruijn 
>>>
>>> When peeking, if a bad csum is discovered, the skb is unlinked from
>>> the queue with __sk_queue_drop_skb and the peek operation restarted.
>>>
>>> __sk_queue_drop_skb only drops packets that match the queue head. With
>>> sk_peek_off, the skb need not be at head, causing the call to fail and
>>> the same skb to be found again on restart.
>>>
>>> Walk the queue to find the correct skb. Limit the walk to sk_peek_off,
>>> to bound cycle cost to at most twice the original skb_queue_walk in
>>> __skb_try_recv_from_queue.
>>>
>>> The operation may race with updates to sk_peek_off. As the operation
>>> is retried, it will eventually succeed.
>>>
>>> Signed-off-by: Willem de Bruijn 
>>
>> You forgot the Fixes: tag, that such a bug fix deserves.
>
> Indeed, sorry. I'm looking into that now. It should be the patch that
> introduced peeking at offset, but need to verify.

It is. Fixes: 627d2d6b5500 ("udp: enable MSG_PEEK at non-zero offset")

Re: [PATCH net-next 1/3 v7] net: ether: Add support for multiplexing and aggregation type

2017-08-21 Thread Andrew Lunn

On Mon, Aug 21, 2017 at 04:36:57PM -0600, Subash Abhinov Kasiviswanathan wrote:
> Define the multiplexing and aggregation (MAP) ether type 0xDA1A. This
> is needed for receiving data in the MAP protocol like RMNET. This is
> not an officially registered ID.
> 
> Signed-off-by: Subash Abhinov Kasiviswanathan 
> ---
>  include/uapi/linux/if_ether.h | 4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
> index 5bc9bfd..e80b03f 100644
> --- a/include/uapi/linux/if_ether.h
> +++ b/include/uapi/linux/if_ether.h
> @@ -104,7 +104,9 @@
>  #define ETH_P_QINQ3  0x9300  /* deprecated QinQ VLAN [ NOT AN 
> OFFICIALLY REGISTERED ID ] */
>  #define ETH_P_EDSA   0xDADA  /* Ethertype DSA [ NOT AN OFFICIALLY 
> REGISTERED ID ] */
>  #define ETH_P_AF_IUCV   0xFBFB   /* IBM af_iucv [ NOT AN 
> OFFICIALLY REGISTERED ID ] */
> -
> +#define ETH_P_MAP   0xDA1A  /* Multiplexing and Aggregation 
> Protocol
> +  *  NOT AN OFFICIALLY REGISTERED ID ]
> +  */

Hi Subash

This list is sorted. So this entry should go earlier.

 Andrew

Re: [PATCH net-next v2 1/2] tcp: Remove unnecessary dst check in tcp_conn_request.

2017-08-21 Thread Tonghao Zhang

On Mon, Aug 21, 2017 at 10:56 PM, Eric Dumazet  wrote:
> Please do not top post.
Got it, thanks.

> On Mon, 2017-08-21 at 21:24 +0800, Tonghao Zhang wrote:
>> Thanks,  yes  this is a bug. I found this bug exists from 3.17~ 4.13.
>> The commit is d94e0417
>>
>
> This bug was there at the beginning of git tree.
>
>
>> One question:  should I send a patch for each kernel version because
>> code conflicts ?
>>
>> a patch for v4.12
>> a patch for v4.11
>> a patch for v4.10~v4.7
>> a patch for v4.6~v3.17
>>
>> and
>> a patch for  net-next, because tcp_tw_recycle has been removed.
>>
>
> Given this bug only would matter if syncookies are disabled, I would not
> bother and only target net-next. This does not look serious enough to
> deserve backports to stable versions.
>
OK, thanks again.

>> Thanks very much.
>>
>> On Sun, Aug 20, 2017 at 12:25 PM, David Miller  wrote:
>> > From: Tonghao Zhang 
>> > Date: Wed, 16 Aug 2017 20:02:45 -0700
>> >
>> >> Because we remove the tcp_tw_recycle support in the commit
>> >> 4396e46187c ('tcp: remove tcp_tw_recycle') and also delete
>> >> the code 'af_ops->route_req' for sysctl_tw_recycle in tcp_conn_request.
>> >> Now when we call the 'af_ops->route_req', the dst always is
>> >> NULL, and we remove the unnecessary check.
>> >>
>> >> Signed-off-by: Tonghao Zhang 
>> >
>> > This is a bug actually, rather than something to paper over
>> > by removing the check.
>> >
>> > Code earlier in this function needs a proper 'dst' in order to operate
>> > properly.
>> >
>> > There is a call to tcp_peer_is_proven() which must have a proper route
>> > to make the determination yet it will always be NULL.
>> >
>> > Please investigate what the code is doing and how a test became
>> > "unnecessary" over time before blindly removing it, thank you.
>
>

Re: Re: [PATCH net-next] net: sched: Add the invalid handle check in qdisc_class_find

2017-08-21 Thread Cong Wang

On Mon, Aug 21, 2017 at 5:46 PM, Gao Feng  wrote:
> Hi Cong,
>
> Thanks for the reminder.
> But I actually did use get_maintainer.pl before sending the patch.
>
> The following is the output.
> [fgao@ikuai8 net-next]#./scripts/get_maintainer.pl 
> patch_ScheCheck/0001-net-sched-Add-the-invalid-handle-check-in-qdisc_clas.patch
> "David S. Miller"  (maintainer:NETWORKING [GENERAL])
> netdev@vger.kernel.org (open list:NETWORKING [GENERAL])
> linux-ker...@vger.kernel.org (open list)
>
> I don't know if it is an issue of the script "get_maintainer.pl"  or my usage 
> is wrong.

No, probably because we don't add include/net/sch_generic.h into TC
subsystem, while we should.

Re:Re: [PATCH net-next] net: sched: Add the invalid handle check in qdisc_class_find

2017-08-21 Thread Gao Feng

At 2017-08-22 03:58:03, "Cong Wang"  wrote:
>On Mon, Aug 21, 2017 at 10:47 AM, David Miller  wrote:
>> From: gfree.w...@vip.163.com
>> Date: Fri, 18 Aug 2017 15:23:24 +0800
>>
>>> From: Gao Feng 
>>>
>>> Add the invalid handle "0" check to avoid unnecessary search, because
>>> the qdisc uses the skb->priority as the handle value to look up, and
>>> it is "0" usually.
>>>
>>> Signed-off-by: Gao Feng 
>>
>> Jamal, Cong, please review.
>>
>> If 'id' zero is never hashed into the tables, this change looks
>> legitimate.
>
>Looks good to me.
>
>Gao, in the future please Cc maintainers directly, you can
>use ./scripts/get_maintainer.pl.
>
>Thanks.

Hi Cong,

Thanks for the reminder.
But I actually did use get_maintainer.pl before sending the patch.

The following is the output.
[fgao@ikuai8 net-next]#./scripts/get_maintainer.pl 
patch_ScheCheck/0001-net-sched-Add-the-invalid-handle-check-in-qdisc_clas.patch
"David S. Miller"  (maintainer:NETWORKING [GENERAL])
netdev@vger.kernel.org (open list:NETWORKING [GENERAL])
linux-ker...@vger.kernel.org (open list)

I don't know if it is an issue of the script "get_maintainer.pl"  or my usage 
is wrong.

Anyway, thanks for your and Jamal's review.

Best Regards
Feng

Re: [PATCH v2 2/2] net: phy: Don't use drv when it is NULL in phy_attached_print

2017-08-21 Thread Florian Fainelli

On 08/21/2017 07:24 AM, Andrew Lunn wrote:
> On Mon, Aug 21, 2017 at 01:45:30PM +0200, Romain Perier wrote:
>> Currently, if this logging function is used prior the phy driver is
>> bound to the phy device (that is usually done from .ndo_open),
>> 'phydev->drv' might be NULL, resulting in a kernel crash. That is
>> typically the case in the stmmac driver, info about the phy is displayed
>> during the registration of the MDIO bus, and then genphy driver is bound
>> to this phydev when .ndo_open is called.
>>
>> This commit fixes the issue by using the right genphy driver, when
>> phydev->drv is NULL.
>>
>> Fixes: fbca164776e4 ("net: stmmac: Use the right logging functi")
>> Signed-off-by: Romain Perier 
>> ---
>>  drivers/net/phy/phy_device.c | 13 +++--
>>  1 file changed, 11 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
>> index 1790f7fec125..6af6dc6dfeaf 100644
>> --- a/drivers/net/phy/phy_device.c
>> +++ b/drivers/net/phy/phy_device.c
>> @@ -864,15 +864,24 @@ EXPORT_SYMBOL(phy_attached_info);
>>  #define ATTACHED_FMT "attached PHY driver [%s] (mii_bus:phy_addr=%s, 
>> irq=%d)"
>>  void phy_attached_print(struct phy_device *phydev, const char *fmt, ...)
>>  {
>> +struct phy_driver *drv = phydev->drv;
>> +
>> +if (!drv) {
>> +if (phydev->is_c45)
>> +drv = &genphy_10g_driver;
>> +else
>> +drv = &genphy_driver;
>> +}
>> +
> 
> As i said in my comment to v1, i don't like this.

Agreed, just use Andrew's earlier suggestion of checking phydev->drv
validity.

We don't have an equivalent of "unregistered" in the PHY layer, but
"unbound" seems like it could be what we want here.

Thanks
-- 
Florian

Re: [PATCH v2 1/2] net: stmmac: Delete dead code for MDIO registration

2017-08-21 Thread Florian Fainelli

On 08/21/2017 04:45 AM, Romain Perier wrote:
> This code is no longer used, the logging function was changed by commit
> fbca164776e4 ("net: stmmac: Use the right logging functi").
> 
> Fixes: fbca164776e4 ("net: stmmac: Use the right logging functi")
> Signed-off-by: Romain Perier 
> ---
>  drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c | 16 
>  1 file changed, 16 deletions(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c 
> b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
> index 72ec711fcba2..f5f37bfa1d58 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_mdio.c
> @@ -248,9 +248,6 @@ int stmmac_mdio_register(struct net_device *ndev)
>   found = 0;
>   for (addr = 0; addr < PHY_MAX_ADDR; addr++) {
>   struct phy_device *phydev = mdiobus_get_phy(new_bus, addr);
> - int act = 0;
> - char irq_num[4];
> - char *irq_str;
>  
>   if (!phydev)
>   continue;
> @@ -273,19 +270,6 @@ int stmmac_mdio_register(struct net_device *ndev)
>   if (priv->plat->phy_addr == -1)
>   priv->plat->phy_addr = addr;
>  
> - act = (priv->plat->phy_addr == addr);
> - switch (phydev->irq) {
> - case PHY_POLL:
> - irq_str = "POLL";
> - break;
> - case PHY_IGNORE_INTERRUPT:
> - irq_str = "IGNORE";
> - break;
> - default:
> - sprintf(irq_num, "%d", phydev->irq);
> - irq_str = irq_num;
> - break;
> - }

I was actually just looking into moving these prints to
phy_attached_info(), since it is useful to know whether the interrupt is
POLL, IGNORE, or valid. Can you move that there? Then you can really
migrate over phy_attached_info() with no information loss.

Thanks!
-- 
Florian

Re: [iproute PATCH v2] lib/bpf: Don't leak fp in bpf_find_mntpt()

2017-08-21 Thread Stephen Hemminger

On Mon, 21 Aug 2017 16:46:51 +0200
Phil Sutter  wrote:

> If fopen() succeeded but len != PATH_MAX, the function leaks the open
> FILE pointer. Fix this by checking len value before calling fopen().
> 
> Signed-off-by: Phil Sutter 
> Acked-by: Daniel Borkmann 


Thanks, Applied

Re: [iproute PATCH v2 2/7] ss: Make sure index variable is >= 0

2017-08-21 Thread Stephen Hemminger

On Mon, 21 Aug 2017 19:08:08 +0200
Phil Sutter  wrote:

> This shouldn't happen but relying upon external data without checking
> may lead to unexpected results.
> 
> Signed-off-by: Phil Sutter 
> ---
>  misc/ss.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/misc/ss.c b/misc/ss.c
> index 10360e5a04ff8..1ee02d73b2d7f 100644
> --- a/misc/ss.c
> +++ b/misc/ss.c
> @@ -2004,7 +2004,7 @@ static void tcp_timer_print(struct tcpstat *s)
>   "unknown"
>   };
>  
> - if (s->timer) {
> + if (s->timer >= 0) {
>   if (s->timer > 4)
>   s->timer = 5;
>   printf(" timer:(%s,%s,%d)",

Let's go one step deeper on this.
Why is s->timer an int, it should be unsigned. In which case the code in
print would not have to change.

Re: [RFC net-next v2] bridge lwtunnel, VPLS & NVGRE

2017-08-21 Thread David Lamparter

On Mon, Aug 21, 2017 at 05:01:51PM -0700, Stephen Hemminger wrote:
> On Mon, 21 Aug 2017 19:15:17 +0200 David Lamparter  wrote:
> > > P.S.: For a little context on the bridge FDB changes - I'm hoping to
> > > find some time to extend this to the MDB to allow aggregating dst
> > > metadata and handing down a list of dst metas on TX.  This isn't
> > > specifically for VPLS but rather to give sufficient information to the
> > > 802.11 stack to allow it to optimize selecting rates (or unicasting)
> > > for multicast traffic by having the multicast subscriber list known.
> > > This is done by major commercial wifi solutions (e.g. google "dynamic
> > > multicast optimization".)  
> > 
> > You can find hacks at this on:
> > https://github.com/eqvinox/vpls-linux-kernel/tree/mdb-hack
> > Please note that the patches in that branch are not at an acceptable
> > quality level, but you can see the semantic relation to 802.11.
> > 
> > I would, however, like to point out that this branch has pseudo-working
> > IGMP/MLD snooping for VPLS, and it'd be 20-ish lines to add it to NVGRE
> > (I'll do that as soon as I get to it, it'll pop up on that branch too.)
> > 
> > This is relevant to the discussion because it's a feature which is
> > non-obvious (to me) on how to do with the VXLAN model of having an
> > entirely separate FDB.  Meanwhile, with this architecture, the proof of
> > concept / hack is coming in at a measly cost of:
> > 8 files changed, 176 insertions(+), 15 deletions(-)
> 
> I know the bridge is an easy target to extend L2 forwarding, but it is not
> the only option. Have you considered building a new driver

Yes I have;  I dismissed the approach because even though an fdb is
reasonable to duplicate, I did not believe replicating multicast
snooping code into both VPLS and 802.11 (and possibly VXLAN) to be a
viable option.  ...is it?

> (like VXLAN does) which does the forwarding you want. Having all
> features in one driver makes for worse performance, and increased
> complexity.

Can you elaborate?  I agree with that sentence as a general statement,
but a general statement needs to apply to a specific situation.  As
discussed in the previous thread with Nikolay, checking skb->_refdst
against 0 should be doable without touching additional cachelines, so
the performance cost should be rather small.  For complexity - it's
keeping an extra pointer around, which is semantically bound to the
existing net_bridge_fdb_entry->dst.  On the other hand, it spares us
from another copy of a fdb implementation, and two copies of multicast
snooping code...  I honestly believe this patchset is a good approach.

-David

Re: [iproute PATCH v2 0/3] Covscan: Fix for missing error checking

2017-08-21 Thread Stephen Hemminger

On Mon, 21 Aug 2017 18:36:49 +0200
Phil Sutter  wrote:

> This series collects patches from v1 dealing with spots where error
> checking is necessary or recommended.
> 
> Minor changes to patches 1 and 2, patch 3 remains unchanged.
> 
> Phil Sutter (3):
>   iproute: Check mark value input
>   iplink_vrf: Complain if main table is not found
>   devlink: Check return code of strslashrsplit()
> 
>  devlink/devlink.c | 16 
>  ip/iplink_vrf.c   |  4 +++-
>  ip/iproute.c  |  6 --
>  3 files changed, 19 insertions(+), 7 deletions(-)
> 

These 3 look fine. Applied

Re: [iproute PATCH v3 2/7] xfrm_state: Make sure alg_name is NULL-terminated

2017-08-21 Thread Stephen Hemminger

On Mon, 21 Aug 2017 15:23:36 +0200
Phil Sutter  wrote:

> Signed-off-by: Phil Sutter 
> ---
>  ip/xfrm_state.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/ip/xfrm_state.c b/ip/xfrm_state.c
> index e11c93bf1c3b5..7c0389038986e 100644
> --- a/ip/xfrm_state.c
> +++ b/ip/xfrm_state.c
> @@ -125,7 +125,8 @@ static int xfrm_algo_parse(struct xfrm_algo *alg, enum 
> xfrm_attr_type_t type,
>   fprintf(stderr, "warning: ALGO-NAME/ALGO-KEYMAT values will be sent to 
> the kernel promiscuously! (verifying them isn't implemented yet)\n");
>  #endif
>  
> - strncpy(alg->alg_name, name, sizeof(alg->alg_name));
> + strncpy(alg->alg_name, name, sizeof(alg->alg_name) - 1);
> + alg->alg_name[sizeof(alg->alg_name) - 1] = '\0';
>  
>   if (slen > 2 && strncmp(key, "0x", 2) == 0) {
>   /* split two chars "0x" from the top */

You are fixing enough of these null terminated string issues, that maybe
introducing strlcpy() would make sense. Either in utils (or -lbsd).

Re: [iproute PATCH v2 1/7] nstat: Avoid passing negative fd to fdopen()

2017-08-21 Thread Stephen Hemminger

On Mon, 21 Aug 2017 19:08:07 +0200
Phil Sutter  wrote:

> Introduce a wrapper which does the sanity checking and returns NULL
> in case fd is invalid.
> 
> Signed-off-by: Phil Sutter 
> ---
>  misc/nstat.c | 15 +++
>  1 file changed, 11 insertions(+), 4 deletions(-)
> 
> diff --git a/misc/nstat.c b/misc/nstat.c
> index 1212b1f2c8128..7cdde75a56e4e 100644
> --- a/misc/nstat.c
> +++ b/misc/nstat.c
> @@ -252,9 +252,16 @@ static void load_ugly_table(FILE *fp)
>   }
>  }
>  
> +static FILE *fdopen_null(int fd, const char *mode)
> +{
> + if (fd < 0)
> + return NULL;
> + return fdopen(fd, mode);
> +}
> +
>  static void load_sctp_snmp(void)
>  {
> - FILE *fp = fdopen(net_sctp_snmp_open(), "r");
> + FILE *fp = fdopen_null(net_sctp_snmp_open(), "r");
>  
>   if (fp) {
>   load_good_table(fp);
> @@ -264,7 +271,7 @@ static void load_sctp_snmp(void)
>  
>  static void load_snmp(void)
>  {
> - FILE *fp = fdopen(net_snmp_open(), "r");
> + FILE *fp = fdopen_null(net_snmp_open(), "r");
>  
>   if (fp) {
>   load_ugly_table(fp);
> @@ -274,7 +281,7 @@ static void load_snmp(void)
>  
>  static void load_snmp6(void)
>  {
> - FILE *fp = fdopen(net_snmp6_open(), "r");
> + FILE *fp = fdopen_null(net_snmp6_open(), "r");
>  
>   if (fp) {
>   load_good_table(fp);
> @@ -284,7 +291,7 @@ static void load_snmp6(void)
>  
>  static void load_netstat(void)
>  {
> - FILE *fp = fdopen(net_netstat_open(), "r");
> + FILE *fp = fdopen_null(net_netstat_open(), "r");
>  
>   if (fp) {
>   load_ugly_table(fp);

Why not just fix it at the source of the open.
I.e 
static FILE *generic_proc_open(const char *env, const char *name)
{
...
return fopen(p, "r");
}

Re: [iproute PATCH v3 2/5] nstat: Fix for potential NULL pointer dereference

2017-08-21 Thread Stephen Hemminger

On Mon, 21 Aug 2017 12:03:05 +0200
Phil Sutter  wrote:

> If the string at 'p' contains neither space nor newline, 'p' will become
> NULL. Make sure this isn't the case before dereferencing it.
> 
> Signed-off-by: Phil Sutter 
> ---
> Changes since v2:
> - Call abort() if 'p' becomes NULL.
> ---
>  misc/nstat.c | 2 ++
>  1 file changed, 2 insertions(+)
> 
> diff --git a/misc/nstat.c b/misc/nstat.c
> index a4dd405d43a93..56e9367e99736 100644
> --- a/misc/nstat.c
> +++ b/misc/nstat.c
> @@ -217,6 +217,8 @@ static void load_ugly_table(FILE *fp)
>   n->next = db;
>   db = n;
>   p = next;
> + if (!p)
> + abort();
>   }
>   n = db;
>   if (fgets(buf, sizeof(buf), fp) == NULL)

This doesn't do anything better than just dereferencing NULL.
In either case program crashes with no useful information to user.
Not applying this.

Re: [iproute PATCH v3 0/6] Covscan: Don't access garbage

2017-08-21 Thread Stephen Hemminger

On Mon, 21 Aug 2017 11:26:58 +0200
Phil Sutter  wrote:

> This series collects patches from v1 which resolve situations where
> garbage might be read, either due to missing initialization of
> variables or accessing data which went out of scope.
> 
> Changes since v2:
> - Rebased onto current master branch.
> - Dropped first patch since it is not a real issue.
> 
> Phil Sutter (6):
>   ipaddress: Avoid accessing uninitialized variable lcl
>   iplink_can: Prevent overstepping array bounds
>   ipmaddr: Avoid accessing uninitialized data
>   ss: Use C99 initializer in netlink_show_one()
>   netem/maketable: Check return value of fstat()
>   tc/q_multiq: Don't pass garbage in TCA_OPTIONS
> 
>  ip/ipaddress.c|  2 +-
>  ip/iplink_can.c   |  4 ++--
>  ip/ipmaddr.c  |  2 +-
>  misc/ss.c | 13 +++--
>  netem/maketable.c |  4 ++--
>  tc/q_multiq.c |  2 +-
>  6 files changed, 14 insertions(+), 13 deletions(-)
> 

These look fine. Applied.

Re: [iproute PATCH v2 0/7] Covscan: Dead code elimination

2017-08-21 Thread Stephen Hemminger

On Thu, 17 Aug 2017 19:09:24 +0200
Phil Sutter  wrote:

> This series collects patches from v1 which deal with dead code, either
> by removing it or changing context so it is accessed again if that makes
> sense.
> 
> No changes to the actual patches, just splitting into smaller series.
> 
> Phil Sutter (7):
>   devlink: No need for this self-assignment
>   ipntable: No need to check and assign to parms_rta
>   iproute: Fix for missing 'Oifs:' display
>   lib/rt_names: Drop dead code in rtnl_rttable_n2a()
>   ss: Skip useless check in parse_hostcond()
>   ss: Drop useless assignment
>   tc/m_gact: Drop dead code
> 
>  devlink/devlink.c |  2 +-
>  ip/ipntable.c |  2 --
>  ip/iproute.c  |  8 +---
>  lib/rt_names.c|  4 
>  misc/ss.c |  3 +--
>  tc/m_gact.c   | 14 +++---
>  6 files changed, 10 insertions(+), 23 deletions(-)
> 

Sure these look fine. Applied.

Re: [PATCH v6 iproute2 0/8] RDMAtool

2017-08-21 Thread Stephen Hemminger

On Sun, 20 Aug 2017 12:58:20 +0300
Leon Romanovsky  wrote:

> From: Leon Romanovsky 
> 
> This is the fifth revision of the series implementing the RDMAtool - the tool
> to configure RDMA devices.
> 
> It looks like everyone who was interested to read cover letter already did it,
> so I'll start from the changelog:
> 
> Changelog:
> v5->v6:
>  * Removed double includes
>  * Copied rdma_netlink.h from the kernel to include/rdma folder, so the
>tool can be built as a standalone.
> v4->v5:
>  * Rebased to latest net-next branch
>  * Moved BIT() macro from devlink to general utils.h file - Patch #1.
>  * Changed the order of patches - moved man pages to be last patch.
>  * Rewrote all switch->case->return_string constructions to be static
>tables with help of David's macro magic. Thanks a lot.
>  * Dropped dependency on exported device and port properties. Now tool depends
>on RDMA netlink only and all needed code is already in Doug's for-next.
>  * Added two OPA specific physical link states, because their names are
>too broad - TEST and OFFLINE, I named it as OPA_TEST and OPA_OFFLINE.
> v3->v4:
>  * Rebased to latest net-next branch
>  * Added JSON output -j (json) and -p (pretty output)
>  * Exported and reused kernel UAPIs and defines instead of hard coded
>version.
> v2->v3:
>  * Removed MAX()
>  * Reduced scope of rd_argv_match
>  * Removed return from rdma_free_devmap
>  * Added extra break at rdma_send_msg
> v1->v2:
>  * Squashed multiple (and similar) patches to be one patch for dev object
>and one patch for link object.
>  * Removed port_map struct
>  * Removed global netlink dump during initialization, it removed the need to 
> store
>the intermediate variables and reuse ability of netlink to signal if 
> variable
>exists or doesn't.
>  * Added "-d" --details option and put all CAPs under it.
> 
> v0->v1:
>  * Moved hunk with changes in man/Makefile from first patch to the last patch
>  * Removed the "unknown command" from the examples in commit messages
>  * Removed special "caps" parsing command and put it to be part of general 
> "show" command
>  * Changed parsed capability format to be similar to iproute2 suite
>  * Added FW version as an output of show command.
>  * Added forgotten CAP_FLAGS to the nla_policy list
> RFC->v0:
>  * Removed everything that is not implemented yet.
>  * Abandoned sysfs interfaces in favor of netlink.
> 
> -
> The initial proposal was sent as RFC [1] and was based on sysfs entries as 
> POC.
> 
> The current series was rewritten completely to work with RDMA netlinks as
> a source of user<->kernel communications. In order to achieve that, the
> RDMA netlinks were extensively refactored and modernized [2, 3, 4 and 5].
> 
> The Doug's for-next tag includes most of the needed patches for this tool.
> 
> The following is an example of various runs on my machine with 5 devices
> (4 in IB mode and one in Ethernet mode).
> 
> ### Without parameters
> $ rdma
> Usage: rdma [ OPTIONS ] OBJECT { COMMAND | help }
> where  OBJECT := { dev | link | help }
>OPTIONS := { -V[ersion] | -d[etails] | -j[son] | -p[retty]}
> 
> ### With unspecified device name
> $ rdma dev
> 1: mlx5_0: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3457 
> sys_image_guid 5254:00c0:fe12:3457
> 2: mlx5_1: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3458 
> sys_image_guid 5254:00c0:fe12:3458
> 3: mlx5_2: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3459 
> sys_image_guid 5254:00c0:fe12:3459
> 4: mlx5_3: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345a 
> sys_image_guid 5254:00c0:fe12:345a
> 5: mlx5_4: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345b 
> sys_image_guid 5254:00c0:fe12:345b
> 
> ### Detailed mode
> $ rdma -d dev
> 1: mlx5_0: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3457 
> sys_image_guid 5254:00c0:fe12:3457
> caps:  SYS_IMAGE_GUID, RC_RNR_NAK_GEN, MEM_WINDOW, UD_IP_CSUM, UD_TSO, XRC, 
> MEM_MGT_EXTENSIONS, BLOCK_MULTICAST_LOOPBACK, MEM_WINDOW_TYPE_2B, 
> RAW_IP_CSUM, MANAGED_FLOW_STEERING, RESIZE_MAX_WR>
> 2: mlx5_1: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3458 
> sys_image_guid 5254:00c0:fe12:3458
> caps:  SYS_IMAGE_GUID, RC_RNR_NAK_GEN, MEM_WINDOW, UD_IP_CSUM, UD_TSO, XRC, 
> MEM_MGT_EXTENSIONS, BLOCK_MULTICAST_LOOPBACK, MEM_WINDOW_TYPE_2B, 
> RAW_IP_CSUM, MANAGED_FLOW_STEERING, RESIZE_MAX_WR>
> 3: mlx5_2: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3459 
> sys_image_guid 5254:00c0:fe12:3459
> caps:  SYS_IMAGE_GUID, RC_RNR_NAK_GEN, MEM_WINDOW, UD_IP_CSUM, UD_TSO, XRC, 
> MEM_MGT_EXTENSIONS, BLOCK_MULTICAST_LOOPBACK, MEM_WINDOW_TYPE_2B, 
> RAW_IP_CSUM, MANAGED_FLOW_STEERING, RESIZE_MAX_WR>
> 4: mlx5_3: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345a 
>

Re: [PATCH net] udp: on peeking bad csum, drop packets even if not at head

2017-08-21 Thread Willem de Bruijn

On Mon, Aug 21, 2017 at 6:40 PM, Eric Dumazet  wrote:
> On Mon, 2017-08-21 at 17:39 -0400, Willem de Bruijn wrote:
>> From: Willem de Bruijn 
>>
>> When peeking, if a bad csum is discovered, the skb is unlinked from
>> the queue with __sk_queue_drop_skb and the peek operation restarted.
>>
>> __sk_queue_drop_skb only drops packets that match the queue head. With
>> sk_peek_off, the skb need not be at head, causing the call to fail and
>> the same skb to be found again on restart.
>>
>> Walk the queue to find the correct skb. Limit the walk to sk_peek_off,
>> to bound cycle cost to at most twice the original skb_queue_walk in
>> __skb_try_recv_from_queue.
>>
>> The operation may race with updates to sk_peek_off. As the operation
>> is retried, it will eventually succeed.
>>
>> Signed-off-by: Willem de Bruijn 
>
> You forgot the Fixes: tag, that such a bug fix deserves.

Indeed, sorry. I'm looking into that now. It should be the patch that
introduced peeking at offset, but need to verify.

I should also add that this bug was discovered by syzkaller.

> I am not a big fan of your patch and would prefer a solution without the
> loop.

Agreed.

Re: [PATCH net-next] net: dsa: User per-cpu 64-bit statistics

2017-08-21 Thread Florian Fainelli

On 08/21/2017 04:23 PM, Florian Fainelli wrote:
> On 08/04/2017 10:11 AM, Eric Dumazet wrote:
>> On Fri, 2017-08-04 at 08:51 -0700, Florian Fainelli wrote:
>>> On 08/03/2017 10:36 PM, Eric Dumazet wrote:
 On Thu, 2017-08-03 at 21:33 -0700, Florian Fainelli wrote:
> During testing with a background iperf pushing 1Gbit/sec worth of
> traffic and having both ifconfig and ethtool collect statistics, we
> could see quite frequent deadlocks. Convert the often accessed DSA slave
> network devices statistics to per-cpu 64-bit statistics to remove these
> deadlocks and provide fast efficient statistics updates.
>

 This seems to be a bug fix, it would be nice to get a proper tag like :

 Fixes: f613ed665bb3 ("net: dsa: Add support for 64-bit statistics")
>>>
>>> Right, should have been added, thanks!
>>>

 Problem here is that if multiple cpus can call dsa_switch_rcv() at the
 same time, then u64_stats_update_begin() contract is not respected.
>>>
>>> This is really where I struggled understanding what is wrong in the
>>> non-per CPU version, my understanding is that we have:
>>>
>>> - writers for xmit executes in process context
>>> - writers for receive executes from NAPI (from the DSA's master network
>>> device through its own NAPI doing netif_receive_skb -> netdev_uses_dsa
>>> -> netif_receive_skb)
>>>
>>> readers should all execute in process context. The test scenario that
>>> led to a deadlock involved running iperf in the background, having a
>>> while loop with both ifconfig and ethtool reading stats, and somehow
>>> when iperf exited, either reader would just be locked. So I guess this
>>> leaves us with the two writers not being mutually excluded then, right?
>>
>> You could add a debug version of u64_stats_update_begin()
>>
>> doing 
>>
>> int ret = atomic_inc((atomic_t *)syncp);
>>
>> BUG_ON(ret & 1);>
>>
>> And u64_stats_update_end()
>>
>> int ret = atomic_inc((atomic_t *)syncp);
> 
> so with your revised suggested patch:
> 
> static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
> {
> #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
> int ret = atomic_inc_return((atomic_t *)syncp);
> BUG_ON(ret & 1);
> #endif
> #if 0
> #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
> write_seqcount_begin(&syncp->seq);
> #endif
> #endif
> }
> 
> static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
> {
> #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
> int ret = atomic_inc_return((atomic_t *)syncp);
> BUG_ON(!(ret & 1));
> #endif
> #if 0
> #if BITS_PER_LONG==32 && defined(CONFIG_SMP)
> write_seqcount_end(&syncp->seq);
> #endif
> #endif
> }
> 
> and this makes us choke pretty early in IRQ accounting, did I get your
> suggestion right?

Well if we return 1 from atomic_inc_return() and the previous value was
zero, of course we are going to be bugging here. The idea behind the
patch I suppose is to make sure that we always get an odd number upon
u64_stats_update_begin()/entry, and an even number upon
u64_stats_update_end()/exit, right?

> 
> [0.015149] [ cut here ]
> [0.020051] kernel BUG at ./include/linux/u64_stats_sync.h:82!
> [0.026221] Internal error: Oops - BUG: 0 [#1] SMP ARM
> [0.031661] Modules linked in:
> [0.034970] CPU: 0 PID: 0 Comm: swapper/0 Not tainted
> 4.13.0-rc5-01297-g7d3f0cd43fee-dirty #33
> [0.043990] Hardware name: Broadcom STB (Flattened Device Tree)
> [0.050237] task: c180a500 task.stack: c180
> [0.055065] PC is at irqtime_account_delta+0xa4/0xa8
> [0.060322] LR is at 0x1
> [0.063057] pc : []lr : [<0001>]psr: 01d3
> [0.069652] sp : c1801eec  ip : ee78b458  fp : c0e5ea48
> [0.075212] r10: c18b4b40  r9 : f0803000  r8 : ee00a800
> [0.080781] r7 : 0001  r6 : c180a500  r5 : c180  r4 : 
> [0.087680] r3 :   r2 : ec8c  r1 : ee78b3c0  r0 : ee78b440
> [0.094546] Flags: nzcv  IRQs off  FIQs off  Mode SVC_32  ISA ARM
> Segment user
> [0.102314] Control: 30c5387d  Table: 3000  DAC: fffd
> [0.108414] Process swapper/0 (pid: 0, stack limit = 0xc1800210)
> [0.114791] Stack: (0xc1801eec to 0xc1802000)
> [0.119431] 1ee0:ee78b440 c180
> c180a500 0001 c02505c8
> [0.128079] 1f00: 0004 ee00a800 e000  
> c0227890 c17e6f20 c0278910
> [0.136665] 1f20: c185724c c18079a0 f080200c c1801f58 f0802000
> c0201494 c0e00c18 2053
> [0.145303] 1f40:  c1801f8c  c180 c18b4b40
> c020d238  001f
> [0.153915] 1f60: 00040d00  efffc940  c18b4b40
> c1807440  
> [0.162571] 1f80: c18b4b40 c0e5ea48 0004 c1801fa8 c0322fb0
> c0e00c18 2053 
> [0.171226] 1fa0: c18b4b40    
> c0e006c0  
> [0.179890] 1fc0:  c1807448 c0e5ea48  
> c18b4dd4

Re: [RFC net-next v2] bridge lwtunnel, VPLS & NVGRE

2017-08-21 Thread Stephen Hemminger

On Mon, 21 Aug 2017 19:15:17 +0200
David Lamparter  wrote:

> Hi all,
> 
> 
> this is an update on the earlier "[RFC net-next] VPLS support".  Note
> I've changed the subject lines on some of the patches to better reflect
> what they really do (tbh the earlier subject lines were crap.)
> 
> As previously, iproute2 / FRR patches are at:
> - https://github.com/eqvinox/vpls-iproute2
> - https://github.com/opensourcerouting/frr/commits/vpls
> while this patchset is also available at:
> - https://github.com/eqvinox/vpls-linux-kernel
> (but please be aware that I'm amending and rebasing commits)
> 
> The NVGRE implementation in the 3rd patch in this series is actually an
> accident - I was just wiring up gretap as a reference;  only after I was
> done I noticed that that sums up to NVGRE, more or less.  IMHO, it does
> serve well to demonstrate the bridge changes are not VPLS-specific.
> 
> To refer some notes from the first announce mail:
> > I've tested some basic setups, the chain from LDP down into the kernel
> > works at least in these.  FRR has some testcases around from OpenBSD
> > VPLS support, I haven't wired that up to run against Linux / this
> > patchset yet.  
> 
> Same as before (API didn't change).
> 
> > The patchset needs a lot of polishing (yes I left my TODO notes in the
> > commit messages), for now my primary concern is overall design
> > feedback.  Roopa has already provided a lot of input (Thanks!);  the
> > major topic I'm expecting to get discussion on is the bridge FDB
> > changes.  
> 
> Got some useful input;  but still need feedback on the bridge FDB
> changes (first 2 patches).  I don't believe it to have a significant
> impact on existing bridge operation, and I believe a multipoint tunnel
> driver without its own FDB (e.g. NVGRE in this set) should perform
> better than one with its own FDB (e.g. existing VXLAN).
> 
> > P.S.: For a little context on the bridge FDB changes - I'm hoping to
> > find some time to extend this to the MDB to allow aggregating dst
> > metadata and handing down a list of dst metas on TX.  This isn't
> > specifically for VPLS but rather to give sufficient information to the
> > 802.11 stack to allow it to optimize selecting rates (or unicasting)
> > for multicast traffic by having the multicast subscriber list known.
> > This is done by major commercial wifi solutions (e.g. google "dynamic
> > multicast optimization".)  
> 
> You can find hacks at this on:
> https://github.com/eqvinox/vpls-linux-kernel/tree/mdb-hack
> Please note that the patches in that branch are not at an acceptable
> quality level, but you can see the semantic relation to 802.11.
> 
> I would, however, like to point out that this branch has pseudo-working
> IGMP/MLD snooping for VPLS, and it'd be 20-ish lines to add it to NVGRE
> (I'll do that as soon as I get to it, it'll pop up on that branch too.)
> 
> This is relevant to the discussion because it's a feature which is
> non-obvious (to me) on how to do with the VXLAN model of having an
> entirely separate FDB.  Meanwhile, with this architecture, the proof of
> concept / hack is coming in at a measly cost of:
> 8 files changed, 176 insertions(+), 15 deletions(-)
> 
> 
> Cheers,
> 
> -David
> 
> 
> --- diffstat:
> include/linux/netdevice.h  |  18 ++
> include/net/dst_metadata.h |  51 ++---
> include/net/ip_tunnels.h   |   5 ++
> include/uapi/linux/lwtunnel.h  |   8 +++
> include/uapi/linux/neighbour.h |   2 +
> include/uapi/linux/rtnetlink.h |   5 ++
> net/bridge/br.c|   2 +-
> net/bridge/br_device.c |   4 ++
> net/bridge/br_fdb.c| 119 
> net/bridge/br_input.c  |   6 +-
> net/bridge/br_private.h|   6 +-
> net/core/lwtunnel.c|   1 +
> net/ipv4/ip_gre.c  |  40 --
> net/ipv4/ip_tunnel.c   |   1 +
> net/ipv4/ip_tunnel_core.c  |  87 +++--
> net/mpls/Kconfig   |  11 
> net/mpls/Makefile  |   1 +
> net/mpls/af_mpls.c | 113 --
> net/mpls/internal.h|  44 +--
> net/mpls/vpls.c| 550 
> +++
> 20 files changed, 990 insertions(+), 84 deletions(-)

I know the bridge is an easy target to extend L2 forwarding, but it is not
the only option. Have you considered building a new driver (like VXLAN does)
which does the forwarding you want. Having all features in one driver
makes for worse performance, and increased complexity.

[PATCH] once: switch to new jump label API

2017-08-21 Thread Eric Biggers

From: Eric Biggers 

Switch the DO_ONCE() macro from the deprecated jump label API to the new
one.  The new one is more readable, and for DO_ONCE() it also makes the
generated code more icache-friendly: now the one-time initialization
code is placed out-of-line at the jump target, rather than at the inline
fallthrough case.

Signed-off-by: Eric Biggers 
---
 include/linux/once.h | 6 +++---
 lib/once.c   | 8 
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/include/linux/once.h b/include/linux/once.h
index 9c98aaa87cbc..724724918e8b 100644
--- a/include/linux/once.h
+++ b/include/linux/once.h
@@ -5,7 +5,7 @@
 #include 
 
 bool __do_once_start(bool *done, unsigned long *flags);
-void __do_once_done(bool *done, struct static_key *once_key,
+void __do_once_done(bool *done, struct static_key_true *once_key,
unsigned long *flags);
 
 /* Call a function exactly once. The idea of DO_ONCE() is to perform
@@ -38,8 +38,8 @@ void __do_once_done(bool *done, struct static_key *once_key,
({   \
bool ___ret = false; \
static bool ___done = false; \
-   static struct static_key ___once_key = STATIC_KEY_INIT_TRUE; \
-   if (static_key_true(&___once_key)) { \
+   static DEFINE_STATIC_KEY_TRUE(___once_key);  \
+   if (static_branch_unlikely(&___once_key)) {  \
unsigned long ___flags;  \
___ret = __do_once_start(&___done, &___flags);   \
if (unlikely(___ret)) {  \
diff --git a/lib/once.c b/lib/once.c
index 05c8604627eb..831c5a6b0bb2 100644
--- a/lib/once.c
+++ b/lib/once.c
@@ -5,7 +5,7 @@
 
 struct once_work {
struct work_struct work;
-   struct static_key *key;
+   struct static_key_true *key;
 };
 
 static void once_deferred(struct work_struct *w)
@@ -14,11 +14,11 @@ static void once_deferred(struct work_struct *w)
 
work = container_of(w, struct once_work, work);
BUG_ON(!static_key_enabled(work->key));
-   static_key_slow_dec(work->key);
+   static_branch_disable(work->key);
kfree(work);
 }
 
-static void once_disable_jump(struct static_key *key)
+static void once_disable_jump(struct static_key_true *key)
 {
struct once_work *w;
 
@@ -51,7 +51,7 @@ bool __do_once_start(bool *done, unsigned long *flags)
 }
 EXPORT_SYMBOL(__do_once_start);
 
-void __do_once_done(bool *done, struct static_key *once_key,
+void __do_once_done(bool *done, struct static_key_true *once_key,
unsigned long *flags)
__releases(once_lock)
 {
-- 
2.14.1.480.gb18f417b89-goog

Re: [PATCH net-next] net: dsa: User per-cpu 64-bit statistics

2017-08-21 Thread Florian Fainelli

On 08/04/2017 10:11 AM, Eric Dumazet wrote:
> On Fri, 2017-08-04 at 08:51 -0700, Florian Fainelli wrote:
>> On 08/03/2017 10:36 PM, Eric Dumazet wrote:
>>> On Thu, 2017-08-03 at 21:33 -0700, Florian Fainelli wrote:
 During testing with a background iperf pushing 1Gbit/sec worth of
 traffic and having both ifconfig and ethtool collect statistics, we
 could see quite frequent deadlocks. Convert the often accessed DSA slave
 network devices statistics to per-cpu 64-bit statistics to remove these
 deadlocks and provide fast efficient statistics updates.

>>>
>>> This seems to be a bug fix, it would be nice to get a proper tag like :
>>>
>>> Fixes: f613ed665bb3 ("net: dsa: Add support for 64-bit statistics")
>>
>> Right, should have been added, thanks!
>>
>>>
>>> Problem here is that if multiple cpus can call dsa_switch_rcv() at the
>>> same time, then u64_stats_update_begin() contract is not respected.
>>
>> This is really where I struggled understanding what is wrong in the
>> non-per CPU version, my understanding is that we have:
>>
>> - writers for xmit executes in process context
>> - writers for receive executes from NAPI (from the DSA's master network
>> device through its own NAPI doing netif_receive_skb -> netdev_uses_dsa
>> -> netif_receive_skb)
>>
>> readers should all execute in process context. The test scenario that
>> led to a deadlock involved running iperf in the background, having a
>> while loop with both ifconfig and ethtool reading stats, and somehow
>> when iperf exited, either reader would just be locked. So I guess this
>> leaves us with the two writers not being mutually excluded then, right?
> 
> You could add a debug version of u64_stats_update_begin()
> 
> doing 
> 
> int ret = atomic_inc((atomic_t *)syncp);
> 
> BUG_ON(ret & 1);>
> 
> And u64_stats_update_end()
> 
> int ret = atomic_inc((atomic_t *)syncp);

so with your revised suggested patch:

static inline void u64_stats_update_begin(struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
int ret = atomic_inc_return((atomic_t *)syncp);
BUG_ON(ret & 1);
#endif
#if 0
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
write_seqcount_begin(&syncp->seq);
#endif
#endif
}

static inline void u64_stats_update_end(struct u64_stats_sync *syncp)
{
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
int ret = atomic_inc_return((atomic_t *)syncp);
BUG_ON(!(ret & 1));
#endif
#if 0
#if BITS_PER_LONG==32 && defined(CONFIG_SMP)
write_seqcount_end(&syncp->seq);
#endif
#endif
}

and this makes us choke pretty early in IRQ accounting, did I get your
suggestion right?

[0.015149] [ cut here ]
[0.020051] kernel BUG at ./include/linux/u64_stats_sync.h:82!
[0.026221] Internal error: Oops - BUG: 0 [#1] SMP ARM
[0.031661] Modules linked in:
[0.034970] CPU: 0 PID: 0 Comm: swapper/0 Not tainted
4.13.0-rc5-01297-g7d3f0cd43fee-dirty #33
[0.043990] Hardware name: Broadcom STB (Flattened Device Tree)
[0.050237] task: c180a500 task.stack: c180
[0.055065] PC is at irqtime_account_delta+0xa4/0xa8
[0.060322] LR is at 0x1
[0.063057] pc : []lr : [<0001>]psr: 01d3
[0.069652] sp : c1801eec  ip : ee78b458  fp : c0e5ea48
[0.075212] r10: c18b4b40  r9 : f0803000  r8 : ee00a800
[0.080781] r7 : 0001  r6 : c180a500  r5 : c180  r4 : 
[0.087680] r3 :   r2 : ec8c  r1 : ee78b3c0  r0 : ee78b440
[0.094546] Flags: nzcv  IRQs off  FIQs off  Mode SVC_32  ISA ARM
Segment user
[0.102314] Control: 30c5387d  Table: 3000  DAC: fffd
[0.108414] Process swapper/0 (pid: 0, stack limit = 0xc1800210)
[0.114791] Stack: (0xc1801eec to 0xc1802000)
[0.119431] 1ee0:ee78b440 c180
c180a500 0001 c02505c8
[0.128079] 1f00: 0004 ee00a800 e000  
c0227890 c17e6f20 c0278910
[0.136665] 1f20: c185724c c18079a0 f080200c c1801f58 f0802000
c0201494 c0e00c18 2053
[0.145303] 1f40:  c1801f8c  c180 c18b4b40
c020d238  001f
[0.153915] 1f60: 00040d00  efffc940  c18b4b40
c1807440  
[0.162571] 1f80: c18b4b40 c0e5ea48 0004 c1801fa8 c0322fb0
c0e00c18 2053 
[0.171226] 1fa0: c18b4b40    
c0e006c0  
[0.179890] 1fc0:  c1807448 c0e5ea48  
c18b4dd4 c180745c c0e5ea44
[0.188546] 1fe0: c180c0d0 7000 420f00f3  
8090  
[0.197165] [] (irqtime_account_delta) from []
(irqtime_account_irq+0xc0/0xc4)
[0.206664] [] (irqtime_account_irq) from []
(irq_exit+0x28/0x154)
[0.215012] [] (irq_exit) from []
(__handle_domain_irq+0x60/0xb4)
[0.223245] [] (__handle_domain_irq) from []
(gic_handle_irq+0x48/0x8c)
[0.232035] [] (gic_handle_irq) from []
(__irq_svc+0x58/0x74)
[0.239941] Exception stack(0xc1801f58 to 0xc1801fa0)
[

Re: [PATCH net-next,1/4] hv_netvsc: Clean up unused parameter from netvsc_get_hash()

2017-08-21 Thread David Miller


All proper patch series must have a header "[PATCH xxx 0/N]" posting
which explains at a high level what the patch series does, how it does
it, and why it is doing it that way.

Therefore, please resubmit this patch series with a proper header
posting.

Thank you.

Re: [PATCH net] udp: on peeking bad csum, drop packets even if not at head

2017-08-21 Thread Eric Dumazet

On Mon, 2017-08-21 at 17:39 -0400, Willem de Bruijn wrote:
> From: Willem de Bruijn 
> 
> When peeking, if a bad csum is discovered, the skb is unlinked from
> the queue with __sk_queue_drop_skb and the peek operation restarted.
> 
> __sk_queue_drop_skb only drops packets that match the queue head. With
> sk_peek_off, the skb need not be at head, causing the call to fail and
> the same skb to be found again on restart.
> 
> Walk the queue to find the correct skb. Limit the walk to sk_peek_off,
> to bound cycle cost to at most twice the original skb_queue_walk in
> __skb_try_recv_from_queue.
> 
> The operation may race with updates to sk_peek_off. As the operation
> is retried, it will eventually succeed.
> 
> Signed-off-by: Willem de Bruijn 

You forgot the Fixes: tag, that such a bug fix deserves.

I am not a big fan of your patch and would prefer a solution without the
loop.

skb already have ->next and ->prev pointer telling us its position in
the receive queue.

We only need to make sure we are the last owner of this skb before doing
the check (i.e. cancel the kfree_skb() that usually follows the call to
__sk_queue_drop_skb()) of skb->next being NULL or not.

Something like :

 include/net/sock.h  |2 +-
 net/core/datagram.c |   22 ++
 net/ipv4/udp.c  |2 +-
 net/ipv6/udp.c  |2 +-
 4 files changed, 17 insertions(+), 11 deletions(-)

diff --git a/include/net/sock.h b/include/net/sock.h
index aeeec62992ca..6e43bab92d95 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -2030,7 +2030,7 @@ void sk_reset_timer(struct sock *sk, struct timer_list 
*timer,
 void sk_stop_timer(struct sock *sk, struct timer_list *timer);
 
 int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
-   struct sk_buff *skb, unsigned int flags,
+   struct sk_buff **pskb, unsigned int flags,
void (*destructor)(struct sock *sk,
   struct sk_buff *skb));
 int __sock_queue_rcv_skb(struct sock *sk, struct sk_buff *skb);
diff --git a/net/core/datagram.c b/net/core/datagram.c
index a21ca8dee5ea..7e129f91af89 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -353,21 +353,27 @@ void __skb_free_datagram_locked(struct sock *sk, struct 
sk_buff *skb, int len)
 EXPORT_SYMBOL(__skb_free_datagram_locked);
 
 int __sk_queue_drop_skb(struct sock *sk, struct sk_buff_head *sk_queue,
-   struct sk_buff *skb, unsigned int flags,
+   struct sk_buff **pskb, unsigned int flags,
void (*destructor)(struct sock *sk,
   struct sk_buff *skb))
 {
int err = 0;
 
if (flags & MSG_PEEK) {
+   struct sk_buff *skb = *pskb;
+
err = -ENOENT;
spin_lock_bh(_queue->lock);
-   if (skb == skb_peek(sk_queue)) {
-   __skb_unlink(skb, sk_queue);
-   refcount_dec(>users);
-   if (destructor)
-   destructor(sk, skb);
-   err = 0;
+   refcount_dec(>users);
+   *pskb = NULL;
+   if (refcount_dec_if_one(>users)) {
+   if (skb->next) {
+   __skb_unlink(skb, sk_queue);
+   if (destructor)
+   destructor(sk, skb);
+   err = 0;
+   }
+   __kfree_skb(skb);
}
spin_unlock_bh(_queue->lock);
}
@@ -400,7 +406,7 @@ EXPORT_SYMBOL(__sk_queue_drop_skb);
 
 int skb_kill_datagram(struct sock *sk, struct sk_buff *skb, unsigned int flags)
 {
-   int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, skb, flags,
+   int err = __sk_queue_drop_skb(sk, &sk->sk_receive_queue, &skb, flags,
  NULL);
 
kfree_skb(skb);
diff --git a/net/ipv4/udp.c b/net/ipv4/udp.c
index cd1d044a7fa5..b5f90b845a6f 100644
--- a/net/ipv4/udp.c
+++ b/net/ipv4/udp.c
@@ -1648,7 +1648,7 @@ int udp_recvmsg(struct sock *sk, struct msghdr *msg, 
size_t len, int noblock,
return err;
 
 csum_copy_err:
-   if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, skb, flags,
+   if (!__sk_queue_drop_skb(sk, &udp_sk(sk)->reader_queue, &skb, flags,
 udp_skb_destructor)) {
UDP_INC_STATS(sock_net(sk), UDP_MIB_CSUMERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
diff --git a/net/ipv6/udp.c b/net/ipv6/udp.c
index 20039c8501eb..214a973571fd 100644
--- a/net/ipv6/udp.c
+++ b/net/ipv6/udp.c
@@ -465,7 +465,7 @@ int udpv6_recvmsg(struct sock *sk, struct msghdr *msg, 
size_t len,
return err;
 
 csum_copy_err:
-   if (!__sk_queue_drop_skb(sk,

Re: [PATCH net] udp: on peeking bad csum, drop packets even if not at head

2017-08-21 Thread Willem de Bruijn

On Mon, Aug 21, 2017 at 5:39 PM, Willem de Bruijn
 wrote:
> From: Willem de Bruijn 
>
> When peeking, if a bad csum is discovered, the skb is unlinked from
> the queue with __sk_queue_drop_skb and the peek operation restarted.
>
> __sk_queue_drop_skb only drops packets that match the queue head. With
> sk_peek_off, the skb need not be at head, causing the call to fail and
> the same skb to be found again on restart.
>
> Walk the queue to find the correct skb. Limit the walk to sk_peek_off,
> to bound cycle cost to at most twice the original skb_queue_walk in
> __skb_try_recv_from_queue.
>
> The operation may race with updates to sk_peek_off. As the operation
> is retried, it will eventually succeed.
>
> Signed-off-by: Willem de Bruijn 

Eric just suggested an alternative that does not require looping, which
is much nicer.

[PATCH net-next 1/3 v7] net: ether: Add support for multiplexing and aggregation type

2017-08-21 Thread Subash Abhinov Kasiviswanathan

Define the multiplexing and aggregation (MAP) ether type 0xDA1A. This
is needed for receiving data in the MAP protocol like RMNET. This is
not an officially registered ID.

Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 include/uapi/linux/if_ether.h | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/include/uapi/linux/if_ether.h b/include/uapi/linux/if_ether.h
index 5bc9bfd..e80b03f 100644
--- a/include/uapi/linux/if_ether.h
+++ b/include/uapi/linux/if_ether.h
@@ -104,7 +104,9 @@
 #define ETH_P_QINQ30x9300  /* deprecated QinQ VLAN [ NOT AN 
OFFICIALLY REGISTERED ID ] */
 #define ETH_P_EDSA 0xDADA  /* Ethertype DSA [ NOT AN OFFICIALLY 
REGISTERED ID ] */
 #define ETH_P_AF_IUCV   0xFBFB /* IBM af_iucv [ NOT AN OFFICIALLY 
REGISTERED ID ] */
-
+#define ETH_P_MAP   0xDA1A  /* Multiplexing and Aggregation 
Protocol
+* [ NOT AN OFFICIALLY REGISTERED ID ]
+*/
 #define ETH_P_802_3_MIN0x0600  /* If the value in the ethernet 
type is less than this value
 * then the frame is Ethernet II. Else 
it is 802.3 */
 
-- 
1.9.1

[PATCH net-next 2/3 v7] net: arp: Add support for raw IP device

2017-08-21 Thread Subash Abhinov Kasiviswanathan

Define the raw IP type. This is needed for raw IP net devices
like rmnet.

Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 include/uapi/linux/if_arp.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/if_arp.h b/include/uapi/linux/if_arp.h
index cf73510..a2a6356 100644
--- a/include/uapi/linux/if_arp.h
+++ b/include/uapi/linux/if_arp.h
@@ -59,6 +59,7 @@
 #define ARPHRD_LAPB516 /* LAPB */
 #define ARPHRD_DDCMP517/* Digital's DDCMP protocol */
 #define ARPHRD_RAWHDLC 518 /* Raw HDLC */
+#define ARPHRD_RAWIP519/* Raw IP   */
 
 #define ARPHRD_TUNNEL  768 /* IPIP tunnel  */
 #define ARPHRD_TUNNEL6 769 /* IP6IP6 tunnel*/
-- 
1.9.1

[PATCH net-next 3/3 v7] drivers: net: ethernet: qualcomm: rmnet: Initial implementation

2017-08-21 Thread Subash Abhinov Kasiviswanathan

RmNet driver provides a transport agnostic MAP (multiplexing and
aggregation protocol) support in embedded module. Module provides
virtual network devices which can be attached to any IP-mode
physical device. This will be used to provide all MAP functionality
on future hardware in a single consistent location.

Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 Documentation/networking/rmnet.txt |  82 
 drivers/net/ethernet/qualcomm/Kconfig  |   2 +
 drivers/net/ethernet/qualcomm/Makefile |   2 +
 drivers/net/ethernet/qualcomm/rmnet/Kconfig|  12 +
 drivers/net/ethernet/qualcomm/rmnet/Makefile   |  12 +
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 418 +
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h |  54 +++
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c   | 276 ++
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.h   |  26 ++
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h|  88 +
 .../ethernet/qualcomm/rmnet/rmnet_map_command.c| 116 ++
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   | 105 ++
 .../net/ethernet/qualcomm/rmnet/rmnet_private.h|  45 +++
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c| 263 +
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h|  32 ++
 15 files changed, 1533 insertions(+)
 create mode 100644 Documentation/networking/rmnet.txt
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/Kconfig
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/Makefile
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h

diff --git a/Documentation/networking/rmnet.txt 
b/Documentation/networking/rmnet.txt
new file mode 100644
index 000..6b341ea
--- /dev/null
+++ b/Documentation/networking/rmnet.txt
@@ -0,0 +1,82 @@
+1. Introduction
+
+rmnet driver is used for supporting the Multiplexing and aggregation
+Protocol (MAP). This protocol is used by all recent chipsets using Qualcomm
+Technologies, Inc. modems.
+
+This driver can be used to register onto any physical network device in
+IP mode. Physical transports include USB, HSIC, PCIe and IP accelerator.
+
+Multiplexing allows for creation of logical netdevices (rmnet devices) to
+handle multiple private data networks (PDN) like a default internet, tethering,
+multimedia messaging service (MMS) or IP media subsystem (IMS). Hardware sends
+packets with MAP headers to rmnet. Based on the multiplexer id, rmnet
+routes to the appropriate PDN after removing the MAP header.
+
+Aggregation is required to achieve high data rates. This involves hardware
+sending an aggregated bunch of MAP frames. The rmnet driver will de-aggregate
+these MAP frames and send them to the appropriate PDNs.
+
+2. Packet format
+
+a. MAP packet (data / control)
+
+The MAP header has the same endianness as the IP packet.
+
+Packet format -
+
+Bit             0             1           2-7      8 - 15           16 - 31
+Function   Command / Data   Reserved     Pad   Multiplexer ID    Payload length
+Bit            32 - x
+Function     Raw  Bytes
+
+Command (1)/ Data (0) bit value is to indicate if the packet is a MAP command
+or data packet. Control packet is used for transport level flow control. Data
+packets are standard IP packets.
+
+Reserved bits are usually zeroed out and to be ignored by receiver.
+
+Padding is number of bytes to be added for 4 byte alignment if required by
+hardware.
+
+Multiplexer ID is to indicate the PDN on which data has to be sent.
+
+Payload length includes the padding length but does not include MAP header
+length.
+
+b. MAP packet (command specific)
+
+Bit             0             1           2-7      8 - 15           16 - 31
+Function   Command         Reserved     Pad   Multiplexer ID    Payload length
+Bit          32 - 39        40 - 45    46 - 47       48 - 63
+Function   Command name    Reserved   Command Type   Reserved
+Bit          64 - 95
+Function   Transaction ID
+Bit          96 - 127
+Function   Command data
+
+Command 1 indicates disabling flow while 2 is enabling flow
+
+Command types -
+0 for MAP command request
+1 is to acknowledge the receipt of a command
+2 is for unsupported commands
+3 is for error during processing of commands
+
+c. Aggregation
+
+Aggregation is multiple MAP packets (can be data or command)

[PATCH net-next 0/3 v7] Add support for rmnet driver

2017-08-21 Thread Subash Abhinov Kasiviswanathan

This patch adds support for the rmnet driver which is required to
support recent chipsets using Qualcomm Technologies, Inc. modems. The data
from hardware follows the multiplexing and aggregation protocol (MAP).

This driver can be used to register onto any physical network device in
IP mode. Physical transports include USB, HSIC, PCIe and IP accelerator.

rmnet driver helps to decode these packets and queue them to network
stack (and encode and transmit it to the physical device).

--
v1: Same as the RFC patch with some minor fixes for issues reported by
kbuild test robot.

v1->v2: Change datatypes and remove config IOCTL as mentioned by David.
Also fix checkpatch issues and remove some unused code.

v2->v3: Move location to drivers/net and rename to rmnet. Change the
userspace - netlink communication from custom netlink to rtnl_link_ops.
Refactor some code. Use a fixed config for ingress and egress.

v3->v4: Move location to drivers/net/ethernet/qualcomm/.
Fix comments from Stephen and Jiri -
Split the ether and arp type changes into separate patches.
Remove debug and custom logging and switch to standard netdevice log.
Remove module parameters. Refactor and change some code style issues.

v4->v5: Rename some structs and variables. Move the initializer
before the for loop start. Put the arp type in correct sequence.

v5->v6: Fix comments from Dan -
Use the upper link API. As a result, remove all the refcounting logic.
Device refcount is explicitly held on real_dev on rx_handler
registration only. Modify the flow control struct. Remove the unused
ethernet mode handling.

v6->v7: Fix comments from David - Add newline to end of Makefile. Remove
inline from .c files. Move the module init/exit to rmnet config. Fix an
error reported by kbuild test robot for an unused file.

Subash Abhinov Kasiviswanathan (3):
  net: ether: Add support for multiplexing and aggregation type
  net: arp: Add support for raw IP device
  drivers: net: ethernet: qualcomm: rmnet: Initial implementation

 Documentation/networking/rmnet.txt |  82 
 drivers/net/ethernet/qualcomm/Kconfig  |   2 +
 drivers/net/ethernet/qualcomm/Makefile |   2 +
 drivers/net/ethernet/qualcomm/rmnet/Kconfig|  12 +
 drivers/net/ethernet/qualcomm/rmnet/Makefile   |  12 +
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c | 418 +
 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h |  54 +++
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.c   | 276 ++
 .../net/ethernet/qualcomm/rmnet/rmnet_handlers.h   |  26 ++
 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h|  88 +
 .../ethernet/qualcomm/rmnet/rmnet_map_command.c| 116 ++
 .../net/ethernet/qualcomm/rmnet/rmnet_map_data.c   | 105 ++
 .../net/ethernet/qualcomm/rmnet/rmnet_private.h|  45 +++
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c| 263 +
 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h|  32 ++
 include/uapi/linux/if_arp.h|   1 +
 include/uapi/linux/if_ether.h  |   4 +-
 17 files changed, 1537 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/networking/rmnet.txt
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/Kconfig
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/Makefile
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_config.h
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.h
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map.h
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_command.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_map_data.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_private.h
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.c
 create mode 100644 drivers/net/ethernet/qualcomm/rmnet/rmnet_vnd.h

-- 
1.9.1

Re: XDP redirect measurements, gotchas and tracepoints

2017-08-21 Thread Alexei Starovoitov

On Mon, Aug 21, 2017 at 09:25:06PM +0200, Jesper Dangaard Brouer wrote:
> 
> Third gotcha(3): You got this far, loaded xdp on both interfaces, and
> notice now that (with default setup) you can RX with 14Mpps but only
> TX with 6.9Mpps (and might have 5% idle cycles).  I debugged this via
> perf tracepoint event xdp:xdp_redirect, and found this was due to
> overrunning the xdp TX ring-queue size.

we should probably fix this somehow.
Once tx-ing netdev added to devmap we can enable xdp on it automatically?

Re: [PATCH v2] mt7601u: check memory allocation failure

2017-08-21 Thread Jakub Kicinski

On Tue, 22 Aug 2017 00:06:17 +0200, Christophe JAILLET wrote:
> Check memory allocation failure and return -ENOMEM in such a case, as
> already done a few lines below.
> 
> As 'dev->tx_q' can be NULL, we also need to check for that in
> 'mt7601u_free_tx()', and return early.
> 
> Signed-off-by: Christophe JAILLET 

Acked-by: Jakub Kicinski

Re: [PATCH net] net: dsa: skb_put_padto() already frees nskb

2017-08-21 Thread Florian Fainelli

On 08/21/2017 03:15 PM, woojung@microchip.com wrote:
> Florian,
> 
>> -Original Message-
>> From: Florian Fainelli [mailto:f.faine...@gmail.com]
>> Sent: Monday, August 21, 2017 3:42 PM
>> To: netdev@vger.kernel.org
>> Cc: da...@davemloft.net; and...@lunn.ch;
>> vivien.dide...@savoirfairelinux.com; Woojung Huh - C21699; Florian Fainelli
>> Subject: [PATCH net] net: dsa: skb_put_padto() already frees nskb
>>
>> skb_put_padto() already frees the passed sk_buff reference upon error,
>> so calling kfree_skb() on it again is not necessary.
>>
>> Detected by CoverityScan, CID#1416687 ("USE_AFTER_FREE")
>>
>> Fixes: e71cb9e00922 ("net: dsa: ksz: fix skb freeing")
>> Signed-off-by: Florian Fainelli 
>> ---
>>  net/dsa/tag_ksz.c | 4 +---
>>  1 file changed, 1 insertion(+), 3 deletions(-)
>>
>> diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
>> index de66ca8e6201..107172c82107 100644
>> --- a/net/dsa/tag_ksz.c
>> +++ b/net/dsa/tag_ksz.c
>> @@ -60,10 +60,8 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb,
>> struct net_device *dev)
>>   skb_transport_header(skb) - skb-
>>> head);
>>  skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
>>
>> -if (skb_put_padto(nskb, nskb->len + padlen)) {
>> -kfree_skb(nskb);
>> +if (skb_put_padto(nskb, nskb->len + padlen))
>>  return NULL;
>> -}
>>
>>  kfree_skb(skb);
>>  }
>> --
> 
> Because skb_put_padto() frees skb when it fails,  below lines in e71cb9e00922
> ("net: dsa: ksz: fix skb freeing") will be an issue too.
> 
>   if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) {
> + if (skb_put_padto(skb, skb->len + padlen))
> + return NULL;
> +
> 
> When it fails skb will be freed twice in skb_put_padto() and
> caller of dsa_slave_xmit().

You are right, I am not sure what is the best way to fix tag_ksz.c other
than somehow open coding skb_put_padto() minus the freeing on error part?
-- 
Florian

RE: [PATCH net] net: dsa: skb_put_padto() already frees nskb

2017-08-21 Thread Woojung.Huh

Florian,

> -Original Message-
> From: Florian Fainelli [mailto:f.faine...@gmail.com]
> Sent: Monday, August 21, 2017 3:42 PM
> To: netdev@vger.kernel.org
> Cc: da...@davemloft.net; and...@lunn.ch;
> vivien.dide...@savoirfairelinux.com; Woojung Huh - C21699; Florian Fainelli
> Subject: [PATCH net] net: dsa: skb_put_padto() already frees nskb
> 
> skb_put_padto() already frees the passed sk_buff reference upon error,
> so calling kfree_skb() on it again is not necessary.
> 
> Detected by CoverityScan, CID#1416687 ("USE_AFTER_FREE")
> 
> Fixes: e71cb9e00922 ("net: dsa: ksz: fix skb freeing")
> Signed-off-by: Florian Fainelli 
> ---
>  net/dsa/tag_ksz.c | 4 +---
>  1 file changed, 1 insertion(+), 3 deletions(-)
> 
> diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
> index de66ca8e6201..107172c82107 100644
> --- a/net/dsa/tag_ksz.c
> +++ b/net/dsa/tag_ksz.c
> @@ -60,10 +60,8 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb,
> struct net_device *dev)
>skb_transport_header(skb) - skb-
> >head);
>   skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
> 
> - if (skb_put_padto(nskb, nskb->len + padlen)) {
> - kfree_skb(nskb);
> + if (skb_put_padto(nskb, nskb->len + padlen))
>   return NULL;
> - }
> 
>   kfree_skb(skb);
>   }
> --

Because skb_put_padto() frees skb when it fails,  below lines in e71cb9e00922
("net: dsa: ksz: fix skb freeing") will be an issue too.

if (skb_tailroom(skb) >= padlen + KSZ_INGRESS_TAG_LEN) {
+   if (skb_put_padto(skb, skb->len + padlen))
+   return NULL;
+

When it fails skb will be freed twice in skb_put_padto() and
caller of dsa_slave_xmit().

Woojung

Re: [PATCH] mt7601u: check memory allocation failure

2017-08-21 Thread Christophe JAILLET


Le 21/08/2017 à 23:41, Jakub Kicinski a écrit :

On Mon, 21 Aug 2017 14:34:30 -0700, Jakub Kicinski wrote:

On Mon, 21 Aug 2017 22:59:56 +0200, Christophe JAILLET wrote:

Check memory allocation failure and return -ENOMEM in such a case, as
already done a few lines below

Signed-off-by: Christophe JAILLET 

Acked-by: Jakub Kicinski 

Wait, I take that back.  This code is a bit weird.  We would return an
error, then mt7601u_dma_init() will call mt7601u_free_tx_queue() which
doesn't check for tx_q == NULL condition.

Looks like mt7601u_free_tx() has to check for dev->tx_q == NULL and
return early if that's the case.  Or mt7601u_alloc_tx() should really
clean things up on its own on failure.  Ugh.


You are right. Thanks for the review.

I've sent a v2 which updates 'mt7601u_free_tx()'.
Doing so sounds more in line with the spirit of this code.

CJ

Re: [PATCH net-next 03/11] net: dsa: debugfs: add tree

2017-08-21 Thread Florian Fainelli

On 08/14/2017 03:22 PM, Vivien Didelot wrote:
> This commit adds the boiler plate to create a DSA related debug
> filesystem entry as well as a "tree" file, containing the tree index.
> 
> # cat switch1/tree
> 0
> 
> Signed-off-by: Vivien Didelot 

Reviewed-by: Florian Fainelli 
-- 
Florian

[PATCH v2] mt7601u: check memory allocation failure

2017-08-21 Thread Christophe JAILLET

Check memory allocation failure and return -ENOMEM in such a case, as
already done a few lines below.

As 'dev->tx_q' can be NULL, we also need to check for that in
'mt7601u_free_tx()', and return early.

Signed-off-by: Christophe JAILLET 
---
v2: avoid another NULL pointer dereference in 'mt7601u_free_tx()' if the
allocation had failed.
---
 drivers/net/wireless/mediatek/mt7601u/dma.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt7601u/dma.c 
b/drivers/net/wireless/mediatek/mt7601u/dma.c
index 660267b359e4..7f3e3983b781 100644
--- a/drivers/net/wireless/mediatek/mt7601u/dma.c
+++ b/drivers/net/wireless/mediatek/mt7601u/dma.c
@@ -457,6 +457,9 @@ static void mt7601u_free_tx(struct mt7601u_dev *dev)
 {
int i;
 
+   if (!dev->tx_q)
+   return;
+
for (i = 0; i < __MT_EP_OUT_MAX; i++)
mt7601u_free_tx_queue(&dev->tx_q[i]);
 }
@@ -484,6 +487,8 @@ static int mt7601u_alloc_tx(struct mt7601u_dev *dev)
 
dev->tx_q = devm_kcalloc(dev->dev, __MT_EP_OUT_MAX,
 sizeof(*dev->tx_q), GFP_KERNEL);
+   if (!dev->tx_q)
+   return -ENOMEM;
 
for (i = 0; i < __MT_EP_OUT_MAX; i++)
if (mt7601u_alloc_tx_queue(dev, &dev->tx_q[i]))
-- 
2.11.0

[PATCH net-next,1/4] hv_netvsc: Clean up unused parameter from netvsc_get_hash()

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

The parameter "sk" is not in use.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c |4 ++--
 1 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index b33f050..4677d21 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -193,7 +193,7 @@ static int netvsc_close(struct net_device *net)
 /* Azure hosts don't support non-TCP port numbers in hashing yet. We compute
  * hash for non-TCP traffic with only IP numbers.
  */
-static inline u32 netvsc_get_hash(struct sk_buff *skb, struct sock *sk)
+static inline u32 netvsc_get_hash(struct sk_buff *skb)
 {
struct flow_keys flow;
u32 hash;
@@ -227,7 +227,7 @@ static inline int netvsc_get_tx_queue(struct net_device 
*ndev,
struct sock *sk = skb->sk;
int q_idx;
 
-   q_idx = ndc->tx_send_table[netvsc_get_hash(skb, sk) &
+   q_idx = ndc->tx_send_table[netvsc_get_hash(skb) &
   (VRSS_SEND_TAB_SIZE - 1)];
 
/* If queue index changed record the new value */
-- 
1.7.1

[PATCH net-next,2/4] hv_netvsc: Clean up unused parameter from netvsc_get_rss_hash_opts()

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

The parameter "nvdev" is not in use.

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/netvsc_drv.c |5 ++---
 1 files changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index 4677d21..d8612b1 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -1228,8 +1228,7 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
 }
 
 static int
-netvsc_get_rss_hash_opts(struct netvsc_device *nvdev,
-struct ethtool_rxnfc *info)
+netvsc_get_rss_hash_opts(struct ethtool_rxnfc *info)
 {
info->data = RXH_IP_SRC | RXH_IP_DST;
 
@@ -1267,7 +1266,7 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
return 0;
 
case ETHTOOL_GRXFH:
-   return netvsc_get_rss_hash_opts(nvdev, info);
+   return netvsc_get_rss_hash_opts(info);
}
return -EOPNOTSUPP;
 }
-- 
1.7.1

[PATCH net-next,3/4] hv_netvsc: Add ethtool handler to set and get UDP hash levels

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

The patch adds the functions to switch UDP hash level between
L3 and L4 by ethtool command. UDP over IPv4 and v6 can be set
differently. The default hash level is L4. We currently only
allow switching TX hash level from within the guests.

On Azure, fragmented UDP packets have high loss rate with L4
hashing. Using L3 hashing is recommended in this case.

For example, for UDP over IPv4 on eth0:
To include UDP port numbers in hashing:
ethtool -N eth0 rx-flow-hash udp4 sdfn
To exclude UDP port numbers in hashing:
ethtool -N eth0 rx-flow-hash udp4 sd
To show UDP hash level:
ethtool -n eth0 rx-flow-hash udp4

Signed-off-by: Haiyang Zhang 
---
 drivers/net/hyperv/hyperv_net.h |2 +
 drivers/net/hyperv/netvsc_drv.c |   78 +++
 2 files changed, 72 insertions(+), 8 deletions(-)

diff --git a/drivers/net/hyperv/hyperv_net.h b/drivers/net/hyperv/hyperv_net.h
index 9198dd1..ff1c0c8 100644
--- a/drivers/net/hyperv/hyperv_net.h
+++ b/drivers/net/hyperv/hyperv_net.h
@@ -720,6 +720,8 @@ struct net_device_context {
u32 tx_send_table[VRSS_SEND_TAB_SIZE];
 
/* Ethtool settings */
+   bool udp4_l4_hash;
+   bool udp6_l4_hash;
u8 duplex;
u32 speed;
struct netvsc_ethtool_stats eth_stats;
diff --git a/drivers/net/hyperv/netvsc_drv.c b/drivers/net/hyperv/netvsc_drv.c
index d8612b1..c0c4c91 100644
--- a/drivers/net/hyperv/netvsc_drv.c
+++ b/drivers/net/hyperv/netvsc_drv.c
@@ -190,10 +190,12 @@ static int netvsc_close(struct net_device *net)
return ppi;
 }
 
-/* Azure hosts don't support non-TCP port numbers in hashing yet. We compute
- * hash for non-TCP traffic with only IP numbers.
+/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
+ * packets. We can use ethtool to change UDP hash level when necessary.
  */
-static inline u32 netvsc_get_hash(struct sk_buff *skb)
+static inline u32 netvsc_get_hash(
+   struct sk_buff *skb,
+   const struct net_device_context *ndc)
 {
struct flow_keys flow;
u32 hash;
@@ -204,7 +206,11 @@ static inline u32 netvsc_get_hash(struct sk_buff *skb)
if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
return 0;
 
-   if (flow.basic.ip_proto == IPPROTO_TCP) {
+   if (flow.basic.ip_proto == IPPROTO_TCP ||
+   (flow.basic.ip_proto == IPPROTO_UDP &&
+((flow.basic.n_proto == htons(ETH_P_IP) && ndc->udp4_l4_hash) ||
+ (flow.basic.n_proto == htons(ETH_P_IPV6) &&
+  ndc->udp6_l4_hash)))) {
return skb_get_hash(skb);
} else {
if (flow.basic.n_proto == htons(ETH_P_IP))
@@ -227,7 +233,7 @@ static inline int netvsc_get_tx_queue(struct net_device 
*ndev,
struct sock *sk = skb->sk;
int q_idx;
 
-   q_idx = ndc->tx_send_table[netvsc_get_hash(skb) &
+   q_idx = ndc->tx_send_table[netvsc_get_hash(skb, ndc) &
   (VRSS_SEND_TAB_SIZE - 1)];
 
/* If queue index changed record the new value */
@@ -891,6 +897,9 @@ static void netvsc_init_settings(struct net_device *dev)
 {
struct net_device_context *ndc = netdev_priv(dev);
 
+   ndc->udp4_l4_hash = true;
+   ndc->udp6_l4_hash = true;
+
ndc->speed = SPEED_UNKNOWN;
ndc->duplex = DUPLEX_FULL;
 }
@@ -1228,7 +1237,8 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
 }
 
 static int
-netvsc_get_rss_hash_opts(struct ethtool_rxnfc *info)
+netvsc_get_rss_hash_opts(struct net_device_context *ndc,
+struct ethtool_rxnfc *info)
 {
info->data = RXH_IP_SRC | RXH_IP_DST;
 
@@ -1236,9 +1246,20 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
case TCP_V4_FLOW:
case TCP_V6_FLOW:
info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
-   /* fallthrough */
+   break;
+
case UDP_V4_FLOW:
+   if (ndc->udp4_l4_hash)
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
+   break;
+
case UDP_V6_FLOW:
+   if (ndc->udp6_l4_hash)
+   info->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3;
+
+   break;
+
case IPV4_FLOW:
case IPV6_FLOW:
break;
@@ -1266,11 +1287,51 @@ static void netvsc_get_strings(struct net_device *dev, 
u32 stringset, u8 *data)
return 0;
 
case ETHTOOL_GRXFH:
-   return netvsc_get_rss_hash_opts(info);
+   return netvsc_get_rss_hash_opts(ndc, info);
}
return -EOPNOTSUPP;
 }
 
+static int netvsc_set_rss_hash_opts(struct net_device_context *ndc,
+   struct ethtool_rxnfc *info)
+{
+   if (info->data == (RXH_IP_SRC | RXH_IP_DST |
+  RXH_L4_B_0_1 | RXH_L4_B_2_3)) {
+   if

[PATCH net-next,4/4] hv_netvsc: Update netvsc Document for UDP hash level setting

2017-08-21 Thread Haiyang Zhang

From: Haiyang Zhang 

Update Documentation/networking/netvsc.txt for UDP hash level setting
and related info.

Signed-off-by: Haiyang Zhang 
---
 Documentation/networking/netvsc.txt |   22 +-
 1 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/Documentation/networking/netvsc.txt 
b/Documentation/networking/netvsc.txt
index 4ddb4e4..fa8d863 100644
--- a/Documentation/networking/netvsc.txt
+++ b/Documentation/networking/netvsc.txt
@@ -21,11 +21,23 @@ Features
   
   Hyper-V supports receive side scaling. For TCP, packets are
   distributed among available queues based on IP address and port
-  number. Current versions of Hyper-V host, only distribute UDP
-  packets based on the IP source and destination address.
-  The port number is not used as part of the hash value for UDP.
-  Fragmented IP packets are not distributed between queues;
-  all fragmented packets arrive on the first channel.
+  number.
+
+  For UDP, we can switch UDP hash level between L3 and L4 by ethtool
+  command. UDP over IPv4 and v6 can be set differently. The default
+  hash level is L4. We currently only allow switching TX hash level
+  from within the guests.
+
+  On Azure, fragmented UDP packets have high loss rate with L4
+  hashing. Using L3 hashing is recommended in this case.
+
+  For example, for UDP over IPv4 on eth0:
+  To include UDP port numbers in hashing:
+ethtool -N eth0 rx-flow-hash udp4 sdfn
+  To exclude UDP port numbers in hashing:
+ethtool -N eth0 rx-flow-hash udp4 sd
+  To show UDP hash level:
+ethtool -n eth0 rx-flow-hash udp4
 
   Generic Receive Offload, aka GRO
   
-- 
1.7.1

Re: [PATCH] mt7601u: check memory allocation failure

2017-08-21 Thread Jakub Kicinski

On Mon, 21 Aug 2017 14:34:30 -0700, Jakub Kicinski wrote:
> On Mon, 21 Aug 2017 22:59:56 +0200, Christophe JAILLET wrote:
> > Check memory allocation failure and return -ENOMEM in such a case, as
> > already done a few lines below
> > 
> > Signed-off-by: Christophe JAILLET   
> 
> Acked-by: Jakub Kicinski 

Wait, I take that back.  This code is a bit weird.  We would return an
error, then mt7601u_dma_init() will call mt7601u_free_tx_queue() which
doesn't check for tx_q == NULL condition.

Looks like mt7601u_free_tx() has to check for dev->tx_q == NULL and
return early if that's the case.  Or mt7601u_alloc_tx() should really
clean things up on its own on failure.  Ugh.

[PATCH net] udp: on peeking bad csum, drop packets even if not at head

2017-08-21 Thread Willem de Bruijn

From: Willem de Bruijn 

When peeking, if a bad csum is discovered, the skb is unlinked from
the queue with __sk_queue_drop_skb and the peek operation restarted.

__sk_queue_drop_skb only drops packets that match the queue head. With
sk_peek_off, the skb need not be at head, causing the call to fail and
the same skb to be found again on restart.

Walk the queue to find the correct skb. Limit the walk to sk_peek_off,
to bound cycle cost to at most twice the original skb_queue_walk in
__skb_try_recv_from_queue.

The operation may race with updates to sk_peek_off. As the operation
is retried, it will eventually succeed.

Signed-off-by: Willem de Bruijn 

---

Simpler would be to check (skb->csum_complete_sw && !skb->csum_valid)
in __skb_try_recv_from_queue to ignore skbs with bad checksum. But
__udp_lib_checksum_complete does not update those fields if called
while peeking, because the skb is shared. I found no way around that.
---
 net/core/datagram.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/net/core/datagram.c b/net/core/datagram.c
index a21ca8dee5ea..5cf32b2372d3 100644
--- a/net/core/datagram.c
+++ b/net/core/datagram.c
@@ -360,9 +360,17 @@ int __sk_queue_drop_skb(struct sock *sk, struct 
sk_buff_head *sk_queue,
int err = 0;
 
if (flags & MSG_PEEK) {
+   struct sk_buff *lskb;
+   int off = sk_peek_offset(sk, flags);
+
err = -ENOENT;
spin_lock_bh(&sk_queue->lock);
-   if (skb == skb_peek(sk_queue)) {
+   lskb = skb_peek(sk_queue);
+   while (lskb != skb && lskb && off >= lskb->len) {
+   off -= lskb->len;
+   lskb = skb_peek_next(lskb, sk_queue);
+   }
+   if (lskb == skb) {
__skb_unlink(skb, sk_queue);
refcount_dec(&skb->users);
if (destructor)
-- 
2.14.1.480.gb18f417b89-goog

Re: [PATCH] mt7601u: check memory allocation failure

2017-08-21 Thread Jakub Kicinski

On Mon, 21 Aug 2017 22:59:56 +0200, Christophe JAILLET wrote:
> Check memory allocation failure and return -ENOMEM in such a case, as
> already done a few lines below
> 
> Signed-off-by: Christophe JAILLET 

Acked-by: Jakub Kicinski 

Thanks!

Re: [PATCH v3 net-next] bpf/verifier: track liveness for pruning

2017-08-21 Thread Alexei Starovoitov


On 8/21/17 2:00 PM, Daniel Borkmann wrote:

On 08/21/2017 10:44 PM, Edward Cree wrote:

On 21/08/17 21:27, Daniel Borkmann wrote:

On 08/21/2017 08:36 PM, Edward Cree wrote:

On 19/08/17 00:37, Alexei Starovoitov wrote:

[...]

I'm tempted to just rip out env->varlen_map_value_access and always
check
   the whole thing, because honestly I don't know what it was meant
to do
   originally or how it can ever do any useful pruning.  While
drastic, it
   does cause your test case to pass.


Original intention from 484611357c19 ("bpf: allow access into map
value arrays") was that it wouldn't potentially make pruning worse
if PTR_TO_MAP_VALUE_ADJ was not used, meaning that we wouldn't need
to take reg state's min_value and max_value into account for state
checking; this was basically due to min_value / max_value is being
adjusted/tracked on every alu/jmp ops for involved regs (e.g.
adjust_reg_min_max_vals() and others that mangle them) even if we
have the case that no actual dynamic map access is used throughout
the program. To give an example on net tree, the bpf_lxc.o prog's
section increases from 36,386 to 68,226 when
env->varlen_map_value_access
is always true, so it does have an effect. Did you do some checks
on this on net-next?

I tested with the cilium progs and saw no change in insn count.  I
  suspect that for the normal case I already killed this optimisation
  when I did my unification patch, it was previously about ignoring
  min/max values on all regs (including scalars), whereas on net-next
  it only ignores them on map_value pointers; in practice this is
  useless because we tend to still have the offset scalar sitting in
  a register somewhere.  (Come to think of it, this may have been
  behind a large chunk of the #insn increase that my patches caused.)


Yeah, this would seem plausible.


Since we use umax_value in find_good_pkt_pointers() now (to check
  against MAX_PACKET_OFF and ensure our reg->range is really ok), we
  can't just stop caring about all min/max values just because we
  haven't done any variable map accesses.
I don't see a way around this.


Agree, was thinking the same. If there's not really a regression in
terms of complexity, then lets kill the flag.


+1

diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index 2489e67b65f6..908d13b2a2aa 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -3582,7 +3582,7 @@ static int do_check(struct bpf_verifier_env *env)
init_reg_state(regs);
state->parent = NULL;
insn_idx = 0;
-   env->varlen_map_value_access = false;
+   env->varlen_map_value_access = true;

makes _zero_ difference on cilium*.o tests, so let's just kill
that workaround.

Re: [PATCH v3 net-next] bpf/verifier: track liveness for pruning

2017-08-21 Thread Alexei Starovoitov


On 8/21/17 1:24 PM, Edward Cree wrote:

On 18/08/17 15:16, Edward Cree wrote:

On 18/08/17 04:21, Alexei Starovoitov wrote:

It seems you're trying to sort-of do per-fake-basic block liveness
analysis, but our state_list_marks are not correct if we go with
canonical basic block definition, since we mark the jump insn and
not insn after the branch and not every basic block boundary is
properly detected.

I think the reason this works is that jump insns can't do writes.
[snip]
the sl->state will never have any write marks and it'll all just work.
But I should really test that!

I tested this, and found that, no, sl->state can have write marks, and the
 algorithm will get the wrong answer in that case.  So I've got a patch to
 make the first iteration ignore write marks, as part of a series which I
 will post shortly.  When I do so, please re-do your tests with adding
 state_list_marks in strange and exciting places; it should work wherever
 you put them.  Like you say, it "magically doesn't depend on proper basic
 block boundaries", and that's because really pruning is just a kind of
 checkpointing that just happens to be most effective when done just after
 a jump (pop_stack).

Can I have a SOB for your "grr" test program, so I can include it in the
 series?


yes. of course. just give the test some reasonable name :)

Re: [RFC] about net: Fix inconsistent teardown and release of private netdev state.

2017-08-21 Thread David Miller

From: Eric Dumazet 
Date: Fri, 18 Aug 2017 20:40:01 -0700

> Let's look at tun->pcpu_stats, for example.
> 
> It is allocated at line 1831, before the register_netdevice()
> 
> drivers/net/tun.c does not provide ndo_init()

I see the problem now.

And it's done this way because several steps need to occur, with
various kinds of dependencies, before the register_netdevice() call is
made.

I'll see if I can untangle this somehow.

Re: [PATCH net-next 3/3 v6] drivers: net: ethernet: qualcomm: rmnet: Initial implementation

2017-08-21 Thread David Miller

From: Subash Abhinov Kasiviswanathan 
Date: Fri, 18 Aug 2017 23:35:31 -0600

> diff --git a/drivers/net/ethernet/qualcomm/Makefile 
> b/drivers/net/ethernet/qualcomm/Makefile
> index 92fa7c4..c4f38bd 100644
> --- a/drivers/net/ethernet/qualcomm/Makefile
> +++ b/drivers/net/ethernet/qualcomm/Makefile
> @@ -9,3 +9,5 @@ obj-$(CONFIG_QCA7000_UART) += qcauart.o
>  qcauart-objs := qca_uart.o
>  
>  obj-y += emac/
> +
> +obj-$(CONFIG_RMNET) += rmnet/
> \ No newline at end of file

Missing final newline in this file.

> diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c 
> b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
> new file mode 100644
> index 0000000..5338bab
> --- /dev/null
> +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_config.c
 ...
> +static inline int
> +rmnet_is_real_dev_registered(const struct net_device *real_dev)
> +{

Do not declare functions as inline in foo.c files, let the compiler decide.

> +static inline struct rmnet_real_dev_info*
> +__rmnet_get_real_dev_info(const struct net_device *real_dev)
> +{

Likewise.

> diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c 
> b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
> new file mode 100644
> index 0000000..f34fe9e
> --- /dev/null
> +++ b/drivers/net/ethernet/qualcomm/rmnet/rmnet_handlers.c
 ...
> +static inline void rmnet_set_skb_proto(struct sk_buff *skb)
> +{

Likewise.

> diff --git a/drivers/net/ethernet/qualcomm/rmnet/rmnet_main.c 
> b/drivers/net/ethernet/qualcomm/rmnet/rmnet_main.c
> new file mode 100644
> index 0000000..80c3920
 ...
> +/* Startup/Shutdown */
> +
> +static int __init rmnet_init(void)
> +{
> + rmnet_config_init();
> + return 0;
> +}
> +
> +static void __exit rmnet_exit(void)
> +{
> + rmnet_config_exit();
> +}
> +
> +module_init(rmnet_init)
> +module_exit(rmnet_exit)
> +MODULE_LICENSE("GPL v2");

Unless you intend to do something different here, having a completely
separate foo.c file just to invoke functions in another file at module
init and exit time is wasteful.  Just do the module_init()/module_exit()
where the rmnet_config_{init,exit}() functions are.

Re: [PATCH v3 net-next] bpf/verifier: track liveness for pruning

2017-08-21 Thread Daniel Borkmann


On 08/21/2017 10:44 PM, Edward Cree wrote:

On 21/08/17 21:27, Daniel Borkmann wrote:

On 08/21/2017 08:36 PM, Edward Cree wrote:

On 19/08/17 00:37, Alexei Starovoitov wrote:

[...]

I'm tempted to just rip out env->varlen_map_value_access and always check
   the whole thing, because honestly I don't know what it was meant to do
   originally or how it can ever do any useful pruning.  While drastic, it
   does cause your test case to pass.


Original intention from 484611357c19 ("bpf: allow access into map
value arrays") was that it wouldn't potentially make pruning worse
if PTR_TO_MAP_VALUE_ADJ was not used, meaning that we wouldn't need
to take reg state's min_value and max_value into account for state
checking; this was basically due to min_value / max_value is being
adjusted/tracked on every alu/jmp ops for involved regs (e.g.
adjust_reg_min_max_vals() and others that mangle them) even if we
have the case that no actual dynamic map access is used throughout
the program. To give an example on net tree, the bpf_lxc.o prog's
section increases from 36,386 to 68,226 when env->varlen_map_value_access
is always true, so it does have an effect. Did you do some checks
on this on net-next?

I tested with the cilium progs and saw no change in insn count.  I
  suspect that for the normal case I already killed this optimisation
  when I did my unification patch, it was previously about ignoring
  min/max values on all regs (including scalars), whereas on net-next
  it only ignores them on map_value pointers; in practice this is
  useless because we tend to still have the offset scalar sitting in
  a register somewhere.  (Come to think of it, this may have been
  behind a large chunk of the #insn increase that my patches caused.)


Yeah, this would seem plausible.


Since we use umax_value in find_good_pkt_pointers() now (to check
  against MAX_PACKET_OFF and ensure our reg->range is really ok), we
  can't just stop caring about all min/max values just because we
  haven't done any variable map accesses.
I don't see a way around this.


Agree, was thinking the same. If there's not really a regression in
terms of complexity, then let's kill the flag.

[PATCH] mt7601u: check memory allocation failure

2017-08-21 Thread Christophe JAILLET

Check memory allocation failure and return -ENOMEM in such a case, as
already done a few lines below

Signed-off-by: Christophe JAILLET 
---
 drivers/net/wireless/mediatek/mt7601u/dma.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/drivers/net/wireless/mediatek/mt7601u/dma.c 
b/drivers/net/wireless/mediatek/mt7601u/dma.c
index 660267b359e4..fa0173579d32 100644
--- a/drivers/net/wireless/mediatek/mt7601u/dma.c
+++ b/drivers/net/wireless/mediatek/mt7601u/dma.c
@@ -484,6 +484,8 @@ static int mt7601u_alloc_tx(struct mt7601u_dev *dev)
 
dev->tx_q = devm_kcalloc(dev->dev, __MT_EP_OUT_MAX,
 sizeof(*dev->tx_q), GFP_KERNEL);
+   if (!dev->tx_q)
+   return -ENOMEM;
 
for (i = 0; i < __MT_EP_OUT_MAX; i++)
if (mt7601u_alloc_tx_queue(dev, &dev->tx_q[i]))
-- 
2.11.0

Re: [PATCH v3 net-next] bpf/verifier: track liveness for pruning

2017-08-21 Thread Edward Cree

On 21/08/17 21:27, Daniel Borkmann wrote:
> On 08/21/2017 08:36 PM, Edward Cree wrote:
>> On 19/08/17 00:37, Alexei Starovoitov wrote:
> [...]
>> I'm tempted to just rip out env->varlen_map_value_access and always check
>>   the whole thing, because honestly I don't know what it was meant to do
>>   originally or how it can ever do any useful pruning.  While drastic, it
>>   does cause your test case to pass.
>
> Original intention from 484611357c19 ("bpf: allow access into map
> value arrays") was that it wouldn't potentially make pruning worse
> if PTR_TO_MAP_VALUE_ADJ was not used, meaning that we wouldn't need
> to take reg state's min_value and max_value into account for state
> checking; this was basically due to min_value / max_value is being
> adjusted/tracked on every alu/jmp ops for involved regs (e.g.
> adjust_reg_min_max_vals() and others that mangle them) even if we
> have the case that no actual dynamic map access is used throughout
> the program. To give an example on net tree, the bpf_lxc.o prog's
> section increases from 36,386 to 68,226 when env->varlen_map_value_access
> is always true, so it does have an effect. Did you do some checks
> on this on net-next?
I tested with the cilium progs and saw no change in insn count.  I
 suspect that for the normal case I already killed this optimisation
 when I did my unification patch, it was previously about ignoring
 min/max values on all regs (including scalars), whereas on net-next
 it only ignores them on map_value pointers; in practice this is
 useless because we tend to still have the offset scalar sitting in
 a register somewhere.  (Come to think of it, this may have been
 behind a large chunk of the #insn increase that my patches caused.)
Since we use umax_value in find_good_pkt_pointers() now (to check
 against MAX_PACKET_OFF and ensure our reg->range is really ok), we
 can't just stop caring about all min/max values just because we
 haven't done any variable map accesses.
I don't see a way around this.

-Ed

Re: [PATCH net-next] net: sched: Add the invalid handle check in qdisc_class_find

2017-08-21 Thread David Miller

From: gfree.w...@vip.163.com
Date: Fri, 18 Aug 2017 15:23:24 +0800

> From: Gao Feng 
> 
> Add the invalid handle "0" check to avoid unnecessary search, because
> the qdisc uses the skb->priority as the handle value to look up, and
> it is "0" usually.
> 
> Signed-off-by: Gao Feng 

Applied, thanks.

Re: [net-next 1/1] tipc: don't reset stale broadcast send link

2017-08-21 Thread David Miller

From: Jon Maloy 
Date: Mon, 21 Aug 2017 17:59:30 +0200

> When the broadcast send link after 100 attempts has failed to
> transfer a packet to all peers, we consider it stale, and reset
> it. Thereafter it needs to re-synchronize with the peers, something
> currently done by just resetting and re-establishing all links to
> all peers. This has turned out to be overkill, with potentially
> unwanted consequences for the remaining cluster.
> 
> A closer analysis reveals that this can be done much simpler. When
> this kind of failure happens, for reasons that may lie outside the
> TIPC protocol, it is typically only one peer which is failing to
> receive and acknowledge packets. It is hence sufficient to identify
> and reset the links only to that peer to resolve the situation, without
> having to reset the broadcast link at all. This solution entails a much
> lower risk of negative consequences for the own node as well as for
> the overall cluster.
> 
> We implement this change in this commit.
> 
> Reviewed-by: Parthasarathy Bhuvaragan 
> Acked-by: Ying Xue 
> Signed-off-by: Jon Maloy 

Applied, thanks Jon.

Re: [PATCH v3 net-next] bpf/verifier: track liveness for pruning

2017-08-21 Thread Daniel Borkmann


On 08/21/2017 08:36 PM, Edward Cree wrote:

On 19/08/17 00:37, Alexei Starovoitov wrote:

[...]

I'm tempted to just rip out env->varlen_map_value_access and always check
  the whole thing, because honestly I don't know what it was meant to do
  originally or how it can ever do any useful pruning.  While drastic, it
  does cause your test case to pass.


Original intention from 484611357c19 ("bpf: allow access into map
value arrays") was that it wouldn't potentially make pruning worse
if PTR_TO_MAP_VALUE_ADJ was not used, meaning that we wouldn't need
to take reg state's min_value and max_value into account for state
checking; this was basically due to min_value / max_value is being
adjusted/tracked on every alu/jmp ops for involved regs (e.g.
adjust_reg_min_max_vals() and others that mangle them) even if we
have the case that no actual dynamic map access is used throughout
the program. To give an example on net tree, the bpf_lxc.o prog's
section increases from 36,386 to 68,226 when env->varlen_map_value_access
is always true, so it does have an effect. Did you do some checks
on this on net-next?

Re: [PATCH v3 net-next] bpf/verifier: track liveness for pruning

2017-08-21 Thread Edward Cree

On 18/08/17 15:16, Edward Cree wrote:
> On 18/08/17 04:21, Alexei Starovoitov wrote:
>> It seems you're trying to sort-of do per-fake-basic block liveness
>> analysis, but our state_list_marks are not correct if we go with
>> canonical basic block definition, since we mark the jump insn and
>> not insn after the branch and not every basic block boundary is
>> properly detected.
> I think the reason this works is that jump insns can't do writes.
> [snip]
> the sl->state will never have any write marks and it'll all just work.
> But I should really test that!
I tested this, and found that, no, sl->state can have write marks, and the
 algorithm will get the wrong answer in that case.  So I've got a patch to
 make the first iteration ignore write marks, as part of a series which I
 will post shortly.  When I do so, please re-do your tests with adding
 state_list_marks in strange and exciting places; it should work wherever
 you put them.  Like you say, it "magically doesn't depend on proper basic
 block boundaries", and that's because really pruning is just a kind of
 checkpointing that just happens to be most effective when done just after
 a jump (pop_stack).

Can I have a SOB for your "grr" test program, so I can include it in the
 series?

-Ed

Re: [PATCH net-next] net: sched: Add the invalid handle check in qdisc_class_find

2017-08-21 Thread Jamal Hadi Salim

On 17-08-21 03:58 PM, Cong Wang wrote:

On Mon, Aug 21, 2017 at 10:47 AM, David Miller  wrote:

From: gfree.w...@vip.163.com
Date: Fri, 18 Aug 2017 15:23:24 +0800

From: Gao Feng 

Add the invalid handle "0" check to avoid unnecessary search, because
the qdisc uses the skb->priority as the handle value to look up, and
it is "0" usually.

Signed-off-by: Gao Feng 

Jamal, Cong, please review.

If 'id' zero is never hashed into the tables, this change looks
legitimate.

Looks good to me.

Looks good to me as well...

cheers,
jamal

Re: [PATCH RESEND 1/2] net: enable high resolution timer mode to timeout datagram sockets

2017-08-21 Thread Cong Wang

On Fri, Aug 18, 2017 at 11:44 AM, Vallish Vaidyeshwara
 wrote:
> -   *timeo_p = schedule_timeout(*timeo_p);
> +   /* Wait using highres timer */
> +   expires = ktime_add_ns(ktime_get(), jiffies_to_nsecs(*timeo_p));
> +   pre_sched_time = jiffies;
> +   if (schedule_hrtimeout(&expires, HRTIMER_MODE_ABS))

Does this work with MAX_SCHEDULE_TIMEOUT too??

[PATCH net] net/hsr: Check skb_put_padto() return value

2017-08-21 Thread Florian Fainelli

skb_put_padto() will free the sk_buff passed as reference in case of
errors, but we still need to check its return value and decide what to
do.

Detected by CoverityScan, CID#1416688 ("CHECKED_RETURN")

Fixes: ee1c27977284 ("net/hsr: Added support for HSR v1")
Signed-off-by: Florian Fainelli 
---
 net/hsr/hsr_device.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/hsr/hsr_device.c b/net/hsr/hsr_device.c
index 4e7bdb213cd0..172d8309f89e 100644
--- a/net/hsr/hsr_device.c
+++ b/net/hsr/hsr_device.c
@@ -314,7 +314,8 @@ static void send_hsr_supervision_frame(struct hsr_port 
*master,
hsr_sp = skb_put(skb, sizeof(struct hsr_sup_payload));
ether_addr_copy(hsr_sp->MacAddressA, master->dev->dev_addr);
 
-   skb_put_padto(skb, ETH_ZLEN + HSR_HLEN);
+   if (skb_put_padto(skb, ETH_ZLEN + HSR_HLEN))
+   return;
 
hsr_forward_skb(skb, master);
return;
-- 
2.9.3

Re: [PATCH net-next] net: sched: Add the invalid handle check in qdisc_class_find

2017-08-21 Thread Cong Wang

On Mon, Aug 21, 2017 at 10:47 AM, David Miller  wrote:
> From: gfree.w...@vip.163.com
> Date: Fri, 18 Aug 2017 15:23:24 +0800
>
>> From: Gao Feng 
>>
>> Add the invalid handle "0" check to avoid unnecessary search, because
>> the qdisc uses the skb->priority as the handle value to look up, and
>> it is "0" usually.
>>
>> Signed-off-by: Gao Feng 
>
> Jamal, Cong, please review.
>
> If 'id' zero is never hashed into the tables, this change looks
> legitimate.

Looks good to me.

Gao, in the future please Cc maintainers directly, you can
use ./scripts/get_maintainer.pl.

Thanks.

Re: [linux-sunxi] Re: [PATCH 2/4] dt-bindings: add binding for RTL8211E Ethernet PHY

2017-08-21 Thread Florian Fainelli

On 08/21/2017 07:53 AM, icen...@aosc.io wrote:
> 在 2017-05-05 02:29，Florian Fainelli 写道：
>> On 05/04/2017 11:26 AM, Icenowy Zheng wrote:
>>>
>>>
>>> 于 2017年5月5日 GMT+08:00 上午2:21:29, Florian Fainelli
>>>  写到:
 On 05/04/2017 11:10 AM, icen...@aosc.io wrote:
> 在 2017-04-22 08:22，Florian Fainelli 写道：
>> On 04/21/2017 04:24 PM, Icenowy Zheng wrote:
>>> From: Icenowy Zheng 
>>>
>>> Some RTL8211E Ethernet PHY have an issue that needs a workaround
>>> indicated with device tree.
>>>
>>> Add the binding for a property that indicates this workaround.
>>>
>>> Signed-off-by: Icenowy Zheng 
>>> ---
>>>  .../devicetree/bindings/net/realtek,rtl8211e.txt   | 22
>>> ++
>>>  1 file changed, 22 insertions(+)
>>>  create mode 100644
>>> Documentation/devicetree/bindings/net/realtek,rtl8211e.txt
>>>
>>> diff --git
>>> a/Documentation/devicetree/bindings/net/realtek,rtl8211e.txt
>>> b/Documentation/devicetree/bindings/net/realtek,rtl8211e.txt
>>> new file mode 100644
>>> index 000000000000..c1913301bfe8
>>> --- /dev/null
>>> +++ b/Documentation/devicetree/bindings/net/realtek,rtl8211e.txt
>>> @@ -0,0 +1,22 @@
>>> +Realtek RTL8211E Ethernet PHY
>>> +
>>> +One batch of RTL8211E is slight broken, that needs some special
 (and
>>> +full of magic numbers) tweaking in order to make GbE to operate
>>> properly.
>>> +The only well-known board that used the broken batch is Pine64+.
>>> +Configure it through an Ethernet OF device node.
>>> +
>>> +Optional properties:
>>> +
>>> +- realtek,disable-rx-delay:
>>> +  If set, RX delay will be completely disabled (according to
>>> Realtek). This
>>> +  will affect the performance on non-broken boards.
>>> +  default: do not disable RX delay.
>>
>> Please don't introduce custom properties to do that, instead correctly
>> specify the "phy-mode" such that it is e.g: "rgmii-txid" which
 indicates
>> that there should be no RX internal delay, but a TX internal delay
 added
>> by the PHY.
>
> Checked the document, the meaning of "rgmii-txid" is not correct
 here.
>
> This doesn't affect the MAC, and the MAC should still add TX delay.
>
> The definition of "rgmii-txid" in
> Documentation/devicetree/binding/net/ethernet.txt is "RGMII with
> internal TX delay provided by the PHY, the MAC should not add an TX
 delay
> in this case". However, this do not indicate that the MAC doesn't add
 TX
> delay; in fact that just totally disabled the PHY to provide the RX
 delay.
> MAC still should to add delay on both TX/RX, which is the semantic of
> standard "rgmii".
>
> So I cannot use "rgmii-txid" here, but should continue to use this
> custom property.
> 
> Sorry for replying an old email, but it's because the driver of the MAC I
> used is merged (dwmac-sun8i).
> 
> The driver of the MAC currently only supports "mii", "rmii", and "rgmii",
> and according to the SoC's user manual, the MAC cannot have its delays
> disabled.
> 
> How should it handle this "rgmii-txid" here? Just treat it as "rgmii"?

Considering there are no configurable delays on the MAC side, all you
can do is treat all RGMII variants the same by configuring the MAC for
RGMII mode (with no additional capabilities and as opposed to MII, RMII
which are other clocking/data pins modes) and let the PHY configure the
delay accordingly based on "phy-mode"/phy_interface_t. You can use
phy_interface_is_rgmii() as a helper function to cover all 4 variants.
-- 
Florian

Re: [PATCH net] net: dsa: skb_put_padto() already frees nskb

2017-08-21 Thread Andrew Lunn

On Mon, Aug 21, 2017 at 12:41:43PM -0700, Florian Fainelli wrote:
> skb_put_padto() already frees the passed sk_buff reference upon error,
> so calling kfree_skb() on it again is not necessary.
> 
> Detected by CoverityScan, CID#1416687 ("USE_AFTER_FREE")
> 
> Fixes: e71cb9e00922 ("net: dsa: ksz: fix skb freeing")
> Signed-off-by: Florian Fainelli 

Reviewed-by: Andrew Lunn 

Andrew

Re: [PATCH 1/2] vhost: remove the possible fruitless search on iotlb prefetch

2017-08-21 Thread Michael S. Tsirkin

On Sat, Aug 19, 2017 at 03:41:14PM +0900, Koichiro Den wrote:
> Signed-off-by: Koichiro Den 
> ---
>  drivers/vhost/vhost.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
> index e4613a3c362d..93e909afc1c3 100644
> --- a/drivers/vhost/vhost.c
> +++ b/drivers/vhost/vhost.c
> @@ -1184,7 +1184,7 @@ static int iotlb_access_ok(struct vhost_virtqueue *vq,
>   while (len > s) {
>   node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
>  addr,
> -addr + len - 1);
> +addr + len - s - 1);
>   if (node == NULL || node->start > addr) {
>   vhost_iotlb_miss(vq, addr, access);
>   return false;

This works but it probably makes sense to just refactor the code to make end of
range a variable. I posted a patch like this, pls take a look.

> -- 
> 2.9.4
>

[PATCH] vhost: fix end of range for access_ok

2017-08-21 Thread Michael S. Tsirkin

During access_ok checks, addr increases as we iterate over the data
structure, thus addr + len - 1 will point beyond the end of region we
are translating.  Harmless since we then verify that the region covers
addr, but let's not waste cpu cycles.

Reported-by: Koichiro Den 
Signed-off-by: Michael S. Tsirkin 
---

Lightly tested, would appreciate an ack from reporter.

 drivers/vhost/vhost.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
index e4613a3..ecd70e4 100644
--- a/drivers/vhost/vhost.c
+++ b/drivers/vhost/vhost.c
@@ -1176,7 +1176,7 @@ static int iotlb_access_ok(struct vhost_virtqueue *vq,
 {
const struct vhost_umem_node *node;
struct vhost_umem *umem = vq->iotlb;
-   u64 s = 0, size, orig_addr = addr;
+   u64 s = 0, size, orig_addr = addr, last = addr + len - 1;
 
if (vhost_vq_meta_fetch(vq, addr, len, type))
return true;
@@ -1184,7 +1184,7 @@ static int iotlb_access_ok(struct vhost_virtqueue *vq,
while (len > s) {
node = vhost_umem_interval_tree_iter_first(&umem->umem_tree,
   addr,
-  addr + len - 1);
+  last);
if (node == NULL || node->start > addr) {
vhost_iotlb_miss(vq, addr, access);
return false;
-- 
MST

[PATCH net] net: dsa: skb_put_padto() already frees nskb

2017-08-21 Thread Florian Fainelli

skb_put_padto() already frees the passed sk_buff reference upon error,
so calling kfree_skb() on it again is not necessary.

Detected by CoverityScan, CID#1416687 ("USE_AFTER_FREE")

Fixes: e71cb9e00922 ("net: dsa: ksz: fix skb freeing")
Signed-off-by: Florian Fainelli 
---
 net/dsa/tag_ksz.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index de66ca8e6201..107172c82107 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -60,10 +60,8 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct 
net_device *dev)
 skb_transport_header(skb) - skb->head);
skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
 
-   if (skb_put_padto(nskb, nskb->len + padlen)) {
-   kfree_skb(nskb);
+   if (skb_put_padto(nskb, nskb->len + padlen))
return NULL;
-   }
 
kfree_skb(skb);
}
-- 
2.9.3

[PATCH net] net: dsa: skb_put_padto() already frees nskb

2017-08-21 Thread Florian Fainelli

skb_put_padto() already frees the passed sk_buff reference upon error,
so calling kfree_skb() on it again is not necessary.

Detected by CoverityScan, CID#1416687 ("USE_AFTER_FREE")

Fixes: e71cb9e00922 ("net: dsa: ksz: fix skb freeing")
Signed-off-by: Florian Fainelli 
---
 net/dsa/tag_ksz.c | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/net/dsa/tag_ksz.c b/net/dsa/tag_ksz.c
index de66ca8e6201..107172c82107 100644
--- a/net/dsa/tag_ksz.c
+++ b/net/dsa/tag_ksz.c
@@ -60,10 +60,8 @@ static struct sk_buff *ksz_xmit(struct sk_buff *skb, struct 
net_device *dev)
 skb_transport_header(skb) - skb->head);
skb_copy_and_csum_dev(skb, skb_put(nskb, skb->len));
 
-   if (skb_put_padto(nskb, nskb->len + padlen)) {
-   kfree_skb(nskb);
+   if (skb_put_padto(nskb, nskb->len + padlen))
return NULL;
-   }
 
kfree_skb(skb);
}
-- 
2.9.3

[PATCH V2 net-next 1/2] liquidio: move macro definition to a proper place

2017-08-21 Thread Felix Manlunas

From: Veerasenareddy Burru 

The macro LIO_CMD_WAIT_TM is not specific to the PF driver; it can be used
by the VF driver too, so move its definition from a PF-specific header file
to one that's common to PF and VF.

Signed-off-by: Veerasenareddy Burru 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h | 2 --
 drivers/net/ethernet/cavium/liquidio/liquidio_common.h  | 2 ++
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h 
b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
index dee6046..2aba524 100644
--- a/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
+++ b/drivers/net/ethernet/cavium/liquidio/cn23xx_pf_device.h
@@ -24,8 +24,6 @@
 
 #include "cn23xx_pf_regs.h"
 
-#define LIO_CMD_WAIT_TM 100
-
 /* Register address and configuration for a CN23XX devices.
  * If device specific changes need to be made then add a struct to include
  * device specific fields as shown in the commented section
diff --git a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h 
b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
index 906e30a..d0076c1 100644
--- a/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
+++ b/drivers/net/ethernet/cavium/liquidio/liquidio_common.h
@@ -237,6 +237,8 @@ static inline void add_sg_size(struct octeon_sg_entry 
*sg_entry,
 #define   OCTNET_CMD_VLAN_FILTER_ENABLE 0x1
 #define   OCTNET_CMD_VLAN_FILTER_DISABLE 0x0
 
+#define   LIO_CMD_WAIT_TM 100
+
 /* RX(packets coming from wire) Checksum verification flags */
 /* TCP/UDP csum */
 #define   CNNIC_L4SUM_VERIFIED 0x1
-- 
2.9.0

[PATCH V2 net-next 2/2] liquidio: make VF driver notify NIC firmware of MTU change

2017-08-21 Thread Felix Manlunas

From: Veerasenareddy Burru 

Signed-off-by: Veerasenareddy Burru 
Signed-off-by: Felix Manlunas 
---
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 27 ++
 1 file changed, 22 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c 
b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
index 0402b18..2e993ce 100644
--- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
+++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
@@ -1544,14 +1544,31 @@ static struct net_device_stats 
*liquidio_get_stats(struct net_device *netdev)
  */
 static int liquidio_change_mtu(struct net_device *netdev, int new_mtu)
 {
-   struct lio *lio = GET_LIO(netdev);
+   struct octnic_ctrl_pkt nctrl;
+   struct octeon_device *oct;
+   struct lio *lio;
+   int ret = 0;
 
-   lio->mtu = new_mtu;
+   lio = GET_LIO(netdev);
+   oct = lio->oct_dev;
+
+   memset(&nctrl, 0, sizeof(struct octnic_ctrl_pkt));
 
-   netif_info(lio, probe, lio->netdev, "MTU Changed from %d to %d\n",
-  netdev->mtu, new_mtu);
+   nctrl.ncmd.u64 = 0;
+   nctrl.ncmd.s.cmd = OCTNET_CMD_CHANGE_MTU;
+   nctrl.ncmd.s.param1 = new_mtu;
+   nctrl.iq_no = lio->linfo.txpciq[0].s.q_no;
+   nctrl.wait_time = LIO_CMD_WAIT_TM;
+   nctrl.netpndev = (u64)netdev;
+   nctrl.cb_fn = liquidio_link_ctrl_cmd_completion;
+
+   ret = octnet_send_nic_ctrl_pkt(lio->oct_dev, &nctrl);
+   if (ret < 0) {
+   dev_err(&oct->pci_dev->dev, "Failed to set MTU\n");
+   return -EIO;
+   }
 
-   netdev->mtu = new_mtu;
+   lio->mtu = new_mtu;
 
return 0;
 }
-- 
2.9.0

[PATCH V2 net-next 0/2] liquidio: VF driver will notify NIC firmware of MTU change

2017-08-21 Thread Felix Manlunas

From: Veerasenareddy Burru 

Make VF driver notify NIC firmware of MTU change.  Firmware needs this
information for MTU propagation and enforcement.

The first patch in this series moves a macro definition to a proper place
to prevent a build error in the second patch which has the code that sends
the notification.

Change Log:
  V1 -> V2
* Add "From:" line to patch #1 and #2 to give credit to the author.
* In patch #2, order local variable declarations from longest to
  shortest line.

Veerasenareddy Burru (2):
  liquidio: move macro definition to a proper place
  liquidio: make VF driver notify NIC firmware of MTU change

 .../ethernet/cavium/liquidio/cn23xx_pf_device.h|  2 --
 drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 27 ++
 .../net/ethernet/cavium/liquidio/liquidio_common.h |  2 ++
 3 files changed, 24 insertions(+), 7 deletions(-)

-- 
2.9.0

XDP redirect measurements, gotchas and tracepoints

2017-08-21 Thread Jesper Dangaard Brouer


I've been playing with the latest XDP_REDIRECT feature, that was
accepted in net-next (for ixgbe), see merge commit[1].
 [1] https://git.kernel.org/davem/net-next/c/6093ec2dc31

At a first glance the performance looks awesome, and it is(!) when
your system is tuned for this workload. When perfectly tuned I can
show 13,096,427 pps forwarding, which is very close to 10Gbit/s
wirespeed at 64bytes (14.88Mpps).  Using only a single CPU (E5-1650 v4
@3.60GHz) core.

First gotcha(1): be aware of what you measure.  The reported numbers from
xdp_redirect_map are how many packets the XDP program received.  It
has no info on whether the packet was actually transmitted out.  This
info is available via TX counters[2] or an xdp tracepoint.

[2] ethtool_stats:

https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl

Second gotcha(2): you cannot TX out of a device unless it also has an
xdp bpf program attached. (This is an implicit dependency, as the
driver code needs to set up XDP resources before it can ndo_xdp_xmit).

Third gotcha(3): You got this far, loaded xdp on both interfaces, and
notice now that (with default setup) you can RX with 14Mpps but only
TX with 6.9Mpps (and might have 5% idle cycles).  I debugged this via
perf tracepoint event xdp:xdp_redirect, and found this was due to
overrunning the xdp TX ring-queue size.

 Thus, for this workload, we need to adjust either the TX ring-queue
size (ethtool -G) or the DMA completion interval (ethtool -C rx-usecs).
See tuning and measurements below signature.

Fourth gotcha(4): Monitoring XDP redirect performance via the
tracepoint xdp:xdp_redirect is too slow, and affects the measurements
themselves.  I'm working on optimizing these tracepoints, and will
share results tomorrow.

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


No-tuning (default auto-tuning rx-usecs 1):
 Notice tx_packets is too low compared to RX

Show adapter(s) (ixgbe1 ixgbe2) statistics (ONLY that changed!)
Ethtool(ixgbe1  ) stat: 14720134 ( 14,720,134) <= fdir_miss /sec
Ethtool(ixgbe1  ) stat:874951205 (874,951,205) <= rx_bytes /sec
Ethtool(ixgbe1  ) stat:952434290 (952,434,290) <= rx_bytes_nic /sec
Ethtool(ixgbe1  ) stat:   271737 (271,737) <= rx_missed_errors /sec
Ethtool(ixgbe1  ) stat:27631 ( 27,631) <= rx_no_dma_resources 
/sec
Ethtool(ixgbe1  ) stat: 14582520 ( 14,582,520) <= rx_packets /sec
Ethtool(ixgbe1  ) stat: 14610072 ( 14,610,072) <= rx_pkts_nic /sec
Ethtool(ixgbe1  ) stat:874947566 (874,947,566) <= rx_queue_2_bytes /sec
Ethtool(ixgbe1  ) stat: 14582459 ( 14,582,459) <= rx_queue_2_packets 
/sec
Ethtool(ixgbe2  ) stat:417934735 (417,934,735) <= tx_bytes /sec
Ethtool(ixgbe2  ) stat:445801114 (445,801,114) <= tx_bytes_nic /sec
Ethtool(ixgbe2  ) stat:  6965579 (  6,965,579) <= tx_packets /sec
Ethtool(ixgbe2  ) stat:  6965771 (  6,965,771) <= tx_pkts_nic /sec


Tuned with rx-usecs 25:
 ethtool -C ixgbe1 rx-usecs 25 ;\
 ethtool -C ixgbe2 rx-usecs 25

Show adapter(s) (ixgbe1 ixgbe2) statistics (ONLY that changed!)
Ethtool(ixgbe1  ) stat: 14123764 ( 14,123,764) <= fdir_miss /sec
Ethtool(ixgbe1  ) stat:786101618 (786,101,618) <= rx_bytes /sec
Ethtool(ixgbe1  ) stat:952807289 (952,807,289) <= rx_bytes_nic /sec
Ethtool(ixgbe1  ) stat:  1047989 (  1,047,989) <= rx_missed_errors /sec
Ethtool(ixgbe1  ) stat:   737938 (737,938) <= rx_no_dma_resources 
/sec
Ethtool(ixgbe1  ) stat: 13101694 ( 13,101,694) <= rx_packets /sec
Ethtool(ixgbe1  ) stat: 13839620 ( 13,839,620) <= rx_pkts_nic /sec
Ethtool(ixgbe1  ) stat:786101618 (786,101,618) <= rx_queue_2_bytes /sec
Ethtool(ixgbe1  ) stat: 13101694 ( 13,101,694) <= rx_queue_2_packets 
/sec
Ethtool(ixgbe2  ) stat:785785590 (785,785,590) <= tx_bytes /sec
Ethtool(ixgbe2  ) stat:838179358 (838,179,358) <= tx_bytes_nic /sec
Ethtool(ixgbe2  ) stat: 13096427 ( 13,096,427) <= tx_packets /sec
Ethtool(ixgbe2  ) stat: 13096519 ( 13,096,519) <= tx_pkts_nic /sec

Tuned with adjusting ring-queue sizes:
 ethtool -G ixgbe1 rx 1024 tx 1024 ;\
 ethtool -G ixgbe2 rx 1024 tx 1024

Show adapter(s) (ixgbe1 ixgbe2) statistics (ONLY that changed!)
Ethtool(ixgbe1  ) stat: 14169252 ( 14,169,252) <= fdir_miss /sec
Ethtool(ixgbe1  ) stat:783666937 (783,666,937) <= rx_bytes /sec
Ethtool(ixgbe1  ) stat:957332815 (957,332,815) <= rx_bytes_nic /sec
Ethtool(ixgbe1  ) stat:  1053052 (  1,053,052) <= rx_missed_errors /sec
Ethtool(ixgbe1  ) stat:   844113 (844,113) <= rx_no_dma_resources 
/sec
Ethtool(ixgbe1  ) stat: 13061116 ( 13,061,116) <= rx_packets /sec
Ethtool(ixgbe1  ) stat: 13905221 ( 13,905,221) <= rx_pkts_nic /sec
Ethtool(ixgbe1  ) stat:783666937 (783,666,937) <= rx_queue_2_bytes /sec

Re: [RFC PATCH] dt-binding: net: sfp binding documentation

2017-08-21 Thread Rob Herring

On Mon, Aug 21, 2017 at 10:06 AM, Baruch Siach  wrote:
> Hi Russell,
>
> On Mon, Aug 21, 2017 at 01:53:17PM +0100, Russell King - ARM Linux wrote:
>> On Sun, Aug 20, 2017 at 01:28:06PM +0300, Baruch Siach wrote:
>> > Add device-tree binding documentation SFP transceivers. Support for SFP
>> > transceivers has been recently introduced (drivers/net/phy/sfp.c).
>> >
>> > Signed-off-by: Baruch Siach 
>> > ---
>> >
>> > The SFP driver is on net-next.
>> >
>> > Not sure about the rate-select-gpio property name. The SFP+ standard
>> > (not supported yet) uses two signals, RS0 and RS1. RS0 is compatible
>> > with the SFP rate select signal, while RS1 controls the Tx rate.
>>
>> SFP+ is usable with this, but the platforms I have do not wire the
>> rate select pins on the SFP+ sockets to GPIOs, but hard-wire them.
>
> So maybe naming this signal 'rate-select0-gpio' would make it more future
> (SPF+) proof? Or 'rate-select-rx-gpio'?

Just extend it by making it an array of 2 gpios.

Rob

Re: [RFC PATCH] dt-binding: net: sfp binding documentation

2017-08-21 Thread Rob Herring

On Sun, Aug 20, 2017 at 5:28 AM, Baruch Siach  wrote:
> Add device-tree binding documentation SFP transceivers. Support for SFP
> transceivers has been recently introduced (drivers/net/phy/sfp.c).
>
> Signed-off-by: Baruch Siach 
> ---
>
> The SFP driver is on net-next.
>
> Not sure about the rate-select-gpio property name. The SFP+ standard
> (not supported yet) uses two signals, RS0 and RS1. RS0 is compatible
> with the SFP rate select signal, while RS1 controls the Tx rate.
> ---
>  Documentation/devicetree/bindings/net/sff-sfp.txt | 24 
> +++
>  1 file changed, 24 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/sff-sfp.txt
>
> diff --git a/Documentation/devicetree/bindings/net/sff-sfp.txt 
> b/Documentation/devicetree/bindings/net/sff-sfp.txt
> new file mode 100644
> index 000000000000..f0c27bc3925e
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/sff-sfp.txt
> @@ -0,0 +1,24 @@
> +Small Form Factor (SFF) Committee Small Form-factor Pluggable (SFP)
> +Transceiver
> +
> +Required properties:
> +
> +- compatible : must be "sff,sfp"

Need to document "sff" vendor prefix.

Kind of a short name, but I guess it is sufficient. Are there
revisions of the standard (not SFP+) or more than one form factor (I
don't recall any)?

> +
> +Optional Properties:
> +
> +- i2c-bus : phandle of an I2C bus controller for the SFP two wire serial
> +  interface

Why not a child of the i2c bus it is on? IOW, what should this be a child of?

> +
> +- moddef0-gpio : phandle of the MOD-DEF0 (AKA Mod_ABS) module presence input
> +  gpio signal

mod-def0-gpios?

> +
> +- los-gpio : phandle of the Receiver Loss of Signal Indication input gpio
> +  signal
> +
> +- tx-fault-gpio : phandle of the Module Transmitter Fault input gpio signal
> +
> +- tx-disable-gpio : phandle of the Transmitter Disable output gpio signal
> +
> +- rate-select-gpio : phandle of the Rx Signaling Rate Select (AKA RS0) output
> +  gpio

-gpios is the preferred form for all of these.

> --
> 2.14.1
>
> --
> To unsubscribe from this list: send the line "unsubscribe devicetree" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html

Re: [PATCH v3 net-next] bpf/verifier: track liveness for pruning

2017-08-21 Thread Edward Cree

On 19/08/17 00:37, Alexei Starovoitov wrote:
> that '14: safe' above is not correct.
>
> Disabling liveness as:
> @@ -3282,7 +3288,7 @@ static bool regsafe(struct bpf_reg_state *rold,
> struct bpf_reg_state *rcur,
> bool varlen_map_access, struct idpair *idmap)
>  {
> -   if (!(rold->live & REG_LIVE_READ))
> +   if (0 && !(rold->live & REG_LIVE_READ))
>
> makes the test work properly and proper verifier output is:
> from 9 to 11: R0=map_value(id=0,off=0,ks=8,vs=48,imm=0) 
> R1=inv(id=0,smax_value=10) R2=inv11 R10=fp0
> 11: (64) (u32) r1 <<= (u32) 2
> 12: (0f) r0 += r1
> 13: (05) goto pc+0
> 14: (7a) *(u64 *)(r0 +0) = 4
>
> R0=map_value(id=0,off=0,ks=8,vs=48,umax_value=17179869180,var_off=(0x0; 
> 0x3fffc)) R1=inv(id=0,umax_value=17179869180,var_off=(0x0; 0x3fffc)) 
> R2=inv11 R10=fp0
> R0 unbounded memory access, make sure to bounds check any array access into a 
> map
>
> I don't yet understand the underlying reason. R0 should have been
> marked as LIVE_READ by ST_MEM...
> Please help debug it further.
>
Having added a bunch of debugging, I found out that indeed R0 _had_ been
 marked as LIVE_READ.  The problem was that env->varlen_map_value_access
 wasn't set, because the access was at a constant offset (imm=0), but then
 when we compare register states we just say "oh yeah, it's a map_value,
 we don't need to look at the var_off".
This probably results from my unifying PTR_TO_MAP_VALUE with
 PTR_TO_MAP_VALUE_ADJ; before that the old and new R0 would have different
 reg->type so wouldn't match.
I'm tempted to just rip out env->varlen_map_value_access and always check
 the whole thing, because honestly I don't know what it was meant to do
 originally or how it can ever do any useful pruning.  While drastic, it
 does cause your test case to pass.

I'm not quite sure why your test passed when you disabled liveness, though;
 that I can't explain.

-Ed

Re: [PATCH RESEND 0/2] enable hires timer to timeout datagram socket

2017-08-21 Thread Vallish Vaidyeshwara

On Sun, Aug 20, 2017 at 01:47:45AM +, Vallish Vaidyeshwara wrote:
> On Sat, Aug 19, 2017 at 08:21:45AM +0200, Richard Cochran wrote:
> > On Fri, Aug 18, 2017 at 10:27:56PM +, Vallish Vaidyeshwara wrote:
> > > We have a on-demand application that uses long timeouts and needs to 
> > > react to
> > > events within milliseconds.
> >
> 
> Hello Richard,
> 
> > Huh?  The test program you posted does not react to any event.
> >
> 
> Application has logic for complex events and test program is kept simple to
> highlight the change in behavior seen with system calls.
>

Hello Richard,

AWS Lambda is affected by this change in system call
behavior. The following links have more information:
https://en.wikipedia.org/wiki/AWS_Lambda
https://aws.amazon.com/lambda/

Thanks.
-Vallish

Re: [PATCH net-next 2/2] liquidio: make VF driver notify NIC firmware of MTU change

2017-08-21 Thread David Miller

From: Felix Manlunas 
Date: Fri, 18 Aug 2017 11:35:20 -0700

> Signed-off-by: Veerasenareddy Burru 
> Signed-off-by: Felix Manlunas 
> ---
>  drivers/net/ethernet/cavium/liquidio/lio_vf_main.c | 22 
> ++
>  1 file changed, 18 insertions(+), 4 deletions(-)
> 
> diff --git a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c 
> b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
> index 0402b18..e947783 100644
> --- a/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
> +++ b/drivers/net/ethernet/cavium/liquidio/lio_vf_main.c
> @@ -1545,13 +1545,27 @@ static struct net_device_stats 
> *liquidio_get_stats(struct net_device *netdev)
>  static int liquidio_change_mtu(struct net_device *netdev, int new_mtu)
>  {
>   struct lio *lio = GET_LIO(netdev);
> + struct octeon_device *oct = lio->oct_dev;
> + struct octnic_ctrl_pkt nctrl;
> + int ret = 0;

Please order local variable declarations from longest to shortest line.

Re: [PATCHv3 net-next] gre: introduce native tunnel support for ERSPAN

2017-08-21 Thread David Miller

From: William Tu 
Date: Fri, 18 Aug 2017 06:24:40 -0700

> +static int erspan_rcv(struct sk_buff *skb, struct tnl_ptk_info *tpi,
> +   int gre_hdr_len)
> +{
> + struct net *net = dev_net(skb->dev);
> + struct ip_tunnel_net *itn;
> + struct ip_tunnel *tunnel;
> + struct metadata_dst *tun_dst = NULL;
> + const struct iphdr *iph;
> + struct erspanhdr *ershdr;
> + __be32 index;
> + __be32 session_id;
> + int len;

Please order local variables from longest to shortest line, ie. reverse
christmas tree format.

> +
> + itn = net_generic(net, erspan_net_id);
> + iph = ip_hdr(skb);
> + len =  iph->ihl * 4 + gre_hdr_len + sizeof(*ershdr);
> +
> + if (unlikely(!pskb_may_pull(skb, len)))
> + return -ENOMEM;

I think the len passed here is wrong, it should be
"gre_hdr_len + sizeof(*ershdr)".
> +static void erspan_build_header(struct sk_buff *skb,
> + __be32 id, u32 index, bool truncate)
> +{
> + struct erspanhdr *ershdr;
> + struct iphdr *iphdr = ip_hdr(skb);
> + struct ethhdr *eth = eth_hdr(skb);
> + struct qtag_prefix {
> + __be16 eth_type;
> + __be16 tci;
> + } *qp;
> + u16 vlan_tci = 0;
> + enum erspan_encap_type enc_type = ERSPAN_ENCAP_NOVLAN;

Reverse christmas tree for the local variables, please.

> +static int erspan_validate(struct nlattr *tb[], struct nlattr *data[],
> +struct netlink_ext_ack *extack)
> +{
> + int ret;
> + __be16 flags = 0;

Likewise.

Re: [PATCH net-next] net: check type when freeing metadata dst

2017-08-21 Thread David Miller

From: David Lamparter 
Date: Fri, 18 Aug 2017 14:31:35 +0200

> Commit 3fcece12bc1b ("net: store port/representator id in metadata_dst")
> added a new type field to metadata_dst, but metadata_dst_free() wasn't
> updated to check it before freeing the METADATA_IP_TUNNEL specific dst
> cache entry.
> 
> This is not currently causing problems since it's far enough back in the
> struct to be zeroed for the only other type currently in existance
> (METADATA_HW_PORT_MUX), but nevertheless it's not correct.
> 
> Fixes: 3fcece12bc1b ("net: store port/representator id in metadata_dst")
> Signed-off-by: David Lamparter 

Applied.

1 2 3 >

1 - 100 of 295 matches

Mail list logo