date:20160209

Re: [PATCH v2 6/6] dt-bindings: net: ks8995: add bindings documentation for ks8995

2016-02-09 Thread Helmut Buchsbaum


On 02/08/2016 07:44 PM, Sergei Shtylyov wrote:

Hello.

On 02/08/2016 08:35 PM, Helmut Buchsbaum wrote:


Signed-off-by: Helmut Buchsbaum 
---
  .../devicetree/bindings/net/micrel-ks8995.txt| 20

  1 file changed, 20 insertions(+)
  create mode 100644
Documentation/devicetree/bindings/net/micrel-ks8995.txt

diff --git a/Documentation/devicetree/bindings/net/micrel-ks8995.txt
b/Documentation/devicetree/bindings/net/micrel-ks8995.txt
new file mode 100644
index 000..7f11ca6
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/micrel-ks8995.txt
@@ -0,0 +1,20 @@
+Micrel KS8995 SPI controlled Ethernet Switch families
+
+Required properties (according to spi-bus.txt):
+- compatible: either "micrel,ks8995", "micrel,ksz8864" or
"micrel,ksz8795"
+
+Optional properties:
+- reset-gpios : phandle of gpio that will be used to reset chip
during probe
+
+Example:
+
+spi-master {
+...
+ksz8795 {


ePAPR tells to name the nodes generically, according to their function.


+compatible = "micrel,ksz8795";
+
+reg = <0>;
+spi-max-frequency = <5000>;
+reset-gpios = <&gpio0 46 1>;
+};
+};


MBR, Sergei


Hello Sergei,

just to avoid any misunderstandings: you refer to ePAPR, ch. 2.2.1 Node 
Names. You're definitely right, I'll correct the naming in my example!


Thanks,
Helmut

Re: [PATCH v2 3/6] net: phy: spi_ks8995: add support for resetting switch using GPIO

2016-02-09 Thread Andrew Lunn

On Mon, Feb 08, 2016 at 06:35:34PM +0100, Helmut Buchsbaum wrote:
> When using device tree it is no more possible to reset the PHY at board
> level. Furthermore, doing in the driver allows to power down the switch
> when it is not used any more.
> 
> The patch introduces a new optional property "reset-gpios" denoting an
> appropriate GPIO handle, e.g.:
> 
> reset-gpios = <&gpio0 46 1>

Hi Helmut

Since you are respinning, please change the 1 to GPIO_ACTIVE_LOW.

Reviewed-by: Andrew Lunn 

Thanks
Andrew

> 
> Signed-off-by: Helmut Buchsbaum 
> ---
>  drivers/net/phy/spi_ks8995.c | 71 
> ++--
>  1 file changed, 62 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c
> index 2803c8e..04d468f 100644
> --- a/drivers/net/phy/spi_ks8995.c
> +++ b/drivers/net/phy/spi_ks8995.c
> @@ -18,6 +18,9 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
> +#include 
>  
>  #include 
>  
> @@ -120,7 +123,8 @@ static const struct ks8995_chip_params ks8995_chip[] = {
>  };
>  
>  struct ks8995_pdata {
> - /* not yet implemented */
> + int reset_gpio;
> + enum of_gpio_flags reset_gpio_flags;
>  };
>  
>  struct ks8995_switch {
> @@ -339,6 +343,24 @@ err_out:
>   return err;
>  }
>  
> +/* ks8995_parse_dt - setup platform data from devicetree
> + * @ks: pointer to switch instance
> + *
> + * Parses supported DT properties and sets up platform data
> + * accordingly.
> + */
> +static void ks8995_parse_dt(struct ks8995_switch *ks)
> +{
> + struct device_node *np = ks->spi->dev.of_node;
> + struct ks8995_pdata *pdata = ks->pdata;
> +
> + if (!np)
> + return;
> +
> + pdata->reset_gpio = of_get_named_gpio_flags(np, "reset-gpios", 0,
> + &pdata->reset_gpio_flags);
> +}
> +
>  static const struct bin_attribute ks8995_registers_attr = {
>   .attr = {
>   .name   = "registers",
> @@ -352,14 +374,10 @@ static const struct bin_attribute ks8995_registers_attr 
> = {
>  /*  
> */
>  static int ks8995_probe(struct spi_device *spi)
>  {
> - struct ks8995_switch*ks;
> - struct ks8995_pdata *pdata;
> - int err;
> + struct ks8995_switch *ks;
> + int err;
>   int variant = spi_get_device_id(spi)->driver_data;
>  
> - /* Chip description */
> - pdata = spi->dev.platform_data;
> -
>   if (variant >= max_variant) {
>   dev_err(&spi->dev, "bad chip variant %d\n", variant);
>   return -ENODEV;
> @@ -370,10 +388,42 @@ static int ks8995_probe(struct spi_device *spi)
>   return -ENOMEM;
>  
>   mutex_init(&ks->lock);
> - ks->pdata = pdata;
>   ks->spi = spi_dev_get(spi);
>   ks->chip = &ks8995_chip[variant];
>  
> + if (ks->spi->dev.of_node) {
> + ks->pdata = devm_kzalloc(&spi->dev, sizeof(*ks->pdata),
> +  GFP_KERNEL);
> + if (!ks->pdata)
> + return -ENOMEM;
> +
> + ks->pdata->reset_gpio = -1;
> +
> + ks8995_parse_dt(ks);
> + }
> +
> + if (!ks->pdata)
> + ks->pdata = spi->dev.platform_data;
> +
> + /* de-assert switch reset */
> + if (ks->pdata && gpio_is_valid(ks->pdata->reset_gpio)) {
> + unsigned long flags;
> +
> + flags = (ks->pdata->reset_gpio_flags == OF_GPIO_ACTIVE_LOW ?
> +  GPIOF_ACTIVE_LOW : 0);
> +
> + err = devm_gpio_request_one(&spi->dev,
> + ks->pdata->reset_gpio,
> + flags, "switch-reset");
> + if (err) {
> + dev_err(&spi->dev,
> + "failed to get reset-gpios: %d\n", err);
> + return -EIO;
> + }
> +
> + gpiod_set_value(gpio_to_desc(ks->pdata->reset_gpio), 0);
> + }
> +
>   spi_set_drvdata(spi, ks);
>  
>   spi->mode = SPI_MODE_0;
> @@ -414,11 +464,14 @@ static int ks8995_remove(struct spi_device *spi)
>  
>   sysfs_remove_bin_file(&spi->dev.kobj, &ks->regs_attr);
>  
> + /* assert reset */
> + if (ks->pdata && gpio_is_valid(ks->pdata->reset_gpio))
> + gpiod_set_value(gpio_to_desc(ks->pdata->reset_gpio), 1);
> +
>   return 0;
>  }
>  
>  /*  
> */
> -
>  static struct spi_driver ks8995_driver = {
>   .driver = {
>   .name   = "spi-ks8995",
> -- 
> 2.1.4
>

Re: [PATCH net v3 2/4] net: add rx_nohandler stat counter

2016-02-09 Thread David Miller

From: Eric Dumazet 
Date: Mon, 08 Feb 2016 14:57:40 -0800

> Whole point of TLV is that it allows us to add new fields at the end of
> the structures.
 ...
> Look at iproute2, you were the one adding in 2004 code to cope with
> various tcp_info sizes.
> 
> So 12 years later, you cannot say it does not work anymore.

+1

Re: [PATCH net] tcp: do not drop syn_recv on all icmp reports

2016-02-09 Thread David Miller

From: Eric Dumazet 
Date: Tue, 02 Feb 2016 19:31:12 -0800

> From: Eric Dumazet 
> 
> Petr Novopashenniy reported that ICMP redirects on SYN_RECV sockets
> were leading to RST.
> 
> This is of course incorrect.
> 
> A specific list of ICMP messages should be able to drop a SYN_RECV.
> 
> For instance, a REDIRECT on SYN_RECV shall be ignored, as we do
> not hold a dst per SYN_RECV pseudo request.
> 
> Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=111751
> Fixes: 079096f103fa ("tcp/dccp: install syn_recv requests into ehash table")
> Reported-by: Petr Novopashenniy 
> Signed-off-by: Eric Dumazet 

Applied and queued up for -stable, thanks Eric.

Re: [PATCH v3] net:Add sysctl_max_skb_frags

2016-02-09 Thread David Miller

From: Hans Westgaard Ry 
Date: Wed,  3 Feb 2016 09:26:57 +0100

> Devices may have limits on the number of fragments in an skb they support.
> Current codebase uses a constant as maximum for number of fragments one
> skb can hold and use.
> When enabling scatter/gather and running traffic with many small messages
> the codebase uses the maximum number of fragments and may thereby violate
> the max for certain devices.
> The patch introduces a global variable as max number of fragments.
> 
> Signed-off-by: Hans Westgaard Ry 
> Reviewed-by: Håkon Bugge 

I know some people don't like this patch, but no better solution exists
at this time.

Like others, I personally would rather this be a per-device attribute
but that currently would not work at all.

The device that TCP and other elements see when they build packets is
not necessarily the one that is going to send the frame.  Encapsulation
and other structures hide the truly transmitting device.

And we lack a foolproof way to propagate attributes like this through
the stack of devices up to the top.

So for now this is what we have to use, as unfortunate as it may be.

If someone is suitably angry about this state of affairs, I encourage
them to direct that energy at a better long term solution :-)

Applied and queued up for -stable, thanks.

Re: [PATCH net v2] tg3: Fix for tg3 transmit queue 0 timed out when too many gso_segs

2016-02-09 Thread David Miller

From: skallam 
Date: Wed,  3 Feb 2016 14:09:38 +0530

> From: Siva Reddy Kallam 
> 
> tg3_tso_bug() can hit a condition where the entire tx ring is not big
> enough to segment the GSO packet. For example, if MSS is very small,
> gso_segs can exceed the tx ring size. When we hit the condition, it
> will cause tx timeout.
> 
> tg3_tso_bug() is called to handle TSO and DMA hardware bugs.
> For TSO bugs, if tg3_tso_bug() cannot succeed, we have to drop the packet.
> For DMA bugs, we can still fall back to linearize the SKB and let the
> hardware transmit the TSO packet.
> 
> This patch adds a function tg3_tso_bug_gso_check() to check if there
> are enough tx descriptors for GSO before calling tg3_tso_bug().
> The caller will then handle the error appropriately - drop or
> linearize the SKB.
> 
> v2: Corrected patch description to avoid confusion.
> 
> Signed-off-by: Siva Reddy Kallam 
> Signed-off-by: Michael Chan 
> Acked-by: Prashant Sreedharan 

Applied and queued up for -stable.

Thanks.

Re: [patch net-next 0/3] bridge: mdb: flag offloaded mdb entries

2016-02-09 Thread David Miller

From: Jiri Pirko 
Date: Wed,  3 Feb 2016 09:57:03 +0100

> This patchset extends uapi to let the user know if an mdb entry is
> offloaded.

Series applied, but I really hope that it is safe to use that empty u8
slot for the flag field.

Thanks.

Re: [PATCH net-next] bonding: 3ad: apply ad_actor settings changes immediately

2016-02-09 Thread David Miller

From: Nikolay Aleksandrov 
Date: Wed,  3 Feb 2016 13:17:01 +0100

> From: Nikolay Aleksandrov 
> 
> Currently the bonding allows to set ad_actor_system and prio while the
> bond device is down, but these are actually applied only if there aren't
> any slaves yet (applied to bond device when first slave shows up, and to
> slaves at 3ad bind time). After this patch changes are applied immediately
> and the new values can be used/seen after the bond's upped so it's not
> necessary anymore to release all and enslave again to see the changes.
> 
> CC: Jay Vosburgh 
> CC: Veaceslav Falico 
> CC: Andy Gospodarek 
> Signed-off-by: Nikolay Aleksandrov 

Applied, thanks Nikolay.

Re: [PATCH net] enic: increment devcmd2 result ring in case of timeout

2016-02-09 Thread David Miller

From: Govindarajulu Varadarajan 
Date: Wed,  3 Feb 2016 14:40:44 +0530

> From: Sandeep Pillai 
> 
> Firmware posts the devcmd result in result ring. In case of timeout, driver
> does not increment the current result pointer and firmware could post the
> result after timeout has occurred. During next devcmd, driver would be
> reading the result of previous devcmd.
> 
> Fix this by incrementing result even in case of timeout.
> 
> Fixes: 373fb0873d43 ("enic: add devcmd2")
> Signed-off-by: Sandeep Pillai 
> Signed-off-by: Govindarajulu Varadarajan <_gov...@gmx.com>

Applied and queued up for -stable, thanks.

Re: [PATCH 0/4] Add Ethernet support on STM32F429

2016-02-09 Thread David Miller

From: Alexandre TORGUE 
Date: Wed,  3 Feb 2016 15:54:31 +0100

> STM32F429 Chip embeds a Synopsys 3.50a MAC IP.
> This series:
>   -enhance current stmmac driver to control it (code already available) 
> and 
>adds basic glue for STM32F429 chip.
>   -Enable basic Net config in kernel.
> 
> Note that DT patches are not present because STM32 pinctrl code is not
> yet available.

Looks like this needs to be respun to deal with the warnings the kbuild
robot reported.

Thanks.

Re: [PATCH net] sctp: translate network order to host order when users get a hmacid

2016-02-09 Thread David Miller

From: Xin Long 
Date: Wed,  3 Feb 2016 23:33:30 +0800

> Commit ed5a377d87dc ("sctp: translate host order to network order when
> setting a hmacid") corrected the hmacid byte-order when setting a hmacid.
> but the same issue also exists on getting a hmacid.
> 
> We fix it by changing hmacids to host order when users get them with
> getsockopt.
> 
> Fixes: Commit ed5a377d87dc ("sctp: translate host order to network order when 
> setting a hmacid")
> Signed-off-by: Xin Long 

Applied and queued up for -stable.

Thanks.

Re: [PATCH net] selinux: nlmsgtab: add SOCK_DESTROY to the netlink mapping tables

2016-02-09 Thread David Miller

From: Lorenzo Colitti 
Date: Thu,  4 Feb 2016 01:17:12 +0900

> Without this, using SOCK_DESTROY in enforcing mode results in:
> 
>   SELinux: unrecognized netlink message type=21 for sclass=32
> 
> Signed-off-by: Lorenzo Colitti 

Applied, thanks Lorenzo.

Re: [PATCH net] enic: increment devcmd2 result ring in case of timeout

2016-02-09 Thread David Miller

From: Govindarajulu Varadarajan 
Date: Wed,  3 Feb 2016 14:40:44 +0530

> From: Sandeep Pillai 
> 
> Firmware posts the devcmd result in result ring. In case of timeout, driver
> does not increment the current result pointer and firmware could post the
> result after timeout has occurred. During next devcmd, driver would be
> reading the result of previous devcmd.
> 
> Fix this by incrementing result even in case of timeout.
> 
> Fixes: 373fb0873d43 ("enic: add devcmd2")
> Signed-off-by: Sandeep Pillai 
> Signed-off-by: Govindarajulu Varadarajan <_gov...@gmx.com>

Applied and queued up for -stable, thanks.

Re: [PATCH net-next] hv_netvsc: Increase delay for RNDIS_STATUS_NETWORK_CHANGE

2016-02-09 Thread David Miller

From: Haiyang Zhang 
Date: Tue,  2 Feb 2016 16:15:56 -0800

> We simulates a link down period for RNDIS_STATUS_NETWORK_CHANGE message to
> trigger DHCP renew. User daemons may need multiple seconds to trigger the
> link down event. (e.g. ifplugd: 5sec, network-manager: 4sec.) So update
> this link down period to 10 sec to properly trigger DHCP renew.
> 
> Signed-off-by: Haiyang Zhang 

Two things look really bad about this to me:

1) Any value you choose is arbitrary.  If some new network configuration daemon
   is slower, you will have to change this value again.

   This is _NOT_ sustainable in the long term.

2) It is completely unclear to me why this driver needs to delay at all or
   wait for anything.  I see no other driver having to deal with this issue.

Until you address both of these points I am not going to apply this patch.

Thanks.

[net-next v2] bonding: use return instead of goto

2016-02-09 Thread Zhang Shengju

Replace 'goto' with 'return' to remove unnecessary check at label:
err_undo_flags.

The reason is that 'err_undo_flags' does two things for the first slave device:
1.revert bond mac address if it is set by the slave device.
2.revert bond device type if it's not ARPHRD_ETHER.

It's not necessary for the following three places, since they change neither
the bond mac address nor the type. It's straightforward to return directly.

Signed-off-by: Zhang Shengju 
---
 drivers/net/bonding/bond_main.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
index bcc7b19..abe014f 100644
--- a/drivers/net/bonding/bond_main.c
+++ b/drivers/net/bonding/bond_main.c
@@ -1379,8 +1379,7 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
if (slave_dev->flags & IFF_UP) {
netdev_err(bond_dev, "%s is up - this may be due to an out of 
date ifenslave\n",
   slave_dev->name);
-   res = -EPERM;
-   goto err_undo_flags;
+   return -EPERM;
}
 
/* set bonding device ether type by slave - bonding netdevices are
@@ -1400,8 +1399,7 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
res = notifier_to_errno(res);
if (res) {
netdev_err(bond_dev, "refused to change device 
type\n");
-   res = -EBUSY;
-   goto err_undo_flags;
+   return -EBUSY;
}
 
/* Flush unicast and multicast addresses */
@@ -1421,8 +1419,7 @@ int bond_enslave(struct net_device *bond_dev, struct 
net_device *slave_dev)
} else if (bond_dev->type != slave_dev->type) {
netdev_err(bond_dev, "%s ether type (%d) is different from 
other slaves (%d), can not enslave it\n",
   slave_dev->name, slave_dev->type, bond_dev->type);
-   res = -EINVAL;
-   goto err_undo_flags;
+   return -EINVAL;
}
 
if (slave_ops->ndo_set_mac_address == NULL) {
-- 
1.8.3.1

Re: [PATCH v2 6/6] dt-bindings: net: ks8995: add bindings documentation for ks8995

2016-02-09 Thread Sergei Shtylyov


On 2/9/2016 11:07 AM, Helmut Buchsbaum wrote:


Signed-off-by: Helmut Buchsbaum 
---
  .../devicetree/bindings/net/micrel-ks8995.txt| 20

  1 file changed, 20 insertions(+)
  create mode 100644
Documentation/devicetree/bindings/net/micrel-ks8995.txt

diff --git a/Documentation/devicetree/bindings/net/micrel-ks8995.txt
b/Documentation/devicetree/bindings/net/micrel-ks8995.txt
new file mode 100644
index 000..7f11ca6
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/micrel-ks8995.txt
@@ -0,0 +1,20 @@
+Micrel KS8995 SPI controlled Ethernet Switch families
+
+Required properties (according to spi-bus.txt):
+- compatible: either "micrel,ks8995", "micrel,ksz8864" or
"micrel,ksz8795"
+
+Optional properties:
+- reset-gpios : phandle of gpio that will be used to reset chip
during probe
+
+Example:
+
+spi-master {
+...
+ksz8795 {


ePAPR tells to name the nodes generically, according to their function.


+compatible = "micrel,ksz8795";
+
+reg = <0>;
+spi-max-frequency = <5000>;
+reset-gpios = <&gpio0 46 1>;
+};
+};


MBR, Sergei


Hello Sergei,

just to avoid any misunderstandings: you refer to ePAPR, ch. 2.2.1 Node Names.


   "2.2.2, Generic Names Recommendation", actually. :-)


You're definitely right, I'll correct the naming in my example!


   TIA.


Thanks,
Helmut


MBR, Sergei

[net PATCH v2] flow_dissector: Fix unaligned access in __skb_flow_dissector when used by eth_get_headlen

2016-02-09 Thread Alexander Duyck

This patch fixes an issue with unaligned accesses when using
eth_get_headlen on a page that was DMA aligned instead of being IP aligned.
The fact is when trying to check the length we don't need to be looking at
the flow label so we can reorder the checks to first check if we are
supposed to gather the flow label and then make the call to actually get
it.

v2:  Updated path so that either STOP_AT_FLOW_LABEL or KEY_FLOW_LABEL can
 cause us to check for the flow label.

Reported-by: Sowmini Varadhan 
Signed-off-by: Alexander Duyck 
---
 net/core/flow_dissector.c |9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index c992463c03d7..699b2c415cb0 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -209,7 +209,6 @@ ip:
case htons(ETH_P_IPV6): {
const struct ipv6hdr *iph;
struct ipv6hdr _iph;
-   __be32 flow_label;
 
 ipv6:
iph = __skb_header_pointer(skb, nhoff, sizeof(_iph), data, 
hlen, &_iph);
@@ -230,8 +229,12 @@ ipv6:
key_control->addr_type = FLOW_DISSECTOR_KEY_IPV6_ADDRS;
}
 
-   flow_label = ip6_flowlabel(iph);
-   if (flow_label) {
+   if ((dissector_uses_key(flow_dissector,
+   FLOW_DISSECTOR_KEY_FLOW_LABEL) ||
+(flags & FLOW_DISSECTOR_F_STOP_AT_FLOW_LABEL)) &&
+   ip6_flowlabel(iph)) {
+   __be32 flow_label = ip6_flowlabel(iph);
+
if (dissector_uses_key(flow_dissector,
   FLOW_DISSECTOR_KEY_FLOW_LABEL)) {
key_tags = 
skb_flow_dissector_target(flow_dissector,

RE: [PATCH v3 net-next] net: Implement fast csum_partial for x86_64

2016-02-09 Thread David Laight

From: George Spelvin [mailto:li...@horizon.com]
> Sent: 08 February 2016 20:13
> David Laight wrote:
> > I'd need convincing that unrolling the loop like that gives any significant 
> > gain.
> > You have a dependency chain on the carry flag so have delays between the 
> > 'adcq'
> > instructions (these may be more significant than the memory reads from l1 
> > cache).
> 
> If the carry chain is a bottleneck, on Broadwell+ (feature flag
> X86_FEATURE_ADX), there are the ADCX and ADOX instructions, which use
> separate flag bits for their carry chains and so can be interleaved.
> 
> I don't have such a machine to test on, but if someone who does
> would like to do a little benchmarking, that would be an interesting
> data point.
> 
> Unfortunately, that means yet another version of the main loop,
> but if there's a significant benefit...

Well, the only part actually worth writing in assembler is the 'adc' loop.
So run-time substitution of separate versions (as is done for memcpy())
wouldn't be hard.

Since adcx and adox must execute in parallel I clearly need to re-remember
how dependencies against the flags register work. I'm sure I remember
issues with 'false dependencies' against the flags.

However you still need a loop construct that doesn't modify 'o' or 'c'.
Using leal, jcxz, jmp might work.
(Unless broadwell actually has a fast 'loop' instruction.)

(I've not got a suitable test cpu.)

David

Re: [PATCH net v3 2/4] net: add rx_nohandler stat counter

2016-02-09 Thread Jamal Hadi Salim

On 16-02-09 03:40 AM, David Miller wrote:

From: Eric Dumazet 
Date: Mon, 08 Feb 2016 14:57:40 -0800

Whole point of TLV is that it allows us to add new fields at the end of
the structures.

  ...

Look at iproute2, you were the one adding in 2004 code to cope with
various tcp_info sizes.

So 12 years later, you cannot say it does not work anymore.

+1

The TLV L should be the canonical way to determine length, i.e. it should be
sufficient to just look at L and understand that the content has changed.
But:
Using sizeof could be dangerous unless the data is packed to be
32-bit aligned. Looking at the INET_DIAG_INFO check for sizeof,
there is a small 8 bit hole in tcp_info, I think, between
these two fields:

__u8tcpi_snd_wscale : 4, tcpi_rcv_wscale : 4;
__u32   tcpi_rto;
---

The kernel will pad to make sure the TLV data is 32-bit aligned.
I am not sure if that will be the same length as sizeof() in all
hardware + compilers... For this case,
it is almost safe to just add a version field - probably in the hole.
Or have a #define to say what the expected length should be. Or add
an 8 bit pad.

In general, adding new fields that are non-optional is problematic, i.e.
by non-optional I mean always expected to be present.
I think a good test is an old kernel with a new iproute2. If the new field
is non-optional, it will fail (for example, iproute2 may try to print a value
that it expects, but because the old kernel doesn't understand it, it is
non-existent).

cheers,
jamal

Re: [Intel-wired-lan] [next] igb: allow setting MAC address on i211 using a device tree blob

2016-02-09 Thread Jeff Kirsher

On Fri, 2016-01-29 at 23:11 +0100, John Holland wrote:
> The Intel i211 LOM pcie ethernet controllers' iNVM operates as an
> OTP 
> and has no externel EEPROM interface [1]. The following allows the 
> driver to pickup the MAC address from a device tree blob when
> CONFIG_OF 
> has been enabled.
> 
> [1] 
> http://www.intel.com/content/www/us/en/embedded/products/networking/i
> 211-ethernet-controller-datasheet.html
> 
> Signed-off-by: John Holland 
> ---
>   drivers/net/ethernet/intel/igb/igb_main.c | 30 
> ++
>   1 file changed, 30 insertions(+)
> 
> diff --git a/drivers/net/ethernet/intel/igb/igb_main.c 
> b/drivers/net/ethernet/intel/igb/igb_main.c
> index 31e5f39..9c92443 100644
> --- a/drivers/net/ethernet/intel/igb/igb_main.c
> +++ b/drivers/net/ethernet/intel/igb/igb_main.c
> @@ -56,6 +56,11 @@
>   #include 
>   #include "igb.h"
> 
> +#ifdef defined(CONFIG_OF)
> +#include 
> +#include 
> +#endif
> +
>   #define MAJ 5
>   #define MIN 3
>   #define BUILD 0
> @@ -2217,6 +,26 @@ static s32 igb_init_i2c(struct igb_adapter
> *adapter)
>   }
> 
>   /**
> + *  igb_read_mac_addr_dts - Read mac addres from the device tree
> blob.

Address is mis-spelled above

> + *  @hw: pointer to the e1000 hardware structure
> + **/
> +#ifdef defined(CONFIG_OF)

Minor nitpick, you should have the function comment header wrapped in
the #ifdef as well.

> +static void igb_read_mac_addr_dts(struct e1000_hw *hw)
> +{
> +   const u8 *mac;
> +   struct device_node *dn;
> +
> +   dn = of_find_compatible_node(NULL, NULL, "intel,i211");
> +   if (!dn)
> +   return;
> +
> +   mac = of_get_mac_address(dn);
> +   if (mac)
> +   ether_addr_copy(hw->mac.addr, mac);
> +}
> +#endif
> +
> +/**
>    *  igb_probe - Device Initialization Routine
>    *  @pdev: PCI device information struct
>    *  @ent: entry in igb_pci_tbl
> @@ -2420,6 +2445,11 @@ static int igb_probe(struct pci_dev *pdev,
> const 
> struct pci_device_id *ent)
>  if (hw->mac.ops.read_mac_addr(hw))
>  dev_err(&pdev->dev, "NVM Read Error\n");
> 
> +#ifdef defined(CONFIG_OF)
> +   if (!is_valid_ether_addr(hw->mac.addr))
> +   igb_read_mac_addr_dts(hw);
> +#endif
> +
>  memcpy(netdev->dev_addr, hw->mac.addr, netdev->addr_len);
> 
>  if (!is_valid_ether_addr(netdev->dev_addr)) {
> ___
> Intel-wired-lan mailing list
> intel-wired-...@lists.osuosl.org
> http://lists.osuosl.org/mailman/listinfo/intel-wired-lan


signature.asc
Description: This is a digitally signed message part

Re: [net-next PATCH 0/7] tc offload for cls_u32 on ixgbe

2016-02-09 Thread Fastabend, John R

On 2/4/2016 3:19 PM, Pablo Neira Ayuso wrote:
> On Thu, Feb 04, 2016 at 10:16:56AM +0100, Jiri Pirko wrote:
>> Wed, Feb 03, 2016 at 10:27:32AM CET, john.fastab...@gmail.com wrote:
>>>
>>> Also by adding get_parse_graph and set_parse_graph attributes as
>>> in my previous flow_api work we can build programmable devices
>>> and programmatically learn when rules can or can not be loaded
>>> into the hardware. Again future work.
>>>
>>> Any comments/feedback appreciated.

Sorry if you get this twice; it doesn't look like my original response
made it to netdev, and the charger of the laptop I replied on blew up.

>>
>> I like this being thin and elegant solution. However, ~2 years ago when I
>> pushed openvswitch kernel datapath offload patchset, people were yelling
>> at me that it is not generic enough solution, that tc has to be able
>> to use the api (Jamal :)), nftables as well.
>

The other problem with OVS is if you have the capabilities to do
wildcard lookups (e.g. TCAM/SRAM/etc) then offloading the exact
match table in OVS is a really inefficient use of the resource. You
really want to load the megaflow table into hardware. I just don't
think it's a good scheme for what you want.

> I would be glad to join this debate during NetDev 1.1 too.
>

great.

> I think we should provide a solution that allows people uses both
> tc and nftables, this would require a bit of generic infrastructure on
> top of it so we don't restrict users to one single solution, in other
> words, we allow the user to select its own poison.
>
>> Now this patch is making offload strictly tc-based and nobody seems to
>> care :) I do. I think that we might try to find some generic middle
>> layer.

If we can build the universal model for 'tc' and 'nftable', shouldn't we
unify them higher in the stack? It doesn't make sense to me for the
driver folks to try and create the unified model for two subsystems
if we don't think it's worthwhile in software as well.

>
> I agree and I'll be happy to help to push this ahead. Let's try to sit
> and get together to resolve this.

Great.

>
> See you soon.
>

Re: [net-next PATCH 3/7] net: sched: add cls_u32 offload hooks for netdevs

2016-02-09 Thread Fastabend, John R

On 2/4/2016 5:18 AM, Amir Vadai" wrote:
> On Wed, Feb 03, 2016 at 01:28:37AM -0800, John Fastabend wrote:
>> This patch allows netdev drivers to consume cls_u32 offloads via
>> the ndo_setup_tc ndo op.
>>
>> This works aligns with how network drivers have been doing qdisc
>> offloads for mqprio.
>>
>> Signed-off-by: John Fastabend 
>> ---

[...]

>> +enum {
>> +TC_CLSU32_NEW_KNODE,
> TC_CLSU32_NEW_KNODE is never used

aha yep that snuck in there. In a follow up patch for the fm10k devices
where we can support hash tables (e.g. divisor > 1) I use it. Although
on closer inspection I need to check that the divisor == 1 on ixgbe or
else abort because we can get out of sync if software expects hash
tables here.

Thanks, nice catch.

> 
> [...]
>

Re: [net-next PATCH 0/7] tc offload for cls_u32 on ixgbe

2016-02-09 Thread Fastabend, John R

On 2/4/2016 5:12 AM, Jamal Hadi Salim wrote:
> 
> On 16-02-03 01:48 PM, Fastabend, John R wrote:
> 
> BTW: For the record John, I empathize with you that we need to
> move. Please have patience - we are close; lets just get this resolved
> in Seville. I like your patches a lot and would love to just have
> your patches pushed in, but the challenges with community is being able
> to reach some middle ground. We are not as bad as some of the standards
> organizations. I am sure we'll get this resolved by end of next week
> if not, I am %100 in agreement some form of your patches (And Amir's
> need to go in and then we can refactor as needed)

Agreed although I'm a bit worried we are starting to talk about a
single hardware IR. This discussion has always failed in my experience.

> 
>>> 1) "priorities" for filters and some form of "index" for actions is
>>> is needed. I think index (which tends to be a 32 bit value is what
>>> Amir's patches refered to as "cookie" - or at least some hardware
>>> can be used to query the action with). Priorities maybe implicit in
>>> the order in which they are added. And th idea of appending vs
>>> exclusivity vs replace (which  netlink already supports)
>>> is important to worry about (TCAMS tend to assume an append mode
>>> for example).
>>
>> The code denotes add/del/replace already. I'm not sure why a TCAM
>> would assume an append mode but OK maybe that is some API you have
>> the APIs I use don't have these semantics.
>>
> 
> Basically most hardware (or i should say driver implementations of
> mostly TCAMS) allow you to add exactly the same filter as many times
> as you want. They dont really look at what you want to filter on
> and then scream "conflict". IOW, you (user) are responsible for
> conflict resolution at the filter level. The driver sees this blob
> and requests for some index/key from the hardware then just adds it.
> You can then use this key/index to delete/replace etc.
> This is what i meant by "append" mode.
> However if a classifier implementation cares about filter ambiguity
> resolution, then priorities are used. We need to worry about the
> bigger picture.
> 

Sure, in other classifiers it's used, but it's not needed in this set; I
planned to add it later.

> 
>> For this series using cls_u32 the handle gives you everything you need
>> to put entries in the right table and row. Namely the ht # and order #
>> from 'tc'.
> 
> True - but with a caveat. There are only 2^12 max tables you can
> have for example and up to 2^12 filters per bucket etc.
> 

This is a software limitation as well right? If it hasn't showed up
as a limitation on the software side why would it be an issue here?
Do you have more than 2^12 tables on your devices? If so I guess we
can tack on another 32bits somewhere.

>> Take a look at u32_change and u32_classify its the handle
>> that places the filter into the list and the handle that is matched in
>> classify. We should place the filters in the hardware in the same order
>> that is used by u32_change.
>>
> 
> I can see some parallels, but:
> The nodeid in itself is insufficent for two reasons:
> You cant have more than 2^12 filters per bucket;
> and the nodeid then takes two meanings: a) it is an id
> b) it specifies the order in which things are looked up.
> 
> I think you need to take the u32 address and map it to something in your
> hardware. But at the same time it is important to have the abstraction
> closely emulate your hardware.
> 

IMO the hardware/interface must preserve the same ordering of
filters/hash_Tables/etc. How it does that mapping should be
a driver concern and it can always abort if it fails.

>> Also ran a few tests and can't see how priority works in u32 maybe you
>> can shed some light but as best I can tell it doesn't have any effect
>> on rule execution.
>>
> 
> True.
> u32 doesnt care because it will give you a nodeid if you dont specify
> one. i.e conflict resolution is mapped to you not specifying exactly
> the same ht:bkt:nodeid more than once. And if you will let the
> kernel do it for you (as i am assumming you are saying your hardware
> will) then no need.

Yep. Faithfully offloading u32 here not changing anything except
I do have to abort on some cases with the simpler devices. fm10k for
example can model hash nodes with divisors > 1.

> 
>>>
>>> 2) I like the u32 approach where it makes sense; but sometimes it
>>> doesnt make sense from a usability pov. I work with some ASICs
>>> that have 10 tuples that are  fixed. Yes, a user can describe a policy
>>> with u32 but flower would be more  usable say with flower (both
>>> programmatic and cli)
>>
>> Sure so create a set of offload hooks for flower we don't need only
>> one hardware classifier any more than we would like a single software
>> classifiers.
> 
> 
> Glad to hear that.
> I was a little concerned that despite my love for u32 it was
> going to be _the_ classifier. It doesnt fit for all offload cases
> and sometimes it is because of human o

Re: [net-next PATCH 7/7] net: ixgbe: add support for tc_u32 offload

2016-02-09 Thread Fastabend, John R

[...]

>> Ah I should have annotated this in the commit msg. I turn the feature
>> off by default to enable it the user needs to run
>>
>>  # ethtool -K ethx hw-tc-offload on
>>
>> this is just a habit of mine to leave new features off by default for
>> a bit until I work out some of the kinks. For example I found a case
>> today where if you build loops into your u32 graph the hardware tables
>> can get out of sync with the software tables. This is sort of extreme
>> corner case not sure if anyone would really use u32 but it is valid
>> and the hardware should abort correctly.
> Yeh - that is nice :) But I was just pointing out on a small typo which I
> think you have.
> The new case will never happen. You compare: (features & NETIF_F_NTUPLE) == 
> NETIF_F_HW_TC
> Also the comment before the switch should be modified.

Aha nice catch my scripts were enabling both ntuple and hw-tc-offload
for testing compatibility issues. I wonder if there is a bug somewhere
else though that checks that code most likely because it was definitely
getting offloaded.

Good catch again thanks.

> 
>>
>> Thanks,
>> John
>>

Re: [net-next PATCH 7/7] net: ixgbe: add support for tc_u32 offload

2016-02-09 Thread Fastabend, John R

[...]

>>
>> If you leave ht and order off the tc cli I believe 'tc' just
>> picks some semi-arbitrary ones for you. I've been in the habit
>> of always specifying them even for software filters.
>>
> 
> The default table id is essentially 0x800. Default bucket is 0.
> "order" essentially is the filter id. And given you can link tables
> (Nice work John!); essentially the ht:bucket:nodeid is an "address" to
> a specific filter on a specific table and when makes sense a specific
> hash bucket. Some other way to look at it is as a way to construct
> a mapping to a TCAM key.
> What John is doing is essentially taking the nodeid and trying to use
> it as a priority. In other words the abstraction is reduced to a linked
> list in which the ordering is how the list is traversed.
> It may work in this case, but i am for being able to explicitly specify
> priorities.

Sorry bombing you with emails Jamal. Another thing to note is ixgbe
doesn't support hash tables explicitly but our other devices do. So
when a hash node is created we can map that onto a hardware block
and actually do the hash.

> 
> cheers,
> jamal
>

Re: [PATCH net-next v2 0/4] packet: tpacket gso and csum offload

2016-02-09 Thread David Miller

From: Willem de Bruijn 
Date: Wed,  3 Feb 2016 18:02:13 -0500

> From: Willem de Bruijn 
> 
> Extend PACKET_VNET_HDR socket option support to packet sockets with
> memory mapped rings.
> 
> Patches 2 and 4 add support to tpacket_rcv and tpacket_snd.
> 
> Patch 1 prepares for this by moving the relevant virtio_net_hdr
> logic out of packet_snd and packet_rcv into helper functions.
> 
> GSO transmission requires all headers in the skb linear section.
> Patch 3 moves parsing of tx_ring slot headers before skb allocation
> to enable allocation with sufficient linear size.
> 
> Changes
>   v1->v2:
> - fix bounds checks:
>   - subtract sizeof(vnet_hdr) before comparing tp_len to size_max
>   - compare tp_len to size_max also with GSO, just do not truncate to MTU

Series applied, thanks.

Re: [Intel-wired-lan] [next] igb: allow setting MAC address on i211 using a device tree blob

2016-02-09 Thread Andrew Lunn

> > +static void igb_read_mac_addr_dts(struct e1000_hw *hw)
> > +{
> > +   const u8 *mac;
> > +   struct device_node *dn;
> > +
> > +   dn = of_find_compatible_node(NULL, NULL, "intel,i211");

Hi John

Would this also work for the i210?

If so, you normally use the compatible string for the first device
this works with. So maybe this should be changed to intel,i210?

Thanks
Andrew

Re: [PATCH iproute2 v2 21/21] iplink: bridge: add support for netfilter call attributes

2016-02-09 Thread Pablo Neira Ayuso

Hi Nikolay,

On Tue, Feb 09, 2016 at 12:14:39AM +0100, Nikolay Aleksandrov wrote:
> From: Nikolay Aleksandrov 
> 
> This patch implements support for the IFLA_BR_NF_CALL_(IP|IP6|ARP)TABLES
> attributes in iproute2 so it can change their values.
> 
> Signed-off-by: Nikolay Aleksandrov 
> ---
>  ip/iplink_bridge.c | 45 +
>  1 file changed, 45 insertions(+)
> 
> diff --git a/ip/iplink_bridge.c b/ip/iplink_bridge.c
> index a55a36adacdf..1b666f0adef4 100644
> --- a/ip/iplink_bridge.c
> +++ b/ip/iplink_bridge.c
> @@ -47,6 +47,9 @@ static void print_explain(FILE *f)
>   "  [ mcast_query_interval QUERY_INTERVAL ]\n"
>   "  [ mcast_query_response_interval 
> QUERY_RESPONSE_INTERVAL ]\n"
>   "  [ mcast_startup_query_interval 
> STARTUP_QUERY_INTERVAL ]\n"
> + "  [ nf_call_iptables NF_CALL_IPTABLES ]\n"
> + "  [ nf_call_ip6tables NF_CALL_IP6TABLES ]\n"
> + "  [ nf_call_arptables NF_CALL_ARPTABLES ]\n"

We will soon have conntrack support for bridge, that will help us kill
this bridge_netfilter glue code that has caused us many headaches.

So I'd prefer not to give more exposition to this.

Re: [Intel-wired-lan] [next] igb: allow setting MAC address on i211 using a device tree blob

2016-02-09 Thread Andrew Lunn

> > +   dn = of_find_compatible_node(NULL, NULL, "intel,i211");

Humm, NULL, NULL. That means find the first node anywhere in the
device tree which matches. This is not going to work too well when you
have multiple i211s.

There is a way to specify that a DT node is attached to a specific PCIe
bus/slot. I think you should search only there, so solving the
multiple device issue.

 Andrew

Re: [net PATCH v2] flow_dissector: Fix unaligned access in __skb_flow_dissector when used by eth_get_headlen

2016-02-09 Thread David Miller

From: Alexander Duyck 
Date: Tue, 09 Feb 2016 02:49:54 -0800

> This patch fixes an issue with unaligned accesses when using
> eth_get_headlen on a page that was DMA aligned instead of being IP aligned.
> The fact is when trying to check the length we don't need to be looking at
> the flow label so we can reorder the checks to first check if we are
> supposed to gather the flow label and then make the call to actually get
> it.
> 
> v2:  Updated path so that either STOP_AT_FLOW_LABEL or KEY_FLOW_LABEL can
>  cause us to check for the flow label.
> 
> Reported-by: Sowmini Varadhan 
> Signed-off-by: Alexander Duyck 

Applied and queued up for -stable, thanks Alex.

Re: [net-next PATCH 06/11] RFC: mlx5: RX bulking or bundling of packets before calling network stack

2016-02-09 Thread Saeed Mahameed

On Tue, Feb 2, 2016 at 11:13 PM, Jesper Dangaard Brouer
 wrote:
> There are several techniques/concepts combined in this optimization.
> It is both a data-cache and instruction-cache optimization.
>
> First of all, this is primarily about delaying touching
> packet-data, which happens in eth_type_trans, until the prefetch
> has had time to fetch.  Thus, hopefully avoiding a cache-miss on
> packet data.
>
> Secondly, the instruction-cache optimization is about, not
> calling the network stack for every packet, which is pulled out
> of the RX ring.  Calling the full stack likely removes/flushes
> the instruction cache every time.
>
> Thus, have two loops, one loop pulling out packet from the RX
> ring and starting the prefetching, and the second loop calling
> eth_type_trans() and invoking the stack via napi_gro_receive().
>
> Signed-off-by: Jesper Dangaard Brouer 
>
>
> Notes:
> This is the patch that gave a speed up of 6.2Mpps to 12Mpps, when
> trying to measure lowest RX level, by dropping the packets in the
> driver itself (marked drop point as comment).
Indeed looks very promising in respect of instruction-cache
optimization, but i have some doubts regarding the data-cache
optimizations (prefetch), please see my below questions.

We will take this patch and test it in house.

>
> For now, the ring is emptied up to the budget.  I don't know if it
> would be better to chunk it up more?
Not sure, according to netdevice.h :

/* Default NAPI poll() weight
 * Device drivers are strongly advised to not use bigger value
 */
#define NAPI_POLL_WEIGHT 64

we will also compare different budget values with your approach, but I
doubt it will be accepted to increase the NAPI_POLL_WEIGHT for mlx5
drivers.
furthermore increasing NAPI poll budget might cause cache overflow
with this approach since you are chunking up all "prefetch(skb->data)"
(I didn't do the math yet in regards of cache utilization with this
approach).

> mlx5e_handle_csum(netdev, cqe, rq, skb);
>
> -   skb->protocol = eth_type_trans(skb, netdev);
> -
mlx5e_handle_csum also access the skb->data in is_first_ethertype_ip
function, but i think it is not interesting since this is not the
common case,
e.g.: for the uncommon case of L4 traffic with no HW checksum
offload you won't benefit from this optimization since we access the
skb->data to know the L3 header type, and this can be fixed in driver
code to check the CQE meta data for these fields instead of accessing
the skb->data, but I will need to look further into that.

> @@ -252,7 +257,6 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
> wqe_counter= be16_to_cpu(wqe_counter_be);
> wqe= mlx5_wq_ll_get_wqe(&rq->wq, wqe_counter);
> skb= rq->skb[wqe_counter];
> -   prefetch(skb->data);
> rq->skb[wqe_counter] = NULL;
>
> dma_unmap_single(rq->pdev,
> @@ -265,16 +269,27 @@ int mlx5e_poll_rx_cq(struct mlx5e_cq *cq, int budget)
> dev_kfree_skb(skb);
> goto wq_ll_pop;
> }
> +   prefetch(skb->data);
is this optimal for all CPU archs ? is it ok to use up to 64 cache
lines at once ?

Re: [net-next PATCH 0/7] tc offload for cls_u32 on ixgbe

2016-02-09 Thread Jamal Hadi Salim


On 16-02-09 06:24 AM, Fastabend, John R wrote:

On 2/4/2016 5:12 AM, Jamal Hadi Salim wrote:


On 16-02-03 01:48 PM, Fastabend, John R wrote:




Basically most hardware (or i should say driver implementations of
mostly TCAMS) allow you to add exactly the same filter as many times
as you want. They dont really look at what you want to filter on
and then scream "conflict". IOW, you (user) are responsible for
conflict resolution at the filter level. The driver sees this blob
and requests for some index/key from the hardware then just adds it.
You can then use this key/index to delete/replace etc.
This is what i meant by "append" mode.
However if a classifier implementation cares about filter ambiguity
resolution, then priorities are used. We need to worry about the
bigger picture.



Sure in other classifiers its used but its not needed in the set I
planned to add it later.



If you leave it open for some other hardware to use we should be fine.




For this series using cls_u32 the handle gives you everything you need
to put entries in the right table and row. Namely the ht # and order #
from 'tc'.


True - but with a caveat. There are only 2^12 max tables you can
have for example and up to 2^12 filters per bucket etc.



This is a software limitation as well right? If it hasn't shown up
as a limitation on the software side why would it be an issue here?
Do you have more than 2^12 tables on your devices? If so I guess we
can tack on another 32bits somewhere.



That handle is used as an "Address" to the 32 bit filter.
Just beware of the semantics the handle has.
It hasnt shown up as a software limitation because the defaults
are good enough for most people. But if you ever want to install
a million rules that can be looked up at a reasonable pps rate
it will become very obvious quickly. I have a sample setup in the
talk tomorrow which shows such an example.


I think you need to take the u32 address and map it to something in your
hardware. But at the same time it is important to have the abstraction
closely emulate your hardware.



IMO the hardware/interface must preserve the same ordering of
filters/hash_Tables/etc. How it does that mapping should be
a driver concern and it can always abort if it fails.



Sure.


Also ran a few tests and can't see how priority works in u32 maybe you
can shed some light but as best I can tell it doesn't have any effect
on rule execution.



True.
u32 doesnt care because it will give you a nodeid if you dont specify
one. i.e conflict resolution is mapped to you not specifying exactly
the same ht:bkt:nodeid more than once. And if you will let the
kernel do it for you (as i am assuming you are saying your hardware
will) then no need.


Yep. Faithfully offloading u32 here not changing anything except
I do have to abort on some cases with the simpler devices. fm10k for
example can model hash nodes with divisors > 1.



I wonder if when we get to capabilities we can do this...







My issue is we can map flower onto u32 that is fine and u32 onto
bpf. But we lose a lot of the power of each classifier when we
do this. flower for example is nice because of its simplicity
presumably this translates into faster updates, u32 is great because
we get full parse graph support and hash tables, ebpf is the biggest
beast of all and lets us load arbitrary functions into the device.
All are nice in their own right.



Did i send you my slides? ;->


cheers,
jamal

RE: [Intel-wired-lan] [PATCH net-next V2 6/6] e1000e: call ndo_stop() instead of dev_close() when running offline selftest

2016-02-09 Thread Avargil, Raanan

Acked-by: Raanan Avargil 

--
Regards,
Raanan


-Original Message-
From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On 
Behalf Of Stefan Assmann
Sent: Wednesday, February 03, 2016 10:21
To: intel-wired-...@lists.osuosl.org
Cc: netdev@vger.kernel.org; da...@davemloft.net; sassm...@kpanic.de
Subject: [Intel-wired-lan] [PATCH net-next V2 6/6] e1000e: call ndo_stop() 
instead of dev_close() when running offline selftest
Importance: High

Calling dev_close() causes IFF_UP to be cleared which will remove the 
interfaces routes and some addresses. That's probably not what the user 
intended when running the offline selftest. Besides this does not happen if the 
interface is brought down before the test, so the current behaviour is 
inconsistent.
Instead call the net_device_ops ndo_stop function directly and avoid touching 
IFF_UP at all.

V2: rename e1000_open(), e1000_close() to e1000e_open(), e1000e_close() to 
avoid name clash with e1000.

Signed-off-by: Stefan Assmann 
---
 drivers/net/ethernet/intel/e1000e/e1000.h   |  2 ++
 drivers/net/ethernet/intel/e1000e/ethtool.c |  4 ++--  
drivers/net/ethernet/intel/e1000e/netdev.c  | 12 ++--
 3 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/intel/e1000e/e1000.h 
b/drivers/net/ethernet/intel/e1000e/e1000.h
index 1dc293b..52eb641 100644
--- a/drivers/net/ethernet/intel/e1000e/e1000.h
+++ b/drivers/net/ethernet/intel/e1000e/e1000.h
@@ -480,6 +480,8 @@ extern const char e1000e_driver_version[];  void 
e1000e_check_options(struct e1000_adapter *adapter);  void 
e1000e_set_ethtool_ops(struct net_device *netdev);
 
+int e1000e_open(struct net_device *netdev); int e1000e_close(struct 
+net_device *netdev);
 void e1000e_up(struct e1000_adapter *adapter);  void e1000e_down(struct 
e1000_adapter *adapter, bool reset);  void e1000e_reinit_locked(struct 
e1000_adapter *adapter); diff --git 
a/drivers/net/ethernet/intel/e1000e/ethtool.c 
b/drivers/net/ethernet/intel/e1000e/ethtool.c
index 6cab1f3..1e3973a 100644
--- a/drivers/net/ethernet/intel/e1000e/ethtool.c
+++ b/drivers/net/ethernet/intel/e1000e/ethtool.c
@@ -1816,7 +1816,7 @@ static void e1000_diag_test(struct net_device *netdev,
 
if (if_running)
/* indicate we're in test mode */
-   dev_close(netdev);
+   e1000e_close(netdev);
 
if (e1000_reg_test(adapter, &data[0]))
eth_test->flags |= ETH_TEST_FL_FAILED; @@ -1849,7 
+1849,7 @@ static void e1000_diag_test(struct net_device *netdev,
 
clear_bit(__E1000_TESTING, &adapter->state);
if (if_running)
-   dev_open(netdev);
+   e1000e_open(netdev);
} else {
/* Online tests */
 
diff --git a/drivers/net/ethernet/intel/e1000e/netdev.c 
b/drivers/net/ethernet/intel/e1000e/netdev.c
index c71ba1b..02449a0 100644
--- a/drivers/net/ethernet/intel/e1000e/netdev.c
+++ b/drivers/net/ethernet/intel/e1000e/netdev.c
@@ -4495,7 +4495,7 @@ static int e1000_test_msi(struct e1000_adapter *adapter)  
}
 
 /**
- * e1000_open - Called when a network interface is made active
+ * e1000e_open - Called when a network interface is made active
  * @netdev: network interface device structure
  *
  * Returns 0 on success, negative value on failure @@ -4506,7 +4506,7 @@ 
static int e1000_test_msi(struct e1000_adapter *adapter)
  * handler is registered with the OS, the watchdog timer is started,
  * and the stack is notified that the interface is ready.
  **/
-static int e1000_open(struct net_device *netdev)
+int e1000e_open(struct net_device *netdev)
 {
struct e1000_adapter *adapter = netdev_priv(netdev);
struct e1000_hw *hw = &adapter->hw;
@@ -4604,7 +4604,7 @@ err_setup_tx:
 }
 
 /**
- * e1000_close - Disables a network interface
+ * e1000e_close - Disables a network interface
  * @netdev: network interface device structure
  *
  * Returns 0, this is not allowed to fail @@ -4614,7 +4614,7 @@ err_setup_tx:
  * needs to be disabled.  A global MAC reset is issued to stop the
  * hardware, and all transmit and receive resources are freed.
  **/
-static int e1000_close(struct net_device *netdev)
+int e1000e_close(struct net_device *netdev)
 {
struct e1000_adapter *adapter = netdev_priv(netdev);
struct pci_dev *pdev = adapter->pdev;
@@ -6920,8 +6920,8 @@ static int e1000_set_features(struct net_device *netdev,  
}
 
 static const struct net_device_ops e1000e_netdev_ops = {
-   .ndo_open   = e1000_open,
-   .ndo_stop   = e1000_close,
+   .ndo_open   = e1000e_open,
+   .ndo_stop   = e1000e_close,
.ndo_start_xmit = e1000_xmit_frame,
.ndo_get_stats64= e1000e_get_stats64,
.ndo_set_rx_mode= e1000e_set_rx_mode,
--
2.5.0

___
Intel-wired-lan mailing l

[PATCH net V2 2/3] net/mlx5e: Remove select queue ndo initialization

2016-02-09 Thread Saeed Mahameed

Currently mlx5e_select_queue is redundant since num_tc is always 1.

Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |3 ---
 1 files changed, 0 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index 6a3e430..bca6e85 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2137,9 +2137,6 @@ static void mlx5e_build_netdev(struct net_device *netdev)
 
SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
 
-   if (priv->params.num_tc > 1)
-   mlx5e_netdev_ops.ndo_select_queue = mlx5e_select_queue;
-
if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
mlx5e_netdev_ops.ndo_set_vf_mac = mlx5e_set_vf_mac;
mlx5e_netdev_ops.ndo_set_vf_vlan = mlx5e_set_vf_vlan;
-- 
1.7.1

[PATCH net V2 3/3] net/mlx5e: Use static constant netdevice ndos

2016-02-09 Thread Saeed Mahameed

Currently our netdevice ops is one static global variable which
is referenced by all mlx5e netdevice instances. This can be
problematic when different driver instances do not share same
HW capabilities (e.g SRIOV PF and VFs probed to the host).

Now we have two constant global netdevice ops variables, one
for basic netdevice ops and the other with extended SRIOV ops,
on netdevice construction we choose the one suitable for
current device capabilities.

Fixes: 66e49dedada6 ("net/mlx5e: Add support for SR-IOV ndos")
Signed-off-by: Saeed Mahameed 
---
 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   41 ++---
 1 files changed, 28 insertions(+), 13 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c 
b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
index bca6e85..d4e1c30 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_main.c
@@ -2024,18 +2024,37 @@ static int mlx5e_get_vf_stats(struct net_device *dev,
vf_stats);
 }
 
-static struct net_device_ops mlx5e_netdev_ops = {
+static const struct net_device_ops mlx5e_netdev_ops_basic = {
.ndo_open= mlx5e_open,
.ndo_stop= mlx5e_close,
.ndo_start_xmit  = mlx5e_xmit,
.ndo_get_stats64 = mlx5e_get_stats,
.ndo_set_rx_mode = mlx5e_set_rx_mode,
.ndo_set_mac_address = mlx5e_set_mac,
-   .ndo_vlan_rx_add_vid = mlx5e_vlan_rx_add_vid,
-   .ndo_vlan_rx_kill_vid= mlx5e_vlan_rx_kill_vid,
+   .ndo_vlan_rx_add_vid = mlx5e_vlan_rx_add_vid,
+   .ndo_vlan_rx_kill_vid= mlx5e_vlan_rx_kill_vid,
.ndo_set_features= mlx5e_set_features,
-   .ndo_change_mtu  = mlx5e_change_mtu,
-   .ndo_do_ioctl= mlx5e_ioctl,
+   .ndo_change_mtu  = mlx5e_change_mtu,
+   .ndo_do_ioctl= mlx5e_ioctl,
+};
+
+static const struct net_device_ops mlx5e_netdev_ops_sriov = {
+   .ndo_open= mlx5e_open,
+   .ndo_stop= mlx5e_close,
+   .ndo_start_xmit  = mlx5e_xmit,
+   .ndo_get_stats64 = mlx5e_get_stats,
+   .ndo_set_rx_mode = mlx5e_set_rx_mode,
+   .ndo_set_mac_address = mlx5e_set_mac,
+   .ndo_vlan_rx_add_vid = mlx5e_vlan_rx_add_vid,
+   .ndo_vlan_rx_kill_vid= mlx5e_vlan_rx_kill_vid,
+   .ndo_set_features= mlx5e_set_features,
+   .ndo_change_mtu  = mlx5e_change_mtu,
+   .ndo_do_ioctl= mlx5e_ioctl,
+   .ndo_set_vf_mac  = mlx5e_set_vf_mac,
+   .ndo_set_vf_vlan = mlx5e_set_vf_vlan,
+   .ndo_get_vf_config   = mlx5e_get_vf_config,
+   .ndo_set_vf_link_state   = mlx5e_set_vf_link_state,
+   .ndo_get_vf_stats= mlx5e_get_vf_stats,
 };
 
 static int mlx5e_check_required_hca_cap(struct mlx5_core_dev *mdev)
@@ -2137,15 +2156,11 @@ static void mlx5e_build_netdev(struct net_device 
*netdev)
 
SET_NETDEV_DEV(netdev, &mdev->pdev->dev);
 
-   if (MLX5_CAP_GEN(mdev, vport_group_manager)) {
-   mlx5e_netdev_ops.ndo_set_vf_mac = mlx5e_set_vf_mac;
-   mlx5e_netdev_ops.ndo_set_vf_vlan = mlx5e_set_vf_vlan;
-   mlx5e_netdev_ops.ndo_get_vf_config = mlx5e_get_vf_config;
-   mlx5e_netdev_ops.ndo_set_vf_link_state = 
mlx5e_set_vf_link_state;
-   mlx5e_netdev_ops.ndo_get_vf_stats = mlx5e_get_vf_stats;
-   }
+   if (MLX5_CAP_GEN(mdev, vport_group_manager))
+   netdev->netdev_ops = &mlx5e_netdev_ops_sriov;
+   else
+   netdev->netdev_ops = &mlx5e_netdev_ops_basic;
 
-   netdev->netdev_ops= &mlx5e_netdev_ops;
netdev->watchdog_timeo= 15 * HZ;
 
netdev->ethtool_ops   = &mlx5e_ethtool_ops;
-- 
1.7.1

[PATCH net V2 0/3] mlx5 driver fixes for 4.5-rc2

2016-02-09 Thread Saeed Mahameed

We added here a patch from Matan and Alaa for addressing Linus's comments on
the mess w.r.t reserved field names in the driver/firmware auto-generated file.

Once the patch hits linus tree, we'll ask Doug to rebase his tree on that
rc so both net-next and rdma-next development for 4.6 will be done under
the fixed robust form.

Also provided two patches that address the dynamic ndo initialization
issue of mlx5e netdevice.

Or and Saeed.

changes from V1: (Only first patch was changed)
In this V we fixed the issues addressed in Or's previous e-mail.
1. Offsets took into account two dimensional u8 arrays
2. Offsets took into account nesting unions and structs
3. Offsets for unions
4. Offsets for any reserved field

Matan Barak (1):
  net/mlx5: Use offset based reserved field names in the IFC header
file

Saeed Mahameed (2):
  net/mlx5e: Remove select queue ndo initialization
  net/mlx5e: Use static constant netdevice ndos

 drivers/net/ethernet/mellanox/mlx5/core/en_main.c |   44 +-
 include/linux/mlx5/mlx5_ifc.h | 2968 ++--
 2 files changed, 1512 insertions(+), 1500 deletions(-)

[PATCH] net: ti: netcp: restore get/set_pad_info() functionality

2016-02-09 Thread Grygorii Strashko

From: Arnd Bergmann 

The commit 899077791403 ("netcp: try to reduce type confusion in descriptors")
introduces a regression in Kernel 4.5-rc1 and it breaks
get/set_pad_info() functionality.

The TI NETCP driver uses pad0 and pad1 fields of knav_dma_desc to
store DMA/MEM buffer pointer and buffer size respectively. And in both
cases for Keystone 2 the pointer type size is 32 bit regardless of
LPAE enabled or not, because CONFIG_ARCH_DMA_ADDR_T_64BIT originally
is not expected to be defined.

!LAPE   LPAE
sizeof(void*)   32bit   32bit
sizeof(dma_addr_t)  32bit   32bit
sizeof(phys_addr_t) 32bit   64bit

Unfortunately, above commit changed buffer's pointers save/restore
code (get/set_pad_info()) and added intermediate conversion to u64
which works incorrectly on 32bit Keystone 2 and causes TI NETCP driver
crash in RX/TX path due to "Unable to handle kernel NULL pointer"
exception. This issue was reported and discussed in [1].

Hence, fix it by partially reverting above commit and restoring
get/set_pad_info() functionality as it was before.

[1] https://www.mail-archive.com/netdev@vger.kernel.org/msg95361.html
Cc: Wingman Kwok 
Cc: Murali Karicheri 
Cc: Mugunthan V N 
Reported-by: Franklin S Cooper Jr 
Signed-off-by: Arnd Bergmann 
Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/netcp_core.c | 59 +++-
 1 file changed, 18 insertions(+), 41 deletions(-)

diff --git a/drivers/net/ethernet/ti/netcp_core.c 
b/drivers/net/ethernet/ti/netcp_core.c
index c61d66d..0b26e52 100644
--- a/drivers/net/ethernet/ti/netcp_core.c
+++ b/drivers/net/ethernet/ti/netcp_core.c
@@ -117,20 +117,10 @@ static void get_pkt_info(dma_addr_t *buff, u32 *buff_len, 
dma_addr_t *ndesc,
*ndesc = le32_to_cpu(desc->next_desc);
 }
 
-static void get_pad_info(u32 *pad0, u32 *pad1, u32 *pad2, struct knav_dma_desc 
*desc)
+static void get_pad_info(u32 *pad0, u32 *pad1, struct knav_dma_desc *desc)
 {
*pad0 = le32_to_cpu(desc->pad[0]);
*pad1 = le32_to_cpu(desc->pad[1]);
-   *pad2 = le32_to_cpu(desc->pad[2]);
-}
-
-static void get_pad_ptr(void **padptr, struct knav_dma_desc *desc)
-{
-   u64 pad64;
-
-   pad64 = le32_to_cpu(desc->pad[0]) +
-   ((u64)le32_to_cpu(desc->pad[1]) << 32);
-   *padptr = (void *)(uintptr_t)pad64;
 }
 
 static void get_org_pkt_info(dma_addr_t *buff, u32 *buff_len,
@@ -163,11 +153,10 @@ static void set_desc_info(u32 desc_info, u32 pkt_info,
desc->packet_info = cpu_to_le32(pkt_info);
 }
 
-static void set_pad_info(u32 pad0, u32 pad1, u32 pad2, struct knav_dma_desc 
*desc)
+static void set_pad_info(u32 pad0, u32 pad1, struct knav_dma_desc *desc)
 {
desc->pad[0] = cpu_to_le32(pad0);
desc->pad[1] = cpu_to_le32(pad1);
-   desc->pad[2] = cpu_to_le32(pad1);
 }
 
 static void set_org_pkt_info(dma_addr_t buff, u32 buff_len,
@@ -581,7 +570,6 @@ static void netcp_free_rx_desc_chain(struct netcp_intf 
*netcp,
dma_addr_t dma_desc, dma_buf;
unsigned int buf_len, dma_sz = sizeof(*ndesc);
void *buf_ptr;
-   u32 pad[2];
u32 tmp;
 
get_words(&dma_desc, 1, &desc->next_desc);
@@ -593,14 +581,12 @@ static void netcp_free_rx_desc_chain(struct netcp_intf 
*netcp,
break;
}
get_pkt_info(&dma_buf, &tmp, &dma_desc, ndesc);
-   get_pad_ptr(&buf_ptr, ndesc);
+   get_pad_info((u32 *)&buf_ptr, &buf_len, ndesc);
dma_unmap_page(netcp->dev, dma_buf, PAGE_SIZE, DMA_FROM_DEVICE);
__free_page(buf_ptr);
knav_pool_desc_put(netcp->rx_pool, desc);
}
-
-   get_pad_info(&pad[0], &pad[1], &buf_len, desc);
-   buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32));
+   get_pad_info((u32 *)&buf_ptr, &buf_len, desc);
 
if (buf_ptr)
netcp_frag_free(buf_len <= PAGE_SIZE, buf_ptr);
@@ -639,8 +625,8 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
*netcp)
dma_addr_t dma_desc, dma_buff;
struct netcp_packet p_info;
struct sk_buff *skb;
-   u32 pad[2];
void *org_buf_ptr;
+   u32 tmp;
 
dma_desc = knav_queue_pop(netcp->rx_queue, &dma_sz);
if (!dma_desc)
@@ -653,8 +639,7 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
*netcp)
}
 
get_pkt_info(&dma_buff, &buf_len, &dma_desc, desc);
-   get_pad_info(&pad[0], &pad[1], &org_buf_len, desc);
-   org_buf_ptr = (void *)(uintptr_t)(pad[0] + ((u64)pad[1] << 32));
+   get_pad_info((u32 *)&org_buf_ptr, &org_buf_len, desc);
 
if (unlikely(!org_buf_ptr)) {
dev_err(netcp->ndev_dev, "NULL bufptr in desc\n");
@@ -679,7 +664,6 @@ static int netcp_process_one_rx_packet(struct netcp_intf 
*netcp)
/* Fill in the page fragment list */
while (dma_desc) {
struct page *page;
-   void *ptr;

[PATCH net-next] net: macb: add wake-on-lan support via magic packet

2016-02-09 Thread Sergio Prado

Tested on Acqua A5 SoM (http://www.acmesystems.it/acqua).

Signed-off-by: Sergio Prado 
---
 Documentation/devicetree/bindings/net/macb.txt |  2 +
 drivers/net/ethernet/cadence/macb.c| 67 +++---
 drivers/net/ethernet/cadence/macb.h|  4 ++
 3 files changed, 67 insertions(+), 6 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/macb.txt 
b/Documentation/devicetree/bindings/net/macb.txt
index d2e243b1ec0e..c6b1cb5ffa87 100644
--- a/Documentation/devicetree/bindings/net/macb.txt
+++ b/Documentation/devicetree/bindings/net/macb.txt
@@ -25,6 +25,8 @@ Required properties:
 
 Optional properties for PHY child node:
 - reset-gpios : Should specify the gpio for phy reset
+- cdns,magic-packet : If present, indicates that the hardware supports waking
+  up via magic packet.
 
 Examples:
 
diff --git a/drivers/net/ethernet/cadence/macb.c 
b/drivers/net/ethernet/cadence/macb.c
index 50c94104f19c..69af049e55a8 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -58,6 +58,9 @@
 
 #define GEM_MTU_MIN_SIZE   68
 
+#define MACB_WOL_HAS_MAGIC_PACKET  (0x1 << 0)
+#define MACB_WOL_ENABLED   (0x1 << 1)
+
 /*
  * Graceful stop timeouts in us. We should allow up to
  * 1 frame time (10 Mbits/s, full-duplex, ignoring collisions)
@@ -2124,6 +2127,39 @@ static void macb_get_regs(struct net_device *dev, struct 
ethtool_regs *regs,
}
 }
 
+static void macb_get_wol(struct net_device *netdev, struct ethtool_wolinfo 
*wol)
+{
+   struct macb *bp = netdev_priv(netdev);
+
+   wol->supported = 0;
+   wol->wolopts = 0;
+
+   if (bp->wol & MACB_WOL_HAS_MAGIC_PACKET) {
+   wol->supported = WAKE_MAGIC;
+
+   if (bp->wol & MACB_WOL_ENABLED)
+   wol->wolopts |= WAKE_MAGIC;
+   }
+}
+
+static int macb_set_wol(struct net_device *netdev, struct ethtool_wolinfo *wol)
+{
+   struct macb *bp = netdev_priv(netdev);
+
+   if (!(bp->wol & MACB_WOL_HAS_MAGIC_PACKET) ||
+   (wol->wolopts & ~WAKE_MAGIC))
+   return -EOPNOTSUPP;
+
+   if (wol->wolopts & WAKE_MAGIC)
+   bp->wol |= MACB_WOL_ENABLED;
+   else
+   bp->wol &= ~MACB_WOL_ENABLED;
+
+   device_set_wakeup_enable(&bp->pdev->dev, bp->wol & MACB_WOL_ENABLED);
+
+   return 0;
+}
+
 static const struct ethtool_ops macb_ethtool_ops = {
.get_settings   = macb_get_settings,
.set_settings   = macb_set_settings,
@@ -2131,6 +2167,8 @@ static const struct ethtool_ops macb_ethtool_ops = {
.get_regs   = macb_get_regs,
.get_link   = ethtool_op_get_link,
.get_ts_info= ethtool_op_get_ts_info,
+   .get_wol= macb_get_wol,
+   .set_wol= macb_set_wol,
 };
 
 static const struct ethtool_ops gem_ethtool_ops = {
@@ -2890,6 +2928,11 @@ static int macb_probe(struct platform_device *pdev)
if (macb_config)
bp->jumbo_max_len = macb_config->jumbo_max_len;
 
+   bp->wol = 0;
+   if (of_get_property(np, "cdns,magic-packet", NULL))
+   bp->wol |= MACB_WOL_HAS_MAGIC_PACKET;
+   device_init_wakeup(&pdev->dev, bp->wol & MACB_WOL_HAS_MAGIC_PACKET);
+
spin_lock_init(&bp->lock);
 
/* setup capabilities */
@@ -3006,9 +3049,15 @@ static int __maybe_unused macb_suspend(struct device 
*dev)
netif_carrier_off(netdev);
netif_device_detach(netdev);
 
-   clk_disable_unprepare(bp->tx_clk);
-   clk_disable_unprepare(bp->hclk);
-   clk_disable_unprepare(bp->pclk);
+   if (bp->wol & MACB_WOL_ENABLED) {
+   macb_writel(bp, IER, MACB_BIT(WOL));
+   macb_writel(bp, WOL, MACB_BIT(MAG));
+   enable_irq_wake(bp->queues[0].irq);
+   } else {
+   clk_disable_unprepare(bp->tx_clk);
+   clk_disable_unprepare(bp->hclk);
+   clk_disable_unprepare(bp->pclk);
+   }
 
return 0;
 }
@@ -3019,9 +3068,15 @@ static int __maybe_unused macb_resume(struct device *dev)
struct net_device *netdev = platform_get_drvdata(pdev);
struct macb *bp = netdev_priv(netdev);
 
-   clk_prepare_enable(bp->pclk);
-   clk_prepare_enable(bp->hclk);
-   clk_prepare_enable(bp->tx_clk);
+   if (bp->wol & MACB_WOL_ENABLED) {
+   macb_writel(bp, IDR, MACB_BIT(WOL));
+   macb_writel(bp, WOL, 0);
+   disable_irq_wake(bp->queues[0].irq);
+   } else {
+   clk_prepare_enable(bp->pclk);
+   clk_prepare_enable(bp->hclk);
+   clk_prepare_enable(bp->tx_clk);
+   }
 
netif_device_attach(netdev);
 
diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index 0d4ecfcd60b7..9ba416d5afff 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -312,6 +312,8 @@

[net PATCH] net: Copy inner L3 and L4 headers as unaligned on GRE TEB

2016-02-09 Thread Alexander Duyck

This patch corrects the unaligned accesses seen on GRE TEB tunnels when
generating hash keys.  Specifically what this patch does is make it so that
we force the use of skb_copy_bits when the GRE inner headers will be
unaligned due to NET_IP_ALIGN being a non-zero value.

Signed-off-by: Alexander Duyck 
---

I don't have the ability to test it but this should fix flow dissector for
GRE TEB tunnels traffic seen on architectures that require network and
transport headers to be 4 byte aligned.

 net/core/flow_dissector.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 699b2c415cb0..9c181ba7263e 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -402,6 +402,13 @@ ip_proto_again:
goto out_bad;
proto = eth->h_proto;
nhoff += sizeof(*eth);
+
+   /* Cap headers that we access via pointers at the
+* end of the Ethernet header as our maximum alignment
+* at that point is only 2 bytes.
+*/
+   if (NET_IP_ALIGN)
+   hlen = nhoff;
}
 
key_control->flags |= FLOW_DIS_ENCAPSULATION;

RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality

2016-02-09 Thread David Laight

From: Grygorii Strashko
> Sent: 09 February 2016 13:58
> From: Arnd Bergmann 
> 
> The commit 899077791403 ("netcp: try to reduce type confusion in descriptors")
> introduces a regression in Kernel 4.5-rc1 and it breaks
> get/set_pad_info() functionality.
> 
> The TI NETCP driver uses pad0 and pad1 fields of knav_dma_desc to
> store DMA/MEM buffer pointer and buffer size respectively. And in both
> cases for Keystone 2 the pointer type size is 32 bit regardless of
> LAPE enabled or not, because CONFIG_ARCH_DMA_ADDR_T_64BIT originally
> is not expected to be defined.
> 
>   !LAPE   LPAE
> sizeof(void*) 32bit   32bit
> sizeof(dma_addr_t)32bit   32bit
> sizeof(phys_addr_t)   32bit   64bit
> 
> Unfortunately, above commit changed buffer's pointers save/restore
> code (get/set_pad_info()) and added intermediate conversation to u64
> which works incorrectly on 32bit Keystone 2 and causes TI NETCP driver
> crash in RX/TX path due to "Unable to handle kernel NULL pointer"
> exception. This issue was reported and discussed in [1].
> 
> Hence, fix it by partially reverting above commit and restoring
> get/set_pad_info() functionality as it was before.

You should really get rid of most of the horrid pointer-integer casts.
Code like:
>   void *buf_ptr;
...
> + get_pad_info((u32 *)&buf_ptr, &buf_len, ndesc);
is just asking for trouble.

You'd be better using assignments like:
buf_ptr = (cast)get_pad_0(ndesc);
buf_len = get_pad_1(ndesc);
Then the values are passed (and cast) as numerics.

In reality the 'pad' fields ought to be renamed - since they aren't pads.
Perhaps they should be a union?

David

Re: [net PATCH] net: Copy inner L3 and L4 headers as unaligned on GRE TEB

2016-02-09 Thread Tom Herbert

On Tue, Feb 9, 2016 at 3:14 PM, Alexander Duyck  wrote:
> This patch corrects the unaligned accesses seen on GRE TEB tunnels when
> generating hash keys.  Specifically what this patch does is make it so that
> we force the use of skb_copy_bits when the GRE inner headers will be
> unaligned due to NET_IP_ALIGNED being a non-zero value.
>
> Signed-off-by: Alexander Duyck 
> ---
>
> I don't have the ability to test it but this should fix flow dissector for
> GRE TEB tunnels traffic seen on architectures that require network and
> transport headers to be 4 byte aligned.
>
>  net/core/flow_dissector.c |7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 699b2c415cb0..9c181ba7263e 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -402,6 +402,13 @@ ip_proto_again:
> goto out_bad;
> proto = eth->h_proto;
> nhoff += sizeof(*eth);
> +
> +   /* Cap headers that we access via pointers at the
> +* end of the Ethernet header as our maximum alignment
> +* at that point is only 2 bytes.
> +*/
> +   if (NET_IP_ALIGN)
> +   hlen = nhoff;

I think this should be:

if (NET_IP_ALIGN)
 goto out_good;

> }
>
> key_control->flags |= FLOW_DIS_ENCAPSULATION;
>

Re: [PATCH] net: fec: Add "phy-reset-active-low" property to DT

2016-02-09 Thread Arnd Bergmann

On Monday 08 February 2016 22:51:38 Andrew Lunn wrote:
> On Mon, Feb 08, 2016 at 10:46:42PM +0100, Arnd Bergmann wrote:
> > On Monday 08 February 2016 21:21:13 Bernhard Walle wrote:
> > > We need that for a custom hardware that needs the reverse reset
> > > sequence.
> > > 
> > > Signed-off-by: Bernhard Walle 
> > > 
> > 
> > Why can't this be specified in the gpios property?
> 
> Backwards compatibility. The flag has always been ignored until now,
> yet various DTs have a flag value which are active low. If suddenly it
> worked, various boards would break 

Ok, I see. 

Arnd

[PATCH 2/3] net: arc_emac: reset txbd_curr and txbd_dirty pointers to zero

2016-02-09 Thread Alexander Kochetkov

The EMAC resets its internal tx ring pointer to zero at startup,
but txbd_curr and txbd_dirty can be different from zero.
That causes Ethernet transfers to hang (no packets transmitted).

In order to reproduce, run on device:
ifconfig eth0 down
ifconfig eth0 up

Signed-off-by: Alexander Kochetkov 

---
CC: sta...@vger.kernel.org # 3.18.x-
---
 drivers/net/ethernet/arc/emac_main.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/arc/emac_main.c 
b/drivers/net/ethernet/arc/emac_main.c
index 63a63e3..4f6e5be 100644
--- a/drivers/net/ethernet/arc/emac_main.c
+++ b/drivers/net/ethernet/arc/emac_main.c
@@ -447,6 +447,9 @@ static int arc_emac_open(struct net_device *ndev)
*last_rx_bd = (*last_rx_bd + 1) % RX_BD_NUM;
}
 
+   priv->txbd_curr = 0;
+   priv->txbd_dirty = 0;
+
/* Clean Tx BD's */
memset(priv->txbd, 0, TX_RING_SZ);
 
-- 
1.7.9.5

[PATCH 1/3] net: arc_emac: fix koops caused by sk_buff free

2016-02-09 Thread Alexander Kochetkov

There is a race between arc_emac_tx() and arc_emac_tx_clean().
An sk_buff can get freed by arc_emac_tx_clean() while arc_emac_tx()
is still submitting it.

In order to free sk_buff arc_emac_tx_clean() checks:
if ((info & FOR_EMAC) || !txbd->data)
break;
...
dev_kfree_skb_irq(skb);

If the condition is false, arc_emac_tx_clean() frees the sk_buff.

In order to submit a txbd, arc_emac_tx() does:
priv->tx_buff[*txbd_curr].skb = skb;
...
priv->txbd[*txbd_curr].data = cpu_to_le32(addr);
...
...  <== arc_emac_tx_clean() check condition here
...  <== (info & FOR_EMAC) is false
...  <== !txbd->data is false
...
*info = cpu_to_le32(FOR_EMAC | FIRST_OR_LAST_MASK | len);

In order to reproduce the situation,
run on device:
# iperf -s
run on host:
# iperf -t 600 -c 

[   28.396284] [ cut here ]
[   28.400912] kernel BUG at .../net/core/skbuff.c:1355!
[   28.414019] Internal error: Oops - BUG: 0 [#1] SMP ARM
[   28.419150] Modules linked in:
[   28.422219] CPU: 0 PID: 0 Comm: swapper/0 Tainted: GB   4.4.0+ 
#120
[   28.429516] Hardware name: Rockchip (Device Tree)
[   28.434216] task: c0665070 ti: c066 task.ti: c066
[   28.439622] PC is at skb_put+0x10/0x54
[   28.443381] LR is at arc_emac_poll+0x260/0x474
[   28.447821] pc : []lr : []psr: a0070113
[   28.447821] sp : c0661e58  ip : eea68502  fp : ef377000
[   28.459280] r10: 012c  r9 : f08b2000  r8 : eeb57100
[   28.464498] r7 :   r6 : ef376594  r5 : 0077  r4 : ef376000
[   28.471015] r3 : 0030488b  r2 : ef13e880  r1 : 05ee  r0 : eeb57100
[   28.477534] Flags: NzCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
[   28.484658] Control: 10c5387d  Table: 8eaf004a  DAC: 0051
[   28.490396] Process swapper/0 (pid: 0, stack limit = 0xc0660210)
[   28.496393] Stack: (0xc0661e58 to 0xc0662000)
[   28.500745] 1e40:   
0002 
[   28.508913] 1e60:  ef376520 0028 f08b23b8  ef376520 
ef7b6900 c028fc64
[   28.517082] 1e80: 2f158000 c0661ea8 c0661eb0 012c c065e900 c03bdeac 
95e9 c0662100
[   28.525250] 1ea0: c0663924 0028 c0661ea8 c0661ea8 c0661eb0 c0661eb0 
001e c066
[   28.533417] 1ec0: 4003 0008 c0695a00 000a c066208c 0100 
c0661ee0 c0027410
[   28.541584] 1ee0: ef0fb700 2f158000 0020 95e8 0004 c0662100 
c0662080 0003
[   28.549751] 1f00:    c065b45c 001e ef005000 
c0647a30 
[   28.557919] 1f20:  c0027798  c005cf40 f0802100 c0662ffc 
c0661f60 f0803100
[   28.566088] 1f40: c0661fb8 c00093bc c000ffb4 60070013  c0661f94 
c0661fb8 c00137d4
[   28.574267] 1f60: 0001   c001ffa0  c066 
 c065a364
[   28.582441] 1f80: c0661fb8 c0647a30    c0661fb0 
c000ffb0 c000ffb4
[   28.590608] 1fa0: 60070013  0051   c005496c 
c0662400 c061bc40
[   28.598776] 1fc0:    c061b680  c0647a30 
 c0695294
[   28.606943] 1fe0: c0662488 c0647a2c c066619c 6000406a 413fc090 6000807c 
 
[   28.615127] [] (skb_put) from [] (0xef376520)
[   28.621218] Code: e5902054 e590c090 e352 0a00 (e7f001f2)
[   28.627307] ---[ end trace 4824734e2243fdb6 ]---

[   34.377068] Internal error: Oops: 17 [#1] SMP ARM
[   34.382854] Modules linked in:
[   34.385947] CPU: 0 PID: 3 Comm: ksoftirqd/0 Not tainted 4.4.0+ #120
[   34.392219] Hardware name: Rockchip (Device Tree)
[   34.396937] task: ef02d040 ti: ef05c000 task.ti: ef05c000
[   34.402376] PC is at __dev_kfree_skb_irq+0x4/0x80
[   34.407121] LR is at arc_emac_poll+0x130/0x474
[   34.411583] pc : []lr : []psr: 60030013
[   34.411583] sp : ef05de68  ip : 0008e83c  fp : ef377000
[   34.423062] r10: c001bec4  r9 :   r8 : f08b24c8
[   34.428296] r7 : f08b2400  r6 : 0075  r5 : 0019  r4 : ef376000
[   34.434827] r3 : 0006  r2 : 0042  r1 : 0001  r0 : 
[   34.441365] Flags: nZCv  IRQs on  FIQs on  Mode SVC_32  ISA ARM  Segment none
[   34.448507] Control: 10c5387d  Table: 8f25c04a  DAC: 0051
[   34.454262] Process ksoftirqd/0 (pid: 3, stack limit = 0xef05c210)
[   34.460449] Stack: (0xef05de68 to 0xef05e000)
[   34.464827] de60:   ef376000 c028fd94  c0669480 
c0669480 ef376520
[   34.473022] de80: 0028 0001 2ae4 ef376520 ef7b6900 c028fc64 
2f158000 ef05dec0
[   34.481215] dea0: ef05dec8 012c c065e900 c03bdeac 983f c0662100 
c0663924 0028
[   34.489409] dec0: ef05dec0 ef05dec0 ef05dec8 ef05dec8 ef7b6000 ef05c000 
4003 0008
[   34.497600] dee0: c0695a00 000a c066208c 0100 ef05def8 c0027410 
ef7b6000 4000
[   34.505795] df00: 04208040 983e 0004 c0662100 c0662080 0003 
ef05c000 ef027340
[   34.513985] df20: ef05c000 c0666c2c  0001 0002  
 c0027568
[   34.522176] df40: ef0273

[PATCH 3/3] net: arc_emac: fix sk_buff leak

2016-02-09 Thread Alexander Kochetkov

The EMAC could be disabled while some sk_buffs are still
in use. Those buffers are then lost to Linux.

In order to reproduce, run on the device while Ethernet traffic is active:
ifconfig eth0 down

Signed-off-by: Alexander Kochetkov 

---
CC: sta...@vger.kernel.org # 3.18.x-
---
 drivers/net/ethernet/arc/emac_main.c |   62 ++
 1 file changed, 62 insertions(+)

diff --git a/drivers/net/ethernet/arc/emac_main.c 
b/drivers/net/ethernet/arc/emac_main.c
index 4f6e5be..6446af1 100644
--- a/drivers/net/ethernet/arc/emac_main.c
+++ b/drivers/net/ethernet/arc/emac_main.c
@@ -518,6 +518,64 @@ static void arc_emac_set_rx_mode(struct net_device *ndev)
 }
 
 /**
+ * arc_free_tx_queue - free skb from tx queue
+ * @ndev:  Pointer to the network device.
+ *
+ * This function must be called while EMAC disable
+ */
+static void arc_free_tx_queue(struct net_device *ndev)
+{
+   struct arc_emac_priv *priv = netdev_priv(ndev);
+   unsigned int i;
+
+   for (i = 0; i < TX_BD_NUM; i++) {
+   struct arc_emac_bd *txbd = &priv->txbd[i];
+   struct buffer_state *tx_buff = &priv->tx_buff[i];
+
+   if (tx_buff->skb) {
+   dma_unmap_single(&ndev->dev, dma_unmap_addr(tx_buff, 
addr),
+dma_unmap_len(tx_buff, len), 
DMA_TO_DEVICE);
+
+   /* return the sk_buff to system */
+   dev_kfree_skb_irq(tx_buff->skb);
+   }
+
+   txbd->info = 0;
+   txbd->data = 0;
+   tx_buff->skb = NULL;
+   }
+}
+
+/**
+ * arc_free_rx_queue - free skb from rx queue
+ * @ndev:  Pointer to the network device.
+ *
+ * This function must be called while EMAC disable
+ */
+static void arc_free_rx_queue(struct net_device *ndev)
+{
+   struct arc_emac_priv *priv = netdev_priv(ndev);
+   unsigned int i;
+
+   for (i = 0; i < RX_BD_NUM; i++) {
+   struct arc_emac_bd *rxbd = &priv->rxbd[i];
+   struct buffer_state *rx_buff = &priv->rx_buff[i];
+
+   if (rx_buff->skb) {
+   dma_unmap_single(&ndev->dev, dma_unmap_addr(rx_buff, 
addr),
+   dma_unmap_len(rx_buff, len), 
DMA_FROM_DEVICE);
+
+   /* return the sk_buff to system */
+   dev_kfree_skb_irq(rx_buff->skb);
+   }
+
+   rxbd->info = 0;
+   rxbd->data = 0;
+   rx_buff->skb = NULL;
+   }
+}
+
+/**
  * arc_emac_stop - Close the network device.
  * @ndev:  Pointer to the network device.
  *
@@ -538,6 +596,10 @@ static int arc_emac_stop(struct net_device *ndev)
/* Disable EMAC */
arc_reg_clr(priv, R_CTRL, EN_MASK);
 
+   /* Return the sk_buff to system */
+   arc_free_tx_queue(ndev);
+   arc_free_rx_queue(ndev);
+
return 0;
 }
 
-- 
1.7.9.5

Fixes for rockchip EMAC

2016-02-09 Thread Alexander Kochetkov

Hello!

Here is a set of 3 patches that fix a kernel oops, a memory leak and
a rockchip EMAC hang. Tested on radxarock lite.

[PATCH 1/3] net: arc_emac: fix koops caused by sk_buff free
[PATCH 2/3] net: arc_emac: reset txbd_curr and txbd_dirty pointers
[PATCH 3/3] net: arc_emac: fix sk_buff leak

Re: [net PATCH] net: Copy inner L3 and L4 headers as unaligned on GRE TEB

2016-02-09 Thread Alexander Duyck

On Tue, Feb 9, 2016 at 3:33 PM, Tom Herbert  wrote:
> On Tue, Feb 9, 2016 at 3:14 PM, Alexander Duyck  wrote:
>> This patch corrects the unaligned accesses seen on GRE TEB tunnels when
>> generating hash keys.  Specifically what this patch does is make it so that
>> we force the use of skb_copy_bits when the GRE inner headers will be
>> unaligned due to NET_IP_ALIGNED being a non-zero value.
>>
>> Signed-off-by: Alexander Duyck 
>> ---
>>
>> I don't have the ability to test it but this should fix flow dissector for
>> GRE TEB tunnels traffic seen on architectures that require network and
>> transport headers to be 4 byte aligned.
>>
>>  net/core/flow_dissector.c |7 +++
>>  1 file changed, 7 insertions(+)
>>
>> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
>> index 699b2c415cb0..9c181ba7263e 100644
>> --- a/net/core/flow_dissector.c
>> +++ b/net/core/flow_dissector.c
>> @@ -402,6 +402,13 @@ ip_proto_again:
>> goto out_bad;
>> proto = eth->h_proto;
>> nhoff += sizeof(*eth);
>> +
>> +   /* Cap headers that we access via pointers at the
>> +* end of the Ethernet header as our maximum 
>> alignment
>> +* at that point is only 2 bytes.
>> +*/
>> +   if (NET_IP_ALIGN)
>> +   hlen = nhoff;
>
> I think this should be:
>
> if (NET_IP_ALIGN)
>  goto out_good;
>

That is no good since we already updated proto with the inner header
protocol value.  I would prefer to parse the entire header and just
keep the behavior consistent between IP aligned and DMA aligned
systems.

The only change in behavior is that the reported header length from
the function eth_get_headlen will only pull to the end of the Ethernet
header since we are now only reporting the aligned IP header length.

Otherwise if we need to exit we should probably exit with "goto out_bad"

- Alex

Re: [net PATCH] net: Copy inner L3 and L4 headers as unaligned on GRE TEB

2016-02-09 Thread Tom Herbert

On Tue, Feb 9, 2016 at 3:14 PM, Alexander Duyck  wrote:
> This patch corrects the unaligned accesses seen on GRE TEB tunnels when
> generating hash keys.  Specifically what this patch does is make it so that
> we force the use of skb_copy_bits when the GRE inner headers will be
> unaligned due to NET_IP_ALIGNED being a non-zero value.
>
> Signed-off-by: Alexander Duyck 

Acked-by: Tom Herbert 

> ---
>
> I don't have the ability to test it but this should fix flow dissector for
> GRE TEB tunnels traffic seen on architectures that require network and
> transport headers to be 4 byte aligned.
>
>  net/core/flow_dissector.c |7 +++
>  1 file changed, 7 insertions(+)
>
> diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
> index 699b2c415cb0..9c181ba7263e 100644
> --- a/net/core/flow_dissector.c
> +++ b/net/core/flow_dissector.c
> @@ -402,6 +402,13 @@ ip_proto_again:
> goto out_bad;
> proto = eth->h_proto;
> nhoff += sizeof(*eth);
> +
> +   /* Cap headers that we access via pointers at the
> +* end of the Ethernet header as our maximum alignment
> +* at that point is only 2 bytes.
> +*/
> +   if (NET_IP_ALIGN)
> +   hlen = nhoff;
> }
>
> key_control->flags |= FLOW_DIS_ENCAPSULATION;
>

RE: [PATCH net-next] hv_netvsc: Increase delay for RNDIS_STATUS_NETWORK_CHANGE

2016-02-09 Thread Haiyang Zhang



> -Original Message-
> From: David Miller [mailto:da...@davemloft.net]
> Sent: Tuesday, February 9, 2016 5:05 AM
> To: Haiyang Zhang 
> Cc: netdev@vger.kernel.org; KY Srinivasan ;
> o...@aepfle.de; vkuzn...@redhat.com; linux-ker...@vger.kernel.org;
> driverdev-de...@linuxdriverproject.org
> Subject: Re: [PATCH net-next] hv_netvsc: Increase delay for
> RNDIS_STATUS_NETWORK_CHANGE
> 
> From: Haiyang Zhang 
> Date: Tue,  2 Feb 2016 16:15:56 -0800
> 
> > We simulates a link down period for RNDIS_STATUS_NETWORK_CHANGE
> > message to trigger DHCP renew. User daemons may need multiple seconds
> > to trigger the link down event. (e.g. ifplugd: 5sec, network-manager:
> > 4sec.) So update this link down period to 10 sec to properly trigger DHCP
> renew.
> >
> > Signed-off-by: Haiyang Zhang 
> 
> Two things look really bad about this to me:
> 
> 1) Any value you choose is arbitrary.  If some new network configuration
> daemon
>is slower, you will have to change this value again.
> 
>This is _NOT_ sustainable in the long term.
> 
> 2) It is completely unclear to me why this driver needs to delay at all or
>wait for anything.  I see no other driver having to deal with this issue.
> 
> Until you address both of these points I am not going to apply this patch.

1) I share your concern as well. Is there a universal way to immediately 
trigger 
DHCP renew of all current and future daemons with a single event from kernel? 
If not, can we put the delay (RNDIS_STATUS_NETWORK_CHANGE only) into a 
tunable variable of this driver?

2) We used to have the call_usermodehelper "/etc/init.d/network restart" to 
trigger DHCP renew. In commit 27a70af3f4, Vitaly has replaced it with the 
current 
code that updates the link status with at least 2 seconds interval, so that the 
"link_watch infrastructure" can send notification out. link_watch 
infrastructure 
only sends one notification per second.

Thanks,
- Haiyang

Re: [PATCH net-next 2/2] mpls: allow TTL propagation to/from IP packets to be configured

2016-02-09 Thread Robert Shearman


On 06/02/16 18:36, Eric W. Biederman wrote:

Robert Shearman  writes:


It is sometimes desirable to present an MPLS transport network as a
single hop to traffic transiting it because it prevents confusion when
diagnosing failures. An example of where confusion can be generated is
when addresses used in the provider network overlap with addresses in
the overlay network and the addresses get exposed through ICMP errors
generated as packets transit the provider network.


The configuration you are talking about is a bug.  ICMP errors can
be handled without confusion simply by forwarding the packets out
to the end of the tunnel.  Which is how the standards require that
situation to be handled if an ICMP error is generated.


You're absolutely right that the standards say how the ICMP errors 
should be handled in order for them to be forwarded correctly back to 
the sender, but I'm referring to what source addresses customers of 
service provider see in those ICMP errors generated when e.g. doing a 
traceroute. Furthermore, the mechanism that you mention adds scope
for mis-diagnosis since a traceroute won't show any information for hops 
PE1, P1 and P2 if PE2 is dropping the traffic for that LSP (because the 
mechanism you describe relies on PE2 or even a further CE to hairpin the 
ICMP error back to the originator of the error-causing traffic).


If you need further evidence that this is something that network 
operators might want to do, then see RFC 3032, s2.4.3 where it states:


   It is recognized that there may be situations where a network
   administration prefers to decrement the IPv4 TTL by one as it
   traverses an MPLS domain, instead of decrementing the IPv4 TTL by the
   number of LSP hops within the domain.

And one more reference is that this behaviour is codified in RFC 3443. 
For the purposes of clarity, Uniform Model in RFC 3443 corresponds to 
ip_ttl_propagate = 1 (default) and (Short) Pipe Model corresponds to 
ip_ttl_propagate = 0.





Therefore, provide the ability to control whether the TTL value from
an MPLS packet is propagated to an IPv4/IPv6 packet when the last
label is popped through the addition of a new per-namespace sysctl:
"net.mpls.ip_ttl_propagate" which defaults to enabled.

Use the same sysctl to control whether the TTL is propagated from IP
packets into the MPLS header. If the TTL isn't propagated then a
default TTL value is used which can be configured via a new sysctl:
"net.mpls.default_ttl".


Ugh.  There is a case for this, but this feels much more like a per
tunnel/label/route egress property not a per network interface property.

I don't recall all of the gory details but some flavors of mpls labels
always require ttl propagation (the ip over mpls default) and some
flavors of mpls labels always require no propagation (pseudo wires).


Clearly, if the label isn't used for the purposes of encapsulating L3 
traffic, then you can't propagate the L3 TTL into it and you have to put 
some other value in there instead. I envisaged that the value of 
default_ttl would be used in these cases and this is why I worded the 
documentation for the default_ttl sysctl like so:


Default TTL value to use for MPLS packets where it cannot be
propagated from an IP header, either because one isn't present
or ip_ttl_propagate has been disabled.

Given that traffic arriving with a pseudo-wire label will have to be 
forwarded differently from traffic arriving for labels with L3 traffic, 
you will know that the label is associated with L2 traffic and that the 
TTL cannot be propagated.



There may be something cute in between.  For something that is a per
tunnel property I don't feel comfortable with a sysctl.


I cannot think of a use-case where it would make sense to have a mix of 
TTL being propagated and not being propagated on a per-LSP basis. I note 
that all of the most widely used proprietary MPLS implementations 
support global IP TTL propagation configuration and I'm not aware of any 
MPLS implementation that implements a per-LSP control for IP TTL 
propagation.



Especially when it is something as potentially dangerous as enabling
packets to loop in a network.  As I recall most IP over IP tunnels
also propagate the ttl between the inner and outer ip packets to prevent
loops.


There is no possibility of packets looping in a network as the TTL is 
always decremented when a label is pushed, whether the packet came in as 
IP or MPLS, and when swapping a label egress TTL must be one less than 
the ingress TTL, as defined by the MPLS RFC. When popping the last label 
we have to ensure that the MPLS TTL is not propagated to IP TTL so that 
there's no possibility of setting the IP TTL beyond the value it entered the
LSP (after the TTL decrement done as part of IP switching) with, but 
that is what this code does. Note that this is only the case if all 
routers are configured to not propagate the TTL, but the network 
operator can ensure that - if they don

RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality

2016-02-09 Thread David Laight

From: Karicheri, Muralidharan
> Sent: 09 February 2016 16:10
...
> >In reality the 'pad' fields ought to be renamed - since they aren't pads.
> >Perhaps they should be a union?

> No. At the end of the descriptor, host software can add scratchpad which is
> not modified by the hardware, but is used by the driver. So please don't
> rename.

So comment in the definition that the hardware doesn't modify them.
The driver is defining these fields and they are definitely NOT padding.

David

RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality

2016-02-09 Thread Karicheri, Muralidharan

>-Original Message-
>From: David Laight [mailto:david.lai...@aculab.com]
>Sent: Tuesday, February 09, 2016 9:13 AM
>To: Strashko, Grygorii; netdev@vger.kernel.org; David S . Miller; Arnd Bergmann
>Cc: Cooper Jr., Franklin; Nori, Sekhar; linux-ker...@vger.kernel.org; Kwok, 
>WingMan;
>Karicheri, Muralidharan; N, Mugunthan V
>Subject: RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality
>
>From: Grygorii Strashko
>> Sent: 09 February 2016 13:58
>> From: Arnd Bergmann 
>>
>> The commit 899077791403 ("netcp: try to reduce type confusion in
>> descriptors") introduces a regression in Kernel 4.5-rc1 and it breaks
>> get/set_pad_info() functionality.
>>
>> The TI NETCP driver uses pad0 and pad1 fields of knav_dma_desc to
>> store DMA/MEM buffer pointer and buffer size respectively. And in both
>> cases for Keystone 2 the pointer type size is 32 bit regardless of
>> LAPE enabled or not, because CONFIG_ARCH_DMA_ADDR_T_64BIT originally
>> is not expected to be defined.
>>
>>  !LAPE   LPAE
>> sizeof(void*)32bit   32bit
>> sizeof(dma_addr_t)   32bit   32bit
>> sizeof(phys_addr_t)  32bit   64bit
>>
>> Unfortunately, above commit changed buffer's pointers save/restore
>> code (get/set_pad_info()) and added intermediate conversation to u64
>> which works incorrectly on 32bit Keystone 2 and causes TI NETCP driver
>> crash in RX/TX path due to "Unable to handle kernel NULL pointer"
>> exception. This issue was reported and discussed in [1].
>>
>> Hence, fix it by partially reverting above commit and restoring
>> get/set_pad_info() functionality as it was before.
>
>You should really get rid of most of the horrid pointer-integer casts.
>Code like:
>>  void *buf_ptr;
>...
>> +get_pad_info((u32 *)&buf_ptr, &buf_len, ndesc);
>is just asking for trouble.
>
>You'd be better using assignments like:
>   buf_ptr = (cast)get_pad_0(ndesc);
>   buf_len = get_pad_1(ndesc);
>Then the values are passed (and cast) as numerics.
>
>In reality the 'pad' fields ought to be renamed - since they aren't pads.
>Perhaps they should be a union?
No. At the end of the descriptor, host software can add scratchpad which is
not modified by the hardware, but is used by the driver. So please don't
rename.

Murali
>
>   David

RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality

2016-02-09 Thread Karicheri, Muralidharan



Murali Karicheri
Linux Kernel, Software Development


>-Original Message-
>From: David Laight [mailto:david.lai...@aculab.com]
>Sent: Tuesday, February 09, 2016 11:10 AM
>To: Karicheri, Muralidharan; Strashko, Grygorii; netdev@vger.kernel.org; David 
>S . Miller;
>Arnd Bergmann
>Cc: Cooper Jr., Franklin; Nori, Sekhar; linux-ker...@vger.kernel.org; Kwok, 
>WingMan; N,
>Mugunthan V
>Subject: RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality
>
>From: Karicheri, Muralidharan
>> Sent: 09 February 2016 16:10
>...
>> >In reality the 'pad' fields ought to be renamed - since they aren't pads.
>> >Perhaps they should be a union?
>
>> No. At the end of the descriptor, host software can add scratchpad
>> which is not modified by the hardware, but is used by the driver. So
>> please don't rename.
>
>So comment in the definition that the hardware doesn't modify them.
>The driver is defining these fields and they are definitely NOT padding.


It is scratch pad, not padding. Looks like pad is a confusing name. Can be
renamed to sw_data to be in sync with spec below.

The hardware spec from 
http://www.ti.com/lit/ug/sprugr9h/sprugr9h.pdf

The other SW data portion of the descriptor exists after all of the defined
words and is reserved for use by the host software to store completely
private data. This region is not used in any way by the DMA or queue manager
modules in a Multicore Navigator system and these modules will not modify
any bytes within this region.

Murali
>
>   David

Pushing AF_RXRPC rewrite patches to net/next

2016-02-09 Thread David Howells

Hi Dave,

I've split 23 patches out of my AF_RXRPC rewrite so far.  If you look here:


http://git.kernel.org/cgit/linux/kernel/git/dhowells/linux-fs.git/log/?h=rxrpc-experimental

the penultimate patch is the remnant rewrite, the 23 patches below that are
ones I've split out of it.

Are you okay with me sending some of them your way now so that you don't get a
host all in one go, or would you prefer the mass invasion approach?

Note that I've moved things between files in some patches.  Where possible
I've made a patch that *just* moves from one file to another and then the next
patch modifies the new file.

Unless you're willing to take the remnant patch whole (which would be lovely
as splitting the patch is way more work than doing the rewrite, but I deem
unlikely), I still have some more splits I can make in that.

David

[PATCH net v2 2/3] geneve: Relax MTU constraints

2016-02-09 Thread David Wragg

Allow the MTU of geneve devices to be set to large values, in order to
exploit underlying networks with larger frame sizes.

GENEVE does not have a fixed encapsulation overhead (an openvswitch
rule can add variable length options), so there is no relevant maximum
MTU to enforce.  A maximum of IP_MAX_MTU is used instead.
Encapsulated packets that are too big for the underlying network will
get dropped on the floor.

Signed-off-by: David Wragg 
---
 drivers/net/geneve.c | 12 +++-
 1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 0b14ac3..05cef11 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1039,6 +1039,16 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, 
struct net_device *dev)
return geneve_xmit_skb(skb, dev, info);
 }
 
+static int geneve_change_mtu(struct net_device *dev, int new_mtu)
+{
+   /* GENEVE overhead is not fixed, so we can't enforce a more
+  precise max MTU. */
+   if (new_mtu < 68 || new_mtu > IP_MAX_MTU)
+   return -EINVAL;
+   dev->mtu = new_mtu;
+   return 0;
+}
+
 static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff 
*skb)
 {
struct ip_tunnel_info *info = skb_tunnel_info(skb);
@@ -1083,7 +1093,7 @@ static const struct net_device_ops geneve_netdev_ops = {
.ndo_stop   = geneve_stop,
.ndo_start_xmit = geneve_xmit,
.ndo_get_stats64= ip_tunnel_get_stats64,
-   .ndo_change_mtu = eth_change_mtu,
+   .ndo_change_mtu = geneve_change_mtu,
.ndo_validate_addr  = eth_validate_addr,
.ndo_set_mac_address= eth_mac_addr,
.ndo_fill_metadata_dst  = geneve_fill_metadata_dst,
-- 
2.5.0

[PATCH net v2 0/3] Set a large MTU on ovs-created tunnel devices

2016-02-09 Thread David Wragg

Prior to 4.3, tunnel vports (vxlan, gre and geneve) could transmit
vxlan packets of any size, constrained only by the ability to send out
the resulting packets.  4.3 introduced netdevs corresponding to tunnel
vports.  These netdevs have an MTU, which limits the size of a packet
that can be successfully encapsulated.  The default values for the MTUs
are low (1500 or less), which is awkwardly small in the context of
physical networks supporting jumbo frames, and leads to a conspicuous
change in behaviour for userspace.

This patch series sets the MTU on openvswitch-created netdevs to be
the relevant maximum (i.e. the maximum IP packet size minus any
relevant overhead), effectively restoring the behaviour prior to 4.3.

Where appropriate, the limits on MTU values when set on the netdevs
directly are also relaxed.

Changes in v2:
* Extend to all openvswitch tunnel types, i.e. gre and geneve as well
* Use IP_MAX_MTU

David Wragg (3):
  vxlan: Relax the MTU constraints
  geneve: Relax MTU constraints
  vxlan, gre, geneve: Set a large MTU on ovs-created tunnel devices

 drivers/net/geneve.c  | 29 +-
 drivers/net/vxlan.c   | 47 ++-
 include/net/ip_tunnels.h  |  1 +
 net/ipv4/ip_gre.c |  7 +++
 net/ipv4/ip_tunnel.c  | 21 ---
 net/openvswitch/vport-vxlan.c |  2 ++
 6 files changed, 85 insertions(+), 22 deletions(-)

-- 
2.5.0

[PATCH net v2 1/3] vxlan: Relax the MTU constraints

2016-02-09 Thread David Wragg

Allow the MTU of vxlan devices without an underlying device to be set
to larger values (up to a maximum based on IP packet limits and vxlan
overhead).

Previously, their MTUs could not be set to higher than the
conventional ethernet value of 1500.  This is a very arbitrary value
in the context of vxlan, and prevented vxlan devices from being able
to take advantage of jumbo frames etc.

The default MTU remains 1500, for compatibility.

Signed-off-by: David Wragg 
Acked-by: Roopa Prabhu 
---
 drivers/net/vxlan.c | 36 +---
 1 file changed, 25 insertions(+), 11 deletions(-)

diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index 6543918..e992c6a 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2367,29 +2367,43 @@ static void vxlan_set_multicast_list(struct net_device 
*dev)
 {
 }
 
-static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
+static int __vxlan_change_mtu(struct net_device *dev,
+ struct net_device *lowerdev,
+ struct vxlan_rdst *dst, int new_mtu, bool strict)
 {
-   struct vxlan_dev *vxlan = netdev_priv(dev);
-   struct vxlan_rdst *dst = &vxlan->default_dst;
-   struct net_device *lowerdev;
-   int max_mtu;
+   int max_mtu = IP_MAX_MTU;
 
-   lowerdev = __dev_get_by_index(vxlan->net, dst->remote_ifindex);
-   if (lowerdev == NULL)
-   return eth_change_mtu(dev, new_mtu);
+   if (lowerdev)
+   max_mtu = lowerdev->mtu;
 
if (dst->remote_ip.sa.sa_family == AF_INET6)
-   max_mtu = lowerdev->mtu - VXLAN6_HEADROOM;
+   max_mtu -= VXLAN6_HEADROOM;
else
-   max_mtu = lowerdev->mtu - VXLAN_HEADROOM;
+   max_mtu -= VXLAN_HEADROOM;
 
-   if (new_mtu < 68 || new_mtu > max_mtu)
+   if (new_mtu < 68)
return -EINVAL;
 
+   if (new_mtu > max_mtu) {
+   if (strict)
+   return -EINVAL;
+
+   new_mtu = max_mtu;
+   }
+
dev->mtu = new_mtu;
return 0;
 }
 
+static int vxlan_change_mtu(struct net_device *dev, int new_mtu)
+{
+   struct vxlan_dev *vxlan = netdev_priv(dev);
+   struct vxlan_rdst *dst = &vxlan->default_dst;
+   struct net_device *lowerdev = __dev_get_by_index(vxlan->net,
+dst->remote_ifindex);
+   return __vxlan_change_mtu(dev, lowerdev, dst, new_mtu, true);
+}
+
 static int egress_ipv4_tun_info(struct net_device *dev, struct sk_buff *skb,
struct ip_tunnel_info *info,
__be16 sport, __be16 dport)
-- 
2.5.0

[PATCH net v2 3/3] vxlan, gre, geneve: Set a large MTU on ovs-created tunnel devices

2016-02-09 Thread David Wragg

Prior to 4.3, tunnel vports (vxlan, gre and geneve) could transmit
vxlan packets of any size, constrained only by the ability to send out
the resulting packets.  4.3 introduced netdevs corresponding to tunnel
vports.  These netdevs have an MTU, which limits the size of a packet
that can be successfully encapsulated.  The default values for the MTUs
are low (1500 or less), which is awkwardly small in the context of
physical networks supporting jumbo frames, and leads to a conspicuous
change in behaviour for userspace.

Instead, set the MTU on openvswitch-created netdevs to be the relevant
maximum (i.e. the maximum IP packet size minus any relevant overhead),
effectively restoring the behaviour prior to 4.3.

Signed-off-by: David Wragg 
---
 drivers/net/geneve.c  | 17 +
 drivers/net/vxlan.c   | 11 ---
 include/net/ip_tunnels.h  |  1 +
 net/ipv4/ip_gre.c |  7 +++
 net/ipv4/ip_tunnel.c  | 21 ++---
 net/openvswitch/vport-vxlan.c |  2 ++
 6 files changed, 49 insertions(+), 10 deletions(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 05cef11..9965714 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1452,11 +1452,20 @@ struct net_device *geneve_dev_create_fb(struct net 
*net, const char *name,
 
err = geneve_configure(net, dev, &geneve_remote_unspec,
   0, 0, 0, htons(dst_port), true, 0);
-   if (err) {
-   free_netdev(dev);
-   return ERR_PTR(err);
-   }
+   if (err)
+   goto err;
+
+   /* openvswitch users expect packet sizes to be unrestricted,
+  so set the largest MTU we can. */
+   err = geneve_change_mtu(dev, IP_MAX_MTU);
+   if (err)
+   goto err;
+
return dev;
+
+ err:
+   free_netdev(dev);
+   return ERR_PTR(err);
 }
 EXPORT_SYMBOL_GPL(geneve_dev_create_fb);
 
diff --git a/drivers/net/vxlan.c b/drivers/net/vxlan.c
index e992c6a..a31cd95 100644
--- a/drivers/net/vxlan.c
+++ b/drivers/net/vxlan.c
@@ -2779,6 +2779,7 @@ static int vxlan_dev_configure(struct net *src_net, 
struct net_device *dev,
int err;
bool use_ipv6 = false;
__be16 default_port = vxlan->cfg.dst_port;
+   struct net_device *lowerdev = NULL;
 
vxlan->net = src_net;
 
@@ -2799,9 +2800,7 @@ static int vxlan_dev_configure(struct net *src_net, 
struct net_device *dev,
}
 
if (conf->remote_ifindex) {
-   struct net_device *lowerdev
-= __dev_get_by_index(src_net, conf->remote_ifindex);
-
+   lowerdev = __dev_get_by_index(src_net, conf->remote_ifindex);
dst->remote_ifindex = conf->remote_ifindex;
 
if (!lowerdev) {
@@ -2825,6 +2824,12 @@ static int vxlan_dev_configure(struct net *src_net, 
struct net_device *dev,
needed_headroom = lowerdev->hard_header_len;
}
 
+   if (conf->mtu) {
+   err = __vxlan_change_mtu(dev, lowerdev, dst, conf->mtu, false);
+   if (err)
+   return err;
+   }
+
if (use_ipv6 || conf->flags & VXLAN_F_COLLECT_METADATA)
needed_headroom += VXLAN6_HEADROOM;
else
diff --git a/include/net/ip_tunnels.h b/include/net/ip_tunnels.h
index 6db96ea..dda9abf 100644
--- a/include/net/ip_tunnels.h
+++ b/include/net/ip_tunnels.h
@@ -230,6 +230,7 @@ void ip_tunnel_xmit(struct sk_buff *skb, struct net_device 
*dev,
 int ip_tunnel_ioctl(struct net_device *dev, struct ip_tunnel_parm *p, int cmd);
 int ip_tunnel_encap(struct sk_buff *skb, struct ip_tunnel *t,
u8 *protocol, struct flowi4 *fl4);
+int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict);
 int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu);
 
 struct rtnl_link_stats64 *ip_tunnel_get_stats64(struct net_device *dev,
diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 7c51c4e..057806d 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -1240,6 +1240,13 @@ struct net_device *gretap_fb_dev_create(struct net *net, 
const char *name,
err = ipgre_newlink(net, dev, tb, NULL);
if (err < 0)
goto out;
+
+   /* openvswitch users expect packet sizes to be unrestricted,
+  so set the largest MTU we can. */
+   err = __ip_tunnel_change_mtu(dev, IP_MAX_MTU, false);
+   if (err)
+   goto out;
+
return dev;
 out:
free_netdev(dev);
diff --git a/net/ipv4/ip_tunnel.c b/net/ipv4/ip_tunnel.c
index c7bd72e..49504ed 100644
--- a/net/ipv4/ip_tunnel.c
+++ b/net/ipv4/ip_tunnel.c
@@ -943,17 +943,32 @@ done:
 }
 EXPORT_SYMBOL_GPL(ip_tunnel_ioctl);
 
-int ip_tunnel_change_mtu(struct net_device *dev, int new_mtu)
+int __ip_tunnel_change_mtu(struct net_device *dev, int new_mtu, bool strict)
 {
struct ip_tunnel *tunnel = netdev_priv(dev);
int t_hlen = tunnel->hlen + sizeof(struct iphd

RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality

2016-02-09 Thread David Laight

From: Karicheri, Muralidharan
> Sent: 09 February 2016 16:19
> >...
> >> >In reality the 'pad' fields ought to be renamed - since they aren't pads.
> >> >Perhaps they should be a union?
> >
> >> No. At the end of the descriptor, host software can add scratchpad
> >> which is not modified by the hardware, but is used by the driver. So
> >> please don't rename.
> >
> >So comment in the definition that the hardware doesn't modify them.
> >The driver is defining these fields and they are definitely NOT padding.
> 
> 
> It is scratch pad, not padding. Looks like pad is a confusing name. Can be
> renamed to sw_data to be in sync with spec below.
> 
> The hardware spec from
> http://www.ti.com/lit/ug/sprugr9h/sprugr9h.pdf
> 
> The other SW data portion of the descriptor exists after all of the defined
> words and is reserved for use by the host software to store completely
> private data. This region is not used in any way by the DMA or queue manager
> modules in a Multicore Navigator system and these modules will not modify
> any bytes within this region.

Right, so comment that the hardware doesn't look at the fields.
But name/type the structure fields to indicate what they contain.
Maybe sw_buf_len etc - but I suspect there are much more meaningful names.

David

RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality

2016-02-09 Thread Karicheri, Muralidharan

>-Original Message-
>From: David Laight [mailto:david.lai...@aculab.com]
>Sent: Tuesday, February 09, 2016 11:38 AM
>To: Karicheri, Muralidharan; Strashko, Grygorii; netdev@vger.kernel.org; David 
>S . Miller;
>Arnd Bergmann
>Cc: Cooper Jr., Franklin; Nori, Sekhar; linux-ker...@vger.kernel.org; Kwok, 
>WingMan; N,
>Mugunthan V
>Subject: RE: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality
>
>From: Karicheri, Muralidharan
>> Sent: 09 February 2016 16:19
>> >...
>> >> >In reality the 'pad' fields ought to be renamed - since they aren't pads.
>> >> >Perhaps they should be a union?
>> >
>> >> No. At the end of the descriptor, host software can add scratchpad
>> >> which is not modified by the hardware, but is used by the driver.
>> >> So please don't rename.
>> >
>> >So comment in the definition that the hardware doesn't modify them.
>> >The driver is defining these fields and they are definitely NOT padding.
>>
>>
>> It is scratch pad, not padding. Looks like pad is a confusing name.
>> Can be renamed to sw_data to be in sync with spec below.
>>
>> The hardware spec from
>> http://www.ti.com/lit/ug/sprugr9h/sprugr9h.pdf
>>
>> The other SW data portion of the descriptor exists after all of the
>> defined words and is reserved for use by the host software to store
>> completely private data. This region is not used in any way by the DMA
>> or queue manager modules in a Multicore Navigator system and these
>> modules will not modify any bytes within this region.
>
>Right, so comment that the hardware doesn't look at the fields.
>But name/type the structure fields to indicate what they contain.
>Maybe sw_buf_len etc - but I suspect there are much more meaningful names.

The descriptors are usable by different drivers, one driver may use it as 
buf ptr/ len, other for something else. So they should remain as generic
and it is up to individual drivers to use it in whatever way it requires.
My suggestion is to rename pad field in struct knav_dma_desc to sw_data
to avoid confusion. i.e 

+   __le32  pad[4];

to 

+   __le32  sw_data[4];

Murali
>
>   David

Re: [PATCH iproute2] ip route: add mpls multipath support

2016-02-09 Thread Stephen Hemminger

On Sun,  7 Feb 2016 16:28:16 -0800
Roopa Prabhu  wrote:

> From: Roopa Prabhu 
> 
> This patch adds support to add mpls multipath
> routes.
> 
> example:
> ip -f mpls route add 100 \
>   nexthop as 200 via inet 10.1.1.2 dev swp1 \
>   nexthop as 700 via inet 10.1.1.6 dev swp2
> 
> Signed-off-by: Roopa Prabhu 

Applied

Re: [PATCH iproute2] iplink: bond_slave: fix ad_actor/partner_oper_port_state output

2016-02-09 Thread Stephen Hemminger

On Mon,  8 Feb 2016 17:13:58 +0100
Nikolay Aleksandrov  wrote:

> From: Nikolay Aleksandrov 
> 
> It seems that I've made a mistake when I exported these, instead of a
> space in the end I've put a newline character which is wrong and breaks
> the single line output.
> 
> Fixes: 7d6bc3b87abad ("bonding: export 3ad actor and partner port state")
> Reported-by: Sam Tannous 
> Signed-off-by: Nikolay Aleksandrov 

Applied

Re: [PATCH iproute2 v2 01/21] iplink: bridge: export bridge_id and designated_root

2016-02-09 Thread Stephen Hemminger

On Tue,  9 Feb 2016 00:14:19 +0100
Nikolay Aleksandrov  wrote:

> From: Nikolay Aleksandrov 
> 
> Netlink returns the bridge_id and designated_root, we just need to
> make them visible.
> 
> Signed-off-by: Nikolay Aleksandrov 

Series applied.

Re: [Intel-wired-lan] [next] igb: allow setting MAC address on i211 using a device tree blob

2016-02-09 Thread Shannon Nelson

It seems to me this should be using eth_platform_get_mac_address(), a
slightly more generic method to do this.  See the i40e driver for an
example, commit d9a84324e6 I believe.

sln

On Tue, Feb 9, 2016 at 3:59 AM, Andrew Lunn  wrote:
>> > +   dn = of_find_compatible_node(NULL, NULL, "intel,i211");
>
> Humm, NULL, NULL. That means find the first node anywhere in the
> device tree which matches. This is not going to work too well when you
> have multiple i211s.
>
> There is a way to specify a DT node is attached to a specific PCIe
> bus/slot. I think you should search only there, so solving the
> multiple device issue.
>
>  Andrew



-- 
==

Mr. Shannon NelsonNetwork Division, Intel Corp.

shannon.nel...@intel.comI don't speak for Intel

 Parents can't afford to be squeamish

Re: [PATCH net v2 2/3] geneve: Relax MTU constraints

2016-02-09 Thread Sergei Shtylyov


On 02/09/2016 07:47 PM, David Wragg wrote:


Allow the MTU of geneve devices to be set to large values, in order to
exploit underlying networks with larger frame sizes.

GENEVE does not have a fixed encapsulation overhead (an openvswitch
rule can add variable length options), so there is no relevant maximum
MTU to enforce.  A maximum of IP_MAX_MTU is used instead.
Encapsulated packets that are too big for the underlying network will
get dropped on the floor.

Signed-off-by: David Wragg 
---
  drivers/net/geneve.c | 12 +++-
  1 file changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/geneve.c b/drivers/net/geneve.c
index 0b14ac3..05cef11 100644
--- a/drivers/net/geneve.c
+++ b/drivers/net/geneve.c
@@ -1039,6 +1039,16 @@ static netdev_tx_t geneve_xmit(struct sk_buff *skb, 
struct net_device *dev)
return geneve_xmit_skb(skb, dev, info);
  }

+static int geneve_change_mtu(struct net_device *dev, int new_mtu)
+{
+   /* GENEVE overhead is not fixed, so we can't enforce a more
+  precise max MTU. */


   The networking code formats comments:

/* Like
 * this.
 */


+   if (new_mtu < 68 || new_mtu > IP_MAX_MTU)
+   return -EINVAL;
+   dev->mtu = new_mtu;
+   return 0;
+}
+
  static int geneve_fill_metadata_dst(struct net_device *dev, struct sk_buff 
*skb)
  {
struct ip_tunnel_info *info = skb_tunnel_info(skb);

[...]

MBR, Sergei

Re: [PATCH net v2 2/3] geneve: Relax MTU constraints

2016-02-09 Thread David Wragg

Sergei Shtylyov  writes:
>The networking code formats comments:
>
> /* Like
>  * this.
>  */

Thanks.  And I noticed another silly mistake.  Will respin.

David

[PATCH] dmascc: Return correct error codes

2016-02-09 Thread Amitoj Kaur Chawla

This change has been made with the goal that kernel functions should
return something more descriptive than -1 on failure.

A variable `err` has been introduced for storing error codes.

When kzalloc fails, the function should return -ENOMEM and not
-1. This was found using Coccinelle. A simplified version of
the semantic patch used is:

//
@@
expression *e;
identifier l1;
@@

e = kzalloc(...);
if (e == NULL) {
...
goto l1;
}
l1:
...
return -1
+ -ENOMEM
;
//
---
Not sure if -ENODEV is the right error code for probe_irq_off(irqs),
since there are few other examples with this function.

 drivers/net/hamradio/dmascc.c | 13 ++---
 1 file changed, 10 insertions(+), 3 deletions(-)

diff --git a/drivers/net/hamradio/dmascc.c b/drivers/net/hamradio/dmascc.c
index c3d3777..e4137c1 100644
--- a/drivers/net/hamradio/dmascc.c
+++ b/drivers/net/hamradio/dmascc.c
@@ -451,7 +451,7 @@ static const struct net_device_ops scc_netdev_ops = {
 
 static int __init setup_adapter(int card_base, int type, int n)
 {
-   int i, irq, chip;
+   int i, irq, chip, err;
struct scc_info *info;
struct net_device *dev;
struct scc_priv *priv;
@@ -463,14 +463,17 @@ static int __init setup_adapter(int card_base, int type, 
int n)
 
/* Initialize what is necessary for write_scc and write_scc_data */
info = kzalloc(sizeof(struct scc_info), GFP_KERNEL | GFP_DMA);
-   if (!info)
+   if (!info) {
+   err = -ENOMEM;
goto out;
+   }
 
info->dev[0] = alloc_netdev(0, "", NET_NAME_UNKNOWN, dev_setup);
if (!info->dev[0]) {
printk(KERN_ERR "dmascc: "
   "could not allocate memory for %s at %#3x\n",
   hw[type].name, card_base);
+   err = -ENOMEM;
goto out1;
}
 
@@ -479,6 +482,7 @@ static int __init setup_adapter(int card_base, int type, 
int n)
printk(KERN_ERR "dmascc: "
   "could not allocate memory for %s at %#3x\n",
   hw[type].name, card_base);
+   err = -ENOMEM;
goto out2;
}
spin_lock_init(&info->register_lock);
@@ -549,6 +553,7 @@ static int __init setup_adapter(int card_base, int type, 
int n)
printk(KERN_ERR
   "dmascc: could not find irq of %s at %#3x (irq=%d)\n",
   hw[type].name, card_base, irq);
+   err = -ENODEV;
goto out3;
}
 
@@ -585,11 +590,13 @@ static int __init setup_adapter(int card_base, int type, 
int n)
if (register_netdev(info->dev[0])) {
printk(KERN_ERR "dmascc: could not register %s\n",
   info->dev[0]->name);
+   err = -ENODEV;
goto out3;
}
if (register_netdev(info->dev[1])) {
printk(KERN_ERR "dmascc: could not register %s\n",
   info->dev[1]->name);
+   err = -ENODEV;
goto out4;
}
 
@@ -612,7 +619,7 @@ static int __init setup_adapter(int card_base, int type, 
int n)
   out1:
kfree(info);
   out:
-   return -1;
+   return err;
 }
 
 
-- 
1.9.1

Re: [PATCH iproute2 2/2] tipc: add peer remove functionality

2016-02-09 Thread Stephen Hemminger

I ended up reverting this patch from iproute2 because the code in kernel
was not accepted upstream.

After it is upstream in the kernel, please resubmit.

[PATCH net-next iproute2] iplink: display rx nohandler stats

2016-02-09 Thread Stephen Hemminger

Support for the new rx_nohandler statistic.
This code is designed to handle the case where the kernel reported statistic
structure is smaller than the larger structure in later releases (and vice 
versa).

Signed-off-by: Stephen Hemminger 
---
 ip/ipaddress.c | 35 ++-
 1 file changed, 26 insertions(+), 9 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 9d254d2..c4a8fc3 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -481,7 +481,8 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
/* RX error stats */
if (show_stats > 1) {
fprintf(fp, "%s", _SL_);
-   fprintf(fp, "RX errors: length   crc frame   fifo
missed%s", _SL_);
+   fprintf(fp, "RX errors: length   crc frame   fifo
missed%s%s",
+   s->rx_nohandler ? "   nohandler" : "",  _SL_);
 
fprintf(fp, "   ");
print_num(fp, 8, s->rx_length_errors);
@@ -489,6 +490,9 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
print_num(fp, 7, s->rx_frame_errors);
print_num(fp, 7, s->rx_fifo_errors);
print_num(fp, 7, s->rx_missed_errors);
+   if (s->rx_nohandler)
+   print_num(fp, 7, s->rx_nohandler);
+
}
fprintf(fp, "%s", _SL_);
 
@@ -496,7 +500,6 @@ static void print_link_stats64(FILE *fp, const struct 
rtnl_link_stats64 *s,
fprintf(fp, "TX: bytes  packets  errors  dropped carrier collsns 
%s%s",
s->tx_compressed ? "compressed" : "", _SL_);
 
-
fprintf(fp, "");
print_num(fp, 10, s->tx_bytes);
print_num(fp, 8, s->tx_packets);
@@ -546,13 +549,16 @@ static void print_link_stats32(FILE *fp, const struct 
rtnl_link_stats *s,
/* RX error stats */
if (show_stats > 1) {
fprintf(fp, "%s", _SL_);
-   fprintf(fp, "RX errors: length   crc frame   fifo
missed%s", _SL_);
+   fprintf(fp, "RX errors: length   crc frame   fifo
missed%s%s",
+   s->rx_nohandler ? "   nohandler" : "",  _SL_);
fprintf(fp, "   ");
print_num(fp, 8, s->rx_length_errors);
print_num(fp, 7, s->rx_crc_errors);
print_num(fp, 7, s->rx_frame_errors);
print_num(fp, 7, s->rx_fifo_errors);
print_num(fp, 7, s->rx_missed_errors);
+   if (s->rx_nohandler)
+   print_num(fp, 7, s->rx_nohandler);
}
fprintf(fp, "%s", _SL_);
 
@@ -590,12 +596,23 @@ static void print_link_stats32(FILE *fp, const struct 
rtnl_link_stats *s,
 
 static void __print_link_stats(FILE *fp, struct rtattr **tb)
 {
-   if (tb[IFLA_STATS64])
-   print_link_stats64(fp, RTA_DATA(tb[IFLA_STATS64]),
-   tb[IFLA_CARRIER_CHANGES]);
-   else if (tb[IFLA_STATS])
-   print_link_stats32(fp, RTA_DATA(tb[IFLA_STATS]),
-   tb[IFLA_CARRIER_CHANGES]);
+   const struct rtattr *carrier_changes = tb[IFLA_CARRIER_CHANGES];
+
+   if (tb[IFLA_STATS64]) {
+   struct rtnl_link_stats64 stats = { 0 };
+
+   memcpy(&stats, RTA_DATA(tb[IFLA_STATS64]),
+  MIN(RTA_PAYLOAD(tb[IFLA_STATS64]), sizeof(stats)));
+
+   print_link_stats64(fp, &stats, carrier_changes);
+   } else if (tb[IFLA_STATS]) {
+   struct rtnl_link_stats stats = { 0 };
+
+   memcpy(&stats, RTA_DATA(tb[IFLA_STATS]),
+  MIN(RTA_PAYLOAD(tb[IFLA_STATS]), sizeof(stats)));
+
+   print_link_stats32(fp, &stats, carrier_changes);
+   }
 }
 
 static void print_link_stats(FILE *fp, struct nlmsghdr *n)
-- 
2.1.4

Re: [PATCH] net: ti: netcp: restore get/set_pad_info() functionality

2016-02-09 Thread Arnd Bergmann

On Tuesday 09 February 2016 16:55:42 Karicheri, Muralidharan wrote:
> 
> The descriptors are usable by different drivers, one driver may use it as 
> buf ptr/ len, other for something else. So they should remain as generic
> and it is up to individual drivers to use it in whatever way it requires.
> My suggestion is to rename pad field in struct knav_dma_desc to sw_data
> to avoid confusion. i.e 
> 
> +   __le32  pad[4];
> 
> to 
> 
> +   __le32  sw_data[4];
> 

If the hardware doesn't access them, they can probably just be u32
and not do any byte swapping.

Arnd

[PATCH v3 0/6] Add support for MICREL KSZ8795CLX 5-port switch

2016-02-09 Thread Helmut Buchsbaum

This patch series refactors the spi-ks8995 driver to finally add support
for the MICREL KSZ8795CLX. Additionally support for controlling a GPIO
line for resetting the switch is added.

Helmut

Changes since v2:
 - use GPIO_ACTIVE_LOW according to Andrew's remark.
 - use ePAPR compliant node name in example, thanks to Sergei for
   pointing out
Changes since v1:
 - removed initializing registers from Device Tree following Florian's
   advice
 - fixed GPIO handling for reset according to Andrew's remark.

Helmut Buchsbaum (6):
  net: phy: spi_ks8995: introduce spi_device_id table
  net: phy: spi_ks8995: verify chip and determine revision
  net: phy: spi_ks8995: add support for resetting switch using GPIO
  net: phy: spi_ks8995: generalize creation of SPI commands
  net: phy: spi_ks8995: add support for MICREL KSZ8795CLX
  dt-bindings: net: ks8995: add bindings documentation for ks8995

 .../devicetree/bindings/net/micrel-ks8995.txt  |  20 ++
 drivers/net/phy/spi_ks8995.c   | 304 +
 2 files changed, 265 insertions(+), 59 deletions(-)
 create mode 100644 Documentation/devicetree/bindings/net/micrel-ks8995.txt

-- 
2.1.4

[PATCH v3 1/6] net: phy: spi_ks8995: introduce spi_device_id table

2016-02-09 Thread Helmut Buchsbaum

Refactor to use spi_device_id table to facilitate easy
extendability.

Signed-off-by: Helmut Buchsbaum 
---
 drivers/net/phy/spi_ks8995.c | 42 --
 1 file changed, 40 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c
index f091d69..e848ad9 100644
--- a/drivers/net/phy/spi_ks8995.c
+++ b/drivers/net/phy/spi_ks8995.c
@@ -89,6 +89,28 @@
 
 #define KS8995_RESET_DELAY 10 /* usec */
 
+enum ks8995_chip_variant {
+   ks8995,
+   ksz8864,
+   max_variant
+};
+
+struct ks8995_chip_params {
+   char *name;
+   int regs_size;
+};
+
+static const struct ks8995_chip_params ks8995_chip[] = {
+   [ks8995] = {
+   .name = "KS8995MA",
+   .regs_size = KS8995_REGS_SIZE,
+   },
+   [ksz8864] = {
+   .name = "KSZ8864RMN",
+   .regs_size = KSZ8864_REGS_SIZE,
+   },
+};
+
 struct ks8995_pdata {
/* not yet implemented */
 };
@@ -98,8 +120,16 @@ struct ks8995_switch {
struct mutexlock;
struct ks8995_pdata *pdata;
struct bin_attributeregs_attr;
+   const struct ks8995_chip_params *chip;
 };
 
+static const struct spi_device_id ks8995_id[] = {
+   {"ks8995", ks8995},
+   {"ksz8864", ksz8864},
+   { }
+};
+MODULE_DEVICE_TABLE(spi, ks8995_id);
+
 static inline u8 get_chip_id(u8 val)
 {
return (val >> ID1_CHIPID_S) & ID1_CHIPID_M;
@@ -244,17 +274,22 @@ static const struct bin_attribute ks8995_registers_attr = 
{
 };
 
 /*  */
-
 static int ks8995_probe(struct spi_device *spi)
 {
struct ks8995_switch*ks;
struct ks8995_pdata *pdata;
u8  ids[2];
int err;
+   int variant = spi_get_device_id(spi)->driver_data;
 
/* Chip description */
pdata = spi->dev.platform_data;
 
+   if (variant >= max_variant) {
+   dev_err(&spi->dev, "bad chip variant %d\n", variant);
+   return -ENODEV;
+   }
+
ks = devm_kzalloc(&spi->dev, sizeof(*ks), GFP_KERNEL);
if (!ks)
return -ENOMEM;
@@ -262,6 +297,8 @@ static int ks8995_probe(struct spi_device *spi)
mutex_init(&ks->lock);
ks->pdata = pdata;
ks->spi = spi_dev_get(spi);
+   ks->chip = &ks8995_chip[variant];
+
spi_set_drvdata(spi, ks);
 
spi->mode = SPI_MODE_0;
@@ -287,6 +324,7 @@ static int ks8995_probe(struct spi_device *spi)
return -ENODEV;
}
 
+   ks->regs_attr.size = ks->chip->regs_size;
memcpy(&ks->regs_attr, &ks8995_registers_attr, sizeof(ks->regs_attr));
if (get_chip_id(ids[1]) != CHIPID_M) {
u8 val;
@@ -303,7 +341,6 @@ static int ks8995_probe(struct spi_device *spi)
dev_err(&spi->dev, "unknown chip:%02x,0\n", ids[1]);
return err;
}
-   ks->regs_attr.size = KSZ8864_REGS_SIZE;
}
 
err = ks8995_reset(ks);
@@ -347,6 +384,7 @@ static struct spi_driver ks8995_driver = {
},
.probe= ks8995_probe,
.remove   = ks8995_remove,
+   .id_table = ks8995_id,
 };
 
 module_spi_driver(ks8995_driver);
-- 
2.1.4

[PATCH v3 4/6] net: phy: spi_ks8995: generalize creation of SPI commands

2016-02-09 Thread Helmut Buchsbaum

Prepare creating SPI reads and writes for other switch families.
The KS8995 family uses the straightforward
<8bit CMD><8bit ADDR>
sequence.
To be able to support KSZ8795 family, which uses
<3bit CMD><12bit ADDR><1 bit TR>
make the SPI command creation chip variant dependent.

Signed-off-by: Helmut Buchsbaum 
---
 drivers/net/phy/spi_ks8995.c | 46 +---
 1 file changed, 35 insertions(+), 11 deletions(-)

diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c
index 04d468f..f866786 100644
--- a/drivers/net/phy/spi_ks8995.c
+++ b/drivers/net/phy/spi_ks8995.c
@@ -105,6 +105,8 @@ struct ks8995_chip_params {
int family_id;
int chip_id;
int regs_size;
+   int addr_width;
+   int addr_shift;
 };
 
 static const struct ks8995_chip_params ks8995_chip[] = {
@@ -113,12 +115,16 @@ static const struct ks8995_chip_params ks8995_chip[] = {
.family_id = FAMILY_KS8995,
.chip_id = KS8995_CHIP_ID,
.regs_size = KS8995_REGS_SIZE,
+   .addr_width = 8,
+   .addr_shift = 0,
},
[ksz8864] = {
.name = "KSZ8864RMN",
.family_id = FAMILY_KS8995,
.chip_id = KSZ8864_CHIP_ID,
.regs_size = KSZ8864_REGS_SIZE,
+   .addr_width = 8,
+   .addr_shift = 0,
},
 };
 
@@ -153,20 +159,44 @@ static inline u8 get_chip_rev(u8 val)
return (val >> ID1_REVISION_S) & ID1_REVISION_M;
 }
 
+/* create_spi_cmd - create a chip specific SPI command header
+ * @ks: pointer to switch instance
+ * @cmd: SPI command for switch
+ * @address: register address for command
+ *
+ * Different chip families use different bit pattern to address the switches
+ * registers:
+ *
+ * KS8995: 8bit command + 8bit address
+ * KSZ8795: 3bit command + 12bit address + 1bit TR (?)
+ */
+static inline __be16 create_spi_cmd(struct ks8995_switch *ks, int cmd,
+   unsigned address)
+{
+   u16 result = cmd;
+
+   /* make room for address (incl. address shift) */
+   result <<= ks->chip->addr_width + ks->chip->addr_shift;
+   /* add address */
+   result |= address << ks->chip->addr_shift;
+   /* SPI protocol needs big endian */
+   return cpu_to_be16(result);
+}
 /*  */
 static int ks8995_read(struct ks8995_switch *ks, char *buf,
 unsigned offset, size_t count)
 {
-   u8 cmd[2];
+   __be16 cmd;
struct spi_transfer t[2];
struct spi_message m;
int err;
 
+   cmd = create_spi_cmd(ks, KS8995_CMD_READ, offset);
spi_message_init(&m);
 
memset(&t, 0, sizeof(t));
 
-   t[0].tx_buf = cmd;
+   t[0].tx_buf = &cmd;
t[0].len = sizeof(cmd);
spi_message_add_tail(&t[0], &m);
 
@@ -174,9 +204,6 @@ static int ks8995_read(struct ks8995_switch *ks, char *buf,
t[1].len = count;
spi_message_add_tail(&t[1], &m);
 
-   cmd[0] = KS8995_CMD_READ;
-   cmd[1] = offset;
-
mutex_lock(&ks->lock);
err = spi_sync(ks->spi, &m);
mutex_unlock(&ks->lock);
@@ -184,20 +211,20 @@ static int ks8995_read(struct ks8995_switch *ks, char 
*buf,
return err ? err : count;
 }
 
-
 static int ks8995_write(struct ks8995_switch *ks, char *buf,
 unsigned offset, size_t count)
 {
-   u8 cmd[2];
+   __be16 cmd;
struct spi_transfer t[2];
struct spi_message m;
int err;
 
+   cmd = create_spi_cmd(ks, KS8995_CMD_WRITE, offset);
spi_message_init(&m);
 
memset(&t, 0, sizeof(t));
 
-   t[0].tx_buf = cmd;
+   t[0].tx_buf = &cmd;
t[0].len = sizeof(cmd);
spi_message_add_tail(&t[0], &m);
 
@@ -205,9 +232,6 @@ static int ks8995_write(struct ks8995_switch *ks, char *buf,
t[1].len = count;
spi_message_add_tail(&t[1], &m);
 
-   cmd[0] = KS8995_CMD_WRITE;
-   cmd[1] = offset;
-
mutex_lock(&ks->lock);
err = spi_sync(ks->spi, &m);
mutex_unlock(&ks->lock);
-- 
2.1.4

[PATCH v3 6/6] dt-bindings: net: ks8995: add bindings documentation for ks8995

2016-02-09 Thread Helmut Buchsbaum

Signed-off-by: Helmut Buchsbaum 
---
 .../devicetree/bindings/net/micrel-ks8995.txt| 20 
 1 file changed, 20 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/micrel-ks8995.txt

diff --git a/Documentation/devicetree/bindings/net/micrel-ks8995.txt 
b/Documentation/devicetree/bindings/net/micrel-ks8995.txt
new file mode 100644
index 000..7f11ca6
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/micrel-ks8995.txt
@@ -0,0 +1,20 @@
+Micrel KS8995 SPI controlled Ethernet Switch families
+
+Required properties (according to spi-bus.txt):
+- compatible: either "micrel,ks8995", "micrel,ksz8864" or "micrel,ksz8795"
+
+Optional properties:
+- reset-gpios : phandle of gpio that will be used to reset chip during probe
+
+Example:
+
+spi-master {
+   ...
+   switch@0 {
+   compatible = "micrel,ksz8795";
+
+   reg = <0>;
+   spi-max-frequency = <5000>;
+   reset-gpios = <&gpio0 46 GPIO_ACTIVE_LOW>;
+   };
+};
-- 
2.1.4

[PATCH v3 5/6] net: phy: spi_ks8995: add support for MICREL KSZ8795CLX

2016-02-09 Thread Helmut Buchsbaum

Add support for MICREL KSZ8795CLX Integrated 5-Port, 10-/100-Managed
Ethernet Switch with Gigabit GMII/RGMII and MII/RMII interfaces.

Signed-off-by: Helmut Buchsbaum 
---
 drivers/net/phy/spi_ks8995.c | 29 +
 1 file changed, 29 insertions(+)

diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c
index f866786..c2d6c23 100644
--- a/drivers/net/phy/spi_ks8995.c
+++ b/drivers/net/phy/spi_ks8995.c
@@ -77,6 +77,7 @@
 
 #define KS8995_REGS_SIZE   0x80
 #define KSZ8864_REGS_SIZE  0x100
+#define KSZ8795_REGS_SIZE  0x100
 
 #define ID1_CHIPID_M   0xf
 #define ID1_CHIPID_S   4
@@ -85,9 +86,11 @@
 #define ID1_START_SW   1   /* start the switch */
 
 #define FAMILY_KS8995  0x95
+#define FAMILY_KSZ8795 0x87
 #define CHIPID_M   0
 #define KS8995_CHIP_ID 0x00
 #define KSZ8864_CHIP_ID0x01
+#define KSZ8795_CHIP_ID0x09
 
 #define KS8995_CMD_WRITE   0x02U
 #define KS8995_CMD_READ0x03U
@@ -97,6 +100,7 @@
 enum ks8995_chip_variant {
ks8995,
ksz8864,
+   ksz8795,
max_variant
 };
 
@@ -126,6 +130,14 @@ static const struct ks8995_chip_params ks8995_chip[] = {
.addr_width = 8,
.addr_shift = 0,
},
+   [ksz8795] = {
+   .name = "KSZ8795CLX",
+   .family_id = FAMILY_KSZ8795,
+   .chip_id = KSZ8795_CHIP_ID,
+   .regs_size = KSZ8795_REGS_SIZE,
+   .addr_width = 12,
+   .addr_shift = 1,
+   },
 };
 
 struct ks8995_pdata {
@@ -145,6 +157,7 @@ struct ks8995_switch {
 static const struct spi_device_id ks8995_id[] = {
{"ks8995", ks8995},
{"ksz8864", ksz8864},
+   {"ksz8795", ksz8795},
{ }
 };
 MODULE_DEVICE_TABLE(spi, ks8995_id);
@@ -358,6 +371,22 @@ static int ks8995_get_revision(struct ks8995_switch *ks)
err = -ENODEV;
}
break;
+   case FAMILY_KSZ8795:
+   /* try reading chip id at CHIP ID1 */
+   err = ks8995_read_reg(ks, KS8995_REG_ID1, &id1);
+   if (err) {
+   err = -EIO;
+   goto err_out;
+   }
+
+   if (get_chip_id(id1) == ks->chip->chip_id) {
+   ks->revision_id = get_chip_rev(id1);
+   } else {
+   dev_err(&ks->spi->dev, "unsupported chip id for KSZ8795 
family: 0x%02x\n",
+   id1);
+   err = -ENODEV;
+   }
+   break;
default:
dev_err(&ks->spi->dev, "unsupported family id: 0x%02x\n", id0);
err = -ENODEV;
-- 
2.1.4

[PATCH v3 2/6] net: phy: spi_ks8995: verify chip and determine revision

2016-02-09 Thread Helmut Buchsbaum

Since the chip variant is now determined by spi_device_id, verify
family and chip id and determine the revision id.

Signed-off-by: Helmut Buchsbaum 
---
 drivers/net/phy/spi_ks8995.c | 118 +--
 1 file changed, 80 insertions(+), 38 deletions(-)

diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c
index e848ad9..2803c8e 100644
--- a/drivers/net/phy/spi_ks8995.c
+++ b/drivers/net/phy/spi_ks8995.c
@@ -83,6 +83,8 @@
 
 #define FAMILY_KS8995  0x95
 #define CHIPID_M   0
+#define KS8995_CHIP_ID 0x00
+#define KSZ8864_CHIP_ID0x01
 
 #define KS8995_CMD_WRITE   0x02U
 #define KS8995_CMD_READ0x03U
@@ -97,16 +99,22 @@ enum ks8995_chip_variant {
 
 struct ks8995_chip_params {
char *name;
+   int family_id;
+   int chip_id;
int regs_size;
 };
 
 static const struct ks8995_chip_params ks8995_chip[] = {
[ks8995] = {
.name = "KS8995MA",
+   .family_id = FAMILY_KS8995,
+   .chip_id = KS8995_CHIP_ID,
.regs_size = KS8995_REGS_SIZE,
},
[ksz8864] = {
.name = "KSZ8864RMN",
+   .family_id = FAMILY_KS8995,
+   .chip_id = KSZ8864_CHIP_ID,
.regs_size = KSZ8864_REGS_SIZE,
},
 };
@@ -121,6 +129,7 @@ struct ks8995_switch {
struct ks8995_pdata *pdata;
struct bin_attributeregs_attr;
const struct ks8995_chip_params *chip;
+   int revision_id;
 };
 
 static const struct spi_device_id ks8995_id[] = {
@@ -263,6 +272,73 @@ static ssize_t ks8995_registers_write(struct file *filp, 
struct kobject *kobj,
return ks8995_write(ks8995, buf, off, count);
 }
 
+/* ks8995_get_revision - get chip revision
+ * @ks: pointer to switch instance
+ *
+ * Verify chip family and id and get chip revision.
+ */
+static int ks8995_get_revision(struct ks8995_switch *ks)
+{
+   int err;
+   u8 id0, id1, ksz8864_id;
+
+   /* read family id */
+   err = ks8995_read_reg(ks, KS8995_REG_ID0, &id0);
+   if (err) {
+   err = -EIO;
+   goto err_out;
+   }
+
+   /* verify family id */
+   if (id0 != ks->chip->family_id) {
+   dev_err(&ks->spi->dev, "chip family id mismatch: expected 
0x%02x but 0x%02x read\n",
+   ks->chip->family_id, id0);
+   err = -ENODEV;
+   goto err_out;
+   }
+
+   switch (ks->chip->family_id) {
+   case FAMILY_KS8995:
+   /* try reading chip id at CHIP ID1 */
+   err = ks8995_read_reg(ks, KS8995_REG_ID1, &id1);
+   if (err) {
+   err = -EIO;
+   goto err_out;
+   }
+
+   /* verify chip id */
+   if ((get_chip_id(id1) == CHIPID_M) &&
+   (get_chip_id(id1) == ks->chip->chip_id)) {
+   /* KS8995MA */
+   ks->revision_id = get_chip_rev(id1);
+   } else if (get_chip_id(id1) != CHIPID_M) {
+   /* KSZ8864RMN */
+   err = ks8995_read_reg(ks, KS8995_REG_ID1, &ksz8864_id);
+   if (err) {
+   err = -EIO;
+   goto err_out;
+   }
+
+   if ((ksz8864_id & 0x80) &&
+   (ks->chip->chip_id == KSZ8864_CHIP_ID)) {
+   ks->revision_id = get_chip_rev(id1);
+   }
+
+   } else {
+   dev_err(&ks->spi->dev, "unsupported chip id for KS8995 
family: 0x%02x\n",
+   id1);
+   err = -ENODEV;
+   }
+   break;
+   default:
+   dev_err(&ks->spi->dev, "unsupported family id: 0x%02x\n", id0);
+   err = -ENODEV;
+   break;
+   }
+err_out:
+   return err;
+}
+
 static const struct bin_attribute ks8995_registers_attr = {
.attr = {
.name   = "registers",
@@ -278,7 +354,6 @@ static int ks8995_probe(struct spi_device *spi)
 {
struct ks8995_switch*ks;
struct ks8995_pdata *pdata;
-   u8  ids[2];
int err;
int variant = spi_get_device_id(spi)->driver_data;
 
@@ -309,39 +384,12 @@ static int ks8995_probe(struct spi_device *spi)
return err;
}
 
-   err = ks8995_read(ks, ids, KS8995_REG_ID0, sizeof(ids));
-   if (err < 0) {
-   dev_err(&spi->dev, "unable to read id registers, err=%d\n",
-   err);
+   err = ks8995_get_revision(ks);
+   if (err)
return err;
-   }
-
-   switch (ids[0]) {
-   case FAMILY_KS8995:
-   break;
-   default:
-   dev_err(&spi->dev, "unknown family

[PATCH v3 3/6] net: phy: spi_ks8995: add support for resetting switch using GPIO

2016-02-09 Thread Helmut Buchsbaum

When using device tree it is no longer possible to reset the PHY at board
level. Furthermore, doing it in the driver allows powering down the switch
when it is no longer in use.

The patch introduces a new optional property "reset-gpios" denoting an
appropriate GPIO handle, e.g.:

reset-gpios = <&gpio0 46 GPIO_ACTIVE_LOW>

Signed-off-by: Helmut Buchsbaum 
---
 drivers/net/phy/spi_ks8995.c | 71 ++--
 1 file changed, 62 insertions(+), 9 deletions(-)

diff --git a/drivers/net/phy/spi_ks8995.c b/drivers/net/phy/spi_ks8995.c
index 2803c8e..04d468f 100644
--- a/drivers/net/phy/spi_ks8995.c
+++ b/drivers/net/phy/spi_ks8995.c
@@ -18,6 +18,9 @@
 #include 
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include 
 
@@ -120,7 +123,8 @@ static const struct ks8995_chip_params ks8995_chip[] = {
 };
 
 struct ks8995_pdata {
-   /* not yet implemented */
+   int reset_gpio;
+   enum of_gpio_flags reset_gpio_flags;
 };
 
 struct ks8995_switch {
@@ -339,6 +343,24 @@ err_out:
return err;
 }
 
+/* ks8995_parse_dt - setup platform data from devicetree
+ * @ks: pointer to switch instance
+ *
+ * Parses supported DT properties and sets up platform data
+ * accordingly.
+ */
+static void ks8995_parse_dt(struct ks8995_switch *ks)
+{
+   struct device_node *np = ks->spi->dev.of_node;
+   struct ks8995_pdata *pdata = ks->pdata;
+
+   if (!np)
+   return;
+
+   pdata->reset_gpio = of_get_named_gpio_flags(np, "reset-gpios", 0,
+   &pdata->reset_gpio_flags);
+}
+
 static const struct bin_attribute ks8995_registers_attr = {
.attr = {
.name   = "registers",
@@ -352,14 +374,10 @@ static const struct bin_attribute ks8995_registers_attr = 
{
 /*  */
 static int ks8995_probe(struct spi_device *spi)
 {
-   struct ks8995_switch*ks;
-   struct ks8995_pdata *pdata;
-   int err;
+   struct ks8995_switch *ks;
+   int err;
int variant = spi_get_device_id(spi)->driver_data;
 
-   /* Chip description */
-   pdata = spi->dev.platform_data;
-
if (variant >= max_variant) {
dev_err(&spi->dev, "bad chip variant %d\n", variant);
return -ENODEV;
@@ -370,10 +388,42 @@ static int ks8995_probe(struct spi_device *spi)
return -ENOMEM;
 
mutex_init(&ks->lock);
-   ks->pdata = pdata;
ks->spi = spi_dev_get(spi);
ks->chip = &ks8995_chip[variant];
 
+   if (ks->spi->dev.of_node) {
+   ks->pdata = devm_kzalloc(&spi->dev, sizeof(*ks->pdata),
+GFP_KERNEL);
+   if (!ks->pdata)
+   return -ENOMEM;
+
+   ks->pdata->reset_gpio = -1;
+
+   ks8995_parse_dt(ks);
+   }
+
+   if (!ks->pdata)
+   ks->pdata = spi->dev.platform_data;
+
+   /* de-assert switch reset */
+   if (ks->pdata && gpio_is_valid(ks->pdata->reset_gpio)) {
+   unsigned long flags;
+
+   flags = (ks->pdata->reset_gpio_flags == OF_GPIO_ACTIVE_LOW ?
+GPIOF_ACTIVE_LOW : 0);
+
+   err = devm_gpio_request_one(&spi->dev,
+   ks->pdata->reset_gpio,
+   flags, "switch-reset");
+   if (err) {
+   dev_err(&spi->dev,
+   "failed to get reset-gpios: %d\n", err);
+   return -EIO;
+   }
+
+   gpiod_set_value(gpio_to_desc(ks->pdata->reset_gpio), 0);
+   }
+
spi_set_drvdata(spi, ks);
 
spi->mode = SPI_MODE_0;
@@ -414,11 +464,14 @@ static int ks8995_remove(struct spi_device *spi)
 
sysfs_remove_bin_file(&spi->dev.kobj, &ks->regs_attr);
 
+   /* assert reset */
+   if (ks->pdata && gpio_is_valid(ks->pdata->reset_gpio))
+   gpiod_set_value(gpio_to_desc(ks->pdata->reset_gpio), 1);
+
return 0;
 }
 
 /*  */
-
 static struct spi_driver ks8995_driver = {
.driver = {
.name   = "spi-ks8995",
-- 
2.1.4

[PATCH v2] net: fec: Add "phy-reset-active-low" property to DT

2016-02-09 Thread Bernhard Walle

We need this for custom hardware that requires the reversed reset
sequence.

Signed-off-by: Bernhard Walle 
---
Changes compared to v1:
 - Add documentation to 'phy-reset-gpios' that flags are ignored
   as suggested by Andrew Lunn.

 Documentation/devicetree/bindings/net/fsl-fec.txt | 7 ++-
 drivers/net/ethernet/freescale/fec_main.c | 8 ++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/fsl-fec.txt 
b/Documentation/devicetree/bindings/net/fsl-fec.txt
index a9eb611..0caa429 100644
--- a/Documentation/devicetree/bindings/net/fsl-fec.txt
+++ b/Documentation/devicetree/bindings/net/fsl-fec.txt
@@ -7,11 +7,16 @@ Required properties:
 - phy-mode : See ethernet.txt file in the same directory
 
 Optional properties:
-- phy-reset-gpios : Should specify the gpio for phy reset
+- phy-reset-gpios : Should specify the gpio for phy reset. Additional
+  flags are ignored, see the non-standard 'phy-reset-active-low' property
+  instead.
 - phy-reset-duration : Reset duration in milliseconds.  Should present
   only if property "phy-reset-gpios" is available.  Missing the property
   will have the duration be 1 millisecond.  Numbers greater than 1000 are
   invalid and 1 millisecond will be used instead.
+- phy-reset-active-low : If present then the reset sequence using the GPIO
+  specified in the "phy-reset-gpios" property is reversed (H=reset state,
+  L=operation state).
 - phy-supply : regulator that powers the Ethernet PHY.
 - phy-handle : phandle to the PHY device connected to this device.
 - fixed-link : Assume a fixed link. See fixed-link.txt in the same directory.
diff --git a/drivers/net/ethernet/freescale/fec_main.c 
b/drivers/net/ethernet/freescale/fec_main.c
index 41c81f6..98caf87 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -3229,6 +3229,7 @@ static int fec_enet_init(struct net_device *ndev)
 static void fec_reset_phy(struct platform_device *pdev)
 {
int err, phy_reset;
+   bool active_low = false;
int msec = 1;
struct device_node *np = pdev->dev.of_node;
 
@@ -3244,14 +3245,17 @@ static void fec_reset_phy(struct platform_device *pdev)
if (!gpio_is_valid(phy_reset))
return;
 
+   active_low = of_property_read_bool(np, "phy-reset-active-low");
+
err = devm_gpio_request_one(&pdev->dev, phy_reset,
-   GPIOF_OUT_INIT_LOW, "phy-reset");
+   active_low ? GPIOF_OUT_INIT_HIGH : GPIOF_OUT_INIT_LOW,
+   "phy-reset");
if (err) {
dev_err(&pdev->dev, "failed to get phy-reset-gpios: %d\n", err);
return;
}
msleep(msec);
-   gpio_set_value_cansleep(phy_reset, 1);
+   gpio_set_value_cansleep(phy_reset, !active_low);
 }
 #else /* CONFIG_OF */
 static void fec_reset_phy(struct platform_device *pdev)
-- 
2.7.1

[PATCH 4/5] net: sxgbe: fix error paths in sxgbe_platform_probe()

2016-02-09 Thread Rasmus Villemoes

We need to use post-decrement to ensure that irq_dispose_mapping is
also called on priv->rxq[0]->irq_no; moreover, if one of the above for
loops failed already at i==0 (so we reach one of these labels with
that value of i), we'll enter an essentially infinite loop of
out-of-bounds accesses.

Signed-off-by: Rasmus Villemoes 
---
 drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c 
b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
index b02eed12bfc5..73427e29df2a 100644
--- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
+++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
@@ -155,11 +155,11 @@ static int sxgbe_platform_probe(struct platform_device 
*pdev)
return 0;
 
 err_rx_irq_unmap:
-   while (--i)
+   while (i--)
irq_dispose_mapping(priv->rxq[i]->irq_no);
i = SXGBE_TX_QUEUES;
 err_tx_irq_unmap:
-   while (--i)
+   while (i--)
irq_dispose_mapping(priv->txq[i]->irq_no);
irq_dispose_mapping(priv->irq);
 err_drv_remove:
-- 
2.1.4

[PATCH 3/5] net/mlx4: fix some error handling in mlx4_multi_func_init()

2016-02-09 Thread Rasmus Villemoes

The while loop after err_slaves should use post-decrement; otherwise
we'll fail to do the kfrees for i==0, and will run into out-of-bounds
accesses if the setup above failed already at i==0.

The predecrement in the --port is ok, since ->vlan_filter is
(bizarrely) 1-indexed. But I'm changing 'if' to 'while' since it's a
bit ugly to rely on MLX4_MAX_PORTS being 2.

[I'm not sure why one even bothers populating the ->vlan_filter array:
mlx4.h isn't #included by anything outside
drivers/net/ethernet/mellanox/mlx4/, and "git grep -C2 -w vlan_filter
drivers/net/ethernet/mellanox/mlx4/" seems to suggest that the
vlan_filter elements aren't used at all.]

Signed-off-by: Rasmus Villemoes 
---
 drivers/net/ethernet/mellanox/mlx4/cmd.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/cmd.c 
b/drivers/net/ethernet/mellanox/mlx4/cmd.c
index d48d5793407d..bfe8234abbba 100644
--- a/drivers/net/ethernet/mellanox/mlx4/cmd.c
+++ b/drivers/net/ethernet/mellanox/mlx4/cmd.c
@@ -2369,7 +2369,7 @@ int mlx4_multi_func_init(struct mlx4_dev *dev)
kzalloc(sizeof(struct mlx4_vlan_fltr),
GFP_KERNEL);
if (!s_state->vlan_filter[port]) {
-   if (--port)
+   while (--port)

kfree(s_state->vlan_filter[port]);
goto err_slaves;
}
@@ -2429,7 +2429,7 @@ err_thread:
flush_workqueue(priv->mfunc.master.comm_wq);
destroy_workqueue(priv->mfunc.master.comm_wq);
 err_slaves:
-   while (--i) {
+   while (i--) {
for (port = 1; port <= MLX4_MAX_PORTS; port++)

kfree(priv->mfunc.master.slave_state[i].vlan_filter[port]);
}
-- 
2.1.4

[PATCH 0/5] pre-decrement in error paths considered harmful

2016-02-09 Thread Rasmus Villemoes

There are a few instances of

  for (i = 0; i < FOO; ++i) {
ret = do_stuff(i)
if (ret)
  goto err;
  }
  ...
  err:
  while (--i)
undo_stuff(i);

At best, this fails to undo_stuff for i==0, but if i==0 was the case
that failed, we'll end up with an "infinite" loop in the error path
doing nasty stuff.

These were found with a simple coccinelle script

@@
expression i;
identifier l;
statement S;
@@
* l:
* while (--i)
S

(and there were no false positives).

There are no dependencies between the patches; I just wanted to include
a common cover letter with a little background info.

Rasmus Villemoes (5):
  drm/gma500: fix error path in gma_intel_setup_gmbus()
  drm/i915: fix error path in intel_setup_gmbus()
  net/mlx4: fix some error handling in mlx4_multi_func_init()
  net: sxgbe: fix error paths in sxgbe_platform_probe()
  mm/backing-dev.c: fix error path in wb_init()

 drivers/gpu/drm/gma500/intel_gmbus.c| 2 +-
 drivers/gpu/drm/i915/intel_i2c.c| 2 +-
 drivers/net/ethernet/mellanox/mlx4/cmd.c| 4 ++--
 drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c | 4 ++--
 mm/backing-dev.c| 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

-- 
2.1.4

Mellanox ConnectX3 Pro and kernel 4.4 low throughput bug

2016-02-09 Thread John


I'm running into a bug with kernel 4.4.0 where a VM-VM test between two
different baremetal hosts (HP Proliant dl360gen9s) has receive-side
throughput that's about 25% lower than expected with a Mellanox
ConnectX3-pro NIC. The VMs are connected over a VXLAN tunnel that I used
OpenvSwitch 2.4.90 to set up on both hosts. When the mellanox NIC is the
endpoint of the vxlan tunnel and its VM receives a throughput test, the VM
gets about 6.65Gb/s throughput where other NICs get ~8.3Gb/s (8.04 for
niantic, 8.65 for broadcom). When I test the mellanox in a (patched)
3.14.57 kernel, I get 8.9Gb/s between VMs. I have traced the issue as far
as a TUN interface that 'plugs in' to openvswitch, which takes packets for
the VM. If I run tcpdump on this tun interface (called vnet0 in my case),
I get small tcp packets - they're all 1398 in length - when I do a VM-VM
test. I also see high CPU usage for the vhost kernel thread. If I run
ftrace during a throughput test and grep for the vhost thread (once done),
and wc -l the result, there is an order of magnitude more function calls
in this thread versus the same thing with the broadcom. If I do the same
test with a broadcom NIC as the endpoint for the vxlan tunnel, I get large
packets - the size varies but generally it's in the five digit range -
some are almost 65535. There are fewer calls in the vhost thread, as
mentioned above. This is also visible in top, the vhost kernel thread and
the libvirt+ process both have noticeably higher CPU usage.

I've tried doing a bisect of the kernel and figuring out where the change
occurred that allowed the broadcom NIC to perform GRO but not the
mellanox. I know that between 4.2 and 4.3 the tun device started to
perform GRO and this is where the difference in throughput started.
However, there's something between these two versions that breaks my setup
completely and I can't get any kind of traffic to or from the VM from
anywhere. I tried to draw a diagram here:

|-high CPU%
->[mlx4_en/core]>[vxlan]--->[openvswitch]--->[tun]>[vhost]--->VM
   |-small packets (1398)

|-low CPU%
->[bnx2x ]>[vxlan]--->[openvswitch]--->[tun]>[vhost]--->VM
   |-big packets (~65535)


NIC info:

root@hLinux-ovstest-1:/home/john# ethtool -i rename8
driver: mlx4_en
version: 2.2-1 (Feb 2014)
firmware-version: 2.34.5010
bus-info: :08:00.0
supports-statistics: yes
supports-test: yes
supports-eeprom-access: no
supports-register-dump: no
supports-priv-flags: yes

root@hLinux-ovstest-1:/home/john# ethtool -k rename8
Features for rename8:
rx-checksumming: on
tx-checksumming: on
tx-checksum-ipv4: on
tx-checksum-ip-generic: off [fixed]
tx-checksum-ipv6: on
tx-checksum-fcoe-crc: off [fixed]
tx-checksum-sctp: off [fixed]
scatter-gather: on
tx-scatter-gather: on
tx-scatter-gather-fraglist: off [fixed]
tcp-segmentation-offload: on
tx-tcp-segmentation: on
tx-tcp-ecn-segmentation: off [fixed]
tx-tcp6-segmentation: on
udp-fragmentation-offload: off [fixed]
generic-segmentation-offload: on
generic-receive-offload: on
large-receive-offload: off [fixed]
rx-vlan-offload: on
tx-vlan-offload: on
ntuple-filters: off
receive-hashing: on
highdma: on [fixed]
rx-vlan-filter: on [fixed]
vlan-challenged: off [fixed]
tx-lockless: off [fixed]
netns-local: off [fixed]
tx-gso-robust: off [fixed]
tx-fcoe-segmentation: off [fixed]
tx-gre-segmentation: off [fixed]
tx-ipip-segmentation: off [fixed]
tx-sit-segmentation: off [fixed]
tx-udp_tnl-segmentation: on [requested off]
fcoe-mtu: off [fixed]
tx-nocache-copy: off
loopback: off
rx-fcs: off
rx-all: off
tx-vlan-stag-hw-insert: off [fixed]
rx-vlan-stag-hw-parse: off [fixed]
rx-vlan-stag-filter: off [fixed]
l2-fwd-offload: off [fixed]
busy-poll: on [fixed]

root@hLinux-ovstest-1:/home/john# lspci -vvs :08:00.0
08:00.0 Ethernet controller: Mellanox Technologies MT27520 Family 
[ConnectX-3 Pro]

Subsystem: Hewlett-Packard Company Device 801f
Physical Slot: 1
Control: I/O- Mem+ BusMaster+ SpecCycle- MemWINV- VGASnoop- 
ParErr+ Stepping- SERR+ FastB2B- DisINTx+
Status: Cap+ 66MHz- UDF- FastB2B- ParErr- DEVSEL=fast >TAbort- 
SERR- 
Latency: 0, Cache Line Size: 64 bytes
Interrupt: pin A routed to IRQ 0
Region 0: Memory at 9600 (64-bit, non-prefetchable) [size=1M]
Region 2: Memory at 9400 (64-bit, prefetchable) [size=32M]
Capabilities: [40] Power Management version 3
Flags: PMEClk- DSI- D1- D2- AuxCurrent=0mA 
PME(D0-,D1-,D2-,D3hot-,D3cold-)

Status: D0 NoSoftRst+ PME-Enable- DSel=0 DScale=0 PME-
Capabilities: [48] Vital Product Data
Product Name: HP Ethernet 10G 2-port 546SFP+ Adapter
Read-only fields:
[PN] Part number: 779793-B21
[EC] E

[PATCH net-next] Add LAN9352 Ethernet Driver

2016-02-09 Thread Bryan.Whitehead

This is the initial submission of an ethernet driver for
the Microchip LAN9352. 

The LAN9352 is a 2-Port 10/100 Managed Ethernet Switch 
with 16-Bit Non-PCI CPU Interface.

While the LAN9352 is a Managed Ethernet Switch, this driver
only supports a simple ethernet controller interface.

Signed-off-by: Bryan Whitehead 
---
 Documentation/devicetree/bindings/net/mchp9352.txt |   31 +
 MAINTAINERS|9 +
 drivers/net/ethernet/microchip/Kconfig |   23 +-
 drivers/net/ethernet/microchip/Makefile|1 +
 drivers/net/ethernet/microchip/mchp9352.c  | 2593 
 drivers/net/ethernet/microchip/mchp9352.h  |  448 
 6 files changed, 3104 insertions(+), 1 deletion(-)
 create mode 100644 Documentation/devicetree/bindings/net/mchp9352.txt
 create mode 100644 drivers/net/ethernet/microchip/mchp9352.c
 create mode 100644 drivers/net/ethernet/microchip/mchp9352.h

diff --git a/Documentation/devicetree/bindings/net/mchp9352.txt 
b/Documentation/devicetree/bindings/net/mchp9352.txt
new file mode 100644
index 000..5b22e73
--- /dev/null
+++ b/Documentation/devicetree/bindings/net/mchp9352.txt
@@ -0,0 +1,31 @@
+* Microchip LAN9352 Controller
+
+Required properties:
+- compatible : Should be "microchip,lan9352"
+- reg : Address and length of the io space for Microchip LAN
+- interrupts : Should contain Microchip LAN interrupt line
+- interrupt-parent : Should be the phandle for the interrupt controller
+  that services interrupts for this device
+- phy-mode : See ethernet.txt file in the same directory
+
+Optional properties:
+- reg-shift : Specify the quantity to shift the register offsets by
+- reg-io-width : Specify the size (in bytes) of the IO accesses that
+  should be performed on the device.  Valid value for Microchip LAN is
+  2 or 4.  If it's omitted or invalid, the size would be 2.
+- microchip,irq-active-high : Indicates the IRQ polarity is active-high
+- microchip,irq-push-pull : Indicates the IRQ type is push-pull
+- microchip,save-mac-address : Indicates that mac address needs to be saved
+  before resetting the controller
+
+Examples:
+
+lan9220@f400 {
+   compatible = "microchip,lan9352";
+   reg = <0xf400 0x200>;
+   phy-mode = "mii";
+   interrupt-parent = <&gpio1>;
+   interrupts = <31>;
+   reg-io-width = <4>;
+   microchip,irq-push-pull;
+};
diff --git a/MAINTAINERS b/MAINTAINERS
index f678c37..c39edef 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -7136,6 +7136,15 @@ T:   git git://git.monstr.eu/linux-2.6-microblaze.git
 S: Supported
 F: arch/microblaze/
 
+MICROCHIP LAN9352 ETHERNET DRIVER
+M: Microchip Linux Driver Support 
+M: Bryan Whitehead 
+L: netdev@vger.kernel.org
+S: Maintained
+F: Documentation/devicetree/bindings/net/mchp9352.txt
+F: drivers/net/ethernet/microchip/mchp9352.h
+F: drivers/net/ethernet/microchip/mchp9352.c
+
 MICROSOFT SURFACE PRO 3 BUTTON DRIVER
 M: Chen Yu 
 L: platform-driver-...@vger.kernel.org
diff --git a/drivers/net/ethernet/microchip/Kconfig 
b/drivers/net/ethernet/microchip/Kconfig
index 36a09d9..caf4011 100644
--- a/drivers/net/ethernet/microchip/Kconfig
+++ b/drivers/net/ethernet/microchip/Kconfig
@@ -5,7 +5,6 @@
 config NET_VENDOR_MICROCHIP
bool "Microchip devices"
default y
-   depends on SPI
---help---
  If you have a network (Ethernet) card belonging to this class, say Y.
 
@@ -28,6 +27,7 @@ config ENC28J60
 
 config ENC28J60_WRITEVERIFY
bool "Enable write verify"
+   depends on SPI
depends on ENC28J60
---help---
  Enable the verify after the buffer write useful for debugging purpose.
@@ -42,4 +42,25 @@ config ENCX24J600
   To compile this driver as a module, choose M here. The module will be
   called encx24j600.
 
+config MCHP9352
+   tristate "LAN9352 embedded ethernet support"
+   depends on HAS_IOMEM
+   select CRC32
+   select MII
+   select PHYLIB
+   ---help---
+ LAN9352 is a switch product but this driver only support
+ it as a simple ethernet controller. i.e. No switch features
+ are supported yet.
+
+ May support LAN9250, LAN9311, and LAN9312, but these products
+ were not tested.
+
+ Say Y here if you want support for MICROCHIP LAN9352 as a simple
+ ethernet controller.
+
+ To compile this driver as a module, choose M here. The module
+ will be called mchp9352.
+
+
 endif # NET_VENDOR_MICROCHIP
diff --git a/drivers/net/ethernet/microchip/Makefile 
b/drivers/net/ethernet/microchip/Makefile
index ff78f62..6ce5296 100644
--- a/drivers/net/ethernet/microchip/Makefile
+++ b/drivers/net/ethernet/microchip/Makefile
@@ -4,3 +4,4 @@
 
 obj-$(CONFIG_ENC28J60) += enc28j60.o
 obj-$(CONFIG_ENCX24J600) += encx24j600.o encx24j600-regmap.o
+obj-$(CONFIG_MCHP9352) += mchp9352.o
diff --git a/drivers/net/ethern

Re: [PATCH 8/9] rfkill: Userspace control for airplane mode

2016-02-09 Thread João Paulo Rechi Vita

On 8 February 2016 at 17:53, Julian Calaby  wrote:

>> +   if (ev.op == RFKILL_OP_AIRPLANE_MODE_RELEASE) {
>> +   if (rfkill_apm_owned && !data->is_apm_owner) {
>
> Are you sure this is correct?
>
> In the case that airplane mode isn't owned, the
> rfkill_apm_led_trigger_event() call will be a noop, so we should
> arguably not be calling it.
>

Ok, I'm changing the check to be consistent with _CHANGE, so the call
only succeeds if (rfkill_apm_owned && data->is_apm_owner), and return
an error otherwise.

> Also, should we just fail silently if we're not the owner? I.e. what
> does userspace learn from this op failing and is that useful?
>

I think it is better to return an error every time userspace is trying
to call an operation that it was not supposed to call at a certain
state (without acquiring control of the airplane-mode indicator). If a
program has a logic error that makes it call _RELEASE without calling
_ACQUIRE first, it's easier for the programmer to spot the problem if
we return an error here.

>> +   count = -EACCES;
>> +   } else {
>> +   bool state = 
>> rfkill_global_states[RFKILL_TYPE_ALL].cur;
>> +
>> +   rfkill_apm_owned = false;
>> +   data->is_apm_owner = false;
>> +   rfkill_apm_led_trigger_event(state);
>> +   }
>> +   }
>> +
>> +   if (ev.op == RFKILL_OP_AIRPLANE_MODE_CHANGE) {
>> +   if (rfkill_apm_owned && data->is_apm_owner)
>> +   rfkill_apm_led_trigger_event(ev.soft);
>> +   else
>> +   count = -EACCES;
>> +   }
>> +
>> if (ev.op == RFKILL_OP_CHANGE_ALL)
>> rfkill_update_global_state(ev.type, ev.soft);
>>
>> @@ -1230,7 +1261,17 @@ static int rfkill_fop_release(struct inode *inode, 
>> struct file *file)
>> struct rfkill_int_event *ev, *tmp;
>>
>> mutex_lock(&rfkill_global_mutex);
>> +
>> +   if (data->is_apm_owner) {
>> +   bool state = rfkill_global_states[RFKILL_TYPE_ALL].cur;
>> +
>> +   rfkill_apm_owned = false;
>> +   data->is_apm_owner = false;
>> +   rfkill_apm_led_trigger_event(state);
>
> Also, this code is duplicated from the _RELEASE op above. Would it
> make sense to factor it out into a separate function?
>

Yes, makes sense. This also made me notice I was assigning a negative
value to a size_t variable (count).

>> +   }
>> +
>> list_del(&data->list);
>> +
>
> (extra line)
>

After factoring out the _RELEASE code it looks better without this
additional line.

>> mutex_unlock(&rfkill_global_mutex);
>>
>> mutex_destroy(&data->mtx);
>
> Thanks,
>

Thanks for the review, Julian. I'm sending an updated version.

--
João Paulo Rechi Vita
http://about.me/jprvita

[PATCH 8/9] rfkill: Userspace control for airplane mode

2016-02-09 Thread João Paulo Rechi Vita

Provide an interface for the airplane-mode indicator to be controlled from
userspace. User has to first acquire the control through
RFKILL_OP_AIRPLANE_MODE_ACQUIRE and keep the fd open for the whole time
it wants to be in control of the indicator. Closing the fd or using
RFKILL_OP_AIRPLANE_MODE_RELEASE restores the default policy.

To change state of the indicator, the RFKILL_OP_AIRPLANE_MODE_CHANGE
operation is used, passing the value on "struct rfkill_event.soft". If
the caller has not acquired the airplane-mode control beforehand, the
operation fails.

Signed-off-by: João Paulo Rechi Vita 
---
 Documentation/rfkill.txt| 10 ++
 include/uapi/linux/rfkill.h |  3 +++
 net/rfkill/core.c   | 45 +
 3 files changed, 54 insertions(+), 4 deletions(-)

diff --git a/Documentation/rfkill.txt b/Documentation/rfkill.txt
index b13025a..aa6e014 100644
--- a/Documentation/rfkill.txt
+++ b/Documentation/rfkill.txt
@@ -87,6 +87,7 @@ RFKill provides per-switch LED triggers, which can be used to 
drive LEDs
 according to the switch state (LED_FULL when blocked, LED_OFF otherwise).
 An airplane-mode indicator LED trigger is also available, which triggers
 LED_FULL when all radios known by RFKill are blocked, and LED_OFF otherwise.
+The airplane-mode indicator LED trigger policy can be overridden by userspace.
 
 
 5. Userspace support
@@ -123,5 +124,14 @@ RFKILL_TYPE
 The contents of these variables corresponds to the "name", "state" and
 "type" sysfs files explained above.
 
+Userspace can also override the default airplane-mode indicator policy through
+/dev/rfkill. Control of the airplane mode indicator has to be acquired first,
+using RFKILL_OP_AIRPLANE_MODE_ACQUIRE, and is only available for one userspace
+application at a time. Closing the fd or using RFKILL_OP_AIRPLANE_MODE_RELEASE
+reverts the airplane-mode indicator back to the default kernel policy and makes
+it available for other applications to take control. Changes to the
+airplane-mode indicator state can be made using RFKILL_OP_AIRPLANE_MODE_CHANGE,
+passing the new value in the 'soft' field of 'struct rfkill_event'.
+
 
 For further details consult Documentation/ABI/stable/sysfs-class-rfkill.
diff --git a/include/uapi/linux/rfkill.h b/include/uapi/linux/rfkill.h
index 2e00dce..9cb999b 100644
--- a/include/uapi/linux/rfkill.h
+++ b/include/uapi/linux/rfkill.h
@@ -67,6 +67,9 @@ enum rfkill_operation {
RFKILL_OP_DEL,
RFKILL_OP_CHANGE,
RFKILL_OP_CHANGE_ALL,
+   RFKILL_OP_AIRPLANE_MODE_ACQUIRE,
+   RFKILL_OP_AIRPLANE_MODE_RELEASE,
+   RFKILL_OP_AIRPLANE_MODE_CHANGE,
 };
 
 /**
diff --git a/net/rfkill/core.c b/net/rfkill/core.c
index fb11547..c0da716 100644
--- a/net/rfkill/core.c
+++ b/net/rfkill/core.c
@@ -89,6 +89,7 @@ struct rfkill_data {
struct mutexmtx;
wait_queue_head_t   read_wait;
boolinput_handler;
+   boolis_apm_owner;
 };
 
 
@@ -123,7 +124,7 @@ static struct {
 } rfkill_global_states[NUM_RFKILL_TYPES];
 
 static bool rfkill_epo_lock_active;
-
+static bool rfkill_apm_owned;
 
 #ifdef CONFIG_RFKILL_LEDS
 static struct led_trigger rfkill_apm_led_trigger;
@@ -350,7 +351,8 @@ static void rfkill_update_global_state(enum rfkill_type 
type, bool blocked)
 
for (i = 0; i < NUM_RFKILL_TYPES; i++)
rfkill_global_states[i].cur = blocked;
-   rfkill_apm_led_trigger_event(blocked);
+   if (!rfkill_apm_owned)
+   rfkill_apm_led_trigger_event(blocked);
 }
 
 #ifdef CONFIG_RFKILL_INPUT
@@ -1180,11 +1182,26 @@ static ssize_t rfkill_fop_read(struct file *file, char 
__user *buf,
return ret;
 }
 
+static int rfkill_airplane_mode_release(struct rfkill_data *data)
+{
+   bool state = rfkill_global_states[RFKILL_TYPE_ALL].cur;
+
+   if (rfkill_apm_owned && data->is_apm_owner) {
+   rfkill_apm_owned = false;
+   data->is_apm_owner = false;
+   rfkill_apm_led_trigger_event(state);
+   return 0;
+   }
+   return -EACCES;
+}
+
 static ssize_t rfkill_fop_write(struct file *file, const char __user *buf,
size_t count, loff_t *pos)
 {
+   struct rfkill_data *data = file->private_data;
struct rfkill *rfkill;
struct rfkill_event ev;
+   int ret = 0;
 
/* we don't need the 'hard' variable but accept it */
if (count < RFKILL_EVENT_SIZE_V1 - 1)
@@ -1199,7 +1216,7 @@ static ssize_t rfkill_fop_write(struct file *file, const 
char __user *buf,
if (copy_from_user(&ev, buf, count))
return -EFAULT;
 
-   if (ev.op != RFKILL_OP_CHANGE && ev.op != RFKILL_OP_CHANGE_ALL)
+   if (ev.op < RFKILL_OP_CHANGE)
return -EINVAL;
 
if (ev.type >= NUM_RFKILL_TYPES)
@@ -1207,6 +1224,25 @@ static ssize_t rfkill_fop_write(struct file *file, const 
char __user *buf,
 
mutex_lock(&rfkill_g

Re: [PATCH 8/9] rfkill: Userspace control for airplane mode

2016-02-09 Thread Julian Calaby

Hi All,

On Wed, Feb 10, 2016 at 8:36 AM, João Paulo Rechi Vita
 wrote:
> Provide an interface for the airplane-mode indicator to be controlled from
> userspace. User has to first acquire the control through
> RFKILL_OP_AIRPLANE_MODE_ACQUIRE and keep the fd open for the whole time
> it wants to be in control of the indicator. Closing the fd or using
> RFKILL_OP_AIRPLANE_MODE_RELEASE restores the default policy.
>
> To change state of the indicator, the RFKILL_OP_AIRPLANE_MODE_CHANGE
> operation is used, passing the value on "struct rfkill_event.soft". If
> the caller has not acquired the airplane-mode control beforehand, the
> operation fails.
>
> Signed-off-by: João Paulo Rechi Vita 

This looks sane to me.

Reviewed-by: Julian Calaby 

Thanks,

-- 
Julian Calaby

Email: julian.cal...@gmail.com
Profile: http://www.google.com/profiles/julian.calaby/

Re: Pushing AF_RXRPC rewrite patches to net/next

2016-02-09 Thread David Miller

From: David Howells 
Date: Tue, 09 Feb 2016 16:31:01 +

> Are you okay with me sending some of them your way now so that you don't get a
> host all in one go, or would you prefer the mass invasion approach?

Please send them in reasonable chunks, thanks.

Re: [Intel-wired-lan] [next] igb: allow setting MAC address on i211 using a device tree blob

2016-02-09 Thread David Miller

From: Shannon Nelson 
Date: Tue, 9 Feb 2016 09:42:45 -0800

> It seems to me this should be using eth_platform_get_mac_address(), a
> slightly more generic method to do this.  See the i40e driver for an
> example, commit d9a84324e6 I believe.

+1

Re: [PATCH v2] net: fec: Add "phy-reset-active-low" property to DT

2016-02-09 Thread Fabio Estevam

Hi Bernhard,

On Tue, Feb 9, 2016 at 6:01 PM, Bernhard Walle  wrote:
> We need that for a custom hardware that needs the reverse reset
> sequence.
>
> Signed-off-by: Bernhard Walle 

Looks good:

Reviewed-by: Fabio Estevam

[PATCH v2 0/2] Factor out register bit twiddling in the Renesas Ethernet drivers

2016-02-09 Thread Sergei Shtylyov

Hello.

   Here's a set of 2 patches against DaveM's 'net-next.git' repo. We factor out
the often repeated pattern of reading a register, AND'ing and/or OR'ing some
bits, and then writing the value back.

[1/2] ravb: factor out register bit twiddling code
[2/2] sh_eth: factor out register bit twiddling code

MBR, Sergei

[PATCH v2 1/2] ravb: factor out register bit twiddling code

2016-02-09 Thread Sergei Shtylyov

The driver has an often repeated pattern of reading a register, AND'ing and/or
OR'ing some bits  and writing  the  value back. Factor the pattern out into
ravb_modify() -- this saves 260 bytes of code with ARM gcc 4.7.3.

While at it, update Cogent Embedded's copyrights.

Signed-off-by: Sergei Shtylyov 

---
Changes in version 2:
- renamed ravb_modify()'s parameters.

 drivers/net/ethernet/renesas/ravb.h  |4 +
 drivers/net/ethernet/renesas/ravb_main.c |   68 ---
 drivers/net/ethernet/renesas/ravb_ptp.c  |   25 ++-
 3 files changed, 36 insertions(+), 61 deletions(-)

Index: net-next/drivers/net/ethernet/renesas/ravb.h
===
--- net-next.orig/drivers/net/ethernet/renesas/ravb.h
+++ net-next/drivers/net/ethernet/renesas/ravb.h
@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2014-2015 Renesas Electronics Corporation
  * Copyright (C) 2015 Renesas Solutions Corp.
- * Copyright (C) 2015 Cogent Embedded, Inc. 
+ * Copyright (C) 2015-2016 Cogent Embedded, Inc. 
  *
  * Based on the SuperH Ethernet driver
  *
@@ -837,6 +837,8 @@ static inline void ravb_write(struct net
iowrite32(data, priv->addr + reg);
 }
 
+void ravb_modify(struct net_device *ndev, enum ravb_reg reg, u32 clear,
+u32 set);
 int ravb_wait(struct net_device *ndev, enum ravb_reg reg, u32 mask, u32 value);
 
 irqreturn_t ravb_ptp_interrupt(struct net_device *ndev);
Index: net-next/drivers/net/ethernet/renesas/ravb_main.c
===
--- net-next.orig/drivers/net/ethernet/renesas/ravb_main.c
+++ net-next/drivers/net/ethernet/renesas/ravb_main.c
@@ -2,7 +2,7 @@
  *
  * Copyright (C) 2014-2015 Renesas Electronics Corporation
  * Copyright (C) 2015 Renesas Solutions Corp.
- * Copyright (C) 2015 Cogent Embedded, Inc. 
+ * Copyright (C) 2015-2016 Cogent Embedded, Inc. 
  *
  * Based on the SuperH Ethernet driver
  *
@@ -42,6 +42,12 @@
 NETIF_MSG_RX_ERR | \
 NETIF_MSG_TX_ERR)
 
+void ravb_modify(struct net_device *ndev, enum ravb_reg reg, u32 clear,
+u32 set)
+{
+   ravb_write(ndev, (ravb_read(ndev, reg) & ~clear) | set, reg);
+}
+
 int ravb_wait(struct net_device *ndev, enum ravb_reg reg, u32 mask, u32 value)
 {
int i;
@@ -59,8 +65,7 @@ static int ravb_config(struct net_device
int error;
 
/* Set config mode */
-   ravb_write(ndev, (ravb_read(ndev, CCC) & ~CCC_OPC) | CCC_OPC_CONFIG,
-  CCC);
+   ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_CONFIG);
/* Check if the operating mode is changed to the config mode */
error = ravb_wait(ndev, CSR, CSR_OPS, CSR_OPS_CONFIG);
if (error)
@@ -72,13 +77,8 @@ static int ravb_config(struct net_device
 static void ravb_set_duplex(struct net_device *ndev)
 {
struct ravb_private *priv = netdev_priv(ndev);
-   u32 ecmr = ravb_read(ndev, ECMR);
 
-   if (priv->duplex)   /* Full */
-   ecmr |=  ECMR_DM;
-   else/* Half */
-   ecmr &= ~ECMR_DM;
-   ravb_write(ndev, ecmr, ECMR);
+   ravb_modify(ndev, ECMR, ECMR_DM, priv->duplex ? ECMR_DM : 0);
 }
 
 static void ravb_set_rate(struct net_device *ndev)
@@ -131,13 +131,8 @@ static void ravb_mdio_ctrl(struct mdiobb
 {
struct ravb_private *priv = container_of(ctrl, struct ravb_private,
 mdiobb);
-   u32 pir = ravb_read(priv->ndev, PIR);
 
-   if (set)
-   pir |=  mask;
-   else
-   pir &= ~mask;
-   ravb_write(priv->ndev, pir, PIR);
+   ravb_modify(priv->ndev, PIR, mask, set ? mask : 0);
 }
 
 /* MDC pin control */
@@ -393,9 +388,9 @@ static int ravb_dmac_init(struct net_dev
ravb_ring_format(ndev, RAVB_NC);
 
 #if defined(__LITTLE_ENDIAN)
-   ravb_write(ndev, ravb_read(ndev, CCC) & ~CCC_BOC, CCC);
+   ravb_modify(ndev, CCC, CCC_BOC, 0);
 #else
-   ravb_write(ndev, ravb_read(ndev, CCC) | CCC_BOC, CCC);
+   ravb_modify(ndev, CCC, CCC_BOC, CCC_BOC);
 #endif
 
/* Set AVB RX */
@@ -418,8 +413,7 @@ static int ravb_dmac_init(struct net_dev
ravb_write(ndev, TIC_FTE0 | TIC_FTE1 | TIC_TFUE, TIC);
 
/* Setting the control will start the AVB-DMAC process. */
-   ravb_write(ndev, (ravb_read(ndev, CCC) & ~CCC_OPC) | CCC_OPC_OPERATION,
-  CCC);
+   ravb_modify(ndev, CCC, CCC_OPC, CCC_OPC_OPERATION);
 
return 0;
 }
@@ -493,7 +487,7 @@ static void ravb_get_tx_tstamp(struct ne
break;
}
}
-   ravb_write(ndev, ravb_read(ndev, TCCR) | TCCR_TFR, TCCR);
+   ravb_modify(ndev, TCCR, TCCR_TFR, TCCR_TFR);
}
 }
 
@@ -613,13 +607,13 @@ static bool ravb_rx(struct net_device *n
 static void ravb_rcv_snd_disable(struct net_device *ndev)
 {
/* Disable TX and RX */
-

[PATCH v2 2/2] sh_eth: factor out register bit twiddling code

2016-02-09 Thread Sergei Shtylyov

The driver has an often repeated pattern of reading a register, AND'ing and/or
OR'ing some bits  and writing  the  value back. Factor the pattern out into
sh_eth_modify() -- this saves  84 bytes of code with ARM gcc 4.7.3.

While at it, update Cogent Embedded's copyright.

Signed-off-by: Sergei Shtylyov 

---
Changes in version 2:
- renamed sh_eth_modify()'s parameters.

 drivers/net/ethernet/renesas/sh_eth.c |   53 +++---
 1 file changed, 24 insertions(+), 29 deletions(-)

Index: net-next/drivers/net/ethernet/renesas/sh_eth.c
===
--- net-next.orig/drivers/net/ethernet/renesas/sh_eth.c
+++ net-next/drivers/net/ethernet/renesas/sh_eth.c
@@ -3,7 +3,7 @@
  *  Copyright (C) 2014  Renesas Electronics Corporation
  *  Copyright (C) 2006-2012 Nobuhiro Iwamatsu
  *  Copyright (C) 2008-2014 Renesas Solutions Corp.
- *  Copyright (C) 2013-2014 Cogent Embedded, Inc.
+ *  Copyright (C) 2013-2016 Cogent Embedded, Inc.
  *  Copyright (C) 2014 Codethink Limited
  *
  *  This program is free software; you can redistribute it and/or modify it
@@ -428,6 +428,13 @@ static u32 sh_eth_read(struct net_device
return ioread32(mdp->addr + offset);
 }
 
+static void sh_eth_modify(struct net_device *ndev, int enum_index, u32 clear,
+ u32 set)
+{
+   sh_eth_write(ndev, (sh_eth_read(ndev, enum_index) & ~clear) | set,
+enum_index);
+}
+
 static bool sh_eth_is_gether(struct sh_eth_private *mdp)
 {
return mdp->reg_offset == sh_eth_offset_gigabit;
@@ -467,10 +474,7 @@ static void sh_eth_set_duplex(struct net
 {
struct sh_eth_private *mdp = netdev_priv(ndev);
 
-   if (mdp->duplex) /* Full */
-   sh_eth_write(ndev, sh_eth_read(ndev, ECMR) | ECMR_DM, ECMR);
-   else/* Half */
-   sh_eth_write(ndev, sh_eth_read(ndev, ECMR) & ~ECMR_DM, ECMR);
+   sh_eth_modify(ndev, ECMR, ECMR_DM, mdp->duplex ? ECMR_DM : 0);
 }
 
 static void sh_eth_chip_reset(struct net_device *ndev)
@@ -583,10 +587,10 @@ static void sh_eth_set_rate_r8a777x(stru
 
switch (mdp->speed) {
case 10: /* 10BASE */
-   sh_eth_write(ndev, sh_eth_read(ndev, ECMR) & ~ECMR_ELB, ECMR);
+   sh_eth_modify(ndev, ECMR, ECMR_ELB, 0);
break;
case 100:/* 100BASE */
-   sh_eth_write(ndev, sh_eth_read(ndev, ECMR) | ECMR_ELB, ECMR);
+   sh_eth_modify(ndev, ECMR, ECMR_ELB, ECMR_ELB);
break;
default:
break;
@@ -649,10 +653,10 @@ static void sh_eth_set_rate_sh7724(struc
 
switch (mdp->speed) {
case 10: /* 10BASE */
-   sh_eth_write(ndev, sh_eth_read(ndev, ECMR) & ~ECMR_RTM, ECMR);
+   sh_eth_modify(ndev, ECMR, ECMR_RTM, 0);
break;
case 100:/* 100BASE */
-   sh_eth_write(ndev, sh_eth_read(ndev, ECMR) | ECMR_RTM, ECMR);
+   sh_eth_modify(ndev, ECMR, ECMR_RTM, ECMR_RTM);
break;
default:
break;
@@ -924,8 +928,7 @@ static int sh_eth_reset(struct net_devic
 
if (sh_eth_is_gether(mdp) || sh_eth_is_rz_fast_ether(mdp)) {
sh_eth_write(ndev, EDSR_ENALL, EDSR);
-   sh_eth_write(ndev, sh_eth_read(ndev, EDMR) | EDMR_SRST_GETHER,
-EDMR);
+   sh_eth_modify(ndev, EDMR, EDMR_SRST_GETHER, EDMR_SRST_GETHER);
 
ret = sh_eth_check_reset(ndev);
if (ret)
@@ -949,11 +952,9 @@ static int sh_eth_reset(struct net_devic
if (mdp->cd->select_mii)
sh_eth_select_mii(ndev);
} else {
-   sh_eth_write(ndev, sh_eth_read(ndev, EDMR) | EDMR_SRST_ETHER,
-EDMR);
+   sh_eth_modify(ndev, EDMR, EDMR_SRST_ETHER, EDMR_SRST_ETHER);
mdelay(3);
-   sh_eth_write(ndev, sh_eth_read(ndev, EDMR) & ~EDMR_SRST_ETHER,
-EDMR);
+   sh_eth_modify(ndev, EDMR, EDMR_SRST_ETHER, 0);
}
 
return ret;
@@ -1285,7 +1286,7 @@ static int sh_eth_dev_init(struct net_de
sh_eth_write(ndev, ndev->mtu + ETH_HLEN + VLAN_HLEN + ETH_FCS_LEN,
 RFLR);
 
-   sh_eth_write(ndev, sh_eth_read(ndev, EESR), EESR);
+   sh_eth_modify(ndev, EESR, 0, 0);
if (start) {
mdp->irq_enabled = true;
sh_eth_write(ndev, mdp->cd->eesipr_value, EESIPR);
@@ -1532,15 +1533,13 @@ static int sh_eth_rx(struct net_device *
 static void sh_eth_rcv_snd_disable(struct net_device *ndev)
 {
/* disable tx and rx */
-   sh_eth_write(ndev, sh_eth_read(ndev, ECMR) &
-   ~(ECMR_RE | ECMR_TE), ECMR);
+   sh_eth_modify(ndev, ECMR, ECMR_RE | ECMR_TE, 0);
 }
 
 static void sh_eth_rcv_snd_enable(struct net_device *ndev)
 {
/* enable tx and rx */
-   sh_eth_write(nde

[PATCH net-next 3/7] tcp: __tcp_hdrlen() helper

2016-02-09 Thread Craig Gallek

From: Craig Gallek 

tcp_hdrlen is wasteful if you already have a pointer to struct tcphdr.
This splits the size calculation into a helper function that can be
used if a struct tcphdr is already available.

Signed-off-by: Craig Gallek 
---
 include/linux/tcp.h | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/include/linux/tcp.h b/include/linux/tcp.h
index b386361ba3e8..c216707d63bf 100644
--- a/include/linux/tcp.h
+++ b/include/linux/tcp.h
@@ -29,9 +29,14 @@ static inline struct tcphdr *tcp_hdr(const struct sk_buff 
*skb)
return (struct tcphdr *)skb_transport_header(skb);
 }
 
+static inline unsigned int __tcp_hdrlen(const struct tcphdr *th)
+{
+   return th->doff * 4;
+}
+
 static inline unsigned int tcp_hdrlen(const struct sk_buff *skb)
 {
-   return tcp_hdr(skb)->doff * 4;
+   return __tcp_hdrlen(tcp_hdr(skb));
 }
 
 static inline struct tcphdr *inner_tcp_hdr(const struct sk_buff *skb)
-- 
2.7.0.rc3.207.g0ac5344

[PATCH net-next 7/7] soreuseport: BPF selection functional test for TCP

2016-02-09 Thread Craig Gallek

From: Craig Gallek 

Unfortunately the existing test relied on packet payload in order to
map incoming packets to sockets.  In order to get this to work with TCP,
TCP_FASTOPEN needed to be used.

Since the fast open path is slightly different than the standard TCP path,
I created a second test which sends to reuseport group members based
on receiving cpu core id.  This will probably serve as a better
real-world example use as well.

Signed-off-by: Craig Gallek 
---
 tools/testing/selftests/net/.gitignore  |   1 +
 tools/testing/selftests/net/Makefile|   2 +-
 tools/testing/selftests/net/reuseport_bpf.c | 117 ++-
 tools/testing/selftests/net/reuseport_bpf_cpu.c | 258 
 4 files changed, 370 insertions(+), 8 deletions(-)
 create mode 100644 tools/testing/selftests/net/reuseport_bpf_cpu.c

diff --git a/tools/testing/selftests/net/.gitignore 
b/tools/testing/selftests/net/.gitignore
index 6fb23366b258..69bb3fc38fb2 100644
--- a/tools/testing/selftests/net/.gitignore
+++ b/tools/testing/selftests/net/.gitignore
@@ -2,3 +2,4 @@ socket
 psock_fanout
 psock_tpacket
 reuseport_bpf
+reuseport_bpf_cpu
diff --git a/tools/testing/selftests/net/Makefile 
b/tools/testing/selftests/net/Makefile
index 41449b5ad0a9..c658792d47b4 100644
--- a/tools/testing/selftests/net/Makefile
+++ b/tools/testing/selftests/net/Makefile
@@ -4,7 +4,7 @@ CFLAGS = -Wall -O2 -g
 
 CFLAGS += -I../../../../usr/include/
 
-NET_PROGS = socket psock_fanout psock_tpacket reuseport_bpf
+NET_PROGS = socket psock_fanout psock_tpacket reuseport_bpf reuseport_bpf_cpu
 
 all: $(NET_PROGS)
 %: %.c
diff --git a/tools/testing/selftests/net/reuseport_bpf.c 
b/tools/testing/selftests/net/reuseport_bpf.c
index bec1b5dd2530..96ba386b1b7b 100644
--- a/tools/testing/selftests/net/reuseport_bpf.c
+++ b/tools/testing/selftests/net/reuseport_bpf.c
@@ -9,10 +9,12 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -169,9 +171,15 @@ static void build_recv_group(const struct test_params p, 
int fd[], uint16_t mod,
if (bind(fd[i], addr, sockaddr_size()))
error(1, errno, "failed to bind recv socket %d", i);
 
-   if (p.protocol == SOCK_STREAM)
+   if (p.protocol == SOCK_STREAM) {
+   opt = 4;
+   if (setsockopt(fd[i], SOL_TCP, TCP_FASTOPEN, &opt,
+  sizeof(opt)))
+   error(1, errno,
+ "failed to set TCP_FASTOPEN on %d", i);
if (listen(fd[i], p.recv_socks * 10))
error(1, errno, "failed to listen on socket");
+   }
}
free(addr);
 }
@@ -189,10 +197,8 @@ static void send_from(struct test_params p, uint16_t 
sport, char *buf,
 
if (bind(fd, saddr, sockaddr_size()))
error(1, errno, "failed to bind send socket");
-   if (connect(fd, daddr, sockaddr_size()))
-   error(1, errno, "failed to connect");
 
-   if (send(fd, buf, len, 0) < 0)
+   if (sendto(fd, buf, len, MSG_FASTOPEN, daddr, sockaddr_size()) < 0)
error(1, errno, "failed to send message");
 
close(fd);
@@ -260,7 +266,7 @@ static void test_recv_order(const struct test_params p, int 
fd[], int mod)
}
 }
 
-static void test_reuseport_ebpf(const struct test_params p)
+static void test_reuseport_ebpf(struct test_params p)
 {
int i, fd[p.recv_socks];
 
@@ -268,6 +274,7 @@ static void test_reuseport_ebpf(const struct test_params p)
build_recv_group(p, fd, p.recv_socks, attach_ebpf);
test_recv_order(p, fd, p.recv_socks);
 
+   p.send_port_min += p.recv_socks * 2;
fprintf(stderr, "Reprograming, testing mod %zd...\n", p.recv_socks / 2);
attach_ebpf(fd[0], p.recv_socks / 2);
test_recv_order(p, fd, p.recv_socks / 2);
@@ -276,7 +283,7 @@ static void test_reuseport_ebpf(const struct test_params p)
close(fd[i]);
 }
 
-static void test_reuseport_cbpf(const struct test_params p)
+static void test_reuseport_cbpf(struct test_params p)
 {
int i, fd[p.recv_socks];
 
@@ -284,6 +291,7 @@ static void test_reuseport_cbpf(const struct test_params p)
build_recv_group(p, fd, p.recv_socks, attach_cbpf);
test_recv_order(p, fd, p.recv_socks);
 
+   p.send_port_min += p.recv_socks * 2;
fprintf(stderr, "Reprograming, testing mod %zd...\n", p.recv_socks / 2);
attach_cbpf(fd[0], p.recv_socks / 2);
test_recv_order(p, fd, p.recv_socks / 2);
@@ -377,7 +385,7 @@ static void test_filter_no_reuseport(const struct 
test_params p)
 
 static void test_filter_without_bind(void)
 {
-   int fd1, fd2;
+   int fd1, fd2, opt = 1;
 
fprintf(stderr, "Testing filter add without bind...\n");
fd1 = socket(AF_INET, SOCK_DGRAM, 0);
@@ -386,6 +394

[PATCH net-next 0/7] Faster SO_REUSEPORT for TCP

2016-02-09 Thread Craig Gallek

From: Craig Gallek 

This patch series complements an earlier series (6a5ef90c58da)
which added faster SO_REUSEPORT lookup for UDP sockets by
extending the feature to TCP sockets.  It uses the same
array-based data structure which allows for socket selection
after finding the first listening socket that matches an incoming
packet.  Prior to this feature, every socket in the reuseport
group needed to be found and examined before a selection could be
made.

With this series the SO_ATTACH_REUSEPORT_CBPF and
SO_ATTACH_REUSEPORT_EBPF socket options now work for TCP sockets
as well.  The test at the end of the series includes an example of
how to use these options to select a reuseport socket based on the
cpu core id handling the incoming packet.

There are several refactoring patches that precede the feature
implementation.  Only the last two patches in this series
should result in any behavioral changes.

v2:
- In the first patch I missed a couple of hash functions that should now be
  returning int instead of void.  I missed these the first time through as it
  only generated a warning and not an error :\

Craig Gallek (7):
  sock: struct proto hash function may error
  inet: create IPv6-equivalent inet_hash function
  tcp: __tcp_hdrlen() helper
  inet: refactor inet[6]_lookup functions to take skb
  soreuseport: Prep for fast reuseport TCP socket selection
  soreuseport: fast reuseport TCP socket selection
  soreuseport: BPF selection functional test for TCP

 include/linux/tcp.h |   7 +-
 include/net/addrconf.h  |   2 +
 include/net/inet6_hashtables.h  |  13 +-
 include/net/inet_hashtables.h   |  25 ++-
 include/net/phonet/phonet.h |   2 +-
 include/net/ping.h  |   2 +-
 include/net/raw.h   |   2 +-
 include/net/sock.h  |   6 +-
 include/net/udp.h   |   2 +-
 net/core/filter.c   |   2 +-
 net/core/sock.c |   1 +
 net/dccp/ipv4.c |   2 +-
 net/dccp/ipv6.c |   4 +-
 net/ieee802154/socket.c |  17 +-
 net/ipv4/af_inet.c  |   9 +-
 net/ipv4/inet_connection_sock.c |  22 +-
 net/ipv4/inet_diag.c|   6 +-
 net/ipv4/inet_hashtables.c  |  67 +-
 net/ipv4/ping.c |   4 +-
 net/ipv4/raw.c  |   4 +-
 net/ipv4/tcp_ipv4.c |  10 +-
 net/ipv4/udp.c  |   4 +-
 net/ipv6/af_inet6.c |   6 +-
 net/ipv6/inet6_connection_sock.c|   2 +
 net/ipv6/inet6_hashtables.c |  34 +++-
 net/ipv6/tcp_ipv6.c |  10 +-
 net/l2tp/l2tp_ip6.c |   3 +-
 net/netfilter/xt_TPROXY.c   |  31 ++-
 net/netfilter/xt_socket.c   |  28 ++-
 net/phonet/socket.c |   6 +-
 net/sctp/socket.c   |   3 +-
 tools/testing/selftests/net/.gitignore  |   1 +
 tools/testing/selftests/net/Makefile|   2 +-
 tools/testing/selftests/net/reuseport_bpf.c | 117 ++-
 tools/testing/selftests/net/reuseport_bpf_cpu.c | 258 
 35 files changed, 624 insertions(+), 90 deletions(-)
 create mode 100644 tools/testing/selftests/net/reuseport_bpf_cpu.c

-- 
2.7.0.rc3.207.g0ac5344

[PATCH net-next 1/7] sock: struct proto hash function may error

2016-02-09 Thread Craig Gallek

From: Craig Gallek 

In order to support fast reuseport lookups in TCP, the hash function
defined in struct proto must be capable of returning an error code.
This patch changes the function signature of all related hash functions
to return an integer and handles or propagates this return value at
all call sites.

Signed-off-by: Craig Gallek 
---
 include/net/inet_hashtables.h   |  2 +-
 include/net/phonet/phonet.h |  2 +-
 include/net/ping.h  |  2 +-
 include/net/raw.h   |  2 +-
 include/net/sock.h  |  6 +++---
 include/net/udp.h   |  2 +-
 net/ieee802154/socket.c | 17 +
 net/ipv4/af_inet.c  |  9 ++---
 net/ipv4/inet_connection_sock.c |  8 +---
 net/ipv4/inet_hashtables.c  |  4 +++-
 net/ipv4/ping.c |  4 +++-
 net/ipv4/raw.c  |  4 +++-
 net/ipv6/af_inet6.c |  6 +-
 net/phonet/socket.c |  6 --
 net/sctp/socket.c   |  3 ++-
 15 files changed, 52 insertions(+), 25 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index de2e3ade6102..554440e7f83d 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -208,7 +208,7 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
 void __inet_hash(struct sock *sk, struct sock *osk);
-void inet_hash(struct sock *sk);
+int inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
 struct sock *__inet_lookup_listener(struct net *net,
diff --git a/include/net/phonet/phonet.h b/include/net/phonet/phonet.h
index 68e509750caa..039cc29cb4a8 100644
--- a/include/net/phonet/phonet.h
+++ b/include/net/phonet/phonet.h
@@ -51,7 +51,7 @@ void pn_sock_init(void);
 struct sock *pn_find_sock_by_sa(struct net *net, const struct sockaddr_pn *sa);
 void pn_deliver_sock_broadcast(struct net *net, struct sk_buff *skb);
 void phonet_get_local_port_range(int *min, int *max);
-void pn_sock_hash(struct sock *sk);
+int pn_sock_hash(struct sock *sk);
 void pn_sock_unhash(struct sock *sk);
 int pn_sock_get_port(struct sock *sk, unsigned short sport);
 
diff --git a/include/net/ping.h b/include/net/ping.h
index ac80cb45e630..5fd7cc244833 100644
--- a/include/net/ping.h
+++ b/include/net/ping.h
@@ -65,7 +65,7 @@ struct pingfakehdr {
 };
 
 int  ping_get_port(struct sock *sk, unsigned short ident);
-void ping_hash(struct sock *sk);
+int ping_hash(struct sock *sk);
 void ping_unhash(struct sock *sk);
 
 int  ping_init_sock(struct sock *sk);
diff --git a/include/net/raw.h b/include/net/raw.h
index 6a40c6562dd2..3e789008394d 100644
--- a/include/net/raw.h
+++ b/include/net/raw.h
@@ -57,7 +57,7 @@ int raw_seq_open(struct inode *ino, struct file *file,
 
 #endif
 
-void raw_hash_sk(struct sock *sk);
+int raw_hash_sk(struct sock *sk);
 void raw_unhash_sk(struct sock *sk);
 
 struct raw_sock {
diff --git a/include/net/sock.h b/include/net/sock.h
index f5ea148853e2..255d3e03727b 100644
--- a/include/net/sock.h
+++ b/include/net/sock.h
@@ -984,7 +984,7 @@ struct proto {
void(*release_cb)(struct sock *sk);
 
/* Keeping track of sk's, looking them up, and port selection methods. 
*/
-   void(*hash)(struct sock *sk);
+   int (*hash)(struct sock *sk);
void(*unhash)(struct sock *sk);
void(*rehash)(struct sock *sk);
int (*get_port)(struct sock *sk, unsigned short 
snum);
@@ -1194,10 +1194,10 @@ static inline void sock_prot_inuse_add(struct net *net, 
struct proto *prot,
 /* With per-bucket locks this operation is not-atomic, so that
  * this version is not worse.
  */
-static inline void __sk_prot_rehash(struct sock *sk)
+static inline int __sk_prot_rehash(struct sock *sk)
 {
sk->sk_prot->unhash(sk);
-   sk->sk_prot->hash(sk);
+   return sk->sk_prot->hash(sk);
 }
 
 void sk_prot_clear_portaddr_nulls(struct sock *sk, int size);
diff --git a/include/net/udp.h b/include/net/udp.h
index 2842541e28e7..ecaeec1acd50 100644
--- a/include/net/udp.h
+++ b/include/net/udp.h
@@ -177,7 +177,7 @@ static inline struct udphdr *udp_gro_udphdr(struct sk_buff 
*skb)
 }
 
 /* hash routines shared between UDPv4/6 and UDP-Litev4/6 */
-static inline void udp_lib_hash(struct sock *sk)
+static inline int udp_lib_hash(struct sock *sk)
 {
BUG();
 }
diff --git a/net/ieee802154/socket.c b/net/ieee802154/socket.c
index a548be247e15..e0bd013a1e5e 100644
--- a/net/ieee802154/socket.c
+++ b/net/ieee802154/socket.c
@@ -182,12 +182,14 @@ static int ieee802154_sock_ioctl(struct socket *sock, 
unsigned int cmd,
 static HLIST_HEAD(raw_head);
 static DEFINE_RWLOCK(raw_lock);
 
-static void raw_hash(struct sock *sk)
+static int raw_hash(struct sock *sk)
 {
write_lock_bh(&raw_lock);
sk_add_node(sk,

[PATCH net-next 6/7] soreuseport: fast reuseport TCP socket selection

2016-02-09 Thread Craig Gallek

From: Craig Gallek 

This change extends the fast SO_REUSEPORT socket lookup implemented
for UDP to TCP.  Listener sockets with SO_REUSEPORT and the same
receive address are additionally added to an array for faster
random access.  This means that only a single socket from the group
must be found in the listener list before any socket in the group can
be used to receive a packet.  Previously, every socket in the group
needed to be considered before handing off the incoming packet.

This feature also exposes the ability to use a BPF program when
selecting a socket from a reuseport group.

Signed-off-by: Craig Gallek 
---
 include/net/inet_hashtables.h|  5 +++-
 net/ipv4/inet_connection_sock.c  | 14 ++---
 net/ipv4/inet_hashtables.c   | 64 +---
 net/ipv4/udp.c   |  4 +--
 net/ipv6/inet6_connection_sock.c |  2 ++
 net/ipv6/inet6_hashtables.c  | 16 +-
 6 files changed, 93 insertions(+), 12 deletions(-)

diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 82403390af58..50f635c2c536 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -207,7 +207,10 @@ void inet_hashinfo_init(struct inet_hashinfo *h);
 
 bool inet_ehash_insert(struct sock *sk, struct sock *osk);
 bool inet_ehash_nolisten(struct sock *sk, struct sock *osk);
-void __inet_hash(struct sock *sk, struct sock *osk);
+int __inet_hash(struct sock *sk, struct sock *osk,
+   int (*saddr_same)(const struct sock *sk1,
+ const struct sock *sk2,
+ bool match_wildcard));
 int inet_hash(struct sock *sk);
 void inet_unhash(struct sock *sk);
 
diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
index 4c457c492b1f..0daac5d7c3b9 100644
--- a/net/ipv4/inet_connection_sock.c
+++ b/net/ipv4/inet_connection_sock.c
@@ -24,6 +24,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #ifdef INET_CSK_DEBUG
 const char inet_csk_timer_bug_msg[] = "inet_csk BUG: unknown timer value\n";
@@ -67,7 +68,8 @@ int inet_csk_bind_conflict(const struct sock *sk,
if ((!reuse || !sk2->sk_reuse ||
sk2->sk_state == TCP_LISTEN) &&
(!reuseport || !sk2->sk_reuseport ||
-   (sk2->sk_state != TCP_TIME_WAIT &&
+rcu_access_pointer(sk->sk_reuseport_cb) ||
+(sk2->sk_state != TCP_TIME_WAIT &&
 !uid_eq(uid, sock_i_uid(sk2) {
 
if (!sk2->sk_rcv_saddr || !sk->sk_rcv_saddr ||
@@ -132,6 +134,7 @@ again:
  sk->sk_state != TCP_LISTEN) ||
 (tb->fastreuseport > 0 &&
  sk->sk_reuseport &&
+ 
!rcu_access_pointer(sk->sk_reuseport_cb) &&
  uid_eq(tb->fastuid, uid))) &&
(tb->num_owners < smallest_size || 
smallest_size == -1)) {
smallest_size = tb->num_owners;
@@ -193,15 +196,18 @@ tb_found:
if (((tb->fastreuse > 0 &&
  sk->sk_reuse && sk->sk_state != TCP_LISTEN) ||
 (tb->fastreuseport > 0 &&
- sk->sk_reuseport && uid_eq(tb->fastuid, uid))) &&
-   smallest_size == -1) {
+ sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) &&
+ uid_eq(tb->fastuid, uid))) && smallest_size == -1) {
goto success;
} else {
ret = 1;
if (inet_csk(sk)->icsk_af_ops->bind_conflict(sk, tb, 
true)) {
if (((sk->sk_reuse && sk->sk_state != 
TCP_LISTEN) ||
 (tb->fastreuseport > 0 &&
- sk->sk_reuseport && uid_eq(tb->fastuid, 
uid))) &&
+ sk->sk_reuseport &&
+ !rcu_access_pointer(sk->sk_reuseport_cb) 
&&
+ uid_eq(tb->fastuid, uid))) &&
smallest_size != -1 && --attempts >= 0) {
spin_unlock(&head->lock);
goto again;
diff --git a/net/ipv4/inet_hashtables.c b/net/ipv4/inet_hashtables.c
index 5e4290b83255..c0f9942de924 100644
--- a/net/ipv4/inet_hashtables.c
+++ b/net/ipv4/inet_hashtables.c
@@ -20,10 +20,12 @@
 #include 
 #include 
 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
 
 static u32 inet_ehashfn(const struct net *net, const __be32 laddr,
const __u16 lport, const __be32 faddr,

[PATCH net-next 5/7] soreuseport: Prep for fast reuseport TCP socket selection

2016-02-09 Thread Craig Gallek

From: Craig Gallek 

Both of the lines in this patch probably should have been included
in the initial implementation of this code for generic socket
support, but weren't technically necessary since only UDP sockets
were supported.

First, the sk_reuseport_cb points to a structure which assumes
each socket in the group has this pointer assigned at the same
time it's added to the array in the structure.  The sk_clone_lock
function breaks this assumption.  Since a child socket shouldn't
implicitly be in a reuseport group, the simple fix is to clear
the field in the clone.

Second, the SO_ATTACH_REUSEPORT_xBPF socket options require that
SO_REUSEPORT also be set first.  For UDP sockets, this is easily
enforced at bind-time since that process both puts the socket in
the appropriate receive hlist and updates the reuseport structures.
Since these operations can happen at two different times for TCP
sockets (bind and listen) it must be explicitly checked to enforce
the use of SO_REUSEPORT with SO_ATTACH_REUSEPORT_xBPF in the
setsockopt call.

Signed-off-by: Craig Gallek 
---
 net/core/filter.c | 2 +-
 net/core/sock.c   | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/core/filter.c b/net/core/filter.c
index 94d26201080d..2a6e9562f1ab 100644
--- a/net/core/filter.c
+++ b/net/core/filter.c
@@ -1181,7 +1181,7 @@ static int __reuseport_attach_prog(struct bpf_prog *prog, 
struct sock *sk)
if (bpf_prog_size(prog->len) > sysctl_optmem_max)
return -ENOMEM;
 
-   if (sk_unhashed(sk)) {
+   if (sk_unhashed(sk) && sk->sk_reuseport) {
err = reuseport_alloc(sk);
if (err)
return err;
diff --git a/net/core/sock.c b/net/core/sock.c
index 6c1c8bc93412..46dc8ad7d050 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1531,6 +1531,7 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
newsk = NULL;
goto out;
}
+   RCU_INIT_POINTER(newsk->sk_reuseport_cb, NULL);
 
newsk->sk_err  = 0;
newsk->sk_priority = 0;
-- 
2.7.0.rc3.207.g0ac5344

[PATCH net-next 4/7] inet: refactor inet[6]_lookup functions to take skb

2016-02-09 Thread Craig Gallek

From: Craig Gallek 

This is a preliminary step to allow fast socket lookup of SO_REUSEPORT
groups.  Doing so with a BPF filter will require access to the
skb in question.  This change plumbs the skb (and offset to payload
data) through the call stack to the listening socket lookup
implementations where it will be used in a following patch.

Signed-off-by: Craig Gallek 
---
 include/net/addrconf.h |  2 ++
 include/net/inet6_hashtables.h | 11 +++
 include/net/inet_hashtables.h  | 18 --
 net/dccp/ipv4.c|  2 +-
 net/dccp/ipv6.c|  2 +-
 net/ipv4/inet_diag.c   |  6 +++---
 net/ipv4/inet_hashtables.c |  1 +
 net/ipv4/tcp_ipv4.c| 10 ++
 net/ipv6/inet6_hashtables.c|  8 ++--
 net/ipv6/tcp_ipv6.c|  8 +---
 net/netfilter/xt_TPROXY.c  | 31 ---
 net/netfilter/xt_socket.c  | 28 +---
 12 files changed, 85 insertions(+), 42 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 47f52d3cd8df..730d856683e5 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -87,6 +87,8 @@ int __ipv6_get_lladdr(struct inet6_dev *idev, struct in6_addr *addr,
  u32 banned_flags);
 int ipv6_get_lladdr(struct net_device *dev, struct in6_addr *addr,
u32 banned_flags);
+int ipv4_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
+bool match_wildcard);
 int ipv6_rcv_saddr_equal(const struct sock *sk, const struct sock *sk2,
 bool match_wildcard);
 void addrconf_join_solict(struct net_device *dev, const struct in6_addr *addr);
diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index b3c28a9dfbf1..28332bdac333 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -53,6 +53,7 @@ struct sock *__inet6_lookup_established(struct net *net,
 
 struct sock *inet6_lookup_listener(struct net *net,
   struct inet_hashinfo *hashinfo,
+  struct sk_buff *skb, int doff,
   const struct in6_addr *saddr,
   const __be16 sport,
   const struct in6_addr *daddr,
@@ -60,6 +61,7 @@ struct sock *inet6_lookup_listener(struct net *net,
 
 static inline struct sock *__inet6_lookup(struct net *net,
  struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
  const struct in6_addr *saddr,
  const __be16 sport,
  const struct in6_addr *daddr,
@@ -71,12 +73,12 @@ static inline struct sock *__inet6_lookup(struct net *net,
if (sk)
return sk;
 
-   return inet6_lookup_listener(net, hashinfo, saddr, sport,
+   return inet6_lookup_listener(net, hashinfo, skb, doff, saddr, sport,
 daddr, hnum, dif);
 }
 
 static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
- struct sk_buff *skb,
+ struct sk_buff *skb, int doff,
  const __be16 sport,
  const __be16 dport,
  int iif)
@@ -86,13 +88,14 @@ static inline struct sock *__inet6_lookup_skb(struct inet_hashinfo *hashinfo,
if (sk)
return sk;
 
-   return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo,
- &ipv6_hdr(skb)->saddr, sport,
+   return __inet6_lookup(dev_net(skb_dst(skb)->dev), hashinfo, skb,
+ doff, &ipv6_hdr(skb)->saddr, sport,
  &ipv6_hdr(skb)->daddr, ntohs(dport),
  iif);
 }
 
 struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
+ struct sk_buff *skb, int doff,
  const struct in6_addr *saddr, const __be16 sport,
  const struct in6_addr *daddr, const __be16 dport,
  const int dif);
diff --git a/include/net/inet_hashtables.h b/include/net/inet_hashtables.h
index 554440e7f83d..82403390af58 100644
--- a/include/net/inet_hashtables.h
+++ b/include/net/inet_hashtables.h
@@ -213,6 +213,7 @@ void inet_unhash(struct sock *sk);
 
 struct sock *__inet_lookup_listener(struct net *net,
struct inet_hashinfo *hashinfo,
+   struct sk_buff *skb, int doff,
const __be32 saddr, const __be16 sport,
const __be32 dad

[PATCH net-next 2/7] inet: create IPv6-equivalent inet_hash function

2016-02-09 Thread Craig Gallek

From: Craig Gallek 

In order to support fast lookups for TCP sockets with SO_REUSEPORT,
the function that adds sockets to the listening hash set needs
to be able to check receive address equality.  Since this equality
check is different for IPv4 and IPv6, we will need two different
socket hashing functions.

This patch adds inet6_hash, identical to the existing inet_hash function,
and updates the appropriate references.  A following patch will
differentiate the two by passing different comparison functions to
__inet_hash.

Signed-off-by: Craig Gallek 
---
 include/net/inet6_hashtables.h |  2 ++
 net/dccp/ipv6.c|  2 +-
 net/ipv6/inet6_hashtables.c| 12 
 net/ipv6/tcp_ipv6.c|  2 +-
 net/l2tp/l2tp_ip6.c|  3 ++-
 5 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/include/net/inet6_hashtables.h b/include/net/inet6_hashtables.h
index 7ff588ca6817..b3c28a9dfbf1 100644
--- a/include/net/inet6_hashtables.h
+++ b/include/net/inet6_hashtables.h
@@ -96,6 +96,8 @@ struct sock *inet6_lookup(struct net *net, struct inet_hashinfo *hashinfo,
  const struct in6_addr *saddr, const __be16 sport,
  const struct in6_addr *daddr, const __be16 dport,
  const int dif);
+
+int inet6_hash(struct sock *sk);
 #endif /* IS_ENABLED(CONFIG_IPV6) */
 
 #define INET6_MATCH(__sk, __net, __saddr, __daddr, __ports, __dif) \
diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
index 9c6d0508e63a..90a8269b28d0 100644
--- a/net/dccp/ipv6.c
+++ b/net/dccp/ipv6.c
@@ -993,7 +993,7 @@ static struct proto dccp_v6_prot = {
.sendmsg   = dccp_sendmsg,
.recvmsg   = dccp_recvmsg,
.backlog_rcv   = dccp_v6_do_rcv,
-   .hash  = inet_hash,
+   .hash  = inet6_hash,
.unhash= inet_unhash,
.accept= inet_csk_accept,
.get_port  = inet_csk_get_port,
diff --git a/net/ipv6/inet6_hashtables.c b/net/ipv6/inet6_hashtables.c
index 21ace5a2bf7c..3521b15b8b85 100644
--- a/net/ipv6/inet6_hashtables.c
+++ b/net/ipv6/inet6_hashtables.c
@@ -274,3 +274,15 @@ int inet6_hash_connect(struct inet_timewait_death_row *death_row,
   __inet6_check_established);
 }
 EXPORT_SYMBOL_GPL(inet6_hash_connect);
+
+int inet6_hash(struct sock *sk)
+{
+   if (sk->sk_state != TCP_CLOSE) {
+   local_bh_disable();
+   __inet_hash(sk, NULL);
+   local_bh_enable();
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(inet6_hash);
diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 006396e31cb0..d72bcfb326d8 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1865,7 +1865,7 @@ struct proto tcpv6_prot = {
.sendpage   = tcp_sendpage,
.backlog_rcv= tcp_v6_do_rcv,
.release_cb = tcp_release_cb,
-   .hash   = inet_hash,
+   .hash   = inet6_hash,
.unhash = inet_unhash,
.get_port   = inet_csk_get_port,
.enter_memory_pressure  = tcp_enter_memory_pressure,
diff --git a/net/l2tp/l2tp_ip6.c b/net/l2tp/l2tp_ip6.c
index a2c8747d2936..6b54ff3ff4cb 100644
--- a/net/l2tp/l2tp_ip6.c
+++ b/net/l2tp/l2tp_ip6.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include <net/inet6_hashtables.h>
 #include 
 #include 
 #include 
@@ -718,7 +719,7 @@ static struct proto l2tp_ip6_prot = {
.sendmsg   = l2tp_ip6_sendmsg,
.recvmsg   = l2tp_ip6_recvmsg,
.backlog_rcv   = l2tp_ip6_backlog_recv,
-   .hash  = inet_hash,
+   .hash  = inet6_hash,
.unhash= inet_unhash,
.obj_size  = sizeof(struct l2tp_ip6_sock),
 #ifdef CONFIG_COMPAT
-- 
2.7.0.rc3.207.g0ac5344

1 2 >

1 - 100 of 153 matches

Mail list logo