Re: [PATCH] at803x: fix reset handling

2016-03-22 Thread Uwe Kleine-König
Hello,

I added the author of 13a56b449325 to Cc.

On Wed, Mar 23, 2016 at 12:44:40AM +0300, Sergei Shtylyov wrote:
> The driver of course "knows" that the chip's reset signal is active low,
> so  it drives the GPIO to 0  to reset the PHY and to 1 otherwise; however
> all this will only work iff the GPIO  is  specified as active-high in the
> device tree!  I think both the driver and the device trees (if there are
> any -- I was unable to find them) need to be fixed in this case...
> 
> Fixes: 13a56b449325 ("net: phy: at803x: Add support for hardware reset")
> Signed-off-by: Sergei Shtylyov 
> 
> ---
> The patch is against DaveM's 'net.git' repo.

Don't you need to work against net-next for non-urgent stuff? Or do you
consider this urgent?

>  drivers/net/phy/at803x.c |6 +++---
>  1 file changed, 3 insertions(+), 3 deletions(-)
> 
> Index: net/drivers/net/phy/at803x.c
> ===
> --- net.orig/drivers/net/phy/at803x.c
> +++ net/drivers/net/phy/at803x.c
> @@ -277,7 +277,7 @@ static int at803x_probe(struct phy_devic
>   if (!priv)
>   return -ENOMEM;
>  
> - gpiod_reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
> + gpiod_reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
>   if (IS_ERR(gpiod_reset))
>   return PTR_ERR(gpiod_reset);
>  
> @@ -362,10 +362,10 @@ static void at803x_link_change_notify(st
>  
>   at803x_context_save(phydev, &context);
>  
> - gpiod_set_value(priv->gpiod_reset, 0);
> - msleep(1);
>   gpiod_set_value(priv->gpiod_reset, 1);
>   msleep(1);
> + gpiod_set_value(priv->gpiod_reset, 0);
> + msleep(1);

The new variant is better than the old one. The change however breaks
existing device trees which is not so nice. Given there are no mainline
users this is probably ok though. So:

Acked-by: Uwe Kleine-König 

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH] vlan: propagate gso_min_segs

2016-03-22 Thread Eric Dumazet
On Wed, 2016-03-23 at 09:35 +0800, Haishuang Yan wrote:
> vlan drivers lack proper propagation of gso_min_segs from lower device.
> 
> Signed-off-by: Haishuang Yan 
> ---

The plan was to get rid of gso_min_segs, as nothing uses it.

Otherwise I would have included this in my recent patches...

For such a rare 'issue' , we believe ndo_features_check() is better,
because it does not slow down the fast path.

Thanks.




[PATCH] vlan: propagate gso_min_segs

2016-03-22 Thread Haishuang Yan
vlan drivers lack proper propagation of gso_min_segs from lower device.

Signed-off-by: Haishuang Yan 
---
 drivers/net/ipvlan/ipvlan_main.c | 2 ++
 drivers/net/macvlan.c| 1 +
 net/8021q/vlan.c | 1 +
 net/8021q/vlan_dev.c | 1 +
 4 files changed, 5 insertions(+)

diff --git a/drivers/net/ipvlan/ipvlan_main.c b/drivers/net/ipvlan/ipvlan_main.c
index 57941d3..72a2517 100644
--- a/drivers/net/ipvlan/ipvlan_main.c
+++ b/drivers/net/ipvlan/ipvlan_main.c
@@ -120,6 +120,7 @@ static int ipvlan_init(struct net_device *dev)
dev->features |= NETIF_F_LLTX;
dev->gso_max_size = phy_dev->gso_max_size;
dev->gso_max_segs = phy_dev->gso_max_segs;
+   dev->gso_min_segs = phy_dev->gso_min_segs;
dev->hard_header_len = phy_dev->hard_header_len;
 
ipvlan_set_lockdep_class(dev);
@@ -594,6 +595,7 @@ static int ipvlan_device_event(struct notifier_block 
*unused,
ipvlan->dev->features = dev->features & IPVLAN_FEATURES;
ipvlan->dev->gso_max_size = dev->gso_max_size;
ipvlan->dev->gso_max_segs = dev->gso_max_segs;
+   ipvlan->dev->gso_min_segs = dev->gso_min_segs;
netdev_features_change(ipvlan->dev);
}
break;
diff --git a/drivers/net/macvlan.c b/drivers/net/macvlan.c
index 2bcf1f3..72991e9 100644
--- a/drivers/net/macvlan.c
+++ b/drivers/net/macvlan.c
@@ -1534,6 +1534,7 @@ static int macvlan_device_event(struct notifier_block 
*unused,
list_for_each_entry(vlan, &port->vlans, list) {
vlan->dev->gso_max_size = dev->gso_max_size;
vlan->dev->gso_max_segs = dev->gso_max_segs;
+   vlan->dev->gso_min_segs = dev->gso_min_segs;
netdev_update_features(vlan->dev);
}
break;
diff --git a/net/8021q/vlan.c b/net/8021q/vlan.c
index a1e273a..01a4de1 100644
--- a/net/8021q/vlan.c
+++ b/net/8021q/vlan.c
@@ -312,6 +312,7 @@ static void vlan_transfer_features(struct net_device *dev,
 
vlandev->gso_max_size = dev->gso_max_size;
vlandev->gso_max_segs = dev->gso_max_segs;
+   vlandev->gso_min_segs = dev->gso_min_segs;
 
if (vlan_hw_offload_capable(dev->features, vlan->vlan_proto))
vlandev->hard_header_len = dev->hard_header_len;
diff --git a/net/8021q/vlan_dev.c b/net/8021q/vlan_dev.c
index e7e6257..752263d 100644
--- a/net/8021q/vlan_dev.c
+++ b/net/8021q/vlan_dev.c
@@ -552,6 +552,7 @@ static int vlan_dev_init(struct net_device *dev)
 NETIF_F_GSO_SOFTWARE;
dev->gso_max_size = real_dev->gso_max_size;
dev->gso_max_segs = real_dev->gso_max_segs;
+   dev->gso_min_segs = real_dev->gso_min_segs;
if (dev->features & NETIF_F_VLAN_FEATURES)
netdev_warn(real_dev, "VLAN features are set incorrectly.  
Q-in-Q configurations may not work correctly.\n");
 
-- 
1.8.3.1





[PATCH] gre: fix return value of gre_rcv

2016-03-22 Thread Haishuang Yan
Dropped skb's should be documented by an appropriate return value.
Use the correct NET_RX_DROP and NET_RX_SUCCESS values for that reason.

Signed-off-by: Haishuang Yan 
---
 net/ipv4/ip_gre.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/ip_gre.c b/net/ipv4/ip_gre.c
index 31936d3..1dc0cdb 100644
--- a/net/ipv4/ip_gre.c
+++ b/net/ipv4/ip_gre.c
@@ -432,12 +432,12 @@ static int gre_rcv(struct sk_buff *skb)
goto drop;
 
if (ipgre_rcv(skb, &tpi) == PACKET_RCVD)
-   return 0;
+   return NET_RX_SUCCESS;
 
icmp_send(skb, ICMP_DEST_UNREACH, ICMP_PORT_UNREACH, 0);
 drop:
kfree_skb(skb);
-   return 0;
+   return NET_RX_DROP;
 }
 
 static __sum16 gre_checksum(struct sk_buff *skb)
-- 
1.8.3.1





[PATCH net v2] xfrm: Fix crash observed during device unregistration and decryption

2016-03-22 Thread Subash Abhinov Kasiviswanathan
A crash is observed when a decrypted packet is processed in receive
path. get_rps_cpu() tries to dereference the skb->dev fields but it
appears that the device is freed from the poison pattern.

[] get_rps_cpu+0x94/0x2f0
[] netif_rx_internal+0x140/0x1cc
[] netif_rx+0x74/0x94
[] xfrm_input+0x754/0x7d0
[] xfrm_input_resume+0x10/0x1c
[] esp_input_done+0x20/0x30
[] process_one_work+0x244/0x3fc
[] worker_thread+0x2f8/0x418
[] kthread+0xe0/0xec

-013|get_rps_cpu(
 |dev = 0xFFC08B688000,
 |skb = 0xFFC0C76AAC00 -> (
 |  dev = 0xFFC08B688000 -> (
 |name =
"..
 |name_hlist = (next = 0x, pprev =
0xAAA

Following are the sequence of events observed -

- Encrypted packet in receive path from netdevice is queued
- Encrypted packet queued for decryption (asynchronous)
- Netdevice brought down and freed
- Packet is decrypted and returned through callback in esp_input_done
- Packet is queued again for process in network stack using netif_rx

Since the device appears to have been freed, the dereference of
skb->dev in get_rps_cpu() leads to an unhandled page fault
exception.

Fix this by holding on to device reference when queueing packets
asynchronously and releasing the reference on call back return.

v2: Make the change generic to xfrm as mentioned by Steffen and
update the title to xfrm

Suggested-by: Herbert Xu 
Signed-off-by: Jerome Stanislaus 
Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 net/xfrm/xfrm_input.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/net/xfrm/xfrm_input.c b/net/xfrm/xfrm_input.c
index ad7f5b3..1c4ad47 100644
--- a/net/xfrm/xfrm_input.c
+++ b/net/xfrm/xfrm_input.c
@@ -292,12 +292,15 @@ int xfrm_input(struct sk_buff *skb, int nexthdr, __be32
spi, int encap_type)
XFRM_SKB_CB(skb)->seq.input.hi = seq_hi;
 
skb_dst_force(skb);
+   dev_hold(skb->dev);
 
nexthdr = x->type->input(x, skb);
 
if (nexthdr == -EINPROGRESS)
return 0;
 resume:
+   dev_put(skb->dev);
+
spin_lock(&x->lock);
if (nexthdr <= 0) {
if (nexthdr == -EBADMSG) {
--



Re: [PATCH] ipv6: Fix the pmtu path for connected UDP socket

2016-03-22 Thread Eric Dumazet
On Tue, 2016-03-22 at 13:13 -0700, Cong Wang wrote:
> On Tue, Mar 22, 2016 at 11:03 AM, Wei Wang  wrote:
> > Thanks Martin and Cong.
> >
> > I guess then we are going with the following fix in ip6_sk_update_pmtu():
> > 1. call ip6_update_pmtu() as it is
> > 2. do a dst_check()
> > 3. re-lookup() if it is invalid
> > 4. and then do a ip6_dst_store()/dst_set
> 
> Exactly, please try the attached patch. Note I did nothing more than a
> compile test.
> 
> Does it make sense to you now?


Hard to reply on your patch as it was not inlined.

1) Lot of code duplication, for some reason I do not yet understand.

ip6_sk_update_pmtu() and ip6_update_pmtu() will basically do the same
thing...

2)

+   if (sk->sk_state == TCP_ESTABLISHED)
+   ip6_dst_store(sk, dst, &iph->daddr, &iph->saddr);
+out:


ip6_dst_store() will do :

np->daddr_cache = daddr;  (&iph->daddr)
np->saddr_cache = saddr;  (&iph->saddr)

So when skb is freed, daddr_cache & saddr_cache point to freed data.






[net PATCH] net: Reset encap_level to avoid resetting features on inner IP headers

2016-03-22 Thread Alexander Duyck
This patch corrects an oversight in which we were allowing the encap_level
value to pass from the outer headers to the inner headers.  As a result we
were incorrectly identifying UDP or GRE tunnels as also making use of ipip
or sit when the second header actually represented a tunnel encapsulated in
either a UDP or GRE tunnel which already had the features masked.

Fixes: 76443456227097179c1482 ("net: Move GSO csum into SKB_GSO_CB")
Reported-by: Tom Herbert 
Signed-off-by: Alexander Duyck 
---
 net/ipv4/gre_offload.c |1 +
 net/ipv4/udp_offload.c |1 +
 2 files changed, 2 insertions(+)

diff --git a/net/ipv4/gre_offload.c b/net/ipv4/gre_offload.c
index dea0390d65bb..43a5c76742dc 100644
--- a/net/ipv4/gre_offload.c
+++ b/net/ipv4/gre_offload.c
@@ -49,6 +49,7 @@ static struct sk_buff *gre_gso_segment(struct sk_buff *skb,
 
/* setup inner skb. */
skb->encapsulation = 0;
+   SKB_GSO_CB(skb)->encap_level = 0;
__skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 5fcb93269afb..80236a9e1769 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -56,6 +56,7 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
sk_buff *skb,
 
/* setup inner skb. */
skb->encapsulation = 0;
+   SKB_GSO_CB(skb)->encap_level = 0;
__skb_pull(skb, tnl_hlen);
skb_reset_mac_header(skb);
skb_set_network_header(skb, skb_inner_network_offset(skb));



[PATCH] net: mediatek: fix checking for NULL instead of IS_ERR() in .probe

2016-03-22 Thread Vladimir Zapolskiy
devm_ioremap_resource() returns ERR_PTR() value on error, it never
returns NULL, fix it and propagate the returned error upwards.

Fixes: 656e705243fd ("net-next: mediatek: add support for MT7623 ethernet")
Signed-off-by: Vladimir Zapolskiy 
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c 
b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 7f2126b..e0b68af 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1690,8 +1690,8 @@ static int mtk_probe(struct platform_device *pdev)
return -ENOMEM;
 
eth->base = devm_ioremap_resource(&pdev->dev, res);
-   if (!eth->base)
-   return -EADDRNOTAVAIL;
+   if (IS_ERR(eth->base))
+   return PTR_ERR(eth->base);
 
spin_lock_init(&eth->page_lock);
 
-- 
2.1.4



Re: [PATCH net-next] net: Fix remote checksum offload with GUE

2016-03-22 Thread Tom Herbert
On Tue, Mar 22, 2016 at 3:09 PM, Tom Herbert  wrote:
> On Tue, Mar 22, 2016 at 2:45 PM, Alexander Duyck
>  wrote:
>> On Tue, Mar 22, 2016 at 2:10 PM, Tom Herbert  wrote:
>>> On Tue, Mar 22, 2016 at 1:20 PM, Alexander Duyck
>>>  wrote:
>>>> On Tue, Mar 22, 2016 at 12:19 PM, Tom Herbert  wrote:
>>>>> In skb_segment the check of whether or not to perform the checksum on
>>>>> host was changed to not consider rather remote checksum offload is
>>>>> in use. In the case that can_checksum_protocol fails the checksum
>>>>> is computed regardless. __skb_udp_tunnel_segment was modified in a
>>>>> related patch to add NETIF_F_HW_CSUM into features when grabbing
>>>>> the enc_features and remote checksum offload is being done. The
>>>>> problem is that this bit can be cleared in lower GSO layers that
>>>>> are also doing tunneling (e.g. ipip, GRE when used with GUE),
>>>>> so when we get to skb_segment that intent has been lost and
>>>>> can_checksum_protocol fails.
>>>>
>>>> So what you are describing sounds like a tunnel in tunnel scenario.
>>>> It might work better to just skip masking the features if
>>>> skb->remcsum_offload is set rather than trying to change how we
>>>> perform the offload.

>>> To be clear, my patch is restoring the old behavior not implementing a new 
>>> one.
>>
>> Yes, but the old behavior could lead to kernel panics under certain
>> circumstances.
>>
>>>> I'm pretty sure this will cause data corruption and maybe a kernel
>>>> panic if Tx checksum offload is disabled.

>>> Nope, working fine for me.
>>
>> What are the options you used to create the tunnel?  Did you enable
>> both remcsum and udpcsum?
>>
> modprobe fou
> ./ip fou add port 6080 gue
> ./ip link add name tun1 type ipip remote 10.1.1.2 local 10.1.1.2 ttl
> 225 encap gue encap-sport auto encap-dport 6080 encap-csum
> encap-remcsum
> ifconfig tun1 192.168.1.1
> ip route add 192.168.1.0/24 dev tun1
>
Here are the command I use for VXLAN for reference:

./ip link add vxlan0 type vxlan id 10 group 224.10.10.10 ttl 10 dev
eth0 udpcsum remcsumtx remcsumrx
ifconfig vxlan0 192.168.111.1
ip route add 192.168.111.0/24 dev vxlan0

When remcsum offload is working properly (enabled and outer checksum
is offloaded) then csum_partial gets very few cycles as shown by perf.

Tom

> Thanks,
> Tom
>
>>> The outer checksum would be computed in
> This patch:
>
> - Restores the check in skb_segment to look at remote checksum offload.
> - Removes the code in __skb_udp_tunnel_segment to force the
>   NETIF_F_HW_CSUM feature since this is no longer useful with above
>   change.
> - Removes check for remote checksum offload in gso_reset_checksum.
>   A special case should not be needed here.
>
> Tested: Single netperf STREAM over GUE-ipip
>
> Before fix:
>5625 Mbps
> After fix:
>6410 Mbps
>
> Fixes: 76443456227097179c1482 ("net: Move GSO csum into SKB_GSO_CB")
> Signed-off-by: Tom Herbert 
> ---
>  include/linux/skbuff.h |  4 
>  net/core/skbuff.c  |  5 ++---
>  net/ipv4/udp_offload.c | 10 --
>  3 files changed, 2 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 15d0df9..f6fe8a5 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3615,10 +3615,6 @@ static inline int gso_pskb_expand_head(struct 
> sk_buff *skb, int extra)
>
>  static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
>  {
> -   /* Do not update partial checksums if remote checksum is enabled. 
> */
> -   if (skb->remcsum_offload)
> -   return;
> -
> SKB_GSO_CB(skb)->csum = res;
> SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
>  }

 I'm pretty sure this part here will break things when you don't have
 an outer offload enabled.  What NIC did you test this on?  Did you try
 disabling the Tx checksum support in the hardware to see what would
 happen?

>>> Mellanox.
>>>
>>> When TX checksum is disabled the outer checksum is computed in
>>> __skb_udp_tunnel_segment.
>>
>> Yes, but that uses gso_make_checksum which requires the csum and
>> csum_start fields be initialized in the SKB_GSO_CB().
>>
> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f044f97..e4eb78d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3259,14 +3259,13 @@ skip_fraglist:
> nskb->truesize += nskb->data_len;
>
>  perform_csum_check:
> -   if (!csum) {
> +   if (!csum && !nskb->remcsum_offload) {
> if (skb_has_shared_frag(nskb)) {
> err = __skb_linearize(nskb);
> if (err)
> goto err;
> }

[PATCH v2] prism54: isl_38xx: Replace 'struct timeval'

2016-03-22 Thread Tina Ruchandani
'struct timeval' uses a 32-bit seconds field which will overflow in
year 2038 and beyond. This patch is part of a larger effort to remove
all instances of 'struct timeval' from the kernel and replace them
with 64-bit timekeeping variables.
The correctness of the code isn't affected by this patch - the seconds
value being printed would earlier be wrong due to overflow in timeval,
and now it gets truncated to 32-bit due to the 'long' cast used on
tv.sec field to prevent compiler warnings. Truly fixing this would
require changing the debug print to print more than 8 digits and
use a different specifier from %li.
The patch was build-tested / debugged by removing the
"if VERBOSE > SHOW_ERROR_MESSAGES" guards.

Signed-off-by: Tina Ruchandani 
Suggested-by: Arnd Bergmann 
--
Changes in v2:
- Changed printf specifier as suggested by Arnd Bergmann to
avoid truncation.
---
 drivers/net/wireless/intersil/prism54/isl_38xx.c | 35 
 1 file changed, 18 insertions(+), 17 deletions(-)

diff --git a/drivers/net/wireless/intersil/prism54/isl_38xx.c 
b/drivers/net/wireless/intersil/prism54/isl_38xx.c
index 333c1a2..6700387 100644
--- a/drivers/net/wireless/intersil/prism54/isl_38xx.c
+++ b/drivers/net/wireless/intersil/prism54/isl_38xx.c
@@ -19,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 

 #include 
 #include 
@@ -113,7 +114,7 @@ isl38xx_trigger_device(int asleep, void __iomem 
*device_base)

 #if VERBOSE > SHOW_ERROR_MESSAGES
u32 counter = 0;
-   struct timeval current_time;
+   struct timespec64 current_ts64;
DEBUG(SHOW_FUNCTION_CALLS, "isl38xx trigger device\n");
 #endif

@@ -121,22 +122,22 @@ isl38xx_trigger_device(int asleep, void __iomem 
*device_base)
if (asleep) {
/* device is in powersave, trigger the device for wakeup */
 #if VERBOSE > SHOW_ERROR_MESSAGES
-   do_gettimeofday(&current_time);
-   DEBUG(SHOW_TRACING, "%08li.%08li Device wakeup triggered\n",
- current_time.tv_sec, (long)current_time.tv_usec);
+   ktime_get_real_ts64(&current_ts64);
+   DEBUG(SHOW_TRACING, "%lld.%09ld Device wakeup triggered\n",
+ (s64)current_ts64.tv_sec, current_ts64.tv_nsec);

-   DEBUG(SHOW_TRACING, "%08li.%08li Device register read %08x\n",
- current_time.tv_sec, (long)current_time.tv_usec,
+   DEBUG(SHOW_TRACING, "%lld.%09ld Device register read %08x\n",
+ (s64)current_ts64.tv_sec, current_ts64.tv_nsec,
  readl(device_base + ISL38XX_CTRL_STAT_REG));
 #endif

reg = readl(device_base + ISL38XX_INT_IDENT_REG);
if (reg == 0xabadface) {
 #if VERBOSE > SHOW_ERROR_MESSAGES
-   do_gettimeofday(&current_time);
+   ktime_get_real_ts64(&current_ts64);
DEBUG(SHOW_TRACING,
- "%08li.%08li Device register abadface\n",
- current_time.tv_sec, (long)current_time.tv_usec);
+ "%lld.%09ld Device register abadface\n",
+ (s64)current_ts64.tv_sec, current_ts64.tv_nsec);
 #endif
/* read the Device Status Register until Sleepmode bit 
is set */
while (reg = readl(device_base + ISL38XX_CTRL_STAT_REG),
@@ -149,13 +150,13 @@ isl38xx_trigger_device(int asleep, void __iomem 
*device_base)

 #if VERBOSE > SHOW_ERROR_MESSAGES
DEBUG(SHOW_TRACING,
- "%08li.%08li Device register read %08x\n",
- current_time.tv_sec, (long)current_time.tv_usec,
+ "%lld.%09ld Device register read %08x\n",
+ (s64)current_ts64.tv_sec, current_ts64.tv_nsec,
  readl(device_base + ISL38XX_CTRL_STAT_REG));
-   do_gettimeofday(&current_time);
+   ktime_get_real_ts64(&current_ts64);
DEBUG(SHOW_TRACING,
- "%08li.%08li Device asleep counter %i\n",
- current_time.tv_sec, (long)current_time.tv_usec,
+ "%lld.%09ld Device asleep counter %i\n",
+ (s64)current_ts64.tv_sec, current_ts64.tv_nsec,
  counter);
 #endif
}
@@ -168,9 +169,9 @@ isl38xx_trigger_device(int asleep, void __iomem 
*device_base)

/* perform another read on the Device Status Register */
reg = readl(device_base + ISL38XX_CTRL_STAT_REG);
-   do_gettimeofday(&current_time);
-   DEBUG(SHOW_TRACING, "%08li.%08li Device register read %08x\n",
- current_time.tv_sec, (long)current_time.tv_usec, reg);
+   ktime_get_real_ts64(&current_ts64);
+   DEBUG(SHOW_TRACING, "%lld.%00ld Device register read %08x\n",
+

Re: [PATCH net-next] net: Fix remote checksum offload with GUE

2016-03-22 Thread Tom Herbert
On Tue, Mar 22, 2016 at 2:45 PM, Alexander Duyck
 wrote:
> On Tue, Mar 22, 2016 at 2:10 PM, Tom Herbert  wrote:
>> On Tue, Mar 22, 2016 at 1:20 PM, Alexander Duyck
>>  wrote:
>>> On Tue, Mar 22, 2016 at 12:19 PM, Tom Herbert  wrote:
>>>> In skb_segment the check of whether or not to perform the checksum on
>>>> host was changed to not consider rather remote checksum offload is
>>>> in use. In the case that can_checksum_protocol fails the checksum
>>>> is computed regardless. __skb_udp_tunnel_segment was modified in a
>>>> related patch to add NETIF_F_HW_CSUM into features when grabbing
>>>> the enc_features and remote checksum offload is being done. The
>>>> problem is that this bit can be cleared in lower GSO layers that
>>>> are also doing tunneling (e.g. ipip, GRE when used with GUE),
>>>> so when we get to skb_segment that intent has been lost and
>>>> can_checksum_protocol fails.
>>>
>>> So what you are describing sounds like a tunnel in tunnel scenario.
>>> It might work better to just skip masking the features if
>>> skb->remcsum_offload is set rather than trying to change how we
>>> perform the offload.
>>>
>> To be clear, my patch is restoring the old behavior not implementing a new 
>> one.
>
> Yes, but the old behavior could lead to kernel panics under certain
> circumstances.
>
>>> I'm pretty sure this will cause data corruption and maybe a kernel
>>> panic if Tx checksum offload is disabled.
>>>
>> Nope, working fine for me.
>
> What are the options you used to create the tunnel?  Did you enable
> both remcsum and udpcsum?
>
modprobe fou
./ip fou add port 6080 gue
./ip link add name tun1 type ipip remote 10.1.1.2 local 10.1.1.2 ttl
225 encap gue encap-sport auto encap-dport 6080 encap-csum
encap-remcsum
ifconfig tun1 192.168.1.1
ip route add 192.168.1.0/24 dev tun1

Thanks,
Tom

>> The outer checksum would be computed in
 This patch:

 - Restores the check in skb_segment to look at remote checksum offload.
 - Removes the code in __skb_udp_tunnel_segment to force the
   NETIF_F_HW_CSUM feature since this is no longer useful with above
   change.
 - Removes check for remote checksum offload in gso_reset_checksum.
   A special case should not be needed here.

 Tested: Single netperf STREAM over GUE-ipip

 Before fix:
5625 Mbps
 After fix:
6410 Mbps

 Fixes: 76443456227097179c1482 ("net: Move GSO csum into SKB_GSO_CB")
 Signed-off-by: Tom Herbert 
 ---
  include/linux/skbuff.h |  4 
  net/core/skbuff.c  |  5 ++---
  net/ipv4/udp_offload.c | 10 --
  3 files changed, 2 insertions(+), 17 deletions(-)

 diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
 index 15d0df9..f6fe8a5 100644
 --- a/include/linux/skbuff.h
 +++ b/include/linux/skbuff.h
 @@ -3615,10 +3615,6 @@ static inline int gso_pskb_expand_head(struct 
 sk_buff *skb, int extra)

  static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
  {
 -   /* Do not update partial checksums if remote checksum is enabled. 
 */
 -   if (skb->remcsum_offload)
 -   return;
 -
 SKB_GSO_CB(skb)->csum = res;
 SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
  }
>>>
>>> I'm pretty sure this part here will break things when you don't have
>>> an outer offload enabled.  What NIC did you test this on?  Did you try
>>> disabling the Tx checksum support in the hardware to see what would
>>> happen?
>>>
>> Mellanox.
>>
>> When TX checksum is disabled the outer checksum is computed in
>> __skb_udp_tunnel_segment.
>
> Yes, but that uses gso_make_checksum which requires the csum and
> csum_start fields be initialized in the SKB_GSO_CB().
>
 diff --git a/net/core/skbuff.c b/net/core/skbuff.c
 index f044f97..e4eb78d 100644
 --- a/net/core/skbuff.c
 +++ b/net/core/skbuff.c
 @@ -3259,14 +3259,13 @@ skip_fraglist:
 nskb->truesize += nskb->data_len;

  perform_csum_check:
 -   if (!csum) {
 +   if (!csum && !nskb->remcsum_offload) {
 if (skb_has_shared_frag(nskb)) {
 err = __skb_linearize(nskb);
 if (err)
 goto err;
 }
 -   if (!nskb->remcsum_offload)
 -   nskb->ip_summed = CHECKSUM_NONE;
 +   nskb->ip_summed = CHECKSUM_NONE;
 SKB_GSO_CB(nskb)->csum =
 skb_checksum(nskb, doffset,
  nskb->len - doffset, 0);
>>>
>>> I'm pretty sure this is going to cause a huge mess if you are
>>> requesting remote checksum but cannot perform an outer checksum.  One
>>> of the reasons I merged the

Re: [PATCH net-next] net: Fix remote checksum offload with GUE

2016-03-22 Thread Alexander Duyck
On Tue, Mar 22, 2016 at 2:10 PM, Tom Herbert  wrote:
> On Tue, Mar 22, 2016 at 1:20 PM, Alexander Duyck
>  wrote:
>> On Tue, Mar 22, 2016 at 12:19 PM, Tom Herbert  wrote:
>>> In skb_segment the check of whether or not to perform the checksum on
>>> host was changed to not consider rather remote checksum offload is
>>> in use. In the case that can_checksum_protocol fails the checksum
>>> is computed regardless. __skb_udp_tunnel_segment was modified in a
>>> related patch to add NETIF_F_HW_CSUM into features when grabbing
>>> the enc_features and remote checksum offload is being done. The
>>> problem is that this bit can be cleared in lower GSO layers that
>>> are also doing tunneling (e.g. ipip, GRE when used with GUE),
>>> so when we get to skb_segment that intent has been lost and
>>> can_checksum_protocol fails.
>>
>> So what you are describing sounds like a tunnel in tunnel scenario.
>> It might work better to just skip masking the features if
>> skb->remcsum_offload is set rather than trying to change how we
>> perform the offload.
>>
> To be clear, my patch is restoring the old behavior not implementing a new 
> one.

Yes, but the old behavior could lead to kernel panics under certain
circumstances.

>> I'm pretty sure this will cause data corruption and maybe a kernel
>> panic if Tx checksum offload is disabled.
>>
> Nope, working fine for me.

What are the options you used to create the tunnel?  Did you enable
both remcsum and udpcsum?

> The outer checksum would be computed in
>>> This patch:
>>>
>>> - Restores the check in skb_segment to look at remote checksum offload.
>>> - Removes the code in __skb_udp_tunnel_segment to force the
>>>   NETIF_F_HW_CSUM feature since this is no longer useful with above
>>>   change.
>>> - Removes check for remote checksum offload in gso_reset_checksum.
>>>   A special case should not be needed here.
>>>
>>> Tested: Single netperf STREAM over GUE-ipip
>>>
>>> Before fix:
>>>5625 Mbps
>>> After fix:
>>>6410 Mbps
>>>
>>> Fixes: 76443456227097179c1482 ("net: Move GSO csum into SKB_GSO_CB")
>>> Signed-off-by: Tom Herbert 
>>> ---
>>>  include/linux/skbuff.h |  4 
>>>  net/core/skbuff.c  |  5 ++---
>>>  net/ipv4/udp_offload.c | 10 --
>>>  3 files changed, 2 insertions(+), 17 deletions(-)
>>>
>>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>>> index 15d0df9..f6fe8a5 100644
>>> --- a/include/linux/skbuff.h
>>> +++ b/include/linux/skbuff.h
>>> @@ -3615,10 +3615,6 @@ static inline int gso_pskb_expand_head(struct 
>>> sk_buff *skb, int extra)
>>>
>>>  static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
>>>  {
>>> -   /* Do not update partial checksums if remote checksum is enabled. */
>>> -   if (skb->remcsum_offload)
>>> -   return;
>>> -
>>> SKB_GSO_CB(skb)->csum = res;
>>> SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
>>>  }
>>
>> I'm pretty sure this part here will break things when you don't have
>> an outer offload enabled.  What NIC did you test this on?  Did you try
>> disabling the Tx checksum support in the hardware to see what would
>> happen?
>>
> Mellanox.
>
> When TX checksum is disabled the outer checksum is computed in
> __skb_udp_tunnel_segment.

Yes, but that uses gso_make_checksum which requires the csum and
csum_start fields be initialized in the SKB_GSO_CB().

>>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>>> index f044f97..e4eb78d 100644
>>> --- a/net/core/skbuff.c
>>> +++ b/net/core/skbuff.c
>>> @@ -3259,14 +3259,13 @@ skip_fraglist:
>>> nskb->truesize += nskb->data_len;
>>>
>>>  perform_csum_check:
>>> -   if (!csum) {
>>> +   if (!csum && !nskb->remcsum_offload) {
>>> if (skb_has_shared_frag(nskb)) {
>>> err = __skb_linearize(nskb);
>>> if (err)
>>> goto err;
>>> }
>>> -   if (!nskb->remcsum_offload)
>>> -   nskb->ip_summed = CHECKSUM_NONE;
>>> +   nskb->ip_summed = CHECKSUM_NONE;
>>> SKB_GSO_CB(nskb)->csum =
>>> skb_checksum(nskb, doffset,
>>>  nskb->len - doffset, 0);
>>
>> I'm pretty sure this is going to cause a huge mess if you are
>> requesting remote checksum but cannot perform an outer checksum.  One
>> of the reasons I merged these features together the way I did was
>> because we needed to perform the initial checksum to avoid causing a
>> kernel panic later on.  Otherwise we don't have the SKB_GSO_CB()->csum
>> and SKB_GSO_CB()->csum_start fields populated.
>>
> Nope, if we can't perform outer checksum it is done in the host. I'm
> not seeing any problem.

Are you running remcsum with udpcsum set or is udpcsum not set?  If
you don't set udpcsum it doesn't actual

[PATCH] at803x: fix reset handling

2016-03-22 Thread Sergei Shtylyov
The driver of course "knows" that the chip's reset signal is active low,
so  it drives the GPIO to 0  to reset the PHY and to 1 otherwise; however
all this will only work iff the GPIO  is  specified as active-high in the
device tree!  I think both the driver and the device trees (if there are
any -- I was unable to find them) need to be fixed in this case...

Fixes: 13a56b449325 ("net: phy: at803x: Add support for hardware reset")
Signed-off-by: Sergei Shtylyov 

---
The patch is against DaveM's 'net.git' repo.

 drivers/net/phy/at803x.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

Index: net/drivers/net/phy/at803x.c
===
--- net.orig/drivers/net/phy/at803x.c
+++ net/drivers/net/phy/at803x.c
@@ -277,7 +277,7 @@ static int at803x_probe(struct phy_devic
if (!priv)
return -ENOMEM;
 
-   gpiod_reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
+   gpiod_reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_LOW);
if (IS_ERR(gpiod_reset))
return PTR_ERR(gpiod_reset);
 
@@ -362,10 +362,10 @@ static void at803x_link_change_notify(st
 
at803x_context_save(phydev, &context);
 
-   gpiod_set_value(priv->gpiod_reset, 0);
-   msleep(1);
gpiod_set_value(priv->gpiod_reset, 1);
msleep(1);
+   gpiod_set_value(priv->gpiod_reset, 0);
+   msleep(1);
 
at803x_context_restore(phydev, &context);
 



Re: [RFC PATCH 7/9] GSO: Support partial segmentation offload

2016-03-22 Thread Alexander Duyck
On Tue, Mar 22, 2016 at 12:40 PM, Edward Cree  wrote:
> On 22/03/16 17:47, Alexander Duyck wrote:
>> On Tue, Mar 22, 2016 at 10:00 AM, Edward Cree  wrote:
>>> On 18/03/16 23:25, Alexander Duyck wrote:
>>>> This patch adds support for something I am referring to as GSO partial.
>>>> The basic idea is that we can support a broader range of devices for
>>>> segmentation if we use fixed outer headers and have the hardware only
>>>> really deal with segmenting the inner header.  The idea behind the naming
>>>> is due to the fact that everything before csum_start will be fixed headers,
>>>> and everything after will be the region that is handled by hardware.
>>>>
>>>> With the current implementation it allows us to add support for the
>>>> following GSO types with an inner TSO or TSO6 offload:
>>>> NETIF_F_GSO_GRE
>>>> NETIF_F_GSO_GRE_CSUM
>>>> NETIF_F_UDP_TUNNEL
>>>> NETIF_F_UDP_TUNNEL_CSUM
>>>>
>>>> Signed-off-by: Alexander Duyck 
>>>> ---
>>> If I'm correctly understanding what you're doing, you're building a large
>>> TCP segment, feeding it through the encapsulation drivers as normal, then
>>> at GSO time you're fixing up length fields, checksums etc. in the headers.
>>> I think we can do this more simply, by making it so that at the time when
>>> we _generate_ the TCP segment, we give it headers saying it's one MSS big,
>>> but have several MSS of data.  Similarly when adding the encap headers,
>>> they all need to get their lengths from what the layer below tells them,
>>> rather than the current length of data in the SKB.  Then at GSO time all
>>> the headers already have the right things in, and you don't need to call
>>> any per-protocol GSO callbacks for them.
>> One issue I have to deal with here is that we have no way of knowing
>> what the underlying hardware can support at the time of segment being
>> created.  You have to keep in mind that what we have access to is the
>> tunnel dev in many cases, not the underlying dev so we don't know if
>> things can be offloaded to hardware or not.  By pushing this logic
>> into the GSO code we can actually implement it without much overhead
>> since we either segment it into an MSS multiple, or into single MSS
>> sized chunks.  This way we defer the decision until the very last
>> moment when we actually know if we can offload some portion of this in
>> hardware or not.
> But won't the tunnel dev have the feature flag for GSO_PARTIAL depending
> on what the underlying dev advertises?  (Or, at least, could we make it
> be that way?)

This stuff doesn't work.  That is why tunnels now advertise all
available features that can be offloaded via software.  Basically if
we can advertise a feature we do, and then we sort things out in
software if we cannot actually do it in hardware.

> Alternatively, have per-protocol GSO callbacks to do the fixup in the
> opposite direction to what you have now - in the long term we hope that
> hardware supporting GSO partial will become the common case, so that
> should be the fast path without bouncing backwards through GSO callbacks.
> Then, if you find out at GSO time that the hardware wants to do old-style
> TSO, you call those callbacks so as to give it a superframe with the long
> lengths filled in everywhere.  (Even that might not be necessary; it's a
> question of whether hardware makes assumptions about what those fields
> contain when folding its packet edits into checksums.  Since this is
> going to be driver-specific and drivers doing these things will have a
> fixed list of what encaps they can parse and therefore do this for, maybe
> all these fixups could be done by the driver - using common helper
> functions, of course - in its TSO path.)

I thought about doing that but decided it was much simpler to simply
update all headers.  For now I want to keep this as simple as possible
while we get the first few drivers on board.  If we later want to
optimize and add complexity we can go that route, but for now this
change is more than fast enough as it allows me to push an i40e at
20Gb/s while sending frames with outer checksums enabled.

>>> Any protocol that noticed it was putting something non-copyable in its
>>> headers (e.g. GRE with the Counter field, or an outer IP layer without DF
>>> set needing real IPIDs) would set a flag in the SKB to indicate that we
>>> really do need to call through the per-protocol GSO stuff.  (Ideally, if
>>> we had a separate skb->gso_start field rather than piggybacking on
>>> csum_start, we could reset it to point just before us, so that any further
>>> headers outside us still can be copied rather than taking callbacks.  But
>>> I'm not sure whether that's worth using up sk_buff real estate for.)
>> The idea behind piggybacking on csum_start was due to the fact that we
>> cannot perform GSO/TSO unless CHECKSUM_PARTIAL is set.  As far as I
>> know in the case of TCP offloads this always ends up being the
>> inner-most L4 header so it works out in that it actually reduces code
>> path as

Re: [PATCH] macb: fix PHY reset

2016-03-22 Thread Sergei Shtylyov

On 03/22/2016 11:07 PM, David Miller wrote:


On 03/22/2016 10:27 PM, Sergei Shtylyov wrote:


The driver calls gpiod_set_value() with GPIOD_OUT_* instead of 0 and
1, as
a result the PHY isn't really put back into reset state in
macb_remove().
Moreover, the driver assumes that something else has set the GPIO
direction
to output, so if it has not, the PHY wouldn't be taken out of reset in


s/wouldn't/may not/, sorry. Do I need to resend?


No need, I fixed it up by hand.

Applied, thanks.


   Oops, forgot another tag:

Fixes: 270c499f0993 ("net/macb: Update device tree binding for resetting PHY 
using GPIO")


  Too late probably... :-(

MBR, Sergei



Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread Cong Wang
On Tue, Mar 22, 2016 at 2:02 PM, David Ahern  wrote:
> On 3/22/16 2:45 PM, Cong Wang wrote:
>>
>> @@ -1426,7 +1426,7 @@ int fib_table_lookup(struct fib_table *tb, const
>> struct flowi4 *flp,
>>  nh->nh_flags & RTNH_F_LINKDOWN &&
>>  !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
>>  continue;
>> -   if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
>> +   if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF))
>> {
>>  if (flp->flowi4_oif &&
>>  flp->flowi4_oif != nh->nh_oif)
>>  continue;
>>
>> For me, it looks like the bug was introduced by commit
>> 35ebf65e851c6d9731abc6362b1.
>>
>
> Arguably yes since it added the function without initializing flags.
>
> The commit I referenced (and even the VRF predecessor both of which
> originated in the v4.3) is the one introducing use of flow flags to the
> lookup. From a stable perspective going back to v4.3 is what matters.
>

Yeah, then it is commit 613d09b30f8b589d5a9b497. It doesn't matter
for backport, but matters for accuracy. ;)


Re: [PATCH net-next] net: Fix remote checksum offload with GUE

2016-03-22 Thread Tom Herbert
On Tue, Mar 22, 2016 at 1:20 PM, Alexander Duyck
 wrote:
> On Tue, Mar 22, 2016 at 12:19 PM, Tom Herbert  wrote:
>> In skb_segment the check of whether or not to perform the checksum on
>> host was changed to not consider whether remote checksum offload is
>> in use. In the case that can_checksum_protocol fails the checksum
>> is computed regardless. __skb_udp_tunnel_segment was modified in a
>> related patch to add NETIF_F_HW_CSUM into features when grabbing
>> the enc_features and remote checksum offload is being done. The
>> problem is that this bit can be cleared in lower GSO layers that
>> are also doing tunneling (e.g. ipip, GRE when used with GUE),
>> so when we get to skb_segment that intent has been lost and
>> can_checksum_protocol fails.
>
> So what you are describing sounds like a tunnel in tunnel scenario.
> It might work better to just skip masking the features if
> skb->remcsum_offload is set rather than trying to change how we
> perform the offload.
>
To be clear, my patch is restoring the old behavior not implementing a new one.

> I'm pretty sure this will cause data corruption and maybe a kernel
> panic if Tx checksum offload is disabled.
>
Nope, working fine for me.

The outer checksum would be computed in
>> This patch:
>>
>> - Restores the check in skb_segment to look at remote checksum offload.
>> - Removes the code in __skb_udp_tunnel_segment to force the
>>   NETIF_F_HW_CSUM feature since this is no longer useful with above
>>   change.
>> - Removes check for remote checksum offload in gso_reset_checksum.
>>   A special case should not be needed here.
>>
>> Tested: Single netperf STREAM over GUE-ipip
>>
>> Before fix:
>>5625 Mbps
>> After fix:
>>6410 Mbps
>>
>> Fixes: 76443456227097179c1482 ("net: Move GSO csum into SKB_GSO_CB")
>> Signed-off-by: Tom Herbert 
>> ---
>>  include/linux/skbuff.h |  4 
>>  net/core/skbuff.c  |  5 ++---
>>  net/ipv4/udp_offload.c | 10 --
>>  3 files changed, 2 insertions(+), 17 deletions(-)
>>
>> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
>> index 15d0df9..f6fe8a5 100644
>> --- a/include/linux/skbuff.h
>> +++ b/include/linux/skbuff.h
>> @@ -3615,10 +3615,6 @@ static inline int gso_pskb_expand_head(struct sk_buff 
>> *skb, int extra)
>>
>>  static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
>>  {
>> -   /* Do not update partial checksums if remote checksum is enabled. */
>> -   if (skb->remcsum_offload)
>> -   return;
>> -
>> SKB_GSO_CB(skb)->csum = res;
>> SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
>>  }
>
> I'm pretty sure this part here will break things when you don't have
> an outer offload enabled.  What NIC did you test this on?  Did you try
> disabling the Tx checksum support in the hardware to see what would
> happen?
>
Mellanox.

When TX checksum is disabled the outer checksum is computed in
__skb_udp_tunnel_segment.

>> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
>> index f044f97..e4eb78d 100644
>> --- a/net/core/skbuff.c
>> +++ b/net/core/skbuff.c
>> @@ -3259,14 +3259,13 @@ skip_fraglist:
>> nskb->truesize += nskb->data_len;
>>
>>  perform_csum_check:
>> -   if (!csum) {
>> +   if (!csum && !nskb->remcsum_offload) {
>> if (skb_has_shared_frag(nskb)) {
>> err = __skb_linearize(nskb);
>> if (err)
>> goto err;
>> }
>> -   if (!nskb->remcsum_offload)
>> -   nskb->ip_summed = CHECKSUM_NONE;
>> +   nskb->ip_summed = CHECKSUM_NONE;
>> SKB_GSO_CB(nskb)->csum =
>> skb_checksum(nskb, doffset,
>>  nskb->len - doffset, 0);
>
> I'm pretty sure this is going to cause a huge mess if you are
> requesting remote checksum but cannot perform an outer checksum.  One
> of the reasons I merged these features together the way I did was
> because we needed to perform the initial checksum to avoid causing a
> kernel panic later on.  Otherwise we don't have the SKB_GSO_CB()->csum
> and SKB_GSO_CB()->csum_start fields populated.
>
Nope, if we can't perform outer checksum it is done in the host. I'm
not seeing any problem.

>> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
>> index 8a3405a..f86f1e1 100644
>> --- a/net/ipv4/udp_offload.c
>> +++ b/net/ipv4/udp_offload.c
>> @@ -78,16 +78,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
>> sk_buff *skb,
>>
>> features &= skb->dev->hw_enc_features;
>>
>> -   /* The only checksum offload we care about from here on out is the
>> -* outer one so strip the existing checksum feature flags and
>> -* instead set the flag based on our outer checksum offload value.
>> -*/
>> -   if (remcsum

Re: [PATCH] macb: fix PHY reset

2016-03-22 Thread Sergei Shtylyov

On 03/22/2016 11:07 PM, David Miller wrote:


The driver calls gpiod_set_value() with GPIOD_OUT_* instead of 0 and
1, as
a result the PHY isn't really put back into reset state in
macb_remove().
Moreover, the driver assumes that something else has set the GPIO
direction
to output, so if it has not, the PHY wouldn't be taken out of reset in


s/wouldn't/may not/, sorry. Do I need to resend?


No need, I fixed it up by hand.

Applied, thanks.


   Thank you! gpio_request() or something of that sort wouldn't hurt either (the pin
won't be switched to the GPIO mode on the pin function controller that I have
here)...
   Anyway, this code is doomed if my phylib reset GPIO patch (to be posted 
yet) is accepted ...


MBR, Sergei



Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread David Ahern

On 3/22/16 2:45 PM, Cong Wang wrote:

@@ -1426,7 +1426,7 @@ int fib_table_lookup(struct fib_table *tb, const
struct flowi4 *flp,
 nh->nh_flags & RTNH_F_LINKDOWN &&
 !(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
 continue;
-   if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
+   if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
 if (flp->flowi4_oif &&
 flp->flowi4_oif != nh->nh_oif)
 continue;

For me, it looks like the bug was introduced by commit 35ebf65e851c6d9731abc6362b1.



Arguably yes since it added the function without initializing flags.

The commit I referenced (and even the VRF predecessor both of which 
originated in the v4.3) is the one introducing use of flow flags to the 
lookup. From a stable perspective going back to v4.3 is what matters.




Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread Cong Wang
On Tue, Mar 22, 2016 at 11:29 AM, David Ahern  wrote:
> On 3/22/16 9:31 AM, Lance Richardson wrote:
>>
>> Field fl4.flowi4_flags is not initialized in fib_compute_spec_dst()
>> before calling fib_lookup(), which means fib_table_lookup() is
>> using non-deterministic data at this line:
>>
>> if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
>>
>> Fix by initializing fl4.flowi4_flags to zero.
>>
>> Signed-off-by: Lance Richardson 
>> ---
>>   net/ipv4/fib_frontend.c | 1 +
>>   1 file changed, 1 insertion(+)
>>
>> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
>> index 21add55..896844a 100644
>> --- a/net/ipv4/fib_frontend.c
>> +++ b/net/ipv4/fib_frontend.c
>> @@ -304,6 +304,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
>> fl4.flowi4_scope = scope;
>> fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark :
>> 0;
>> fl4.flowi4_tun_key.tun_id = 0;
>> +   fl4.flowi4_flags = 0;
>> if (!fib_lookup(net, &fl4, &res, 0))
>> return FIB_RES_PREFSRC(net, res);
>> } else {
>>
>
> Fixes: 58189ca7b2741 ("net: Fix vti use case with oif in dst lookups")

Why does this patch fix this commit? Even before this commit, flowi4_flags
was already tested for other bit:


@@ -1426,7 +1426,7 @@ int fib_table_lookup(struct fib_table *tb, const
struct flowi4 *flp,
nh->nh_flags & RTNH_F_LINKDOWN &&
!(fib_flags & FIB_LOOKUP_IGNORE_LINKSTATE))
continue;
-   if (!(flp->flowi4_flags & FLOWI_FLAG_VRFSRC)) {
+   if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
if (flp->flowi4_oif &&
flp->flowi4_oif != nh->nh_oif)
continue;

For me, it looks like the bug was introduced by commit 35ebf65e851c6d9731abc6362b1.


Re: [PATCH net-next] net: Fix remote checksum offload with GUE

2016-03-22 Thread Alexander Duyck
On Tue, Mar 22, 2016 at 12:19 PM, Tom Herbert  wrote:
> In skb_segment the check of whether or not to perform the checksum on
> host was changed to not consider whether remote checksum offload is
> in use. In the case that can_checksum_protocol fails the checksum
> is computed regardless. __skb_udp_tunnel_segment was modified in a
> related patch to add NETIF_F_HW_CSUM into features when grabbing
> the enc_features and remote checksum offload is being done. The
> problem is that this bit can be cleared in lower GSO layers that
> are also doing tunneling (e.g. ipip, GRE when used with GUE),
> so when we get to skb_segment that intent has been lost and
> can_checksum_protocol fails.

So what you are describing sounds like a tunnel in tunnel scenario.
It might work better to just skip masking the features if
skb->remcsum_offload is set rather than trying to change how we
perform the offload.

I'm pretty sure this will cause data corruption and maybe a kernel
panic if Tx checksum offload is disabled.

> This patch:
>
> - Restores the check in skb_segment to look at remote checksum offload.
> - Removes the code in __skb_udp_tunnel_segment to force the
>   NETIF_F_HW_CSUM feature since this is no longer useful with above
>   change.
> - Removes check for remote checksum offload in gso_reset_checksum.
>   A special case should not be needed here.
>
> Tested: Single netperf STREAM over GUE-ipip
>
> Before fix:
>5625 Mbps
> After fix:
>6410 Mbps
>
> Fixes: 76443456227097179c1482 ("net: Move GSO csum into SKB_GSO_CB")
> Signed-off-by: Tom Herbert 
> ---
>  include/linux/skbuff.h |  4 
>  net/core/skbuff.c  |  5 ++---
>  net/ipv4/udp_offload.c | 10 --
>  3 files changed, 2 insertions(+), 17 deletions(-)
>
> diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
> index 15d0df9..f6fe8a5 100644
> --- a/include/linux/skbuff.h
> +++ b/include/linux/skbuff.h
> @@ -3615,10 +3615,6 @@ static inline int gso_pskb_expand_head(struct sk_buff 
> *skb, int extra)
>
>  static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
>  {
> -   /* Do not update partial checksums if remote checksum is enabled. */
> -   if (skb->remcsum_offload)
> -   return;
> -
> SKB_GSO_CB(skb)->csum = res;
> SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
>  }

I'm pretty sure this part here will break things when you don't have
an outer offload enabled.  What NIC did you test this on?  Did you try
disabling the Tx checksum support in the hardware to see what would
happen?

> diff --git a/net/core/skbuff.c b/net/core/skbuff.c
> index f044f97..e4eb78d 100644
> --- a/net/core/skbuff.c
> +++ b/net/core/skbuff.c
> @@ -3259,14 +3259,13 @@ skip_fraglist:
> nskb->truesize += nskb->data_len;
>
>  perform_csum_check:
> -   if (!csum) {
> +   if (!csum && !nskb->remcsum_offload) {
> if (skb_has_shared_frag(nskb)) {
> err = __skb_linearize(nskb);
> if (err)
> goto err;
> }
> -   if (!nskb->remcsum_offload)
> -   nskb->ip_summed = CHECKSUM_NONE;
> +   nskb->ip_summed = CHECKSUM_NONE;
> SKB_GSO_CB(nskb)->csum =
> skb_checksum(nskb, doffset,
>  nskb->len - doffset, 0);

I'm pretty sure this is going to cause a huge mess if you are
requesting remote checksum but cannot perform an outer checksum.  One
of the reasons I merged these features together the way I did was
because we needed to perform the initial checksum to avoid causing a
kernel panic later on.  Otherwise we don't have the SKB_GSO_CB()->csum
and SKB_GSO_CB()->csum_start fields populated.

> diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
> index 8a3405a..f86f1e1 100644
> --- a/net/ipv4/udp_offload.c
> +++ b/net/ipv4/udp_offload.c
> @@ -78,16 +78,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
> sk_buff *skb,
>
> features &= skb->dev->hw_enc_features;
>
> -   /* The only checksum offload we care about from here on out is the
> -* outer one so strip the existing checksum feature flags and
> -* instead set the flag based on our outer checksum offload value.
> -*/
> -   if (remcsum || ufo) {
> -   features &= ~NETIF_F_CSUM_MASK;
> -   if (!need_csum || offload_csum)
> -   features |= NETIF_F_HW_CSUM;
> -   }
> -
> /* segment inner packet. */
> segs = gso_inner_segment(skb, features);
> if (IS_ERR_OR_NULL(segs)) {

This part breaks UDP fragmentation I am pretty sure.  We need this bit
for UFO to avoid having to perform a checksum over the data twice if
we are offloading the outer UDP checksum.


Re: [PATCH v3 0/2] AF_VSOCK: Shrink the area influenced by prepare_to_wait

2016-03-22 Thread David Miller
From: Claudio Imbrenda 
Date: Tue, 22 Mar 2016 17:05:50 +0100

> This patchset applies on net-next.
> 
> I think I found a problem with the patch submitted by Laura Abbott
> ( https://lkml.org/lkml/2016/2/4/711 ): we might miss wakeups.
> Since the condition is not checked between the prepare_to_wait and the
> schedule(), if a wakeup happens after the condition is checked but before
> the sleep happens, we miss it. ( A description of the problem can be
> found here: http://www.makelinux.net/ldd3/chp-6-sect-2 ).
> 
> The first patch reverts the previous broken patch, while the second patch
> properly fixes the sleep-while-waiting issue.

Series applied, thanks for following up on this.


Re: [RFC PATCH 7/9] GSO: Support partial segmentation offload

2016-03-22 Thread David Miller
From: Jesse Gross 
Date: Tue, 22 Mar 2016 13:11:21 -0700

> Features that have been designed this way in the past are usually
> pretty fragile. Not only would you have to track changes in the
> routing table but you could have bridges, tc, vlan devices, etc. all
> of which might change the path of the packet and would have to somehow
> propagate this information. It's much more robust to resolve the
> device capabilities just before you hand the packet to that device.
> Plus, anything along the path of the packet (iptables, for example)
> that looks at the headers might potentially need to be aware of this
> optimization.

Indeed, this is a major fundamental issue in our stack right now.  I
keep being reminded of that ugly change we had to make to accommodate
scatter-gather limitations for Infiniband devices, exactly because
properties don't propagate properly through all of the layers right
now.

But we have to solve this somehow, the packetizer has to know certain
basic properties of the ultimate device in order to function properly.

This requirement is unavoidable.


Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread Eric Dumazet
On Tue, 2016-03-22 at 13:00 -0600, David Ahern wrote:
> On 3/22/16 12:56 PM, Lance Richardson wrote:

> >
> > Fixes: 58189ca7b2741 ("net: Fix vti use case with oif in dst lookups")

> DaveM: this should go into stable releases back to v4.3.
> 

The 'Fixes' tag tells this already ;)

$ git describe --contains 58189ca7b2741
v4.3-rc3~13^2~63




Re: [PATCH] ipv6: Fix the pmtu path for connected UDP socket

2016-03-22 Thread Cong Wang
On Tue, Mar 22, 2016 at 11:03 AM, Wei Wang  wrote:
> Thanks Martin and Cong.
>
> I guess then we are going with the following fix in ip6_sk_update_pmtu():
> 1. call ip6_update_pmtu() as it is
> 2. do a dst_check()
> 3. re-lookup() if it is invalid
> 4. and then do a ip6_dst_store()/dst_set

Exactly, please try the attached patch. Note I did nothing more than a
compile test.

Does it make sense to you now?
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ed44663..fcea05e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1346,18 +1346,20 @@ static bool rt6_cache_allowed_for_pmtu(const struct 
rt6_info *rt)
(rt->rt6i_flags & RTF_PCPU || rt->rt6i_node);
 }
 
-static void __ip6_rt_update_pmtu(struct dst_entry *dst, const struct sock *sk,
-const struct ipv6hdr *iph, u32 mtu)
+static struct dst_entry* __ip6_rt_update_pmtu(struct dst_entry *dst,
+ const struct sock *sk,
+ const struct ipv6hdr *iph,
+ u32 mtu, bool hold)
 {
struct rt6_info *rt6 = (struct rt6_info *)dst;
 
if (rt6->rt6i_flags & RTF_LOCAL)
-   return;
+   return dst;
 
dst_confirm(dst);
mtu = max_t(u32, mtu, IPV6_MIN_MTU);
if (mtu >= dst_mtu(dst))
-   return;
+   return dst;
 
if (!rt6_cache_allowed_for_pmtu(rt6)) {
rt6_do_update_pmtu(rt6, mtu);
@@ -1372,11 +1374,13 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, 
const struct sock *sk,
daddr = &sk->sk_v6_daddr;
saddr = &inet6_sk(sk)->saddr;
} else {
-   return;
+   return dst;
}
nrt6 = ip6_rt_cache_alloc(rt6, daddr, saddr);
if (nrt6) {
rt6_do_update_pmtu(nrt6, mtu);
+   if (hold)
+   dst_hold(&nrt6->dst);
 
/* ip6_ins_rt(nrt6) will bump the
 * rt6->rt6i_node->fn_sernum
@@ -1384,14 +1388,17 @@ static void __ip6_rt_update_pmtu(struct dst_entry *dst, 
const struct sock *sk,
 * invalidate the sk->sk_dst_cache.
 */
ip6_ins_rt(nrt6);
+   return &nrt6->dst;
}
}
+
+   return dst;
 }
 
 static void ip6_rt_update_pmtu(struct dst_entry *dst, struct sock *sk,
   struct sk_buff *skb, u32 mtu)
 {
-   __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu);
+   __ip6_rt_update_pmtu(dst, sk, skb ? ipv6_hdr(skb) : NULL, mtu, false);
 }
 
 void ip6_update_pmtu(struct sk_buff *skb, struct net *net, __be32 mtu,
@@ -1410,15 +1417,52 @@ void ip6_update_pmtu(struct sk_buff *skb, struct net 
*net, __be32 mtu,
 
dst = ip6_route_output(net, NULL, &fl6);
if (!dst->error)
-   __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu));
+   __ip6_rt_update_pmtu(dst, NULL, iph, ntohl(mtu), false);
dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_update_pmtu);
 
 void ip6_sk_update_pmtu(struct sk_buff *skb, struct sock *sk, __be32 mtu)
 {
-   ip6_update_pmtu(skb, sock_net(sk), mtu,
-   sk->sk_bound_dev_if, sk->sk_mark);
+   const struct ipv6hdr *iph = (struct ipv6hdr *) skb->data;
+   struct net *net = sock_net(sk);
+   struct dst_entry *ndst, *dst;
+   struct flowi6 fl6;
+
+   memset(&fl6, 0, sizeof(fl6));
+
+   bh_lock_sock(sk);
+
+   fl6.flowi6_oif = sk->sk_bound_dev_if;
+   fl6.flowi6_mark = sk->sk_mark ? : IP6_REPLY_MARK(net, skb->mark);
+   fl6.daddr = iph->daddr;
+   fl6.saddr = iph->saddr;
+   fl6.flowlabel = ip6_flowinfo(iph);
+
+   dst = sk_dst_get(sk);
+   if (sock_owned_by_user(sk) || !dst) {
+   ip6_update_pmtu(skb, net, mtu, fl6.flowi6_oif, fl6.flowi6_mark);
+   goto out;
+   }
+
+   if (dst->obsolete && !dst->ops->check(dst, 0)) {
+   dst_release(dst);
+   dst = ip6_route_output(net, sk, &fl6);
+   if (dst->error)
+   goto out;
+   }
+
+   ndst = __ip6_rt_update_pmtu(dst, sk, iph, ntohl(mtu), true);
+   if (ndst != dst) {
+   dst_release(dst);
+   dst = ndst;
+   }
+
+   if (sk->sk_state == TCP_ESTABLISHED)
+   ip6_dst_store(sk, dst, &iph->daddr, &iph->saddr);
+out:
+   bh_unlock_sock(sk);
+   dst_release(dst);
 }
 EXPORT_SYMBOL_GPL(ip6_sk_update_pmtu);
 


Re: [RFC PATCH 7/9] GSO: Support partial segmentation offload

2016-03-22 Thread Jesse Gross
On Tue, Mar 22, 2016 at 12:40 PM, Edward Cree  wrote:
> On 22/03/16 17:47, Alexander Duyck wrote:
>> On Tue, Mar 22, 2016 at 10:00 AM, Edward Cree  wrote:
>>> On 18/03/16 23:25, Alexander Duyck wrote:
>>>> This patch adds support for something I am referring to as GSO partial.
>>>> The basic idea is that we can support a broader range of devices for
>>>> segmentation if we use fixed outer headers and have the hardware only
>>>> really deal with segmenting the inner header.  The idea behind the naming
>>>> is due to the fact that everything before csum_start will be fixed headers,
>>>> and everything after will be the region that is handled by hardware.
>>>>
>>>> With the current implementation it allows us to add support for the
>>>> following GSO types with an inner TSO or TSO6 offload:
>>>> NETIF_F_GSO_GRE
>>>> NETIF_F_GSO_GRE_CSUM
>>>> NETIF_F_UDP_TUNNEL
>>>> NETIF_F_UDP_TUNNEL_CSUM
>>>>
>>>> Signed-off-by: Alexander Duyck 
>>>> ---
>>> If I'm correctly understanding what you're doing, you're building a large
>>> TCP segment, feeding it through the encapsulation drivers as normal, then
>>> at GSO time you're fixing up length fields, checksums etc. in the headers.
>>> I think we can do this more simply, by making it so that at the time when
>>> we _generate_ the TCP segment, we give it headers saying it's one MSS big,
>>> but have several MSS of data.  Similarly when adding the encap headers,
>>> they all need to get their lengths from what the layer below tells them,
>>> rather than the current length of data in the SKB.  Then at GSO time all
>>> the headers already have the right things in, and you don't need to call
>>> any per-protocol GSO callbacks for them.
>> One issue I have to deal with here is that we have no way of knowing
>> what the underlying hardware can support at the time of segment being
>> created.  You have to keep in mind that what we have access to is the
>> tunnel dev in many cases, not the underlying dev so we don't know if
>> things can be offloaded to hardware or not.  By pushing this logic
>> into the GSO code we can actually implement it without much overhead
>> since we either segment it into an MSS multiple, or into single MSS
>> sized chunks.  This way we defer the decision until the very last
>> moment when we actually know if we can offload some portion of this in
>> hardware or not.
> But won't the tunnel dev have the feature flag for GSO_PARTIAL depending
> on what the underlying dev advertises?  (Or, at least, could we make it
> be that way?)

Features that have been designed this way in the past are usually
pretty fragile. Not only would you have to track changes in the
routing table but you could have bridges, tc, vlan devices, etc. all
of which might change the path of the packet and would have to somehow
propagate this information. It's much more robust to resolve the
device capabilities just before you hand the packet to that device.
Plus, anything along the path of the packet (iptables, for example)
that looks at the headers might potentially need to be aware of this
optimization.

You're also assuming that the generating TCP stack is resident on the
same machine as the device that does the offloads. That's not
necessarily true in the case of VMs or remote senders whose packets
have been GRO'ed.

Keeping the core stack consistent and just handling this at the
GRO/driver layer as Alex has here seems preferable to me.


Re: [PATCH] macb: fix PHY reset

2016-03-22 Thread David Miller
From: Sergei Shtylyov 
Date: Tue, 22 Mar 2016 22:34:05 +0300

> On 03/22/2016 10:27 PM, Sergei Shtylyov wrote:
> 
>> The driver calls gpiod_set_value() with GPIOD_OUT_* instead of 0 and
>> 1, as
>> a result the PHY isn't really put back into reset state in
>> macb_remove().
>> Moreover, the driver assumes that something else has set the GPIO
>> direction
>> to output, so if it has not, the PHY wouldn't be taken out of reset in
> 
>s/wouldn't/may not/, sorry. Do I need to resend?

No need, I fixed it up by hand.

Applied, thanks.


Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread David Miller
From: Lance Richardson 
Date: Tue, 22 Mar 2016 14:58:59 -0400 (EDT)

> Apologies, that should have been [PATCH v2 net].

No worries.

Applied and queued up for -stable, thanks.


Re: [PATCH] fsl/fman: Workaround for Errata A-007273

2016-03-22 Thread David Miller
From: 
Date: Mon, 21 Mar 2016 23:08:11 +0200

> From: Igal Liberman 
> 
> Errata A-007273 (For FMan V3 devices only):
> FMan soft reset is not finished properly if one
> of the Ethernet MAC clocks is disabled
> 
> Workaround:
> Re-enable all disabled MAC clocks through the DCFG_CCSR_DEVDISR2
> register prior to issuing an FMAN soft reset.
> Re-disable the MAC clocks after the FMAN soft reset is done.
> 
> Signed-off-by: Igal Liberman 

Applied, thanks.


Re: [PATCH v8 net-next] ravb: Add dma queue interrupt support

2016-03-22 Thread David Miller
From: Yoshihiro Kaneko 
Date: Wed, 23 Mar 2016 00:22:00 +0900

> From: Kazuya Mizuguchi 
> 
> This patch supports the following interrupts.
> 
> - One interrupt for multiple (timestamp, error, gPTP)
> - One interrupt for emac
> - Four interrupts for dma queue (best effort rx/tx, network control rx/tx)
> 
> This patch improves the efficiency of the interrupt handler by adding the
> interrupt handler corresponding to each interrupt source described
> above. Additionally, it reduces the number of accesses to the
> EthernetAVB IF.
> Also, this patch prevents this driver from depending on the whim of a boot loader.
> 
> [ykaneko0...@gmail.com: define bit names of registers]
> [ykaneko0...@gmail.com: add comment for gen3 only registers]
> [ykaneko0...@gmail.com: fix coding style]
> [ykaneko0...@gmail.com: update changelog]
> [ykaneko0...@gmail.com: gen3: fix initialization of interrupts]
> [ykaneko0...@gmail.com: gen3: fix clearing interrupts]
> [ykaneko0...@gmail.com: gen3: add helper function for request_irq()]
> [ykaneko0...@gmail.com: gen3: remove IRQF_SHARED flag for request_irq()]
> [ykaneko0...@gmail.com: revert ravb_close() and ravb_ptp_stop()]
> [ykaneko0...@gmail.com: avoid calling free_irq() to non-hooked interrupts]
> [ykaneko0...@gmail.com: make NC/BE interrupt handler a function]
> [ykaneko0...@gmail.com: make timestamp interrupt handler a function]
> [ykaneko0...@gmail.com: timestamp interrupt is handled in multiple
>  interrupt handler instead of dma queue interrupt handler]
> Signed-off-by: Kazuya Mizuguchi 
> Signed-off-by: Yoshihiro Kaneko 

Sorry, it is not appropriate to submit new features and major optimizations
at this time.

Please wait until some reasonable time after the merge window closes to
resubmit this.

Thanks.


Re: [PATCH net v2] ipv4: fix broadcast packets reception

2016-03-22 Thread David Miller
From: Paolo Abeni 
Date: Tue, 22 Mar 2016 09:19:38 +0100

> Currently, ingress ipv4 broadcast datagrams are dropped since,
> in udp_v4_early_demux(), ip_check_mc_rcu() is invoked even on
> bcast packets.
> 
> This patch addresses the issue, invoking ip_check_mc_rcu()
> only for mcast packets.
> 
> Fixes: 6e5403093261 ("ipv4/udp: Verify multicast group is ours in 
> upd_v4_early_demux()")
> Signed-off-by: Paolo Abeni 
> 
> --
>  v1 -> v2 dropped the route related bits, the fib_validate_source()
>   failures are triggered by the in_device configuration

Applied and queued up for -stable, thanks.


Re: [PATCH V2 net 00/10] net: hns: bugs fixed for hns

2016-03-22 Thread David Miller
From: Yisen Zhuang 
Date: Tue, 22 Mar 2016 16:06:21 +0800

> This series includes some bug fixes and updates for hns driver.
> 
> From Daode, one fix about mss.
> 
> From Kejian, one fix about ping6 issue, one fix about mac address setting,
> two fixes for RSS setting, two fixes about mtu setting.
> 
> From qianqian, fixed HNS v2 xge statistic reg issue.
> 
> From Sheng, one fix about manage packets sending, one fix about GMACs mac
> setting.
> 
> For more details, please see individual patches.

Series applied, thanks.


Re: [PATCH 4/5] net: sxgbe: fix error paths in sxgbe_platform_probe()

2016-03-22 Thread Rasmus Villemoes
ping^2

On Tue, Mar 08 2016, Rasmus Villemoes  wrote:

> ping
>
> On Tue, Feb 09 2016, Rasmus Villemoes  wrote:
>
>> We need to use post-decrement to ensure that irq_dispose_mapping is
>> also called on priv->rxq[0]->irq_no; moreover, if one of the above for
>> loops failed already at i==0 (so we reach one of these labels with
>> that value of i), we'll enter an essentially infinite loop of
>> out-of-bounds accesses.
>>
>> Signed-off-by: Rasmus Villemoes 
>> ---
>>  drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c | 4 ++--
>>  1 file changed, 2 insertions(+), 2 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c 
>> b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
>> index b02eed12bfc5..73427e29df2a 100644
>> --- a/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
>> +++ b/drivers/net/ethernet/samsung/sxgbe/sxgbe_platform.c
>> @@ -155,11 +155,11 @@ static int sxgbe_platform_probe(struct platform_device 
>> *pdev)
>>  return 0;
>>  
>>  err_rx_irq_unmap:
>> -while (--i)
>> +while (i--)
>>  irq_dispose_mapping(priv->rxq[i]->irq_no);
>>  i = SXGBE_TX_QUEUES;
>>  err_tx_irq_unmap:
>> -while (--i)
>> +while (i--)
>>  irq_dispose_mapping(priv->txq[i]->irq_no);
>>  irq_dispose_mapping(priv->irq);
>>  err_drv_remove:


Re: [RFC PATCH 7/9] GSO: Support partial segmentation offload

2016-03-22 Thread Edward Cree
On 22/03/16 17:47, Alexander Duyck wrote:
> On Tue, Mar 22, 2016 at 10:00 AM, Edward Cree  wrote:
>> On 18/03/16 23:25, Alexander Duyck wrote:
>>> This patch adds support for something I am referring to as GSO partial.
>>> The basic idea is that we can support a broader range of devices for
>>> segmentation if we use fixed outer headers and have the hardware only
>>> really deal with segmenting the inner header.  The idea behind the naming
>>> is due to the fact that everything before csum_start will be fixed headers,
>>> and everything after will be the region that is handled by hardware.
>>>
>>> With the current implementation it allows us to add support for the
>>> following GSO types with an inner TSO or TSO6 offload:
>>> NETIF_F_GSO_GRE
>>> NETIF_F_GSO_GRE_CSUM
>>> NETIF_F_UDP_TUNNEL
>>> NETIF_F_UDP_TUNNEL_CSUM
>>>
>>> Signed-off-by: Alexander Duyck 
>>> ---
>> If I'm correctly understanding what you're doing, you're building a large
>> TCP segment, feeding it through the encapsulation drivers as normal, then
>> at GSO time you're fixing up length fields, checksums etc. in the headers.
>> I think we can do this more simply, by making it so that at the time when
>> we _generate_ the TCP segment, we give it headers saying it's one MSS big,
>> but have several MSS of data.  Similarly when adding the encap headers,
>> they all need to get their lengths from what the layer below tells them,
>> rather than the current length of data in the SKB.  Then at GSO time all
>> the headers already have the right things in, and you don't need to call
>> any per-protocol GSO callbacks for them.
> One issue I have to deal with here is that we have no way of knowing
> what the underlying hardware can support at the time of segment being
> created.  You have to keep in mind that what we have access to is the
> tunnel dev in many cases, not the underlying dev so we don't know if
> things can be offloaded to hardware or not.  By pushing this logic
> into the GSO code we can actually implement it without much overhead
> since we either segment it into an MSS multiple, or into single MSS
> sized chunks.  This way we defer the decision until the very last
> moment when we actually know if we can offload some portion of this in
> hardware or not.
But won't the tunnel dev have the feature flag for GSO_PARTIAL depending
on what the underlying dev advertises?  (Or, at least, could we make it
be that way?)
Alternatively, have per-protocol GSO callbacks to do the fixup in the
opposite direction to what you have now - in the long term we hope that
hardware supporting GSO partial will become the common case, so that
should be the fast path without bouncing backwards through GSO callbacks.
Then, if you find out at GSO time that the hardware wants to do old-style
TSO, you call those callbacks so as to give it a superframe with the long
lengths filled in everywhere.  (Even that might not be necessary; it's a
question of whether hardware makes assumptions about what those fields
contain when folding its packet edits into checksums.  Since this is
going to be driver-specific and drivers doing these things will have a
fixed list of what encaps they can parse and therefore do this for, maybe
all these fixups could be done by the driver - using common helper
functions, of course - in its TSO path.)
>> Any protocol that noticed it was putting something non-copyable in its
>> headers (e.g. GRE with the Counter field, or an outer IP layer without DF
>> set needing real IPIDs) would set a flag in the SKB to indicate that we
>> really do need to call through the per-protocol GSO stuff.  (Ideally, if
>> we had a separate skb->gso_start field rather than piggybacking on
>> csum_start, we could reset it to point just before us, so that any further
>> headers outside us still can be copied rather than taking callbacks.  But
>> I'm not sure whether that's worth using up sk_buff real estate for.)
> The idea behind piggybacking on csum_start was due to the fact that we
> cannot perform GSO/TSO unless CHECKSUM_PARTIAL is set.  As far as I
> know in the case of TCP offloads this always ends up being the
> inner-most L4 header so it works out in that it actually reduces code
> path as we were having to deal with all the skb->encapsulation checks.
> It was a relationship that already existed, I just decided to make use
> of it since it simplifies things pretty significantly.
Yes; it's a clever idea.  Only trouble is that we really want the inner IP
header rather than the inner TCP header, so that we can (if we want to)
increment the inner IP IDs.  Of course, if we Officially Don't Care about
inner IP IDs that's not a problem.
I wonder if we could just always fill in inner_network_header even if we're
not doing encapsulation.  Or does it end up pointing to a 'middle' header
in the case of nested encap?
> As far as retreating I don't really see how that would work. In most
> cases it is an all-or-nothing proposition to setup these outer
> headers.  Either we can s

Re: [PATCH] net: phy: at803x: don't depend on GPIOLIB

2016-03-22 Thread Uwe Kleine-König
Hello Sebastian,

On Tue, Mar 22, 2016 at 03:34:23PM +0100, Sebastian Frias wrote:
> I think we are in a deadlock :-)
> I'm going to reply inline below, but I will also send a different email
> to Daniel with a small recap.
> I think he should share the intent of the "reset" mechanism he
> introduced, in particular if it is mandatory.

The things I said in my mail are valid in general, not only for the
at803x phy.

Let me repeat them once more:

Preconditions:
 - Some of the devices a given driver handles have a reset line and
   others don't.
 - A non-empty subset (maybe all) of the devices that have a reset line
   require that this reset line is used.

Then the way to handle this in the driver should be done as follows:

  unless reset_handling_not_necessary():
    gpio = gpiod_get_optional("reset")
    if IS_ERR(gpio):
      return PTR_ERR(gpio)

Checking for -ENOSYS or GPIOLIB=n is not allowed because the device
you're currently handling might need the GPIO, so you must not continue
without the ability to control the line.

So the options you have (as you have a phy that doesn't need the reset
handling):

 - enable GPIOLIB (either in your .config or introduce a Kconfig
   dependency)
 - improve reset_handling_not_necessary() to return true for your case

There is nothing else.

Best regards
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH] macb: fix PHY reset

2016-03-22 Thread Sergei Shtylyov

On 03/22/2016 10:27 PM, Sergei Shtylyov wrote:


The driver calls gpiod_set_value() with GPIOD_OUT_* instead of 0 and 1, as
a result the PHY isn't really  put back into reset state in macb_remove().
Moreover, the driver assumes that something else has set the GPIO direction
to output, so if  it has not, the PHY wouldn't be taken out of reset in


   s/wouldn't/may not/, sorry. Do I need to resend?


macb_probe() either...

Signed-off-by: Sergei Shtylyov 


MBR, Sergei



[PATCH] macb: fix PHY reset

2016-03-22 Thread Sergei Shtylyov
The driver calls gpiod_set_value() with GPIOD_OUT_* instead of 0 and 1, as
a result the PHY isn't really  put back into reset state in macb_remove().
Moreover, the driver assumes that something else has set the GPIO direction
to output, so if  it has not, the PHY wouldn't be taken out of reset in
macb_probe() either...

Signed-off-by: Sergei Shtylyov 

---
The patch is against David Miller's 'net.git' repo.

 drivers/net/ethernet/cadence/macb.c |4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

Index: net/drivers/net/ethernet/cadence/macb.c
===
--- net.orig/drivers/net/ethernet/cadence/macb.c
+++ net/drivers/net/ethernet/cadence/macb.c
@@ -2959,7 +2959,7 @@ static int macb_probe(struct platform_de
int gpio = of_get_named_gpio(phy_node, "reset-gpios", 0);
if (gpio_is_valid(gpio))
bp->reset_gpio = gpio_to_desc(gpio);
-   gpiod_set_value(bp->reset_gpio, GPIOD_OUT_HIGH);
+   gpiod_direction_output(bp->reset_gpio, 1);
}
of_node_put(phy_node);
 
@@ -3029,7 +3029,7 @@ static int macb_remove(struct platform_d
mdiobus_free(bp->mii_bus);
 
/* Shutdown the PHY if there is a GPIO reset */
-   gpiod_set_value(bp->reset_gpio, GPIOD_OUT_LOW);
+   gpiod_set_value(bp->reset_gpio, 0);
 
unregister_netdev(dev);
clk_disable_unprepare(bp->tx_clk);



[PATCH net-next] net: Fix remote checksum offload with GUE

2016-03-22 Thread Tom Herbert
In skb_segment the check of whether or not to perform the checksum on
host was changed to not consider whether remote checksum offload is
in use. In the case that can_checksum_protocol fails the checksum
is computed regardless. __skb_udp_tunnel_segment was modified in a
related patch to add NETIF_F_HW_CSUM into features when grabbing
the enc_features and remote checksum offload is being done. The
problem is that this bit can be cleared in lower GSO layers that
are also doing tunneling (e.g. ipip, GRE when used with GUE),
so when we get to skb_segment that intent has been lost and
can_checksum_protocol fails.

This patch:

- Restores the check in skb_segment to look at remote checksum offload.
- Removes the code in __skb_udp_tunnel_segment to force the
  NETIF_F_HW_CSUM feature since this is no longer useful with above
  change.
- Removes check for remote checksum offload in gso_reset_checksum.
  A special case should not be needed here.

Tested: Single netperf STREAM over GUE-ipip

Before fix:
   5625 Mbps
After fix:
   6410 Mbps

Fixes: 76443456227097179c1482 ("net: Move GSO csum into SKB_GSO_CB")
Signed-off-by: Tom Herbert 
---
 include/linux/skbuff.h |  4 
 net/core/skbuff.c  |  5 ++---
 net/ipv4/udp_offload.c | 10 --
 3 files changed, 2 insertions(+), 17 deletions(-)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 15d0df9..f6fe8a5 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3615,10 +3615,6 @@ static inline int gso_pskb_expand_head(struct sk_buff 
*skb, int extra)
 
 static inline void gso_reset_checksum(struct sk_buff *skb, __wsum res)
 {
-   /* Do not update partial checksums if remote checksum is enabled. */
-   if (skb->remcsum_offload)
-   return;
-
SKB_GSO_CB(skb)->csum = res;
SKB_GSO_CB(skb)->csum_start = skb_checksum_start(skb) - skb->head;
 }
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index f044f97..e4eb78d 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3259,14 +3259,13 @@ skip_fraglist:
nskb->truesize += nskb->data_len;
 
 perform_csum_check:
-   if (!csum) {
+   if (!csum && !nskb->remcsum_offload) {
if (skb_has_shared_frag(nskb)) {
err = __skb_linearize(nskb);
if (err)
goto err;
}
-   if (!nskb->remcsum_offload)
-   nskb->ip_summed = CHECKSUM_NONE;
+   nskb->ip_summed = CHECKSUM_NONE;
SKB_GSO_CB(nskb)->csum =
skb_checksum(nskb, doffset,
 nskb->len - doffset, 0);
diff --git a/net/ipv4/udp_offload.c b/net/ipv4/udp_offload.c
index 8a3405a..f86f1e1 100644
--- a/net/ipv4/udp_offload.c
+++ b/net/ipv4/udp_offload.c
@@ -78,16 +78,6 @@ static struct sk_buff *__skb_udp_tunnel_segment(struct 
sk_buff *skb,
 
features &= skb->dev->hw_enc_features;
 
-   /* The only checksum offload we care about from here on out is the
-* outer one so strip the existing checksum feature flags and
-* instead set the flag based on our outer checksum offload value.
-*/
-   if (remcsum || ufo) {
-   features &= ~NETIF_F_CSUM_MASK;
-   if (!need_csum || offload_csum)
-   features |= NETIF_F_HW_CSUM;
-   }
-
/* segment inner packet. */
segs = gso_inner_segment(skb, features);
if (IS_ERR_OR_NULL(segs)) {
-- 
2.8.0.rc2



Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread David Ahern

On 3/22/16 12:56 PM, Lance Richardson wrote:

Field fl4.flowi4_flags is not initialized in fib_compute_spec_dst()
before calling fib_lookup(), which means fib_table_lookup() is
using non-deterministic data at this line:

if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {

Fix by initializing the entire fl4 structure, which will prevent
similar issues as fields are added in the future by ensuring that
all fields are initialized to zero unless explicitly initialized
to another value.

Fixes: 58189ca7b2741 ("net: Fix vti use case with oif in dst lookups")
Suggested-by: David Ahern 
Signed-off-by: Lance Richardson 
---
  net/ipv4/fib_frontend.c | 16 +++-
  1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 21add55..8a9246d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -280,7 +280,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
struct in_device *in_dev;
struct fib_result res;
struct rtable *rt;
-   struct flowi4 fl4;
struct net *net;
int scope;

@@ -296,14 +295,13 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)

scope = RT_SCOPE_UNIVERSE;
if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
-   fl4.flowi4_oif = 0;
-   fl4.flowi4_iif = LOOPBACK_IFINDEX;
-   fl4.daddr = ip_hdr(skb)->saddr;
-   fl4.saddr = 0;
-   fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
-   fl4.flowi4_scope = scope;
-   fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
-   fl4.flowi4_tun_key.tun_id = 0;
+   struct flowi4 fl4 = {
+   .flowi4_iif = LOOPBACK_IFINDEX,
+   .daddr = ip_hdr(skb)->saddr,
+   .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
+   .flowi4_scope = scope,
+   .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
+   };
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {



Acked-by: David Ahern 

DaveM: this should go into stable releases back to v4.3.



Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread Lance Richardson
Apologies, that should have been [PATCH v2 net].

- Original Message -
> From: "Lance Richardson" 
> To: netdev@vger.kernel.org
> Cc: d...@cumulusnetworks.com
> Sent: Tuesday, March 22, 2016 2:56:57 PM
> Subject: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()
> 
> Field fl4.flowi4_flags is not initialized in fib_compute_spec_dst()
> before calling fib_lookup(), which means fib_table_lookup() is
> using non-deterministic data at this line:
> 
>   if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
> 
> Fix by initializing the entire fl4 structure, which will prevent
> similar issues as fields are added in the future by ensuring that
> all fields are initialized to zero unless explicitly initialized
> to another value.
> 
> Fixes: 58189ca7b2741 ("net: Fix vti use case with oif in dst lookups")
> Suggested-by: David Ahern 
> Signed-off-by: Lance Richardson 
> ---
>  net/ipv4/fib_frontend.c | 16 +++-
>  1 file changed, 7 insertions(+), 9 deletions(-)
> 
> diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> index 21add55..8a9246d 100644
> --- a/net/ipv4/fib_frontend.c
> +++ b/net/ipv4/fib_frontend.c
> @@ -280,7 +280,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
>   struct in_device *in_dev;
>   struct fib_result res;
>   struct rtable *rt;
> - struct flowi4 fl4;
>   struct net *net;
>   int scope;
>  
> @@ -296,14 +295,13 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
>  
>   scope = RT_SCOPE_UNIVERSE;
>   if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
> - fl4.flowi4_oif = 0;
> - fl4.flowi4_iif = LOOPBACK_IFINDEX;
> - fl4.daddr = ip_hdr(skb)->saddr;
> - fl4.saddr = 0;
> - fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
> - fl4.flowi4_scope = scope;
> - fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
> - fl4.flowi4_tun_key.tun_id = 0;
> + struct flowi4 fl4 = {
> + .flowi4_iif = LOOPBACK_IFINDEX,
> + .daddr = ip_hdr(skb)->saddr,
> + .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
> + .flowi4_scope = scope,
> + .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
> + };
>   if (!fib_lookup(net, &fl4, &res, 0))
>   return FIB_RES_PREFSRC(net, res);
>   } else {
> --
> 2.5.5
> 
> 


[PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread Lance Richardson
Field fl4.flowi4_flags is not initialized in fib_compute_spec_dst()
before calling fib_lookup(), which means fib_table_lookup() is
using non-deterministic data at this line:

if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {

Fix by initializing the entire fl4 structure, which will prevent
similar issues as fields are added in the future by ensuring that
all fields are initialized to zero unless explicitly initialized
to another value.

Fixes: 58189ca7b2741 ("net: Fix vti use case with oif in dst lookups")
Suggested-by: David Ahern 
Signed-off-by: Lance Richardson 
---
 net/ipv4/fib_frontend.c | 16 +++-
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 21add55..8a9246d 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -280,7 +280,6 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
struct in_device *in_dev;
struct fib_result res;
struct rtable *rt;
-   struct flowi4 fl4;
struct net *net;
int scope;
 
@@ -296,14 +295,13 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
 
scope = RT_SCOPE_UNIVERSE;
if (!ipv4_is_zeronet(ip_hdr(skb)->saddr)) {
-   fl4.flowi4_oif = 0;
-   fl4.flowi4_iif = LOOPBACK_IFINDEX;
-   fl4.daddr = ip_hdr(skb)->saddr;
-   fl4.saddr = 0;
-   fl4.flowi4_tos = RT_TOS(ip_hdr(skb)->tos);
-   fl4.flowi4_scope = scope;
-   fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
-   fl4.flowi4_tun_key.tun_id = 0;
+   struct flowi4 fl4 = {
+   .flowi4_iif = LOOPBACK_IFINDEX,
+   .daddr = ip_hdr(skb)->saddr,
+   .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
+   .flowi4_scope = scope,
+   .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
+   };
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {
-- 
2.5.5



Re: [PATCH v4 2/3] IB/hns: Add HiSilicon RoCE driver support

2016-03-22 Thread Christoph Hellwig
>  drivers/infiniband/Kconfig |1 +
>  drivers/infiniband/hw/Makefile |1 +
>  drivers/infiniband/hw/hisilicon/hns/Kconfig|   10 +

To fit in with the other drivers drop the hisilicon level
of the directory hierarchy.

Haven't had time to look at the actual driver yet so far, though.


Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread Lance Richardson
- Original Message -
> From: "David Ahern" 
> To: "Lance Richardson" , netdev@vger.kernel.org
> Sent: Tuesday, March 22, 2016 2:29:02 PM
> Subject: Re: [PATCH net] ipv4: initialize flowi4_flags before calling 
> fib_lookup()
> 
> On 3/22/16 9:31 AM, Lance Richardson wrote:
> > Field fl4.flowi4_flags is not initialized in fib_compute_spec_dst()
> > before calling fib_lookup(), which means fib_table_lookup() is
> > using non-deterministic data at this line:
> >
> > if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {
> >
> > Fix by initializing fl4.flowi4_flags to zero.
> >
> > Signed-off-by: Lance Richardson 
> > ---
> >   net/ipv4/fib_frontend.c | 1 +
> >   1 file changed, 1 insertion(+)
> >
> > diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
> > index 21add55..896844a 100644
> > --- a/net/ipv4/fib_frontend.c
> > +++ b/net/ipv4/fib_frontend.c
> > @@ -304,6 +304,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
> > fl4.flowi4_scope = scope;
> > fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
> > fl4.flowi4_tun_key.tun_id = 0;
> > +   fl4.flowi4_flags = 0;
> > if (!fib_lookup(net, &fl4, &res, 0))
> > return FIB_RES_PREFSRC(net, res);
> > } else {
> >
> 
> Fixes: 58189ca7b2741 ("net: Fix vti use case with oif in dst lookups")
> 
> I think a more robust solution is to move fl4 to this if case and init
> when it is declared:
> 
>   struct flowi4 fl4 = {
>   .flowi4_iif = LOOPBACK_IFINDEX,
>   .daddr = ip_hdr(skb)->saddr,
>   .flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
>   .flowi4_scope = scope,
>   .flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
>   };
> 

Agreed... I actually debated doing something similar but opted for the
smaller delta.

v2 coming up.

Thanks for the review,

   Lance


[iproute PATCH 2/7] ipaddress: colorize peer, broadcast and anycast addresses as well

2016-03-22 Thread Phil Sutter
Signed-off-by: Phil Sutter 
---
 ip/ipaddress.c | 31 +--
 1 file changed, 17 insertions(+), 14 deletions(-)

diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 7aab8e781eae8..90d7b1096c3aa 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -862,7 +862,8 @@ int print_linkinfo(const struct sockaddr_nl *who,
fprintf(fp, " peer ");
else
fprintf(fp, " brd ");
-   fprintf(fp, "%s", 
ll_addr_n2a(RTA_DATA(tb[IFLA_BROADCAST]),
+   color_fprintf(fp, COLOR_MAC, "%s",
+   
ll_addr_n2a(RTA_DATA(tb[IFLA_BROADCAST]),
  
RTA_PAYLOAD(tb[IFLA_BROADCAST]),
  ifi->ifi_type,
  b1, sizeof(b1)));
@@ -1062,32 +1063,34 @@ int print_addrinfo(const struct sockaddr_nl *who, 
struct nlmsghdr *n,
  RTA_PAYLOAD(rta_tb[IFA_LOCAL]),
  RTA_DATA(rta_tb[IFA_LOCAL]),
  abuf, sizeof(abuf)));
-   if (rta_tb[IFA_ADDRESS] == NULL ||
-   memcmp(RTA_DATA(rta_tb[IFA_ADDRESS]), 
RTA_DATA(rta_tb[IFA_LOCAL]),
-  ifa->ifa_family == AF_INET ? 4 : 16) == 0) {
-   fprintf(fp, "/%d ", ifa->ifa_prefixlen);
-   } else {
-   fprintf(fp, " peer %s/%d ",
-   format_host(ifa->ifa_family,
-   RTA_PAYLOAD(rta_tb[IFA_ADDRESS]),
-   RTA_DATA(rta_tb[IFA_ADDRESS]),
-   abuf, sizeof(abuf)),
-   ifa->ifa_prefixlen);
+   if (rta_tb[IFA_ADDRESS] &&
+   memcmp(RTA_DATA(rta_tb[IFA_ADDRESS]),
+  RTA_DATA(rta_tb[IFA_LOCAL]),
+  ifa->ifa_family == AF_INET ? 4 : 16)) {
+   fprintf(fp, " peer ");
+   color_fprintf(fp, ifa_family_color(ifa->ifa_family),
+ "%s", format_host(ifa->ifa_family,
+ RTA_PAYLOAD(rta_tb[IFA_ADDRESS]),
+ RTA_DATA(rta_tb[IFA_ADDRESS]),
+ abuf, sizeof(abuf)));
}
+   fprintf(fp, "/%d ", ifa->ifa_prefixlen);
}
 
if (brief)
goto brief_exit;
 
if (rta_tb[IFA_BROADCAST]) {
-   fprintf(fp, "brd %s ",
+   fprintf(fp, "brd ");
+   color_fprintf(fp, ifa_family_color(ifa->ifa_family), "%s ",
format_host(ifa->ifa_family,
RTA_PAYLOAD(rta_tb[IFA_BROADCAST]),
RTA_DATA(rta_tb[IFA_BROADCAST]),
abuf, sizeof(abuf)));
}
if (rta_tb[IFA_ANYCAST]) {
-   fprintf(fp, "any %s ",
+   fprintf(fp, "any ");
+   color_fprintf(fp, ifa_family_color(ifa->ifa_family), "%s ",
format_host(ifa->ifa_family,
RTA_PAYLOAD(rta_tb[IFA_ANYCAST]),
RTA_DATA(rta_tb[IFA_ANYCAST]),
-- 
2.7.2



[iproute PATCH 1/7] color: introduce color helpers and COLOR_CLEAR

2016-03-22 Thread Phil Sutter
This adds two helper functions which map a given data field to a color,
so color_fprintf() statements don't have to be duplicated with only a
different color value depending on that data field's value. In order for
this to work in a generic way, COLOR_CLEAR has been added to serve as a
fallback default of uncolored output.

Signed-off-by: Phil Sutter 
---
 include/color.h |  5 -
 ip/ipaddress.c  | 47 +--
 lib/color.c | 30 +-
 3 files changed, 46 insertions(+), 36 deletions(-)

diff --git a/include/color.h b/include/color.h
index b85003aed19f8..c1c29831159af 100644
--- a/include/color.h
+++ b/include/color.h
@@ -7,10 +7,13 @@ enum color_attr {
COLOR_INET,
COLOR_INET6,
COLOR_OPERSTATE_UP,
-   COLOR_OPERSTATE_DOWN
+   COLOR_OPERSTATE_DOWN,
+   COLOR_CLEAR
 };
 
 void enable_color(void);
 int color_fprintf(FILE *fp, enum color_attr attr, const char *fmt, ...);
+enum color_attr ifa_family_color(__u8 ifa_family);
+enum color_attr oper_state_color(__u8 state);
 
 #endif
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index f8c5029400949..7aab8e781eae8 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -135,25 +135,15 @@ static const char *oper_states[] = {
 
 static void print_operstate(FILE *f, __u8 state)
 {
-   if (state >= ARRAY_SIZE(oper_states))
+   if (state >= ARRAY_SIZE(oper_states)) {
fprintf(f, "state %#x ", state);
-   else {
-   if (brief) {
-   if (strcmp(oper_states[state], "UP") == 0)
-   color_fprintf(f, COLOR_OPERSTATE_UP, "%-14s ", 
oper_states[state]);
-   else if (strcmp(oper_states[state], "DOWN") == 0)
-   color_fprintf(f, COLOR_OPERSTATE_DOWN, "%-14s 
", oper_states[state]);
-   else
-   fprintf(f, "%-14s ", oper_states[state]);
-   } else {
-   fprintf(f, "state ");
-   if (strcmp(oper_states[state], "UP") == 0)
-   color_fprintf(f, COLOR_OPERSTATE_UP, "%s ", 
oper_states[state]);
-   else if (strcmp(oper_states[state], "DOWN") == 0)
-   color_fprintf(f, COLOR_OPERSTATE_DOWN, "%s ", 
oper_states[state]);
-   else
-   fprintf(f, "%s ", oper_states[state]);
-   }
+   } else if (brief) {
+   color_fprintf(f, oper_state_color(state),
+ "%-14s ", oper_states[state]);
+   } else {
+   fprintf(f, "state ");
+   color_fprintf(f, oper_state_color(state),
+ "%s ", oper_states[state]);
}
 }
 
@@ -1067,22 +1057,11 @@ int print_addrinfo(const struct sockaddr_nl *who, 
struct nlmsghdr *n,
}
 
if (rta_tb[IFA_LOCAL]) {
-   if (ifa->ifa_family == AF_INET)
-   color_fprintf(fp, COLOR_INET, "%s", 
format_host(ifa->ifa_family,
-   RTA_PAYLOAD(rta_tb[IFA_LOCAL]),
-   RTA_DATA(rta_tb[IFA_LOCAL]),
-   abuf, sizeof(abuf)));
-   else if (ifa->ifa_family == AF_INET6)
-   color_fprintf(fp, COLOR_INET6, "%s", 
format_host(ifa->ifa_family,
-   RTA_PAYLOAD(rta_tb[IFA_LOCAL]),
-   RTA_DATA(rta_tb[IFA_LOCAL]),
-   abuf, sizeof(abuf)));
-   else
-   fprintf(fp, "%s", format_host(ifa->ifa_family,
-   RTA_PAYLOAD(rta_tb[IFA_LOCAL]),
-   RTA_DATA(rta_tb[IFA_LOCAL]),
-   abuf, sizeof(abuf)));
-
+   color_fprintf(fp, ifa_family_color(ifa->ifa_family), "%s",
+ format_host(ifa->ifa_family,
+ RTA_PAYLOAD(rta_tb[IFA_LOCAL]),
+ RTA_DATA(rta_tb[IFA_LOCAL]),
+ abuf, sizeof(abuf)));
if (rta_tb[IFA_ADDRESS] == NULL ||
memcmp(RTA_DATA(rta_tb[IFA_ADDRESS]), 
RTA_DATA(rta_tb[IFA_LOCAL]),
   ifa->ifa_family == AF_INET ? 4 : 16) == 0) {
diff --git a/lib/color.c b/lib/color.c
index 8c9a48ba702bf..95596be236a05 100644
--- a/lib/color.c
+++ b/lib/color.c
@@ -1,5 +1,8 @@
 #include 
 #include 
+#include 
+#include 
+#include 
 
 #include "color.h"
 
@@ -32,7 +35,8 @@ static enum color attr_colors[] = {
C_MAGENTA,
C_BLUE,
C_GREEN,
-   C_RED
+   C_RED,
+   C_CLEAR
 };
 
 static int color_is_enabled;
@@ -62,3 +66,27

[iproute PATCH 7/7] lib/ll_addr: improve ll_addr_n2a() a bit

2016-03-22 Thread Phil Sutter
Apart from making the code a bit more compact and efficient, this also
prevents a potential buffer overflow if the passed buffer is really too
small: Although correctly decrementing the size parameter passed to
snprintf, it could become negative which would then wrap since snprintf
uses (unsigned) size_t for the parameter.

Signed-off-by: Phil Sutter 
---
 lib/ll_addr.c | 15 +++
 1 file changed, 3 insertions(+), 12 deletions(-)

diff --git a/lib/ll_addr.c b/lib/ll_addr.c
index 2ce9abfbb8c69..465ed6fa4d9a2 100644
--- a/lib/ll_addr.c
+++ b/lib/ll_addr.c
@@ -41,18 +41,9 @@ const char *ll_addr_n2a(const unsigned char *addr, int alen, 
int type, char *buf
if (alen == 16 && type == ARPHRD_TUNNEL6) {
return inet_ntop(AF_INET6, addr, buf, blen);
}
-   l = 0;
-   for (i=0; i

[iproute PATCH 4/7] utils: make rt_addr_n2a() non-reentrant by default

2016-03-22 Thread Phil Sutter
There is only a single user who needs it to be reentrant (not really,
but it's safer like this), add rt_addr_n2a_r() for it to use.

Signed-off-by: Phil Sutter 
---
 include/utils.h   |  3 ++-
 ip/ip6tunnel.c|  2 +-
 ip/iplink_bond.c  |  5 +
 ip/ipmroute.c |  7 ++-
 ip/ipprefix.c |  5 +
 ip/iproute.c  | 10 +++---
 ip/iproute_lwtunnel.c |  7 ++-
 ip/iprule.c   |  8 ++--
 ip/iptunnel.c |  2 +-
 ip/ipxfrm.c   | 29 ++---
 ip/link_ip6tnl.c  |  7 ++-
 ip/xfrm_monitor.c | 16 +++-
 lib/utils.c   | 11 +--
 tc/f_flower.c |  7 ++-
 14 files changed, 37 insertions(+), 82 deletions(-)

diff --git a/include/utils.h b/include/utils.h
index 84083b0dbba71..bc2cbce0cc303 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -125,8 +125,9 @@ int af_byte_len(int af);
 const char *format_host_r(int af, int len, const void *addr,
   char *buf, int buflen);
 const char *format_host(int af, int lne, const void *addr);
-const char *rt_addr_n2a(int af, int len, const void *addr,
+const char *rt_addr_n2a_r(int af, int len, const void *addr,
   char *buf, int buflen);
+const char *rt_addr_n2a(int af, int len, const void *addr);
 
 int read_family(const char *name);
 const char *family_name(int family);
diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c
index d588645eb942a..c02fa0746ab69 100644
--- a/ip/ip6tunnel.c
+++ b/ip/ip6tunnel.c
@@ -78,7 +78,7 @@ static void print_tunnel(struct ip6_tnl_parm2 *p)
   p->name,
   tnl_strproto(p->proto),
   format_host_r(AF_INET6, 16, &p->raddr, s1, sizeof(s1)),
-  rt_addr_n2a(AF_INET6, 16, &p->laddr, s2, sizeof(s2)));
+  rt_addr_n2a_r(AF_INET6, 16, &p->laddr, s2, sizeof(s2)));
if (p->link) {
const char *n = ll_index_to_name(p->link);
 
diff --git a/ip/iplink_bond.c b/ip/iplink_bond.c
index 45473f66da065..7da58e4556c07 100644
--- a/ip/iplink_bond.c
+++ b/ip/iplink_bond.c
@@ -411,7 +411,6 @@ static void bond_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
 
if (tb[IFLA_BOND_ARP_IP_TARGET]) {
struct rtattr *iptb[BOND_MAX_ARP_TARGETS + 1];
-   char buf[INET_ADDRSTRLEN];
int i;
 
parse_rtattr_nested(iptb, BOND_MAX_ARP_TARGETS,
@@ -425,9 +424,7 @@ static void bond_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
fprintf(f, "%s",
rt_addr_n2a(AF_INET,
RTA_PAYLOAD(iptb[i]),
-   RTA_DATA(iptb[i]),
-   buf,
-   INET_ADDRSTRLEN));
+   RTA_DATA(iptb[i])));
if (i < BOND_MAX_ARP_TARGETS-1 && iptb[i+1])
fprintf(f, ",");
}
diff --git a/ip/ipmroute.c b/ip/ipmroute.c
index 34543c00ecb75..2b9f892a62630 100644
--- a/ip/ipmroute.c
+++ b/ip/ipmroute.c
@@ -58,7 +58,6 @@ int print_mroute(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
struct rtmsg *r = NLMSG_DATA(n);
int len = n->nlmsg_len;
struct rtattr *tb[RTA_MAX+1];
-   char abuf[256];
char obuf[256];
 
SPRINT_BUF(b1);
@@ -126,16 +125,14 @@ int print_mroute(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
len = snprintf(obuf, sizeof(obuf),
   "(%s, ", rt_addr_n2a(family,
RTA_PAYLOAD(tb[RTA_SRC]),
-   RTA_DATA(tb[RTA_SRC]),
-   abuf, sizeof(abuf)));
+   RTA_DATA(tb[RTA_SRC])));
else
len = sprintf(obuf, "(unknown, ");
if (tb[RTA_DST])
snprintf(obuf + len, sizeof(obuf) - len,
 "%s)", rt_addr_n2a(family,
RTA_PAYLOAD(tb[RTA_DST]),
-   RTA_DATA(tb[RTA_DST]),
-   abuf, sizeof(abuf)));
+   RTA_DATA(tb[RTA_DST])));
else
snprintf(obuf + len, sizeof(obuf) - len, "unknown) ");
 
diff --git a/ip/ipprefix.c b/ip/ipprefix.c
index 2524f784965b5..4d986dbc1a5d1 100644
--- a/ip/ipprefix.c
+++ b/ip/ipprefix.c
@@ -75,15 +75,12 @@ int print_prefix(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
 
if (tb[PREFIX_ADDRESS]) {
struct in6_addr *pfx;
-   char abuf[256];
 
pfx = (struct in6_addr *)RTA_

[iproute PATCH 6/7] lib/utils: introduce rt_addr_n2a_rta()

2016-03-22 Thread Phil Sutter
This simple macro eases calling rt_addr_n2a() with data from an rt_attr
pointer.

Signed-off-by: Phil Sutter 
---
 include/utils.h   |  2 ++
 ip/iplink_bond.c  |  4 +---
 ip/ipmroute.c |  8 ++--
 ip/ipprefix.c | 14 +++---
 ip/iproute.c  | 20 +++-
 ip/iproute_lwtunnel.c | 19 ---
 ip/iprule.c   | 16 ++--
 ip/link_ip6tnl.c  |  8 ++--
 tc/f_flower.c |  8 ++--
 9 files changed, 29 insertions(+), 70 deletions(-)

diff --git a/include/utils.h b/include/utils.h
index ebb80c9c20b6d..ef81d00f3d70d 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -130,6 +130,8 @@ const char *format_host(int af, int lne, const void *addr);
 const char *rt_addr_n2a_r(int af, int len, const void *addr,
   char *buf, int buflen);
 const char *rt_addr_n2a(int af, int len, const void *addr);
+#define rt_addr_n2a_rta(af, rta) \
+   rt_addr_n2a(af, RTA_PAYLOAD(rta), RTA_DATA(rta))
 
 int read_family(const char *name);
 const char *family_name(int family);
diff --git a/ip/iplink_bond.c b/ip/iplink_bond.c
index 7da58e4556c07..fe83479a091a8 100644
--- a/ip/iplink_bond.c
+++ b/ip/iplink_bond.c
@@ -422,9 +422,7 @@ static void bond_print_opt(struct link_util *lu, FILE *f, 
struct rtattr *tb[])
for (i = 0; i < BOND_MAX_ARP_TARGETS; i++) {
if (iptb[i])
fprintf(f, "%s",
-   rt_addr_n2a(AF_INET,
-   RTA_PAYLOAD(iptb[i]),
-   RTA_DATA(iptb[i])));
+   rt_addr_n2a_rta(AF_INET, iptb[i]));
if (i < BOND_MAX_ARP_TARGETS-1 && iptb[i+1])
fprintf(f, ",");
}
diff --git a/ip/ipmroute.c b/ip/ipmroute.c
index 2b9f892a62630..c33cdcbbde21b 100644
--- a/ip/ipmroute.c
+++ b/ip/ipmroute.c
@@ -123,16 +123,12 @@ int print_mroute(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
 
if (tb[RTA_SRC])
len = snprintf(obuf, sizeof(obuf),
-  "(%s, ", rt_addr_n2a(family,
-   RTA_PAYLOAD(tb[RTA_SRC]),
-   RTA_DATA(tb[RTA_SRC])));
+  "(%s, ", rt_addr_n2a_rta(family, tb[RTA_SRC]));
else
len = sprintf(obuf, "(unknown, ");
if (tb[RTA_DST])
snprintf(obuf + len, sizeof(obuf) - len,
-"%s)", rt_addr_n2a(family,
-   RTA_PAYLOAD(tb[RTA_DST]),
-   RTA_DATA(tb[RTA_DST])));
+"%s)", rt_addr_n2a_rta(family, tb[RTA_DST]));
else
snprintf(obuf + len, sizeof(obuf) - len, "unknown) ");
 
diff --git a/ip/ipprefix.c b/ip/ipprefix.c
index 4d986dbc1a5d1..a833efcf67c4a 100644
--- a/ip/ipprefix.c
+++ b/ip/ipprefix.c
@@ -71,19 +71,11 @@ int print_prefix(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
 
parse_rtattr(tb, RTA_MAX, RTM_RTA(prefix), len);
 
-   fprintf(fp, "prefix ");
-
if (tb[PREFIX_ADDRESS]) {
-   struct in6_addr *pfx;
-
-   pfx = (struct in6_addr *)RTA_DATA(tb[PREFIX_ADDRESS]);
-
-   fprintf(fp, "%s", rt_addr_n2a(family,
- RTA_PAYLOAD(tb[PREFIX_ADDRESS]),
- pfx));
+   fprintf(fp, "prefix %s/%u",
+   rt_addr_n2a_rta(family, tb[PREFIX_ADDRESS]),
+   prefix->prefix_len);
}
-   fprintf(fp, "/%u ", prefix->prefix_len);
-
fprintf(fp, "dev %s ", ll_index_to_name(prefix->prefix_ifindex));
 
if (prefix->prefix_flags & IF_PREFIX_ONLINK)
diff --git a/ip/iproute.c b/ip/iproute.c
index 67d551b54d00c..8224d7ffa94bf 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -370,11 +370,9 @@ int print_route(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
 
if (tb[RTA_DST]) {
if (r->rtm_dst_len != host_len) {
-   fprintf(fp, "%s/%u ", rt_addr_n2a(r->rtm_family,
-  RTA_PAYLOAD(tb[RTA_DST]),
-  RTA_DATA(tb[RTA_DST])),
-   r->rtm_dst_len
-   );
+   fprintf(fp, "%s/%u ",
+   rt_addr_n2a_rta(r->rtm_family, tb[RTA_DST]),
+   r->rtm_dst_len);
} else {
fprintf(fp, "%s ",
format_host_rta(r->rtm_family, tb[RTA_DST]));
@@ -386,11 +384,9 @@ int print_route(const struct sockaddr_nl *who, struct 
nlmsg

[iproute PATCH 3/7] make format_host non-reentrant by default

2016-03-22 Thread Phil Sutter
Only three users require it to be reentrant; the rest are fine without
it. Instead, provide a reentrant format_host_r() for the users which
need it.

Signed-off-by: Phil Sutter 
---
 bridge/fdb.c  |  4 +---
 include/utils.h   |  3 ++-
 ip/ip6tunnel.c|  2 +-
 ip/ipaddress.c| 13 -
 ip/ipaddrlabel.c  |  4 +---
 ip/iplink_geneve.c|  5 ++---
 ip/iplink_vxlan.c | 13 ++---
 ip/ipmaddr.c  |  6 +-
 ip/ipneigh.c  |  4 +---
 ip/iproute.c  | 21 +++--
 ip/iproute_lwtunnel.c |  4 +---
 ip/iprule.c   |  9 +++--
 ip/iptoken.c  |  4 +---
 ip/iptunnel.c |  6 +++---
 ip/link_gre.c |  5 ++---
 ip/link_gre6.c|  5 ++---
 ip/link_iptnl.c   |  7 +++
 ip/link_vti.c |  5 ++---
 ip/link_vti6.c|  5 ++---
 ip/tcp_metrics.c  |  6 ++
 lib/utils.c   |  9 -
 misc/ss.c |  4 ++--
 tc/m_nat.c|  4 ++--
 23 files changed, 59 insertions(+), 89 deletions(-)

diff --git a/bridge/fdb.c b/bridge/fdb.c
index 88f1b63c233e9..e8c314a3c0771 100644
--- a/bridge/fdb.c
+++ b/bridge/fdb.c
@@ -104,7 +104,6 @@ int print_fdb(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
fprintf(fp, "dev %s ", ll_index_to_name(r->ndm_ifindex));
 
if (tb[NDA_DST]) {
-   SPRINT_BUF(abuf);
int family = AF_INET;
 
if (RTA_PAYLOAD(tb[NDA_DST]) == sizeof(struct in6_addr))
@@ -113,8 +112,7 @@ int print_fdb(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
fprintf(fp, "dst %s ",
format_host(family,
RTA_PAYLOAD(tb[NDA_DST]),
-   RTA_DATA(tb[NDA_DST]),
-   abuf, sizeof(abuf)));
+   RTA_DATA(tb[NDA_DST])));
}
 
if (tb[NDA_VLAN]) {
diff --git a/include/utils.h b/include/utils.h
index c43427c35a6cc..84083b0dbba71 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -122,8 +122,9 @@ int addr64_n2a(__u64 addr, char *buff, size_t len);
 int af_bit_len(int af);
 int af_byte_len(int af);
 
-const char *format_host(int af, int len, const void *addr,
+const char *format_host_r(int af, int len, const void *addr,
   char *buf, int buflen);
+const char *format_host(int af, int lne, const void *addr);
 const char *rt_addr_n2a(int af, int len, const void *addr,
   char *buf, int buflen);
 
diff --git a/ip/ip6tunnel.c b/ip/ip6tunnel.c
index 2e9d3ed40d003..d588645eb942a 100644
--- a/ip/ip6tunnel.c
+++ b/ip/ip6tunnel.c
@@ -77,7 +77,7 @@ static void print_tunnel(struct ip6_tnl_parm2 *p)
printf("%s: %s/ipv6 remote %s local %s",
   p->name,
   tnl_strproto(p->proto),
-  format_host(AF_INET6, 16, &p->raddr, s1, sizeof(s1)),
+  format_host_r(AF_INET6, 16, &p->raddr, s1, sizeof(s1)),
   rt_addr_n2a(AF_INET6, 16, &p->laddr, s2, sizeof(s2)));
if (p->link) {
const char *n = ll_index_to_name(p->link);
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 90d7b1096c3aa..03c8c03cd4a17 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -964,7 +964,6 @@ int print_addrinfo(const struct sockaddr_nl *who, struct 
nlmsghdr *n,
/* Use local copy of ifa_flags to not interfere with filtering code */
unsigned int ifa_flags;
struct rtattr *rta_tb[IFA_MAX+1];
-   char abuf[256];
 
SPRINT_BUF(b1);
 
@@ -1061,8 +1060,7 @@ int print_addrinfo(const struct sockaddr_nl *who, struct 
nlmsghdr *n,
color_fprintf(fp, ifa_family_color(ifa->ifa_family), "%s",
  format_host(ifa->ifa_family,
  RTA_PAYLOAD(rta_tb[IFA_LOCAL]),
- RTA_DATA(rta_tb[IFA_LOCAL]),
- abuf, sizeof(abuf)));
+ RTA_DATA(rta_tb[IFA_LOCAL])));
if (rta_tb[IFA_ADDRESS] &&
memcmp(RTA_DATA(rta_tb[IFA_ADDRESS]),
   RTA_DATA(rta_tb[IFA_LOCAL]),
@@ -1071,8 +1069,7 @@ int print_addrinfo(const struct sockaddr_nl *who, struct 
nlmsghdr *n,
color_fprintf(fp, ifa_family_color(ifa->ifa_family),
  "%s", format_host(ifa->ifa_family,
  RTA_PAYLOAD(rta_tb[IFA_ADDRESS]),
- RTA_DATA(rta_tb[IFA_ADDRESS]),
- abuf, sizeof(abuf)));
+ RTA_DATA(rta_tb[IFA_ADDRESS])));
}
fprintf(fp, "/%d ", ifa->ifa_prefixlen);
}
@@ -1085,16 +1082,14 @@ int print_addrinfo(const struct sockaddr_nl *who, 
struct nlmsghdr *n,

[iproute PATCH 0/7] Refactor some internal library functions

2016-03-22 Thread Phil Sutter
The following series is a result of reviewing color output support and
some formatting helpers usually used with struct rtattr fields.

Phil Sutter (7):
  color: introduce color helpers and COLOR_CLEAR
  ipaddress: colorize peer, broadcast and anycast addresses as well
  make format_host non-reentrant by default
  utils: make rt_addr_n2a() non-reentrant by default
  lib/utils: introduce format_host_rta()
  lib/utils: introduce rt_addr_n2a_rta()
  lib/ll_addr: improve ll_addr_n2a() a bit

 bridge/fdb.c  |  4 +--
 include/color.h   |  5 ++-
 include/utils.h   | 10 --
 ip/ip6tunnel.c|  4 +--
 ip/ipaddress.c| 87 ++-
 ip/ipaddrlabel.c  |  7 ++---
 ip/iplink_bond.c  |  7 +
 ip/iplink_geneve.c|  5 ++-
 ip/iplink_vxlan.c | 13 
 ip/ipmaddr.c  |  6 +---
 ip/ipmroute.c | 11 ++-
 ip/ipneigh.c  |  6 +---
 ip/ipprefix.c | 17 ++
 ip/iproute.c  | 68 
 ip/iproute_lwtunnel.c | 29 -
 ip/iprule.c   | 39 +++
 ip/iptoken.c  | 11 ++-
 ip/iptunnel.c |  8 ++---
 ip/ipxfrm.c   | 29 -
 ip/link_gre.c |  5 ++-
 ip/link_gre6.c|  5 ++-
 ip/link_ip6tnl.c  | 11 ++-
 ip/link_iptnl.c   |  7 ++---
 ip/link_vti.c |  5 ++-
 ip/link_vti6.c|  5 ++-
 ip/tcp_metrics.c  |  6 ++--
 ip/xfrm_monitor.c | 16 ++
 lib/color.c   | 30 +-
 lib/ll_addr.c | 15 ++---
 lib/utils.c   | 20 ++--
 misc/ss.c |  4 +--
 tc/f_flower.c | 11 ++-
 tc/m_nat.c|  4 +--
 33 files changed, 187 insertions(+), 323 deletions(-)

-- 
2.7.2



Re: [PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread David Ahern

On 3/22/16 9:31 AM, Lance Richardson wrote:

Field fl4.flowi4_flags is not initialized in fib_compute_spec_dst()
before calling fib_lookup(), which means fib_table_lookup() is
using non-deterministic data at this line:

if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {

Fix by initializing fl4.flowi4_flags to zero.

Signed-off-by: Lance Richardson 
---
  net/ipv4/fib_frontend.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 21add55..896844a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -304,6 +304,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
fl4.flowi4_scope = scope;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
fl4.flowi4_tun_key.tun_id = 0;
+   fl4.flowi4_flags = 0;
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {



Fixes: 58189ca7b2741 ("net: Fix vti use case with oif in dst lookups")

I think a more robust solution is to move fl4 into this if branch and
initialize it where it is declared:


struct flowi4 fl4 = {
.flowi4_iif = LOOPBACK_IFINDEX,
.daddr = ip_hdr(skb)->saddr,
.flowi4_tos = RT_TOS(ip_hdr(skb)->tos),
.flowi4_scope = scope,
.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0,
};


[iproute PATCH 5/7] lib/utils: introduce format_host_rta()

2016-03-22 Thread Phil Sutter
This simple macro eases calling format_host() with data from an rt_attr
pointer.

Signed-off-by: Phil Sutter 
---
 include/utils.h   |  2 ++
 ip/ipaddress.c| 20 
 ip/ipaddrlabel.c  |  5 ++---
 ip/ipneigh.c  |  4 +---
 ip/iproute.c  | 33 +++--
 ip/iproute_lwtunnel.c |  5 ++---
 ip/iprule.c   | 16 ++--
 ip/iptoken.c  |  9 +++--
 8 files changed, 35 insertions(+), 59 deletions(-)

diff --git a/include/utils.h b/include/utils.h
index bc2cbce0cc303..ebb80c9c20b6d 100644
--- a/include/utils.h
+++ b/include/utils.h
@@ -125,6 +125,8 @@ int af_byte_len(int af);
 const char *format_host_r(int af, int len, const void *addr,
   char *buf, int buflen);
 const char *format_host(int af, int lne, const void *addr);
+#define format_host_rta(af, rta) \
+   format_host(af, RTA_PAYLOAD(rta), RTA_DATA(rta))
 const char *rt_addr_n2a_r(int af, int len, const void *addr,
   char *buf, int buflen);
 const char *rt_addr_n2a(int af, int len, const void *addr);
diff --git a/ip/ipaddress.c b/ip/ipaddress.c
index 03c8c03cd4a17..3998d8cec4ab2 100644
--- a/ip/ipaddress.c
+++ b/ip/ipaddress.c
@@ -1058,18 +1058,16 @@ int print_addrinfo(const struct sockaddr_nl *who, 
struct nlmsghdr *n,
 
if (rta_tb[IFA_LOCAL]) {
color_fprintf(fp, ifa_family_color(ifa->ifa_family), "%s",
- format_host(ifa->ifa_family,
- RTA_PAYLOAD(rta_tb[IFA_LOCAL]),
- RTA_DATA(rta_tb[IFA_LOCAL])));
+ format_host_rta(ifa->ifa_family,
+ rta_tb[IFA_LOCAL]));
if (rta_tb[IFA_ADDRESS] &&
memcmp(RTA_DATA(rta_tb[IFA_ADDRESS]),
   RTA_DATA(rta_tb[IFA_LOCAL]),
   ifa->ifa_family == AF_INET ? 4 : 16)) {
fprintf(fp, " peer ");
color_fprintf(fp, ifa_family_color(ifa->ifa_family),
- "%s", format_host(ifa->ifa_family,
- RTA_PAYLOAD(rta_tb[IFA_ADDRESS]),
- RTA_DATA(rta_tb[IFA_ADDRESS])));
+ "%s", format_host_rta(ifa->ifa_family,
+ rta_tb[IFA_ADDRESS]));
}
fprintf(fp, "/%d ", ifa->ifa_prefixlen);
}
@@ -1080,16 +1078,14 @@ int print_addrinfo(const struct sockaddr_nl *who, 
struct nlmsghdr *n,
if (rta_tb[IFA_BROADCAST]) {
fprintf(fp, "brd ");
color_fprintf(fp, ifa_family_color(ifa->ifa_family), "%s ",
-   format_host(ifa->ifa_family,
-   RTA_PAYLOAD(rta_tb[IFA_BROADCAST]),
-   RTA_DATA(rta_tb[IFA_BROADCAST])));
+ format_host_rta(ifa->ifa_family,
+ rta_tb[IFA_BROADCAST]));
}
if (rta_tb[IFA_ANYCAST]) {
fprintf(fp, "any ");
color_fprintf(fp, ifa_family_color(ifa->ifa_family), "%s ",
-   format_host(ifa->ifa_family,
-   RTA_PAYLOAD(rta_tb[IFA_ANYCAST]),
-   RTA_DATA(rta_tb[IFA_ANYCAST])));
+ format_host_rta(ifa->ifa_family,
+ rta_tb[IFA_ANYCAST]));
}
fprintf(fp, "scope %s ", rtnl_rtscope_n2a(ifa->ifa_scope, b1, 
sizeof(b1)));
if (ifa_flags & IFA_F_SECONDARY) {
diff --git a/ip/ipaddrlabel.c b/ip/ipaddrlabel.c
index 6076bb952297f..b4cd784094719 100644
--- a/ip/ipaddrlabel.c
+++ b/ip/ipaddrlabel.c
@@ -75,9 +75,8 @@ int print_addrlabel(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg
 
if (tb[IFAL_ADDRESS]) {
fprintf(fp, "prefix %s/%u ",
-   format_host(ifal->ifal_family,
-   RTA_PAYLOAD(tb[IFAL_ADDRESS]),
-   RTA_DATA(tb[IFAL_ADDRESS])),
+   format_host_rta(ifal->ifal_family,
+   tb[IFAL_ADDRESS]),
ifal->ifal_prefixlen);
}
 
diff --git a/ip/ipneigh.c b/ip/ipneigh.c
index 583aad30dc820..c49fb4e7f7b58 100644
--- a/ip/ipneigh.c
+++ b/ip/ipneigh.c
@@ -278,9 +278,7 @@ int print_neigh(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
fprintf(fp, "miss ");
if (tb[NDA_DST]) {
fprintf(fp, "%s ",
-   format_host(r->ndm_family,
-   RTA_PAYLOAD(tb[NDA_DST]),
-   RTA_DATA(tb[NDA_DST])));
+   format_host_rta(r->ndm_family

Re: [PATCH] ipv6: Fix the pmtu path for connected UDP socket

2016-03-22 Thread Cong Wang
On Tue, Mar 22, 2016 at 10:39 AM, Martin KaFai Lau  wrote:
> On Tue, Mar 22, 2016 at 09:53:35AM -0700, Cong Wang wrote:
>> On Mon, Mar 21, 2016 at 11:02 PM, Martin KaFai Lau  wrote:
>> > In term of difference, AFAICT, the current patch is an optimization in the
>> > sense that the update_pmtu() code path does not have to do a dst_check to
>> > discover its sk->sk_dst_cache is invalid, and then do a relookup to find 
>> > out
>> > that the just created RTF_CACHE clone should be used.  To get this, it may
>> > make more sense to remove all the relookup code together during 
>> > update_pmtu().
>> > Even if this slow path was to be optimized, should it be put in a
>> > separate patch where net-next is a better candidate?
>> >
>>
>> Speaking of RTF_CACHE, I am curious why you didn't use FIB next hop exception
>> as what ipv4 does to cache exceptions? This makes IPv6 has more gap with 
>> IPv4.
>> This is (almost) irrelevant to this patch.
> There are a few differences between IPv6 and IPv4.  Both in terms of
> data structure and functionality.  The last 'RTF_CACHE on exception' patchset 
> is one
> step toward this direction. More patches are needed and are welcomed ;)

Sure, I will take a look at this once net-next is re-open.


>
>>
>>
>> > I think fixing it in __udp6_lib_err() or what Cong Wang is suggesting makes
>> > more sense for a net branch fix.  If there is logic specific to 
>> > connected-udp,
>> > I would do it in the __udp6_lib_err() instead.  After looking at
>> > udpv6_sendmsg() and how it calls ip6_dst_store(), may also need to be 
>> > careful
>> > what daddr and saddr should be passed to ip6_dst_store(), or at least a 
>> > commit
>> > message.  The first patch is essentially passing NULL to daddr and saddr
>> > while the second patch seems passing something else.
>>
>> Raw socket needs to fix too, we can't just fix __udp6_lib_err(), this is also
>> why fixing ip6_sk_update_pmtu() is better, its call path is better.
> I don't see rawv6 socket is storing the dst.  I probably have overlooked it.  
> Can
> you point it out?


I thought sk->sk_dst_cache was generic to all sockets, but it is up to each
kind of socket to decide whether to use it, and you are right: the raw socket
doesn't seem to care about it even though it calls *_sk_update_pmtu().


Re: [PATCH] ipv6: Fix the pmtu path for connected UDP socket

2016-03-22 Thread Martin KaFai Lau
On Tue, Mar 22, 2016 at 11:03:29AM -0700, Wei Wang wrote:
> But one thing here, we will have to generate the same flowi6 in both
> ip6_sk_update_pmtu() as well as ip6_update_pmtu(). Is this considered
> as a not clean enough fix?
If they share common code to build flowi6, can that common code
be factored out?


Re: [PATCH] ipv6: Fix the pmtu path for connected UDP socket

2016-03-22 Thread Wei Wang
Thanks Martin and Cong.

I guess then we are going with the following fix in ip6_sk_update_pmtu():
1. call ip6_update_pmtu() as it is
2. do a dst_check()
3. re-lookup() if it is invalid
4. and then do a ip6_dst_store()/dst_set

But one thing here, we will have to generate the same flowi6 in both
ip6_sk_update_pmtu() as well as ip6_update_pmtu(). Is this considered
as a not clean enough fix?


On Tue, Mar 22, 2016 at 10:39 AM, Martin KaFai Lau  wrote:
> On Tue, Mar 22, 2016 at 09:53:35AM -0700, Cong Wang wrote:
>> On Mon, Mar 21, 2016 at 11:02 PM, Martin KaFai Lau  wrote:
>> > In term of difference, AFAICT, the current patch is an optimization in the
>> > sense that the update_pmtu() code path does not have to do a dst_check to
>> > discover its sk->sk_dst_cache is invalid, and then do a relookup to find 
>> > out
>> > that the just created RTF_CACHE clone should be used.  To get this, it may
>> > make more sense to remove all the relookup code together during 
>> > update_pmtu().
>> > Even if this slow path was to be optimized, should it be put in a
>> > separate patch where net-next is a better candidate?
>> >
>>
>> Speaking of RTF_CACHE, I am curious why you didn't use FIB next hop exception
>> as what ipv4 does to cache exceptions? This makes IPv6 has more gap with 
>> IPv4.
>> This is (almost) irrelevant to this patch.
> There are a few differences between IPv6 and IPv4.  Both in terms of
> data structure and functionality.  The last 'RTF_CACHE on exception' patchset 
> is one
> step toward this direction. More patches are needed and are welcomed ;)
>
>>
>>
>> > I think fixing it in __udp6_lib_err() or what Cong Wang is suggesting makes
>> > more sense for a net branch fix.  If there is logic specific to 
>> > connected-udp,
>> > I would do it in the __udp6_lib_err() instead.  After looking at
>> > udpv6_sendmsg() and how it calls ip6_dst_store(), may also need to be 
>> > careful
>> > what daddr and saddr should be passed to ip6_dst_store(), or at least a 
>> > commit
>> > message.  The first patch is essentially passing NULL to daddr and saddr
>> > while the second patch seems passing something else.
>>
>> Raw socket needs to fix too, we can't just fix __udp6_lib_err(), this is also
>> why fixing ip6_sk_update_pmtu() is better, its call path is better.
> I don't see rawv6 socket is storing the dst.  I probably have overlooked it.  
> Can
> you point it out?
>
> Having said that, I don't feel strongly on any of the two places.  I think 
> only
> implementation can tell.


Re: [RFC PATCH 7/9] GSO: Support partial segmentation offload

2016-03-22 Thread Alexander Duyck
On Tue, Mar 22, 2016 at 10:00 AM, Edward Cree  wrote:
> On 18/03/16 23:25, Alexander Duyck wrote:
>> This patch adds support for something I am referring to as GSO partial.
>> The basic idea is that we can support a broader range of devices for
>> segmentation if we use fixed outer headers and have the hardware only
>> really deal with segmenting the inner header.  The idea behind the naming
>> is due to the fact that everything before csum_start will be fixed headers,
>> and everything after will be the region that is handled by hardware.
>>
>> With the current implementation it allows us to add support for the
>> following GSO types with an inner TSO or TSO6 offload:
>> NETIF_F_GSO_GRE
>> NETIF_F_GSO_GRE_CSUM
>> NETIF_F_UDP_TUNNEL
>> NETIF_F_UDP_TUNNEL_CSUM
>>
>> Signed-off-by: Alexander Duyck 
>> ---
> If I'm correctly understanding what you're doing, you're building a large
> TCP segment, feeding it through the encapsulation drivers as normal, then
> at GSO time you're fixing up length fields, checksums etc. in the headers.
> I think we can do this more simply, by making it so that at the time when
> we _generate_ the TCP segment, we give it headers saying it's one MSS big,
> but have several MSS of data.  Similarly when adding the encap headers,
> they all need to get their lengths from what the layer below tells them,
> rather than the current length of data in the SKB.  Then at GSO time all
> the headers already have the right things in, and you don't need to call
> any per-protocol GSO callbacks for them.

One issue I have to deal with here is that we have no way of knowing
what the underlying hardware can support at the time of segment being
created.  You have to keep in mind that what we have access to is the
tunnel dev in many cases, not the underlying dev so we don't know if
things can be offloaded to hardware or not.  By pushing this logic
into the GSO code we can actually implement it without much overhead
since we either segment it into an MSS multiple, or into single MSS
sized chunks.  This way we defer the decision until the very last
moment when we actually know if we can offload some portion of this in
hardware or not.

> Any protocol that noticed it was putting something non-copyable in its
> headers (e.g. GRE with the Counter field, or an outer IP layer without DF
> set needing real IPIDs) would set a flag in the SKB to indicate that we
> really do need to call through the per-protocol GSO stuff.  (Ideally, if
> we had a separate skb->gso_start field rather than piggybacking on
> csum_start, we could reset it to point just before us, so that any further
> headers outside us still can be copied rather than taking callbacks.  But
> I'm not sure whether that's worth using up sk_buff real estate for.)

The idea behind piggybacking on csum_start was due to the fact that we
cannot perform GSO/TSO unless CHECKSUM_PARTIAL is set.  As far as I
know in the case of TCP offloads this always ends up being the
inner-most L4 header so it works out in that it actually reduces code
path as we were having to deal with all the skb->encapsulation checks.
It was a relationship that already existed, I just decided to make use
of it since it simplifies things pretty significantly.

As far as retreating I don't really see how that would work. In most
cases it is an all-or-nothing proposition to setup these outer
headers.  Either we can segment the frame with the outer headers
replicated or we cannot.  I suspect it would end up being a common
case where the hardware will update the outer IP and inner TCP
headers, but I think the outer L4 and inner IP headers will be the
ones that most likely always end up being static.  Also we already
have code paths in place in the GRE driver for instance that prevent
us from using GSO in the case of TUNNEL_SEQ being enabled.

> (It might still be necessary to put the true length in the TCP header, if
> hardware is using that as an input to segmentation.  I think sfc hardware
> just uses 'total length of all payload DMA descriptors', but others might
> behave differently.)

That is what most drivers do.  The way I kind of retained that is that
the TCP header doesn't include an actual length field, but I left the
pseudo-header using the full length of all data.  My thought was to
end up using something like the ixgbe approach for most devices.  What
I did there was replicate the tunnel headers and inner IPv4 or IPv6
header.  In the case of ixgbe and i40e I can throw away the checksum
and length values for the outer IP header, one thing I was curious
about is if I really needed to retain the full packet size for those.

> However, I haven't yet had the time to attempt to implement this, so there
> might be some obvious reason I'm missing why this is impossible.
> Also, it's possible that I've completely misunderstood your patch and it's
> orthogonal to and can coexist with what I'm suggesting.

The one piece I could really use would be an understanding of what
inputs your hard

Re: [PATCH] ipv6: Fix the pmtu path for connected UDP socket

2016-03-22 Thread Martin KaFai Lau
On Tue, Mar 22, 2016 at 09:53:35AM -0700, Cong Wang wrote:
> On Mon, Mar 21, 2016 at 11:02 PM, Martin KaFai Lau  wrote:
> > In term of difference, AFAICT, the current patch is an optimization in the
> > sense that the update_pmtu() code path does not have to do a dst_check to
> > discover its sk->sk_dst_cache is invalid, and then do a relookup to find out
> > that the just created RTF_CACHE clone should be used.  To get this, it may
> > make more sense to remove all the relookup code together during 
> > update_pmtu().
> > Even if this slow path was to be optimized, should it be put in a
> > separate patch where net-next is a better candidate?
> >
>
> Speaking of RTF_CACHE, I am curious why you didn't use FIB next hop exception
> as what ipv4 does to cache exceptions? This makes IPv6 has more gap with IPv4.
> This is (almost) irrelevant to this patch.
There are a few differences between IPv6 and IPv4, both in terms of
data structure and functionality.  The last 'RTF_CACHE on exception' patchset
is one step in this direction.  More patches are needed and are welcome ;)

>
>
> > I think fixing it in __udp6_lib_err() or what Cong Wang is suggesting makes
> > more sense for a net branch fix.  If there is logic specific to 
> > connected-udp,
> > I would do it in the __udp6_lib_err() instead.  After looking at
> > udpv6_sendmsg() and how it calls ip6_dst_store(), may also need to be 
> > careful
> > what daddr and saddr should be passed to ip6_dst_store(), or at least a 
> > commit
> > message.  The first patch is essentially passing NULL to daddr and saddr
> > while the second patch seems passing something else.
>
> Raw socket needs to fix too, we can't just fix __udp6_lib_err(), this is also
> why fixing ip6_sk_update_pmtu() is better, its call path is better.
I don't see the rawv6 socket storing the dst.  I have probably overlooked it.
Can you point it out?

Having said that, I don't feel strongly about either of the two places.  I think
only an implementation can tell.


Re: [PATCH] ath9k: fix checkpatch.pl identation and sapce errors.

2016-03-22 Thread Joe Perches
On Tue, 2016-03-22 at 12:23 +0530, Ashwini Singh wrote:
> Signed-off-by: Ashwini Singh 

It might be better to break this into a few different patches.

o Remove trailing whitespace
o Fix vertical line / brace location
o Multi-line statement parenthesis alignment
o Space to tab conversions
o Whitespace addition / removal



Re: ip-token: unable to remove a token & multi-token handling & concurrent use w/ EUI64/privacy

2016-03-22 Thread Daniel Borkmann

Hi Robin,

On 03/19/2016 07:53 PM, Robin H. Johnson wrote:
[...]

Playing around with IPv6 tokens, I ran into a problem:
Once you have a token set on an interface, it's impossible to remove it!

# ip token set :: dev eth0
RTNETLINK answers: Invalid argument


I'll look into a fix. I think this was intentional, but I currently fail to
recall the reason why (I should have put a note into the commit log). ;)
The draft is pretty terse in any case; as it seems we only invalidate other
tokenized addresses, it should be okay to just remove it.


This is a side-effect of rejecting ipv6_addr_any in inet6_set_iftoken.

While this gets fixed, I have two related feature requests for this:
- Please make it possible to configure multiple tokens on an interface:
   Use case: Deploying local services on well-known addresses inside a
   network without explicit prefix configuration.
- Adding a token causes other address generation methods to be disabled,
   this is problematic if you wish to prefer privacy addresses for
   outbound connections.

Design suggestion:
Convert from using a single token to using a list of tokens, with an
explicit default IPv6-any-addr (::) in the list, to represent that
other address generation should ALSO take place (EUI64/privacy).
Deletion of the any-addr from the list should disable EUI64/privacy
addresses.


Seems you already have some patches, please feel free to send them. ;)

Thanks for the feedback!
Daniel


Re: [RFC PATCH 7/9] GSO: Support partial segmentation offload

2016-03-22 Thread Edward Cree
On 18/03/16 23:25, Alexander Duyck wrote:
> This patch adds support for something I am referring to as GSO partial.
> The basic idea is that we can support a broader range of devices for
> segmentation if we use fixed outer headers and have the hardware only
> really deal with segmenting the inner header.  The idea behind the naming
> is due to the fact that everything before csum_start will be fixed headers,
> and everything after will be the region that is handled by hardware.
>
> With the current implementation it allows us to add support for the
> following GSO types with an inner TSO or TSO6 offload:
> NETIF_F_GSO_GRE
> NETIF_F_GSO_GRE_CSUM
> NETIF_F_UDP_TUNNEL
> NETIF_F_UDP_TUNNEL_CSUM
>
> Signed-off-by: Alexander Duyck 
> ---
If I'm correctly understanding what you're doing, you're building a large
TCP segment, feeding it through the encapsulation drivers as normal, then
at GSO time you're fixing up length fields, checksums etc. in the headers.
I think we can do this more simply, by making it so that at the time when
we _generate_ the TCP segment, we give it headers saying it's one MSS big,
but have several MSS of data.  Similarly when adding the encap headers,
they all need to get their lengths from what the layer below tells them,
rather than the current length of data in the SKB.  Then at GSO time all
the headers already have the right things in, and you don't need to call
any per-protocol GSO callbacks for them.
Any protocol that noticed it was putting something non-copyable in its
headers (e.g. GRE with the Counter field, or an outer IP layer without DF
set needing real IPIDs) would set a flag in the SKB to indicate that we
really do need to call through the per-protocol GSO stuff.  (Ideally, if
we had a separate skb->gso_start field rather than piggybacking on
csum_start, we could reset it to point just before us, so that any further
headers outside us still can be copied rather than taking callbacks.  But
I'm not sure whether that's worth using up sk_buff real estate for.)
(It might still be necessary to put the true length in the TCP header, if
hardware is using that as an input to segmentation.  I think sfc hardware
just uses 'total length of all payload DMA descriptors', but others might
behave differently.)
However, I haven't yet had the time to attempt to implement this, so there
might be some obvious reason I'm missing why this is impossible.
Also, it's possible that I've completely misunderstood your patch and it's
orthogonal to and can coexist with what I'm suggesting.
-Ed


Re: [PATCH 3/3] libceph: use KMEM_CACHE macro

2016-03-22 Thread Ilya Dryomov
On Sun, Mar 13, 2016 at 8:18 AM, Geliang Tang  wrote:
> Use KMEM_CACHE() instead of kmem_cache_create() to simplify the code.
>
> Signed-off-by: Geliang Tang 
> ---
>  net/ceph/messenger.c  | 10 ++
>  net/ceph/osd_client.c |  5 +
>  2 files changed, 3 insertions(+), 12 deletions(-)
>
> diff --git a/net/ceph/messenger.c b/net/ceph/messenger.c
> index 9382619..32c997e 100644
> --- a/net/ceph/messenger.c
> +++ b/net/ceph/messenger.c
> @@ -235,18 +235,12 @@ static struct workqueue_struct *ceph_msgr_wq;
>  static int ceph_msgr_slab_init(void)
>  {
> BUG_ON(ceph_msg_cache);
> -   ceph_msg_cache = kmem_cache_create("ceph_msg",
> -   sizeof (struct ceph_msg),
> -   __alignof__(struct ceph_msg), 0, 
> NULL);
> -
> +   ceph_msg_cache = KMEM_CACHE(ceph_msg, 0);
> if (!ceph_msg_cache)
> return -ENOMEM;
>
> BUG_ON(ceph_msg_data_cache);
> -   ceph_msg_data_cache = kmem_cache_create("ceph_msg_data",
> -   sizeof (struct ceph_msg_data),
> -   __alignof__(struct ceph_msg_data),
> -   0, NULL);
> +   ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0);
> if (ceph_msg_data_cache)
> return 0;
>
> diff --git a/net/ceph/osd_client.c b/net/ceph/osd_client.c
> index 5bc0537..7558855 100644
> --- a/net/ceph/osd_client.c
> +++ b/net/ceph/osd_client.c
> @@ -2783,10 +2783,7 @@ EXPORT_SYMBOL(ceph_osdc_writepages);
>  int ceph_osdc_setup(void)
>  {
> BUG_ON(ceph_osd_request_cache);
> -   ceph_osd_request_cache = kmem_cache_create("ceph_osd_request",
> -   sizeof (struct ceph_osd_request),
> -   __alignof__(struct ceph_osd_request),
> -   0, NULL);
> +   ceph_osd_request_cache = KMEM_CACHE(ceph_osd_request, 0);
>
> return ceph_osd_request_cache ? 0 : -ENOMEM;
>  }

Applied, with osd_client.c hunk dropped.  ceph_osd_request cache
objects are no longer sizeof(struct ceph_osd_request).

Thanks,

Ilya


Re: [PATCH] ipv6: Fix the pmtu path for connected UDP socket

2016-03-22 Thread Cong Wang
On Mon, Mar 21, 2016 at 11:02 PM, Martin KaFai Lau  wrote:
> I think Cong Wang is suggesting, in ip6_sk_update_pmtu():
> 1. call ip6_update_pmtu() as it is
> 2. do a dst_check()
> 3. re-lookup() if it is invalid
> 4. and then do a ip6_dst_store()/dst_set
>
> The above is exactly what inet6_csk_update_pmtu(), which was also used in the
> first patch, is doing.

Well, the reasons why I suggest to fix ip6_sk_update_pmtu() instead of anything
else (no matter __udp6_lib_err() in v1 or ip6_update_pmtu() in v2) are that:

1) Catch up with ipv4 part, which is ipv4_sk_update_pmtu()
2) Straight-forward, (ideally) no need to bother other irrelevant
callers like xfrm.

Functionally all the solutions should work to fix Wei's problem, we just need to
find which one is more elegant.


>
> In term of difference, AFAICT, the current patch is an optimization in the
> sense that the update_pmtu() code path does not have to do a dst_check to
> discover its sk->sk_dst_cache is invalid, and then do a relookup to find out
> that the just created RTF_CACHE clone should be used.  To get this, it may
> make more sense to remove all the relookup code together during update_pmtu().
> Even if this slow path was to be optimized, should it be put in a
> separate patch where net-next is a better candidate?
>

Speaking of RTF_CACHE, I am curious why you didn't use FIB next hop exceptions,
as ipv4 does, to cache exceptions? This widens the gap between IPv6 and IPv4.
This is (almost) irrelevant to this patch.


> I think fixing it in __udp6_lib_err() or what Cong Wang is suggesting makes
> more sense for a net branch fix.  If there is logic specific to connected-udp,
> I would do it in the __udp6_lib_err() instead.  After looking at
> udpv6_sendmsg() and how it calls ip6_dst_store(), may also need to be careful
> what daddr and saddr should be passed to ip6_dst_store(), or at least a commit
> message.  The first patch is essentially passing NULL to daddr and saddr
> while the second patch seems passing something else.

Raw sockets need to be fixed too; we can't just fix __udp6_lib_err(), and this is also
why fixing ip6_sk_update_pmtu() is better, its call path is better.


Re: [PATCH v5 2/4] Documentation: Bindings: Add STM32 DWMAC glue

2016-03-22 Thread Chen-Yu Tsai
On Mon, Mar 21, 2016 at 10:02 PM, Giuseppe CAVALLARO
 wrote:
> On 3/21/2016 11:45 AM, Alexandre Torgue wrote:
>>
>> Hi,
>>
>> 2016-03-18 17:00 GMT+01:00 Chen-Yu Tsai :
>>>
>>> Hi,
>>>
>>> On Fri, Mar 18, 2016 at 11:37 PM, Alexandre TORGUE
>>>  wrote:

 Signed-off-by: Alexandre TORGUE 

 diff --git a/Documentation/devicetree/bindings/net/stm32-dwmac.txt
 b/Documentation/devicetree/bindings/net/stm32-dwmac.txt
 new file mode 100644
 index 000..ada2aa4
 --- /dev/null
 +++ b/Documentation/devicetree/bindings/net/stm32-dwmac.txt
 @@ -0,0 +1,32 @@
 +STMicroelectronics STM32 / MCU DWMAC glue layer controller
 +
 +This file documents platform glue layer for stmmac.
 +Please see stmmac.txt for the other unchanged properties.
 +
 +The device node has following properties.
 +
 +Required properties:
 +- compatible:  Should be "st,stm32-dwmac" to select glue, and
 +  "snps,dwmac-3.50a" to select IP vesrion.
>
>
> Almost all the synp gmac chips have the HW capability register that is
> used for setting all the parameters at probe time.
> This will override fields passed from DT. In theory, it is not
> necessary to pass "snps,dwmac-3.50a" from the device tree unless there
> is either no HW cap reg or the glue has some w/a for a specific chip
> revision.
> To be honest, I like to see the "snps,dwmac-3.50a" as compatibility
> to also have a better readability (that's my personal view ;-) ).

I agree having the versioned strings is good for informational purposes,
and to signal hardware capability. It is not so good for directly
binding drivers in the implementation though.

Unfortunately, as Joachim pointed out, exynos5440 uses it so we cannot
change it.

ChenYu

> Peppe
>
>
>>>
>>> If you need to have some sort of hardware glue, then it is not compatible.
>>>
>>
>> We could have the case where the glue is set by a bootloader.
>> In this case, we will select IP version in compatible and we will use
>> generic dwmac glue to probe stmmac driver.
>>
>> Regards
>>
>> Alex.
>>
>>> ChenYu
>>>
 +- clocks: Must contain a phandle for each entry in clock-names.
 +- clock-names: Should be "stmmaceth" for the host clock.
 +  Should be "tx-clk" for the MAC TX clock.
 +  Should be "rx-clk" for the MAC RX clock.
 +- st,syscon : Should be phandle/offset pair. The phandle to the syscon
 node which
 + encompases the glue register, and the offset of the
 control register.
 +Example:
 +
 +   ethernet0: dwmac@40028000 {
 +   compatible = "st,stm32-dwmac",
 "snps,dwmac-3.50a";
 +   status = "disabled";
 +   reg = <0x40028000 0x8000>;
 +   reg-names = "stmmaceth";
 +   interrupts = <0 61 0>, <0 62 0>;
 +   interrupt-names = "macirq", "eth_wake_irq";
 +   clock-names = "stmmaceth", "tx-clk", "rx-clk";
 +   clocks = <&rcc 0 25>, <&rcc 0 26>, <&rcc 0 27>;
 +   st,syscon = <&syscfg 0x4>;
 +   snps,pbl = <8>;
 +   snps,mixed-burst;
 +   dma-ranges;
 +   };
 --
 1.9.1


 ___
 linux-arm-kernel mailing list
 linux-arm-ker...@lists.infradead.org
 http://lists.infradead.org/mailman/listinfo/linux-arm-kernel
>>
>>
>>
>


Re: [PATCH v5 2/4] Documentation: Bindings: Add STM32 DWMAC glue

2016-03-22 Thread Alexandre Torgue
Hi guys,

I will fix typo issues (s/vesrion/version and ethernet @).

Concerning compatible string. For sure "snps,dwmac-3.50a" string is
not used inside glue driver.
I prefer to keep it for information, but if you really want me to
remove it I will not block ;)

2016-03-21 16:36 GMT+01:00 Joachim  Eastwood :
> On 21 March 2016 at 13:40, Rob Herring  wrote:
>> On Sat, Mar 19, 2016 at 12:00:22AM +0800, Chen-Yu Tsai wrote:
>>> Hi,
>>>
>>> On Fri, Mar 18, 2016 at 11:37 PM, Alexandre TORGUE
>>>  wrote:
>>> > +- clocks: Must contain a phandle for each entry in clock-names.
>>> > +- clock-names: Should be "stmmaceth" for the host clock.
>>
We can remove host clock (stmmac eth) entry here and refer to
stmmac.txt binding for common entry

>> This doesn't sound like the clock input signal name...
>>
>>> > +  Should be "tx-clk" for the MAC TX clock.
>>> > +  Should be "rx-clk" for the MAC RX clock.
>>
>> How can other DWMAC blocks not have these clocks? The glue can't really
>> add these clocks. It could combine them into one or a new version of
>> DWMAC could have a different number of clock inputs. So if there is
>> variation here, then some of the bindings are probably wrong. I guess
>> the only change I'm suggesting is possibly moving these into common
>> binding doc.
>
> The LPC18xx implementation probably have these clocks as well but the
> LPC1850 user manual only documents the main clock. Someone with access
> to the IP block doc from Synopsys should be able to check which clocks
> the MAC really needs.
>
> Rockchip bindings have two clocks named "mac_clk_rx" and "mac_clk_tx".
> These are probably the same as stm32 needs so maybe use these names
> and move them into the main doc and update the rockchip binding.
>
I think we can use the same names. But I have a doubt about moving them into a
common binding (maybe I have not understood well). When you say "common
binding file", is it the "stmmac.txt" binding? If yes, does it mean that we
have to control them inside the stmmac driver (no longer in the glue)? In this
case those clocks will become "required" for stm32 and rockchip but
not for other chips. Could it create confusion?

Best regards

Alex

>
> regards,
> Joachim Eastwood


[PATCH v3 1/2] Revert "vsock: Fix blocking ops call in prepare_to_wait"

2016-03-22 Thread Claudio Imbrenda
This reverts commit 5988818008257ca42010d6b43a3e0e48afec9898 ("vsock: Fix
blocking ops call in prepare_to_wait")

The commit reverted with this patch caused us to potentially miss wakeups.
Since the condition is not checked between the prepare_to_wait and the
schedule(), if a wakeup happens after the condition is checked but before
the sleep happens, we will miss it. ( A description of the problem can be
found here: http://www.makelinux.net/ldd3/chp-6-sect-2 ).

By reverting the patch, the behaviour is still incorrect (since we
shouldn't sleep between the prepare_to_wait and the schedule) but at least
it will not miss wakeups.

The next patch in the series actually fixes the behaviour.

Signed-off-by: Claudio Imbrenda 
---
 net/vmw_vsock/af_vsock.c | 19 +--
 1 file changed, 13 insertions(+), 6 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index bbe65dc..7fd1220 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1557,6 +1557,8 @@ static int vsock_stream_sendmsg(struct socket *sock, 
struct msghdr *msg,
if (err < 0)
goto out;
 
+   prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
+
while (total_written < len) {
ssize_t written;
 
@@ -1576,9 +1578,7 @@ static int vsock_stream_sendmsg(struct socket *sock, 
struct msghdr *msg,
goto out_wait;
 
release_sock(sk);
-   prepare_to_wait(sk_sleep(sk), &wait, 
TASK_INTERRUPTIBLE);
timeout = schedule_timeout(timeout);
-   finish_wait(sk_sleep(sk), &wait);
lock_sock(sk);
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
@@ -1588,6 +1588,8 @@ static int vsock_stream_sendmsg(struct socket *sock, 
struct msghdr *msg,
goto out_wait;
}
 
+   prepare_to_wait(sk_sleep(sk), &wait,
+   TASK_INTERRUPTIBLE);
}
 
/* These checks occur both as part of and after the loop
@@ -1633,6 +1635,7 @@ static int vsock_stream_sendmsg(struct socket *sock, 
struct msghdr *msg,
 out_wait:
if (total_written > 0)
err = total_written;
+   finish_wait(sk_sleep(sk), &wait);
 out:
release_sock(sk);
return err;
@@ -1713,6 +1716,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr 
*msg, size_t len,
if (err < 0)
goto out;
 
+   prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
while (1) {
s64 ready = vsock_stream_has_data(vsk);
@@ -1723,7 +1727,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr 
*msg, size_t len,
 */
 
err = -ENOMEM;
-   goto out;
+   goto out_wait;
} else if (ready > 0) {
ssize_t read;
 
@@ -1746,7 +1750,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr 
*msg, size_t len,
vsk, target, read,
!(flags & MSG_PEEK), &recv_data);
if (err < 0)
-   goto out;
+   goto out_wait;
 
if (read >= target || flags & MSG_PEEK)
break;
@@ -1769,9 +1773,7 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr 
*msg, size_t len,
break;
 
release_sock(sk);
-   prepare_to_wait(sk_sleep(sk), &wait, 
TASK_INTERRUPTIBLE);
timeout = schedule_timeout(timeout);
-   finish_wait(sk_sleep(sk), &wait);
lock_sock(sk);
 
if (signal_pending(current)) {
@@ -1781,6 +1783,9 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr 
*msg, size_t len,
err = -EAGAIN;
break;
}
+
+   prepare_to_wait(sk_sleep(sk), &wait,
+   TASK_INTERRUPTIBLE);
}
}
 
@@ -1811,6 +1816,8 @@ vsock_stream_recvmsg(struct socket *sock, struct msghdr 
*msg, size_t len,
err = copied;
}
 
+out_wait:
+   finish_wait(sk_sleep(sk), &wait);
 out:
release_sock(sk);
return err;
-- 
1.9.1



[PATCH v3 2/2] AF_VSOCK: Shrink the area influenced by prepare_to_wait

2016-03-22 Thread Claudio Imbrenda
When a thread is prepared for waiting by calling prepare_to_wait, sleeping
is not allowed until either the wait has taken place or finish_wait has
been called.  The existing code in af_vsock imposed unnecessary no-sleep
assumptions on a broad list of backend functions.
This patch shrinks the influence of prepare_to_wait to the area where it
is strictly needed, therefore relaxing the no-sleep restriction there.

Signed-off-by: Claudio Imbrenda 
---
 net/vmw_vsock/af_vsock.c | 158 +--
 1 file changed, 85 insertions(+), 73 deletions(-)

diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
index 7fd1220..3dce53e 100644
--- a/net/vmw_vsock/af_vsock.c
+++ b/net/vmw_vsock/af_vsock.c
@@ -1209,10 +1209,14 @@ static int vsock_stream_connect(struct socket *sock, 
struct sockaddr *addr,
 
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
-   goto out_wait_error;
+   sk->sk_state = SS_UNCONNECTED;
+   sock->state = SS_UNCONNECTED;
+   goto out_wait;
} else if (timeout == 0) {
err = -ETIMEDOUT;
-   goto out_wait_error;
+   sk->sk_state = SS_UNCONNECTED;
+   sock->state = SS_UNCONNECTED;
+   goto out_wait;
}
 
prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
@@ -1220,20 +1224,17 @@ static int vsock_stream_connect(struct socket *sock, 
struct sockaddr *addr,
 
if (sk->sk_err) {
err = -sk->sk_err;
-   goto out_wait_error;
-   } else
+   sk->sk_state = SS_UNCONNECTED;
+   sock->state = SS_UNCONNECTED;
+   } else {
err = 0;
+   }
 
 out_wait:
finish_wait(sk_sleep(sk), &wait);
 out:
release_sock(sk);
return err;
-
-out_wait_error:
-   sk->sk_state = SS_UNCONNECTED;
-   sock->state = SS_UNCONNECTED;
-   goto out_wait;
 }
 
 static int vsock_accept(struct socket *sock, struct socket *newsock, int flags)
@@ -1270,18 +1271,20 @@ static int vsock_accept(struct socket *sock, struct 
socket *newsock, int flags)
   listener->sk_err == 0) {
release_sock(listener);
timeout = schedule_timeout(timeout);
+   finish_wait(sk_sleep(listener), &wait);
lock_sock(listener);
 
if (signal_pending(current)) {
err = sock_intr_errno(timeout);
-   goto out_wait;
+   goto out;
} else if (timeout == 0) {
err = -EAGAIN;
-   goto out_wait;
+   goto out;
}
 
prepare_to_wait(sk_sleep(listener), &wait, TASK_INTERRUPTIBLE);
}
+   finish_wait(sk_sleep(listener), &wait);
 
if (listener->sk_err)
err = -listener->sk_err;
@@ -1301,19 +1304,15 @@ static int vsock_accept(struct socket *sock, struct 
socket *newsock, int flags)
 */
if (err) {
vconnected->rejected = true;
-   release_sock(connected);
-   sock_put(connected);
-   goto out_wait;
+   } else {
+   newsock->state = SS_CONNECTED;
+   sock_graft(connected, newsock);
}
 
-   newsock->state = SS_CONNECTED;
-   sock_graft(connected, newsock);
release_sock(connected);
sock_put(connected);
}
 
-out_wait:
-   finish_wait(sk_sleep(listener), &wait);
 out:
release_sock(listener);
return err;
@@ -1557,11 +1556,11 @@ static int vsock_stream_sendmsg(struct socket *sock, 
struct msghdr *msg,
if (err < 0)
goto out;
 
-   prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
 
while (total_written < len) {
ssize_t written;
 
+   prepare_to_wait(sk_sleep(sk), &wait, TASK_INTERRUPTIBLE);
while (vsock_stream_has_space(vsk) == 0 &&
   sk->sk_err == 0 &&
   !(sk->sk_shutdown & SEND_SHUTDOWN) &&
@@ -1570,27 +1569,33 @@ static int vsock_stream_sendmsg(struct socket *sock, 
struct msghdr *msg,
/* Don't wait for non-blocking sockets. */
if (timeout == 0) {
err = -EAGAIN;
-   goto out_wait;
+   finish_wait(sk_sleep(sk), &wait);
+   goto out_err;
}
 
err = transport->notify_send_pre_block(vsk, &send_data);
-   if (err < 0)
-   goto o

[PATCH v3 0/2] AF_VSOCK: Shrink the area influenced by prepare_to_wait

2016-03-22 Thread Claudio Imbrenda
This patchset applies on net-next.

I think I found a problem with the patch submitted by Laura Abbott
( https://lkml.org/lkml/2016/2/4/711 ): we might miss wakeups.
Since the condition is not checked between the prepare_to_wait and the
schedule(), if a wakeup happens after the condition is checked but before
the sleep happens, we will miss it. ( A description of the problem can be
found here: http://www.makelinux.net/ldd3/chp-6-sect-2 ).

The first patch reverts the previous broken patch, while the second patch
properly fixes the sleep-while-waiting issue.


Claudio Imbrenda (2):
  Revert "vsock: Fix blocking ops call in prepare_to_wait"
  AF_VSOCK: Shrink the area influenced by prepare_to_wait

 net/vmw_vsock/af_vsock.c | 155 ++-
 1 file changed, 87 insertions(+), 68 deletions(-)

-- 
1.9.1



[iproute PATCH v3] Use ARRAY_SIZE macro everywhere

2016-03-22 Thread Phil Sutter
This patch was generated by the following semantic patch (a trimmed down
version of what is shipped with Linux sources):

@@
type T;
T[] E;
@@
(
- (sizeof(E)/sizeof(*E))
+ ARRAY_SIZE(E)
|
- (sizeof(E)/sizeof(E[...]))
+ ARRAY_SIZE(E)
|
- (sizeof(E)/sizeof(T))
+ ARRAY_SIZE(E)
)

The only manual adjustment was to include utils.h in misc/nstat.c to make
the macro known there.

Signed-off-by: Phil Sutter 
---
Changes since v1:
- Rebased onto current master to avoid merge conflicts.

Changes since v2:
- Patch recreated from scratch.
---
 bridge/link.c | 2 +-
 misc/nstat.c  | 2 +-
 misc/ss.c | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/bridge/link.c b/bridge/link.c
index 353e1c3da45db..b347040ccf91d 100644
--- a/bridge/link.c
+++ b/bridge/link.c
@@ -319,7 +319,7 @@ static int brlink_modify(int argc, char **argv)
} else if (strcmp(*argv, "state") == 0) {
NEXT_ARG();
char *endptr;
-   size_t nstates = sizeof(port_states) / 
sizeof(*port_states);
+   size_t nstates = ARRAY_SIZE(port_states);
 
state = strtol(*argv, &endptr, 10);
if (!(**argv != '\0' && *endptr == '\0')) {
diff --git a/misc/nstat.c b/misc/nstat.c
index a9e0f20789e3c..4f3863ff99121 100644
--- a/misc/nstat.c
+++ b/misc/nstat.c
@@ -95,7 +95,7 @@ static int useless_number(const char *id)
 {
int i;
 
-   for (i = 0; i < sizeof(useless_numbers)/sizeof(*useless_numbers); i++)
+   for (i = 0; i < ARRAY_SIZE(useless_numbers); i++)
if (strcmp(id, useless_numbers[i]) == 0)
return 1;
return 0;
diff --git a/misc/ss.c b/misc/ss.c
index 192389cc82371..449c391579af1 100644
--- a/misc/ss.c
+++ b/misc/ss.c
@@ -666,7 +666,7 @@ static int get_slabstat(struct slabstat *s)
while (fgets(buf, sizeof(buf), fp) != NULL) {
int i;
 
-   for (i = 0; i < sizeof(slabstat_ids)/sizeof(slabstat_ids[0]); 
i++) {
+   for (i = 0; i < ARRAY_SIZE(slabstat_ids); i++) {
if (memcmp(buf, slabstat_ids[i], 
strlen(slabstat_ids[i])) == 0) {
sscanf(buf, "%*s%d", ((int *)s) + i);
cnt--;
-- 
2.7.2



Re: [PATCH] lan78xx: Protect runtime_auto check by #ifdef CONFIG_PM

2016-03-22 Thread Oliver Neukum
On Tue, 2016-03-22 at 11:13 -0400, Alan Stern wrote:
> > Indeed. In that case the point is moot. But it is correct to ask
> > the core whether the device is autosuspended at that point rather
> > than keep a private flag if you can.
> 
> That's why we have pm_runtime_status_suspended().

I guess we are in violent agreement though we were unaware of being
in that state.

Regards
Oliver




[PATCH net] ipv4: initialize flowi4_flags before calling fib_lookup()

2016-03-22 Thread Lance Richardson
Field fl4.flowi4_flags is not initialized in fib_compute_spec_dst()
before calling fib_lookup(), which means fib_table_lookup() is
using non-deterministic data at this line:

if (!(flp->flowi4_flags & FLOWI_FLAG_SKIP_NH_OIF)) {

Fix by initializing fl4.flowi4_flags to zero.

Signed-off-by: Lance Richardson 
---
 net/ipv4/fib_frontend.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 21add55..896844a 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -304,6 +304,7 @@ __be32 fib_compute_spec_dst(struct sk_buff *skb)
fl4.flowi4_scope = scope;
fl4.flowi4_mark = IN_DEV_SRC_VMARK(in_dev) ? skb->mark : 0;
fl4.flowi4_tun_key.tun_id = 0;
+   fl4.flowi4_flags = 0;
if (!fib_lookup(net, &fl4, &res, 0))
return FIB_RES_PREFSRC(net, res);
} else {
-- 
2.5.0



Re: net/sctp: stack-out-of-bounds in sctp_getsockopt

2016-03-22 Thread Marcelo Ricardo Leitner
On Tue, Mar 22, 2016 at 08:21:28AM -0700, Eric Dumazet wrote:
> On Tue, 2016-03-22 at 23:08 +0800, Baozeng Ding wrote:
> > Hi all,
> > 
> > The following program triggers an out-of-bounds bug in
> > sctp_getsockopt. The kernel version is 4.5 (on Mar 16
> > commit 09fd671ccb2475436bd5f597f751ca4a7d177aea). 
> > 
> > ==
> > BUG: KASAN: stack-out-of-bounds in string+0x1ef/0x200 at addr
> > 88003ae679e0
> > Read of size 1 by task syz-executor/19753
> > page:eaeb99c0 count:0 mapcount:0 mapping:  (null)
> > index:0x0
> > flags: 0x1fffc00()
> > page dumped because: kasan: bad access detected
> > CPU: 3 PID: 19753 Comm: syz-executor Not tainted 4.5.0+ #8
> > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> > rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
> >  0003 88003ae67578 82945051 88003ae67608
> >  88003ae679e0 0096 dc00 88003ae675f8
> >  81709f88 030d  0286
> > Call Trace:
> >  [< inline >] __dump_stack lib/dump_stack.c:15
> >  [] dump_stack+0xb3/0x112 lib/dump_stack.c:51
> >  [< inline >] print_address_description mm/kasan/report.c:150
> >  [] kasan_report_error+0x4f8/0x530 mm/kasan/report.c:236
> >  [] ? __lock_acquire+0x15fb/0x5dd0 
> > kernel/locking/lockdep.c:3226
> >  [< inline >] kasan_report mm/kasan/report.c:259
> >  [] __asan_report_load1_noabort+0x3e/0x40 
> > mm/kasan/report.c:277
> >  [] ? string+0x1ef/0x200 lib/vsprintf.c:591
> >  [] string+0x1ef/0x200 lib/vsprintf.c:591
> >  [] vsnprintf+0xb83/0x1900 lib/vsprintf.c:2049
> >  [] ? pointer+0xab0/0xab0 lib/vsprintf.c:1584
> >  [] __request_module+0x132/0x6b0 kernel/kmod.c:146
> >  [] ? mark_held_locks+0xd0/0x130 
> > kernel/locking/lockdep.c:2552
> >  [] ? call_usermodehelper_setup+0x2b0/0x2b0 
> > kernel/kmod.c:530
> >  [] ? mutex_lock_interruptible_nested+0x980/0x980
> >  [] ? __might_fault+0xe4/0x1d0 mm/memory.c:3833
> >  [] find_inlist_lock.constprop.17+0x10c/0x210 
> > net/bridge/netfilter/ebtables.c:347
> >  [< inline >] find_table_lock net/bridge/netfilter/ebtables.c:356
> >  [] do_ebt_get_ctl+0x13b/0x540 
> > net/bridge/netfilter/ebtables.c:1524
> >  [] ? copy_everything_to_user+0x600/0x600 
> > net/bridge/netfilter/ebtables.c:1455
> >  [< inline >] ? __mutex_unlock_common_slowpath 
> > kernel/locking/mutex.c:751
> >  [] ? __mutex_unlock_slowpath+0x239/0x3f0 
> > kernel/locking/mutex.c:762
> >  [] ? mutex_unlock+0x9/0x10 kernel/locking/mutex.c:437
> >  [] ? nf_sockopt_find+0x1a6/0x220 
> > net/netfilter/nf_sockopt.c:87
> >  [< inline >] nf_sockopt net/netfilter/nf_sockopt.c:103
> >  [] nf_getsockopt+0x6d/0xc0 net/netfilter/nf_sockopt.c:121
> >  [] ip_getsockopt+0x135/0x190 net/ipv4/ip_sockglue.c:1523
> >  [] ? do_ip_getsockopt+0x1520/0x1520 
> > net/ipv4/ip_sockglue.c:1353
> >  [< inline >] ? wake_up_process kernel/sched/core.c:2024
> >  [] ? wake_up_q+0x82/0xe0 kernel/sched/core.c:416
> >  [< inline >] ? atomic_dec_and_test 
> > /arch/x86/include/asm/atomic.h:117
> >  [< inline >] ? mmdrop include/linux/sched.h:2611
> >  [] ? drop_futex_key_refs.isra.13+0x70/0xe0 
> > kernel/futex.c:444
> >  [] sctp_getsockopt+0x18d/0x3f40 net/sctp/socket.c:5964
> >  [] ? __lock_acquire+0x15fb/0x5dd0 
> > kernel/locking/lockdep.c:3226
> >  [] ? sctp_do_peeloff+0x2b0/0x2b0 net/sctp/socket.c:4434
> >  [] ? debug_check_no_locks_freed+0x290/0x290 
> > kernel/locking/lockdep.c:4104
> >  [< inline >] ? rcu_read_unlock include/linux/rcupdate.h:922
> >  [] ? __fget+0x20c/0x3b0 fs/file.c:712
> >  [< inline >] ? rcu_lock_release include/linux/rcupdate.h:491
> >  [< inline >] ? rcu_read_unlock include/linux/rcupdate.h:926
> >  [] ? __fget+0x235/0x3b0 fs/file.c:712
> >  [] ? __fget+0x47/0x3b0 fs/file.c:696
> >  [] ? __fget_light+0xa1/0x1f0 fs/file.c:759
> >  [] sock_common_getsockopt+0x95/0xd0 net/core/sock.c:2579
> >  [< inline >] SYSC_getsockopt net/socket.c:1783
> >  [] SyS_getsockopt+0x142/0x230 net/socket.c:1765
> >  [] ? SyS_setsockopt+0x240/0x240 net/socket.c:1752
> >  [] ? entry_SYSCALL_64_fastpath+0x5/0xc1 
> > arch/x86/entry/entry_64.S:191
> >  [] ? trace_hardirqs_on_thunk+0x17/0x19 
> > arch/x86/entry/thunk_64.S:39
> >  [] entry_SYSCALL_64_fastpath+0x23/0xc1 
> > arch/x86/entry/entry_64.S:207
> > Memory state around the buggy address:
> >  88003ae67880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >  88003ae67900: 00 f1 f1 f1 f1 04 f4 f4 f4 f2 f2 f2 f2 00 00 00
> > >88003ae67980: 00 00 00 00 00 00 00 00 00 00 00 00 f4 f3 f3 f3
> >^
> >  88003ae67a00: f3 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >  88003ae67a80: f1 f1 f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00 00 00 00
> > ==
> > 
> > #include 
> > #inc

Re: net/sctp: stack-out-of-bounds in sctp_getsockopt

2016-03-22 Thread Eric Dumazet
On Tue, 2016-03-22 at 08:21 -0700, Eric Dumazet wrote:
> On Tue, 2016-03-22 at 23:08 +0800, Baozeng Ding wrote:
> > Hi all,
> > 
> > The following program triggers an out-of-bounds bug in
> > sctp_getsockopt. The kernel version is 4.5 (on Mar 16
> > commit 09fd671ccb2475436bd5f597f751ca4a7d177aea). 
> > 
> > ==
> > BUG: KASAN: stack-out-of-bounds in string+0x1ef/0x200 at addr
> > 88003ae679e0
> > Read of size 1 by task syz-executor/19753
> > page:eaeb99c0 count:0 mapcount:0 mapping:  (null)
> > index:0x0
> > flags: 0x1fffc00()
> > page dumped because: kasan: bad access detected
> > CPU: 3 PID: 19753 Comm: syz-executor Not tainted 4.5.0+ #8
> > Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> > rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
> >  0003 88003ae67578 82945051 88003ae67608
> >  88003ae679e0 0096 dc00 88003ae675f8
> >  81709f88 030d  0286
> > Call Trace:
> >  [< inline >] __dump_stack lib/dump_stack.c:15
> >  [] dump_stack+0xb3/0x112 lib/dump_stack.c:51
> >  [< inline >] print_address_description mm/kasan/report.c:150
> >  [] kasan_report_error+0x4f8/0x530 mm/kasan/report.c:236
> >  [] ? __lock_acquire+0x15fb/0x5dd0 
> > kernel/locking/lockdep.c:3226
> >  [< inline >] kasan_report mm/kasan/report.c:259
> >  [] __asan_report_load1_noabort+0x3e/0x40 
> > mm/kasan/report.c:277
> >  [] ? string+0x1ef/0x200 lib/vsprintf.c:591
> >  [] string+0x1ef/0x200 lib/vsprintf.c:591
> >  [] vsnprintf+0xb83/0x1900 lib/vsprintf.c:2049
> >  [] ? pointer+0xab0/0xab0 lib/vsprintf.c:1584
> >  [] __request_module+0x132/0x6b0 kernel/kmod.c:146
> >  [] ? mark_held_locks+0xd0/0x130 
> > kernel/locking/lockdep.c:2552
> >  [] ? call_usermodehelper_setup+0x2b0/0x2b0 
> > kernel/kmod.c:530
> >  [] ? mutex_lock_interruptible_nested+0x980/0x980
> >  [] ? __might_fault+0xe4/0x1d0 mm/memory.c:3833
> >  [] find_inlist_lock.constprop.17+0x10c/0x210 
> > net/bridge/netfilter/ebtables.c:347
> >  [< inline >] find_table_lock net/bridge/netfilter/ebtables.c:356
> >  [] do_ebt_get_ctl+0x13b/0x540 
> > net/bridge/netfilter/ebtables.c:1524
> >  [] ? copy_everything_to_user+0x600/0x600 
> > net/bridge/netfilter/ebtables.c:1455
> >  [< inline >] ? __mutex_unlock_common_slowpath 
> > kernel/locking/mutex.c:751
> >  [] ? __mutex_unlock_slowpath+0x239/0x3f0 
> > kernel/locking/mutex.c:762
> >  [] ? mutex_unlock+0x9/0x10 kernel/locking/mutex.c:437
> >  [] ? nf_sockopt_find+0x1a6/0x220 
> > net/netfilter/nf_sockopt.c:87
> >  [< inline >] nf_sockopt net/netfilter/nf_sockopt.c:103
> >  [] nf_getsockopt+0x6d/0xc0 net/netfilter/nf_sockopt.c:121
> >  [] ip_getsockopt+0x135/0x190 net/ipv4/ip_sockglue.c:1523
> >  [] ? do_ip_getsockopt+0x1520/0x1520 
> > net/ipv4/ip_sockglue.c:1353
> >  [< inline >] ? wake_up_process kernel/sched/core.c:2024
> >  [] ? wake_up_q+0x82/0xe0 kernel/sched/core.c:416
> >  [< inline >] ? atomic_dec_and_test 
> > /arch/x86/include/asm/atomic.h:117
> >  [< inline >] ? mmdrop include/linux/sched.h:2611
> >  [] ? drop_futex_key_refs.isra.13+0x70/0xe0 
> > kernel/futex.c:444
> >  [] sctp_getsockopt+0x18d/0x3f40 net/sctp/socket.c:5964
> >  [] ? __lock_acquire+0x15fb/0x5dd0 
> > kernel/locking/lockdep.c:3226
> >  [] ? sctp_do_peeloff+0x2b0/0x2b0 net/sctp/socket.c:4434
> >  [] ? debug_check_no_locks_freed+0x290/0x290 
> > kernel/locking/lockdep.c:4104
> >  [< inline >] ? rcu_read_unlock include/linux/rcupdate.h:922
> >  [] ? __fget+0x20c/0x3b0 fs/file.c:712
> >  [< inline >] ? rcu_lock_release include/linux/rcupdate.h:491
> >  [< inline >] ? rcu_read_unlock include/linux/rcupdate.h:926
> >  [] ? __fget+0x235/0x3b0 fs/file.c:712
> >  [] ? __fget+0x47/0x3b0 fs/file.c:696
> >  [] ? __fget_light+0xa1/0x1f0 fs/file.c:759
> >  [] sock_common_getsockopt+0x95/0xd0 net/core/sock.c:2579
> >  [< inline >] SYSC_getsockopt net/socket.c:1783
> >  [] SyS_getsockopt+0x142/0x230 net/socket.c:1765
> >  [] ? SyS_setsockopt+0x240/0x240 net/socket.c:1752
> >  [] ? entry_SYSCALL_64_fastpath+0x5/0xc1 
> > arch/x86/entry/entry_64.S:191
> >  [] ? trace_hardirqs_on_thunk+0x17/0x19 
> > arch/x86/entry/thunk_64.S:39
> >  [] entry_SYSCALL_64_fastpath+0x23/0xc1 
> > arch/x86/entry/entry_64.S:207
> > Memory state around the buggy address:
> >  88003ae67880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >  88003ae67900: 00 f1 f1 f1 f1 04 f4 f4 f4 f2 f2 f2 f2 00 00 00
> > >88003ae67980: 00 00 00 00 00 00 00 00 00 00 00 00 f4 f3 f3 f3
> >^
> >  88003ae67a00: f3 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
> >  88003ae67a80: f1 f1 f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00 00 00 00
> > ==
> > 
> > #include 
> > #include 
>

[PATCH v8 net-next] ravb: Add dma queue interrupt support

2016-03-22 Thread Yoshihiro Kaneko
From: Kazuya Mizuguchi 

This patch supports the following interrupts.

- One interrupt for multiple (timestamp, error, gPTP)
- One interrupt for emac
- Four interrupts for dma queue (best effort rx/tx, network control rx/tx)

This patch improves the efficiency of the interrupt handler by adding an
interrupt handler corresponding to each interrupt source described
above. Additionally, it reduces the number of accesses to the
EthernetAVB IF.
This patch also prevents the driver from depending on the whim of a boot loader.

[ykaneko0...@gmail.com: define bit names of registers]
[ykaneko0...@gmail.com: add comment for gen3 only registers]
[ykaneko0...@gmail.com: fix coding style]
[ykaneko0...@gmail.com: update changelog]
[ykaneko0...@gmail.com: gen3: fix initialization of interrupts]
[ykaneko0...@gmail.com: gen3: fix clearing interrupts]
[ykaneko0...@gmail.com: gen3: add helper function for request_irq()]
[ykaneko0...@gmail.com: gen3: remove IRQF_SHARED flag for request_irq()]
[ykaneko0...@gmail.com: revert ravb_close() and ravb_ptp_stop()]
[ykaneko0...@gmail.com: avoid calling free_irq() to non-hooked interrupts]
[ykaneko0...@gmail.com: make NC/BE interrupt handler a function]
[ykaneko0...@gmail.com: make timestamp interrupt handler a function]
[ykaneko0...@gmail.com: timestamp interrupt is handled in multiple
 interrupt handler instead of dma queue interrupt handler]
Signed-off-by: Kazuya Mizuguchi 
Signed-off-by: Yoshihiro Kaneko 
---

This patch is based on the master branch of David Miller's next networking
tree.

v8 [Yoshihiro Kaneko]
  - rebased

v7 [Yoshihiro Kaneko]
  drivers/net/ethernet/renesas/ravb_main.c:
  - read TIS in ravb_timestamp_interrupt()
  - avoid overwriting the 'result' by a return value of ravb_ptp_interrupt()
* As suggested by Sergei Shtylyov
  drivers/net/ethernet/renesas/ravb_main.c:
  - read RIS, RIC, TIS and TIC in ravb_queue_interrupt()
  - rename ravb_rx_tx_interrupt to ravb_dma_interrupt
  - shorten argument ravb_queue of ravb_dma_interrupt()

v6 [Yoshihiro Kaneko]
* As suggested by Sergei Shtylyov
  drivers/net/ethernet/renesas/ravb_main.c:
- rename ravb_nc_be_interrupt to ravb_queue_interrupt, change the type
   of return value to 'bool', rename ravb_queue to 'q'
- stop use of 'for' loop for queue interrupt in ravb_interrupt()
- fix comment for ravb_multi_interrupt()
- rename ravb_dmaq_interrupt to ravb_rx_tx_interrupt
- move timestamp interrupt handler into ravb_multi_interrupt()
- make timestamp interrupt handler a function
- rename out_free_irq2 label to out_free_irq_nc_tx
- remove IRQF_SHARED flag for request_irq()
  drivers/net/ethernet/renesas/ravb_ptp.c:
- fix coding style

v5 [Yoshihiro Kaneko]
* As suggested by Sergei Shtylyov
  drivers/net/ethernet/renesas/ravb_main.c:
- stop copying ravb_queue parameter in ravb_dmaq_interrupt()
- clear TFUF instead of disabling
- factored out NC/BE interrupt handler
- rename hook_irq() in ravb_hook_irq()
- add calling free_irq() for the EMAC IRQ
- stop using a loop for free_irq() to avoid calling free_irq() for
  non-hooked interrupt handlers
- add test for failure of devm_kasprintf in ravb_hook_irq()
- update changelog

v4 [Yoshihiro Kaneko]
* compile tested only
* As suggested by Sergei Shtylyov
  drivers/net/ethernet/renesas/ravb.h:
- make two lines of comment into one line.
- remove unused definition of xxx_ALL.
  drivers/net/ethernet/renesas/ravb_main.c:
- remove unrelated change (fix indentation).
- output warning messages when napi_schedule_prep() fails in ravb_dmaq_
  interrupt() like ravb_interrupt().
- change the function name from req_irq to hook_irq.
- fix programming error in hook_irq().
- do free_irq() for rx_irqs[] and tx_irqs[] for only gen3 in out_free_
  irq label in ravb_open().

v3 [Yoshihiro Kaneko]
* compile tested only
* As suggested by Sergei Shtylyov
  - update changelog
  drivers/net/ethernet/renesas/ravb.h:
- add comments to the additional registers like CIE
  drivers/net/ethernet/renesas/ravb_main.c:
- fix the initialization of the interrupt in ravb_dmac_init()
- revert ravb_error_interrupt() because gen3 code is wrong
- change the comment "Management" in ravb_multi_interrupt()
- add a helper function for request_irq() in ravb_open()
- revert ravb_close() because atomicity is not necessary here
  drivers/net/ethernet/renesas/ravb_ptp.c:
- revert ravb_ptp_stop() because atomicity is not necessary here

v2 [Yoshihiro Kaneko]
* compile tested only
* As suggested by Sergei Shtylyov
  - add comment to CIE
  - remove comments from CIE bits
  - fix value of TIx_ALL
  - define each bits for CIE, GIE, GID, RIE0, RID0, RIE2, RID2, TIE, TID
  - reversed Christmas tree declaration ordered
  - rename _ravb_emac_interrupt() to ravb_emac_interrupt_unlocked()
  - remove unnecessary clearing of CIE
  - use a bit name corresponding to the target register, RIE0, RIE2, TIE,
TID, RID2, G

Re: net/sctp: stack-out-of-bounds in sctp_getsockopt

2016-03-22 Thread Eric Dumazet
On Tue, 2016-03-22 at 23:08 +0800, Baozeng Ding wrote:
> Hi all,
> 
> The following program triggers an out-of-bounds bug in
> sctp_getsockopt. The kernel version is 4.5 (on Mar 16
> commit 09fd671ccb2475436bd5f597f751ca4a7d177aea). 
> 
> ==
> BUG: KASAN: stack-out-of-bounds in string+0x1ef/0x200 at addr
> 88003ae679e0
> Read of size 1 by task syz-executor/19753
> page:eaeb99c0 count:0 mapcount:0 mapping:  (null)
> index:0x0
> flags: 0x1fffc00()
> page dumped because: kasan: bad access detected
> CPU: 3 PID: 19753 Comm: syz-executor Not tainted 4.5.0+ #8
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
>  0003 88003ae67578 82945051 88003ae67608
>  88003ae679e0 0096 dc00 88003ae675f8
>  81709f88 030d  0286
> Call Trace:
>  [< inline >] __dump_stack lib/dump_stack.c:15
>  [] dump_stack+0xb3/0x112 lib/dump_stack.c:51
>  [< inline >] print_address_description mm/kasan/report.c:150
>  [] kasan_report_error+0x4f8/0x530 mm/kasan/report.c:236
>  [] ? __lock_acquire+0x15fb/0x5dd0 
> kernel/locking/lockdep.c:3226
>  [< inline >] kasan_report mm/kasan/report.c:259
>  [] __asan_report_load1_noabort+0x3e/0x40 
> mm/kasan/report.c:277
>  [] ? string+0x1ef/0x200 lib/vsprintf.c:591
>  [] string+0x1ef/0x200 lib/vsprintf.c:591
>  [] vsnprintf+0xb83/0x1900 lib/vsprintf.c:2049
>  [] ? pointer+0xab0/0xab0 lib/vsprintf.c:1584
>  [] __request_module+0x132/0x6b0 kernel/kmod.c:146
>  [] ? mark_held_locks+0xd0/0x130 
> kernel/locking/lockdep.c:2552
>  [] ? call_usermodehelper_setup+0x2b0/0x2b0 
> kernel/kmod.c:530
>  [] ? mutex_lock_interruptible_nested+0x980/0x980
>  [] ? __might_fault+0xe4/0x1d0 mm/memory.c:3833
>  [] find_inlist_lock.constprop.17+0x10c/0x210 
> net/bridge/netfilter/ebtables.c:347
>  [< inline >] find_table_lock net/bridge/netfilter/ebtables.c:356
>  [] do_ebt_get_ctl+0x13b/0x540 
> net/bridge/netfilter/ebtables.c:1524
>  [] ? copy_everything_to_user+0x600/0x600 
> net/bridge/netfilter/ebtables.c:1455
>  [< inline >] ? __mutex_unlock_common_slowpath 
> kernel/locking/mutex.c:751
>  [] ? __mutex_unlock_slowpath+0x239/0x3f0 
> kernel/locking/mutex.c:762
>  [] ? mutex_unlock+0x9/0x10 kernel/locking/mutex.c:437
>  [] ? nf_sockopt_find+0x1a6/0x220 
> net/netfilter/nf_sockopt.c:87
>  [< inline >] nf_sockopt net/netfilter/nf_sockopt.c:103
>  [] nf_getsockopt+0x6d/0xc0 net/netfilter/nf_sockopt.c:121
>  [] ip_getsockopt+0x135/0x190 net/ipv4/ip_sockglue.c:1523
>  [] ? do_ip_getsockopt+0x1520/0x1520 
> net/ipv4/ip_sockglue.c:1353
>  [< inline >] ? wake_up_process kernel/sched/core.c:2024
>  [] ? wake_up_q+0x82/0xe0 kernel/sched/core.c:416
>  [< inline >] ? atomic_dec_and_test /arch/x86/include/asm/atomic.h:117
>  [< inline >] ? mmdrop include/linux/sched.h:2611
>  [] ? drop_futex_key_refs.isra.13+0x70/0xe0 
> kernel/futex.c:444
>  [] sctp_getsockopt+0x18d/0x3f40 net/sctp/socket.c:5964
>  [] ? __lock_acquire+0x15fb/0x5dd0 
> kernel/locking/lockdep.c:3226
>  [] ? sctp_do_peeloff+0x2b0/0x2b0 net/sctp/socket.c:4434
>  [] ? debug_check_no_locks_freed+0x290/0x290 
> kernel/locking/lockdep.c:4104
>  [< inline >] ? rcu_read_unlock include/linux/rcupdate.h:922
>  [] ? __fget+0x20c/0x3b0 fs/file.c:712
>  [< inline >] ? rcu_lock_release include/linux/rcupdate.h:491
>  [< inline >] ? rcu_read_unlock include/linux/rcupdate.h:926
>  [] ? __fget+0x235/0x3b0 fs/file.c:712
>  [] ? __fget+0x47/0x3b0 fs/file.c:696
>  [] ? __fget_light+0xa1/0x1f0 fs/file.c:759
>  [] sock_common_getsockopt+0x95/0xd0 net/core/sock.c:2579
>  [< inline >] SYSC_getsockopt net/socket.c:1783
>  [] SyS_getsockopt+0x142/0x230 net/socket.c:1765
>  [] ? SyS_setsockopt+0x240/0x240 net/socket.c:1752
>  [] ? entry_SYSCALL_64_fastpath+0x5/0xc1 
> arch/x86/entry/entry_64.S:191
>  [] ? trace_hardirqs_on_thunk+0x17/0x19 
> arch/x86/entry/thunk_64.S:39
>  [] entry_SYSCALL_64_fastpath+0x23/0xc1 
> arch/x86/entry/entry_64.S:207
> Memory state around the buggy address:
>  88003ae67880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  88003ae67900: 00 f1 f1 f1 f1 04 f4 f4 f4 f2 f2 f2 f2 00 00 00
> >88003ae67980: 00 00 00 00 00 00 00 00 00 00 00 00 f4 f3 f3 f3
>^
>  88003ae67a00: f3 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
>  88003ae67a80: f1 f1 f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00 00 00 00
> ==
> 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> #include 
> 
> int main()
> {
> int sock = 0;
> int sock_dup = 0;
> mmap((void *)0x2000ul, 0x5000ul, PROT_READ|PROT_WRITE,
> MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
> sock 

Re: [PATCH] lan78xx: Protect runtime_auto check by #ifdef CONFIG_PM

2016-03-22 Thread Alan Stern
On Tue, 22 Mar 2016, Oliver Neukum wrote:

> On Tue, 2016-03-22 at 10:21 -0400, Alan Stern wrote:
> > I don't see any point in resuming the device just in order to collect 
> > operating statistics.  If it was already suspended then it wasn't 
> > operating, so there will be no statistics to collect.
> 
> Indeed. In that case the point is moot. But it is correct to ask
> the core whether the device is autosuspended at that point rather
> than keep a private flag if you can.

That's why we have pm_runtime_status_suspended().

> All that is relevant only if the upper layers ask for information
> that the driver cannot provide without resuming the device.
> Those are fundamentally different issues.

Of course.

Alan Stern



net/sctp: stack-out-of-bounds in sctp_getsockopt

2016-03-22 Thread Baozeng Ding
Hi all,

The following program triggers an out-of-bounds bug in
sctp_getsockopt. The kernel version is 4.5 (on Mar 16
commit 09fd671ccb2475436bd5f597f751ca4a7d177aea). 

==
BUG: KASAN: stack-out-of-bounds in string+0x1ef/0x200 at addr
88003ae679e0
Read of size 1 by task syz-executor/19753
page:eaeb99c0 count:0 mapcount:0 mapping:  (null)
index:0x0
flags: 0x1fffc00()
page dumped because: kasan: bad access detected
CPU: 3 PID: 19753 Comm: syz-executor Not tainted 4.5.0+ #8
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
rel-1.8.2-0-g33fbe13 by qemu-project.org 04/01/2014
 0003 88003ae67578 82945051 88003ae67608
 88003ae679e0 0096 dc00 88003ae675f8
 81709f88 030d  0286
Call Trace:
 [< inline >] __dump_stack lib/dump_stack.c:15
 [] dump_stack+0xb3/0x112 lib/dump_stack.c:51
 [< inline >] print_address_description mm/kasan/report.c:150
 [] kasan_report_error+0x4f8/0x530 mm/kasan/report.c:236
 [] ? __lock_acquire+0x15fb/0x5dd0 
kernel/locking/lockdep.c:3226
 [< inline >] kasan_report mm/kasan/report.c:259
 [] __asan_report_load1_noabort+0x3e/0x40 
mm/kasan/report.c:277
 [] ? string+0x1ef/0x200 lib/vsprintf.c:591
 [] string+0x1ef/0x200 lib/vsprintf.c:591
 [] vsnprintf+0xb83/0x1900 lib/vsprintf.c:2049
 [] ? pointer+0xab0/0xab0 lib/vsprintf.c:1584
 [] __request_module+0x132/0x6b0 kernel/kmod.c:146
 [] ? mark_held_locks+0xd0/0x130 kernel/locking/lockdep.c:2552
 [] ? call_usermodehelper_setup+0x2b0/0x2b0 kernel/kmod.c:530
 [] ? mutex_lock_interruptible_nested+0x980/0x980
 [] ? __might_fault+0xe4/0x1d0 mm/memory.c:3833
 [] find_inlist_lock.constprop.17+0x10c/0x210 
net/bridge/netfilter/ebtables.c:347
 [< inline >] find_table_lock net/bridge/netfilter/ebtables.c:356
 [] do_ebt_get_ctl+0x13b/0x540 
net/bridge/netfilter/ebtables.c:1524
 [] ? copy_everything_to_user+0x600/0x600 
net/bridge/netfilter/ebtables.c:1455
 [< inline >] ? __mutex_unlock_common_slowpath 
kernel/locking/mutex.c:751
 [] ? __mutex_unlock_slowpath+0x239/0x3f0 
kernel/locking/mutex.c:762
 [] ? mutex_unlock+0x9/0x10 kernel/locking/mutex.c:437
 [] ? nf_sockopt_find+0x1a6/0x220 
net/netfilter/nf_sockopt.c:87
 [< inline >] nf_sockopt net/netfilter/nf_sockopt.c:103
 [] nf_getsockopt+0x6d/0xc0 net/netfilter/nf_sockopt.c:121
 [] ip_getsockopt+0x135/0x190 net/ipv4/ip_sockglue.c:1523
 [] ? do_ip_getsockopt+0x1520/0x1520 
net/ipv4/ip_sockglue.c:1353
 [< inline >] ? wake_up_process kernel/sched/core.c:2024
 [] ? wake_up_q+0x82/0xe0 kernel/sched/core.c:416
 [< inline >] ? atomic_dec_and_test /arch/x86/include/asm/atomic.h:117
 [< inline >] ? mmdrop include/linux/sched.h:2611
 [] ? drop_futex_key_refs.isra.13+0x70/0xe0 kernel/futex.c:444
 [] sctp_getsockopt+0x18d/0x3f40 net/sctp/socket.c:5964
 [] ? __lock_acquire+0x15fb/0x5dd0 
kernel/locking/lockdep.c:3226
 [] ? sctp_do_peeloff+0x2b0/0x2b0 net/sctp/socket.c:4434
 [] ? debug_check_no_locks_freed+0x290/0x290 
kernel/locking/lockdep.c:4104
 [< inline >] ? rcu_read_unlock include/linux/rcupdate.h:922
 [] ? __fget+0x20c/0x3b0 fs/file.c:712
 [< inline >] ? rcu_lock_release include/linux/rcupdate.h:491
 [< inline >] ? rcu_read_unlock include/linux/rcupdate.h:926
 [] ? __fget+0x235/0x3b0 fs/file.c:712
 [] ? __fget+0x47/0x3b0 fs/file.c:696
 [] ? __fget_light+0xa1/0x1f0 fs/file.c:759
 [] sock_common_getsockopt+0x95/0xd0 net/core/sock.c:2579
 [< inline >] SYSC_getsockopt net/socket.c:1783
 [] SyS_getsockopt+0x142/0x230 net/socket.c:1765
 [] ? SyS_setsockopt+0x240/0x240 net/socket.c:1752
 [] ? entry_SYSCALL_64_fastpath+0x5/0xc1 
arch/x86/entry/entry_64.S:191
 [] ? trace_hardirqs_on_thunk+0x17/0x19 
arch/x86/entry/thunk_64.S:39
 [] entry_SYSCALL_64_fastpath+0x23/0xc1 
arch/x86/entry/entry_64.S:207
Memory state around the buggy address:
 88003ae67880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 88003ae67900: 00 f1 f1 f1 f1 04 f4 f4 f4 f2 f2 f2 f2 00 00 00
>88003ae67980: 00 00 00 00 00 00 00 00 00 00 00 00 f4 f3 f3 f3
   ^
 88003ae67a00: f3 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
 88003ae67a80: f1 f1 f1 f1 04 f4 f4 f4 f3 f3 f3 f3 00 00 00 00
==

#include 
#include 
#include 
#include 
#include 
#include 
#include 

int main()
{
int sock = 0;
int sock_dup = 0;
mmap((void *)0x2000ul, 0x5000ul, PROT_READ|PROT_WRITE,
MAP_PRIVATE|MAP_FIXED|MAP_ANONYMOUS, -1, 0);
sock = socket(AF_INET, SOCK_STREAM, IPPROTO_SCTP);
sock_dup = dup(sock);

memcpy((void*)0x2bf3,"\xac\x71\x93\x68\x02\xb3\xd1\x86\x52\xf1\xf0\x18\x09\x56\xc6\x98\x6f\x8e\x74\xb7\x17\xd4\x3a\x64\x51\x68\x13\x2d\x25\xba\x6d\x3f\x74\x68\x84\x89\x04\xd1\xa6\xe2\x7d\xaf\xfa\

Re: [PATCH] net: phy: at803x: don't depend on GPIOLIB

2016-03-22 Thread Sebastian Frias
Hi,

On 03/21/2016 09:41 PM, Uwe Kleine-König wrote:
>>My patch basically gets rid of all this code. The thing that worries me
>> is that the driver assumes that the reset signal is active low, despite what
>> the GPIO specifier in the device tree has for the GPIO polarity. In fact, it
>> will only work correctly if the specifier has GPIO_ACTIVE_HIGH -- which is
>> wrong because the reset signal is active low!
> 
> Note that gpio descriptors handle the polarity just fine (i.e. the pin
> is set to 0 after doing gpiod_set_value(1) if the gpio is active low).
> 

Isn't that a source of bugs?
What about using some #define (or probably better, an enum)?, something
like:

gpiod_set_value(gpiod, GPIO_SET_VALUE_ACTIVE)
gpiod_set_value(gpiod, GPIO_SET_VALUE_INACTIVE)
gpiod_set_value(gpiod, GPIO_SET_VALUE_TRISTATE)

then, somebody reading the code would have to stop and think about what
these mean.
IIUC, currently the "0" or "1" can easily be confused with the actual
logical value of the GPIO.

gpiod_set_value() could also return an int with the actual value it
applied to the GPIO.
For example: if gpiod is active low, gpiod_set_value(gpiod,
GPIO_SET_VALUE_ACTIVE) would return 0;
Conversely, if gpiod is active high, gpiod_set_value(gpiod,
GPIO_SET_VALUE_ACTIVE) would return 1;

Best regards,

Sebastian

> But having said that, the driver gets it wrong.
> 
> The right sequence to reset a device using a gpio is:
> 
>   gpiod_set_value(priv->gpiod_reset, 1);
>   msleep(some_time);
>   gpiod_set_value(priv->gpiod_reset, 0);
> 
> and if the gpio is active low, this should be specified in the device
> tree. This was done wrong in 13a56b449325 (net: phy: at803x: Add support
> for hardware reset).
> 
> Best regards
> Uwe
> 


[iproute PATCH v2 8/8] man: tc-vlan.8: Describe CONTROL option

2016-03-22 Thread Phil Sutter
This should be made generic and part of a common tc-actions man page.
Though leave it here for now to not confuse readers of the example which
uses it.

Signed-off-by: Phil Sutter 
---
 man/man8/tc-vlan.8 | 56 +-
 1 file changed, 55 insertions(+), 1 deletion(-)

diff --git a/man/man8/tc-vlan.8 b/man/man8/tc-vlan.8
index e650b72d3b395..4bfd72b129aaf 100644
--- a/man/man8/tc-vlan.8
+++ b/man/man8/tc-vlan.8
@@ -6,13 +6,17 @@ vlan - vlan manipulation module
 .in +8
 .ti -8
 .BR tc " ... " "action vlan" " { " pop " |"
-.IR PUSH " }"
+.IR PUSH " } [ " CONTROL " ]"
 
 .ti -8
 .IR PUSH " := "
 .BR push " [ " protocol
 .IR VLANPROTO " ]"
 .BI id " VLANID"
+
+.ti -8
+.IR CONTROL " := { "
+.BR reclassify " | " pipe " | " drop " | " continue " | " pass " }"
 .SH DESCRIPTION
 The
 .B vlan
@@ -50,5 +54,55 @@ for hexadecimal interpretation, etc.).
 .BI protocol " VLANPROTO"
 Choose the VLAN protocol to use. At the time of writing, the kernel accepts 
only
 .BR 802.1Q " or " 802.1ad .
+.TP
+.I CONTROL
+How to continue after executing this action.
+.RS
+.TP
+.B reclassify
+Restarts classification by jumping back to the first filter attached to this
+action's parent.
+.TP
+.B pipe
+Continue with the next action, this is the default.
+.TP
+.B drop
+Packet will be dropped without running further actions.
+.TP
+.B continue
+Continue classification with next filter in line.
+.TP
+.B pass
+Return to calling qdisc for packet processing. This ends the classification
+process.
+.RE
+.SH EXAMPLES
+The following example encapsulates incoming ICMP packets on eth0 from 10.0.0.2
+into VLAN ID 123:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle : ingress
+#tc filter add dev eth0 parent : pref 11 protocol ip \\
+   u32 match ip protocol 1 0xff flowid 1:1 \\
+   u32 match ip src 10.0.0.2 flowid 1:1 \\
+   action vlan push id 123
+.EE
+.RE
+
+Here is an example of the
+.B pop
+function: Incoming VLAN packets on eth0 are decapsulated and the classification
+process then restarted for the plain packet:
+
+.RS
+.EX
+#tc qdisc add dev eth0 handle : ingress
+#tc filter add dev $ETH parent : pref 1 protocol 802.1Q \\
+   u32 match u32 0 0 flowid 1:1 \\
+   action vlan pop reclassify
+.EE
+.RE
+
 .SH SEE ALSO
 .BR tc (8)
-- 
2.7.2



[iproute PATCH v2 1/8] doc/tc-filters.tex: Drop overly subjective paragraphs

2016-03-22 Thread Phil Sutter
Cc: Alexei Starovoitov 
Signed-off-by: Phil Sutter 
---
 doc/tc-filters.tex | 23 ---
 1 file changed, 4 insertions(+), 19 deletions(-)

diff --git a/doc/tc-filters.tex b/doc/tc-filters.tex
index 59127d6672ed7..54cc0c9920ce2 100644
--- a/doc/tc-filters.tex
+++ b/doc/tc-filters.tex
@@ -18,10 +18,6 @@
 \date{January 2016}
 \maketitle
 
-TC, the Traffic Control utility, has been there for a very long time - forever
-in my humble perception. It is still (and has ever been if I'm not mistaken) 
the
-only tool to configure QoS in Linux.
-
 Standard practice when transmitting packets over a medium which may block (due
 to congestion, e.g.) is to use a queue which temporarily holds these packets. 
In
 Linux, this queueing approach is where QoS happens: A Queueing Discipline
@@ -496,21 +492,10 @@ kernel itself doesn't.
 
 \section*{Conclusion}
 
-My personal impression is that although the \cmd{tc} utility is an absolute
-necessity for anyone aiming at doing QoS in Linux professionally, there are way
-too many loose ends and trip wires present in it's environment. Contributing to
-this is the fact, that much of the non-essential functionality is redundantly
-available in netfilter. Another problem which adds weight to the first one is a
-general lack of documentation. Of course, there are many HOWTOs and guides in
-the internet, but since it's often not clear how up to date these are, I prefer
-the usual resources such as man or info pages. Surely nothing one couldn't fix
-in hindsight, but quality certainly suffers if the original author of the code
-does not or can not contribute to that.
-
-All that being said, once the steep learning curve has been mastered, the
-conglomerate of (classful) qdiscs, filters and actions provides a highly
-sophisticated and flexible infrastructure to perform QoS, which plays nicely
-along with routing and firewalling setups.
+Once the steep learning curve has been mastered, the conglomerate of (classful)
+qdiscs, filters and actions provides a highly sophisticated and flexible
+infrastructure to perform QoS, which plays nicely along with routing and
+firewalling setups.
 
 
 \section*{Further Reading}
-- 
2.7.2



[iproute PATCH v2 6/8] man: tc-skbedit.8: Elaborate a bit on TX queues

2016-03-22 Thread Phil Sutter
Signed-off-by: Phil Sutter 
---
 man/man8/tc-skbedit.8 | 12 
 1 file changed, 12 insertions(+)

diff --git a/man/man8/tc-skbedit.8 b/man/man8/tc-skbedit.8
index b585a4d4253ba..e6902960eee27 100644
--- a/man/man8/tc-skbedit.8
+++ b/man/man8/tc-skbedit.8
@@ -17,6 +17,18 @@ The
 action allows to change a packet's associated meta data. It complements the
 .B pedit
 action, which in turn allows to change parts of the packet data itself.
+
+The most unique feature of
+.B skbedit
+is it's ability to decide over which queue of an interface with multiple
+transmit queues the packet is to be sent out. The number of available transmit
+queues is reflected by sysfs entries within
+.I /sys/class/net//queues
+with name
+.I tx-N
+(where
+.I N
+is the actual queue number).
 .SH OPTIONS
 .TP
 .BI queue_mapping " QUEUE_MAPPING"
-- 
2.7.2



[iproute PATCH v2 4/8] man: tc-mirred.8: Reword man page a bit, add generic mirror example

2016-03-22 Thread Phil Sutter
Signed-off-by: Phil Sutter 
---
 man/man8/tc-mirred.8 | 26 +++---
 1 file changed, 19 insertions(+), 7 deletions(-)

diff --git a/man/man8/tc-mirred.8 b/man/man8/tc-mirred.8
index 52d98bc416563..bba96e0e5d8c0 100644
--- a/man/man8/tc-mirred.8
+++ b/man/man8/tc-mirred.8
@@ -21,11 +21,9 @@ mirred - mirror/redirect action
 .SH DESCRIPTION
 The
 .B mirred
-action allows to redirect or mirror packets to another network interface on the
-same system. It is typically used in combination with the
-.B ifb
-pseudo device to create a shrared instance where QoS happens, but serves well
-for debugging or monitoring purposes, too.
+action allows packet mirroring (copying) or redirecting (stealing) the packet 
it
+receives. Mirroring is what is sometimes referred to as Switch Port Analyzer
+(SPAN) and is commonly used to analyze and/or debug flows.
 .SH OPTIONS
 .TP
 .B ingress
@@ -67,9 +65,23 @@ debugging purposes:
 .EE
 .RE
 
-Use an
+Mirror all incoming ICMP packets on eth0 to a dummy interface for examination
+with e.g. tcpdump:
+
+.RS
+.EX
+# ip link add dummy0 type dummy
+# ip link set dummy0 up
+# tc qdisc add dev eth0 handle : ingress
+# tc filter add dev eth0 parent : protocol ip \\
+   u32 match ip protocol 1 0xff \\
+   action mirred egress mirror dev dummy0
+.EE
+.RE
+
+Using an
 .B ifb
-interface to send ingress traffic on eth0 through an instance of
+interface, it is possible to send ingress traffic through an instance of
 .BR sfq :
 
 .RS
-- 
2.7.2



[iproute PATCH v2 0/8] Follow-up to my action man pages series

2016-03-22 Thread Phil Sutter
The following patch series aims at addressing feedback provided by Jamal
and Alexei. Thanks a lot for your input!

Changes since v1:
- Rebased onto current master.
- Dropped some leftover TODO note from tc-skbedit.8

Phil Sutter (8):
  doc/tc-filters.tex: Drop overly subjective paragraphs
  tc: connmark, pedit: Rename BRANCH to CONTROL
  man: tc-csum.8: Add an example
  man: tc-mirred.8: Reword man page a bit, add generic mirror example
  man: tc-police.8: Emphasize on the two rate control mechanisms
  man: tc-skbedit.8: Elaborate a bit on TX queues
  tc/m_vlan.c: mention CONTROL option in help text
  man: tc-vlan.8: Describe CONTROL option

 doc/tc-filters.tex | 23 -
 man/man8/tc-connmark.8 |  6 +++---
 man/man8/tc-csum.8 | 15 ++
 man/man8/tc-mirred.8   | 26 ---
 man/man8/tc-pedit.8|  6 +++---
 man/man8/tc-police.8   | 29 ++
 man/man8/tc-skbedit.8  | 12 +++
 man/man8/tc-vlan.8 | 56 +-
 tc/m_connmark.c|  4 ++--
 tc/m_pedit.c   |  4 ++--
 tc/m_vlan.c|  3 ++-
 11 files changed, 142 insertions(+), 42 deletions(-)

-- 
2.7.2



[iproute PATCH v2 2/8] tc: connmark, pedit: Rename BRANCH to CONTROL

2016-03-22 Thread Phil Sutter
As Jamal suggested, BRANCH is the wrong name, as these keywords go
beyond simple branch control - e.g. loops are possible, too. Therefore
rename the non-terminal to CONTROL instead which should be more
appropriate.

Signed-off-by: Phil Sutter 
---
 man/man8/tc-connmark.8 | 6 +++---
 man/man8/tc-pedit.8| 6 +++---
 tc/m_connmark.c| 4 ++--
 tc/m_pedit.c   | 4 ++--
 4 files changed, 10 insertions(+), 10 deletions(-)

diff --git a/man/man8/tc-connmark.8 b/man/man8/tc-connmark.8
index bb4cf7543dfdb..44f29f508d673 100644
--- a/man/man8/tc-connmark.8
+++ b/man/man8/tc-connmark.8
@@ -6,12 +6,12 @@ connmark - netfilter connmark retriever action
 .in +8
 .ti -8
 .BR tc " ... " "action connmark " [ " zone"
-.IR u16_zone_index " ] [ " BRANCH " ] ["
+.IR u16_zone_index " ] [ " CONTROL " ] ["
 .BI index " u32_index "
 ]
 
 .ti -8
-.IR BRANCH " := { " reclassify " | " pipe " | " drop " | " continue " | " ok " 
}"
+.IR CONTROL " := { " reclassify " | " pipe " | " drop " | " continue " | " ok 
" }"
 .SH DESCRIPTION
 The connmark action is used to restore the connection's mark value into the
 packet's fwmark.
@@ -22,7 +22,7 @@ Specify the conntrack zone when doing conntrack lookups for 
packets.
 .I u16_zone_index
 is a 16bit unsigned decimal value.
 .TP
-.I BRANCH
+.I CONTROL
 How to continue after executing this action.
 .RS
 .TP
diff --git a/man/man8/tc-pedit.8 b/man/man8/tc-pedit.8
index c30927ec50954..c34520c046a6c 100644
--- a/man/man8/tc-pedit.8
+++ b/man/man8/tc-pedit.8
@@ -6,7 +6,7 @@ pedit - generic packet editor action
 .in +8
 .ti -8
 .BR tc " ... " "action pedit munge " {
-.IR RAW_OP " | " LAYERED_OP " } [ " BRANCH " ]"
+.IR RAW_OP " | " LAYERED_OP " } [ " CONTROL " ]"
 
 .ti -8
 .IR RAW_OP " := "
@@ -45,7 +45,7 @@ pedit - generic packet editor action
 .IR RVAL " ]"
 
 .ti -8
-.IR BRANCH " := {"
+.IR CONTROL " := {"
 .BR reclassify " | " pipe " | " drop " | " shot " | " continue " | " pass " }"
 .SH DESCRIPTION
 The
@@ -165,7 +165,7 @@ This optional extra part of
 .I CMD_SPEC
 allows to exclude bits from being changed.
 .TP
-.I BRANCH
+.I CONTROL
 The following keywords allow to control how the tree of qdisc, classes,
 filters and actions is further traversed after this action.
 .RS
diff --git a/tc/m_connmark.c b/tc/m_connmark.c
index 2414f321c1f8f..b1c7d3af54cc4 100644
--- a/tc/m_connmark.c
+++ b/tc/m_connmark.c
@@ -27,10 +27,10 @@
 static void
 explain(void)
 {
-   fprintf(stderr, "Usage: ... connmark [zone ZONE] [BRANCH] [index 
]\n");
+   fprintf(stderr, "Usage: ... connmark [zone ZONE] [CONTROL] [index 
]\n");
fprintf(stderr, "where :\n"
"\tZONE is the conntrack zone\n"
-   "\tBRANCH := reclassify|pipe|drop|continue|ok\n");
+   "\tCONTROL := reclassify|pipe|drop|continue|ok\n");
 }
 
 static void
diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index ca78a83dd9d9d..9fe1a7ae3b90c 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -35,7 +35,7 @@ static int pedit_debug;
 static void
 explain(void)
 {
-   fprintf(stderr, "Usage: ... pedit munge  []\n");
+   fprintf(stderr, "Usage: ... pedit munge  [CONTROL]\n");
fprintf(stderr,
"Where: MUNGE := |\n"
"\t:= [ATC]\n \t\tOFFSETC:= offset  
\n "
@@ -43,7 +43,7 @@ explain(void)
"\t\tNOTE: maskval is a 32 bit hex number\n \t\tNOTE: shiftval 
is a is a shift value\n "
"\t\tCMD:= clear | invert | set | retain\n 
\t:= ip  | ip6  \n "
" \t\t| udp  | tcp  | icmp \n"
-   "\t:= reclassify | pipe | drop | continue | pass\n"
+   "\tCONTROL:= reclassify | pipe | drop | continue | pass\n"
"For Example usage look at the examples directory\n");
 
 }
-- 
2.7.2



[iproute PATCH v2 7/8] tc/m_vlan.c: mention CONTROL option in help text

2016-03-22 Thread Phil Sutter
Signed-off-by: Phil Sutter 
---
 tc/m_vlan.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/tc/m_vlan.c b/tc/m_vlan.c
index 8d97963f351bc..3233d207375bf 100644
--- a/tc/m_vlan.c
+++ b/tc/m_vlan.c
@@ -22,9 +22,10 @@
 static void explain(void)
 {
fprintf(stderr, "Usage: vlan pop\n");
-   fprintf(stderr, "   vlan push [ protocol VLANPROTO ] id VLANID\n");
+   fprintf(stderr, "   vlan push [ protocol VLANPROTO ] id VLANID 
[CONTROL]\n");
fprintf(stderr, "   VLANPROTO is one of 802.1Q or 802.1AD\n");
fprintf(stderr, "with default: 802.1Q\n");
+   fprintf(stderr, "   CONTROL := reclassify | pipe | drop | continue 
| pass\n");
 }
 
 static void usage(void)
-- 
2.7.2



[iproute PATCH v2 5/8] man: tc-police.8: Emphasize on the two rate control mechanisms

2016-03-22 Thread Phil Sutter
As Jamal pointed out, there are two different approaches to bandwidth
measurement. Try to make this clear by separating them in synopsis and
also documenting the way to fine-tune avrate.

Signed-off-by: Phil Sutter 
---
 man/man8/tc-police.8 | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/man/man8/tc-police.8 b/man/man8/tc-police.8
index 2b1537ec52875..5c5a632335dc9 100644
--- a/man/man8/tc-police.8
+++ b/man/man8/tc-police.8
@@ -12,13 +12,21 @@ police - policing action
 .IR BYTES [\fB/ BYTES "] ] ["
 .BI peakrate " RATE"
 ] [
-.BI avrate " RATE"
-] [
 .BI overhead " BYTES"
 ] [
 .BI linklayer " TYPE"
 ] [
-.BI conform-exceed " EXCEEDACT\fR[\fB/\fIEXCEEDACT\fR]"
+.IR CONTROL " ]"
+
+.ti -8
+.BR tc " ... " filter " ... [ " estimator
+.IR "SAMPLE AVERAGE " ]
+.BR "action police avrate"
+.IR RATE " [ " CONTROL " ]"
+
+.ti -8
+.IR CONTROL " :="
+.BI conform-exceed " EXCEEDACT\fR[\fB/\fIEXCEEDACT"
 
 .ti -8
 .IR EXCEEDACT " := { "
@@ -27,7 +35,14 @@ police - policing action
 The
 .B police
 action allows to limit bandwidth of traffic matched by the filter it is
-attached to.
+attached to. Basically there are two different algorithms available to measure
+the packet rate: The first one uses an internal dual token bucket and is
+configured using the
+.BR rate ", " burst ", " mtu ", " peakrate ", " overhead " and " linklayer
+parameters. The second one uses an in-kernel sampling mechanism. It can be
+fine-tuned using the
+.B estimator
+filter parameter.
 .SH OPTIONS
 .TP
 .BI rate " RATE"
@@ -73,6 +88,12 @@ cell sizes, for
 .B ethernet
 no action is taken.
 .TP
+.BI estimator " SAMPLE AVERAGE"
+Fine-tune the in-kernel packet rate estimator.
+.IR SAMPLE " and " AVERAGE
+are time values and control the frequency in which samples are taken and over
+what timespan an average is built.
+.TP
 .BI conform-exceed " EXCEEDACT\fR[\fB/\fIEXCEEDACT\fR]"
 Define how to handle packets which exceed (and, if the second
 .I EXCEEDACT
-- 
2.7.2



[iproute PATCH v2 3/8] man: tc-csum.8: Add an example

2016-03-22 Thread Phil Sutter
Signed-off-by: Phil Sutter 
---
 man/man8/tc-csum.8 | 15 +++
 1 file changed, 15 insertions(+)

diff --git a/man/man8/tc-csum.8 b/man/man8/tc-csum.8
index 9d00aae346af0..3a64c82f09ba8 100644
--- a/man/man8/tc-csum.8
+++ b/man/man8/tc-csum.8
@@ -49,6 +49,21 @@ UDPLite header
 .TP
 .B SWEETS
 These are merely syntactic sugar and ignored internally.
+.SH EXAMPLES
+The following performs stateless NAT for incoming packets from 192.168.1.100 to
+new destination 18.52.86.120 (0x12345678 in hex). Assuming these are UDP
+packets, both IP and UDP checksums have to be recalculated:
+
+.RS
+.EX
+# tc qdisc add dev eth0 ingress handle :
+# tc filter add eth0 prio 1 protocol ip parent : \\
+   u32 match ip src 192.168.1.100/32 flowid :1 \\
+   action pedit munge ip dst set 0x12345678 pipe \\
+   csum ip and udp
+.EE
+.RE
+
 .SH SEE ALSO
 .BR tc (8),
 .BR tc-pedit (8)
-- 
2.7.2



Re: [PATCH] net: phy: at803x: don't depend on GPIOLIB

2016-03-22 Thread Sebastian Frias
Hi Sergei,

On 03/21/2016 09:15 PM, Sergei Shtylyov wrote:
> 
>Do you have the PHY that requires the GPIO reset workaround?

Unfortunately (or luckily :-) ) I don't have the faulty PHY, sorry.

Best regards,

Sebastian


Re: [PATCH] net: phy: at803x: don't depend on GPIOLIB

2016-03-22 Thread Sebastian Frias
Hi Daniel,

Would you mind commenting on this thread?
Uwe and I are in a sort of deadlock because we each have a different
understanding of the intention of your commit 13a56b449325.

Basically the questions are:
- What is the intention of 13a56b449325?
- Did you mean for "reset" to be mandatory for faulty PHYs?
Mandatory meaning that you do not want the driver to work without.
- What do you think about the dependency on GPIOLIB?
You did not introduce such a dependency with your change, so it may point
to "reset" not being mandatory.

Best regards,

Sebastian

On 03/18/2016 04:56 PM, Sebastian Frias wrote:
> Hi Uwe, Daniel,
> 
> On 03/18/2016 01:54 PM, Uwe Kleine-König wrote:
>> [expand cc a bit more]
>>
>> Hello,
>>
>> On Wed, Mar 16, 2016 at 06:25:59PM +0100, Sebastian Frias wrote:
>>> Commit 687908c2b649 ("net: phy: at803x: simplify using
>>> devm_gpiod_get_optional and its 4th argument") introduced a dependency
>>> on GPIOLIB that was not there before.
>>>
>>> This commit removes such dependency by checking the return code and
>>> comparing it against ENOSYS which is returned when GPIOLIB is not
>>> selected.
>>>
>>> Fixes: 687908c2b649 ("net: phy: at803x: simplify using 
>>> devm_gpiod_get_optional and its 4th argument")
>>>
>>> Signed-off-by: Sebastian Frias 
>>> ---
>>>  drivers/net/phy/at803x.c | 4 +++-
>>>  1 file changed, 3 insertions(+), 1 deletion(-)
>>>
>>> diff --git a/drivers/net/phy/at803x.c b/drivers/net/phy/at803x.c
>>> index 2174ec9..88b7ff3 100644
>>> --- a/drivers/net/phy/at803x.c
>>> +++ b/drivers/net/phy/at803x.c
>>> @@ -252,7 +252,9 @@ static int at803x_probe(struct phy_device *phydev)
>>> return -ENOMEM;
>>>
>>> gpiod_reset = devm_gpiod_get_optional(dev, "reset", GPIOD_OUT_HIGH);
>>> -   if (IS_ERR(gpiod_reset))
>>> +   if (PTR_ERR(gpiod_reset) == -ENOSYS)
>>> +   gpiod_reset = NULL;
>>> +   else if (IS_ERR(gpiod_reset))
>>
>> this isn't better either (IMHO it's worse, but maybe this is debatable
>> and also depends on your POV).
> 
> Well, from the code, the reset hack is only required when the PHY is
> ATH8030_PHY_ID, right?
> However, such change was introduced by Daniel Mack on commit 13a56b449325.
> Hopefully he can chime in and give his opinion on this.
> 
> Daniel, while on the subject, I have a question for you:
> 
> Change 2b8f2a28eac1 introduced "link_status_notify" callback which is
> called systematically on the PHY state machine.
> Any reason to make the call systematic as opposed to let say:
> 
>   if (phydev->state != old_state) {
>   if (phydev->drv->link_change_notify)
>   phydev->drv->link_change_notify(phydev);
>   }
> 
> (it does means that the callback would be called after the state machine
> processing though)
> 
>>
>> With 687908c2b649 I made kernels without GPIOLIB fail to bind this
>> device. I admit this is not maximally nice.
> 
> I see, that was not clear from the commit message, sorry.
> 
>>
>> Your change makes the driver bind in this case again and then the reset
>> gpio isn't handled at all which might result in a later and harder to
>> debug error.
>>
>> The better approach to fix your problem is: make the driver depend (or
>> force) on GPIOLIB. 
> 
> It was one of the solutions I had in mind, but:
> - since the dependency on GPIOLIB was not included on the patch
> - and given that the previous code had provision to work without GPIO
> I thought it was an overlook.
> 
>> Or alternatively, drop reset-handling from the
>> driver.
>>
>> From a driver perspecitive, it would be nice if devm_gpiod_get_optional
>> returned NULL iff the respective gpio isn't specified even with
>> GPIOLIB=n, but this isn't sensible either because it would result in
>> quite some gpiolib code to not being conditionally compiled on
>> CONFIG_GPIOLIB any more.
> 
> Let's say that was the case, what would the PHY code do?
> 
> I mean, it did not get a GPIO, whether it was because GPIOLIB=n or
> because there's no 'reset' GPIO attached
> Shall it fail? Shall it continue in a sort of degraded mode? Shall it warn?
> Because that's the real question here.
> 
> What would you think of making at803x_link_change_notify() print a
> message every time it should do a reset but does not has a way to do it?
> I can make such change to my patch if everybody agrees on it.
> By the way, in that case, can somebody suggest a way to print such warning?
> Would printk() be ok or should I use dev_dbg() ?
> 
> Best regards,
> 
> Sebastian
> 
>>
>> Best regards
>> Uwe
>>


Re: [PATCH] net: phy: at803x: don't depend on GPIOLIB

2016-03-22 Thread Sebastian Frias
Hi Uwe,

I think we are in a deadlock :-)
I'm going to reply inline below, but I will also send a different email
to Daniel with a small recap.
I think he should share the intent of the "reset" mechanism he
introduced, in particular if it is mandatory.


On 03/21/2016 09:12 PM, Uwe Kleine-König wrote:
>>
>> gpiod=NULL (because the key is not there) or gpiod=ERR (because
>> GPIOLIB=n + my patch) will result in the same behaviour, ie: driver
>> binds, but fails to reset.
> 
> Assuming the source for the hardware description is dt (the same
> argument applies if it's ACPI or something else).
> 
> The driver uses devm_gpiod_get_optional(..."reset"...). That means some
> hardware has a reset line, some don't. If a reset gpio specification is
> there, that means the hardware has such a signal and it seems that means
> it must not be ignored. 

The problem is all those "it seems" :-)
"it seems it must not be ignored" is also a hypothesis, considering the
driver is not refusing to work even if it detects that the faulty PHY
has not been provided with a reset line.
Also, further down the thread you acknowledge that's a possibility.

>If there is no reset gpio specification that
> means that driver has to assume there is no reset line. (In the real
> world there might be other reasons the reset line isn't in the device
> tree, but it's not in the scope of the driver to guess why it's missing.
> If it's not there the only sensible thing to assume for the driver is:
> There is no reset line.)

But that is not what the current code does: the current code does not
check whether the DT presents the "reset" property directly.
It assumes that GPIOLIB works and checks GPIOLIB return codes, to
indirectly (and incorrectly) guess if a given device has a "reset" property.
It has not been clearly explained anywhere that the "reset" property for
AT8030 is supposed to be mandatory and that the driver should fail if
not provided with one.
The code does not back up such hypothesis either, ie: GPIOLIB=y +
missing "reset" will still allow the driver to work.

> 
> So the conclusions are: If there is a reset line in dt, it must be used.
> If you don't know if there is a reset line (because GPIOLIB=n) the
> driver should not bind. Everything else yields to more problems than
> good.

a) "If there is a reset line in dt, it must be used": that is an
hypothesis (I would say "it can be used");
b) if it were true, then another way of knowing if the "reset" key is
present should be used;
indeed, there should be:
c) a way to unambiguously determine if the "reset" key is in the DT
(regardless of GPIOLIB status);
that in order to know if the DT is specifying that the "reset" mechanism
must be used, then:
d) iff the "reset" is absolutely necessary (ie: the faulty PHY is in use
AND the original intention of the hack was for it to be *mandatory* for
faulty PHYs), some decision can be taken;

However, right now those conditions are not met.

>>> With your patch and GPIOLIB=n you bind the driver even for the devices
>>> that need the hack. This is broken!
>>
>> It is a degraded mode I agree and had proposed to print a warning.
>> However, I fail to see how is GPIOLIB=n different from just not having
>> "reset" present.
> 
> GPIOLIB=n means "the driver doesn't know if a reset line is
> present/necessary", not having reset means "there is no reset line".

That's the problem, that the driver is relying on GPIOLIB to know if:
- "reset" key is present
- it should fail to bind or not

and it does that based solely on GPIOLIB return codes, thus indirectly.
As opposed to checking directly the presence of the "reset" key
(regardless of GPIOLIB status) and failing immediately.

> 
> And don't do error handling by printk assuming anyone will read it. That
> doesn't work.

I proposed a warning, not error handling.
Also if we assume people won't read the logs, why put any at all?
Some people may not read the logs, some others will, it is their choice
to bang their heads one way or another.
Currently there's no warning.
So I had to debug why the driver was not binding, and it is not obvious
that GPIOLIB must be selected, considering:
- I don't have a faulty PHY
- I don't have a "reset" key on DT
- Even if I wanted, I don't have a GPIO to connect to it

(also, even if this driver does not bind, some generic one would take over)

> 
>> The fact that GPIOLIB=y does not means that the "reset" key will be there.
> 
> Right, but with GPIOLIB=y you know if it's there, and if yes, you can
> control the line.

Exactly, and what I say is that knowing if "reset" is there or not
should not depend on having GPIOLIB=y

To me the problem comes from the aliasing between "GPIOLIB=y" and
"reset" present.
Indeed, the driver should query if the "reset" key is present
(regardless of GPIOLIB status), and then try to acquire the GPIO line.
If it cannot get the reset GPIO, then it can give up saying "you told me
I need a reset line but then you did not give me one".

> 
>> I mean, you assume that 

Re: [PATCH] lan78xx: Protect runtime_auto check by #ifdef CONFIG_PM

2016-03-22 Thread Oliver Neukum
On Tue, 2016-03-22 at 10:21 -0400, Alan Stern wrote:
> I don't see any point in resuming the device just in order to collect 
> operating statistics.  If it was already suspended then it wasn't 
> operating, so there will be no statistics to collect.

Indeed. In that case the point is moot. But it is correct to ask
the core whether the device is autosuspended at that point rather
than keep a private flag if you can.

All that is relevant only if the upper layers ask for information
that the driver cannot provide without resuming the device.
Those are fundamentally different issues.

Regards
Oliver




Re: [PATCH] lan78xx: Protect runtime_auto check by #ifdef CONFIG_PM

2016-03-22 Thread Alan Stern
On Tue, 22 Mar 2016, Oliver Neukum wrote:

> On Mon, 2016-03-21 at 15:30 -0400, Alan Stern wrote:
> > On Mon, 21 Mar 2016, Oliver Neukum wrote:
> > 
> 
> > > We have an autosuspend timeout because we think that IO, if it will
> > > come at all, is likeliest to come soon. If, however, the IO is
> > > periodic that heuristics is false.
> > > To save most power the driver must either decide that the interval
> > > is too short or suspend immediately. So if we are lucky enough
> > > to have the frequency in the kernel, we should use that information.
> > 
> > The autosuspend timeout is set by userspace.  The kernel may assign a
> 
> Thus it should apply to all IO originating in user space.
> But only to that IO.

Fair enough.

> > default value, but the user can always override it.  Given this, I 
> > don't see how the kernel can use frequency information (and I'm not 
> > sure where that information would come from in the first place).
> 
> It can ignore internal IO for the purpose of the timeout.
> If such IO is performed while the device is active, don't
> alter the timer.

Come to think of it, we don't.  If pm_runtime_get_sync() and then
pm_runtime_put() are called while the device is already at full power, 
the PM core doesn't update the last_busy time.  So if the driver 
doesn't update it either, the statistics collection won't interfere 
with autosuspend (except when it races with the autosuspend timer 
expiration).

> Otherwise resume the device and look at
> the provided hint and suspend again immediately if the period is long
> enough.

I don't see any point in resuming the device just in order to collect 
operating statistics.  If it was already suspended then it wasn't 
operating, so there will be no statistics to collect.

> If IO is generated periodically in the kernel, the kernel must know that
> period.

Alan Stern



[PATCH] fsl/fman: Workaround for Errata A-007273

2016-03-22 Thread igal.liberman
From: Igal Liberman 

Errata A-007273 (For FMan V3 devices only):
FMan soft reset is not finished properly if one
of the Ethernet MAC clocks is disabled

Workaround:
Re-enable all disabled MAC clocks through the DCFG_CCSR_DEVDISR2
register prior to issuing an FMAN soft reset.
Re-disable the MAC clocks after the FMAN soft reset is done.

Signed-off-by: Igal Liberman 
---
 drivers/net/ethernet/freescale/fman/fman.c |  104 +++-
 1 file changed, 88 insertions(+), 16 deletions(-)

diff --git a/drivers/net/ethernet/freescale/fman/fman.c 
b/drivers/net/ethernet/freescale/fman/fman.c
index 79a210a..ea83712 100644
--- a/drivers/net/ethernet/freescale/fman/fman.c
+++ b/drivers/net/ethernet/freescale/fman/fman.c
@@ -35,6 +35,7 @@
 #include "fman.h"
 #include "fman_muram.h"
 
+#include 
 #include 
 #include 
 #include 
@@ -1871,6 +1872,90 @@ err_fm_state:
return -EINVAL;
 }
 
+static int fman_reset(struct fman *fman)
+{
+   u32 count;
+   int err = 0;
+
+   if (fman->state->rev_info.major < 6) {
+   iowrite32be(FPM_RSTC_FM_RESET, &fman->fpm_regs->fm_rstc);
+   /* Wait for reset completion */
+   count = 100;
+   do {
+   udelay(1);
+   } while (((ioread32be(&fman->fpm_regs->fm_rstc)) &
+FPM_RSTC_FM_RESET) && --count);
+   if (count == 0)
+   err = -EBUSY;
+
+   goto _return;
+   } else {
+   struct device_node *guts_node;
+   struct ccsr_guts __iomem *guts_regs;
+   u32 devdisr2, reg;
+
+   /* Errata A007273 */
+   guts_node =
+   of_find_compatible_node(NULL, NULL,
+   "fsl,qoriq-device-config-2.0");
+   if (!guts_node) {
+   dev_err(fman->dev, "%s: Couldn't find guts node\n",
+   __func__);
+   goto guts_node;
+   }
+
+   guts_regs = of_iomap(guts_node, 0);
+   if (!guts_regs) {
+   dev_err(fman->dev, "%s: Couldn't map %s regs\n",
+   __func__, guts_node->full_name);
+   goto guts_regs;
+   }
+#define FMAN1_ALL_MACS_MASK0xFCC0
+#define FMAN2_ALL_MACS_MASK0x000FCC00
+   /* Read current state */
+   devdisr2 = ioread32be(&guts_regs->devdisr2);
+   if (fman->dts_params.id == 0)
+   reg = devdisr2 & ~FMAN1_ALL_MACS_MASK;
+   else
+   reg = devdisr2 & ~FMAN2_ALL_MACS_MASK;
+
+   /* Enable all MACs */
+   iowrite32be(reg, &guts_regs->devdisr2);
+
+   /* Perform FMan reset */
+   iowrite32be(FPM_RSTC_FM_RESET, &fman->fpm_regs->fm_rstc);
+
+   /* Wait for reset completion */
+   count = 100;
+   do {
+   udelay(1);
+   } while (((ioread32be(&fman->fpm_regs->fm_rstc)) &
+FPM_RSTC_FM_RESET) && --count);
+   if (count == 0) {
+   iounmap(guts_regs);
+   of_node_put(guts_node);
+   err = -EBUSY;
+   goto _return;
+   }
+
+   /* Restore devdisr2 value */
+   iowrite32be(devdisr2, &guts_regs->devdisr2);
+
+   iounmap(guts_regs);
+   of_node_put(guts_node);
+
+   goto _return;
+
+guts_regs:
+   of_node_put(guts_node);
+guts_node:
+   dev_dbg(fman->dev, "%s: Didn't perform FManV3 reset due to 
Errata A007273!\n",
+   __func__);
+   }
+_return:
+   return err;
+}
+
 static int fman_init(struct fman *fman)
 {
struct fman_cfg *cfg = NULL;
@@ -1914,22 +1999,9 @@ static int fman_init(struct fman *fman)
fman->liodn_base[i] = liodn_base;
}
 
-   /* FMan Reset (supported only for FMan V2) */
-   if (fman->state->rev_info.major >= 6) {
-   /* Errata A007273 */
-   dev_dbg(fman->dev, "%s: FManV3 reset is not supported!\n",
-   __func__);
-   } else {
-   iowrite32be(FPM_RSTC_FM_RESET, &fman->fpm_regs->fm_rstc);
-   /* Wait for reset completion */
-   count = 100;
-   do {
-   udelay(1);
-   } while (((ioread32be(&fman->fpm_regs->fm_rstc)) &
-FPM_RSTC_FM_RESET) && --count);
-   if (count == 0)
-   return -EBUSY;
-   }
+   err = fman_reset(fman);
+   if (err)
+   return err;
 
if (ioread32be(&fman->qmi_regs->fmqm_gs) & QMI_GS_HALT_NOT_BUSY) {
resume(fman->fpm_regs);
-- 
1.7.9.5



[iproute PATCH v2 3/4] tc: pedit: Fix raw op

2016-03-22 Thread Phil Sutter
The retain value was wrong for u16 and u8 types.

Signed-off-by: Phil Sutter 
---
 tc/m_pedit.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index 30a6f3673e896..8ccec8bc4a99f 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -339,12 +339,12 @@ parse_offset(int *argc_p, char ***argv_p, struct 
tc_pedit_sel *sel, struct tc_pe
}
if (matches(*argv, "u16") == 0) {
len = 2;
-   retain = 0x0;
+   retain = 0xffff;
goto done;
}
if (matches(*argv, "u8") == 0) {
len = 1;
-   retain = 0x0;
+   retain = 0xff;
goto done;
}
 
-- 
2.7.2



[iproute PATCH v2 4/4] testsuite: add a test for tc pedit action

2016-03-22 Thread Phil Sutter
This is not a full test, since kernel functionality is not actually
tested. It only compares that the kernel returned values when listing
the action are what one expects them to be.

Since this test succeeded on both a little-endian and a big-endian
system, it shows that any endianness issues have been resolved in
tc/p_ip.c at least.

Signed-off-by: Phil Sutter 
---
 testsuite/tests/tc/pedit.t | 217 +
 1 file changed, 217 insertions(+)
 create mode 100755 testsuite/tests/tc/pedit.t

diff --git a/testsuite/tests/tc/pedit.t b/testsuite/tests/tc/pedit.t
new file mode 100755
index 0..e9b6c333accbe
--- /dev/null
+++ b/testsuite/tests/tc/pedit.t
@@ -0,0 +1,217 @@
+#!/bin/sh
+
+source lib/generic.sh
+
+DEV="$(rand_dev)"
+ts_ip "$0" "Add $DEV dummy interface" link add dev $DEV type dummy
+ts_ip "$0" "Enable $DEV" link set $DEV up
+ts_tc "pedit" "Add ingress qdisc" qdisc add dev $DEV ingress
+
+
+do_pedit() {
+   ts_tc "pedit" "Drop ingress qdisc" \
+   qdisc del dev $DEV ingress
+   ts_tc "pedit" "Add ingress qdisc" \
+   qdisc add dev $DEV ingress
+   ts_tc "pedit" "Add pedit action $*" \
+   filter add dev $DEV parent : \
+   u32 match u32 0 0 \
+   action pedit munge $@
+   ts_tc "pedit" "Show ingress filters" \
+   filter show dev $DEV parent :
+}
+
+do_pedit offset 12 u32 set 0x12345678
+test_on "key #0  at 12: val 12345678 mask "
+do_pedit offset 12 u16 set 0x1234
+test_on "key #0  at 12: val 1234 mask "
+do_pedit offset 14 u16 set 0x1234
+test_on "key #0  at 12: val 1234 mask "
+do_pedit offset 12 u8 set 0x23
+test_on "key #0  at 12: val 2300 mask 00ff"
+do_pedit offset 13 u8 set 0x23
+test_on "key #0  at 12: val 0023 mask ff00"
+do_pedit offset 14 u8 set 0x23
+test_on "key #0  at 12: val 2300 mask 00ff"
+do_pedit offset 15 u8 set 0x23
+test_on "key #0  at 12: val 0023 mask ff00"
+
+do_pedit offset 13 u8 invert
+test_on "key #0  at 12: val 00ff mask "
+do_pedit offset 13 u8 clear
+test_on "key #0  at 12: val  mask ff00"
+do_pedit offset 13 u8 preserve
+test_on "key #0  at 12: val  mask "
+
+# the following set of tests has been auto-generated by running this little
+# shell script:
+#
+# do_it() {
+#  echo "do_pedit $@"
+#  tc qd del dev veth0 ingress >/dev/null 2>&1
+#  tc qd add dev veth0 ingress >/dev/null 2>&1
+#  tc filter add dev veth0 parent : u32 \
+#  match u32 0 0 \
+#  action pedit munge $@ >/dev/null 2>&1
+#  tc filter show dev veth0 parent : | \
+#  sed -n 's/^[\t ]*\(key #0.*\)/test_on "\1"/p'
+# }
+#
+# do_it_all() { # (field, val1 [, val2, ...])
+#  local field=$1
+#  shift
+#  for val in $@; do
+#  do_it ip $field set $val
+#  done
+#  for i in preserve invert clear; do
+#  do_it ip $field $i
+#  done
+# }
+#
+# do_it_all ihl 0x04 0x40
+# do_it_all src 1.2.3.4
+# do_it_all dst 1.2.3.4
+# do_it_all tos 0x1 0x10
+# do_it_all protocol 0x23
+# do_it_all nofrag 0x23 0xf4
+# do_it_all firstfrag 0x03 0xfa
+# do_it_all ce 0x23 0x04 0xf3
+# do_it_all df 0x23 0x04 0xf3
+# do_it_all mf 0x23 0x04 0xf3
+# do_it_all dport 0x1234
+# do_it_all sport 0x1234
+# do_it_all icmp_type 0x23
+# do_it_all icmp_code 0x23
+
+do_pedit ip ihl set 0x04
+test_on "key #0  at 0: val 0400 mask f0ff"
+do_pedit ip ihl set 0x40
+test_on "key #0  at 0: val  mask f0ff"
+do_pedit ip ihl preserve
+test_on "key #0  at 0: val  mask "
+do_pedit ip ihl invert
+test_on "key #0  at 0: val 0f00 mask "
+do_pedit ip ihl clear
+test_on "key #0  at 0: val  mask f0ff"
+do_pedit ip src set 1.2.3.4
+test_on "key #0  at 12: val 01020304 mask "
+do_pedit ip src preserve
+test_on "key #0  at 12: val  mask "
+do_pedit ip src invert
+test_on "key #0  at 12: val  mask "
+do_pedit ip src clear
+test_on "key #0  at 12: val  mask "
+do_pedit ip dst set 1.2.3.4
+test_on "key #0  at 16: val 01020304 mask "
+do_pedit ip dst preserve
+test_on "key #0  at 16: val  mask "
+do_pedit ip dst invert
+test_on "key #0  at 16: val  mask "
+do_pedit ip dst clear
+test_on "key #0  at 16: val  mask "
+do_pedit ip tos set 0x1
+test_on "key #0  at 0: val 0001 mask ff00"
+do_pedit ip tos set 0x10
+test_on "key #0  at 0: val 0010 mask ff00"
+do_pedit ip tos preserve
+test_on "key #0  at 0: val  mask "
+do_pedit ip tos invert
+test_on "key #0  at 0: val 00ff mask "
+do_pedit ip tos clear
+test_on "key #0  at 0: val  mask ff00"
+do_pedit ip protocol set 0x23
+test_on "key #0  at 8: val 0023 mask ff00"
+do_pedit ip protocol preserve
+test_on "key #0  at 8: val  mask "

[iproute PATCH v2 0/4] tc: pedit: further fixes

2016-03-22 Thread Phil Sutter
The following series was created after testing pedit on a big-endian
system. It starts with a minor patch fixing coding style in tc/p_ip.c,
then the actual big-endian fixup, a fix for raw op discovered when
writing the pedit test for testsuite and finally a testsuite addon to
verify pedit functionality.

Changes since v1:
- Rebased whole series onto current master branch.
- Replaced patch 1/4 with a smaller one containing only the rebase
  leftovers.

Phil Sutter (4):
  tc/p_ip.c: Minor coding style cleanup
  tc: pedit: Fix for big-endian systems
  tc: pedit: Fix raw op
  testsuite: add a test for tc pedit action

 tc/m_pedit.c   |  23 ++---
 tc/p_ip.c  |   8 +-
 testsuite/tests/tc/pedit.t | 217 +
 3 files changed, 235 insertions(+), 13 deletions(-)
 create mode 100755 testsuite/tests/tc/pedit.t

-- 
2.7.2



[iproute PATCH v2 2/4] tc: pedit: Fix for big-endian systems

2016-03-22 Thread Phil Sutter
This was tricky to get right:
- The 'stride' value used for 8 and 16 bit values must behave inverse to
  the value's intra word offset to work correctly with big-endian data
  act_pedit is editing.
- The 'm' array's values are in host byte order, so they have to be
  converted as well (and the ordering was just inverse, for some
  reason).
- The only sane way of getting this right is to manipulate value/mask in
  host byte order and convert the output.
- TIPV4 (i.e. 'munge ip src/dst') had its own pitfall: the address
  parser converts to network byte order automatically. This patch fixes
  this by converting it back before calling pack_key32, which is a hack
  but at least does not require to implement a completely separate code
  flow.

Signed-off-by: Phil Sutter 
---
 tc/m_pedit.c | 19 +++
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/tc/m_pedit.c b/tc/m_pedit.c
index ca78a83dd9d9d..30a6f3673e896 100644
--- a/tc/m_pedit.c
+++ b/tc/m_pedit.c
@@ -156,7 +156,7 @@ int
 pack_key16(__u32 retain, struct tc_pedit_sel *sel, struct tc_pedit_key *tkey)
 {
int ind, stride;
-   __u32 m[4] = {0x, 0xFFFF, 0x};
+   __u32 m[4] = {0x, 0xFFFF, 0x};
 
if (tkey->val > 0x || tkey->mask > 0x) {
fprintf(stderr, "pack_key16 bad value\n");
@@ -170,9 +170,9 @@ pack_key16(__u32 retain, struct tc_pedit_sel *sel, struct 
tc_pedit_key *tkey)
return -1;
}
 
-   stride = 8 * ind;
-   tkey->val = htons(tkey->val & retain) << stride;
-   tkey->mask = (htons(tkey->mask | ~retain) << stride) | m[ind];
+   stride = 8 * (2 - ind);
+   tkey->val = htonl((tkey->val & retain) << stride);
+   tkey->mask = htonl(((tkey->mask | ~retain) << stride) | m[ind]);
 
tkey->off &= ~3;
 
@@ -186,7 +186,7 @@ int
 pack_key8(__u32 retain, struct tc_pedit_sel *sel, struct tc_pedit_key *tkey)
 {
int ind, stride;
-   __u32 m[4] = {0xFF00, 0x00FF, 0xFF00, 0x00FF};
+   __u32 m[4] = {0x00FF, 0xFF00, 0x00FF, 0xFF00};
 
if (tkey->val > 0xFF || tkey->mask > 0xFF) {
fprintf(stderr, "pack_key8 bad value (val %x mask %x\n", 
tkey->val, tkey->mask);
@@ -195,9 +195,9 @@ pack_key8(__u32 retain, struct tc_pedit_sel *sel, struct 
tc_pedit_key *tkey)
 
ind = tkey->off & 3;
 
-   stride = 8 * ind;
-   tkey->val = (tkey->val & retain) << stride;
-   tkey->mask = ((tkey->mask | ~retain) << stride) | m[ind];
+   stride = 8 * (3 - ind);
+   tkey->val = htonl((tkey->val & retain) << stride);
+   tkey->mask = htonl(((tkey->mask | ~retain) << stride) | m[ind]);
 
tkey->off &= ~3;
 
@@ -283,6 +283,9 @@ parse_cmd(int *argc_p, char ***argv_p, __u32 len, int type, 
__u32 retain, struct
tkey->val = val;
tkey->mask = mask;
 
+   if (type == TIPV4)
+   tkey->val = ntohl(tkey->val);
+
if (len == 1) {
res = pack_key8(retain, sel, tkey);
goto done;
-- 
2.7.2



[iproute PATCH v2 1/4] tc/p_ip.c: Minor coding style cleanup

2016-03-22 Thread Phil Sutter
Break overlong function definitions and remove one extraneous
whitespace.

Signed-off-by: Phil Sutter 
---
 tc/p_ip.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/tc/p_ip.c b/tc/p_ip.c
index 53dfb2693d47e..535151e5d7668 100644
--- a/tc/p_ip.c
+++ b/tc/p_ip.c
@@ -24,7 +24,8 @@
 #include "m_pedit.h"
 
 static int
-parse_ip(int *argc_p, char ***argv_p, struct tc_pedit_sel *sel, struct 
tc_pedit_key *tkey)
+parse_ip(int *argc_p, char ***argv_p,
+struct tc_pedit_sel *sel, struct tc_pedit_key *tkey)
 {
int res = -1;
int argc = *argc_p;
@@ -52,7 +53,7 @@ parse_ip(int *argc_p, char ***argv_p, struct tc_pedit_sel 
*sel, struct tc_pedit_
if (strcmp(*argv, "tos") == 0 || matches(*argv, "dsfield") == 0) {
NEXT_ARG();
tkey->off = 1;
-   res = parse_cmd(&argc, &argv,  1, TU32, RU8, sel, tkey);
+   res = parse_cmd(&argc, &argv, 1, TU32, RU8, sel, tkey);
goto done;
}
if (strcmp(*argv, "ihl") == 0) {
@@ -139,7 +140,8 @@ done:
 }
 
 static int
-parse_ip6(int *argc_p, char ***argv_p, struct tc_pedit_sel *sel, struct 
tc_pedit_key *tkey)
+parse_ip6(int *argc_p, char ***argv_p,
+ struct tc_pedit_sel *sel, struct tc_pedit_key *tkey)
 {
int res = -1;
return res;
-- 
2.7.2



[PATCH v4 3/3] IB/hns: Add binding document for HiSilicon RoCE driver

2016-03-22 Thread Lijun Ou
This patch adds related DTS binding document for HiSilicon RoCE driver.

Signed-off-by: Lijun Ou 
Signed-off-by: Wei Hu(Xavier) 
---
 .../bindings/infiniband/hisilicon-hns-roce.txt | 107 +
 1 file changed, 107 insertions(+)
 create mode 100644 
Documentation/devicetree/bindings/infiniband/hisilicon-hns-roce.txt

diff --git 
a/Documentation/devicetree/bindings/infiniband/hisilicon-hns-roce.txt 
b/Documentation/devicetree/bindings/infiniband/hisilicon-hns-roce.txt
new file mode 100644
index 000..5180fef
--- /dev/null
+++ b/Documentation/devicetree/bindings/infiniband/hisilicon-hns-roce.txt
@@ -0,0 +1,107 @@
+HiSilicon RoCE DT description
+
+HiSilicon RoCE engine is a part of network subsystem.
+It works depending on other parts of the network subsystem, such as gmac and
+dsa fabric.
+
+Additional properties are described here:
+
+Required properties:
+- compatible: Should contain "hisilicon,hns-roce-v1".
+- reg: Physical base address of the roce driver and
+length of memory mapped region.
+- eth-handle: phandle, specifies a reference to a node
+representing a ethernet device.
+- dsaf-handle: phandle, specifies a reference to a node
+representing a dsaf device.
+- #address-cells: must be 2
+- #size-cells: must be 2
+Optional properties:
+- dma-coherent: Present if DMA operations are coherent.
+- interrupt-parent: the interrupt parent of this device.
+- interrupts: should contain 32 completion event irq,1 async event irq
+and 1 event overflow irq.
+- interrupt-names:should be one of 34 irqs for roce device
+  - roce_ce0_irq ~ roce_ce31_irq: 32 complete event irq
+  - roce_ae_irq: 1 async event irq
+  - roce_common_irq: named common exception warning irq
+Example:
+   infiniband@c400 {
+   compatible = "hisilicon,hns-roce-v1";
+   reg = <0x0 0xc400 0x0 0x10>;
+   dma-coherent;
+   eth-handle = <&eth2 &eth3 &eth4 &eth5 &eth6 &eth7>;
+   dsaf-handle = <&soc0_dsa>;
+   #address-cells = <2>;
+   #size-cells = <2>;
+   interrupt-parent = <&mbigen_dsa>;
+   interrupts = <722 1>,
+   <723 1>,
+   <724 1>,
+   <725 1>,
+   <726 1>,
+   <727 1>,
+   <728 1>,
+   <729 1>,
+   <730 1>,
+   <731 1>,
+   <732 1>,
+   <733 1>,
+   <734 1>,
+   <735 1>,
+   <736 1>,
+   <737 1>,
+   <738 1>,
+   <739 1>,
+   <740 1>,
+   <741 1>,
+   <742 1>,
+   <743 1>,
+   <744 1>,
+   <745 1>,
+   <746 1>,
+   <747 1>,
+   <748 1>,
+   <749 1>,
+   <750 1>,
+   <751 1>,
+   <752 1>,
+   <753 1>,
+   <785 1>,
+   <754 4>;
+
+   interrupt-names = "roce_ce0_irq",
+   "roce_ce1_irq",
+   "roce_ce2_irq",
+   "roce_ce3_irq",
+   "roce_ce4_irq",
+   "roce_ce5_irq",
+   "roce_ce6_irq",
+   "roce_ce7_irq",
+   "roce_ce8_irq",
+   "roce_ce9_irq",
+   "roce_ce10_irq",
+   "roce_ce11_irq",
+   "roce_ce12_irq",
+   "roce_ce13_irq",
+   "roce_ce14_irq",
+   "roce_ce15_irq",
+   "roce_ce16_irq",
+   "roce_ce17_irq",
+   "roce_ce18_irq",
+   "roce_ce19_irq",
+   "roc

[PATCH v4 0/3] IB/hns: Add HiSilicon RoCE driver

2016-03-22 Thread Lijun Ou
The HiSilicon Network Subsystem is a long term evolution IP which is
supposed to be used in HiSilicon ICT SoC. RoCE is a feature of hns.
The driver for HiSilicon RoCE engine is a platform driver.
The driver will support multiple versions of hns. Currently only "v1"
for hip06 SoC is supported.

Changes v3 -> v4:
1. modify roce.o into hns-roce.o in Makefile and Kconfig file.

Changes v2 -> v3:
1. modify the formats of RoCE driver code base v2 by the experts 
reviewing. also, it used kmalloc_array instead of kmalloc, used kcalloc
instead of kzalloc, when refer to memory allocation for array
2. remove some functions without use and unconnected macros
3. modify the binding document with RoCE DT base v2 which added interrupt-names
4. redesign the port_map and si_map in hns_dsaf_roce_reset
5. add HiSilicon RoCE driver maintainers introduction in MAINTAINERS document

Changes v1 -> v2:
1. modify the formats of roce driver code by the experts reviewing
2. modify the bindings file with roce dts. add the attribute named 
interrupt-names.
3. modify the way of defining port mode in hns_dsaf_main.c
4. move the Kconfig file into the hns directory and send it with roce

Lijun Ou (3):
  net: hns: Add reset function support for RoCE driver
  IB/hns: Add HiSilicon RoCE driver support
  IB/hns: Add binding document for HiSilicon RoCE driver

 .../bindings/infiniband/hisilicon-hns-roce.txt |  107 +
 MAINTAINERS|8 +
 drivers/infiniband/Kconfig |1 +
 drivers/infiniband/hw/Makefile |1 +
 drivers/infiniband/hw/hisilicon/hns/Kconfig|   10 +
 drivers/infiniband/hw/hisilicon/hns/Makefile   |9 +
 drivers/infiniband/hw/hisilicon/hns/hns_roce_ah.c  |  110 +
 .../infiniband/hw/hisilicon/hns/hns_roce_alloc.c   |  239 ++
 drivers/infiniband/hw/hisilicon/hns/hns_roce_cmd.c |  338 +++
 drivers/infiniband/hw/hisilicon/hns/hns_roce_cmd.h |   80 +
 .../infiniband/hw/hisilicon/hns/hns_roce_common.h  |  308 +++
 drivers/infiniband/hw/hisilicon/hns/hns_roce_cq.c  |  436 +++
 .../infiniband/hw/hisilicon/hns/hns_roce_device.h  |  794 ++
 drivers/infiniband/hw/hisilicon/hns/hns_roce_eq.c  |  758 ++
 drivers/infiniband/hw/hisilicon/hns/hns_roce_eq.h  |  132 +
 drivers/infiniband/hw/hisilicon/hns/hns_roce_icm.c |  578 
 drivers/infiniband/hw/hisilicon/hns/hns_roce_icm.h |  112 +
 .../infiniband/hw/hisilicon/hns/hns_roce_main.c| 1097 
 drivers/infiniband/hw/hisilicon/hns/hns_roce_mr.c  |  605 +
 drivers/infiniband/hw/hisilicon/hns/hns_roce_pd.c  |  124 +
 drivers/infiniband/hw/hisilicon/hns/hns_roce_qp.c  |  841 ++
 .../infiniband/hw/hisilicon/hns/hns_roce_user.h|   31 +
 .../infiniband/hw/hisilicon/hns/hns_roce_v1_hw.c   | 2832 
 .../infiniband/hw/hisilicon/hns/hns_roce_v1_hw.h   |  985 +++
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c |   84 +
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h |   30 +
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c |   62 +-
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h  |   13 +
 28 files changed, 10715 insertions(+), 10 deletions(-)
 create mode 100644 
Documentation/devicetree/bindings/infiniband/hisilicon-hns-roce.txt
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/Kconfig
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/Makefile
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_ah.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_alloc.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_cmd.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_cmd.h
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_common.h
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_cq.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_device.h
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_eq.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_eq.h
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_icm.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_icm.h
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_main.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_mr.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_pd.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_qp.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_user.h
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_v1_hw.c
 create mode 100644 drivers/infiniband/hw/hisilicon/hns/hns_roce_v1_hw.h

-- 
1.9.1



[PATCH v4 1/3] net: hns: Add reset function support for RoCE driver

2016-03-22 Thread Lijun Ou
It added reset function for RoCE driver. RoCE is a feature of hns.
In hip06 SoC, in RoCE reset process, it's needed to configure dsaf
channel reset, port and sl map info. Reset function of RoCE is
located in dsaf module, we only call it in RoCE driver when needed.

Signed-off-by: Lijun Ou 
Signed-off-by: Wei Hu(Xavier) 
Signed-off-by: Lisheng 
---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c | 84 ++
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h | 30 
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 62 +---
 drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h  | 13 
 4 files changed, 179 insertions(+), 10 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c 
b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
index 38fc5be..9370d46 100644
--- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
@@ -12,6 +12,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -2593,6 +2594,89 @@ static struct platform_driver g_dsaf_driver = {
 
 module_platform_driver(g_dsaf_driver);
 
+/**
+ * hns_dsaf_roce_reset - reset dsaf and roce
+ * @dsaf_fwnode: Pointer to framework node for the dsaf
+ * @val: 0 - request reset, 1 - drop reset
+ * return: 0 - success, negative - fail
+ */
+int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, u32 val)
+{
+   struct dsaf_device *dsaf_dev;
+   struct platform_device *pdev;
+   unsigned int mp;
+   unsigned int sl;
+   unsigned int credit;
+   int i;
+   const u32 port_map[DSAF_ROCE_CREDIT_CHN][DSAF_ROCE_CHAN_MODE_NUM] = {
+   {DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0},
+   {DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0},
+   {DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0},
+   {DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0},
+   {DSAF_ROCE_PORT_4, DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1},
+   {DSAF_ROCE_PORT_4, DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1},
+   {DSAF_ROCE_PORT_5, DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1},
+   {DSAF_ROCE_PORT_5, DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1},
+   };
+   const u32 sl_map[DSAF_ROCE_CREDIT_CHN][DSAF_ROCE_CHAN_MODE_NUM] = {
+   {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_0},
+   {DSAF_ROCE_SL_0, DSAF_ROCE_SL_1, DSAF_ROCE_SL_1},
+   {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_2},
+   {DSAF_ROCE_SL_0, DSAF_ROCE_SL_1, DSAF_ROCE_SL_3},
+   {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_0},
+   {DSAF_ROCE_SL_1, DSAF_ROCE_SL_1, DSAF_ROCE_SL_1},
+   {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_2},
+   {DSAF_ROCE_SL_1, DSAF_ROCE_SL_1, DSAF_ROCE_SL_3},
+   };
+
+   if (!is_of_node(dsaf_fwnode)) {
+   pr_err("Only support DT node!\n");
+   return -EINVAL;
+   }
+   pdev = of_find_device_by_node(to_of_node(dsaf_fwnode));
+   dsaf_dev = dev_get_drvdata(&pdev->dev);
+   if (AE_IS_VER1(dsaf_dev->dsaf_ver)) {
+   dev_err(dsaf_dev->dev, "%s v1 chip do not support roce!\n",
+   dsaf_dev->ae_dev.name);
+   return -ENODEV;
+   }
+
+   if (!val) {
+   /* Reset rocee-channels in dsaf and rocee */
+   hns_dsaf_srst_chns(dsaf_dev, 0x3f000, 0);
+   hns_dsaf_roce_srst(dsaf_dev, 0);
+   } else {
+   /* Configure dsaf tx roce correspond to port map and sl map */
+   mp = dsaf_read_dev(dsaf_dev, DSAF_ROCE_PORT_MAP_REG);
+   for (i = 0; i < DSAF_ROCE_CREDIT_CHN; i++)
+   dsaf_set_field(mp, 7 << i * 3, i * 3,
+  port_map[i][DSAF_ROCE_6PORT_MODE]);
+   dsaf_set_field(mp, 3 << i * 3, i * 3, 0);
+   dsaf_write_dev(dsaf_dev, DSAF_ROCE_PORT_MAP_REG, mp);
+
+   sl = dsaf_read_dev(dsaf_dev, DSAF_ROCE_SL_MAP_REG);
+   for (i = 0; i < DSAF_ROCE_CREDIT_CHN; i++)
+   dsaf_set_field(sl, 3 << i * 2, i * 2,
+  sl_map[i][DSAF_ROCE_6PORT_MODE]);
+   dsaf_write_dev(dsaf_dev, DSAF_ROCE_SL_MAP_REG, sl);
+
+   /* De-reset rocee-channels in dsaf and rocee */
+   hns_dsaf_srst_chns(dsaf_dev, 0x3f000, 1);
+   msleep(20);
+   hns_dsaf_roce_srst(dsaf_dev, 1);
+
+   /* Enable dsaf channel rocee credit */
+   credit = dsaf_read_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG);
+   dsaf_set_bit(credit, DSAF_SBM_ROCEE_CFG_CRD_EN_B, 0);
+   dsaf_write_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG, credit);
+
+   dsaf_set_bit(credit, DSAF_SBM_ROCEE_CFG_CRD_EN_B, 1);
+   dsaf_write_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG, credit);
+   }
+ 

  1   2   >