[patch net-next] mlxsw: spectrum_router: Use correct config option

2017-08-15 Thread Jiri Pirko
From: Ido Schimmel 

I made an embarrassing mistake and used 'IPV6' instead of 'CONFIG_IPV6'
around the function that updates the kernel about IPv6 neighbours
activity. This can be a problem if the kernel has more neighbours than a
certain threshold and it starts deleting those that are supposedly
inactive.

Fixes: b5f3e0d43012 ("mlxsw: spectrum_router: Fix build when IPv6 isn't 
enabled")
Signed-off-by: Ido Schimmel 
Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 4895d5b..a0a9728 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -1042,7 +1042,7 @@ static void mlxsw_sp_router_neigh_ent_ipv4_process(struct 
mlxsw_sp *mlxsw_sp,
neigh_release(n);
 }
 
-#if IS_ENABLED(IPV6)
+#if IS_ENABLED(CONFIG_IPV6)
 static void mlxsw_sp_router_neigh_ent_ipv6_process(struct mlxsw_sp *mlxsw_sp,
   char *rauhtd_pl,
   int rec_index)
-- 
2.9.3



[patch net-next] ipv6: fib: Provide offload indication using nexthop flags

2017-08-15 Thread Jiri Pirko
From: Ido Schimmel 

IPv6 routes currently lack nexthop flags as in IPv4. This has several
implications.

In the forwarding path, it requires us to check the carrier state of the
nexthop device and potentially ignore a linkdown route, instead of
checking for RTNH_F_LINKDOWN.

It also requires capable drivers to use the user facing IPv6-specific
route flags to provide offload indication, instead of using the nexthop
flags as in IPv4.

Add nexthop flags to IPv6 routes in the 40 bytes hole and use it to
provide offload indication instead of the RTF_OFFLOAD flag, which is
removed while it's still not part of any official kernel release.

In the near future we would like to use the field for the
RTNH_F_{LINKDOWN,DEAD} flags, but this change is more involved and might
not be ready in time for the current cycle.

Signed-off-by: Ido Schimmel 
Signed-off-by: Jiri Pirko 
---
 drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 8 
 include/net/ip6_fib.h | 2 ++
 include/uapi/linux/ipv6_route.h   | 1 -
 net/ipv6/route.c  | 7 +--
 4 files changed, 7 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c 
b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
index 16676ff..4895d5b 100644
--- a/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
+++ b/drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c
@@ -2397,7 +2397,7 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry 
*fib_entry)
 
if (fib_entry->type == MLXSW_SP_FIB_ENTRY_TYPE_LOCAL) {
list_first_entry(&fib6_entry->rt6_list, struct mlxsw_sp_rt6,
-list)->rt->rt6i_flags |= RTF_OFFLOAD;
+list)->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD;
return;
}
 
@@ -2407,9 +2407,9 @@ mlxsw_sp_fib6_entry_offload_set(struct mlxsw_sp_fib_entry 
*fib_entry)
 
nh = mlxsw_sp_rt6_nexthop(nh_grp, mlxsw_sp_rt6);
if (nh && nh->offloaded)
-   mlxsw_sp_rt6->rt->rt6i_flags |= RTF_OFFLOAD;
+   mlxsw_sp_rt6->rt->rt6i_nh_flags |= RTNH_F_OFFLOAD;
else
-   mlxsw_sp_rt6->rt->rt6i_flags &= ~RTF_OFFLOAD;
+   mlxsw_sp_rt6->rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD;
}
 }
 
@@ -2424,7 +2424,7 @@ mlxsw_sp_fib6_entry_offload_unset(struct 
mlxsw_sp_fib_entry *fib_entry)
list_for_each_entry(mlxsw_sp_rt6, &fib6_entry->rt6_list, list) {
struct rt6_info *rt = mlxsw_sp_rt6->rt;
 
-   rt->rt6i_flags &= ~RTF_OFFLOAD;
+   rt->rt6i_nh_flags &= ~RTNH_F_OFFLOAD;
}
 }
 
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 1d790ea..71c1646 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -120,6 +120,8 @@ struct rt6_info {
 
atomic_trt6i_ref;
 
+   unsigned intrt6i_nh_flags;
+
/* These are in a separate cache line. */
struct rt6key   rt6i_dst cacheline_aligned_in_smp;
u32 rt6i_flags;
diff --git a/include/uapi/linux/ipv6_route.h b/include/uapi/linux/ipv6_route.h
index 33e2a57..d496c02 100644
--- a/include/uapi/linux/ipv6_route.h
+++ b/include/uapi/linux/ipv6_route.h
@@ -35,7 +35,6 @@
 #define RTF_PREF(pref) ((pref) << 27)
 #define RTF_PREF_MASK  0x1800
 
-#define RTF_OFFLOAD0x2000  /* offloaded route  */
 #define RTF_PCPU   0x4000  /* read-only: can not be set by user */
 #define RTF_LOCAL  0x8000
 
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 035762f..6793135 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -1820,11 +1820,6 @@ static struct rt6_info *ip6_route_info_create(struct 
fib6_config *cfg,
goto out;
}
 
-   if (cfg->fc_flags & RTF_OFFLOAD) {
-   NL_SET_ERR_MSG(extack, "Userspace can not set RTF_OFFLOAD");
-   goto out;
-   }
-
if (cfg->fc_dst_len > 128) {
NL_SET_ERR_MSG(extack, "Invalid prefix length");
goto out;
@@ -3335,7 +3330,7 @@ static int rt6_nexthop_info(struct sk_buff *skb, struct 
rt6_info *rt,
goto nla_put_failure;
}
 
-   if (rt->rt6i_flags & RTF_OFFLOAD)
+   if (rt->rt6i_nh_flags & RTNH_F_OFFLOAD)
*flags |= RTNH_F_OFFLOAD;
 
/* not needed for multipath encoding b/c it has a rtnexthop struct */
-- 
2.9.3



Re: [PATCH 1/2] mlx4: remove unnecessary pci_set_drvdata()

2017-08-15 Thread Leon Romanovsky
On Tue, Aug 15, 2017 at 02:33:05AM -0400, Zhu Yanjun wrote:
> The driver core clears the driver data to NULL after device_release
> or on probe failure. Thus, it is not necessary to manually clear the
> device driver data to NULL.
>

It makes sense and I'm pretty sure that you are right, but I'm failing
to find the function in device core which sets it to NULL. Can you help
me and present the actual call stack to that code place?

Thanks,


signature.asc
Description: PGP signature


Re: [PATCH net-next] liquidio: update debug console logging mechanism

2017-08-15 Thread Leon Romanovsky
On Mon, Aug 14, 2017 at 04:28:50PM +, Ricardo Farrington wrote:
> Hi Leon - the code to which this patch applies handles a data stream from our 
> card's firmware to the host (over PCI).
> There are device-specific registers which are accessed to transfer log data 
> from our firmware to the host.
> I don't think there is kernel code that we could use to perform this; this is 
> not general purpose host driver logging.

I see, it is not really clear from the name "console".

Thanks


signature.asc
Description: PGP signature


Re: [PATCH] netfilter: fix indent on if statements

2017-08-15 Thread walter harms


Am 15.08.2017 08:50, schrieb Colin King:
> From: Colin Ian King 
> 
> The returns on some if statements are not indented correctly,
> add in the missing tab.
> 
> Signed-off-by: Colin Ian King 
> ---
>  net/bridge/netfilter/ebt_ip.c  | 4 ++--
>  net/bridge/netfilter/ebt_ip6.c | 2 +-
>  2 files changed, 3 insertions(+), 3 deletions(-)
> 
> diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
> index d06968bdf5ec..2b46c50abce0 100644
> --- a/net/bridge/netfilter/ebt_ip.c
> +++ b/net/bridge/netfilter/ebt_ip.c
> @@ -64,14 +64,14 @@ ebt_ip_mt(const struct sk_buff *skb, struct 
> xt_action_param *par)
>   if (NF_INVF(info, EBT_IP_DPORT,
>   dst < info->dport[0] ||
>   dst > info->dport[1]))
> - return false;
> + return false;


This is hard to read, perhaps it gets better when the result is stored in a 
tmp-var.
something like:
int isbetween=dst < info->dport[0] ||dst > info->dport[1] ;
int state=NF_INVF(info, EBT_IP_DPORT, isbetween );

if ( state )
return false;

just my 2 cents,
re,
 wh

>   }
>   if (info->bitmask & EBT_IP_SPORT) {
>   u32 src = ntohs(pptr->src);
>   if (NF_INVF(info, EBT_IP_SPORT,
>   src < info->sport[0] ||
>   src > info->sport[1]))
> - return false;
> + return false;
>   }
>   }
>   return true;
> diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
> index 4617491be41e..2a5a52a53ec4 100644
> --- a/net/bridge/netfilter/ebt_ip6.c
> +++ b/net/bridge/netfilter/ebt_ip6.c
> @@ -89,7 +89,7 @@ ebt_ip6_mt(const struct sk_buff *skb, struct 
> xt_action_param *par)
>   if (NF_INVF(info, EBT_IP6_SPORT,
>   src < info->sport[0] ||
>   src > info->sport[1]))
> - return false;
> + return false;
>   }
>   if ((info->bitmask & EBT_IP6_ICMP6) &&
>   NF_INVF(info, EBT_IP6_ICMP6,


Re: general protection fault in fib_dump_info

2017-08-15 Thread Dmitry Vyukov
That must be in https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git
Which one should we test?


On Tue, Aug 15, 2017 at 8:51 AM, Dmitry Vyukov  wrote:
> Eric, what's the "David Miller net tree"? Is it
> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git? I
> don't see 2c87d63ac853550e734edfd45e1be5e5aa44fbcc there.
> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git is
> what we are testing and the last commit on which we saw this bug is:
>
> commit cb44a8606f06301242ada4b8fa2fb26769b3
> Merge: 54161ed4eede a656d34a6e5a
> Author: David S. Miller
> Date:   Mon Aug 14 11:18:16 2017 -0700
> Merge branch 'mlnx-i2c'
>
>
>
> On Tue, Aug 15, 2017 at 8:42 AM, idaifish  wrote:
>> The bug still looks reproducible after applying the patch [
>> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git/commit/?id=2c87d63ac853550e734edfd45e1be5e5aa44fbcc
>> ]
>>
>>
>>
>> 2017-08-15 11:33 GMT+08:00 Eric Dumazet :
>>> On Tue, 2017-08-15 at 10:49 +0800, idaifish wrote:
 Syzkaller hit 'general protection fault in fib_dump_info' bug on
 commit 4.13-rc5..

 Guilty file: net/ipv4/fib_semantics.c

 kasan: GPF could be caused by NULL-ptr deref or user memory access
 general protection fault:  [#1] SMP KASAN
 Modules linked in:
 CPU: 0 PID: 2808 Comm: syz-executor0 Not tainted 4.13.0-rc5 #1
 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
 Ubuntu-1.8.2-1ubuntu1 04/01/2014
 task: 880078562700 task.stack: 88007811
 RIP: 0010:fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314
 RSP: 0018:880078117010 EFLAGS: 00010206
 RAX: dc00 RBX: 00fe RCX: 0002
 RDX: 0006 RSI: 880078117084 RDI: 0030
 RBP: 880078117268 R08: 000c R09: 8800780d80c8
 R10: 58d629b4 R11: 67fce681 R12: 
 R13: 8800784bd540 R14: 8800780d80b5 R15: 8800780d80a4
 FS:  022fa940() GS:88007fc0() 
 knlGS:
 CS:  0010 DS:  ES:  CR0: 80050033
 CR2: 004387d0 CR3: 79135000 CR4: 06f0
 Call Trace:
  inet_rtm_getroute+0xc89/0x1f50 net/ipv4/route.c:2766
  rtnetlink_rcv_msg+0x288/0x680 net/core/rtnetlink.c:4217
  netlink_rcv_skb+0x340/0x470 net/netlink/af_netlink.c:2397
  rtnetlink_rcv+0x28/0x30 net/core/rtnetlink.c:4223
  netlink_unicast_kernel net/netlink/af_netlink.c:1265 [inline]
  netlink_unicast+0x4c4/0x6e0 net/netlink/af_netlink.c:1291
  netlink_sendmsg+0x8c4/0xca0 net/netlink/af_netlink.c:1854
  sock_sendmsg_nosec net/socket.c:633 [inline]
  sock_sendmsg+0xca/0x110 net/socket.c:643
  ___sys_sendmsg+0x779/0x8d0 net/socket.c:2035
  __sys_sendmsg+0xd1/0x170 net/socket.c:2069
  SYSC_sendmsg net/socket.c:2080 [inline]
  SyS_sendmsg+0x2d/0x50 net/socket.c:2076
  entry_SYSCALL_64_fastpath+0x1a/0xa5
 RIP: 0033:0x4512e9
 RSP: 002b:7ffc75584cc8 EFLAGS: 0216 ORIG_RAX: 002e
 RAX: ffda RBX: 0002 RCX: 004512e9
 RDX:  RSI: 20f2cfc8 RDI: 0003
 RBP: 000e R08:  R09: 
 R10:  R11: 0216 R12: fffe
 R13: 00718000 R14: 20c44ff0 R15: 
 Code: 00 0f b6 8d ec fd ff ff 48 8b 85 f0 fd ff ff 88 48 17 48 8b 45
 28 48 8d 78 30 48 b8 00 00 00 00 00 fc ff df 48 89 fa 48 c1 ea 03 <0f>
 b6 04 02 84 c0 74 08 3c 03 0f 8e cb 0c 00 00 48 8b 45 28 44
 RIP: fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314 RSP:
 880078117010
 ---[ end trace 254a7af28348f88b ]---
 Kernel panic - not syncing: Fatal exception
 Kernel Offset: disabled
 Rebooting in 86400 seconds..

 -

 .config and reproducer.prog are attached.  Unfortunately the extracted
 C program can't work.
 Maybe you can follow the instruction
 [https://github.com/google/syzkaller/blob/master/docs/executing_syzkaller_programs.md]
 to reproduce the bug.


>>>
>>> Probably fixed by commit 2c87d63ac853550e734edfd45e1be5e5aa44fbcc
>>> ("ipv4: route: fix inet_rtm_getroute induced crash")
>>> (In David Miller net tree)
>>>
>>>
>>
>>
>>
>> --
>> Regards,
>> idaifish
>>
>> --
>> You received this message because you are subscribed to the Google Groups 
>> "syzkaller" group.
>> To unsubscribe from this group and stop receiving emails from it, send an 
>> email to syzkaller+unsubscr...@googlegroups.com.
>> For more options, visit https://groups.google.com/d/optout.


iproute2 invalid argument mpls labels

2017-08-15 Thread Алексей Болдырев
I updated the kernel to 4.12.6. When I attach more than 8 MPLS labels to a
route through iproute2, I get the following:

root@ne-vlezay80:~# ip route add 10.10.10.0/24 encap mpls 
50/60/70/80/90/100/110/120/130 dev lo
RTNETLINK answers: Invalid argument

root@ne-vlezay80:~# ip route add 10.10.10.0/24 encap mpls 
50/60/70/80/90/100/110/120 dev lo
root@ne-vlezay80:~#

root@ne-vlezay80:~# ip r
default via 10.247.0.1 dev ic-br0 proto zebra metric 20 
10.10.10.0/24  encap mpls  ///120 dev lo scope link 

root@ne-vlezay80:~# cat /usr/src/linux-4.12.6/net/mpls/internal.h|grep MAX_NEW
#define MAX_NEW_LABELS 30
root@ne-vlezay80:~# 

What is the problem, and how can more than 8 MPLS labels be attached to a route
through iproute2?

root@ne-vlezay80:~# uname -r
4.12.6
root@ne-vlezay80:~# 


Re: AF_VSOCK unimplemented sockopts

2017-08-15 Thread Stefan Hajnoczi
On Fri, Aug 11, 2017 at 09:23:17AM +, Jorgen S. Hansen wrote:
> > On Aug 3, 2017, at 12:41 PM, Stefan Hajnoczi  wrote:
> > 
> > Hi Jorgen,
> > There are 3 sockopts defined in include/uapi/linux/vm_sockets.h that are
> > currently not implemented in net/vmw_vsock/af_vsock.c:
> > 
> > * SO_VM_SOCKETS_PEER_HOST_VM_ID
> > * SO_VM_SOCKETS_TRUSTED
> > * SO_VM_SOCKETS_NONBLOCK_TXRX
> > 
> > I noticed this because SO_VM_SOCKETS_TRUSTED is interesting for
> > virtio-vsock.  Services listening on AF_VSOCK inside the guest may not
> > want arbitrary unprivileged host processes to connect.  Instead of
> > inventing a new solution I wanted to look into SO_VM_SOCKETS_TRUSTED but
> > found it is not implemented in linux.git.
> > 
> > What is the status of these sockets?
> 
> These options were only implemented for ESX host endpoints, so were never 
> part of the Linux host side support. It looks like they could have been 
> omitted from vm_sockets.h, when the initial upstreaming was performed.
> 
> On ESX, the equivalent of SO_VM_SOCKETS_TRUSTED, is used for retrieving the 
> value of s->trusted of a VMCI socket. It cannot be used to mark a socket as 
> trusted. On Linux, trusted is tied to the CAP_NET_ADMIN capability of the 
> socket creator. VMCI based vSockets will per default only allow host side 
> sockets that are trusted, or are created by the same user as the VM, to 
> communicate with a given VM. This is achieved by per default creating VMs 
> with the VMCI privilege flag VMCI_PRIVILEGE_FLAG_RESTRICTED. It is possible 
> to create a VM that isn’t restricted, in which case any host process will be 
> able to communicate with the VM.
> 
> So it should be straight forward to implement the getsockopt part of 
> SO_VM_SOCKETS_TRUSTED, since it just needs to return s->trusted.

Currently virtio-vsock does not implement the 'restricted' mode but I'm
evaluating using it by default for stronger security.  Thanks for your
response!

Stefan


signature.asc
Description: PGP signature


Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Paweł Staszewski

Hi


Yes it helped - now there is almost no difference when using vlans or not:

10.5Mpps - with vlan

11Mpps - without vlan




W dniu 2017-08-15 o 03:17, Eric Dumazet pisze:

On Mon, 2017-08-14 at 18:07 -0700, Eric Dumazet wrote:


Or try to hack the IFF_XMIT_DST_RELEASE flag on the vlan netdev.

Something like :

diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
index 
5e831de3103e2f7092c7fa15534def403bc62fb4..9472de846d5c0960996261cb2843032847fa4bf7
 100644
--- a/net/8021q/vlan_netlink.c
+++ b/net/8021q/vlan_netlink.c
@@ -143,6 +143,7 @@ static int vlan_newlink(struct net *src_net, struct 
net_device *dev,
vlan->vlan_proto = proto;
vlan->vlan_id = nla_get_u16(data[IFLA_VLAN_ID]);
vlan->real_dev= real_dev;
+   dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
vlan->flags   = VLAN_FLAG_REORDER_HDR;
  
  	err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id);









Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Paweł Staszewski

With hack:

14.44%  [kernel]   [k] do_raw_spin_lock
 8.30%  [kernel]   [k] page_frag_free
 7.06%  [mlx5_core][k] mlx5e_xmit
 5.97%  [kernel]   [k] acpi_processor_ffh_cstate_enter
 5.73%  [kernel]   [k] fib_table_lookup
 4.81%  [mlx5_core][k] mlx5e_poll_tx_cq
 4.51%  [mlx5_core][k] skb_from_cqe.isra.32
 3.81%  [kernel]   [k] virt_to_head_page
 2.45%  [kernel]   [k] __dev_queue_xmit
 1.84%  [kernel]   [k] ipt_do_table
 1.77%  [kernel]   [k] napi_consume_skb
 1.62%  [kernel]   [k] __build_skb
 1.46%  [kernel]   [k] netif_skb_features
 1.43%  [kernel]   [k] __netif_receive_skb_core
 1.41%  [kernel]   [k] ip_rcv
 1.08%  [kernel]   [k] dev_hard_start_xmit
 1.02%  [kernel]   [k] build_skb
 1.00%  [mlx5_core][k] mlx5_cqwq_get_cqe
 0.96%  [kernel]   [k] ip_route_input_rcu
 0.95%  [kernel]   [k] ip_forward
 0.89%  [kernel]   [k] ip_finish_output2
 0.89%  [kernel]   [k] kmem_cache_alloc
 0.78%  [kernel]   [k] __local_bh_enable_ip
 0.76%  [kernel]   [k] udp_v4_early_demux
 0.75%  [kernel]   [k] compound_head
 0.75%  [kernel]   [k] __netdev_pick_tx
 0.73%  [kernel]   [k] sch_direct_xmit
 0.65%  [kernel]   [k] irq_entries_start
 0.63%  [mlx5_core][k] mlx5e_free_rx_wqe_reuse
 0.61%  [kernel]   [k] netdev_pick_tx
 0.61%  [kernel]   [k] validate_xmit_skb
 0.55%  [kernel]   [k] skb_network_protocol
 0.53%  [mlx5_core][k] mlx5e_rx_cache_get
 0.53%  [mlx5_core][k] mlx5e_build_rx_skb
 0.51%  [kernel]   [k] ip_rcv_finish
 0.50%  [kernel]   [k] eth_header
 0.50%  [kernel]   [k] fib_validate_source
 0.50%  [mlx5_core][k] mlx5e_handle_rx_cqe
 0.48%  [mlx5_core][k] eq_update_ci
 0.47%  [kernel]   [k] kmem_cache_free_bulk
 0.44%  [kernel]   [k] deliver_ptype_list_skb
 0.43%  [kernel]   [k] skb_release_data
 0.42%  [kernel]   [k] cpuidle_enter_state
 0.40%  [kernel]   [k] virt_to_head_page
 0.39%  [kernel]   [k] vlan_dev_hard_start_xmit
 0.39%  [kernel]   [k] neigh_connected_output
 0.38%  [kernel]   [k] eth_type_vlan
 0.35%  [mlx5_core][k] mlx5e_alloc_rx_wqe
 0.32%  [kernel]   [k] nf_hook_slow
 0.32%  [kernel]   [k] swiotlb_map_page
 0.31%  [kernel]   [k] ip_finish_output
 0.29%  [kernel]   [k] ip_output
 0.28%  [kernel]   [k] skb_free_head
 0.25%  [kernel]   [k] netif_receive_skb_internal
 0.25%  [kernel]   [k] __jhash_nwords



Without hack:

14.25%  [kernel]   [k] dst_release
14.17%  [kernel]   [k] skb_dst_force
13.41%  [kernel]   [k] rt_cache_valid
11.47%  [kernel]   [k] ip_finish_output2
 7.01%  [kernel]   [k] do_raw_spin_lock
 5.07%  [kernel]   [k] page_frag_free
 3.47%  [mlx5_core][k] mlx5e_xmit
 2.88%  [kernel]   [k] fib_table_lookup
 2.43%  [mlx5_core][k] skb_from_cqe.isra.32
 1.97%  [kernel]   [k] virt_to_head_page
 1.81%  [mlx5_core][k] mlx5e_poll_tx_cq
 0.93%  [kernel]   [k] __dev_queue_xmit
 0.87%  [kernel]   [k] __build_skb
 0.84%  [kernel]   [k] ipt_do_table
 0.79%  [kernel]   [k] ip_rcv
 0.79%  [kernel]   [k] acpi_processor_ffh_cstate_enter
 0.78%  [kernel]   [k] netif_skb_features
 0.73%  [kernel]   [k] __netif_receive_skb_core
 0.52%  [kernel]   [k] dev_hard_start_xmit
 0.52%  [kernel]   [k] build_skb
 0.51%  [kernel]   [k] ip_route_input_rcu
 0.50%  [kernel]   [k] skb_unref
 0.49%  [kernel]   [k] ip_forward
 0.48%  [mlx5_core][k] mlx5_cqwq_get_cqe
 0.44%  [kernel]   [k] udp_v4_early_demux
 0.41%  [kernel]   [k] napi_consume_skb
 0.40%  [kernel]   [k] __local_bh_enable_ip
 0.39%  [kernel]   [k] ip_rcv_finish
 0.39%  [kernel]   [k] kmem_cache_alloc
 0.38%  [kernel]   [k] sch_direct_xmit
 0.33%  [kernel]   [k] validate_xmit_skb
 0.32%  [mlx5_core][k] mlx5e_free_rx_wqe_reuse
 0.29%  [kernel]   [k] netdev_pick_tx
 0.28%  [mlx5_core][k] mlx5e_build_rx_skb
 0.27%  [kernel]   [k] deliver_ptype_list_skb
 0.26%  [kernel]   [k] fib_validate_source
 0.26%  [mlx5_core][k] mlx5e_napi_poll
 0.26%  [mlx5_core][k] mlx5e_handle_rx_cqe
 0.26%  [mlx5_core][k] mlx5e_rx_cache_get
 0.25%  [kernel]   [k] eth_header
 0.23%  [kernel]   [k] skb_network_protocol
 0.20%  [kernel]   [k] nf_hook_slow
 0.20%  [kernel]   [k] vlan_passthru_hard_header
 0.20%  [kernel]   [k] vlan_dev_hard_start_xmit
 0.19%  [kernel]   [k] swiotlb_map_page
 0.18%  [kernel]   [k] compound_head
 0.18%  [kernel]   [k] neigh_connected_output
 0.18%  [mlx5_core]

Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Jesper Dangaard Brouer
On Tue, 15 Aug 2017 02:38:56 +0200
Paweł Staszewski  wrote:

> W dniu 2017-08-14 o 18:19, Jesper Dangaard Brouer pisze:
> > On Sun, 13 Aug 2017 18:58:58 +0200 Paweł Staszewski  
> > wrote:
> >  
> >> To show the difference, below is a comparison of vlan/no-vlan traffic
> >>
> >> 10Mpps forwarded traffic with no-vlan vs 6.9Mpps with vlan  
> > I'm trying to reproduce in my testlab (with ixgbe).  I do see, a
> > performance reduction of about 10-19% when I forward out a VLAN
> > interface.  This is larger than I expected, but still lower than what
> > you reported 30-40% slowdown.
> >
> > [...]  
> Ok, the Mellanox arrived (MT27700 - mlx5 driver)
> And to compare Mellanox with vlans and without: 33% performance 
> degradation (less than with ixgbe where i reach ~40% with same settings)
> 
> Mellanox without TX traffic on vlan:
> ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
> 0;16;64;11089305;709715520;8871553;567779392
> 1;16;64;11096292;710162688;11095566;710116224
> 2;16;64;11095770;710129280;11096799;710195136
> 3;16;64;11097199;710220736;11097702;710252928
> 4;16;64;11080984;567081856;11079662;709098368
> 5;16;64;11077696;708972544;11077039;708930496
> 6;16;64;11082991;709311424;8864802;567347328
> 7;16;64;11089596;709734144;8870927;709789184
> 8;16;64;11094043;710018752;11095391;710105024
> 
> Mellanox with TX traffic on vlan:
> ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
> 0;16;64;7369914;471674496;7370281;471697980
> 1;16;64;7368896;471609408;7368043;471554752
> 2;16;64;7367577;471524864;7367759;471536576
> 3;16;64;7368744;377305344;7369391;471641024
> 4;16;64;7366824;471476736;7364330;471237120
> 5;16;64;7368352;471574528;7367239;471503296
> 6;16;64;7367459;471517376;7367806;471539584
> 7;16;64;7367190;471500160;7367988;471551232
> 8;16;64;7368023;471553472;7368076;471556864

I wonder if the driver's page recycler is active/working or not, and if
the situation is different between VLAN vs no-vlan (given
page_frag_free is so high in your perf top).  The Mellanox drivers
fortunately have a stats counter to tell us this explicitly (which the
ixgbe driver doesn't).

You can use my ethtool_stats.pl script to watch these stats:
 
https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl
(Hint perl dependency:  dnf install perl-Time-HiRes)


> ethtool settings for both tests:
> ifc='enp175s0f0 enp175s0f1'
> for i in $ifc
>  do
>  ip link set up dev $i
>  ethtool -A $i autoneg off rx off tx off
>  ethtool -G $i rx 128 tx 256

The ring queue size recommendations, might be different for the mlx5
driver (Cc'ing Mellanox maintainers).  


>  ip link set $i txqueuelen 1000
>  ethtool -C $i rx-usecs 25
>  ethtool -L $i combined 16
>  ethtool -K $i gro off tso off gso off sg on l2-fwd-offload off 
> tx-nocache-copy off ntuple on
>  ethtool -N $i rx-flow-hash udp4 sdfn
>  done

Thanks for being explicit about what your setup is :-)
 
> and perf top:
> PerfTop:   83650 irqs/sec  kernel:99.7%  exact:  0.0% [4000Hz 
> cycles],  (all, 56 CPUs)
> ---
> 
>  14.25%  [kernel]   [k] dst_release
>  14.17%  [kernel]   [k] skb_dst_force
>  13.41%  [kernel]   [k] rt_cache_valid
>  11.47%  [kernel]   [k] ip_finish_output2
>   7.01%  [kernel]   [k] do_raw_spin_lock
>   5.07%  [kernel]   [k] page_frag_free
>   3.47%  [mlx5_core][k] mlx5e_xmit
>   2.88%  [kernel]   [k] fib_table_lookup
>   2.43%  [mlx5_core][k] skb_from_cqe.isra.32
>   1.97%  [kernel]   [k] virt_to_head_page
>   1.81%  [mlx5_core][k] mlx5e_poll_tx_cq
>   0.93%  [kernel]   [k] __dev_queue_xmit
>   0.87%  [kernel]   [k] __build_skb
>   0.84%  [kernel]   [k] ipt_do_table
>   0.79%  [kernel]   [k] ip_rcv
>   0.79%  [kernel]   [k] acpi_processor_ffh_cstate_enter
>   0.78%  [kernel]   [k] netif_skb_features
>   0.73%  [kernel]   [k] __netif_receive_skb_core
>   0.52%  [kernel]   [k] dev_hard_start_xmit
>   0.52%  [kernel]   [k] build_skb
>   0.51%  [kernel]   [k] ip_route_input_rcu
>   0.50%  [kernel]   [k] skb_unref
>   0.49%  [kernel]   [k] ip_forward
>   0.48%  [mlx5_core][k] mlx5_cqwq_get_cqe
>   0.44%  [kernel]   [k] udp_v4_early_demux
>   0.41%  [kernel]   [k] napi_consume_skb
>   0.40%  [kernel]   [k] __local_bh_enable_ip
>   0.39%  [kernel]   [k] ip_rcv_finish
>   0.39%  [kernel]   [k] kmem_cache_alloc
>   0.38%  [kernel]   [k] sch_direct_xmit
>   0.33%  [kernel]   [k] validate_xmit_skb
>   0.32%  [mlx5_core][k] mlx5e_free_rx_wqe_reuse
>   0.29%  [kernel]   [k] netdev_pick_tx
>  

Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Paweł Staszewski



W dniu 2017-08-15 o 11:23, Jesper Dangaard Brouer pisze:

On Tue, 15 Aug 2017 02:38:56 +0200
Paweł Staszewski  wrote:


W dniu 2017-08-14 o 18:19, Jesper Dangaard Brouer pisze:

On Sun, 13 Aug 2017 18:58:58 +0200 Paweł Staszewski  
wrote:
  

To show the difference, below is a comparison of vlan/no-vlan traffic

10Mpps forwarded traffic with no-vlan vs 6.9Mpps with vlan

I'm trying to reproduce in my testlab (with ixgbe).  I do see, a
performance reduction of about 10-19% when I forward out a VLAN
interface.  This is larger than I expected, but still lower than what
you reported 30-40% slowdown.

[...]

Ok, the Mellanox arrived (MT27700 - mlx5 driver)
And to compare Mellanox with vlans and without: 33% performance
degradation (less than with ixgbe where i reach ~40% with same settings)

Mellanox without TX traffic on vlan:
ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
0;16;64;11089305;709715520;8871553;567779392
1;16;64;11096292;710162688;11095566;710116224
2;16;64;11095770;710129280;11096799;710195136
3;16;64;11097199;710220736;11097702;710252928
4;16;64;11080984;567081856;11079662;709098368
5;16;64;11077696;708972544;11077039;708930496
6;16;64;11082991;709311424;8864802;567347328
7;16;64;11089596;709734144;8870927;709789184
8;16;64;11094043;710018752;11095391;710105024

Mellanox with TX traffic on vlan:
ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
0;16;64;7369914;471674496;7370281;471697980
1;16;64;7368896;471609408;7368043;471554752
2;16;64;7367577;471524864;7367759;471536576
3;16;64;7368744;377305344;7369391;471641024
4;16;64;7366824;471476736;7364330;471237120
5;16;64;7368352;471574528;7367239;471503296
6;16;64;7367459;471517376;7367806;471539584
7;16;64;7367190;471500160;7367988;471551232
8;16;64;7368023;471553472;7368076;471556864

I wonder if the driver's page recycler is active/working or not, and if
the situation is different between VLAN vs no-vlan (given
page_frag_free is so high in your perf top).  The Mellanox drivers
fortunately have a stats counter to tell us this explicitly (which the
ixgbe driver doesn't).

You can use my ethtool_stats.pl script to watch these stats:
  
https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl
(Hint perl dependency:  dnf install perl-Time-HiRes)

For RX NIC:
Show adapter(s) (enp175s0f0) statistics (ONLY that changed!)
Ethtool(enp175s0f0) stat: 78380071 ( 78,380,071) <= rx0_bytes /sec
Ethtool(enp175s0f0) stat:   230978 (230,978) <= 
rx0_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1152648 (  1,152,648) <= 
rx0_csum_complete /sec

Ethtool(enp175s0f0) stat:  1152648 (  1,152,648) <= rx0_packets /sec
Ethtool(enp175s0f0) stat:   921614 (921,614) <= 
rx0_page_reuse /sec

Ethtool(enp175s0f0) stat: 78956591 ( 78,956,591) <= rx1_bytes /sec
Ethtool(enp175s0f0) stat:   233343 (233,343) <= 
rx1_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1161126 (  1,161,126) <= 
rx1_csum_complete /sec

Ethtool(enp175s0f0) stat:  1161126 (  1,161,126) <= rx1_packets /sec
Ethtool(enp175s0f0) stat:   927793 (927,793) <= 
rx1_page_reuse /sec

Ethtool(enp175s0f0) stat: 79677124 ( 79,677,124) <= rx2_bytes /sec
Ethtool(enp175s0f0) stat:   233735 (233,735) <= 
rx2_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1171722 (  1,171,722) <= 
rx2_csum_complete /sec

Ethtool(enp175s0f0) stat:  1171722 (  1,171,722) <= rx2_packets /sec
Ethtool(enp175s0f0) stat:   937989 (937,989) <= 
rx2_page_reuse /sec

Ethtool(enp175s0f0) stat: 78392893 ( 78,392,893) <= rx3_bytes /sec
Ethtool(enp175s0f0) stat:   230311 (230,311) <= 
rx3_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1152837 (  1,152,837) <= 
rx3_csum_complete /sec

Ethtool(enp175s0f0) stat:  1152837 (  1,152,837) <= rx3_packets /sec
Ethtool(enp175s0f0) stat:   922513 (922,513) <= 
rx3_page_reuse /sec

Ethtool(enp175s0f0) stat: 65165583 ( 65,165,583) <= rx4_bytes /sec
Ethtool(enp175s0f0) stat:   191969 (191,969) <= 
rx4_cache_reuse /sec
Ethtool(enp175s0f0) stat:   958317 (958,317) <= 
rx4_csum_complete /sec

Ethtool(enp175s0f0) stat:   958317 (958,317) <= rx4_packets /sec
Ethtool(enp175s0f0) stat:   766332 (766,332) <= 
rx4_page_reuse /sec

Ethtool(enp175s0f0) stat: 66920721 ( 66,920,721) <= rx5_bytes /sec
Ethtool(enp175s0f0) stat:   197150 (197,150) <= 
rx5_cache_reuse /sec
Ethtool(enp175s0f0) stat:   984128 (984,128) <= 
rx5_csum_complete /sec

Ethtool(enp175s0f0) stat:   984128 (984,128) <= rx5_packets /sec
Ethtool(enp175s0f0) stat:   786978 (786,978) <= 
rx5_page_reuse /sec

Ethtool(enp175s0f0) stat: 79076984 ( 79,076,984) <= rx6_bytes /sec
Ethtool(enp175s0f0) stat:   233735 (233,735) <= 
rx6_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1162897 (  1,162,897) <= 
rx6_csum_complete /

Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Jesper Dangaard Brouer
On Mon, 14 Aug 2017 18:57:50 +0200
Paolo Abeni  wrote:

> On Mon, 2017-08-14 at 18:19 +0200, Jesper Dangaard Brouer wrote:
> > The output (extracted below) didn't show who called 'do_raw_spin_lock',
> > BUT it showed another interesting thing.  The kernel code in
> > __dev_queue_xmit() might create a route dst-cache problem for itself(?),
> > as it will first call skb_dst_force() and then skb_dst_drop() when the
> > packet is transmitted on a VLAN.
> > 
> >  static int __dev_queue_xmit(struct sk_buff *skb, void *accel_priv)
> >  {
> >  [...]
> > /* If device/qdisc don't need skb->dst, release it right now while
> >  * its hot in this cpu cache.
> >  */
> > if (dev->priv_flags & IFF_XMIT_DST_RELEASE)
> > skb_dst_drop(skb);
> > else
> > skb_dst_force(skb);  
> 
> I think that the high impact of the above code in this specific test is
> mostly due to the following:
> 
> - ingress packets with different RSS rx hash land on different CPUs
> - but they use the same dst entry, since the destination IPs belong to
> the same subnet
> - the dst refcnt cacheline is contended between all the CPUs

Good point and explanation Paolo :-)
I changed my pktgen setup to be closer to Pawel's to provoke this
situation some more, and I get closer to provoking this, although not as
clearly as Pawel.

A perf diff does show that the overhead in the VLAN case originates
from the routing "dst_release" code.  Diff Baseline==non-vlan case.

[jbrouer@canyon ~]$ sudo ~/perf diff
# Event 'cycles'
#
# Baseline  Delta Abs  Shared Object     Symbol
# ........  .........  ................  ..............................
#
 3.23% +4.32%  [kernel.vmlinux]  [k] __dev_queue_xmit
   +3.43%  [kernel.vmlinux]  [k] dst_release
13.54% -3.17%  [kernel.vmlinux]  [k] fib_table_lookup
 9.33% -2.73%  [kernel.vmlinux]  [k] _raw_spin_lock
 7.91% -1.75%  [ixgbe]   [k] ixgbe_poll
   +1.64%  [8021q]   [k] vlan_dev_hard_start_xmit
 7.23% -1.26%  [ixgbe]   [k] ixgbe_xmit_frame_ring
 3.34% -1.10%  [kernel.vmlinux]  [k] eth_type_trans
 5.20% +0.97%  [kernel.vmlinux]  [k] ip_route_input_rcu
 1.13% +0.95%  [kernel.vmlinux]  [k] ip_rcv_finish
 2.49% -0.82%  [kernel.vmlinux]  [k] ip_forward
 3.05% -0.80%  [kernel.vmlinux]  [k] __build_skb
 0.44% +0.74%  [kernel.vmlinux]  [k] __netif_receive_skb
   +0.71%  [kernel.vmlinux]  [k] neigh_connected_output
 1.70% +0.68%  [kernel.vmlinux]  [k] validate_xmit_skb
 1.42% +0.67%  [kernel.vmlinux]  [k] dev_hard_start_xmit
 0.49% +0.66%  [kernel.vmlinux]  [k] netif_receive_skb_internal
   +0.62%  [kernel.vmlinux]  [k] eth_header
   +0.57%  [ixgbe]   [k] ixgbe_tx_ctxtdesc
 1.19% -0.55%  [kernel.vmlinux]  [k] __netdev_pick_tx
 2.54% -0.48%  [kernel.vmlinux]  [k] fib_validate_source
 2.83% +0.46%  [kernel.vmlinux]  [k] ip_finish_output2
 1.45% +0.45%  [kernel.vmlinux]  [k] netif_skb_features
 1.66% -0.45%  [kernel.vmlinux]  [k] napi_gro_receive
 0.90% -0.40%  [kernel.vmlinux]  [k] validate_xmit_skb_list
 1.45% -0.39%  [kernel.vmlinux]  [k] ip_finish_output
   +0.36%  [8021q]   [k] vlan_passthru_hard_header
 1.28% -0.33%  [kernel.vmlinux]  [k] netdev_pick_tx
 

> Perhaps we can improve the situation by setting the IFF_XMIT_DST_RELEASE
> flag for vlan if the underlying device does not have a (relevant)
> classifier attached? (and clearing it as needed)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: [PATCH 1/2] mpls: add handlers

2017-08-15 Thread David Lamparter
On Sat, Aug 12, 2017 at 08:29:18PM -0700, Roopa Prabhu wrote:
> On Sat, Aug 12, 2017 at 6:35 AM, Amine Kherbouche
>  wrote:
> >
> >
> > On 11/08/2017 16:37, Roopa Prabhu wrote:
> >>
> >> On Fri, Aug 11, 2017 at 5:34 AM, David Lamparter 
> >> wrote:
> >>>
> >>> On Thu, Aug 10, 2017 at 10:28:36PM +0200, Amine Kherbouche wrote:
> 
>  Mpls handler allows creation/deletion of mpls routes without using
>  rtnetlink. When an incoming mpls packet matches this route, the saved
>  function handler is called.
> >>>
> >>> Since I originally authored this patch, I have come to believe that it
> >>> might be unnecessarily complicated.  It is unlikely that a lot of
> >>> different "handlers" will exist here;  the only things I can think of
> >>> are VPLS support and BIER-MPLS multicast replication.  I'm not saying
> >>> it's a bad idea, but, well, this was in the README that I gave to 6WIND
> >>> with this code:
> >>>
> >>> ...
> >>
> >> yes, I would also prefer just exporting the functions  and calling
> >> them directly instead of adding a
> >> handler layer. We can move to that later if it becomes necessary.
> >
> > I understand that the handler layer is an overhead (as said by David's
> > note), and I agree with the solution for exporting the mpls functions that
> > allows route creation/deletion, but how about forwarding the right mpls
> > packet to the right vpls device with the device ptr? I don't see
> > another way.
> 
> 
> hmm...ok, so you are adding a mpls route to get into vpls_rcv and you
> want this route to carry the vpls_rcv information. Ideally if you knew
> the route is pointing to a vpls device kind, you can directly call
> vpls_rcv.
> But I am not sure if a route is necessary here either.
> 
> It just seems like the vpls device information is duplicated in the
> mpls route per vpls dev. Wondering if we can skip the route part and
> always do a lookup on vpls-id/label in mpls_rcv to send it to a
> vpls_rcv if there is a match.  This will be the l2 handler for mpls.

I think the reverse is the better option, removing the vpls device
information and just going with the route table.  My approach to this
would be to add a new netlink route attribute "RTA_VPLS" which
identifies the vpls device, is stored in the route table, and provides
the device ptr needed here.
(The control word config should also be on the route.)

My reason for thinking this is that the VPLS code needs exactly the same
information as does a normal MPLS route:  it attaches to an incoming
label (decapsulating packets instead of forwarding them), and for TX it
does the same operation of looking up a nexthop (possibly with ECMP
support) and adding a label stack.  The code should, in fact, probably
reuse the TX path.

This also fits both an 1:1 and 1:n model pretty well.  Creating a VPLS
head-end netdevice doesn't even need any config.  It'd just work like:
- ip link add name vpls123 type vpls
- ip -f mpls route add \
1234 \  # incoming label for decap
vpls vpls123 \  # new attr: VPLS device
as 2345 via inet 10.0.0.1 dev eth0  # outgoing label for encap

For a 1:n model, one would simply add multiple routes on the same vpls
device.


-David


Re: [PATCH] netfilter: fix indent on in statements

2017-08-15 Thread Sergei Shtylyov

Hello!

On 8/15/2017 9:50 AM, Colin King wrote:


From: Colin Ian King 

The returns on some if statements are not indented correctly,


   s/in/if/ in the subject?


add in the missing tab.

Signed-off-by: Colin Ian King 

[...]

MBR, Sergei


Re: general protection fault in fib_dump_info

2017-08-15 Thread Eric Dumazet
On Tue, 2017-08-15 at 08:51 +0200, Dmitry Vyukov wrote:
> Eric, what's the "David Miller net tree"? Is it
> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git? I
> don't see 2c87d63ac853550e734edfd45e1be5e5aa44fbcc there.
> https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git is
> what we are testing and the last commit on which we saw this bug is:
> 
> commit cb44a8606f06301242ada4b8fa2fb26769b3
> Merge: 54161ed4eede a656d34a6e5a
> Author: David S. Miller
> Date:   Mon Aug 14 11:18:16 2017 -0700
> Merge branch 'mlnx-i2c'
> 

Extracted from Documentation/networking/netdev-FAQ.txt

A: There are always two trees (git repositories) in play.  Both are driven
   by David Miller, the main network maintainer.  There is the "net" tree,
   and the "net-next" tree.  As you can probably guess from the names, the
   net tree is for fixes to existing code already in the mainline tree from
   Linus, and net-next is where the new code goes for the future release.
   You can find the trees here:

https://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git
https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git


Hope this helps.




[PATCH][V2] netfilter: fix indent on if statements

2017-08-15 Thread Colin King
From: Colin Ian King 

The returns on some if statements are not indented correctly,
add in the missing tab.

Signed-off-by: Colin Ian King 
---
 net/bridge/netfilter/ebt_ip.c  | 4 ++--
 net/bridge/netfilter/ebt_ip6.c | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/net/bridge/netfilter/ebt_ip.c b/net/bridge/netfilter/ebt_ip.c
index d06968bdf5ec..2b46c50abce0 100644
--- a/net/bridge/netfilter/ebt_ip.c
+++ b/net/bridge/netfilter/ebt_ip.c
@@ -64,14 +64,14 @@ ebt_ip_mt(const struct sk_buff *skb, struct xt_action_param 
*par)
if (NF_INVF(info, EBT_IP_DPORT,
dst < info->dport[0] ||
dst > info->dport[1]))
-   return false;
+   return false;
}
if (info->bitmask & EBT_IP_SPORT) {
u32 src = ntohs(pptr->src);
if (NF_INVF(info, EBT_IP_SPORT,
src < info->sport[0] ||
src > info->sport[1]))
-   return false;
+   return false;
}
}
return true;
diff --git a/net/bridge/netfilter/ebt_ip6.c b/net/bridge/netfilter/ebt_ip6.c
index 4617491be41e..2a5a52a53ec4 100644
--- a/net/bridge/netfilter/ebt_ip6.c
+++ b/net/bridge/netfilter/ebt_ip6.c
@@ -89,7 +89,7 @@ ebt_ip6_mt(const struct sk_buff *skb, struct xt_action_param 
*par)
if (NF_INVF(info, EBT_IP6_SPORT,
src < info->sport[0] ||
src > info->sport[1]))
-   return false;
+   return false;
}
if ((info->bitmask & EBT_IP6_ICMP6) &&
NF_INVF(info, EBT_IP6_ICMP6,
-- 
2.11.0



Re: [PATCH] netfilter: fix indent on in statements

2017-08-15 Thread Colin Ian King
On 15/08/17 10:45, Sergei Shtylyov wrote:
> Hello!
> 
> On 8/15/2017 9:50 AM, Colin King wrote:
> 
>> From: Colin Ian King 
>>
>> The returns on some if statements are not indented correctly,
> 
>s/in/if/ in the subject?

Doh, fix resent.

> 
>> add in the missing tab.
>>
>> Signed-off-by: Colin Ian King 
> [...]
> 
> MBR, Sergei
> -- 
> To unsubscribe from this list: send the line "unsubscribe
> kernel-janitors" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html



Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Jesper Dangaard Brouer

On Tue, 15 Aug 2017 11:30:43 +0200 Paweł Staszewski  
wrote:

> W dniu 2017-08-15 o 11:23, Jesper Dangaard Brouer pisze:
> > On Tue, 15 Aug 2017 02:38:56 +0200
> > Paweł Staszewski  wrote:
> >  
> >> W dniu 2017-08-14 o 18:19, Jesper Dangaard Brouer pisze:  
> >>> On Sun, 13 Aug 2017 18:58:58 +0200 Paweł Staszewski 
> >>>  wrote:
> >>> 
>  To show some difference, below is a comparison of vlan/no-vlan traffic
> 
>  10Mpps forwarded traffic with no-vlan vs 6.9Mpps with vlan  
> >>> I'm trying to reproduce in my testlab (with ixgbe).  I do see, a
> >>> performance reduction of about 10-19% when I forward out a VLAN
> >>> interface.  This is larger than I expected, but still lower than what
> >>> you reported 30-40% slowdown.
> >>>
> >>> [...]  
> >> Ok Mellanox arrived (MT27700 - mlx5 driver)
> >> And to compare Mellanox with vlans and without: 33% performance
> >> degradation (less than with ixgbe where I reach ~40% with same settings)
> >>
> >> Mellanox without TX traffic on vlan:
> >> ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
> >> 0;16;64;11089305;709715520;8871553;567779392
> >> 1;16;64;11096292;710162688;11095566;710116224
> >> 2;16;64;11095770;710129280;11096799;710195136
> >> 3;16;64;11097199;710220736;11097702;710252928
> >> 4;16;64;11080984;567081856;11079662;709098368
> >> 5;16;64;11077696;708972544;11077039;708930496
> >> 6;16;64;11082991;709311424;8864802;567347328
> >> 7;16;64;11089596;709734144;8870927;709789184
> >> 8;16;64;11094043;710018752;11095391;710105024
> >>
> >> Mellanox with TX traffic on vlan:
> >> ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
> >> 0;16;64;7369914;471674496;7370281;471697980
> >> 1;16;64;7368896;471609408;7368043;471554752
> >> 2;16;64;7367577;471524864;7367759;471536576
> >> 3;16;64;7368744;377305344;7369391;471641024
> >> 4;16;64;7366824;471476736;7364330;471237120
> >> 5;16;64;7368352;471574528;7367239;471503296
> >> 6;16;64;7367459;471517376;7367806;471539584
> >> 7;16;64;7367190;471500160;7367988;471551232
> >> 8;16;64;7368023;471553472;7368076;471556864  
> > I wonder if the driver's page recycler is active/working or not, and if
> > the situation is different between VLAN vs no-vlan (given
> > page_frag_free is so high in your perf top).  The Mellanox drivers
> > fortunately have a stats counter to tell us this explicitly (which the
> > ixgbe driver doesn't).
> >
> > You can use my ethtool_stats.pl script to watch these stats:
> >   
> > https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl
> > (Hint perl dependency:  dnf install perl-Time-HiRes)  
> For RX NIC:
> Show adapter(s) (enp175s0f0) statistics (ONLY that changed!)
> Ethtool(enp175s0f0) stat: 78380071 ( 78,380,071) <= rx0_bytes /sec
> Ethtool(enp175s0f0) stat:   230978 (230,978) <= rx0_cache_reuse 
> /sec
> Ethtool(enp175s0f0) stat:  1152648 (  1,152,648) <= rx0_csum_complete 
> /sec
> Ethtool(enp175s0f0) stat:  1152648 (  1,152,648) <= rx0_packets /sec
> Ethtool(enp175s0f0) stat:   921614 (921,614) <= rx0_page_reuse 
> /sec
> Ethtool(enp175s0f0) stat: 78956591 ( 78,956,591) <= rx1_bytes /sec
> Ethtool(enp175s0f0) stat:   233343 (233,343) <= rx1_cache_reuse 
> /sec
> Ethtool(enp175s0f0) stat:  1161126 (  1,161,126) <= rx1_csum_complete 
> /sec
> Ethtool(enp175s0f0) stat:  1161126 (  1,161,126) <= rx1_packets /sec
> Ethtool(enp175s0f0) stat:   927793 (927,793) <= rx1_page_reuse 
> /sec
> Ethtool(enp175s0f0) stat: 79677124 ( 79,677,124) <= rx2_bytes /sec
> Ethtool(enp175s0f0) stat:   233735 (233,735) <= rx2_cache_reuse 
> /sec
> Ethtool(enp175s0f0) stat:  1171722 (  1,171,722) <= rx2_csum_complete 
> /sec
> Ethtool(enp175s0f0) stat:  1171722 (  1,171,722) <= rx2_packets /sec
> Ethtool(enp175s0f0) stat:   937989 (937,989) <= rx2_page_reuse 
> /sec
> Ethtool(enp175s0f0) stat: 78392893 ( 78,392,893) <= rx3_bytes /sec
> Ethtool(enp175s0f0) stat:   230311 (230,311) <= rx3_cache_reuse 
> /sec
> Ethtool(enp175s0f0) stat:  1152837 (  1,152,837) <= rx3_csum_complete 
> /sec
> Ethtool(enp175s0f0) stat:  1152837 (  1,152,837) <= rx3_packets /sec
> Ethtool(enp175s0f0) stat:   922513 (922,513) <= rx3_page_reuse 
> /sec
> Ethtool(enp175s0f0) stat: 65165583 ( 65,165,583) <= rx4_bytes /sec
> Ethtool(enp175s0f0) stat:   191969 (191,969) <= rx4_cache_reuse 
> /sec
> Ethtool(enp175s0f0) stat:   958317 (958,317) <= rx4_csum_complete 
> /sec
> Ethtool(enp175s0f0) stat:   958317 (958,317) <= rx4_packets /sec
> Ethtool(enp175s0f0) stat:   766332 (766,332) <= rx4_page_reuse 
> /sec
> Ethtool(enp175s0f0) stat: 66920721 ( 66,920,721) <= rx5_bytes /sec
> Ethtool(enp175s0f0) stat:   197150 (197,150) <= rx5_cache_reuse 
> /sec
> Ethtool(enp175s0f0) stat:   984128 (984,128) <= rx5_csum_c

Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Paweł Staszewski



W dniu 2017-08-15 o 11:57, Jesper Dangaard Brouer pisze:

On Tue, 15 Aug 2017 11:30:43 +0200 Paweł Staszewski  
wrote:


W dniu 2017-08-15 o 11:23, Jesper Dangaard Brouer pisze:

On Tue, 15 Aug 2017 02:38:56 +0200
Paweł Staszewski  wrote:
  

W dniu 2017-08-14 o 18:19, Jesper Dangaard Brouer pisze:

On Sun, 13 Aug 2017 18:58:58 +0200 Paweł Staszewski  
wrote:
 

To show some difference, below is a comparison of vlan/no-vlan traffic

10Mpps forwarded traffic with no-vlan vs 6.9Mpps with vlan

I'm trying to reproduce in my testlab (with ixgbe).  I do see, a
performance reduction of about 10-19% when I forward out a VLAN
interface.  This is larger than I expected, but still lower than what
you reported 30-40% slowdown.

[...]

Ok Mellanox arrived (MT27700 - mlx5 driver)
And to compare Mellanox with vlans and without: 33% performance
degradation (less than with ixgbe where I reach ~40% with same settings)

Mellanox without TX traffic on vlan:
ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
0;16;64;11089305;709715520;8871553;567779392
1;16;64;11096292;710162688;11095566;710116224
2;16;64;11095770;710129280;11096799;710195136
3;16;64;11097199;710220736;11097702;710252928
4;16;64;11080984;567081856;11079662;709098368
5;16;64;11077696;708972544;11077039;708930496
6;16;64;11082991;709311424;8864802;567347328
7;16;64;11089596;709734144;8870927;709789184
8;16;64;11094043;710018752;11095391;710105024

Mellanox with TX traffic on vlan:
ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
0;16;64;7369914;471674496;7370281;471697980
1;16;64;7368896;471609408;7368043;471554752
2;16;64;7367577;471524864;7367759;471536576
3;16;64;7368744;377305344;7369391;471641024
4;16;64;7366824;471476736;7364330;471237120
5;16;64;7368352;471574528;7367239;471503296
6;16;64;7367459;471517376;7367806;471539584
7;16;64;7367190;471500160;7367988;471551232
8;16;64;7368023;471553472;7368076;471556864

I wonder if the driver's page recycler is active/working or not, and if
the situation is different between VLAN vs no-vlan (given
page_frag_free is so high in your perf top).  The Mellanox drivers
fortunately have a stats counter to tell us this explicitly (which the
ixgbe driver doesn't).

You can use my ethtool_stats.pl script to watch these stats:
   
https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl
(Hint perl dependency:  dnf install perl-Time-HiRes)

For RX NIC:
Show adapter(s) (enp175s0f0) statistics (ONLY that changed!)
Ethtool(enp175s0f0) stat: 78380071 ( 78,380,071) <= rx0_bytes /sec
Ethtool(enp175s0f0) stat:   230978 (230,978) <= rx0_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1152648 (  1,152,648) <= rx0_csum_complete 
/sec
Ethtool(enp175s0f0) stat:  1152648 (  1,152,648) <= rx0_packets /sec
Ethtool(enp175s0f0) stat:   921614 (921,614) <= rx0_page_reuse /sec
Ethtool(enp175s0f0) stat: 78956591 ( 78,956,591) <= rx1_bytes /sec
Ethtool(enp175s0f0) stat:   233343 (233,343) <= rx1_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1161126 (  1,161,126) <= rx1_csum_complete 
/sec
Ethtool(enp175s0f0) stat:  1161126 (  1,161,126) <= rx1_packets /sec
Ethtool(enp175s0f0) stat:   927793 (927,793) <= rx1_page_reuse /sec
Ethtool(enp175s0f0) stat: 79677124 ( 79,677,124) <= rx2_bytes /sec
Ethtool(enp175s0f0) stat:   233735 (233,735) <= rx2_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1171722 (  1,171,722) <= rx2_csum_complete 
/sec
Ethtool(enp175s0f0) stat:  1171722 (  1,171,722) <= rx2_packets /sec
Ethtool(enp175s0f0) stat:   937989 (937,989) <= rx2_page_reuse /sec
Ethtool(enp175s0f0) stat: 78392893 ( 78,392,893) <= rx3_bytes /sec
Ethtool(enp175s0f0) stat:   230311 (230,311) <= rx3_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1152837 (  1,152,837) <= rx3_csum_complete 
/sec
Ethtool(enp175s0f0) stat:  1152837 (  1,152,837) <= rx3_packets /sec
Ethtool(enp175s0f0) stat:   922513 (922,513) <= rx3_page_reuse /sec
Ethtool(enp175s0f0) stat: 65165583 ( 65,165,583) <= rx4_bytes /sec
Ethtool(enp175s0f0) stat:   191969 (191,969) <= rx4_cache_reuse /sec
Ethtool(enp175s0f0) stat:   958317 (958,317) <= rx4_csum_complete 
/sec
Ethtool(enp175s0f0) stat:   958317 (958,317) <= rx4_packets /sec
Ethtool(enp175s0f0) stat:   766332 (766,332) <= rx4_page_reuse /sec
Ethtool(enp175s0f0) stat: 66920721 ( 66,920,721) <= rx5_bytes /sec
Ethtool(enp175s0f0) stat:   197150 (197,150) <= rx5_cache_reuse /sec
Ethtool(enp175s0f0) stat:   984128 (984,128) <= rx5_csum_complete 
/sec
Ethtool(enp175s0f0) stat:   984128 (984,128) <= rx5_packets /sec
Ethtool(enp175s0f0) stat:   786978 (786,978) <= rx5_page_reuse /sec
Ethtool(enp175s0f0) stat: 79076984 ( 79,076,984) <= rx6_bytes /sec
Ethtool(enp175s0f0) stat:   233735 (233,735) <=

Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Jesper Dangaard Brouer

On Tue, 15 Aug 2017 11:11:57 +0200 Paweł Staszewski  
wrote:

> Yes it helped - now there is almost no difference when using vlans or not:
> 
> 10.5Mpps - with vlan
> 
> 11Mpps - without vlan

Great! - it seems like we have pinpointed the root-cause.  It also
demonstrates how big the benefit of Eric's commit is (thanks!):
 https://git.kernel.org/torvalds/c/93f154b594fe


> W dniu 2017-08-15 o 03:17, Eric Dumazet pisze:
> > On Mon, 2017-08-14 at 18:07 -0700, Eric Dumazet wrote:
> >  
> >> Or try to hack the IFF_XMIT_DST_RELEASE flag on the vlan netdev.  
> > Something like :
> >
> > diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
> > index 
> > 5e831de3103e2f7092c7fa15534def403bc62fb4..9472de846d5c0960996261cb2843032847fa4bf7
> >  100644
> > --- a/net/8021q/vlan_netlink.c
> > +++ b/net/8021q/vlan_netlink.c
> > @@ -143,6 +143,7 @@ static int vlan_newlink(struct net *src_net, struct 
> > net_device *dev,
> > vlan->vlan_proto = proto;
> > vlan->vlan_id= nla_get_u16(data[IFLA_VLAN_ID]);
> > vlan->real_dev   = real_dev;
> > +   dev->priv_flags |= (real_dev->priv_flags & IFF_XMIT_DST_RELEASE);
> > vlan->flags  = VLAN_FLAG_REORDER_HDR;
> >   
> > err = vlan_check_real_dev(real_dev, vlan->vlan_proto, vlan->vlan_id);

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Paweł Staszewski



W dniu 2017-08-15 o 12:02, Paweł Staszewski pisze:



W dniu 2017-08-15 o 11:57, Jesper Dangaard Brouer pisze:
On Tue, 15 Aug 2017 11:30:43 +0200 Paweł Staszewski 
 wrote:



W dniu 2017-08-15 o 11:23, Jesper Dangaard Brouer pisze:

On Tue, 15 Aug 2017 02:38:56 +0200
Paweł Staszewski  wrote:

W dniu 2017-08-14 o 18:19, Jesper Dangaard Brouer pisze:
On Sun, 13 Aug 2017 18:58:58 +0200 Paweł Staszewski 
 wrote:

To show some difference, below is a comparison of vlan/no-vlan traffic

10Mpps forwarded traffic with no-vlan vs 6.9Mpps with vlan

I'm trying to reproduce in my testlab (with ixgbe).  I do see, a
performance reduction of about 10-19% when I forward out a VLAN
interface.  This is larger than I expected, but still lower than 
what

you reported 30-40% slowdown.

[...]

Ok Mellanox arrived (MT27700 - mlx5 driver)
And to compare Mellanox with vlans and without: 33% performance
degradation (less than with ixgbe where I reach ~40% with same 
settings)


Mellanox without TX traffic on vlan:
ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
0;16;64;11089305;709715520;8871553;567779392
1;16;64;11096292;710162688;11095566;710116224
2;16;64;11095770;710129280;11096799;710195136
3;16;64;11097199;710220736;11097702;710252928
4;16;64;11080984;567081856;11079662;709098368
5;16;64;11077696;708972544;11077039;708930496
6;16;64;11082991;709311424;8864802;567347328
7;16;64;11089596;709734144;8870927;709789184
8;16;64;11094043;710018752;11095391;710105024

Mellanox with TX traffic on vlan:
ID;CPU_CORES / RSS QUEUES;PKT_SIZE;PPS_RX;BPS_RX;PPS_TX;BPS_TX
0;16;64;7369914;471674496;7370281;471697980
1;16;64;7368896;471609408;7368043;471554752
2;16;64;7367577;471524864;7367759;471536576
3;16;64;7368744;377305344;7369391;471641024
4;16;64;7366824;471476736;7364330;471237120
5;16;64;7368352;471574528;7367239;471503296
6;16;64;7367459;471517376;7367806;471539584
7;16;64;7367190;471500160;7367988;471551232
8;16;64;7368023;471553472;7368076;471556864

I wonder if the driver's page recycler is active/working or not, and if
the situation is different between VLAN vs no-vlan (given
page_frag_free is so high in your perf top).  The Mellanox drivers
fortunately have a stats counter to tell us this explicitly (which the
ixgbe driver doesn't).

You can use my ethtool_stats.pl script to watch these stats:
https://github.com/netoptimizer/network-testing/blob/master/bin/ethtool_stats.pl
(Hint perl dependency:  dnf install perl-Time-HiRes)

For RX NIC:
Show adapter(s) (enp175s0f0) statistics (ONLY that changed!)
Ethtool(enp175s0f0) stat: 78380071 ( 78,380,071) <= 
rx0_bytes /sec
Ethtool(enp175s0f0) stat:   230978 (230,978) <= 
rx0_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1152648 (  1,152,648) <= 
rx0_csum_complete /sec
Ethtool(enp175s0f0) stat:  1152648 (  1,152,648) <= 
rx0_packets /sec
Ethtool(enp175s0f0) stat:   921614 (921,614) <= 
rx0_page_reuse /sec
Ethtool(enp175s0f0) stat: 78956591 ( 78,956,591) <= 
rx1_bytes /sec
Ethtool(enp175s0f0) stat:   233343 (233,343) <= 
rx1_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1161126 (  1,161,126) <= 
rx1_csum_complete /sec
Ethtool(enp175s0f0) stat:  1161126 (  1,161,126) <= 
rx1_packets /sec
Ethtool(enp175s0f0) stat:   927793 (927,793) <= 
rx1_page_reuse /sec
Ethtool(enp175s0f0) stat: 79677124 ( 79,677,124) <= 
rx2_bytes /sec
Ethtool(enp175s0f0) stat:   233735 (233,735) <= 
rx2_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1171722 (  1,171,722) <= 
rx2_csum_complete /sec
Ethtool(enp175s0f0) stat:  1171722 (  1,171,722) <= 
rx2_packets /sec
Ethtool(enp175s0f0) stat:   937989 (937,989) <= 
rx2_page_reuse /sec
Ethtool(enp175s0f0) stat: 78392893 ( 78,392,893) <= 
rx3_bytes /sec
Ethtool(enp175s0f0) stat:   230311 (230,311) <= 
rx3_cache_reuse /sec
Ethtool(enp175s0f0) stat:  1152837 (  1,152,837) <= 
rx3_csum_complete /sec
Ethtool(enp175s0f0) stat:  1152837 (  1,152,837) <= 
rx3_packets /sec
Ethtool(enp175s0f0) stat:   922513 (922,513) <= 
rx3_page_reuse /sec
Ethtool(enp175s0f0) stat: 65165583 ( 65,165,583) <= 
rx4_bytes /sec
Ethtool(enp175s0f0) stat:   191969 (191,969) <= 
rx4_cache_reuse /sec
Ethtool(enp175s0f0) stat:   958317 (958,317) <= 
rx4_csum_complete /sec
Ethtool(enp175s0f0) stat:   958317 (958,317) <= 
rx4_packets /sec
Ethtool(enp175s0f0) stat:   766332 (766,332) <= 
rx4_page_reuse /sec
Ethtool(enp175s0f0) stat: 66920721 ( 66,920,721) <= 
rx5_bytes /sec
Ethtool(enp175s0f0) stat:   197150 (197,150) <= 
rx5_cache_reuse /sec
Ethtool(enp175s0f0) stat:   984128 (984,128) <= 
rx5_csum_complete /sec
Ethtool(enp175s0f0) stat:   984128 (984,128) <= 
rx5_packets /sec
Ethtool(enp175s0f0) stat:   786978 (786,978) <= 
rx5_page_reuse /sec
Ethtool(enp175s0f0) stat: 79076984 ( 79,076,984) <= 
rx6_b

Re: Kernel 4.13.0-rc4-next-20170811 - IP Routing / Forwarding performance vs Core/RSS number / HT on

2017-08-15 Thread Jesper Dangaard Brouer

On Tue, 15 Aug 2017 12:05:37 +0200 Paweł Staszewski  
wrote:
> W dniu 2017-08-15 o 12:02, Paweł Staszewski pisze:
> > W dniu 2017-08-15 o 11:57, Jesper Dangaard Brouer pisze:  
> >> On Tue, 15 Aug 2017 11:30:43 +0200 Paweł Staszewski 
> >>  wrote:
> >>> W dniu 2017-08-15 o 11:23, Jesper Dangaard Brouer pisze:  
>  On Tue, 15 Aug 2017 02:38:56 +0200
>  Paweł Staszewski  wrote:  
> > W dniu 2017-08-14 o 18:19, Jesper Dangaard Brouer pisze:  
> >> On Sun, 13 Aug 2017 18:58:58 +0200 Paweł Staszewski 
> >>  wrote:  
[... cut ...]

> >>> Ethtool(enp175s0f1) stat:  8895566 (  8,895,566) <= 
> >>> tx_prio0_packets /sec
> >>> Ethtool(enp175s0f1) stat:640470657 (640,470,657) <= 
> >>> tx_vport_unicast_bytes /sec
> >>> Ethtool(enp175s0f1) stat:  8895427 (  8,895,427) <= 
> >>> tx_vport_unicast_packets /sec
> >>> Ethtool(enp175s0f1) stat:  498 (498) <= tx_xmit_more 
> >>> /sec  
> >>
> >> We are seeing some xmit_more, this is interesting.  Have you noticed,
> >> if (in the VLAN case) there is a queue in the qdisc layer?
> >>
> >> Simply inspect with: tc -s qdisc show dev ixgbe2  
[...]
> > physical interface mq attached with pfifo_fast:
> >
> > tc -s -d qdisc show dev enp175s0f1
> > qdisc mq 0: root
> >  Sent 1397200697212 bytes 3965888669 pkt (dropped 78065663, overlimits 0 
> > requeues 629868)
> >  backlog 0b 0p requeues 629868
> > qdisc pfifo_fast 0: parent :38 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 
> > 1 1
> >  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
> >  backlog 0b 0p requeues 0
> > qdisc pfifo_fast 0: parent :37 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 
> > 1 1
> >  Sent 0 bytes 0 pkt (dropped 0, overlimits 0 requeues 0)
> >  backlog 0b 0p requeues 0
[...]

So, it doesn't look like there is any backlog queue.  Although, this
can be difficult to measure/see this way (as the kernel empties the queue
quickly via bulk deq), also given the small amount of xmit_more, which
indicates that the queue was likely very small.

There is a "dropped" counter, which indicates that you likely had a
setup (earlier) where you managed to overflow the qdisc queues.

> I just saw that after changing RSS on the NICs I didn't delete the qdisc
> and add it again:
> Here situation with qdisc del / add
> tc -s -d qdisc show dev enp175s0f1
> qdisc mq 1: root
>   Sent 43738523966 bytes 683414438 pkt (dropped 0, overlimits 0 requeues 1886)
>   backlog 0b 0p requeues 1886
> qdisc pfifo_fast 0: parent 1:10 bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 
> 1 1
>   Sent 2585011904 bytes 40390811 pkt (dropped 0, overlimits 0 requeues 110)
>   backlog 0b 0p requeues 110
> qdisc pfifo_fast 0: parent 1:f bands 3 priomap  1 2 2 2 1 2 0 0 1 1 1 1 1 1 1 
> 1
>   Sent 2602068416 bytes 40657319 pkt (dropped 0, overlimits 0 requeues 121)
>   backlog 0b 0p requeues 121
[...]

Exactly as you indicated above, these "dropped" stats came from another
(earlier) test case. (Great that you caught this yourself)

While trying to reproduce your case, I also managed to cause a situation
with qdisc overload.  This caused some weird behavior, where I saw
RX=8Mpps and TX only 4Mpps.  (I didn't figure out the exact tuning that
caused this, and cannot reproduce it now).

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  LinkedIn: http://www.linkedin.com/in/brouer


[PATCH] i40e{,vf}: Fix out-of-bound cpumask read in IRQ affinity handler

2017-08-15 Thread Stefano Brivio
The cpumask used in i40e{,vf}_irq_affinity_notify() is allocated
by irq_affinity_notify() with alloc_cpumask_var(), which doesn't
allocate NR_CPUS bits, but only nr_cpumask_bits bits. If we just
dereference it, we'll read way more than what is allocated, e.g.
1024 bytes vs. 8 bytes allocated on x86_64 machine with 24 CPUs.

Use cpumask_copy() instead. A comprehensive explanation is given
in the comments about cpumask_var_t, in include/linux/cpumask.h.

KASAN reports:
[   25.242312] BUG: KASAN: slab-out-of-bounds in 
i40e_irq_affinity_notify+0x30/0x50 [i40e] at addr 880462eea960
[   25.242315] Read of size 1024 by task kworker/2:1/170
[   25.242322] CPU: 2 PID: 170 Comm: kworker/2:1 Not tainted 
4.11.0-22.el7a.x86_64 #1
[   25.242325] Hardware name: HP ProLiant DL380 Gen9, BIOS P89 05/06/2015
[   25.242336] Workqueue: events irq_affinity_notify
[   25.242340] Call Trace:
[   25.242350]  dump_stack+0x63/0x8d
[   25.242358]  kasan_object_err+0x21/0x70
[   25.242364]  kasan_report+0x288/0x540
[   25.242397]  ? i40e_irq_affinity_notify+0x30/0x50 [i40e]
[   25.242403]  check_memory_region+0x13c/0x1a0
[   25.242408]  __asan_loadN+0xf/0x20
[   25.242440]  i40e_irq_affinity_notify+0x30/0x50 [i40e]
[   25.242446]  irq_affinity_notify+0x1b4/0x230
[   25.242452]  ? irq_set_affinity_notifier+0x130/0x130
[   25.242457]  ? kasan_slab_free+0x89/0xc0
[   25.242466]  process_one_work+0x32f/0x6f0
[   25.242472]  worker_thread+0x89/0x770
[   25.242481]  ? pci_mmcfg_check_reserved+0xc0/0xc0
[   25.242488]  kthread+0x18c/0x1e0
[   25.242493]  ? process_one_work+0x6f0/0x6f0
[   25.242499]  ? kthread_create_on_node+0xc0/0xc0
[   25.242506]  ret_from_fork+0x2c/0x40
[   25.242511] Object at 880462eea960, in cache kmalloc-8 size: 8
[   25.242513] Allocated:
[   25.242514] PID = 170
[   25.242522]  save_stack_trace+0x1b/0x20
[   25.242529]  save_stack+0x46/0xd0
[   25.242533]  kasan_kmalloc+0xad/0xe0
[   25.242537]  __kmalloc_node+0x12c/0x2b0
[   25.242542]  alloc_cpumask_var_node+0x3c/0x60
[   25.242546]  alloc_cpumask_var+0xe/0x10
[   25.242550]  irq_affinity_notify+0x94/0x230
[   25.242555]  process_one_work+0x32f/0x6f0
[   25.242559]  worker_thread+0x89/0x770
[   25.242564]  kthread+0x18c/0x1e0
[   25.242568]  ret_from_fork+0x2c/0x40
[   25.242569] Freed:
[   25.242570] PID = 0
[   25.242572] (stack is not available)
[   25.242573] Memory state around the buggy address:
[   25.242578]  880462eea800: fc fc 00 fc fc 00 fc fc 00 fc fc 00 fc fc fb 
fc
[   25.242582]  880462eea880: fc fb fc fc fb fc fc 00 fc fc 00 fc fc 00 fc 
fc
[   25.242586] >880462eea900: 00 fc fc 00 fc fc 00 fc fc fb fc fc 00 fc fc 
fc
[   25.242588]   ^
[   25.242592]  880462eea980: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc 
fc
[   25.242596]  880462eeaa00: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc 
fc
[   25.242597] 
==

Fixes: 96db776a3682 ("i40e/i40evf: fix interrupt affinity bug")
Signed-off-by: Stefano Brivio 
---
This should be considered for -stable, back to 4.10.

 drivers/net/ethernet/intel/i40e/i40e_main.c | 2 +-
 drivers/net/ethernet/intel/i40evf/i40evf_main.c | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c 
b/drivers/net/ethernet/intel/i40e/i40e_main.c
index 2db93d3f6d23..c0e42d162c7c 100644
--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
+++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
@@ -3495,7 +3495,7 @@ static void i40e_irq_affinity_notify(struct 
irq_affinity_notify *notify,
struct i40e_q_vector *q_vector =
container_of(notify, struct i40e_q_vector, affinity_notify);
 
-   q_vector->affinity_mask = *mask;
+   cpumask_copy(&q_vector->affinity_mask, mask);
 }
 
 /**
diff --git a/drivers/net/ethernet/intel/i40evf/i40evf_main.c 
b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
index 7c213a347909..a4b60367ecce 100644
--- a/drivers/net/ethernet/intel/i40evf/i40evf_main.c
+++ b/drivers/net/ethernet/intel/i40evf/i40evf_main.c
@@ -520,7 +520,7 @@ static void i40evf_irq_affinity_notify(struct 
irq_affinity_notify *notify,
struct i40e_q_vector *q_vector =
container_of(notify, struct i40e_q_vector, affinity_notify);
 
-   q_vector->affinity_mask = *mask;
+   cpumask_copy(&q_vector->affinity_mask, mask);
 }
 
 /**
-- 
2.9.4



Re: [PATCH] i40e{,vf}: Fix out-of-bound cpumask read in IRQ affinity handler

2017-08-15 Thread Stefano Brivio
On Tue, 15 Aug 2017 12:30:14 +0200
Stefano Brivio  wrote:

> The cpumask used in i40e{,vf}_irq_affinity_notify() is allocated
> by irq_affinity_notify() with alloc_cpumask_var(), which doesn't
> allocate NR_CPUS bits, but only nr_cpumask_bits bits. If we just
> dereference it, we'll read way more than what is allocated, e.g.
> 1024 bytes vs. 8 bytes allocated on x86_64 machine with 24 CPUs.

Sorry, just two minutes after sending this I noticed Juergen submitted
the same fixes on Saturday:

From: Juergen Gross 
To: linux-ker...@vger.kernel.org, netdev@vger.kernel.org, 
intel-wired-...@lists.osuosl.org
Cc: jeffrey.t.kirs...@intel.com, Juergen Gross , 
sta...@vger.kernel.org
Subject: [PATCH] net/i40e: use cpumask_copy() for assigning cpumask
Date: Sat, 12 Aug 2017 18:09:46 +0200

Please discard.

--
Stefano


RE: [PATCH] New Chapter on CodingStyle .

2017-08-15 Thread David Laight
From: Jonathan Corbet
> Sent: 12 August 2017 15:55
...
> > +   Chapter 20: Put values on initialisers without exception
> > +
> > +When declaring variables on functions must put values:
> 
> Thanks for sending a patch for the kernel's documentation.
> Unfortunately, I can't accept this patch for a couple of reasons:
...
> - The coding style document is there to describe the community's
>   standards for kernel code.  It is *not* a mechanism for imposing new
>   standards.  If you really think that the kernel community should adopt
>   this rule, you will need to argue for it on the mailing lists.  I will
>   say, though, that I do not expect that this effort would be successful.

I'd even go as far as suggesting almost the opposite.
Declarations should only have initialisers if the value is constant.

David



[PATCH v2 net-next 0/3] Use correct sk->sk_prot for IPV6

2017-08-15 Thread Ilya Lesokhin
The tls ulp overrides sk->sk_prot with new tls specific proto structs.
The tls specific structs were previously based on the ipv4 specific
tcp_prot struct.
As a result, attaching the tls ulp to an ipv6 tcp socket replaced
some ipv6 callbacks with the ipv4 equivalents.

This patch adds ipv6 tls proto structs and uses them when
attached to ipv6 sockets.

Changes since v1:
- TLS now depends on IPV6
This fixes compilation issues when TLS is built-in and IPV6 is a module.
The downside should be small as it is unlikely that there are kernel TLS 
users who can't afford to include IPV6 in their kernel.
- tls_init now checks sk->sk_prot directly
This is somewhat safer than checking indirectly through sk->sk_family

Ilya Lesokhin (3):
  ipv6: Prevent unexpected sk->sk_prot changes
  net: Export tcpv6_prot
  tls: Use correct sk->sk_prot for IPV6

 net/ipv6/ipv6_sockglue.c | 12 
 net/ipv6/tcp_ipv6.c  |  1 +
 net/tls/Kconfig  |  1 +
 net/tls/tls_main.c   | 50 
 4 files changed, 52 insertions(+), 12 deletions(-)

-- 
1.8.3.1



[PATCH v2 net-next 3/3] tls: Use correct sk->sk_prot for IPV6

2017-08-15 Thread Ilya Lesokhin
The tls ulp overrides sk->sk_prot with new tls specific proto structs.
The tls specific structs were previously based on the ipv4 specific
tcp_prot struct.
As a result, attaching the tls ulp to an ipv6 tcp socket replaced
some ipv6 callbacks with the ipv4 equivalents.

This patch adds ipv6 tls proto structs and uses them when
attached to ipv6 sockets.

Signed-off-by: Boris Pismenny 
Signed-off-by: Ilya Lesokhin 
---
 net/tls/Kconfig|  1 +
 net/tls/tls_main.c | 50 ++
 2 files changed, 39 insertions(+), 12 deletions(-)

diff --git a/net/tls/Kconfig b/net/tls/Kconfig
index eb58303..7e9cf8b 100644
--- a/net/tls/Kconfig
+++ b/net/tls/Kconfig
@@ -7,6 +7,7 @@ config TLS
select CRYPTO
select CRYPTO_AES
select CRYPTO_GCM
+   select IPV6
default n
---help---
Enable kernel support for TLS protocol. This allows symmetric
diff --git a/net/tls/tls_main.c b/net/tls/tls_main.c
index 60aff60..9caad11 100644
--- a/net/tls/tls_main.c
+++ b/net/tls/tls_main.c
@@ -40,13 +40,25 @@
 #include 
 
 #include 
+#include 
 
 MODULE_AUTHOR("Mellanox Technologies");
 MODULE_DESCRIPTION("Transport Layer Security Support");
 MODULE_LICENSE("Dual BSD/GPL");
 
-static struct proto tls_base_prot;
-static struct proto tls_sw_prot;
+enum {
+   TLSV4,
+   TLSV6,
+   TLS_NUM_PROTS,
+};
+
+enum {
+   TLS_BASE_TX,
+   TLS_SW_TX,
+   TLS_NUM_CONFIG,
+};
+
+static struct proto tls_prots[TLS_NUM_PROTS][TLS_NUM_CONFIG];
 
 int wait_on_pending_writer(struct sock *sk, long *timeo)
 {
@@ -342,6 +354,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char 
__user *optval,
struct tls_context *ctx = tls_get_ctx(sk);
struct proto *prot = NULL;
int rc = 0;
+   int ip_ver = sk->sk_family == AF_INET6 ? TLSV6 : TLSV4;
 
if (!optval || (optlen < sizeof(*crypto_info))) {
rc = -EINVAL;
@@ -396,7 +409,7 @@ static int do_tls_setsockopt_tx(struct sock *sk, char 
__user *optval,
 
/* currently SW is default, we will have ethtool in future */
rc = tls_set_sw_offload(sk, ctx);
-   prot = &tls_sw_prot;
+   prot = &tls_prots[ip_ver][TLS_SW_TX];
if (rc)
goto err_crypto_info;
 
@@ -443,6 +456,12 @@ static int tls_init(struct sock *sk)
struct inet_connection_sock *icsk = inet_csk(sk);
struct tls_context *ctx;
int rc = 0;
+   int ip_ver = TLSV4;
+
+   if (sk->sk_prot == &tcpv6_prot)
+   ip_ver = TLSV6;
+   else if (sk->sk_prot != &tcp_prot)
+   return -EINVAL;
 
/* allocate tls context */
ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
@@ -453,7 +472,8 @@ static int tls_init(struct sock *sk)
icsk->icsk_ulp_data = ctx;
ctx->setsockopt = sk->sk_prot->setsockopt;
ctx->getsockopt = sk->sk_prot->getsockopt;
-   sk->sk_prot = &tls_base_prot;
+
+   sk->sk_prot = &tls_prots[ip_ver][TLS_BASE_TX];
 out:
return rc;
 }
@@ -464,16 +484,22 @@ static int tls_init(struct sock *sk)
.init   = tls_init,
 };
 
+static void build_protos(struct proto *prot, struct proto *base)
+{
+   prot[TLS_BASE_TX] = *base;
+   prot[TLS_BASE_TX].setsockopt = tls_setsockopt;
+   prot[TLS_BASE_TX].getsockopt = tls_getsockopt;
+
+   prot[TLS_SW_TX] = prot[TLS_BASE_TX];
+   prot[TLS_SW_TX].close   = tls_sk_proto_close;
+   prot[TLS_SW_TX].sendmsg = tls_sw_sendmsg;
+   prot[TLS_SW_TX].sendpage= tls_sw_sendpage;
+}
+
 static int __init tls_register(void)
 {
-   tls_base_prot   = tcp_prot;
-   tls_base_prot.setsockopt= tls_setsockopt;
-   tls_base_prot.getsockopt= tls_getsockopt;
-
-   tls_sw_prot = tls_base_prot;
-   tls_sw_prot.sendmsg = tls_sw_sendmsg;
-   tls_sw_prot.sendpage= tls_sw_sendpage;
-   tls_sw_prot.close   = tls_sk_proto_close;
+   build_protos(tls_prots[TLSV4], &tcp_prot);
+   build_protos(tls_prots[TLSV6], &tcpv6_prot);
 
tcp_register_ulp(&tcp_tls_ulp_ops);
 
-- 
1.8.3.1



[PATCH v2 net-next 1/3] ipv6: Prevent unexpected sk->sk_prot changes

2017-08-15 Thread Ilya Lesokhin
With this patch, the IPV6 code ensures that only sockets with the
expected sk->sk_prot are converted to IPV4.

Signed-off-by: Boris Pismenny 
---
 net/ipv6/ipv6_sockglue.c | 12 
 1 file changed, 12 insertions(+)

diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
index 02d795f..318cd344 100644
--- a/net/ipv6/ipv6_sockglue.c
+++ b/net/ipv6/ipv6_sockglue.c
@@ -174,6 +174,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
if (val == PF_INET) {
struct ipv6_txoptions *opt;
struct sk_buff *pktopt;
+   struct proto *expected_prot;
 
if (sk->sk_type == SOCK_RAW)
break;
@@ -199,6 +200,17 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
int optname,
break;
}
 
+   if (sk->sk_protocol == IPPROTO_TCP &&
+   sk->sk_prot != &tcpv6_prot)
+   break;
+
+   expected_prot = &udpv6_prot;
+   if (sk->sk_protocol == IPPROTO_UDPLITE)
+   expected_prot = &udplitev6_prot;
+
+   if (sk->sk_prot != expected_prot)
+   break;
+
fl6_free_socklist(sk);
__ipv6_sock_mc_close(sk);
 
-- 
1.8.3.1



[PATCH v2 net-next 2/3] net: Export tcpv6_prot

2017-08-15 Thread Ilya Lesokhin
Want to be able to use these in TLS.

Signed-off-by: Boris Pismenny 
---
 net/ipv6/tcp_ipv6.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
index 2521690..ef8d5b4 100644
--- a/net/ipv6/tcp_ipv6.c
+++ b/net/ipv6/tcp_ipv6.c
@@ -1944,6 +1944,7 @@ struct proto tcpv6_prot = {
 #endif
.diag_destroy   = tcp_abort,
 };
+EXPORT_SYMBOL_GPL(tcpv6_prot);
 
 static struct inet6_protocol tcpv6_protocol = {
.early_demux=   tcp_v6_early_demux,
-- 
1.8.3.1



[PATCH net] ipv6: fix NULL dereference in ip6_route_dev_notify()

2017-08-15 Thread Eric Dumazet
From: Eric Dumazet 

Based on a syzkaller report [1], I found that a per cpu allocation
failure in snmp6_alloc_dev() would then lead to NULL dereference in
ip6_route_dev_notify().

It seems this is a very old bug, thus no Fixes tag in this submission.

Let's add in6_dev_put_clear() helper, as we will probably use
it elsewhere (once available/present in net-next)


[1] 
kasan: CONFIG_KASAN_INLINE enabled
kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault:  [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 1 PID: 17294 Comm: syz-executor6 Not tainted 4.13.0-rc2+ #10
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
task: 88019f456680 task.stack: 8801c6e58000
RIP: 0010:__read_once_size include/linux/compiler.h:250 [inline]
RIP: 0010:atomic_read arch/x86/include/asm/atomic.h:26 [inline]
RIP: 0010:refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178
RSP: 0018:8801c6e5f1b0 EFLAGS: 00010202
RAX: 0037 RBX: dc00 RCX: c90005d25000
RDX: 8801c6e5f218 RSI: 82342bbf RDI: 0001
RBP: 8801c6e5f240 R08: 0001 R09: 
R10:  R11:  R12: 110038dcbe37
R13: 0006 R14: 0001 R15: 01b8
FS:  7f21e0429700() GS:8801dc10() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 001ddbc22000 CR3: 0001d632b000 CR4: 001426e0
DR0: 2000 DR1:  DR2: 
DR3:  DR6: 0ff0 DR7: 0600
Call Trace:
 refcount_dec_and_test+0x1a/0x20 lib/refcount.c:211
 in6_dev_put include/net/addrconf.h:335 [inline]
 ip6_route_dev_notify+0x1c9/0x4a0 net/ipv6/route.c:3732
 notifier_call_chain+0x136/0x2c0 kernel/notifier.c:93
 __raw_notifier_call_chain kernel/notifier.c:394 [inline]
 raw_notifier_call_chain+0x2d/0x40 kernel/notifier.c:401
 call_netdevice_notifiers_info+0x51/0x90 net/core/dev.c:1678
 call_netdevice_notifiers net/core/dev.c:1694 [inline]
 rollback_registered_many+0x91c/0xe80 net/core/dev.c:7107
 rollback_registered+0x1be/0x3c0 net/core/dev.c:7149
 register_netdevice+0xbcd/0xee0 net/core/dev.c:7587
 register_netdev+0x1a/0x30 net/core/dev.c:7669
 loopback_net_init+0x76/0x160 drivers/net/loopback.c:214
 ops_init+0x10a/0x570 net/core/net_namespace.c:118
 setup_net+0x313/0x710 net/core/net_namespace.c:294
 copy_net_ns+0x27c/0x580 net/core/net_namespace.c:418
 create_new_namespaces+0x425/0x880 kernel/nsproxy.c:107
 unshare_nsproxy_namespaces+0xae/0x1e0 kernel/nsproxy.c:206
 SYSC_unshare kernel/fork.c:2347 [inline]
 SyS_unshare+0x653/0xfa0 kernel/fork.c:2297
 entry_SYSCALL_64_fastpath+0x1f/0xbe
RIP: 0033:0x4512c9
RSP: 002b:7f21e0428c08 EFLAGS: 0216 ORIG_RAX: 0110
RAX: ffda RBX: 00718150 RCX: 004512c9
RDX:  RSI:  RDI: 62020200
RBP: 0086 R08:  R09: 
R10:  R11: 0216 R12: 004b973d
R13:  R14: 2001d000 R15: 02dd
Code: 50 2b 34 82 c7 00 f1 f1 f1 f1 c7 40 04 04 f2 f2 f2 c7 40 08 f3 f3
f3 f3 e8 a1 43 39 ff 4c 89 f8 48 8b 95 70 ff ff ff 48 c1 e8 03 <0f> b6
0c 18 4c 89 f8 83 e0 07 83 c0 03 38 c8 7c 08 84 c9 0f 85 
RIP: __read_once_size include/linux/compiler.h:250 [inline] RSP:
8801c6e5f1b0
RIP: atomic_read arch/x86/include/asm/atomic.h:26 [inline] RSP:
8801c6e5f1b0
RIP: refcount_sub_and_test+0x7d/0x1b0 lib/refcount.c:178 RSP:
8801c6e5f1b0
---[ end trace e441d046c6410d31 ]---

Signed-off-by: Eric Dumazet 
Reported-by: Dmitry Vyukov 
---
 include/net/addrconf.h |   10 ++
 net/ipv6/route.c   |6 +++---
 2 files changed, 13 insertions(+), 3 deletions(-)

diff --git a/include/net/addrconf.h b/include/net/addrconf.h
index 6df79e96a780..f44ff2476758 100644
--- a/include/net/addrconf.h
+++ b/include/net/addrconf.h
@@ -336,6 +336,16 @@ static inline void in6_dev_put(struct inet6_dev *idev)
in6_dev_finish_destroy(idev);
 }
 
+static inline void in6_dev_put_clear(struct inet6_dev **pidev)
+{
+   struct inet6_dev *idev = *pidev;
+
+   if (idev) {
+   in6_dev_put(idev);
+   *pidev = NULL;
+   }
+}
+
 static inline void __in6_dev_put(struct inet6_dev *idev)
 {
refcount_dec(&idev->refcnt);
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 99d4727f2b18..94d6a13d47f0 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3721,10 +3721,10 @@ static int ip6_route_dev_notify(struct notifier_block 
*this,
/* NETDEV_UNREGISTER could be fired for multiple times by
 * netdev_wait_allrefs(). Make sure we only call this once.
 */
-   in6_dev_put(net->ipv6.ip6_null_entry->rt6i_idev);
+   in6_dev_put_clear(&net->ipv6.ip6_n

RE: [PATCH] of_mdio: merge branch tails in of_phy_register_fixed_link()

2017-08-15 Thread David Laight
From: David Miller
> Sent: 14 August 2017 04:09
> From: Sergei Shtylyov 
> Date: Sun, 13 Aug 2017 00:03:06 +0300
> 
> > Looks  like gcc isn't always able to figure  out that 3 *if* branches in
> > of_phy_register_fixed_link() calling fixed_phy_register() at their ends
> > are similar enough and thus can be merged. The "manual" merge saves 40
> > bytes of the object code (AArch64 gcc 4.8.5), and still saves 12 bytes
> > even  if gcc was able to merge the branch tails (ARM gcc 4.8.5)...
> >
> > Signed-off-by: Sergei Shtylyov 
> 
> Applied, but if two instances of the "same" compiler just with
> different targets changes the optimization, it could be because of a
> tradeoff which is specific to parameters expressed in that target's
> backend.
> 
> So in the future we should probably back away from trying to "help"
> the compiler in this way.

Probably a trade off between code size and execution speed.
I've had 'fun' trying to stop gcc merging tail code paths
in order to avoid the cost of the branch instruction.

David



[PATCH] net: Fix a typo in comment about sock flags.

2017-08-15 Thread Tonghao Zhang
Signed-off-by: Tonghao Zhang 
---
 include/linux/net.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/include/linux/net.h b/include/linux/net.h
index b5c15b3..d97d80d 100644
--- a/include/linux/net.h
+++ b/include/linux/net.h
@@ -37,7 +37,7 @@
 
 /* Historically, SOCKWQ_ASYNC_NOSPACE & SOCKWQ_ASYNC_WAITDATA were located
  * in sock->flags, but moved into sk->sk_wq->flags to be RCU protected.
- * Eventually all flags will be in sk->sk_wq_flags.
+ * Eventually all flags will be in sk->sk_wq->flags.
  */
 #define SOCKWQ_ASYNC_NOSPACE   0
 #define SOCKWQ_ASYNC_WAITDATA  1
-- 
1.8.3.1



[PATCH net V2] openvswitch: fix skb_panic due to the incorrect actions attrlen

2017-08-15 Thread Liping Zhang
From: Liping Zhang 

For sw_flow_actions, the actions_len only represents the kernel part's
size, and when we dump the actions to the userspace, we will do the
conversions, so its true size may become bigger than the actions_len.

But unfortunately, for OVS_PACKET_ATTR_ACTIONS, we use the actions_len
to alloc the skbuff, so the user_skb's size may become insufficient and
oops will happen like this:
  skbuff: skb_over_panic: text:8148fabf len:1749 put:157 head:
  881300f39000 data:881300f39000 tail:0x6d5 end:0x6c0 dev:
  [ cut here ]
  kernel BUG at net/core/skbuff.c:129!
  [...]
  Call Trace:
   
   [] skb_put+0x43/0x44
   [] skb_zerocopy+0x6c/0x1f4
   [] queue_userspace_packet+0x3a3/0x448 [openvswitch]
   [] ovs_dp_upcall+0x30/0x5c [openvswitch]
   [] output_userspace+0x132/0x158 [openvswitch]
   [] ? ip6_rcv_finish+0x74/0x77 [ipv6]
   [] do_execute_actions+0xcc1/0xdc8 [openvswitch]
   [] ovs_execute_actions+0x74/0x106 [openvswitch]
   [] ovs_dp_process_packet+0xe1/0xfd [openvswitch]
   [] ? key_extract+0x63c/0x8d5 [openvswitch]
   [] ovs_vport_receive+0xa1/0xc3 [openvswitch]
  [...]

Also we can find that the actions_len is much smaller than the orig_len:
  crash> struct sw_flow_actions 0x8812f539d000
  struct sw_flow_actions {
rcu = {
  next = 0x8812f5398800,
  func = 0xe3b00035db32
},
orig_len = 1384,
actions_len = 592,
actions = 0x8812f539d01c
  }

So as a quick fix, use the orig_len instead of the actions_len to alloc
the user_skb.

Last, this oops happened on our system running a relatively old kernel, but
the same risk still exists on the mainline, since we have used the wrong
actions_len from the beginning.

Fixes: ccea74457bbd ("openvswitch: include datapath actions with sampled-packet 
upcall to userspace")
Cc: Neil McKee 
Signed-off-by: Liping Zhang 
---
 V2: move actions_attrlen into ovs_skb_cb, which makes the code
 cleaner, as suggested by Pravin Shelar.

 net/openvswitch/actions.c  | 2 ++
 net/openvswitch/datapath.c | 2 +-
 net/openvswitch/datapath.h | 3 +++
 3 files changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index e4610676299b..f849ef52853f 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -921,6 +921,7 @@ static int output_userspace(struct datapath *dp, struct 
sk_buff *skb,
/* Include actions. */
upcall.actions = actions;
upcall.actions_len = actions_len;
+   upcall.actions_attrlen = OVS_CB(skb)->acts_origlen;
break;
}
 
@@ -1337,6 +1338,7 @@ int ovs_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
goto out;
}
 
+   OVS_CB(skb)->acts_origlen = acts->orig_len;
err = do_execute_actions(dp, skb, key,
 acts->actions, acts->actions_len);
 
diff --git a/net/openvswitch/datapath.c b/net/openvswitch/datapath.c
index 45fe8c8a884d..66162e64e8b5 100644
--- a/net/openvswitch/datapath.c
+++ b/net/openvswitch/datapath.c
@@ -398,7 +398,7 @@ static size_t upcall_msg_size(const struct dp_upcall_info 
*upcall_info,
 
/* OVS_PACKET_ATTR_ACTIONS */
if (upcall_info->actions_len)
-   size += nla_total_size(upcall_info->actions_len);
+   size += nla_total_size(upcall_info->actions_attrlen);
 
/* OVS_PACKET_ATTR_MRU */
if (upcall_info->mru)
diff --git a/net/openvswitch/datapath.h b/net/openvswitch/datapath.h
index 5d8dcd88815f..8fd902c946ff 100644
--- a/net/openvswitch/datapath.h
+++ b/net/openvswitch/datapath.h
@@ -99,11 +99,13 @@ struct datapath {
  * when a packet is received by OVS.
  * @mru: The maximum received fragement size; 0 if the packet is not
  * fragmented.
+ * @acts_origlen: The netlink size of the flow actions applied to this skb.
  * @cutlen: The number of bytes from the packet end to be removed.
  */
 struct ovs_skb_cb {
struct vport*input_vport;
u16 mru;
+   u16 acts_origlen;
u32 cutlen;
 };
 #define OVS_CB(skb) ((struct ovs_skb_cb *)(skb)->cb)
@@ -124,6 +126,7 @@ struct dp_upcall_info {
const struct nlattr *userdata;
const struct nlattr *actions;
int actions_len;
+   int actions_attrlen;
u32 portid;
u8 cmd;
u16 mru;
-- 
2.13.4




pull-request: wireless-drivers 2017-08-15

2017-08-15 Thread Kalle Valo
Hi Dave,

more fixes to net tree for 4.13. More info in the signed tag below,
please let me know if there are any problems.

Kalle

The following changes since commit 5f5d03143de5e0c593da4ab18fc6393c2815e108:

  brcmfmac: fix memleak due to calling brcmf_sdiod_sgtable_alloc() twice 
(2017-07-27 14:03:14 +0300)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/kvalo/wireless-drivers.git 
tags/wireless-drivers-for-davem-2017-08-15

for you to fetch changes up to e9bf53ab1ee34bb05c104bbfd2b77c844773f8e6:

  brcmfmac: feature check for multi-scheduled scan fails on bcm4343x devices 
(2017-08-14 11:09:30 +0300)


wireless-drivers fixes for 4.13

This time quite a few fixes for iwlwifi and one major regression fix
for brcmfmac. For the iwlwifi aggregation bug a small change was
needed for mac80211, but as Johannes is still away the mac80211 patch
is taken via wireless-drivers tree.

brcmfmac

* fix firmware crash (a recent regression in bcm4343{0,1,8})

iwlwifi

* Some simple PCI HW ID fix-ups and additions for family 9000

* Remove a bogus warning message with new FWs (bug #196915)

* Don't allow illegal channel options to be used (bug #195299)

* A fix for checksum offload in family 9000

* A fix for serious throughput degradation in 11ac with multiple streams

* An old bug in SMPS where the firmware was not aware of SMPS changes

* Fix a memory leak in the SAR code

* Fix a stuck queue case in AP mode

* Convert a WARN to a simple debug in a legitimate race case (from
  which we can recover)

* Fix a severe throughput degradation on 9000-family devices due to
  aggregation issues, needed a small change in mac80211


Arend Van Spriel (1):
  brcmfmac: feature check for multi-scheduled scan fails on bcm4343x devices

Avraham Stern (1):
  iwlwifi: mvm: start mac queues when deferred tx frames are purged

Christophe Jaillet (1):
  iwlwifi: mvm: Fix a memory leak in an error handling path in 
'iwl_mvm_sar_get_wgds_table()'

Emmanuel Grumbach (4):
  iwlwifi: mvm: fix TCP CSUM offload with WEP and A000 series
  iwlwifi: add TLV for MLME offload firmware capability
  iwlwifi: split the regulatory rules when the bandwidth flags require it
  iwlwifi: mvm: don't WARN when a legit race happens in A-MPDU

Gregory Greenman (2):
  iwlwifi: mvm: set A-MPDU bit upon empty BA notification from FW
  iwlwifi: mvm: rs: fix TLC statistics collection

Haim Dreyfuss (1):
  iwlwifi: fix fw_pre_next_step to apply also for C step

Kalle Valo (2):
  Merge tag 'iwlwifi-for-kalle-2017-08-02' of 
git://git.kernel.org/.../iwlwifi/iwlwifi-fixes
  Merge tag 'iwlwifi-for-kalle-2018-08-09' of 
git://git.kernel.org/.../iwlwifi/iwlwifi-fixes

Naftali Goldstein (3):
  iwlwifi: mvm: set the RTS_MIMO_PROT bit in flag mask when sending sta to 
fw
  mac80211: add api to start ba session timer expired flow
  iwlwifi: mvm: send delba upon rx ba session timeout

Tzipi Peres (1):
  iwlwifi: add the new 9000 series PCI IDs

 .../wireless/broadcom/brcm80211/brcmfmac/feature.c |  6 --
 drivers/net/wireless/intel/iwlwifi/cfg/9000.c  | 14 +++---
 drivers/net/wireless/intel/iwlwifi/fw/file.h   |  2 ++
 drivers/net/wireless/intel/iwlwifi/iwl-config.h|  8 
 drivers/net/wireless/intel/iwlwifi/iwl-drv.c   |  5 +++--
 drivers/net/wireless/intel/iwlwifi/iwl-nvm-parse.c | 19 +++
 drivers/net/wireless/intel/iwlwifi/mvm/fw.c|  6 --
 drivers/net/wireless/intel/iwlwifi/mvm/mac80211.c  | 12 +++-
 drivers/net/wireless/intel/iwlwifi/mvm/rs.c|  8 
 drivers/net/wireless/intel/iwlwifi/mvm/rxmq.c  | 10 ++
 drivers/net/wireless/intel/iwlwifi/mvm/sta.c   |  7 ---
 drivers/net/wireless/intel/iwlwifi/mvm/tx.c| 12 ++--
 drivers/net/wireless/intel/iwlwifi/pcie/drv.c  | 20 
 include/net/mac80211.h | 15 +++
 net/mac80211/agg-rx.c  | 22 +-
 15 files changed, 126 insertions(+), 40 deletions(-)


Re: [PATCH v2 net-next 1/3] ipv6: Prevent unexpected sk->sk_prot changes

2017-08-15 Thread Eric Dumazet
On Tue, 2017-08-15 at 14:08 +0300, Ilya Lesokhin wrote:
> With this patch IPV6 code ensure that only sockets with the
> expected sk->sk_prot are converted to IPV4.

It looks like you fix a bug added by a recent commit (in net-next, not
in net tree) ?

Please provide Fixes: tag to ease code review and maintenance.

Thanks.




Re: [PATCH net-next] bpf/verifier: track liveness for pruning

2017-08-15 Thread Daniel Borkmann

On 08/14/2017 07:55 PM, Edward Cree wrote:

State of a register doesn't matter if it wasn't read in reaching an exit;
  a write screens off all reads downstream of it from all explored_states
  upstream of it.
This allows us to prune many more branches; here are some processed insn
  counts for some Cilium programs:
Program  before  after
bpf_lb_opt_-DLB_L3.o   6515   3361
bpf_lb_opt_-DLB_L4.o   8976   5176
bpf_lb_opt_-DUNKNOWN.o 2960   1137
bpf_lxc_opt_-DDROP_ALL.o  95412  48537
bpf_lxc_opt_-DUNKNOWN.o  141706  79048
bpf_netdev.o  24251  17995
bpf_overlay.o 10999   9385

The runtime is also improved; here are 'time' results in ms:
Program  before  after
bpf_lb_opt_-DLB_L3.o 24  6
bpf_lb_opt_-DLB_L4.o 26 11
bpf_lb_opt_-DUNKNOWN.o   11  2
bpf_lxc_opt_-DDROP_ALL.o   1288152
bpf_lxc_opt_-DUNKNOWN.o1768257
bpf_netdev.o 62 31
bpf_overlay.o15 13

Signed-off-by: Edward Cree 


Awesome work!

[...]

if (arg_type == ARG_ANYTHING) {
if (is_pointer_value(env, regno)) {
@@ -1639,10 +1675,13 @@ static int check_call(struct bpf_verifier_env *env, int 
func_id, int insn_idx)
}

/* reset caller saved regs */
-   for (i = 0; i < CALLER_SAVED_REGS; i++)
+   for (i = 0; i < CALLER_SAVED_REGS; i++) {
mark_reg_not_init(regs, caller_saved[i]);
+   check_reg_arg(env, i, DST_OP_NO_MARK);


Don't we need the same in check_ld_abs() since we treat it similar
to a function call?


+   }

/* update return register */
+   check_reg_arg(env, BPF_REG_0, DST_OP_NO_MARK);

[...]


[PATCH 0/5] net/9p: Fine-tuning for some function implementations

2017-08-15 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 15 Aug 2017 13:43:21 +0200

Some update suggestions were taken into account
from static source code analysis.

Markus Elfring (5):
  Delete an error message for a failed memory allocation in five functions
  Improve 19 size determinations
  Add a jump target in p9_client_walk()
  Adjust a jump target in p9_client_attach()
  Delete an unnecessary variable initialisation in p9_client_attach()

 net/9p/client.c   | 51 +--
 net/9p/protocol.c |  6 +++---
 net/9p/trans_fd.c | 16 +++-
 net/9p/trans_rdma.c   |  8 +++-
 net/9p/trans_virtio.c |  6 ++
 net/9p/util.c |  2 +-
 6 files changed, 37 insertions(+), 52 deletions(-)

-- 
2.14.0



[PATCH 1/5] net/9p: Delete an error message for a failed memory allocation in five functions

2017-08-15 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 15 Aug 2017 08:25:41 +0200

Omit an extra message for a memory allocation failure in these functions.

This issue was detected by using the Coccinelle software.

Signed-off-by: Markus Elfring 
---
 net/9p/client.c   | 7 ++-
 net/9p/trans_fd.c | 6 ++
 net/9p/trans_rdma.c   | 6 ++
 net/9p/trans_virtio.c | 1 -
 4 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/net/9p/client.c b/net/9p/client.c
index 4674235b0d9b..2273181e9ba9 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -160,11 +160,9 @@ static int parse_opts(char *opts, struct p9_client *clnt)
return 0;
 
tmp_options = kstrdup(opts, GFP_KERNEL);
-   if (!tmp_options) {
-   p9_debug(P9_DEBUG_ERROR,
-"failed to allocate copy of option string\n");
+   if (!tmp_options)
return -ENOMEM;
-   }
+
options = tmp_options;
 
while ((p = strsep(&options, ",")) != NULL) {
@@ -277,7 +275,6 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int 
max_size)
sizeof(struct p9_req_t), GFP_ATOMIC);
 
if (!c->reqs[row]) {
-   pr_err("Couldn't grow tag array\n");
spin_unlock_irqrestore(&c->lock, flags);
return ERR_PTR(-ENOMEM);
}
diff --git a/net/9p/trans_fd.c b/net/9p/trans_fd.c
index ddfa86648f95..3c272e5bc9ea 100644
--- a/net/9p/trans_fd.c
+++ b/net/9p/trans_fd.c
@@ -762,11 +762,9 @@ static int parse_opts(char *params, struct p9_fd_opts 
*opts)
return 0;
 
tmp_options = kstrdup(params, GFP_KERNEL);
-   if (!tmp_options) {
-   p9_debug(P9_DEBUG_ERROR,
-"failed to allocate copy of option string\n");
+   if (!tmp_options)
return -ENOMEM;
-   }
+
options = tmp_options;
 
while ((p = strsep(&options, ",")) != NULL) {
diff --git a/net/9p/trans_rdma.c b/net/9p/trans_rdma.c
index 6d8e3031978f..f98b6aae308b 100644
--- a/net/9p/trans_rdma.c
+++ b/net/9p/trans_rdma.c
@@ -205,11 +205,9 @@ static int parse_opts(char *params, struct p9_rdma_opts 
*opts)
return 0;
 
tmp_options = kstrdup(params, GFP_KERNEL);
-   if (!tmp_options) {
-   p9_debug(P9_DEBUG_ERROR,
-"failed to allocate copy of option string\n");
+   if (!tmp_options)
return -ENOMEM;
-   }
+
options = tmp_options;
 
while ((p = strsep(&options, ",")) != NULL) {
diff --git a/net/9p/trans_virtio.c b/net/9p/trans_virtio.c
index f24b25c25106..8a2cf9748398 100644
--- a/net/9p/trans_virtio.c
+++ b/net/9p/trans_virtio.c
@@ -552,7 +552,6 @@ static int p9_virtio_probe(struct virtio_device *vdev)
 
chan = kmalloc(sizeof(struct virtio_chan), GFP_KERNEL);
if (!chan) {
-   pr_err("Failed to allocate virtio 9P channel\n");
err = -ENOMEM;
goto fail;
}
-- 
2.14.0



Re: [PATCH v2 net-next 1/3] ipv6: Prevent unexpected sk->sk_prot changes

2017-08-15 Thread Eric Dumazet
On Tue, 2017-08-15 at 14:08 +0300, Ilya Lesokhin wrote:
> With this patch IPV6 code ensure that only sockets with the
> expected sk->sk_prot are converted to IPV4.
> 
> Signed-off-by: Boris Pismenny 
> ---
>  net/ipv6/ipv6_sockglue.c | 12 
>  1 file changed, 12 insertions(+)
> 
> diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
> index 02d795f..318cd344 100644
> --- a/net/ipv6/ipv6_sockglue.c
> +++ b/net/ipv6/ipv6_sockglue.c
> @@ -174,6 +174,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int level, 
> int optname,
>   if (val == PF_INET) {
>   struct ipv6_txoptions *opt;
>   struct sk_buff *pktopt;
> + struct proto *expected_prot;
>  
>   if (sk->sk_type == SOCK_RAW)
>   break;
> @@ -199,6 +200,17 @@ static int do_ipv6_setsockopt(struct sock *sk, int 
> level, int optname,
>   break;
>   }
>  
> + if (sk->sk_protocol == IPPROTO_TCP &&
> + sk->sk_prot != &tcpv6_prot)
> + break;
> +
> + expected_prot = &udpv6_prot;
> + if (sk->sk_protocol == IPPROTO_UDPLITE)
> + expected_prot = &udplitev6_prot;
> +
> + if (sk->sk_prot != expected_prot)
> + break;
> +
>   fl6_free_socklist(sk);
>   __ipv6_sock_mc_close(sk);
>  

I am afraid I do not understand this patch at all.

Direct references to tcpv6_prot, udpv6_prot, and udplitev6_prot in
net/ipv6/ipv6_sockglue.c look completely broken.

Please provide something cleaner, maybe by adding a new method
(implementation would then be provided in TCP / UDP code )





[PATCH 2/5] net/9p: Improve 19 size determinations

2017-08-15 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 15 Aug 2017 09:36:20 +0200

Replace the specification of data structures by variable references
as the parameter for the operator "sizeof" to make the corresponding size
determination a bit safer according to the Linux coding style convention.

Signed-off-by: Markus Elfring 
---
 net/9p/client.c   | 19 +--
 net/9p/protocol.c |  6 +++---
 net/9p/trans_fd.c | 10 +-
 net/9p/trans_rdma.c   |  2 +-
 net/9p/trans_virtio.c |  5 ++---
 net/9p/util.c |  2 +-
 6 files changed, 21 insertions(+), 23 deletions(-)

diff --git a/net/9p/client.c b/net/9p/client.c
index 2273181e9ba9..2ca55d4b0b7d 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -272,8 +272,8 @@ p9_tag_alloc(struct p9_client *c, u16 tag, unsigned int 
max_size)
while (tag >= c->max_tag) {
row = (tag / P9_ROW_MAXTAG);
c->reqs[row] = kcalloc(P9_ROW_MAXTAG,
-   sizeof(struct p9_req_t), GFP_ATOMIC);
-
+  sizeof(*c->reqs[row]),
+  GFP_ATOMIC);
if (!c->reqs[row]) {
spin_unlock_irqrestore(&c->lock, flags);
return ERR_PTR(-ENOMEM);
@@ -907,7 +907,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
unsigned long flags;
 
p9_debug(P9_DEBUG_FID, "clnt %p\n", clnt);
-   fid = kmalloc(sizeof(struct p9_fid), GFP_KERNEL);
+   fid = kmalloc(sizeof(*fid), GFP_KERNEL);
if (!fid)
return ERR_PTR(-ENOMEM);
 
@@ -918,7 +918,7 @@ static struct p9_fid *p9_fid_create(struct p9_client *clnt)
}
fid->fid = ret;
 
-   memset(&fid->qid, 0, sizeof(struct p9_qid));
+   memset(&fid->qid, 0, sizeof(fid->qid));
fid->mode = -1;
fid->uid = current_fsuid();
fid->clnt = clnt;
@@ -1015,7 +1015,7 @@ struct p9_client *p9_client_create(const char *dev_name, 
char *options)
char *client_id;
 
err = 0;
-   clnt = kmalloc(sizeof(struct p9_client), GFP_KERNEL);
+   clnt = kmalloc(sizeof(*clnt), GFP_KERNEL);
if (!clnt)
return ERR_PTR(-ENOMEM);
 
@@ -1157,7 +1157,7 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, 
struct p9_fid *afid,
p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n",
 qid.type, (unsigned long long)qid.path, qid.version);
 
-   memmove(&fid->qid, &qid, sizeof(struct p9_qid));
+   memmove(&fid->qid, &qid, sizeof(qid));
 
p9_free_req(clnt, req);
return fid;
@@ -1227,7 +1227,7 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, 
uint16_t nwname,
wqids[count].version);
 
if (nwname)
-   memmove(&fid->qid, &wqids[nwqids - 1], sizeof(struct p9_qid));
+   memmove(&fid->qid, &wqids[nwqids - 1], sizeof(fid->qid));
else
fid->qid = oldfid->qid;
 
@@ -1697,7 +1697,7 @@ struct p9_wstat *p9_client_stat(struct p9_fid *fid)
 {
int err;
struct p9_client *clnt;
-   struct p9_wstat *ret = kmalloc(sizeof(struct p9_wstat), GFP_KERNEL);
+   struct p9_wstat *ret = kmalloc(sizeof(*ret), GFP_KERNEL);
struct p9_req_t *req;
u16 ignored;
 
@@ -1749,8 +1749,7 @@ struct p9_stat_dotl *p9_client_getattr_dotl(struct p9_fid 
*fid,
 {
int err;
struct p9_client *clnt;
-   struct p9_stat_dotl *ret = kmalloc(sizeof(struct p9_stat_dotl),
-   GFP_KERNEL);
+   struct p9_stat_dotl *ret = kmalloc(sizeof(*ret), GFP_KERNEL);
struct p9_req_t *req;
 
p9_debug(P9_DEBUG_9P, ">>> TGETATTR fid %d, request_mask %lld\n",
diff --git a/net/9p/protocol.c b/net/9p/protocol.c
index 16e10680518c..b8dc30f7de07 100644
--- a/net/9p/protocol.c
+++ b/net/9p/protocol.c
@@ -200,7 +200,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const 
char *fmt,
struct p9_wstat *stbuf =
va_arg(ap, struct p9_wstat *);
 
-   memset(stbuf, 0, sizeof(struct p9_wstat));
+   memset(stbuf, 0, sizeof(*stbuf));
stbuf->n_uid = stbuf->n_muid = INVALID_UID;
stbuf->n_gid = INVALID_GID;
 
@@ -286,7 +286,7 @@ p9pdu_vreadf(struct p9_fcall *pdu, int proto_version, const 
char *fmt,
if (!errcode) {
*wqids =
kmalloc(*nwqid *
-   sizeof(struct p9_qid),
+   sizeof(**wqids),
GFP_NOFS);
if (*wqids == NULL)

[PATCH 3/5] net/9p: Add a jump target in p9_client_walk()

2017-08-15 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 15 Aug 2017 10:07:22 +0200

Replace a variable assignment by a goto statement so that an extra check
will be avoided at the end of this function.

Signed-off-by: Markus Elfring 
---
 net/9p/client.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/net/9p/client.c b/net/9p/client.c
index 2ca55d4b0b7d..6c2fc796edfb 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1237,12 +1237,11 @@ struct p9_fid *p9_client_walk(struct p9_fid *oldfid, 
uint16_t nwname,
 clunk_fid:
kfree(wqids);
p9_client_clunk(fid);
-   fid = NULL;
-
+   goto exit;
 error:
if (fid && (fid != oldfid))
p9_fid_destroy(fid);
-
+exit:
return ERR_PTR(err);
 }
 EXPORT_SYMBOL(p9_client_walk);
-- 
2.14.0



[PATCH 4/5] net/9p: Adjust a jump target in p9_client_attach()

2017-08-15 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 15 Aug 2017 11:17:23 +0200

Adjust jump labels so that the function implementation becomes smaller.

Delete an extra variable assignment and a check (at the end of
this function).

Signed-off-by: Markus Elfring 
---
 net/9p/client.c | 18 +++---
 1 file changed, 7 insertions(+), 11 deletions(-)

diff --git a/net/9p/client.c b/net/9p/client.c
index 6c2fc796edfb..38c08171acc6 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1133,25 +1133,23 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, 
struct p9_fid *afid,
p9_debug(P9_DEBUG_9P, ">>> TATTACH afid %d uname %s aname %s\n",
 afid ? afid->fid : -1, uname, aname);
fid = p9_fid_create(clnt);
-   if (IS_ERR(fid)) {
-   err = PTR_ERR(fid);
-   fid = NULL;
-   goto error;
-   }
+   if (IS_ERR(fid))
+   return fid;
+
fid->uid = n_uname;
 
req = p9_client_rpc(clnt, P9_TATTACH, "ddss?u", fid->fid,
afid ? afid->fid : P9_NOFID, uname, aname, n_uname);
if (IS_ERR(req)) {
err = PTR_ERR(req);
-   goto error;
+   goto destroy_fid;
}
 
err = p9pdu_readf(req->rc, clnt->proto_version, "Q", &qid);
if (err) {
trace_9p_protocol_dump(clnt, req->rc);
p9_free_req(clnt, req);
-   goto error;
+   goto destroy_fid;
}
 
p9_debug(P9_DEBUG_9P, "<<< RATTACH qid %x.%llx.%x\n",
@@ -1161,10 +1159,8 @@ struct p9_fid *p9_client_attach(struct p9_client *clnt, 
struct p9_fid *afid,
 
p9_free_req(clnt, req);
return fid;
-
-error:
-   if (fid)
-   p9_fid_destroy(fid);
+destroy_fid:
+   p9_fid_destroy(fid);
return ERR_PTR(err);
 }
 EXPORT_SYMBOL(p9_client_attach);
-- 
2.14.0



[PATCH 5/5] net/9p: Delete an unnecessary variable initialisation in p9_client_attach()

2017-08-15 Thread SF Markus Elfring
From: Markus Elfring 
Date: Tue, 15 Aug 2017 11:25:31 +0200

The local variable "err" will eventually be set to an appropriate value
a bit later. Thus omit the explicit initialisation at the beginning.

Signed-off-by: Markus Elfring 
---
 net/9p/client.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/9p/client.c b/net/9p/client.c
index 38c08171acc6..1d59db9aafb3 100644
--- a/net/9p/client.c
+++ b/net/9p/client.c
@@ -1124,7 +1124,7 @@ EXPORT_SYMBOL(p9_client_begin_disconnect);
 struct p9_fid *p9_client_attach(struct p9_client *clnt, struct p9_fid *afid,
const char *uname, kuid_t n_uname, const char *aname)
 {
-   int err = 0;
+   int err;
struct p9_req_t *req;
struct p9_fid *fid;
struct p9_qid qid;
-- 
2.14.0



Re: general protection fault in fib_dump_info

2017-08-15 Thread Florian Westphal
idaifish  wrote:
> Syzkaller hit 'general protection fault in fib_dump_info' bug on
> commit 4.13-rc5..

CC Roopa

> Guilty file: net/ipv4/fib_semantics.c
> 
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault:  [#1] SMP KASAN
> Modules linked in:
> CPU: 0 PID: 2808 Comm: syz-executor0 Not tainted 4.13.0-rc5 #1
> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
> Ubuntu-1.8.2-1ubuntu1 04/01/2014
> task: 880078562700 task.stack: 88007811
> RIP: 0010:fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314
> RSP: 0018:880078117010 EFLAGS: 00010206
> RAX: dc00 RBX: 00fe RCX: 0002
> RDX: 0006 RSI: 880078117084 RDI: 0030
> RBP: 880078117268 R08: 000c R09: 8800780d80c8
> R10: 58d629b4 R11: 67fce681 R12: 
> R13: 8800784bd540 R14: 8800780d80b5 R15: 8800780d80a4
> FS:  022fa940() GS:88007fc0() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 004387d0 CR3: 79135000 CR4: 06f0
> Call Trace:
>  inet_rtm_getroute+0xc89/0x1f50 net/ipv4/route.c:2766
>  rtnetlink_rcv_msg+0x288/0x680 net/core/rtnetlink.c:4217

Seems like this is from
b61798130f1be5bff08712308126c2d7ebe390ef

Roopa, it seems to assume res.fi != NULL, but afaics there
is no guarantee, f.e. in ip_route_input_rcu() in the multicast
branch res isn't changed at all.

If that's true, we might need this fix?

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 7effa62beed3..fc0708f7792d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2763,7 +2763,7 @@ static int inet_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh,
if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)
table_id = rt->rt_table_id;
 
-   if (rtm->rtm_flags & RTM_F_FIB_MATCH)
+   if ((rtm->rtm_flags & RTM_F_FIB_MATCH) && res.fi)
err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,
nlh->nlmsg_seq, RTM_NEWROUTE, table_id,
rt->rt_type, res.prefix, res.prefixlen,
fl4.flowi4_tos, res.fi, 0);



Mellanox driver / can't access /proc/irq/XX/smp_affinity after upgrade from linux-4.13-rc4-next-20170811 to 4.13.0-rc5-next-20170815

2017-08-15 Thread Paweł Staszewski

Hi


After upgrading the kernel to the latest -next, there is a problem writing to
smp_affinity for the Mellanox driver.


For example - Mellanox:

cat /proc/interrupts | grep ml
 217:  3  0  0  0 0  0  
0  0  0  0 0  0  0  
0  0  0 0  0  0  0  
0  0 0  0  0  0  0  0 
0  0  0  0  0  0 0  
0  0  0  0  0 0  0  
0  0  0  0 0  0  0  
0  0  0 0  0  0  0   PCI-MSI 
91750400-edge mlx5_pages_eq@pci::af:00.0
 218:  12237  0  0  0 0  0  
0  0  0  0 0  0  0  
0  0  0 0  0  0  0  
0  0 0  0  0  0  0  0 
0  0  0  0  0  0 0  
0  0  0  0  0 0  0  
0  0  0  0 0  0  0  
0  0  0 0  0  0  0   PCI-MSI 
91750401-edge mlx5_cmd_eq@pci::af:00.0


cat /proc/irq/218/smp_affinity
ff,

echo  "ff, " > /proc/irq/218/smp_affinity
-su: echo: write error: Input/output error


For intel ixgbe:

  90:  1  0  0  0 0  0  
0  0  0  0 0  0  0  
0  0  0 0  0  0  0  
0  0 0  0  0  0  0  0 
0  0  0  0  0  0 0  
0  0  0  0  0 0  0  
0  0  0  0 0  0  0  
0  0  0 0  0  0  0   PCI-MSI 
12589056-edge enp24s0f3


cat /proc/irq/90/smp_affinity
0003ff,f0003fff

 echo "0003ff,f0003fff" > /proc/irq/90/smp_affinity

Without errors.



BR

Pawel



[PATCH net] ipv4: fix NULL dereference in free_fib_info_rcu()

2017-08-15 Thread Eric Dumazet
From: Eric Dumazet 

If fi->fib_metrics could not be allocated in fib_create_info()
we attempt to dereference a NULL pointer in free_fib_info_rcu() :

m = fi->fib_metrics;
if (m != &dst_default_metrics && atomic_dec_and_test(&m->refcnt))
kfree(m);

Before my recent patch, we used to call kfree(NULL) and nothing wrong
happened.

Instead of using RCU to defer freeing while we are under memory stress,
it seems better to take immediate action.

This was reported by syzkaller team.

Fixes: 3fb07daff8e9 ("ipv4: add reference counting to metrics")
Signed-off-by: Eric Dumazet 
Reported-by: Dmitry Vyukov 
---
 net/ipv4/fib_semantics.c |   12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index b8d18171cca3..ec3a9ce281a6 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -1083,15 +1083,17 @@ struct fib_info *fib_create_info(struct fib_config *cfg,
fi = kzalloc(sizeof(*fi)+nhs*sizeof(struct fib_nh), GFP_KERNEL);
if (!fi)
goto failure;
-   fib_info_cnt++;
if (cfg->fc_mx) {
fi->fib_metrics = kzalloc(sizeof(*fi->fib_metrics), GFP_KERNEL);
-   if (!fi->fib_metrics)
-   goto failure;
+   if (unlikely(!fi->fib_metrics)) {
+   kfree(fi);
+   return ERR_PTR(err);
+   }
atomic_set(&fi->fib_metrics->refcnt, 1);
-   } else
+   } else {
fi->fib_metrics = (struct dst_metrics *)&dst_default_metrics;
-
+   }
+   fib_info_cnt++;
fi->fib_net = net;
fi->fib_protocol = cfg->fc_protocol;
fi->fib_scope = cfg->fc_scope;




RE: [iproute PATCH 51/51] lib/bpf: Check return value of write()

2017-08-15 Thread David Laight
From: Phil Sutter
> Sent: 12 August 2017 13:05
> This is merely to silence the compiler warning. If write to stderr
> failed, assume that printing an error message will fail as well so don't
> even try.
> 
> Signed-off-by: Phil Sutter 
> ---
>  lib/bpf.c | 3 ++-
>  1 file changed, 2 insertions(+), 1 deletion(-)
> 
> diff --git a/lib/bpf.c b/lib/bpf.c
> index 1dcb261dc915f..825e071cea572 100644
> --- a/lib/bpf.c
> +++ b/lib/bpf.c
> @@ -591,7 +591,8 @@ int bpf_trace_pipe(void)
> 
>   ret = read(fd, buff, sizeof(buff) - 1);
>   if (ret > 0) {
> - write(2, buff, ret);
> + if (write(STDERR_FILENO, buff, ret) != ret)
> + return -1;
>   fflush(stderr);
>   }

WTF is this code doing anyway?
write() is a system call, fflush() writes out any data buffered in the
stdio stream.
If there was anything buffered you'd want to output it earlier.
Otherwise if it is going to use fflush() it should be using fwrite().

I presume the function is allowed to write to stderr - since in general
library functions shouldn't assume fd 0/1/2 or stdin/out/err are valid.
There is a lot of code out there that does close(0); close(1); close(2);
but leaves stdout/err valid. Call printf() instead of sprintf() and eventually
10k of data gets written somewhere rather unexpected.

If it is a copy loop, what is wrong with the last byte of buff[]?
It is valid for write() to return a partial length - the code should
probably loop until all the data is accepted (or error).

David




RE: [net-next 08/15] i40e/i40evf: organize and re-number feature flags

2017-08-15 Thread David Laight
From: Keller, Jacob E
> Sent: 14 August 2017 23:11
> > From: David Miller [mailto:da...@davemloft.net]
> > Sent: Saturday, August 12, 2017 1:04 PM
> > From: Jeff Kirsher 
> > Date: Sat, 12 Aug 2017 04:08:41 -0700
> >
> > > Also ensure that the flags variable is actually a u64 to guarantee
> > > 64bits of space on all architectures.
> >
> > Why?  You don't need 64-bits, you only need 27.
> >
> > This will be unnecessarily expensive on 32-bit platforms.
> >
> > Please don't do this.
> 
> I suppose a better method would be to switch to using a declare_bitmap 
> instead, so that it
> automatically sizes based on the number of flags we have. The reason we chose 
> 64bits is because we
> will add flags in the future, as we originally had more than 32 flags prior 
> to this patch until we
> moved some into a separate field.
> 
> But now that I think about it, using DECLARE_BITMAP makes more sense, though 
> it's a bit more invasive
> of the code.

And horribly stupid unless you really need dynamic indexes.

David



Re: [PATCH net-next] bpf/verifier: track liveness for pruning

2017-08-15 Thread Edward Cree
On 15/08/17 12:52, Daniel Borkmann wrote:
> On 08/14/2017 07:55 PM, Edward Cree wrote:
>>   if (arg_type == ARG_ANYTHING) {
>>   if (is_pointer_value(env, regno)) {
>> @@ -1639,10 +1675,13 @@ static int check_call(struct bpf_verifier_env *env, 
>> int func_id, int insn_idx)
>>   }
>>
>>   /* reset caller saved regs */
>> -for (i = 0; i < CALLER_SAVED_REGS; i++)
>> +for (i = 0; i < CALLER_SAVED_REGS; i++) {
>>   mark_reg_not_init(regs, caller_saved[i]);
>> +check_reg_arg(env, i, DST_OP_NO_MARK);
>
> Don't we need the same in check_ld_abs() since we treat it similar
> to a function call? 
Yes, I forgot about LD_ABS.  I'll fix it and spin a v2.
-Ed


[PATCH v4 iproute2 2/7] rdma: Add dev object

2017-08-15 Thread Leon Romanovsky
The device (dev) object represents struct ib_device to user space.

Device properties:
 * Device capabilities
 * FW version to the device output
 * node_guid and sys_image_guid
 * node_type

Signed-off-by: Leon Romanovsky 
---
 rdma/Makefile |   2 +-
 rdma/dev.c| 227 ++
 rdma/rdma.c   |   3 +-
 rdma/rdma.h   |  13 
 rdma/utils.c  |  54 +-
 5 files changed, 296 insertions(+), 3 deletions(-)
 create mode 100644 rdma/dev.c

diff --git a/rdma/Makefile b/rdma/Makefile
index 64da2142..123d7ac5 100644
--- a/rdma/Makefile
+++ b/rdma/Makefile
@@ -2,7 +2,7 @@ include ../Config
 
 ifeq ($(HAVE_MNL),y)
 
-RDMA_OBJ = rdma.o utils.o
+RDMA_OBJ = rdma.o utils.o dev.o
 
 TARGETS=rdma
 CFLAGS += $(shell $(PKG_CONFIG) libmnl --cflags)
diff --git a/rdma/dev.c b/rdma/dev.c
new file mode 100644
index ..e984f805
--- /dev/null
+++ b/rdma/dev.c
@@ -0,0 +1,227 @@
+/*
+ * dev.c   RDMA tool
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ * Authors: Leon Romanovsky 
+ */
+
+#include "rdma.h"
+
+static int dev_help(struct rd *rd)
+{
+   pr_out("Usage: %s dev show [DEV]\n", rd->filename);
+   return 0;
+}
+
+static const char *dev_caps_to_str(uint32_t idx)
+{
+   uint64_t cap = 1 << idx;
+
+   switch (cap) {
+   case RDMA_DEV_RESIZE_MAX_WR: return "RESIZE_MAX_WR";
+   case RDMA_DEV_BAD_PKEY_CNTR: return "BAD_PKEY_CNTR";
+   case RDMA_DEV_BAD_QKEY_CNTR: return "BAD_QKEY_CNTR";
+   case RDMA_DEV_RAW_MULTI: return "RAW_MULTI";
+   case RDMA_DEV_AUTO_PATH_MIG: return "AUTO_PATH_MIG";
+   case RDMA_DEV_CHANGE_PHY_PORT: return "CHANGE_PHY_POR";
+   case RDMA_DEV_UD_AV_PORT_ENFORCE: return "UD_AV_PORT_ENFORCE";
+   case RDMA_DEV_CURR_QP_STATE_MOD: return "CURR_QP_STATE_MOD";
+   case RDMA_DEV_SHUTDOWN_PORT: return "SHUTDOWN_PORT";
+   case RDMA_DEV_INIT_TYPE: return "INIT_TYPE";
+   case RDMA_DEV_PORT_ACTIVE_EVENT: return "PORT_ACTIVE_EVENT";
+   case RDMA_DEV_SYS_IMAGE_GUID: return "SYS_IMAGE_GUID";
+   case RDMA_DEV_RC_RNR_NAK_GEN: return "RC_RNR_NAK_GEN";
+   case RDMA_DEV_SRQ_RESIZE: return "SRQ_RESIZE";
+   case RDMA_DEV_N_NOTIFY_CQ: return "N_NOTIFY_CQ";
+   case RDMA_DEV_LOCAL_DMA_LKEY: return "LOCAL_DMA_LKEY";
+   case RDMA_DEV_MEM_WINDOW: return "MEM_WINDOW";
+   case RDMA_DEV_UD_IP_CSUM: return "UD_IP_CSUM";
+   case RDMA_DEV_UD_TSO: return "UD_TSO";
+   case RDMA_DEV_XRC: return "XRC";
+   case RDMA_DEV_MEM_MGT_EXTENSIONS: return "MEM_MGT_EXTENSIONS";
+   case RDMA_DEV_BLOCK_MULTICAST_LOOPBACK:
+   return "BLOCK_MULTICAST_LOOPBACK";
+   case RDMA_DEV_MEM_WINDOW_TYPE_2A: return "MEM_WINDOW_TYPE_2A";
+   case RDMA_DEV_MEM_WINDOW_TYPE_2B: return "MEM_WINDOW_TYPE_2B";
+   case RDMA_DEV_RC_IP_CSUM: return "RC_IP_CSUM";
+   case RDMA_DEV_RAW_IP_CSUM: return "RAW_IP_CSUM";
+   case RDMA_DEV_CROSS_CHANNEL: return "CROSS_CHANNEL";
+   case RDMA_DEV_MANAGED_FLOW_STEERING: return "MANAGED_FLOW_STEERING";
+   case RDMA_DEV_SIGNATURE_HANDOVER: return "SIGNATURE_HANDOVER";
+   case RDMA_DEV_ON_DEMAND_PAGING: return "ON_DEMAND_PAGING";
+   case RDMA_DEV_SG_GAPS_REG: return "SG_GAPS_REG";
+   case RDMA_DEV_VIRTUAL_FUNCTION: return "VIRTUAL_FUNCTION";
+   case RDMA_DEV_RAW_SCATTER_FCS: return "RAW_SCATTER_FCS";
+   case RDMA_DEV_RDMA_NETDEV_OPA_VNIC: return "RDMA_NETDEV_OPA_VNIC";
+   default: return "UNKNOWN";
+   }
+}
+
+static void dev_print_caps(struct nlattr **tb)
+{
+   uint64_t caps;
+   uint32_t idx;
+
+   if (!tb[RDMA_NLDEV_ATTR_CAP_FLAGS])
+   return;
+
+   caps = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_CAP_FLAGS]);
+
+   pr_out("\ncaps: <");
+   for (idx = 0; caps; idx++) {
+   if (caps & 0x1) {
+   pr_out("%s", dev_caps_to_str(idx));
+   if (caps >> 0x1)
+   pr_out(", ");
+   }
+   caps >>= 0x1;
+   }
+
+   pr_out(">");
+}
+
+static void dev_print_fw(struct nlattr **tb)
+{
+   if (!tb[RDMA_NLDEV_ATTR_FW_VERSION])
+   return;
+
+   pr_out("fw %s ",
+  mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_FW_VERSION]));
+}
+
+static void dev_print_node_guid(struct nlattr **tb)
+{
+   uint64_t node_guid;
+
+   if (!tb[RDMA_NLDEV_ATTR_NODE_GUID])
+   return;
+
+   node_guid = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_NODE_GUID]);
+   rd_print_u64("node_guid", node_guid);
+}
+
+static void dev_print_sys_image_guid(struct nlattr **tb)
+{
+   uint64_t sys_image_guid;
+
+   if (!tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GU

[PATCH v4 iproute2 0/7] RDMAtool

2017-08-15 Thread Leon Romanovsky
Hi,

This is the fourth revision of the series implementing RDMAtool - the tool
to configure RDMA devices. The initial proposal was sent as an RFC [1] and
was based on sysfs entries as a POC.

The current series was rewritten completely to work with RDMA netlinks as
a source of user<->kernel communications. In order to achieve that, the
RDMA netlinks were extensively refactored and modernized [2, 3, 4 and 5].

Doug's for-next tag includes most of the patches needed for this
tool, and I posted the last batch [6], which exports various
device and port properties, to the ML.

The following is an example of various runs on my machine with 5 devices
(4 in IB mode and one in Ethernet mode).

### Without parameters
$ rdma
Usage: rdma [ OPTIONS ] OBJECT { COMMAND | help }
where  OBJECT := { dev | link | help }
   OPTIONS := { -V[ersion] | -d[etails] | -j[son] | -p[retty]}

### With unspecified device name
$ rdma dev
1: mlx5_0: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3457 
sys_image_guid 5254:00c0:fe12:3457
2: mlx5_1: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3458 
sys_image_guid 5254:00c0:fe12:3458
3: mlx5_2: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3459 
sys_image_guid 5254:00c0:fe12:3459
4: mlx5_3: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345a 
sys_image_guid 5254:00c0:fe12:345a
5: mlx5_4: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345b 
sys_image_guid 5254:00c0:fe12:345b

### Detailed mode
$ rdma -d dev
1: mlx5_0: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3457 
sys_image_guid 5254:00c0:fe12:3457
caps: 
2: mlx5_1: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3458 
sys_image_guid 5254:00c0:fe12:3458
caps: 
3: mlx5_2: node_type ca fw 2.8. node_guid 5254:00c0:fe12:3459 
sys_image_guid 5254:00c0:fe12:3459
caps: 
4: mlx5_3: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345a 
sys_image_guid 5254:00c0:fe12:345a
caps: 
5: mlx5_4: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345b 
sys_image_guid 5254:00c0:fe12:345b
caps: 

### Specific device
$ rdma dev show mlx5_4
5: mlx5_4: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345b 
sys_image_guid 5254:00c0:fe12:345b

### Specific device in detailed mode
$ rdma dev show mlx5_4 -d
5: mlx5_4: node_type ca fw 2.8. node_guid 5254:00c0:fe12:345b 
sys_image_guid 5254:00c0:fe12:345b
caps: 

### Unknown command (caps)
$ rdma dev show mlx5_4 caps
Unknown parameter 'caps'.

### Link properties without device name
$ rdma link
1/1: mlx5_0/1: subnet_prefix fe80::: lid 13399 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP
2/1: mlx5_1/1: subnet_prefix fe80::: lid 13400 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP
3/1: mlx5_2/1: subnet_prefix fe80::: lid 13401 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP
4/1: mlx5_3/1: state DOWN physical_state DISABLED
5/1: mlx5_4/1: subnet_prefix fe80::: lid 13403 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP

### Link properties in detailed mode
$ rdma link -d
1/1: mlx5_0/1: subnet_prefix fe80::: lid 13399 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP
caps: 
2/1: mlx5_1/1: subnet_prefix fe80::: lid 13400 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP
caps: 
3/1: mlx5_2/1: subnet_prefix fe80::: lid 13401 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP
caps: 
4/1: mlx5_3/1: state DOWN physical_state DISABLED
caps: 
5/1: mlx5_4/1: subnet_prefix fe80::: lid 13403 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP
caps: 

### All links for specific device
$ rdma link show mlx5_3
1/1: mlx5_0/1: subnet_prefix fe80::: lid 13399 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP

### Detailed link properties for specific device
$ rdma link -d show mlx5_3
1/1: mlx5_0/1: subnet_prefix fe80::: lid 13399 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP
caps: 

### Specific port for specific device
$ rdma link show mlx5_4/1
1/1: mlx5_0/1: subnet_prefix fe80::: lid 13399 sm_lid 49151 lmc 0 
state ACTIVE physical_state LINK_UP

### Unknown parameter
$ rdma link show mlx5_4/1 caps
Unknown parameter 'caps'.

Thanks

Changelog:
v3->v4:
 * Rebased on latest net-next branch
 * Added JSON output -j (json) and -p (pretty output)
 * Exported and reused kernel UAPIs and defines instead of hard coded
   version.
v2->v3:
 * Removed MAX()
 * Reduced scope of rd_argv_match
 * Removed return from rdma_free_devmap
 * Added extra break at rdma_send_msg
v1->v2:
 * Squashed multiple (and similar) patches to be one patch for dev object
   and one patch for link object.
 * Removed port_map struct
 * Removed the global netlink dump during initialization; this removes the
   need to store intermediate variables and reuses netlink's ability to
   signal whether a variable exists.
 * Added "-d" --details option and put all CAPs under it.

v0->v1:
 * Moved hunk 

Re: [iproute PATCH 51/51] lib/bpf: Check return value of write()

2017-08-15 Thread Daniel Borkmann

On 08/15/2017 02:31 PM, David Laight wrote:
[...]

WTF is this code doing anyway?
write() is a system call, fflush() writes out any data buffered in the
stdio stream.
If there was anything buffered you'd want to output it earlier.
Otherwise if it is going to use fflush() it should be using fwrite().

I presume the function is allowed to write to stderr - since in general
library functions shouldn't assume fd 0/1/2 or stdin/out/err are valid.
There is a lot of code out there that does close(0); close(1); close(2);
but leaves stdout/err valid. Call printf() instead of sprintf() and eventually
10k of data gets written somewhere rather unexpected.

If it is a copy loop, what is wrong with the last byte of buff[]?
It is valid for write() to return a partial length - the code should
probably loop until all the data is accepted (or error).


Just send a patch if you really care; would have probably been faster
than typing up your email. ;) Thank you!


[PATCH v4 iproute2 4/7] rdma: Add initial manual for the tool

2017-08-15 Thread Leon Romanovsky
Signed-off-by: Leon Romanovsky 
---
 man/man8/rdma-dev.8  |  55 +++
 man/man8/rdma-link.8 |  55 +++
 man/man8/rdma.8  | 102 +++
 3 files changed, 212 insertions(+)
 create mode 100644 man/man8/rdma-dev.8
 create mode 100644 man/man8/rdma-link.8
 create mode 100644 man/man8/rdma.8

diff --git a/man/man8/rdma-dev.8 b/man/man8/rdma-dev.8
new file mode 100644
index ..461681b6
--- /dev/null
+++ b/man/man8/rdma-dev.8
@@ -0,0 +1,55 @@
+.TH RDMA\-DEV 8 "06 Jul 2017" "iproute2" "Linux"
+.SH NAME
+rdma-dev \- RDMA device configuration
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B rdma
+.RI "[ " OPTIONS " ]"
+.B dev
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.IR OPTIONS " := { "
+\fB\-V\fR[\fIersion\fR] |
+\fB\-d\fR[\fIetails\fR] }
+
+.ti -8
+.B rdma dev show
+.RI "[ " DEV " ]"
+
+.ti -8
+.B rdma dev help
+
+.SH "DESCRIPTION"
+.SS rdma dev show - display rdma device attributes
+
+.PP
+.I "DEV"
+- specifies the RDMA device to show.
+If this argument is omitted all devices are listed.
+
+.SH "EXAMPLES"
+.PP
+rdma dev
+.RS 4
+Shows the state of all RDMA devices on the system.
+.RE
+.PP
+rdma dev show mlx5_3
+.RS 4
+Shows the state of specified RDMA device.
+.RE
+.PP
+
+.SH SEE ALSO
+.BR rdma (8),
+.BR rdma-link (8),
+.br
+
+.SH AUTHOR
+Leon Romanovsky 
diff --git a/man/man8/rdma-link.8 b/man/man8/rdma-link.8
new file mode 100644
index ..8ed049ef
--- /dev/null
+++ b/man/man8/rdma-link.8
@@ -0,0 +1,55 @@
+.TH RDMA\-LINK 8 "06 Jul 2017" "iproute2" "Linux"
+.SH NAME
+rdma-link \- rdma link configuration
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B rdma
+.RI "[ " OPTIONS " ]"
+.B link
+.RI  " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.IR OPTIONS " := { "
+\fB\-V\fR[\fIersion\fR] |
+\fB\-d\fR[\fIetails\fR] }
+
+.ti -8
+.B rdma link show
+.RI "[ " DEV/PORT_INDEX " ]"
+
+.ti -8
+.B rdma link help
+
+.SH "DESCRIPTION"
+.SS rdma link show - display rdma link attributes
+
+.PP
+.I "DEV/PORT_INDEX"
+- specifies the RDMA link to show.
+If this argument is omitted all links are listed.
+
+.SH "EXAMPLES"
+.PP
+rdma link show
+.RS 4
+Shows the state of all rdma links on the system.
+.RE
+.PP
+rdma link show mlx5_2/1
+.RS 4
+Shows the state of specified rdma link.
+.RE
+.PP
+
+.SH SEE ALSO
+.BR rdma (8),
+.BR rdma-dev (8),
+.br
+
+.SH AUTHOR
+Leon Romanovsky 
diff --git a/man/man8/rdma.8 b/man/man8/rdma.8
new file mode 100644
index ..798b33d3
--- /dev/null
+++ b/man/man8/rdma.8
@@ -0,0 +1,102 @@
+.TH RDMA 8 "28 Mar 2017" "iproute2" "Linux"
+.SH NAME
+rdma \- RDMA tool
+.SH SYNOPSIS
+.sp
+.ad l
+.in +8
+.ti -8
+.B rdma
+.RI "[ " OPTIONS " ] " OBJECT " { " COMMAND " | "
+.BR help " }"
+.sp
+
+.ti -8
+.IR OBJECT " := { "
+.BR dev " | " link " }"
+.sp
+
+.ti -8
+.IR OPTIONS " := { "
+\fB\-V\fR[\fIersion\fR] |
+\fB\-d\fR[\fIetails\fR] |
+\fB\-j\fR[\fIson\fR] |
+\fB\-p\fR[\fIretty\fR] }
+
+.SH OPTIONS
+
+.TP
+.BR "\-V" , " -Version"
+Print the version of the
+.B rdma
+tool and exit.
+
+.TP
+.BR "\-d" , " --details"
+Output detailed information.
+
+.TP
+.BR "\-p" , " --pretty"
+When combined with -j generate a pretty JSON output.
+
+.TP
+.BR "\-j" , " --json"
+Generate JSON output.
+
+.SS
+.I OBJECT
+
+.TP
+.B dev
+- RDMA device.
+
+.TP
+.B link
+- RDMA port related.
+
+.PP
+The names of all objects may be written in full or
+abbreviated form, for example
+.B stats
+can be abbreviated as
+.B stat
+or just
+.B s.
+
+.SS
+.I COMMAND
+
+Specifies the action to perform on the object.
+The set of possible actions depends on the object type.
+As a rule, it is possible to
+.B show
+(or
+.B list
+) objects, but some objects do not allow all of these operations
+or have some additional commands. The
+.B help
+command is available for all objects. It prints
+out a list of available commands and argument syntax conventions.
+.sp
+If no command is given, some default command is assumed.
+Usually it is
+.B list
+or, if the objects of this class cannot be listed,
+.BR "help" .
+
+.SH EXIT STATUS
+Exit status is 0 if command was successful or a positive integer upon failure.
+
+.SH SEE ALSO
+.BR rdma-dev (8),
+.BR rdma-link (8),
+.br
+
+.SH REPORTING BUGS
+Report any bugs to the Linux RDMA mailing list
+.B 
+where the development and maintenance is primarily done.
+You do not have to be subscribed to the list to send a message there.
+
+.SH AUTHOR
+Leon Romanovsky 
-- 
2.14.0



[PATCH v4 iproute2 5/7] rdma: Add json and pretty outputs

2017-08-15 Thread Leon Romanovsky
Signed-off-by: Leon Romanovsky 
---
 rdma/rdma.c | 31 ---
 rdma/rdma.h |  4 
 2 files changed, 32 insertions(+), 3 deletions(-)

diff --git a/rdma/rdma.c b/rdma/rdma.c
index 74c09e8b..f9f4f2a2 100644
--- a/rdma/rdma.c
+++ b/rdma/rdma.c
@@ -16,7 +16,7 @@ static void help(char *name)
 {
pr_out("Usage: %s [ OPTIONS ] OBJECT { COMMAND | help }\n"
   "where  OBJECT := { dev | link | help }\n"
-  "   OPTIONS := { -V[ersion] | -d[etails]}\n", name);
+  "   OPTIONS := { -V[ersion] | -d[etails] | -j[son] | 
-p[retty]}\n", name);
 }
 
 static int cmd_help(struct rd *rd)
@@ -47,6 +47,16 @@ static int rd_init(struct rd *rd, int argc, char **argv, 
char *filename)
rd->argc = argc;
rd->argv = argv;
INIT_LIST_HEAD(&rd->dev_map_list);
+
+   if (rd->json_output) {
+   rd->jw = jsonw_new(stdout);
+   if (!rd->jw) {
+   pr_err("Failed to create JSON writer\n");
+   return -ENOMEM;
+   }
+   jsonw_pretty(rd->jw, rd->pretty_output);
+   }
+
rd->buff = malloc(MNL_SOCKET_BUFFER_SIZE);
if (!rd->buff)
return -ENOMEM;
@@ -62,6 +72,8 @@ static int rd_init(struct rd *rd, int argc, char **argv, char 
*filename)
 
 static void rd_free(struct rd *rd)
 {
+   if (rd->json_output)
+   jsonw_destroy(&rd->jw);
free(rd->buff);
rd_free_devmap(rd);
 }
@@ -71,10 +83,14 @@ int main(int argc, char **argv)
static const struct option long_options[] = {
{ "version",no_argument,NULL, 'V' },
{ "help",   no_argument,NULL, 'h' },
+   { "json",   no_argument,NULL, 'j' },
+   { "pretty", no_argument,NULL, 'p' },
{ "details",no_argument,NULL, 'd' },
{ NULL, 0, NULL, 0 }
};
+   bool pretty_output = false;
bool show_details = false;
+   bool json_output = false;
char *filename;
struct rd rd;
int opt;
@@ -82,16 +98,22 @@ int main(int argc, char **argv)
 
filename = basename(argv[0]);
 
-   while ((opt = getopt_long(argc, argv, "Vhd",
+   while ((opt = getopt_long(argc, argv, "Vhdpj",
  long_options, NULL)) >= 0) {
switch (opt) {
case 'V':
printf("%s utility, iproute2-ss%s\n",
   filename, SNAPSHOT);
return EXIT_SUCCESS;
+   case 'p':
+   pretty_output = true;
+   break;
case 'd':
show_details = true;
break;
+   case 'j':
+   json_output = true;
+   break;
case 'h':
help(filename);
return EXIT_SUCCESS;
@@ -105,11 +127,14 @@ int main(int argc, char **argv)
argc -= optind;
argv += optind;
 
+   rd.show_details = show_details;
+   rd.json_output = json_output;
+   rd.pretty_output = pretty_output;
+
err = rd_init(&rd, argc, argv, filename);
if (err)
goto out;
 
-   rd.show_details = show_details;
err = rd_cmd(&rd);
 out:
/* Always cleanup */
diff --git a/rdma/rdma.h b/rdma/rdma.h
index b9d75d29..620ec3db 100644
--- a/rdma/rdma.h
+++ b/rdma/rdma.h
@@ -23,6 +23,7 @@
 #include 
 
 #include "list.h"
+#include "json_writer.h"
 
 #define pr_err(args...) fprintf(stderr, ##args)
 #define pr_out(args...) fprintf(stdout, ##args)
@@ -45,6 +46,9 @@ struct rd {
struct mnl_socket *nl;
struct nlmsghdr *nlh;
char *buff;
+   json_writer_t *jw;
+   bool json_output;
+   bool pretty_output;
 };
 
 struct rd_cmd {
-- 
2.14.0



[PATCH v4 iproute2 6/7] rdma: Implement json output for dev object

2017-08-15 Thread Leon Romanovsky
Example output for a machine with two devices:

root@mtr-leonro:~# rdma dev -j -p
[{
"ifindex": 1,
"ifname": "mlx5_0",
"node_type": "ca",
"fw": "2.8.",
"node_guid": "5254:00c0:fe12:3457",
"sys_image_guid": "5254:00c0:fe12:3457",
"caps": [ "BAD_PKEY_CNTR", "BAD_QKEY_CNTR", "CHANGE_PHY_POR",
  "PORT_ACTIVE_EVENT", "SYS_IMAGE_GUID", "RC_RNR_NAK_GEN",
  "MEM_WINDOW", "UD_IP_CSUM", "UD_TSO", "XRC",
  "MEM_MGT_EXTENSIONS", "BLOCK_MULTICAST_LOOPBACK",
  "MEM_WINDOW_TYPE_2B", "RAW_IP_CSUM",
  "MANAGED_FLOW_STEERING", "RESIZE_MAX_WR" ]
},{
"ifindex": 2,
"ifname": "mlx5_1",
"node_type": "ca",
"fw": "2.8.",
"node_guid": "5254:00c0:fe12:3458",
"sys_image_guid": "5254:00c0:fe12:3458",
"caps": [ "BAD_PKEY_CNTR", "BAD_QKEY_CNTR", "CHANGE_PHY_POR",
  "PORT_ACTIVE_EVENT", "SYS_IMAGE_GUID", "RC_RNR_NAK_GEN",
  "MEM_WINDOW", "UD_IP_CSUM", "UD_TSO", "XRC",
  "MEM_MGT_EXTENSIONS", "BLOCK_MULTICAST_LOOPBACK",
  "MEM_WINDOW_TYPE_2B", "RAW_IP_CSUM",
  "MANAGED_FLOW_STEERING", "RESIZE_MAX_WR" ]
}
]

Signed-off-by: Leon Romanovsky 
---
 rdma/dev.c | 110 +
 1 file changed, 82 insertions(+), 28 deletions(-)

diff --git a/rdma/dev.c b/rdma/dev.c
index e984f805..621c4808 100644
--- a/rdma/dev.c
+++ b/rdma/dev.c
@@ -61,7 +61,7 @@ static const char *dev_caps_to_str(uint32_t idx)
}
 }
 
-static void dev_print_caps(struct nlattr **tb)
+static void dev_print_caps(struct rd *rd, struct nlattr **tb)
 {
uint64_t caps;
uint32_t idx;
@@ -71,48 +71,78 @@ static void dev_print_caps(struct nlattr **tb)
 
caps = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_CAP_FLAGS]);
 
-   pr_out("\ncaps: <");
+   if (rd->json_output) {
+   jsonw_name(rd->jw, "caps");
+   jsonw_start_array(rd->jw);
+   } else {
+   pr_out("\ncaps: <");
+   }
for (idx = 0; caps; idx++) {
if (caps & 0x1) {
-   pr_out("%s", dev_caps_to_str(idx));
-   if (caps >> 0x1)
-   pr_out(", ");
+   if (rd->json_output) {
+   jsonw_string(rd->jw, dev_caps_to_str(idx));
+   } else {
+   pr_out("%s", dev_caps_to_str(idx));
+   if (caps >> 0x1)
+   pr_out(", ");
+   }
}
caps >>= 0x1;
}
 
-   pr_out(">");
+   if (rd->json_output)
+   jsonw_end_array(rd->jw);
+   else
+   pr_out(">");
 }
 
-static void dev_print_fw(struct nlattr **tb)
+static void dev_print_fw(struct rd *rd, struct nlattr **tb)
 {
+   const char *str;
if (!tb[RDMA_NLDEV_ATTR_FW_VERSION])
return;
 
-   pr_out("fw %s ",
-  mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_FW_VERSION]));
+   str = mnl_attr_get_str(tb[RDMA_NLDEV_ATTR_FW_VERSION]);
+   if (rd->json_output)
+   jsonw_string_field(rd->jw, "fw", str);
+   else
+   pr_out("fw %s ", str);
 }
 
-static void dev_print_node_guid(struct nlattr **tb)
+static void dev_print_node_guid(struct rd *rd, struct nlattr **tb)
 {
uint64_t node_guid;
+   uint16_t vp[4];
+   char str[32];
 
if (!tb[RDMA_NLDEV_ATTR_NODE_GUID])
return;
 
node_guid = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_NODE_GUID]);
-   rd_print_u64("node_guid", node_guid);
+   memcpy(vp, &node_guid, sizeof(uint64_t));
+   snprintf(str, 32, "%04x:%04x:%04x:%04x", vp[3], vp[2], vp[1], vp[0]);
+   if (rd->json_output)
+   jsonw_string_field(rd->jw, "node_guid", str);
+   else
+   pr_out("node_guid %s ", str);
 }
 
-static void dev_print_sys_image_guid(struct nlattr **tb)
+static void dev_print_sys_image_guid(struct rd *rd, struct nlattr **tb)
 {
uint64_t sys_image_guid;
+   uint16_t vp[4];
+   char str[32];
 
if (!tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID])
return;
 
sys_image_guid = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_SYS_IMAGE_GUID]);
-   rd_print_u64("sys_image_guid", sys_image_guid);
+   memcpy(vp, &sys_image_guid, sizeof(uint64_t));
+   snprintf(str, 32, "%04x:%04x:%04x:%04x", vp[3], vp[2], vp[1], vp[0]);
+   if (rd->json_output)
+   jsonw_string_field(rd->jw, "sys_image_guid", str);
+   else
+   pr_out("sys_image_guid %s ", str);
 }
 
 static const char *node_type_to_str(uint8_t node_type)
@@ -128,37 +158,51 @@ static const char *node_type_to_str(uint8_t node_type)
}
 }
 
-static void dev_print_node_type(struct

[PATCH v4 iproute2 3/7] rdma: Add link object

2017-08-15 Thread Leon Romanovsky
The link (port) object represents struct ib_port to user space.

Link properties:
 * Port capabilities
 * IB subnet prefix
 * LID, SM_LID and LMC
 * Port state
 * Physical state

Signed-off-by: Leon Romanovsky 
---
 rdma/Makefile |   2 +-
 rdma/link.c   | 274 ++
 rdma/rdma.c   |   3 +-
 rdma/utils.c  |   5 ++
 4 files changed, 282 insertions(+), 2 deletions(-)
 create mode 100644 rdma/link.c

diff --git a/rdma/Makefile b/rdma/Makefile
index 123d7ac5..1a9e4b1a 100644
--- a/rdma/Makefile
+++ b/rdma/Makefile
@@ -2,7 +2,7 @@ include ../Config
 
 ifeq ($(HAVE_MNL),y)
 
-RDMA_OBJ = rdma.o utils.o dev.o
+RDMA_OBJ = rdma.o utils.o dev.o link.o
 
 TARGETS=rdma
 CFLAGS += $(shell $(PKG_CONFIG) libmnl --cflags)
diff --git a/rdma/link.c b/rdma/link.c
new file mode 100644
index ..51858965
--- /dev/null
+++ b/rdma/link.c
@@ -0,0 +1,274 @@
+/*
+ * link.c  RDMA tool
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ * Authors: Leon Romanovsky 
+ */
+
+#include "rdma.h"
+
+static int link_help(struct rd *rd)
+{
+   pr_out("Usage: %s link show [DEV/PORT_INDEX]\n", rd->filename);
+   return 0;
+}
+
+static const char *caps_to_str(uint32_t idx)
+{
+   uint64_t cap = 1 << idx;
+
+   switch (cap) {
+   case RDMA_PORT_SM: return "SM";
+   case RDMA_PORT_NOTICE: return "NOTICE";
+   case RDMA_PORT_TRAP: return "TRAP";
+   case RDMA_PORT_OPT_IPD: return "OPT_IPD";
+   case RDMA_PORT_AUTO_MIGR: return "AUTO_MIG";
+   case RDMA_PORT_SL_MAP: return "SL_MAP";
+   case RDMA_PORT_MKEY_NVRAM: return "MKEY_NVRAM";
+   case RDMA_PORT_PKEY_NVRAM: return "PKEY_NVRAM";
+   case RDMA_PORT_LED_INFO: return "LED_INFO";
+   case RDMA_PORT_SM_DISABLED: return "SM_DISABLED";
+   case RDMA_PORT_SYS_IMAGE_GUID: return "SYS_IMAGE_GUID";
+   case RDMA_PORT_PKEY_SW_EXT_PORT_TRAP: return "PKEY_SW_EXT_PORT_TRAP";
+   case RDMA_PORT_EXTENDED_SPEEDS: return "EXTENDED_SPEEDS";
+   case RDMA_PORT_CM: return "CM";
+   case RDMA_PORT_SNMP_TUNNEL: return "SNMP_TUNNEL";
+   case RDMA_PORT_REINIT: return "REINIT";
+   case RDMA_PORT_DEVICE_MGMT: return "DEVICE_MGMT";
+   case RDMA_PORT_VENDOR_CLASS: return "VENDOR_CLASS";
+   case RDMA_PORT_DR_NOTICE: return "PORT_DR_NOTICE";
+   case RDMA_PORT_CAP_MASK_NOTICE: return "CAP_MASK_NOTICE";
+   case RDMA_PORT_BOOT_MGMT: return "BOOT_MGMT";
+   case RDMA_PORT_LINK_LATENCY: return "LINK_LATENCY";
+   case RDMA_PORT_CLIENT_REG: return "CLIENT_REG";
+   case RDMA_PORT_IP_BASED_GIDS: return "IP_BASED_GIDS";
+   default: return "UKNOWN";
+   }
+};
+
+static void link_print_caps(struct nlattr **tb)
+{
+   uint64_t caps;
+   uint32_t idx;
+
+   if (!tb[RDMA_NLDEV_ATTR_CAP_FLAGS])
+   return;
+
+   caps = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_CAP_FLAGS]);
+
+   pr_out("\ncaps: <");
+   for (idx = 0; caps; idx++) {
+   if (caps & 0x1) {
+   pr_out("%s", caps_to_str(idx));
+   if (caps >> 0x1)
+   pr_out(", ");
+   }
+   caps >>= 0x1;
+   }
+
+   pr_out(">");
+}
+
+static void link_print_subnet_prefix(struct nlattr **tb)
+{
+   uint64_t subnet_prefix;
+
+   if (!tb[RDMA_NLDEV_ATTR_SUBNET_PREFIX])
+   return;
+
+   subnet_prefix = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_SUBNET_PREFIX]);
+   rd_print_u64("subnet_prefix", subnet_prefix);
+}
+
+static void link_print_lid(struct nlattr **tb)
+{
+   if (!tb[RDMA_NLDEV_ATTR_LID])
+   return;
+
+   pr_out("lid %u ",
+  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_LID]));
+}
+
+static void link_print_sm_lid(struct nlattr **tb)
+{
+   if (!tb[RDMA_NLDEV_ATTR_SM_LID])
+   return;
+
+   pr_out("sm_lid %u ",
+  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_SM_LID]));
+}
+
+static void link_print_lmc(struct nlattr **tb)
+{
+   if (!tb[RDMA_NLDEV_ATTR_LMC])
+   return;
+
+   pr_out("lmc %u ", mnl_attr_get_u8(tb[RDMA_NLDEV_ATTR_LMC]));
+}
+
+static const char *link_state_to_str(uint8_t link_state)
+{
+   switch (link_state) {
+   case RDMA_LINK_STATE_NOP: return "NOP";
+   case RDMA_LINK_STATE_DOWN: return "DOWN";
+   case RDMA_LINK_STATE_INIT: return "INIT";
+   case RDMA_LINK_STATE_ARMED: return "ARMED";
+   case RDMA_LINK_STATE_ACTIVE: return "ACTIVE";
+   case RDMA_LINK_STATE_ACTIVE_DEFER: return "ACTIVE_DEFER";
+   default: return "UKNOWN";
+   }
+};
+
+static void link_print_state(struct nlattr **tb)
+{
+   uint8_t state;
+
+   if (!tb[RDMA_NLDEV_ATTR_PORT_STATE

[PATCH v4 iproute2 7/7] rdma: Add json output to link object

2017-08-15 Thread Leon Romanovsky
An example of the JSON output for a system with two devices.

root@mtr-leonro:~# rdma link -d -p -j
[{
"ifindex": 1,
"port": 1,
"ifname": "mlx5_0/1",
"subnet_prefix": "fe80:::",
"lid": 13399,
"sm_lid": 49151,
"lmc": 0,
"state": "ACTIVE",
"physical_state": "LINK_UP",
"caps": ["AUTO_MIG"
]
},{
"ifindex": 2,
"port": 1,
"ifname": "mlx5_1/1",
"subnet_prefix": "fe80:::",
"lid": 13400,
"sm_lid": 49151,
"lmc": 0,
"state": "ACTIVE",
"physical_state": "LINK_UP",
"caps": ["AUTO_MIG"
]
}
]

Signed-off-by: Leon Romanovsky 
---
 rdma/link.c  | 144 +++
 rdma/rdma.h  |   1 -
 rdma/utils.c |   8 
 3 files changed, 105 insertions(+), 48 deletions(-)

diff --git a/rdma/link.c b/rdma/link.c
index 51858965..b3316e7e 100644
--- a/rdma/link.c
+++ b/rdma/link.c
@@ -50,7 +50,7 @@ static const char *caps_to_str(uint32_t idx)
}
 };
 
-static void link_print_caps(struct nlattr **tb)
+static void link_print_caps(struct rd *rd, struct nlattr **tb)
 {
uint64_t caps;
uint32_t idx;
@@ -60,54 +60,89 @@ static void link_print_caps(struct nlattr **tb)
 
caps = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_CAP_FLAGS]);
 
-   pr_out("\ncaps: <");
+   if (rd->json_output) {
+   jsonw_name(rd->jw, "caps");
+   jsonw_start_array(rd->jw);
+   } else {
+   pr_out("\ncaps: <");
+   }
for (idx = 0; caps; idx++) {
if (caps & 0x1) {
-   pr_out("%s", caps_to_str(idx));
-   if (caps >> 0x1)
-   pr_out(", ");
+   if (rd->json_output) {
+   jsonw_string(rd->jw, caps_to_str(idx));
+   } else {
+   pr_out("%s", caps_to_str(idx));
+   if (caps >> 0x1)
+   pr_out(", ");
+   }
}
caps >>= 0x1;
}
 
-   pr_out(">");
+   if (rd->json_output)
+   jsonw_end_array(rd->jw);
+   else
+   pr_out(">");
 }
 
-static void link_print_subnet_prefix(struct nlattr **tb)
+static void link_print_subnet_prefix(struct rd *rd, struct nlattr **tb)
 {
uint64_t subnet_prefix;
+   uint16_t vp[4];
+   char str[32];
 
if (!tb[RDMA_NLDEV_ATTR_SUBNET_PREFIX])
return;
 
subnet_prefix = mnl_attr_get_u64(tb[RDMA_NLDEV_ATTR_SUBNET_PREFIX]);
-   rd_print_u64("subnet_prefix", subnet_prefix);
+   memcpy(vp, &subnet_prefix, sizeof(uint64_t));
+   snprintf(str, 32, "%04x:%04x:%04x:%04x", vp[3], vp[2], vp[1], vp[0]);
+   if (rd->json_output)
+   jsonw_string_field(rd->jw, "subnet_prefix", str);
+   else
+   pr_out("subnet_prefix %s ", str);
 }
 
-static void link_print_lid(struct nlattr **tb)
+static void link_print_lid(struct rd *rd, struct nlattr **tb)
 {
+   uint32_t lid;
+
if (!tb[RDMA_NLDEV_ATTR_LID])
return;
 
-   pr_out("lid %u ",
-  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_LID]));
+   lid = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_LID]);
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "lid", lid);
+   else
+   pr_out("lid %u ", lid);
 }
 
-static void link_print_sm_lid(struct nlattr **tb)
+static void link_print_sm_lid(struct rd *rd, struct nlattr **tb)
 {
+   uint32_t sm_lid;
+
if (!tb[RDMA_NLDEV_ATTR_SM_LID])
return;
 
-   pr_out("sm_lid %u ",
-  mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_SM_LID]));
+   sm_lid = mnl_attr_get_u32(tb[RDMA_NLDEV_ATTR_SM_LID]);
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "sm_lid", sm_lid);
+   else
+   pr_out("sm_lid %u ", sm_lid);
 }
 
-static void link_print_lmc(struct nlattr **tb)
+static void link_print_lmc(struct rd *rd, struct nlattr **tb)
 {
+   uint8_t lmc;
+
if (!tb[RDMA_NLDEV_ATTR_LMC])
return;
 
-   pr_out("lmc %u ", mnl_attr_get_u8(tb[RDMA_NLDEV_ATTR_LMC]));
+   lmc = mnl_attr_get_u8(tb[RDMA_NLDEV_ATTR_LMC]);
+   if (rd->json_output)
+   jsonw_uint_field(rd->jw, "lmc", lmc);
+   else
+   pr_out("lmc %u ", lmc);
 }
 
 static const char *link_state_to_str(uint8_t link_state)
@@ -123,7 +158,7 @@ static const char *link_state_to_str(uint8_t link_state)
}
 };
 
-static void link_print_state(struct nlattr **tb)
+static void link_print_state(struct rd *rd, struct nlattr **tb)
 {
uint8_t state;
 
@@ -131,7 +166,10 @@ static void link_print_state(struct nlattr **tb)
return;
 
state = mnl_attr_get_u8(tb[RDMA_NLDEV_ATTR_PORT_STAT

[PATCH v4 iproute2 1/7] rdma: Add basic infrastructure for RDMA tool

2017-08-15 Thread Leon Romanovsky
On the one hand, RDMA devices are cross-functional; on the other hand,
they are highly tailored to specific markets.

This diversity has caused RDMA-related configuration to be spread
across various tools, e.g. devlink, ip, ethtool, IB-specific and
vendor-specific solutions.

This patch adds the ability to fill in device and port information
by reading RDMA netlink.

Signed-off-by: Leon Romanovsky 
---
 Makefile|   2 +-
 rdma/.gitignore |   1 +
 rdma/Makefile   |  22 ++
 rdma/rdma.c | 116 ++
 rdma/rdma.h |  74 +++
 rdma/utils.c| 217 
 6 files changed, 431 insertions(+), 1 deletion(-)
 create mode 100644 rdma/.gitignore
 create mode 100644 rdma/Makefile
 create mode 100644 rdma/rdma.c
 create mode 100644 rdma/rdma.h
 create mode 100644 rdma/utils.c

diff --git a/Makefile b/Makefile
index 1f88f7f5..dbb4a4af 100644
--- a/Makefile
+++ b/Makefile
@@ -49,7 +49,7 @@ WFLAGS += -Wmissing-declarations -Wold-style-definition 
-Wformat=2
 CFLAGS := $(WFLAGS) $(CCOPTS) -I../include $(DEFINES) $(CFLAGS)
 YACCFLAGS = -d -t -v
 
-SUBDIRS=lib ip tc bridge misc netem genl tipc devlink man
+SUBDIRS=lib ip tc bridge misc netem genl tipc devlink rdma man
 
 LIBNETLINK=../lib/libnetlink.a ../lib/libutil.a
 LDLIBS += $(LIBNETLINK)
diff --git a/rdma/.gitignore b/rdma/.gitignore
new file mode 100644
index ..51fb172b
--- /dev/null
+++ b/rdma/.gitignore
@@ -0,0 +1 @@
+rdma
diff --git a/rdma/Makefile b/rdma/Makefile
new file mode 100644
index ..64da2142
--- /dev/null
+++ b/rdma/Makefile
@@ -0,0 +1,22 @@
+include ../Config
+
+ifeq ($(HAVE_MNL),y)
+
+RDMA_OBJ = rdma.o utils.o
+
+TARGETS=rdma
+CFLAGS += $(shell $(PKG_CONFIG) libmnl --cflags)
+LDLIBS += $(shell $(PKG_CONFIG) libmnl --libs)
+
+endif
+
+all:   $(TARGETS) $(LIBS)
+
+rdma:  $(RDMA_OBJ) $(LIBS)
+   $(QUIET_LINK)$(CC) $^ $(LDFLAGS) $(LDLIBS) -o $@
+
+install: all
+   install -m 0755 $(TARGETS) $(DESTDIR)$(SBINDIR)
+
+clean:
+   rm -f $(RDMA_OBJ) $(TARGETS)
diff --git a/rdma/rdma.c b/rdma/rdma.c
new file mode 100644
index ..d850e396
--- /dev/null
+++ b/rdma/rdma.c
@@ -0,0 +1,116 @@
+/*
+ * rdma.c  RDMA tool
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ * Authors: Leon Romanovsky 
+ */
+
+#include "rdma.h"
+#include "SNAPSHOT.h"
+
+static void help(char *name)
+{
+   pr_out("Usage: %s [ OPTIONS ] OBJECT { COMMAND | help }\n"
+  "where  OBJECT := { help }\n"
+  "   OPTIONS := { -V[ersion] | -d[etails]}\n", name);
+}
+
+static int cmd_help(struct rd *rd)
+{
+   help(rd->filename);
+   return 0;
+}
+
+static int rd_cmd(struct rd *rd)
+{
+   const struct rd_cmd cmds[] = {
+   { NULL, cmd_help },
+   { "help",   cmd_help },
+   { 0 }
+   };
+
+   return rd_exec_cmd(rd, cmds, "object");
+}
+
+static int rd_init(struct rd *rd, int argc, char **argv, char *filename)
+{
+   uint32_t seq;
+   int ret;
+
+   rd->filename = filename;
+   rd->argc = argc;
+   rd->argv = argv;
+   INIT_LIST_HEAD(&rd->dev_map_list);
+   rd->buff = malloc(MNL_SOCKET_BUFFER_SIZE);
+   if (!rd->buff)
+   return -ENOMEM;
+
+   rd_prepare_msg(rd, RDMA_NLDEV_CMD_GET,
+  &seq, (NLM_F_REQUEST | NLM_F_ACK | NLM_F_DUMP));
+   ret = rd_send_msg(rd);
+   if (ret)
+   return ret;
+
+   return rd_recv_msg(rd, rd_dev_init_cb, rd, seq);
+}
+
+static void rd_free(struct rd *rd)
+{
+   free(rd->buff);
+   rd_free_devmap(rd);
+}
+
+int main(int argc, char **argv)
+{
+   static const struct option long_options[] = {
+   { "version",no_argument,NULL, 'V' },
+   { "help",   no_argument,NULL, 'h' },
+   { "details",no_argument,NULL, 'd' },
+   { NULL, 0, NULL, 0 }
+   };
+   bool show_details = false;
+   char *filename;
+   struct rd rd;
+   int opt;
+   int err;
+
+   filename = basename(argv[0]);
+
+   while ((opt = getopt_long(argc, argv, "Vhd",
+ long_options, NULL)) >= 0) {
+   switch (opt) {
+   case 'V':
+   printf("%s utility, iproute2-ss%s\n",
+  filename, SNAPSHOT);
+   return EXIT_SUCCESS;
+   case 'd':
+   show_details = true;
+   break;
+   case 'h':
+   help(filename);
+   return EXIT_SUCCESS;
+   default:
+

[PATCH] Adding-Agile-SD-TCP-module-and-modifying-Kconfig-and-makefile

2017-08-15 Thread mohamedalrshah
This commit implements a new TCP congestion control algorithm, namely Agile-SD. 
A detailed description of Agile-SD is published in the following 2 articles:

[1] "Agile-SD: a Linux-based TCP congestion control algorithm for supporting 
high-speed and short-distance networks",
Alrshah, M.A., Othman, M., Ali, B. and Hanapi, Z.M. Journal of Network and 
Computer Applications, Vol. 55, pages.181-190, May-June 2015. 

[2] "Modeling the Throughput of the Linux-Based Agile-SD Transmission Control 
Protocol",
Alrshah, M.A., Othman, M., Ali, B. and Hanapi, Z.M.
IEEE Access journal, Vol. 4, pages.9724-9732, Jan 2017.

The Internet has predominantly used Reno or CUBIC, relying on packet loss as 
the 
signal to slow down. While this worked well for many years, these congestion 
control 
algorithms are unfortunately unable to present an acceptable bandwidth 
utilization 
over today's networks. On today's Internet, data losses are very common and the 
existing congestion control algorithms are very sensitive to packet losses 
causing 
unwanted delay and retransmission of data. Also, the existing congestion
control
algorithms show long epochs, which affect the general performance of TCP.

Agile-SD has significantly increased throughput, reduced sensitivity to packet 
loss, 
and improved the ability to work with small and large buffers. Agile-SD shrinks 
the 
needed epoch time to recover after data loss, which improves throughput up to 
50% in 
many cases, see [1,2]. 

Agile-SD is a Congestion Control Algorithm for High-speed Networks. Agile-SD is 
a 
loss-based and RTT-independent TCP congestion control algorithm designed to 
support 
high-speed networks. Agile-SD requires only changes on the sender side, not in 
the 
network or the receiver side. So, it can be incrementally deployed on today's 
Internet 
and/or in datacenters. Agile-SD introduces the Agility Factor Mechanism (AFM), 
which 
allows Agile-SD to deal with small buffer sizes while reducing its sensitivity 
to packet 
loss. Due to the use of this mechanism, Agile-SD improves the throughput of TCP 
up to 50% 
compared to Cubic-TCP and Compound-TCP in many cases, especially when the 
buffer is small 
and the data loss is common. Its performance was evaluated using simulation and 
testbed 
to measure the average throughput, loss ratio, and fairness.

Our long-term goal is to improve the congestion control algorithms used on the 
Internet. 
We are hopeful that Agile-SD can help advance the efforts toward this goal, and 
motivate 
the community to do further research.

Signed-off-by: Mohamed A. Alrshah 
Signed-off-by: Mohamed Othman 
Signed-off-by: Borhanuddin Ali 
Signed-off-by: Zurina Hanapi 

---
 net/ipv4/Kconfig   |  15 
 net/ipv4/Makefile  |   1 +
 net/ipv4/tcp_agilesd.c | 193 +
 3 files changed, 209 insertions(+)
 create mode 100755 net/ipv4/tcp_agilesd.c

diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig
index 91a2557..474f72c 100644
--- a/net/ipv4/Kconfig
+++ b/net/ipv4/Kconfig
@@ -677,6 +677,17 @@ config TCP_CONG_BBR
bufferbloat, policers, or AQM schemes that do not provide a delay
signal. It requires the fq ("Fair Queue") pacing packet scheduler.
 
+config TCP_CONG_AGILESD
+tristate "Agile-SD Congestion control"
+default n
+---help---
+
+This is version 1.0 of Agile-SD TCP. It is a sender-side-only algorithm.
+It contributes the Agility Factor (AF) to shorten the epoch time
+and to make TCP independent from RTT. AF reduces the sensitivity
+to packet losses, which in turn allows Agile-SD to achieve better throughput
+over high-speed networks.
+
 choice
prompt "Default TCP congestion control"
default DEFAULT_CUBIC
@@ -713,6 +724,9 @@ choice
 
config DEFAULT_BBR
bool "BBR" if TCP_CONG_BBR=y
+
+config DEFAULT_AGILESD
+   bool "AGILESD" if TCP_CONG_AGILESD=y
 
config DEFAULT_RENO
bool "Reno"
@@ -738,6 +752,7 @@ config DEFAULT_TCP_CONG
default "dctcp" if DEFAULT_DCTCP
default "cdg" if DEFAULT_CDG
default "bbr" if DEFAULT_BBR
+default "agilesd" if DEFAULT_AGILESD
default "cubic"
 
 config TCP_MD5SIG
diff --git a/net/ipv4/Makefile b/net/ipv4/Makefile
index f83de23..33d398b 100644
--- a/net/ipv4/Makefile
+++ b/net/ipv4/Makefile
@@ -44,6 +44,7 @@ obj-$(CONFIG_INET_UDP_DIAG) += udp_diag.o
 obj-$(CONFIG_INET_RAW_DIAG) += raw_diag.o
 obj-$(CONFIG_NET_TCPPROBE) += tcp_probe.o
 obj-$(CONFIG_TCP_CONG_BBR) += tcp_bbr.o
+obj-$(CONFIG_TCP_CONG_AGILESD) += tcp_agilesd.o
 obj-$(CONFIG_TCP_CONG_BIC) += tcp_bic.o
 obj-$(CONFIG_TCP_CONG_CDG) += tcp_cdg.o
 obj-$(CONFIG_TCP_CONG_CUBIC) += tcp_cubic.o
diff --git a/net/ipv4/tcp_agilesd.c b/net/ipv4/tcp_agilesd.c
new file mode 100755
index 000..5de4779
--- /dev/null
+++ b/net/ipv4/tcp_agilesd.c
@@ -0,0 +1,193 @@
+/* Agile-SD is a Loss-Based Congestion Control Algo

RE: [PATCH v2 net-next 1/3] ipv6: Prevent unexpected sk->sk_prot changes

2017-08-15 Thread Boris Pismenny
Hi Eric,

> -Original Message-
> From: Eric Dumazet [mailto:eric.duma...@gmail.com]
> Sent: Tuesday, August 15, 2017 14:59
> To: Ilya Lesokhin 
> Cc: netdev@vger.kernel.org; da...@davemloft.net; davejwat...@fb.com;
> Aviad Yehezkel ; Boris Pismenny
> 
> Subject: Re: [PATCH v2 net-next 1/3] ipv6: Prevent unexpected sk->sk_prot
> changes
> 
> On Tue, 2017-08-15 at 14:08 +0300, Ilya Lesokhin wrote:
> > With this patch IPV6 code ensure that only sockets with the
> > expected sk->sk_prot are converted to IPV4.
> >
> > Signed-off-by: Boris Pismenny 
> > ---
> >  net/ipv6/ipv6_sockglue.c | 12 
> >  1 file changed, 12 insertions(+)
> >
> > diff --git a/net/ipv6/ipv6_sockglue.c b/net/ipv6/ipv6_sockglue.c
> > index 02d795f..318cd344 100644
> > --- a/net/ipv6/ipv6_sockglue.c
> > +++ b/net/ipv6/ipv6_sockglue.c
> > @@ -174,6 +174,7 @@ static int do_ipv6_setsockopt(struct sock *sk, int
> level, int optname,
> > if (val == PF_INET) {
> > struct ipv6_txoptions *opt;
> > struct sk_buff *pktopt;
> > +   struct proto *expected_prot;
> >
> > if (sk->sk_type == SOCK_RAW)
> > break;
> > @@ -199,6 +200,17 @@ static int do_ipv6_setsockopt(struct sock *sk, int
> level, int optname,
> > break;
> > }
> >
> > +   if (sk->sk_protocol == IPPROTO_TCP &&
> > +   sk->sk_prot != &tcpv6_prot)
> > +   break;
> > +
> > +   expected_prot = &udpv6_prot;
> > +   if (sk->sk_protocol == IPPROTO_UDPLITE)
> > +   expected_prot = &udplitev6_prot;
> > +
> > +   if (sk->sk_prot != expected_prot)
> > +   break;
> > +
> > fl6_free_socklist(sk);
> > __ipv6_sock_mc_close(sk);
> >
> 
> I am afraid I do not understand this patch at all.
> 
> Direct references to tcpv6_prot, udpv6_prot, and udplitev6_prot in
> net/ipv6/ipv6_sockglue.c looks completely broken.
> 
> Please provide something cleaner, maybe by adding a new method
> (implementation would then be provided in TCP / UDP code )
> 
> 

The IPV6_ADDRFORM socket option assumes that when
(sk->sk_protocol == IPPROTO_TCP),
sk_prot is set to tcpv6_prot, and it replaces it with tcp_prot.

This patch ensures that the IPV6_ADDRFORM socket option doesn't replace
the socket's sk_prot to tcp when it is not expected. For example, TLS sockets
also replace sk_prot, and we need to prevent IPV6_ADDRFORM from
overriding these.

Are you suggesting that each socket protocol will provide a method that
converts it from IPv6 to IPv4?


Possible race in c4.ko

2017-08-15 Thread Anton Volkov

Hello.

While searching for races in the Linux kernel I've come across 
"drivers/isdn/hardware/avm/c4.ko" module. Here is a question that I came 
up with while analyzing results. Lines are given using the info from 
Linux v4.12.


Consider the following case:

Thread 1:  Thread 2:
c4_probe
->c4_add_card
request_irq()
   c4_interrupt
   ->c4_handle_interrupt
 ->c4_handle_rx
card->cardnr = ... cidx = f(card->cardnr)
(c4.c: line 1227)  (c4.c: line 526)
   if (cidx >= card->nlogcontr) cidx = 0;
   ctrl = &card->ctrlinfo[cidx].capi_ctrl

card->cardnr is 0 until it is initialized in c4_add_card(). If at the 
moment of read access in c4_handle_rx() it is still 0, cidx may then be 
assigned an undesirable value and wrong controller may handle messages. 
Is this case feasible from your point of view?


Thank you for your time.

-- Anton Volkov
Linux Verification Center, ISPRAS
web: http://linuxtesting.org
e-mail: avol...@ispras.ru


[PATCH] net_sched: reset pointers to tcf blocks in classful qdiscs' destructors

2017-08-15 Thread Konstantin Khlebnikov
Traffic filters could keep direct pointers to classes in a classful qdisc,
thus qdisc destruction first removes all filters before freeing classes.
Class destruction methods also try to free attached filters, but now
this isn't safe because tcf_block_put(), unlike tcf_destroy_chain(),
cannot be called a second time.

This patch sets class->block to NULL after the first tcf_block_put() and
turns the second call into a no-op.

Signed-off-by: Konstantin Khlebnikov 
Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure")
---
 net/sched/sch_atm.c  |4 +++-
 net/sched/sch_cbq.c  |4 +++-
 net/sched/sch_hfsc.c |4 +++-
 net/sched/sch_htb.c  |4 +++-
 4 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/net/sched/sch_atm.c b/net/sched/sch_atm.c
index 572fe2584e48..c403c87aff7a 100644
--- a/net/sched/sch_atm.c
+++ b/net/sched/sch_atm.c
@@ -572,8 +572,10 @@ static void atm_tc_destroy(struct Qdisc *sch)
struct atm_flow_data *flow, *tmp;
 
pr_debug("atm_tc_destroy(sch %p,[qdisc %p])\n", sch, p);
-   list_for_each_entry(flow, &p->flows, list)
+   list_for_each_entry(flow, &p->flows, list) {
tcf_block_put(flow->block);
+   flow->block = NULL;
+   }
 
list_for_each_entry_safe(flow, tmp, &p->flows, list) {
if (flow->ref > 1)
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 481036f6b54e..780db43300b1 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1431,8 +1431,10 @@ static void cbq_destroy(struct Qdisc *sch)
 * be bound to classes which have been destroyed already. --TGR '04
 */
for (h = 0; h < q->clhash.hashsize; h++) {
-   hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode)
+   hlist_for_each_entry(cl, &q->clhash.hash[h], common.hnode) {
tcf_block_put(cl->block);
+   cl->block = NULL;
+   }
}
for (h = 0; h < q->clhash.hashsize; h++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[h],
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 3ad02bbe6903..fd15200f8627 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1530,8 +1530,10 @@ hfsc_destroy_qdisc(struct Qdisc *sch)
unsigned int i;
 
for (i = 0; i < q->clhash.hashsize; i++) {
-   hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode)
+   hlist_for_each_entry(cl, &q->clhash.hash[i], cl_common.hnode) {
tcf_block_put(cl->block);
+   cl->block = NULL;
+   }
}
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 203286ab4427..5d65ec5207e9 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1258,8 +1258,10 @@ static void htb_destroy(struct Qdisc *sch)
tcf_block_put(q->block);
 
for (i = 0; i < q->clhash.hashsize; i++) {
-   hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode)
+   hlist_for_each_entry(cl, &q->clhash.hash[i], common.hnode) {
tcf_block_put(cl->block);
+   cl->block = NULL;
+   }
}
for (i = 0; i < q->clhash.hashsize; i++) {
hlist_for_each_entry_safe(cl, next, &q->clhash.hash[i],



[PATCH 1/2] net_sched: call qlen_notify only if child qdisc is empty

2017-08-15 Thread Konstantin Khlebnikov
This callback is used for deactivating a class in the parent qdisc.
It is cheaper to test the queue length right here.

This also makes it possible to catch a screwed-up backlog being drained
and to prevent a second deactivation of an already inactive parent class,
which would crash the kernel for sure. The kernel will print a warning at
destruction of a child qdisc that has no packets but a non-zero backlog.

Signed-off-by: Konstantin Khlebnikov 
---
 net/sched/sch_api.c  |   10 +-
 net/sched/sch_cbq.c  |3 +--
 net/sched/sch_drr.c  |3 +--
 net/sched/sch_hfsc.c |6 ++
 net/sched/sch_htb.c  |3 +--
 net/sched/sch_qfq.c  |3 +--
 6 files changed, 15 insertions(+), 13 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index bd24a550e0f9..18da45c0769c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -752,6 +752,7 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned 
int n,
const struct Qdisc_class_ops *cops;
unsigned long cl;
u32 parentid;
+   bool notify;
int drops;
 
if (n == 0 && len == 0)
@@ -764,6 +765,13 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned 
int n,
 
if (sch->flags & TCQ_F_NOPARENT)
break;
+   /* Notify parent qdisc only if child qdisc becomes empty.
+*
+* If child was empty even before update then backlog
+* counter is screwed and we skip notification because
+* parent class is already passive.
+*/
+   notify = !sch->q.qlen && !WARN_ON_ONCE(!n);
/* TODO: perform the search on a per txq basis */
sch = qdisc_lookup(qdisc_dev(sch), TC_H_MAJ(parentid));
if (sch == NULL) {
@@ -771,7 +779,7 @@ void qdisc_tree_reduce_backlog(struct Qdisc *sch, unsigned 
int n,
break;
}
cops = sch->ops->cl_ops;
-   if (cops->qlen_notify) {
+   if (notify && cops->qlen_notify) {
cl = cops->get(sch, parentid);
cops->qlen_notify(sch, cl);
cops->put(sch, cl);
diff --git a/net/sched/sch_cbq.c b/net/sched/sch_cbq.c
index 780db43300b1..1bdb0106f342 100644
--- a/net/sched/sch_cbq.c
+++ b/net/sched/sch_cbq.c
@@ -1385,8 +1385,7 @@ static void cbq_qlen_notify(struct Qdisc *sch, unsigned 
long arg)
 {
struct cbq_class *cl = (struct cbq_class *)arg;
 
-   if (cl->q->q.qlen == 0)
-   cbq_deactivate_class(cl);
+   cbq_deactivate_class(cl);
 }
 
 static unsigned long cbq_get(struct Qdisc *sch, u32 classid)
diff --git a/net/sched/sch_drr.c b/net/sched/sch_drr.c
index a413dc1c2098..1d2f6235dfcf 100644
--- a/net/sched/sch_drr.c
+++ b/net/sched/sch_drr.c
@@ -246,8 +246,7 @@ static void drr_qlen_notify(struct Qdisc *csh, unsigned 
long arg)
 {
struct drr_class *cl = (struct drr_class *)arg;
 
-   if (cl->qdisc->q.qlen == 0)
-   list_del(&cl->alist);
+   list_del(&cl->alist);
 }
 
 static int drr_dump_class(struct Qdisc *sch, unsigned long arg,
diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index fd15200f8627..14c99870cdb6 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -1221,10 +1221,8 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg)
 {
struct hfsc_class *cl = (struct hfsc_class *)arg;
 
-   if (cl->qdisc->q.qlen == 0) {
-   update_vf(cl, 0, 0);
-   set_passive(cl);
-   }
+   update_vf(cl, 0, 0);
+   set_passive(cl);
 }
 
 static unsigned long
diff --git a/net/sched/sch_htb.c b/net/sched/sch_htb.c
index 5d65ec5207e9..dcf3c85e1f4f 100644
--- a/net/sched/sch_htb.c
+++ b/net/sched/sch_htb.c
@@ -1186,8 +1186,7 @@ static void htb_qlen_notify(struct Qdisc *sch, unsigned 
long arg)
 {
struct htb_class *cl = (struct htb_class *)arg;
 
-   if (cl->un.leaf.q->q.qlen == 0)
-   htb_deactivate(qdisc_priv(sch), cl);
+   htb_deactivate(qdisc_priv(sch), cl);
 }
 
 static unsigned long htb_get(struct Qdisc *sch, u32 classid)
diff --git a/net/sched/sch_qfq.c b/net/sched/sch_qfq.c
index 0e16dfda0bd7..9caa959f91e1 100644
--- a/net/sched/sch_qfq.c
+++ b/net/sched/sch_qfq.c
@@ -1428,8 +1428,7 @@ static void qfq_qlen_notify(struct Qdisc *sch, unsigned 
long arg)
struct qfq_sched *q = qdisc_priv(sch);
struct qfq_class *cl = (struct qfq_class *)arg;
 
-   if (cl->qdisc->q.qlen == 0)
-   qfq_deactivate_class(q, cl);
+   qfq_deactivate_class(q, cl);
 }
 
 static int qfq_init_qdisc(struct Qdisc *sch, struct nlattr *opt)



[PATCH net-next] dsa: fix flow disector null pointer

2017-08-15 Thread Craig Gallek
From: Craig Gallek 

A recent change to fix up DSA device behavior made the assumption that
all skbs passing through the flow dissector will be associated with a
device. This does not appear to be a safe assumption.  Syzkaller found
the crash below by attaching a BPF socket filter that tries to find the
payload offset of a packet passing between two unix sockets.

kasan: GPF could be caused by NULL-ptr deref or user memory access
general protection fault:  [#1] SMP KASAN
Dumping ftrace buffer:
   (ftrace buffer empty)
Modules linked in:
CPU: 0 PID: 2940 Comm: syzkaller872007 Not tainted 4.13.0-rc4-next-20170811 #1
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
task: 8801d1b425c0 task.stack: 8801d0bc
RIP: 0010:__skb_flow_dissect+0xdcd/0x3ae0 net/core/flow_dissector.c:445
RSP: 0018:8801d0bc7340 EFLAGS: 00010206
RAX: dc00 RBX:  RCX: 
RDX: 0060 RSI: 856dc080 RDI: 0300
RBP: 8801d0bc7870 R08:  R09: 
R10: 0008 R11: ed003a178f1e R12: 
R13:  R14: 856dc080 R15: 8801ce223140
FS:  016ed880() GS:8801dc00() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 20008000 CR3: 0001ce22d000 CR4: 001406f0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 skb_flow_dissect_flow_keys include/linux/skbuff.h:1176 [inline]
 skb_get_poff+0x9a/0x1a0 net/core/flow_dissector.c:1079
 __skb_get_pay_offset net/core/filter.c:114 [inline]
 __skb_get_pay_offset+0x15/0x20 net/core/filter.c:112
Code: 80 3c 02 00 44 89 6d 10 0f 85 44 2b 00 00 4d 8b 67 20 48 b8 00 00 00 00 
00 fc ff df 49 8d bc 24 00 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 13 
2b 00 00 4d 8b a4 24 00 03 00 00 4d 85 e4
RIP: __skb_flow_dissect+0xdcd/0x3ae0 net/core/flow_dissector.c:445 RSP: 
8801d0bc7340

Fixes: 43e665287f93 ("net-next: dsa: fix flow dissection")
Reported-by: Dmitry Vyukov 
Signed-off-by: Craig Gallek 
---
 net/core/flow_dissector.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/flow_dissector.c b/net/core/flow_dissector.c
index 79b9c06c83ad..e2eaa1ff948d 100644
--- a/net/core/flow_dissector.c
+++ b/net/core/flow_dissector.c
@@ -442,7 +442,7 @@ bool __skb_flow_dissect(const struct sk_buff *skb,
nhoff = skb_network_offset(skb);
hlen = skb_headlen(skb);
 #if IS_ENABLED(CONFIG_NET_DSA)
-   if (unlikely(netdev_uses_dsa(skb->dev))) {
+   if (unlikely(skb->dev && netdev_uses_dsa(skb->dev))) {
const struct dsa_device_ops *ops;
int offset;
 
-- 
2.14.0.434.g98096fd7a8-goog



[PATCH] net_sched/sfq: update hierarchical backlog when drop packet

2017-08-15 Thread Konstantin Khlebnikov
When sfq_enqueue() drops the head packet or a packet from another queue, it
has to update the backlog at upper qdiscs too.

Signed-off-by: Konstantin Khlebnikov 
Fixes: 2f5fb43f ("net_sched: update hierarchical backlog too")
---
 net/sched/sch_sfq.c |5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index f80ea2cc5f1f..82469ef9655e 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -437,6 +437,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct 
sk_buff **to_free)
qdisc_drop(head, sch, to_free);
 
slot_queue_add(slot, skb);
+   qdisc_tree_reduce_backlog(sch, 0, delta);
return NET_XMIT_CN;
}
 
@@ -468,8 +469,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct 
sk_buff **to_free)
/* Return Congestion Notification only if we dropped a packet
 * from this flow.
 */
-   if (qlen != slot->qlen)
+   if (qlen != slot->qlen) {
+   qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
return NET_XMIT_CN;
+   }
 
/* As we dropped a packet, better let upper stack know this */
qdisc_tree_reduce_backlog(sch, 1, dropped);



[PATCH] net_sched: remove warning from qdisc_hash_add

2017-08-15 Thread Konstantin Khlebnikov
It was added in commit e57a784d8cae ("pkt_sched: set root qdisc
before change() in attach_default_qdiscs()") to hide duplicates
from "tc qdisc show" for inactive devices.

After commit 59cc1f61f ("net: sched: convert qdisc linked list to hashtable")
the warning triggers when a classful qdisc is added to an inactive device,
because default qdiscs are added before switching the root qdisc.

Anyway after commit ea3274695353 ("net: sched: avoid duplicates in
qdisc dump") duplicates are filtered right in dumper.

Signed-off-by: Konstantin Khlebnikov 
---
 net/sched/sch_api.c |3 ---
 1 file changed, 3 deletions(-)

diff --git a/net/sched/sch_api.c b/net/sched/sch_api.c
index 18da45c0769c..2d2cf539668c 100644
--- a/net/sched/sch_api.c
+++ b/net/sched/sch_api.c
@@ -286,9 +286,6 @@ static struct Qdisc *qdisc_match_from_root(struct Qdisc 
*root, u32 handle)
 void qdisc_hash_add(struct Qdisc *q, bool invisible)
 {
if ((q->parent != TC_H_ROOT) && !(q->flags & TCQ_F_INGRESS)) {
-   struct Qdisc *root = qdisc_dev(q)->qdisc;
-
-   WARN_ON_ONCE(root == &noop_qdisc);
ASSERT_RTNL();
hash_add_rcu(qdisc_dev(q)->qdisc_hash, &q->hash, q->handle);
if (invisible)



[PATCH 2/2] net_sched/hfsc: opencode trivial set_active() and set_passive()

2017-08-15 Thread Konstantin Khlebnikov
And move the comment about update_vf() into the right place.

Signed-off-by: Konstantin Khlebnikov 
---
 net/sched/sch_hfsc.c |   45 -
 1 file changed, 16 insertions(+), 29 deletions(-)

diff --git a/net/sched/sch_hfsc.c b/net/sched/sch_hfsc.c
index 14c99870cdb6..15f09cb9f1ff 100644
--- a/net/sched/sch_hfsc.c
+++ b/net/sched/sch_hfsc.c
@@ -829,28 +829,6 @@ update_vf(struct hfsc_class *cl, unsigned int len, u64 
cur_time)
}
 }
 
-static void
-set_active(struct hfsc_class *cl, unsigned int len)
-{
-   if (cl->cl_flags & HFSC_RSC)
-   init_ed(cl, len);
-   if (cl->cl_flags & HFSC_FSC)
-   init_vf(cl, len);
-
-}
-
-static void
-set_passive(struct hfsc_class *cl)
-{
-   if (cl->cl_flags & HFSC_RSC)
-   eltree_remove(cl);
-
-   /*
-* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
-* needs to be called explicitly to remove a class from vttree.
-*/
-}
-
 static unsigned int
 qdisc_peek_len(struct Qdisc *sch)
 {
@@ -1221,8 +1199,12 @@ hfsc_qlen_notify(struct Qdisc *sch, unsigned long arg)
 {
struct hfsc_class *cl = (struct hfsc_class *)arg;
 
+   /* vttree is now handled in update_vf() so that update_vf(cl, 0, 0)
+* needs to be called explicitly to remove a class from vttree.
+*/
update_vf(cl, 0, 0);
-   set_passive(cl);
+   if (cl->cl_flags & HFSC_RSC)
+   eltree_remove(cl);
 }
 
 static unsigned long
@@ -1583,7 +1565,12 @@ hfsc_enqueue(struct sk_buff *skb, struct Qdisc *sch, 
struct sk_buff **to_free)
}
 
if (cl->qdisc->q.qlen == 1) {
-   set_active(cl, qdisc_pkt_len(skb));
+   unsigned int len = qdisc_pkt_len(skb);
+
+   if (cl->cl_flags & HFSC_RSC)
+   init_ed(cl, len);
+   if (cl->cl_flags & HFSC_FSC)
+   init_vf(cl, len);
/*
 * If this is the first packet, isolate the head so an eventual
 * head drop before the first dequeue operation has no chance
@@ -1647,18 +1634,18 @@ hfsc_dequeue(struct Qdisc *sch)
if (realtime)
cl->cl_cumul += qdisc_pkt_len(skb);
 
-   if (cl->qdisc->q.qlen != 0) {
-   if (cl->cl_flags & HFSC_RSC) {
+   if (cl->cl_flags & HFSC_RSC) {
+   if (cl->qdisc->q.qlen != 0) {
/* update ed */
next_len = qdisc_peek_len(cl->qdisc);
if (realtime)
update_ed(cl, next_len);
else
update_d(cl, next_len);
+   } else {
+   /* the class becomes passive */
+   eltree_remove(cl);
}
-   } else {
-   /* the class becomes passive */
-   set_passive(cl);
}
 
qdisc_bstats_update(sch, skb);



Re: [PATCH] net/sched: reset block pointer in tcf_block_put()

2017-08-15 Thread Konstantin Khlebnikov



On 15.08.2017 00:15, Cong Wang wrote:

On Mon, Aug 14, 2017 at 5:59 AM, Konstantin Khlebnikov
 wrote:


This should work, I suppose.

But this approach requires careful review for all qdisc, mine is completely
mechanical.


Well, we don't have many classful qdisc's. Your patch actually
touches more qdisc's than mine, because you change an API, so
it is slightly harder to backport. ;)



Ok, I've fixed this right in qdiscs: [PATCH] net_sched: reset pointers to tcf 
blocks in classful qdiscs' destructors


Re: [PATCH] Adding-Agile-SD-TCP-module-and-modifying-Kconfig-and-makefile

2017-08-15 Thread Neal Cardwell
On Tue, Aug 15, 2017 at 9:08 AM, mohamedalrshah
 wrote:
> +
> +/* Agile-SD Parameters */
> +struct agilesdtcp {
> +   u32 loss_cwnd; /* congestion window at last loss.*/

Please rebase your change on top of the latest net-next changes and
update this module to use the latest approach from the recent commit:

  
https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git/commit/?id=f1722a1be19dc38e0a4b282d4e6e6ec5e1b11a67
  tcp: consolidate congestion control undo functions

Specifically:

- reference tp->prior_cwnd instead of ca->loss_cwnd
- remove the  ca->loss_cwnd field
- have the .undo_cwnd field reference tcp_reno_undo_cwnd

> +   u32 frac_tracer;   /* This is to trace the fractions of 
> the increment.*/
> +   u32 degraded_loss_cwnd;/* loss_cwnd after degradation.*/
> +   enumdystate{SS=0, CA=1} agilesd_tcp_status;

Per Linux style, please define the enum separately before declaring
the variable of that type, and format the enum using Linux style. Also
please use a longer, more specific name, to avoid name collisions. I'd
suggest:

enum dystate {
AGILE_SD_SS = 0,  /* comment ... */
AGILE_SD_CA = 1,  /* comment ... */
};


> +};
> +
> +/* To reset the parameters if needed*/
> +static inline void agilesdtcp_reset(struct sock *sk)
> +{
> +
> +}
> +
> +/* This function is called after the first acknowledgment is received and 
> before the congestion
> + * control algorithm will be called for the first time. If the congestion 
> control algorithm has
> + * private data, it should initialize its private date here. */

Multi-line comments should end with the trailing */ on a line by
itself. Here and elsewhere.

Please read:
  https://www.kernel.org/doc/html/v4.10/process/coding-style.html

Please check the style of patches before submitting with the following
script in the Linux kernel tree:
  scripts/checkpatch.pl

> +static void agilesdtcp_init(struct sock *sk)
> +{
> +   struct agilesdtcp *ca = inet_csk_ca(sk);
> +
> +   /* The value of initial_ssthresh parameter is not used here, thus, 
> snd_ssthresh is initialized by a large value.*/
> +   tcp_sk(sk)->snd_ssthresh = 0x7fff;
> +
> +   ca->loss_cwnd   = 0;
> +   ca->frac_tracer = 0;
> +   ca->agilesd_tcp_status  = SS;
> +}
> +
> +/* This function is called whenever an ack is received and the congestion 
> window can be increased.
> + * This is equivalent to opencwnd in tcp.cc.
> + * ack is the number of bytes that are acknowledged in the latest 
> acknowledgment;
> + * rtt is the the rtt measured by the latest acknowledgment;
> + * in_flight is the packet in flight before the latest acknowledgment;
> + * good_ack is an indicator whether the current situation is normal (no 
> duplicate ack, no loss and no SACK). */
> +static void agilesdtcp_cong_avoid(struct sock *sk, u32 ack, u32 in_flight)
> +{
> +   struct tcp_sock *tp = tcp_sk(sk);
> +   struct agilesdtcp *ca = inet_csk_ca(sk);
> +   u32 inc_factor;
> +   u32 ca_inc;
> +   u32 current_gap, total_gap;

For coding style, please order local variable declarations from
longest to shortest line, also known as Reverse Christmas Tree Format.

> +   /* The value of inc_factor is limited by lower_fl and upper_fl.
> +* The lower_fl must be always = 1. The greater the upper_fl the 
> higher the aggressiveness.
> +* But, if upper_fl set to 1, Agile-SD will work exactly as newreno.
> +* We have already designed an equation to calculate the optimum 
> upper_fl based on the given beta.
> +* This equation will be revealed once its article is published*/
> +   u32 lower_fl = 1 * SCALE;
> +   u32 upper_fl = 3 * SCALE;
> +
> +   if (!tcp_is_cwnd_limited(sk)) return;

Please put this return (or any if/else body) on a line by itself.

> +
> +   if (tp->snd_cwnd < tp->snd_ssthresh){

Need a space between ) and {.

> +   ca->agilesd_tcp_status = SS;
> +   tcp_slow_start(tp, in_flight);
> +   }
> +   else {

The else needs to go on the same line as the closing brace.


> +   inc_factor = min(max(((upper_fl * current_gap) / total_gap), 
> lower_fl), upper_fl);

Please use the existing kernel helper macro for this:

#define clamp(val, lo, hi) min((typeof(val))max(val, lo), hi)


> +
> +   ca_inc = ((inc_factor * SCALE) / tp->snd_cwnd);   /* SCALE is 
> used to avoid fractions*/
> +
> +   ca->frac_tracer += ca_inc;/* This in 
> order to take the fraction increase into account */
> +   if (ca->frac_tracer >= Double_SCALE)  /* To take 
> factor scale into account */
> +   {

The opening brace goes on the previous line.

> +/* This function is called when the TCP flow detects a loss.
> + * It returns the slow start threshold of a flow, after a packet loss is 
> detected. */

Trailing */ style issue again.

> +stati

[PATCH v2 net-next] bpf/verifier: track liveness for pruning

2017-08-15 Thread Edward Cree
State of a register doesn't matter if it wasn't read in reaching an exit;
 a write screens off all reads downstream of it from all explored_states
 upstream of it.
This allows us to prune many more branches; here are some processed insn
 counts for some Cilium programs:
Program  before  after
bpf_lb_opt_-DLB_L3.o   6515   3361
bpf_lb_opt_-DLB_L4.o   8976   5176
bpf_lb_opt_-DUNKNOWN.o 2960   1137
bpf_lxc_opt_-DDROP_ALL.o  95412  48537
bpf_lxc_opt_-DUNKNOWN.o  141706  78718
bpf_netdev.o  24251  17995
bpf_overlay.o 10999   9385

The runtime is also improved; here are 'time' results in ms:
Program  before  after
bpf_lb_opt_-DLB_L3.o 24  6
bpf_lb_opt_-DLB_L4.o 26 11
bpf_lb_opt_-DUNKNOWN.o   11  2
bpf_lxc_opt_-DDROP_ALL.o   1288139
bpf_lxc_opt_-DUNKNOWN.o1768234
bpf_netdev.o 62 31
bpf_overlay.o15 13

Signed-off-by: Edward Cree 
---
v2: update liveness in LD_ABS|IND, as pointed out by Daniel Borkmann.  The
 numbers are mostly unchanged; bpf_lxc_opt_-DUNKNOWN.o dropped about 300
 insns and 20ms, while bpf_lxc_opt_-DDROP_ALL.o (despite not changing its
 #insns) also dropped 13ms.

 include/linux/bpf_verifier.h |  11 ++-
 kernel/bpf/verifier.c| 188 +--
 2 files changed, 156 insertions(+), 43 deletions(-)

diff --git a/include/linux/bpf_verifier.h b/include/linux/bpf_verifier.h
index c61c3033..91d07ef 100644
--- a/include/linux/bpf_verifier.h
+++ b/include/linux/bpf_verifier.h
@@ -21,6 +21,12 @@
  */
 #define BPF_MAX_VAR_SIZINT_MAX
 
+enum bpf_reg_liveness {
+   REG_LIVE_NONE = 0, /* reg hasn't been read or written this branch */
+   REG_LIVE_READ, /* reg was read, so we're sensitive to initial value */
+   REG_LIVE_WRITTEN, /* reg was written first, screening off later reads */
+};
+
 struct bpf_reg_state {
enum bpf_reg_type type;
union {
@@ -40,7 +46,7 @@ struct bpf_reg_state {
 * came from, when one is tested for != NULL.
 */
u32 id;
-   /* These five fields must be last.  See states_equal() */
+   /* Ordering of fields matters.  See states_equal() */
/* For scalar types (SCALAR_VALUE), this represents our knowledge of
 * the actual value.
 * For pointer types, this represents the variable part of the offset
@@ -57,6 +63,8 @@ struct bpf_reg_state {
s64 smax_value; /* maximum possible (s64)value */
u64 umin_value; /* minimum possible (u64)value */
u64 umax_value; /* maximum possible (u64)value */
+   /* This field must be last, for states_equal() reasons. */
+   enum bpf_reg_liveness live;
 };
 
 enum bpf_stack_slot_type {
@@ -74,6 +82,7 @@ struct bpf_verifier_state {
struct bpf_reg_state regs[MAX_BPF_REG];
u8 stack_slot_type[MAX_BPF_STACK];
struct bpf_reg_state spilled_regs[MAX_BPF_STACK / BPF_REG_SIZE];
+   struct bpf_verifier_state *parent;
 };
 
 /* linked list of verifier states used to prune search */
diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c
index ecc590e..3affb8d 100644
--- a/kernel/bpf/verifier.c
+++ b/kernel/bpf/verifier.c
@@ -629,8 +629,10 @@ static void init_reg_state(struct bpf_reg_state *regs)
 {
int i;
 
-   for (i = 0; i < MAX_BPF_REG; i++)
+   for (i = 0; i < MAX_BPF_REG; i++) {
mark_reg_not_init(regs, i);
+   regs[i].live = REG_LIVE_NONE;
+   }
 
/* frame pointer */
regs[BPF_REG_FP].type = PTR_TO_STACK;
@@ -647,9 +649,26 @@ enum reg_arg_type {
DST_OP_NO_MARK  /* same as above, check only, don't mark */
 };
 
-static int check_reg_arg(struct bpf_reg_state *regs, u32 regno,
+static void mark_reg_read(const struct bpf_verifier_state *state, u32 regno)
+{
+   struct bpf_verifier_state *parent = state->parent;
+
+   while (parent) {
+   /* if read wasn't screened by an earlier write ... */
+   if (state->regs[regno].live & REG_LIVE_WRITTEN)
+   break;
+   /* ... then we depend on parent's value */
+   parent->regs[regno].live |= REG_LIVE_READ;
+   state = parent;
+   parent = state->parent;
+   }
+}
+
+static int check_reg_arg(struct bpf_verifier_env *env, u32 regno,
 enum reg_arg_type t)
 {
+   struct bpf_reg_state *regs = env->cur_state.regs;
+
if (regno >= MAX_BPF_REG) {
verbose("R%d is invalid\n", regno);
return -EINVAL;
@@ -661,12 +680,14 @@ static int check_reg_arg(struct bpf_reg_state *regs, u32 
regno,
verbose("R%d !read_ok\n", regno);
return -EACCES;
}
+   mark_reg_read(&env->cur_state, regno);
} else {
/* check whether register used as dest operand can be written 
to */
if (reg

[PATCH net] sfc: don't try and read ef10 data on non-ef10 NIC

2017-08-15 Thread Bert Kenward
The MAC stats command takes a port ID, which doesn't exist on
pre-ef10 NICs (5000- and 6000- series). This is extracted from the
NIC specific data; we misinterpret this as the ef10 data structure,
causing us to read potentially unallocated data. With a KASAN kernel
this can cause errors with:
   BUG: KASAN: slab-out-of-bounds in efx_mcdi_mac_stats

Fixes: 0a2ab4d988d7 ("sfc: set the port-id when calling MC_CMD_MAC_STATS")
Reported-by: Stefano Brivio 
Tested-by: Stefano Brivio 
Signed-off-by: Bert Kenward 
---
 drivers/net/ethernet/sfc/mcdi_port.c | 8 ++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/sfc/mcdi_port.c 
b/drivers/net/ethernet/sfc/mcdi_port.c
index c905971c5f3a..990a63d7fcb7 100644
--- a/drivers/net/ethernet/sfc/mcdi_port.c
+++ b/drivers/net/ethernet/sfc/mcdi_port.c
@@ -938,7 +938,6 @@ enum efx_stats_action {
 static int efx_mcdi_mac_stats(struct efx_nic *efx,
  enum efx_stats_action action, int clear)
 {
-   struct efx_ef10_nic_data *nic_data = efx->nic_data;
MCDI_DECLARE_BUF(inbuf, MC_CMD_MAC_STATS_IN_LEN);
int rc;
int change = action == EFX_STATS_PULL ? 0 : 1;
@@ -960,7 +959,12 @@ static int efx_mcdi_mac_stats(struct efx_nic *efx,
  MAC_STATS_IN_PERIODIC_NOEVENT, 1,
  MAC_STATS_IN_PERIOD_MS, period);
MCDI_SET_DWORD(inbuf, MAC_STATS_IN_DMA_LEN, dma_len);
-   MCDI_SET_DWORD(inbuf, MAC_STATS_IN_PORT_ID, nic_data->vport_id);
+
+   if (efx_nic_rev(efx) >= EFX_REV_HUNT_A0) {
+   struct efx_ef10_nic_data *nic_data = efx->nic_data;
+
+   MCDI_SET_DWORD(inbuf, MAC_STATS_IN_PORT_ID, nic_data->vport_id);
+   }
 
rc = efx_mcdi_rpc_quiet(efx, MC_CMD_MAC_STATS, inbuf, sizeof(inbuf),
NULL, 0, NULL);
-- 
2.7.5



Re: [PATCH net-next] dsa: fix flow disector null pointer

2017-08-15 Thread Andrew Lunn
On Tue, Aug 15, 2017 at 09:43:40AM -0400, Craig Gallek wrote:
> From: Craig Gallek 
> 
> A recent change to fix up DSA device behavior made the assumption that
> all skbs passing through the flow disector will be associated with a
> device. This does not appear to be a safe assumption.  Syzkaller found
> the crash below by attaching a BPF socket filter that tries to find the
> payload offset of a packet passing between two unix sockets.
> 
> kasan: GPF could be caused by NULL-ptr deref or user memory access
> general protection fault:  [#1] SMP KASAN
> Dumping ftrace buffer:
>(ftrace buffer empty)
> Modules linked in:
> CPU: 0 PID: 2940 Comm: syzkaller872007 Not tainted 4.13.0-rc4-next-20170811 #1
> Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
> Google 01/01/2011
> task: 8801d1b425c0 task.stack: 8801d0bc
> RIP: 0010:__skb_flow_dissect+0xdcd/0x3ae0 net/core/flow_dissector.c:445
> RSP: 0018:8801d0bc7340 EFLAGS: 00010206
> RAX: dc00 RBX:  RCX: 
> RDX: 0060 RSI: 856dc080 RDI: 0300
> RBP: 8801d0bc7870 R08:  R09: 
> R10: 0008 R11: ed003a178f1e R12: 
> R13:  R14: 856dc080 R15: 8801ce223140
> FS:  016ed880() GS:8801dc00() knlGS:
> CS:  0010 DS:  ES:  CR0: 80050033
> CR2: 20008000 CR3: 0001ce22d000 CR4: 001406f0
> DR0:  DR1:  DR2: 
> DR3:  DR6: fffe0ff0 DR7: 0400
> Call Trace:
>  skb_flow_dissect_flow_keys include/linux/skbuff.h:1176 [inline]
>  skb_get_poff+0x9a/0x1a0 net/core/flow_dissector.c:1079
>  __skb_get_pay_offset net/core/filter.c:114 [inline]
>  __skb_get_pay_offset+0x15/0x20 net/core/filter.c:112
> Code: 80 3c 02 00 44 89 6d 10 0f 85 44 2b 00 00 4d 8b 67 20 48 b8 00 00 00 00 
> 00 fc ff df 49 8d bc 24 00 03 00 00 48 89 fa 48 c1 ea 03 <80> 3c 02 00 0f 85 
> 13 2b 00 00 4d 8b a4 24 00 03 00 00 4d 85 e4
> RIP: __skb_flow_dissect+0xdcd/0x3ae0 net/core/flow_dissector.c:445 RSP: 
> 8801d0bc7340
> 
> Fixes: 43e665287f93 ("net-next: dsa: fix flow dissection")
> Reported-by: Dmitry Vyukov 
> Signed-off-by: Craig Gallek 

Reviewed-by: Andrew Lunn 

Andrew


Re: [PATCH v11 0/5] Add new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag

2017-08-15 Thread Eric Dumazet
On Mon, 2017-08-14 at 22:15 -0700, David Miller wrote:
> From: Ding Tianhong 
> Date: Tue, 15 Aug 2017 11:23:22 +0800
> 
> > Some devices have problems with Transaction Layer Packets with the Relaxed
> > Ordering Attribute set.  This patch set adds a new PCIe Device Flag,
> > PCI_DEV_FLAGS_NO_RELAXED_ORDERING, a set of PCI Quirks to catch some known
> > devices with Relaxed Ordering issues, and a use of this new flag by the
> > cxgb4 driver to avoid using Relaxed Ordering with problematic Root Complex
> > Ports.
>  ...
> 
> Series applied, thanks.

I got a NULL deref in pci_find_pcie_root_port()

Was it expected ?

This local hack seems to fix the issue.

diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
index 
af0cc3456dc1b48b1325c06c5edd2ca8cc22a640..cfd8eb5a3d0ba8347d44952ffab28d9c761044d3
 100644
--- a/drivers/pci/pci.c
+++ b/drivers/pci/pci.c
@@ -522,7 +522,7 @@ struct pci_dev *pci_find_pcie_root_port(struct pci_dev *dev)
bridge = pci_upstream_bridge(bridge);
}
 
-   if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT)
+   if (highest_pcie_bridge && pci_pcie_type(highest_pcie_bridge) != 
PCI_EXP_TYPE_ROOT_PORT)
return NULL;
 
return highest_pcie_bridge;



Re: [PATCH] net_sched: reset pointers to tcf blocks in classful qdiscs' destructors

2017-08-15 Thread Jiri Pirko
Tue, Aug 15, 2017 at 03:35:21PM CEST, khlebni...@yandex-team.ru wrote:
>Traffic filters could keep direct pointers to classes in classful qdisc,
>thus qdisc destruction first removes all filters before freeing classes.
>Class destruction methods also tries to free attached filters but now
>this isn't safe because tcf_block_put() unlike to tcf_destroy_chain()
>cannot be called second time.
>
>This patch set class->block to NULL after first tcf_block_put() and
>turn second call into no-op.
>
>Signed-off-by: Konstantin Khlebnikov 
>Fixes: 6529eaba33f0 ("net: sched: introduce tcf block infractructure")

Acked-by: Jiri Pirko 

Thanks!


Re: [PATCH] Adding-Agile-SD-TCP-module-and-modifying-Kconfig-and-makefile

2017-08-15 Thread Neal Cardwell
On Tue, Aug 15, 2017 at 9:08 AM, mohamedalrshah
 wrote:
> This commit implements a new TCP congestion control algorithm, namely 
> Agile-SD.

Also, please use a summary line for your patch that is more in keeping
with Linux style, using spaces rather than dashes, and leading with
tcp: or tcp_agile_sd:. For example:
  tcp: add Agile SD TCP congestion control module

And please read:
  https://www.kernel.org/doc/html/latest/process/submitting-patches.html

Thanks,
neal


Re: [PATCH v11 0/5] Add new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag

2017-08-15 Thread Eric Dumazet
On Tue, 2017-08-15 at 06:58 -0700, Eric Dumazet wrote:
> On Mon, 2017-08-14 at 22:15 -0700, David Miller wrote:
> > From: Ding Tianhong 
> > Date: Tue, 15 Aug 2017 11:23:22 +0800
> > 
> > > Some devices have problems with Transaction Layer Packets with the Relaxed
> > > Ordering Attribute set.  This patch set adds a new PCIe Device Flag,
> > > PCI_DEV_FLAGS_NO_RELAXED_ORDERING, a set of PCI Quirks to catch some known
> > > devices with Relaxed Ordering issues, and a use of this new flag by the
> > > cxgb4 driver to avoid using Relaxed Ordering with problematic Root Complex
> > > Ports.
> >  ...
> > 
> > Series applied, thanks.
> 
> I got a NULL deref in pci_find_pcie_root_port()
> 

This was :

[4.241029] BUG: unable to handle kernel NULL pointer dereference at 
0050
[4.247001] IP: pci_find_pcie_root_port+0x62/0x80
[4.253011] PGD 0 
[4.253011] P4D 0 
[4.253011] 
[4.258013] Oops:  [#1] SMP DEBUG_PAGEALLOC
[4.262015] Modules linked in:
[4.265005] CPU: 31 PID: 1 Comm: swapper/0 Not tainted 4.13.0-dbx-DEV #316
[4.271002] Hardware name: Intel RML,PCH/Iota_QC_19, BIOS 2.40.0 06/22/2016
[4.279002] task: a2ee38cfa040 task.stack: a51ec0004000
[4.285001] RIP: 0010:pci_find_pcie_root_port+0x62/0x80
[4.290012] RSP: :a51ec0007ab8 EFLAGS: 00010246
[4.295003] RAX:  RBX: a2ee36bae000 RCX: 0006
[4.303002] RDX: 081c RSI: a2ee38cfa8c8 RDI: a2ee36bae000
[4.310013] RBP: a51ec0007b58 R08: 0001 R09: 
[4.317001] R10:  R11:  R12: a51ec0007ad0
[4.324005] R13: a2ee36bae098 R14: 0002 R15: a2ee37204818
[4.331002] FS:  () GS:a2ee3fcc() 
knlGS:
[4.339002] CS:  0010 DS:  ES:  CR0: 80050033
[4.345001] CR2: 0050 CR3: 00401000f000 CR4: 001406e0
[4.351002] Call Trace:
[4.354012]  ? pci_configure_device+0x19f/0x570
[4.359002]  ? pci_conf1_read+0xb8/0xf0
[4.363002]  ? raw_pci_read+0x23/0x40
[4.366011]  ? pci_read+0x2c/0x30
[4.370014]  ? pci_read_config_word+0x67/0x70
[4.374012]  pci_device_add+0x28/0x230
[4.378012]  ? pci_vpd_f0_read+0x50/0x80
[4.382014]  pci_scan_single_device+0x96/0xc0
[4.386012]  pci_scan_slot+0x79/0xf0
[4.389001]  pci_scan_child_bus+0x31/0x180
[4.394014]  acpi_pci_root_create+0x1c6/0x240
[4.398013]  pci_acpi_scan_root+0x15f/0x1b0
[4.402012]  acpi_pci_root_add+0x2e6/0x400
[4.406012]  ? acpi_evaluate_integer+0x37/0x60
[4.411002]  acpi_bus_attach+0xdf/0x200
[4.415002]  acpi_bus_attach+0x6a/0x200
[4.418014]  acpi_bus_attach+0x6a/0x200
[4.422013]  acpi_bus_scan+0x38/0x70
[4.426011]  acpi_scan_init+0x10c/0x271
[4.429001]  acpi_init+0x2fa/0x348
[4.433004]  ? acpi_sleep_proc_init+0x2d/0x2d
[4.437001]  do_one_initcall+0x43/0x169
[4.441001]  kernel_init_freeable+0x1d0/0x258
[4.445003]  ? rest_init+0xe0/0xe0
[4.449001]  kernel_init+0xe/0x150
[4.451002]  ret_from_fork+0x27/0x40
[4.457004] Code: 85 d2 74 27 80 7a 4a 00 74 21 48 89 d0 48 89 c2 f6 80 1b 
09 00 00 10 74 07 48 8b 90 a0 0a 00 00 48 8b 52 10 48 83 7a 10 00 75 d0 <0f> b7 
50 50 5d 81 e2 f0 00 00 00 83 fa 40 ba 00 00 00 00 48 0f 
[4.474012] RIP: pci_find_pcie_root_port+0x62/0x80 RSP: a51ec0007ab8
[4.481004] CR2: 0050
[4.484001] ---[ end trace 6f9be6a057581199 ]---
[4.488001] Kernel panic - not syncing: Fatal exception
[4.494013] Rebooting in 10 seconds..
[4.494013] ACPI MEMORY or I/O RESET_REG.

> 
> This local hack seems to fix the issue.
> 
> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
> index 
> af0cc3456dc1b48b1325c06c5edd2ca8cc22a640..cfd8eb5a3d0ba8347d44952ffab28d9c761044d3
>  100644
> --- a/drivers/pci/pci.c
> +++ b/drivers/pci/pci.c
> @@ -522,7 +522,7 @@ struct pci_dev *pci_find_pcie_root_port(struct pci_dev 
> *dev)
> bridge = pci_upstream_bridge(bridge);
> }
>  
> -   if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT)
> +   if (highest_pcie_bridge && pci_pcie_type(highest_pcie_bridge) != 
> PCI_EXP_TYPE_ROOT_PORT)
> return NULL;
>  
> return highest_pcie_bridge;




Re: [PATCH V4 net 0/2] ipv6: fix flowlabel issue for reset packet

2017-08-15 Thread Tom Herbert
On Mon, Aug 14, 2017 at 7:52 PM, Shaohua Li  wrote:
> On Fri, Aug 11, 2017 at 06:00:20PM -0700, Tom Herbert wrote:
>> On Thu, Aug 10, 2017 at 12:13 PM, Shaohua Li  wrote:
>> > On Thu, Aug 10, 2017 at 11:30:51AM -0700, Tom Herbert wrote:
>> >> On Thu, Aug 10, 2017 at 9:30 AM, Shaohua Li  wrote:
>> >> > On Wed, Aug 09, 2017 at 09:40:08AM -0700, Tom Herbert wrote:
>> >> >> On Mon, Jul 31, 2017 at 3:19 PM, Shaohua Li  wrote:
>> >> >> > From: Shaohua Li 
>> >> >> >
>> >> >> > Please see below tcpdump output:
>> >> >> > 21:00:48.109122 IP6 (flowlabel 0x43304, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 40) fec0::5054:ff:fe12:3456.55804 > 
>> >> >> > fec0::5054:ff:fe12:3456.: Flags [S], cksum 0x0529 (incorrect -> 
>> >> >> > 0xf56c), seq 3282214508, win 43690, options [mss 65476,sackOK,TS val 
>> >> >> > 2500903437 ecr 0,nop,wscale 7], length 0
>> >> >> > 21:00:48.109381 IP6 (flowlabel 0xd827f, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 40) fec0::5054:ff:fe12:3456. > 
>> >> >> > fec0::5054:ff:fe12:3456.55804: Flags [S.], cksum 0x0529 (incorrect 
>> >> >> > -> 0x49ad), seq 1923801573, ack 3282214509, win 43690, options [mss 
>> >> >> > 65476,sackOK,TS val 2500903437 ecr 2500903437,nop,wscale 7], length 0
>> >> >> > 21:00:48.109548 IP6 (flowlabel 0x43304, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 32) fec0::5054:ff:fe12:3456.55804 > 
>> >> >> > fec0::5054:ff:fe12:3456.: Flags [.], cksum 0x0521 (incorrect -> 
>> >> >> > 0x1bdf), seq 1, ack 1, win 342, options [nop,nop,TS val 2500903437 
>> >> >> > ecr 2500903437], length 0
>> >> >> > 21:00:48.109823 IP6 (flowlabel 0x43304, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 62) fec0::5054:ff:fe12:3456.55804 > 
>> >> >> > fec0::5054:ff:fe12:3456.: Flags [P.], cksum 0x053f (incorrect -> 
>> >> >> > 0xb8b1), seq 1:31, ack 1, win 342, options [nop,nop,TS val 
>> >> >> > 2500903437 ecr 2500903437], length 30
>> >> >> > 21:00:48.109910 IP6 (flowlabel 0xd827f, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 32) fec0::5054:ff:fe12:3456. > 
>> >> >> > fec0::5054:ff:fe12:3456.55804: Flags [.], cksum 0x0521 (incorrect -> 
>> >> >> > 0x1bc1), seq 1, ack 31, win 342, options [nop,nop,TS val 2500903437 
>> >> >> > ecr 2500903437], length 0
>> >> >> > 21:00:48.110043 IP6 (flowlabel 0xd827f, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 56) fec0::5054:ff:fe12:3456. > 
>> >> >> > fec0::5054:ff:fe12:3456.55804: Flags [P.], cksum 0x0539 (incorrect 
>> >> >> > -> 0xb726), seq 1:25, ack 31, win 342, options [nop,nop,TS val 
>> >> >> > 2500903438 ecr 2500903437], length 24
>> >> >> > 21:00:48.110173 IP6 (flowlabel 0x43304, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 32) fec0::5054:ff:fe12:3456.55804 > 
>> >> >> > fec0::5054:ff:fe12:3456.: Flags [.], cksum 0x0521 (incorrect -> 
>> >> >> > 0x1ba7), seq 31, ack 25, win 342, options [nop,nop,TS val 2500903438 
>> >> >> > ecr 2500903438], length 0
>> >> >> > 21:00:48.110211 IP6 (flowlabel 0xd827f, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 32) fec0::5054:ff:fe12:3456. > 
>> >> >> > fec0::5054:ff:fe12:3456.55804: Flags [F.], cksum 0x0521 (incorrect 
>> >> >> > -> 0x1ba7), seq 25, ack 31, win 342, options [nop,nop,TS val 
>> >> >> > 2500903438 ecr 2500903437], length 0
>> >> >> > 21:00:48.151099 IP6 (flowlabel 0x43304, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 32) fec0::5054:ff:fe12:3456.55804 > 
>> >> >> > fec0::5054:ff:fe12:3456.: Flags [.], cksum 0x0521 (incorrect -> 
>> >> >> > 0x1ba6), seq 31, ack 26, win 342, options [nop,nop,TS val 2500903438 
>> >> >> > ecr 2500903438], length 0
>> >> >> > 21:00:49.110524 IP6 (flowlabel 0x43304, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 56) fec0::5054:ff:fe12:3456.55804 > 
>> >> >> > fec0::5054:ff:fe12:3456.: Flags [P.], cksum 0x0539 (incorrect -> 
>> >> >> > 0xb324), seq 31:55, ack 26, win 342, options [nop,nop,TS val 
>> >> >> > 2500904438 ecr 2500903438], length 24
>> >> >> > 21:00:49.110637 IP6 (flowlabel 0xb34d5, hlim 64, next-header TCP (6) 
>> >> >> > payload length: 20) fec0::5054:ff:fe12:3456. > 
>> >> >> > fec0::5054:ff:fe12:3456.55804: Flags [R], cksum 0x0515 (incorrect -> 
>> >> >> > 0x668c), seq 1923801599, win 0, length 0
>> >> >> >
>> >> >> > The flowlabel of the reset packet (0xb34d5) and the flowlabel of a
>> >> >> > normal packet (0xd827f) are different. As a result, our router does
>> >> >> > not correctly close the TCP connection. The patches try to fix the
>> >> >> > issue.
>> >> >> >
>> >> >> Shaohua,
>> >> >>
>> >> >> Can you give some more detail about what "the router doesn't close
>> >> >> the TCP connection" means? I'm guessing the problem is either: 1) the
>> >> >> router is maintaining connection state that includes the flow label in
>> >> >> a connection tuple. 2) some router in the path is maintaining
>> >> >> connection state, but when the flow label changes the flow's packet
>> >> >> are routed through a differe

Re: [PATCH] net_sched/sfq: update hierarchical backlog when drop packet

2017-08-15 Thread Eric Dumazet
On Tue, 2017-08-15 at 16:37 +0300, Konstantin Khlebnikov wrote:
> When sfq_enqueue() drops the head packet or a packet from another queue, it
> has to update the backlog at upper qdiscs too.
> 
> Signed-off-by: Konstantin Khlebnikov 
> Fixes: 2f5fb43f ("net_sched: update hierarchical backlog too")
> ---
>  net/sched/sch_sfq.c |5 -
>  1 file changed, 4 insertions(+), 1 deletion(-)
> 
> diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
> index f80ea2cc5f1f..82469ef9655e 100644
> --- a/net/sched/sch_sfq.c
> +++ b/net/sched/sch_sfq.c
> @@ -437,6 +437,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, 
> struct sk_buff **to_free)
>   qdisc_drop(head, sch, to_free);
>  
>   slot_queue_add(slot, skb);
> + qdisc_tree_reduce_backlog(sch, 0, delta);
>   return NET_XMIT_CN;
>   }
>  
> @@ -468,8 +469,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, 
> struct sk_buff **to_free)
>   /* Return Congestion Notification only if we dropped a packet
>* from this flow.
>*/
> - if (qlen != slot->qlen)
> + if (qlen != slot->qlen) {
> + qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
>   return NET_XMIT_CN;
> + }
>  
>   /* As we dropped a packet, better let upper stack know this */
>   qdisc_tree_reduce_backlog(sch, 1, dropped);
> 

Are you sure you have tested this patch ?




Possible race in via-ircc.ko

2017-08-15 Thread Anton Volkov

Hello.

While searching for races in the Linux kernel I've come across the
"drivers/net/irda/via-ircc.ko" module. Here are the questions that I came up
with while analyzing the results. Line numbers are given using the info from
Linux v4.12.


Consider the following case:

Thread 1:                      Thread 2:
via_ircc_net_open
  request_irq

                               via_ircc_interrupt
-> via_ircc_dma_receive        -> RxTimerHandler
   (via-ircc.c: line 1488)        (via-ircc.c: line 1315)
     self->... = ...                ... = self->...

In via_ircc_dma_receive a lot of fields of the 'self' structure are
initialized, and via_ircc_interrupt and RxTimerHandler use those fields.
If the initialization has not happened yet, the interrupt handler and the
functions that it calls may work with incorrect data. I'm not sure how bad
this case can be, and thus here are my questions. Is this situation feasible
from your point of view? If it is feasible, is it a benign race or something
serious?


Thank you for your time.

-- Anton Volkov
Linux Verification Center, ISPRAS
web: http://linuxtesting.org
e-mail: avol...@ispras.ru


Re: [patch net-next] ipv6: fib: Provide offload indication using nexthop flags

2017-08-15 Thread David Ahern
On 8/15/17 1:09 AM, Jiri Pirko wrote:
> From: Ido Schimmel 
> 
> IPv6 routes currently lack nexthop flags as in IPv4. This has several
> implications.
> 
> In the forwarding path, it requires us to check the carrier state of the
> nexthop device and potentially ignore a linkdown route, instead of
> checking for RTNH_F_LINKDOWN.
> 
> It also requires capable drivers to use the user facing IPv6-specific
> route flags to provide offload indication, instead of using the nexthop
> flags as in IPv4.
> 
> Add nexthop flags to IPv6 routes in the 40 bytes hole and use it to
> provide offload indication instead of the RTF_OFFLOAD flag, which is
> removed while it's still not part of any official kernel release.
> 
> In the near future we would like to use the field for the
> RTNH_F_{LINKDOWN,DEAD} flags, but this change is more involved and might
> not be ready in time for the current cycle.
> 
> Signed-off-by: Ido Schimmel 
> Signed-off-by: Jiri Pirko 
> ---
>  drivers/net/ethernet/mellanox/mlxsw/spectrum_router.c | 8 
>  include/net/ip6_fib.h | 2 ++
>  include/uapi/linux/ipv6_route.h   | 1 -
>  net/ipv6/route.c  | 7 +--
>  4 files changed, 7 insertions(+), 11 deletions(-)

LGTM.

Acked-by: David Ahern 


Re: [PATCH 2/5] net/9p: Improve 19 size determinations

2017-08-15 Thread Al Viro
On Tue, Aug 15, 2017 at 02:00:06PM +0200, SF Markus Elfring wrote:
> From: Markus Elfring 
> Date: Tue, 15 Aug 2017 09:36:20 +0200
> 
> Replace the specification of data structures by variable references
> as the parameter for the operator "sizeof" to make the corresponding size
> determination a bit safer according to the Linux coding style convention.

Garbage.  This makes it so much harder to find where the objects of given
type are created.  It's _not_ safer and any patches of that sort around
VFS will be shitcanned.  What to do with net/9p patches is up to net/9p
maintainers, but I would strongly recommend the same treatment.


Re: [PATCH] net_sched/sfq: update hierarchical backlog when drop packet

2017-08-15 Thread Konstantin Khlebnikov

On 15.08.2017 17:09, Eric Dumazet wrote:

On Tue, 2017-08-15 at 16:37 +0300, Konstantin Khlebnikov wrote:

When sfq_enqueue() drops the head packet or a packet from another queue, it
has to update the backlog at upper qdiscs too.

Signed-off-by: Konstantin Khlebnikov 
Fixes: 2f5fb43f ("net_sched: update hierarchical backlog too")
---
  net/sched/sch_sfq.c |5 -
  1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/sched/sch_sfq.c b/net/sched/sch_sfq.c
index f80ea2cc5f1f..82469ef9655e 100644
--- a/net/sched/sch_sfq.c
+++ b/net/sched/sch_sfq.c
@@ -437,6 +437,7 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct 
sk_buff **to_free)
qdisc_drop(head, sch, to_free);
  
  		slot_queue_add(slot, skb);

+   qdisc_tree_reduce_backlog(sch, 0, delta);
return NET_XMIT_CN;
}
  
@@ -468,8 +469,10 @@ sfq_enqueue(struct sk_buff *skb, struct Qdisc *sch, struct sk_buff **to_free)

/* Return Congestion Notification only if we dropped a packet
 * from this flow.
 */
-   if (qlen != slot->qlen)
+   if (qlen != slot->qlen) {
+   qdisc_tree_reduce_backlog(sch, 0, dropped - qdisc_pkt_len(skb));
return NET_XMIT_CN;
+   }
  
  	/* As we dropped a packet, better let upper stack know this */

qdisc_tree_reduce_backlog(sch, 1, dropped);



Are you sure you have tested this patch ?



Nope. I'm not sure. But we have had something similar in our 4.4 kernel for a while.

Also fq_codel and pfifo_head_drop do something similar to this.

This might crash without "[PATCH 1/2] net_sched: call qlen_notify only if
child qdisc is empty".
I haven't tested them separately.


[PATCH net-next 3/4] ipv6: route: set ipv6 RTM_GETROUTE to not use rtnl

2017-08-15 Thread Florian Westphal
Signed-off-by: Florian Westphal 
---
 net/ipv6/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index ec694fdb8cc5..3c15f005c90e 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -4112,7 +4112,8 @@ int __init ip6_route_init(void)
ret = -ENOBUFS;
if (__rtnl_register(PF_INET6, RTM_NEWROUTE, inet6_rtm_newroute, NULL, 
0) ||
__rtnl_register(PF_INET6, RTM_DELROUTE, inet6_rtm_delroute, NULL, 
0) ||
-   __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL, 
0))
+   __rtnl_register(PF_INET6, RTM_GETROUTE, inet6_rtm_getroute, NULL,
+   RTNL_FLAG_DOIT_UNLOCKED))
goto out_register_late_subsys;
 
ret = register_netdevice_notifier(&ip6_route_dev_notifier);
-- 
2.13.0



[PATCH net-next 4/4] ipv4: route: set ipv4 RTM_GETROUTE to not use rtnl

2017-08-15 Thread Florian Westphal
Signed-off-by: Florian Westphal 
---
 net/ipv4/route.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 6810d2076b1b..618bbe1405fc 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -3073,7 +3073,8 @@ int __init ip_rt_init(void)
xfrm_init();
xfrm4_init();
 #endif
-   rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL, 0);
+   rtnl_register(PF_INET, RTM_GETROUTE, inet_rtm_getroute, NULL,
+ RTNL_FLAG_DOIT_UNLOCKED);
 
 #ifdef CONFIG_SYSCTL
register_pernet_subsys(&sysctl_route_ops);
-- 
2.13.0



[PATCH net-next 0/4] inet: make RTM_GETROUTE work without rtnl

2017-08-15 Thread Florian Westphal
ipv4 getroute doesn't assume rtnl lock is held anymore, also make
this true for ipv6, then switch both to DOIT_UNLOCKED.



[PATCH net-next 2/4] ipv6: route: make rtm_getroute not assume rtnl is locked

2017-08-15 Thread Florian Westphal
__dev_get_by_index assumes RTNL is held, use _rcu version instead.

Signed-off-by: Florian Westphal 
---
 net/ipv6/route.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 035762fed07d..ec694fdb8cc5 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -3616,8 +3616,11 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh,
struct net_device *dev;
int flags = 0;
 
-   dev = __dev_get_by_index(net, iif);
+   rcu_read_lock();
+
+   dev = dev_get_by_index_rcu(net, iif);
if (!dev) {
+   rcu_read_unlock();
err = -ENODEV;
goto errout;
}
@@ -3629,6 +3632,8 @@ static int inet6_rtm_getroute(struct sk_buff *in_skb, 
struct nlmsghdr *nlh,
 
if (!fibmatch)
dst = ip6_route_input_lookup(net, dev, &fl6, flags);
+
+   rcu_read_unlock();
} else {
fl6.flowi6_oif = oif;
 
-- 
2.13.0



[PATCH net-next 1/4] selftests: add 'ip get' to rtnetlink.sh

2017-08-15 Thread Florian Westphal
exercise ip/ip6 RTM_GETROUTE doit() callpath.

Signed-off-by: Florian Westphal 
---
 tools/testing/selftests/net/rtnetlink.sh | 32 
 1 file changed, 32 insertions(+)

diff --git a/tools/testing/selftests/net/rtnetlink.sh 
b/tools/testing/selftests/net/rtnetlink.sh
index 5b04ad912525..84b4acf5baa9 100755
--- a/tools/testing/selftests/net/rtnetlink.sh
+++ b/tools/testing/selftests/net/rtnetlink.sh
@@ -164,6 +164,37 @@ kci_test_polrouting()
echo "PASS: policy routing"
 }
 
+kci_test_route_get()
+{
+   ret=0
+
+   ip route get 127.0.0.1 > /dev/null
+   check_err $?
+   ip route get 127.0.0.1 dev "$devdummy" > /dev/null
+   check_err $?
+   ip route get ::1 > /dev/null
+   check_err $?
+   ip route get fe80::1 dev "$devdummy" > /dev/null
+   check_err $?
+   ip route get 127.0.0.1 from 127.0.0.1 oif lo tos 0x1 mark 0x1 > 
/dev/null
+   check_err $?
+   ip route get ::1 from ::1 iif lo oif lo tos 0x1 mark 0x1 > /dev/null
+   check_err $?
+   ip addr add dev "$devdummy" 10.23.7.11/24
+   check_err $?
+   ip route get 10.23.7.11 from 10.23.7.12 iif "$devdummy" > /dev/null
+   check_err $?
+   ip addr del dev "$devdummy" 10.23.7.11/24
+   check_err $?
+
+   if [ $ret -ne 0 ];then
+   echo "FAIL: route get"
+   return 1
+   fi
+
+   echo "PASS: route get"
+}
+
 kci_test_rtnl()
 {
kci_add_dummy
@@ -173,6 +204,7 @@ kci_test_rtnl()
fi
 
kci_test_polrouting
+   kci_test_route_get
kci_test_tc
kci_test_gre
kci_test_bridge
-- 
2.13.0



Re: [PATCH v2 net-next 1/3] ipv6: Prevent unexpected sk->sk_prot changes

2017-08-15 Thread Eric Dumazet
On Tue, 2017-08-15 at 13:08 +, Boris Pismenny wrote:
> Hi Eric,


> 
> The IPV6_ADDRFORM socket option assumes that when 
> (sk->sk_protocol == IPPROTO_TCP)
> then the sk_proto is set to tcpv6_prot and it replaces it with tcp_prot.
> 
> This patch ensures that the IPV6_ADDRFORM socket option doesn't replace
> the socket's sk_prot to tcp when it is not expected. For example, TLS sockets
> also replace sk_prot, and we need to prevent IPV6_ADDRFORM from
> overriding these.
> 
> Are you suggesting that each socket protocol will provide a method that
> converts it from IPv6 to IPv4?

I am just saying IPV6_ADDRFORM is becoming spaghetti code, and maybe
it is time to make it modular.

The UDP or TCP magic really should be implemented in TCP or UDP files.

(Who knows, maybe one day SCTP or DCCP will support IPV6_ADDRFORM)





Re: [PATCH v11 0/5] Add new PCI_DEV_FLAGS_NO_RELAXED_ORDERING flag

2017-08-15 Thread Ding Tianhong


On 2017/8/15 22:03, Eric Dumazet wrote:
> On Tue, 2017-08-15 at 06:58 -0700, Eric Dumazet wrote:
>> On Mon, 2017-08-14 at 22:15 -0700, David Miller wrote:
>>> From: Ding Tianhong 
>>> Date: Tue, 15 Aug 2017 11:23:22 +0800
>>>
>>>> Some devices have problems with Transaction Layer Packets with the Relaxed
>>>> Ordering Attribute set.  This patch set adds a new PCIe Device Flag,
>>>> PCI_DEV_FLAGS_NO_RELAXED_ORDERING, a set of PCI Quirks to catch some known
>>>> devices with Relaxed Ordering issues, and a use of this new flag by the
>>>> cxgb4 driver to avoid using Relaxed Ordering with problematic Root Complex
>>>> Ports.
>>>  ...
>>>
>>> Series applied, thanks.
>>
>> I got a NULL deref in pci_find_pcie_root_port()
>>
> 
> This was :
> 
> [4.241029] BUG: unable to handle kernel NULL pointer dereference at 
> 0050
> [4.247001] IP: pci_find_pcie_root_port+0x62/0x80
> [4.253011] PGD 0 
> [4.253011] P4D 0 
> [4.253011] 
> [4.258013] Oops:  [#1] SMP DEBUG_PAGEALLOC
> [4.262015] Modules linked in:
> [4.265005] CPU: 31 PID: 1 Comm: swapper/0 Not tainted 4.13.0-dbx-DEV #316
> [4.271002] Hardware name: Intel RML,PCH/Iota_QC_19, BIOS 2.40.0 06/22/2016
> [4.279002] task: a2ee38cfa040 task.stack: a51ec0004000
> [4.285001] RIP: 0010:pci_find_pcie_root_port+0x62/0x80
> [4.290012] RSP: :a51ec0007ab8 EFLAGS: 00010246
> [4.295003] RAX:  RBX: a2ee36bae000 RCX: 
> 0006
> [4.303002] RDX: 081c RSI: a2ee38cfa8c8 RDI: 
> a2ee36bae000
> [4.310013] RBP: a51ec0007b58 R08: 0001 R09: 
> 
> [4.317001] R10:  R11:  R12: 
> a51ec0007ad0
> [4.324005] R13: a2ee36bae098 R14: 0002 R15: 
> a2ee37204818
> [4.331002] FS:  () GS:a2ee3fcc() 
> knlGS:
> [4.339002] CS:  0010 DS:  ES:  CR0: 80050033
> [4.345001] CR2: 0050 CR3: 00401000f000 CR4: 
> 001406e0
> [4.351002] Call Trace:
> [4.354012]  ? pci_configure_device+0x19f/0x570
> [4.359002]  ? pci_conf1_read+0xb8/0xf0
> [4.363002]  ? raw_pci_read+0x23/0x40
> [4.366011]  ? pci_read+0x2c/0x30
> [4.370014]  ? pci_read_config_word+0x67/0x70
> [4.374012]  pci_device_add+0x28/0x230
> [4.378012]  ? pci_vpd_f0_read+0x50/0x80
> [4.382014]  pci_scan_single_device+0x96/0xc0
> [4.386012]  pci_scan_slot+0x79/0xf0
> [4.389001]  pci_scan_child_bus+0x31/0x180
> [4.394014]  acpi_pci_root_create+0x1c6/0x240
> [4.398013]  pci_acpi_scan_root+0x15f/0x1b0
> [4.402012]  acpi_pci_root_add+0x2e6/0x400
> [4.406012]  ? acpi_evaluate_integer+0x37/0x60
> [4.411002]  acpi_bus_attach+0xdf/0x200
> [4.415002]  acpi_bus_attach+0x6a/0x200
> [4.418014]  acpi_bus_attach+0x6a/0x200
> [4.422013]  acpi_bus_scan+0x38/0x70
> [4.426011]  acpi_scan_init+0x10c/0x271
> [4.429001]  acpi_init+0x2fa/0x348
> [4.433004]  ? acpi_sleep_proc_init+0x2d/0x2d
> [4.437001]  do_one_initcall+0x43/0x169
> [4.441001]  kernel_init_freeable+0x1d0/0x258
> [4.445003]  ? rest_init+0xe0/0xe0
> [4.449001]  kernel_init+0xe/0x150
> [4.451002]  ret_from_fork+0x27/0x40
> [4.457004] Code: 85 d2 74 27 80 7a 4a 00 74 21 48 89 d0 48 89 c2 f6 80 1b 
> 09 00 00 10 74 07 48 8b 90 a0 0a 00 00 48 8b 52 10 48 83 7a 10 00 75 d0 <0f> 
> b7 50 50 5d 81 e2 f0 00 00 00 83 fa 40 ba 00 00 00 00 48 0f 
> [4.474012] RIP: pci_find_pcie_root_port+0x62/0x80 RSP: a51ec0007ab8
> [4.481004] CR2: 0050
> [4.484001] ---[ end trace 6f9be6a057581199 ]---
> [4.488001] Kernel panic - not syncing: Fatal exception
> [4.494013] Rebooting in 10 seconds..
> [4.494013] ACPI MEMORY or I/O RESET_REG.
> 
>>
>> This local hack seems to fix the issue.
>>
>> diff --git a/drivers/pci/pci.c b/drivers/pci/pci.c
>> index 
>> af0cc3456dc1b48b1325c06c5edd2ca8cc22a640..cfd8eb5a3d0ba8347d44952ffab28d9c761044d3
>>  100644
>> --- a/drivers/pci/pci.c
>> +++ b/drivers/pci/pci.c
>> @@ -522,7 +522,7 @@ struct pci_dev *pci_find_pcie_root_port(struct pci_dev 
>> *dev)
>> bridge = pci_upstream_bridge(bridge);
>> }
>>  
>> -   if (pci_pcie_type(highest_pcie_bridge) != PCI_EXP_TYPE_ROOT_PORT)
>> +   if (highest_pcie_bridge && pci_pcie_type(highest_pcie_bridge) != 
>> PCI_EXP_TYPE_ROOT_PORT)
>> return NULL;
>>  
>> return highest_pcie_bridge;
> 

It is very strange that I could not reproduce this problem on my server, which
is a Xeon 2690v3, but it is obviously an issue when the dev cannot find an
upstream bridge in pci_find_pcie_root_port(), so the better way is just what
you did in this patch. Thanks.

Regards
Tianhong

> 
> 
> .
> 



Re: [PATCH] net_sched/sfq: update hierarchical backlog when drop packet

2017-08-15 Thread Eric Dumazet
On Tue, 2017-08-15 at 17:33 +0300, Konstantin Khlebnikov wrote:

> Nope. I'm not sure. But we have had something similar in our 4.4 kernel
> for a while.
> 
> Also fq_codel and pfifo_head_drop do something similar to this.
> 
> This might crash without "[PATCH 1/2] net_sched: call
> qlen_notify only if child qdisc is empty".
> I haven't tested them separately.

Thanks for the info. I've read this patch and it looks fine indeed :)

Acked-by: Eric Dumazet 




[PATCH ] dt-bindings: net: ravb : Add support for r8a7745 SoC

2017-08-15 Thread Biju Das
Add a new compatible string for the RZ/G1E (R8A7745) SoC.

Signed-off-by: Biju Das 
---
This patch is tested against linux-next tag next-20170815
as well as net-next.

 Documentation/devicetree/bindings/net/renesas,ravb.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/net/renesas,ravb.txt 
b/Documentation/devicetree/bindings/net/renesas,ravb.txt
index 4717bc2..1672353 100644
--- a/Documentation/devicetree/bindings/net/renesas,ravb.txt
+++ b/Documentation/devicetree/bindings/net/renesas,ravb.txt
@@ -6,6 +6,7 @@ interface contains.
 Required properties:
 - compatible: Must contain one or more of the following:
   - "renesas,etheravb-r8a7743" for the R8A7743 SoC.
+  - "renesas,etheravb-r8a7745" for the R8A7745 SoC.
   - "renesas,etheravb-r8a7790" for the R8A7790 SoC.
   - "renesas,etheravb-r8a7791" for the R8A7791 SoC.
   - "renesas,etheravb-r8a7792" for the R8A7792 SoC.
-- 
1.9.1



Re: general protection fault in fib_dump_info

2017-08-15 Thread Roopa Prabhu
On Tue, Aug 15, 2017 at 5:05 AM, Florian Westphal  wrote:
> idaifish  wrote:
>> Syzkaller hit 'general protection fault in fib_dump_info' bug on
>> commit 4.13-rc5..
>
> CC Roopa
>
>> Guilty file: net/ipv4/fib_semantics.c
>>
>> kasan: GPF could be caused by NULL-ptr deref or user memory access
>> general protection fault:  [#1] SMP KASAN
>> Modules linked in:
>> CPU: 0 PID: 2808 Comm: syz-executor0 Not tainted 4.13.0-rc5 #1
>> Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS
>> Ubuntu-1.8.2-1ubuntu1 04/01/2014
>> task: 880078562700 task.stack: 88007811
>> RIP: 0010:fib_dump_info+0x388/0x1170 net/ipv4/fib_semantics.c:1314
>> RSP: 0018:880078117010 EFLAGS: 00010206
>> RAX: dc00 RBX: 00fe RCX: 0002
>> RDX: 0006 RSI: 880078117084 RDI: 0030
>> RBP: 880078117268 R08: 000c R09: 8800780d80c8
>> R10: 58d629b4 R11: 67fce681 R12: 
>> R13: 8800784bd540 R14: 8800780d80b5 R15: 8800780d80a4
>> FS:  022fa940() GS:88007fc0() knlGS:
>> CS:  0010 DS:  ES:  CR0: 80050033
>> CR2: 004387d0 CR3: 79135000 CR4: 06f0
>> Call Trace:
>>  inet_rtm_getroute+0xc89/0x1f50 net/ipv4/route.c:2766
>>  rtnetlink_rcv_msg+0x288/0x680 net/core/rtnetlink.c:4217
>
> Seems like this is from
> b61798130f1be5bff08712308126c2d7ebe390ef
>
> Roopa, it seems to assume res.fi != NULL, but afaics there
> is no guarantee, f.e. in ip_route_input_rcu() in the multicast
> branch res isn't changed at all.

yes, you are right. Just checked. Thanks for catching this and digging
into it further.



>
> If thats true, we might need this fix?

I think fib match should error out for this case.
still debating about the return error code... But the below should fix it.



diff --git a/net/ipv4/route.c b/net/ipv4/route.c

index 0383e66f..f21c760 100644

--- a/net/ipv4/route.c

+++ b/net/ipv4/route.c

@@ -2762,14 +2762,19 @@ static int inet_rtm_getroute(struct sk_buff
*in_skb, struct nlmsghdr *nlh,

if (rtm->rtm_flags & RTM_F_LOOKUP_TABLE)

table_id = rt->rt_table_id;



-   if (rtm->rtm_flags & RTM_F_FIB_MATCH)

+   if (rtm->rtm_flags & RTM_F_FIB_MATCH) {

+   if (!res->fi) {

+   err = -EINVAL;

+   goto errout_free;

+   }

err = fib_dump_info(skb, NETLINK_CB(in_skb).portid,

nlh->nlmsg_seq, RTM_NEWROUTE, table_id,

rt->rt_type, res.prefix, res.prefixlen,

fl4.flowi4_tos, res.fi, 0);

-   else

+   } else {

err = rt_fill_info(net, dst, src, table_id, &fl4, skb,

   NETLINK_CB(in_skb).portid, nlh->nlmsg_seq);

+   }

if (err < 0)

goto errout_free;


[PATCH net-next 00/12] s390/net: updates for 4.14

2017-08-15 Thread Julian Wiedmann
Hi Dave,

a mixed bag of minor fixes, cleanups and refactors for net-next. Please apply.

Thanks,
Julian

Julian Wiedmann (9):
  s390/qeth: don't access skb after transmission
  s390/qeth: remove extra L2 adapterparms query
  s390/qeth: remove extra L3 adapterparms query
  s390/qeth: simplify fragment type selection
  s390/qeth: straighten out fill_buffer() interface
  s390/qeth: clean up fill_buffer() offset logic
  s390/qeth: make more use of skb API
  s390/net: reduce inlining
  s390/qeth: extract bridgeport cmd builder

Kittipon Meesompop (3):
  s390/qeth: reject multicast rxip addresses
  s390/qeth: fix trace-messages for deleting rxip addresses
  s390/qeth: fix using of ref counter for rxip addresses

 drivers/s390/net/ctcm_main.c  |   2 +-
 drivers/s390/net/lcs.c|  28 
 drivers/s390/net/netiucv.c|   4 +-
 drivers/s390/net/qeth_core.h  |  11 ++-
 drivers/s390/net/qeth_core_main.c | 147 ++
 drivers/s390/net/qeth_core_sys.c  |   2 +-
 drivers/s390/net/qeth_l2_main.c   | 116 +++---
 drivers/s390/net/qeth_l3_main.c   |  49 +
 drivers/s390/net/qeth_l3_sys.c|  17 +
 9 files changed, 169 insertions(+), 207 deletions(-)

-- 
2.11.2



[PATCH net-next 11/12] s390/qeth: fix trace-messages for deleting rxip addresses

2017-08-15 Thread Julian Wiedmann
From: Kittipon Meesompop 

change trace-messages:
- from addrxip4 to delrxip4
- from addrxip6 to delrxip6

Signed-off-by: Kittipon Meesompop 
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l3_main.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 140ed124d2f0..41bd00454d0f 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -784,11 +784,11 @@ void qeth_l3_del_rxip(struct qeth_card *card, enum 
qeth_prot_versions proto,
ipaddr = qeth_l3_get_addr_buffer(proto);
if (ipaddr) {
if (proto == QETH_PROT_IPV4) {
-   QETH_CARD_TEXT(card, 2, "addrxip4");
+   QETH_CARD_TEXT(card, 2, "delrxip4");
memcpy(&ipaddr->u.a4.addr, addr, 4);
ipaddr->u.a4.mask = 0;
} else if (proto == QETH_PROT_IPV6) {
-   QETH_CARD_TEXT(card, 2, "addrxip6");
+   QETH_CARD_TEXT(card, 2, "delrxip6");
memcpy(&ipaddr->u.a6.addr, addr, 16);
ipaddr->u.a6.pfxlen = 0;
}
-- 
2.11.2



[PATCH net-next 12/12] s390/qeth: fix using of ref counter for rxip addresses

2017-08-15 Thread Julian Wiedmann
From: Kittipon Meesompop 

IP-address setting and removal are delayed when the device is not yet in
state SOFTSETUP or UP. ref_counter has been implemented only for
ip-address with type normal. In this patch ref_counter logic is also used
for ip-address with type rxip to allow appropriate handling of multiple
postponed rxip add and del calls.

Signed-off-by: Kittipon Meesompop 
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l3_main.c | 8 +---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/drivers/s390/net/qeth_l3_main.c b/drivers/s390/net/qeth_l3_main.c
index 41bd00454d0f..0a3dc14a1381 100644
--- a/drivers/s390/net/qeth_l3_main.c
+++ b/drivers/s390/net/qeth_l3_main.c
@@ -247,7 +247,8 @@ int qeth_l3_delete_ip(struct qeth_card *card, struct 
qeth_ipaddr *tmp_addr)
return -ENOENT;
 
addr->ref_counter--;
-   if (addr->type == QETH_IP_TYPE_NORMAL && addr->ref_counter > 0)
+   if (addr->ref_counter > 0 && (addr->type == QETH_IP_TYPE_NORMAL ||
+ addr->type == QETH_IP_TYPE_RXIP))
return rc;
if (addr->in_progress)
return -EINPROGRESS;
@@ -329,8 +330,9 @@ int qeth_l3_add_ip(struct qeth_card *card, struct 
qeth_ipaddr *tmp_addr)
kfree(addr);
}
} else {
-   if (addr->type == QETH_IP_TYPE_NORMAL)
-   addr->ref_counter++;
+   if (addr->type == QETH_IP_TYPE_NORMAL ||
+   addr->type == QETH_IP_TYPE_RXIP)
+   addr->ref_counter++;
}
 
return rc;
-- 
2.11.2



[PATCH net-next 04/12] s390/qeth: simplify fragment type selection

2017-08-15 Thread Julian Wiedmann
Improve readability of the code that determines a buffer element's
fragment type, and reduce the number of cases down from 5 to 3.

Signed-off-by: Julian Wiedmann 
Acked-by: Ursula Braun 
---
 drivers/s390/net/qeth_core_main.c | 23 ---
 1 file changed, 8 insertions(+), 15 deletions(-)

diff --git a/drivers/s390/net/qeth_core_main.c 
b/drivers/s390/net/qeth_core_main.c
index 4792cabb862e..3623ba23ff0b 100644
--- a/drivers/s390/net/qeth_core_main.c
+++ b/drivers/s390/net/qeth_core_main.c
@@ -3918,23 +3918,16 @@ static inline void __qeth_fill_buffer(struct sk_buff 
*skb,
buffer->element[element].addr = data;
buffer->element[element].length = length_here;
length -= length_here;
-   if (!length) {
-   if (first_lap)
-   if (skb_shinfo(skb)->nr_frags)
-   buffer->element[element].eflags =
-   SBAL_EFLAGS_FIRST_FRAG;
-   else
-   buffer->element[element].eflags = 0;
-   else
-   buffer->element[element].eflags =
-   SBAL_EFLAGS_MIDDLE_FRAG;
-   } else {
-   if (first_lap)
+   if (first_lap) {
+   if (length || skb_is_nonlinear(skb))
+   /* skb needs additional elements */
buffer->element[element].eflags =
-   SBAL_EFLAGS_FIRST_FRAG;
+   SBAL_EFLAGS_FIRST_FRAG;
else
-   buffer->element[element].eflags =
-   SBAL_EFLAGS_MIDDLE_FRAG;
+   buffer->element[element].eflags = 0;
+   } else {
+   buffer->element[element].eflags =
+   SBAL_EFLAGS_MIDDLE_FRAG;
}
data += length_here;
element++;
-- 
2.11.2



[PATCH net-next 10/12] s390/qeth: reject multicast rxip addresses

2017-08-15 Thread Julian Wiedmann
From: Kittipon Meesompop 

There exist different commands to add unicast and multicast addresses on
the OSA card. rxip addresses are always set as unicast addresses and
thus just unicast addresses should be allowed.

Adding a multicast address now fails and a grace message is generated.

Signed-off-by: Kittipon Meesompop 
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l3_sys.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/drivers/s390/net/qeth_l3_sys.c b/drivers/s390/net/qeth_l3_sys.c
index f2f94f59e0fa..2000ef190e94 100644
--- a/drivers/s390/net/qeth_l3_sys.c
+++ b/drivers/s390/net/qeth_l3_sys.c
@@ -895,9 +895,26 @@ static ssize_t qeth_l3_dev_rxip_add4_show(struct device 
*dev,
 static int qeth_l3_parse_rxipe(const char *buf, enum qeth_prot_versions proto,
 u8 *addr)
 {
+   __be32 ipv4_addr;
+   struct in6_addr ipv6_addr;
+
if (qeth_l3_string_to_ipaddr(buf, proto, addr)) {
return -EINVAL;
}
+   if (proto == QETH_PROT_IPV4) {
+   memcpy(&ipv4_addr, addr, sizeof(ipv4_addr));
+   if (ipv4_is_multicast(ipv4_addr)) {
+   QETH_DBF_MESSAGE(2, "multicast rxip not supported.\n");
+   return -EINVAL;
+   }
+   } else if (proto == QETH_PROT_IPV6) {
+   memcpy(&ipv6_addr, addr, sizeof(ipv6_addr));
+   if (ipv6_addr_is_multicast(&ipv6_addr)) {
+   QETH_DBF_MESSAGE(2, "multicast rxip not supported.\n");
+   return -EINVAL;
+   }
+   }
+
return 0;
 }
 
-- 
2.11.2



[PATCH net-next 08/12] s390/net: reduce inlining

2017-08-15 Thread Julian Wiedmann
Clean up the inline cruft in s390 net drivers. Many of the inlined
functions had only one caller anyway.

Suggested-by: Joe Perches 
Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/ctcm_main.c  |  2 +-
 drivers/s390/net/lcs.c| 28 +++--
 drivers/s390/net/netiucv.c|  4 +--
 drivers/s390/net/qeth_core_main.c | 66 +--
 drivers/s390/net/qeth_core_sys.c  |  2 +-
 drivers/s390/net/qeth_l2_main.c   | 24 +-
 drivers/s390/net/qeth_l3_main.c   | 15 +
 7 files changed, 67 insertions(+), 74 deletions(-)

diff --git a/drivers/s390/net/ctcm_main.c b/drivers/s390/net/ctcm_main.c
index 2ade6131a89f..26363e0816fe 100644
--- a/drivers/s390/net/ctcm_main.c
+++ b/drivers/s390/net/ctcm_main.c
@@ -305,7 +305,7 @@ static long ctcm_check_irb_error(struct ccw_device *cdev, 
struct irb *irb)
  *  ch The channel, the sense code belongs to.
  *  sense  The sense code to inspect.
  */
-static inline void ccw_unit_check(struct channel *ch, __u8 sense)
+static void ccw_unit_check(struct channel *ch, __u8 sense)
 {
CTCM_DBF_TEXT_(TRACE, CTC_DBF_DEBUG,
"%s(%s): %02x",
diff --git a/drivers/s390/net/lcs.c b/drivers/s390/net/lcs.c
index 619da81dca70..d01b5c2a7760 100644
--- a/drivers/s390/net/lcs.c
+++ b/drivers/s390/net/lcs.c
@@ -327,8 +327,7 @@ lcs_set_allowed_threads(struct lcs_card *card, unsigned 
long threads)
spin_unlock_irqrestore(&card->mask_lock, flags);
wake_up(&card->wait_q);
 }
-static inline int
-lcs_threads_running(struct lcs_card *card, unsigned long threads)
+static int lcs_threads_running(struct lcs_card *card, unsigned long threads)
 {
 unsigned long flags;
 int rc = 0;
@@ -346,8 +345,7 @@ lcs_wait_for_threads(struct lcs_card *card, unsigned long 
threads)
 lcs_threads_running(card, threads) == 0);
 }
 
-static inline int
-lcs_set_thread_start_bit(struct lcs_card *card, unsigned long thread)
+static int lcs_set_thread_start_bit(struct lcs_card *card, unsigned long 
thread)
 {
 unsigned long flags;
 
@@ -373,8 +371,7 @@ lcs_clear_thread_running_bit(struct lcs_card *card, 
unsigned long thread)
 wake_up(&card->wait_q);
 }
 
-static inline int
-__lcs_do_run_thread(struct lcs_card *card, unsigned long thread)
+static int __lcs_do_run_thread(struct lcs_card *card, unsigned long thread)
 {
 unsigned long flags;
 int rc = 0;
@@ -444,8 +441,7 @@ lcs_setup_card(struct lcs_card *card)
INIT_LIST_HEAD(&card->lancmd_waiters);
 }
 
-static inline void
-lcs_clear_multicast_list(struct lcs_card *card)
+static void lcs_clear_multicast_list(struct lcs_card *card)
 {
 #ifdef CONFIG_IP_MULTICAST
struct lcs_ipm_list *ipm;
@@ -656,8 +652,7 @@ __lcs_resume_channel(struct lcs_channel *channel)
 /**
  * Make a buffer ready for processing.
  */
-static inline void
-__lcs_ready_buffer_bits(struct lcs_channel *channel, int index)
+static void __lcs_ready_buffer_bits(struct lcs_channel *channel, int index)
 {
int prev, next;
 
@@ -1169,8 +1164,8 @@ lcs_get_mac_for_ipm(__be32 ipm, char *mac, struct 
net_device *dev)
 /**
  * function called by net device to handle multicast address relevant things
  */
-static inline void
-lcs_remove_mc_addresses(struct lcs_card *card, struct in_device *in4_dev)
+static void lcs_remove_mc_addresses(struct lcs_card *card,
+   struct in_device *in4_dev)
 {
struct ip_mc_list *im4;
struct list_head *l;
@@ -1196,8 +1191,9 @@ lcs_remove_mc_addresses(struct lcs_card *card, struct 
in_device *in4_dev)
spin_unlock_irqrestore(&card->ipm_lock, flags);
 }
 
-static inline struct lcs_ipm_list *
-lcs_check_addr_entry(struct lcs_card *card, struct ip_mc_list *im4, char *buf)
+static struct lcs_ipm_list *lcs_check_addr_entry(struct lcs_card *card,
+struct ip_mc_list *im4,
+char *buf)
 {
struct lcs_ipm_list *tmp, *ipm = NULL;
struct list_head *l;
@@ -1218,8 +1214,8 @@ lcs_check_addr_entry(struct lcs_card *card, struct ip_mc_list *im4, char *buf)
return ipm;
 }
 
-static inline void
-lcs_set_mc_addresses(struct lcs_card *card, struct in_device *in4_dev)
+static void lcs_set_mc_addresses(struct lcs_card *card,
+struct in_device *in4_dev)
 {
 
struct ip_mc_list *im4;
diff --git a/drivers/s390/net/netiucv.c b/drivers/s390/net/netiucv.c
index 7e0e6a4019f3..b9c7c1e61da2 100644
--- a/drivers/s390/net/netiucv.c
+++ b/drivers/s390/net/netiucv.c
@@ -249,14 +249,14 @@ struct ll_header {
  * Compatibility macros for busy handling
  * of network devices.
  */
-static inline void netiucv_clear_busy(struct net_device *dev)
+static void netiucv_clear_busy(struct net_device *dev)
 {
struct netiucv_priv *priv = netdev_priv(dev);
clear_bit(0, &priv->tbusy);
netif_wake_q

[PATCH net-next 09/12] s390/qeth: extract bridgeport cmd builder

2017-08-15 Thread Julian Wiedmann
Consolidation of duplicated code, no functional change.

Signed-off-by: Julian Wiedmann 
---
 drivers/s390/net/qeth_l2_main.c | 62 +
 1 file changed, 26 insertions(+), 36 deletions(-)

diff --git a/drivers/s390/net/qeth_l2_main.c b/drivers/s390/net/qeth_l2_main.c
index 368fb85d8851..438a7f29e99f 100644
--- a/drivers/s390/net/qeth_l2_main.c
+++ b/drivers/s390/net/qeth_l2_main.c
@@ -1724,11 +1724,26 @@ static int qeth_bridgeport_makerc(struct qeth_card *card,
return rc;
 }
 
-static inline int ipa_cmd_sbp(struct qeth_card *card)
+static struct qeth_cmd_buffer *qeth_sbp_build_cmd(struct qeth_card *card,
+ enum qeth_ipa_sbp_cmd sbp_cmd,
+ unsigned int cmd_length)
 {
-   return (card->info.type == QETH_CARD_TYPE_IQD) ?
-   IPA_CMD_SETBRIDGEPORT_IQD :
-   IPA_CMD_SETBRIDGEPORT_OSA;
+   enum qeth_ipa_cmds ipa_cmd = (card->info.type == QETH_CARD_TYPE_IQD) ?
+   IPA_CMD_SETBRIDGEPORT_IQD :
+   IPA_CMD_SETBRIDGEPORT_OSA;
+   struct qeth_cmd_buffer *iob;
+   struct qeth_ipa_cmd *cmd;
+
+   iob = qeth_get_ipacmd_buffer(card, ipa_cmd, 0);
+   if (!iob)
+   return iob;
+   cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
+   cmd->data.sbp.hdr.cmdlength = sizeof(struct qeth_ipacmd_sbp_hdr) +
+ cmd_length;
+   cmd->data.sbp.hdr.command_code = sbp_cmd;
+   cmd->data.sbp.hdr.used_total = 1;
+   cmd->data.sbp.hdr.seq_no = 1;
+   return iob;
 }
 
 static int qeth_bridgeport_query_support_cb(struct qeth_card *card,
@@ -1758,21 +1773,13 @@ static int qeth_bridgeport_query_support_cb(struct qeth_card *card,
 static void qeth_bridgeport_query_support(struct qeth_card *card)
 {
struct qeth_cmd_buffer *iob;
-   struct qeth_ipa_cmd *cmd;
struct _qeth_sbp_cbctl cbctl;
 
QETH_CARD_TEXT(card, 2, "brqsuppo");
-   iob = qeth_get_ipacmd_buffer(card, ipa_cmd_sbp(card), 0);
+   iob = qeth_sbp_build_cmd(card, IPA_SBP_QUERY_COMMANDS_SUPPORTED,
+sizeof(struct qeth_sbp_query_cmds_supp));
if (!iob)
return;
-   cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
-   cmd->data.sbp.hdr.cmdlength =
-   sizeof(struct qeth_ipacmd_sbp_hdr) +
-   sizeof(struct qeth_sbp_query_cmds_supp);
-   cmd->data.sbp.hdr.command_code =
-   IPA_SBP_QUERY_COMMANDS_SUPPORTED;
-   cmd->data.sbp.hdr.used_total = 1;
-   cmd->data.sbp.hdr.seq_no = 1;
if (qeth_send_ipa_cmd(card, iob, qeth_bridgeport_query_support_cb,
(void *)&cbctl) ||
qeth_bridgeport_makerc(card, &cbctl,
@@ -1826,7 +1833,6 @@ int qeth_bridgeport_query_ports(struct qeth_card *card,
 {
int rc = 0;
struct qeth_cmd_buffer *iob;
-   struct qeth_ipa_cmd *cmd;
struct _qeth_sbp_cbctl cbctl = {
.data = {
.qports = {
@@ -1839,16 +1845,9 @@ int qeth_bridgeport_query_ports(struct qeth_card *card,
QETH_CARD_TEXT(card, 2, "brqports");
if (!(card->options.sbp.supported_funcs & IPA_SBP_QUERY_BRIDGE_PORTS))
return -EOPNOTSUPP;
-   iob = qeth_get_ipacmd_buffer(card, ipa_cmd_sbp(card), 0);
+   iob = qeth_sbp_build_cmd(card, IPA_SBP_QUERY_BRIDGE_PORTS, 0);
if (!iob)
return -ENOMEM;
-   cmd = (struct qeth_ipa_cmd *)(iob->data+IPA_PDU_HEADER_SIZE);
-   cmd->data.sbp.hdr.cmdlength =
-   sizeof(struct qeth_ipacmd_sbp_hdr);
-   cmd->data.sbp.hdr.command_code =
-   IPA_SBP_QUERY_BRIDGE_PORTS;
-   cmd->data.sbp.hdr.used_total = 1;
-   cmd->data.sbp.hdr.seq_no = 1;
rc = qeth_send_ipa_cmd(card, iob, qeth_bridgeport_query_ports_cb,
(void *)&cbctl);
if (rc < 0)
@@ -1880,7 +1879,6 @@ int qeth_bridgeport_setrole(struct qeth_card *card, enum qeth_sbp_roles role)
int rc = 0;
int cmdlength;
struct qeth_cmd_buffer *iob;
-   struct qeth_ipa_cmd *cmd;
struct _qeth_sbp_cbctl cbctl;
enum qeth_ipa_sbp_cmd setcmd;
 
@@ -1888,32 +1886,24 @@ int qeth_bridgeport_setrole(struct qeth_card *card, enum qeth_sbp_roles role)
switch (role) {
case QETH_SBP_ROLE_NONE:
setcmd = IPA_SBP_RESET_BRIDGE_PORT_ROLE;
-   cmdlength =  sizeof(struct qeth_ipacmd_sbp_hdr) +
-   sizeof(struct qeth_sbp_reset_role);
+   cmdlength = sizeof(struct qeth_sbp_reset_role);
break;
case QETH_SBP_ROLE_PRIMARY:
setcmd = IPA_SBP_SET_PRIMARY_BRIDGE_PORT;
-   cmdlength =  sizeof(struct qeth_ipacmd_sbp_hdr) +
-   

  1   2   3   >