date:20160624

Re: [PATCH iproute2 net-next v4 0/5] bridge: json support for fdb and vlan show

2016-06-24 Thread Rami Rosen

Hi all,

>Also, for external automation and orchestration tools (to whom this
>patch-set is addressed),
>there is no reason for them to write and maintain their own tools
>using netlink when they
>can use iproute2 directly to create a link or query its properties.

+1 for this, seems very reasonable,

Just want to remind us all: at the recent netdev 1.1 conference in Seville
(Feb 2016), Damascene Joachimpillai (DJ) from Verizon gave a talk
("Linux Networking and Data Center Operations Challenges"); IIRC, he
mentioned that he misses a REST API to the Linux kernel networking
stack control plane. And it seems that a REST API and JSON (and maybe
other APIs, like Python-based APIs) are natural candidates for such an
interface nowadays.


Regards,
Rami Rosen
http://ramirose.wix.com/ramirosen

Re: tcp md5: one more crypto-sg-on-the-stack instance

2016-06-24 Thread Eric Dumazet

On Sat, 2016-06-25 at 06:26 +0200, Eric Dumazet wrote:
> On Sat, 2016-06-25 at 06:11 +0200, Eric Dumazet wrote:
> 
> > Simply extend tcp_md5sig_pool to contain a copy of the TCP headers ?
> > 
> > At most 40 bytes of extra per cpu storage is not a big problem.
> > 
> 
> Correction : This is exactly 20 bytes for tcphdr, not 40.

Something like :

 include/net/tcp.h   |2 +-
 net/ipv4/tcp.c  |   17 -
 net/ipv4/tcp_ipv4.c |   31 +--
 net/ipv6/tcp_ipv6.c |   25 ++---
 4 files changed, 28 insertions(+), 47 deletions(-)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 
a79894b667265cdf9e3fe793b4757e2f932b378a..2dd919e0289839130d2c5435b7925592082d62b5
 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1385,6 +1385,7 @@ union tcp_md5sum_block {
 struct tcp_md5sig_pool {
struct ahash_request*md5_req;
union tcp_md5sum_block  md5_blk;
+   struct tcphdr   tcp_hdr;
 };
 
 /* - functions */
@@ -1420,7 +1421,6 @@ static inline void tcp_put_md5sig_pool(void)
local_bh_enable();
 }
 
-int tcp_md5_hash_header(struct tcp_md5sig_pool *, const struct tcphdr *);
 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *, const struct sk_buff *,
  unsigned int header_len);
 int tcp_md5_hash_key(struct tcp_md5sig_pool *hp,
diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
index 
5c7ed147449c1b7ba029b12e033ad779a631460a..5fc9336934a11387e725300d6bca4aabd3991f19
 100644
--- a/net/ipv4/tcp.c
+++ b/net/ipv4/tcp.c
@@ -3025,23 +3025,6 @@ struct tcp_md5sig_pool *tcp_get_md5sig_pool(void)
 }
 EXPORT_SYMBOL(tcp_get_md5sig_pool);
 
-int tcp_md5_hash_header(struct tcp_md5sig_pool *hp,
-   const struct tcphdr *th)
-{
-   struct scatterlist sg;
-   struct tcphdr hdr;
-
-   /* We are not allowed to change tcphdr, make a local copy */
-   memcpy(&hdr, th, sizeof(hdr));
-   hdr.check = 0;
-
-   /* options aren't included in the hash */
-   sg_init_one(&sg, &hdr, sizeof(hdr));
-   ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(hdr));
-   return crypto_ahash_update(hp->md5_req);
-}
-EXPORT_SYMBOL(tcp_md5_hash_header);
-
 int tcp_md5_hash_skb_data(struct tcp_md5sig_pool *hp,
  const struct sk_buff *skb, unsigned int header_len)
 {
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 
3708de2a66833cf1d4a221a2b6ce3923bde978c4..c3c5a4cc53aac147b82e85a5d8d7001832594c6a
 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1018,27 +1018,26 @@ static int tcp_v4_parse_md5_keys(struct sock *sk, char 
__user *optval,
  GFP_KERNEL);
 }
 
-static int tcp_v4_md5_hash_pseudoheader(struct tcp_md5sig_pool *hp,
-   __be32 daddr, __be32 saddr, int nbytes)
+static int tcp_v4_md5_hash_headers(struct tcp_md5sig_pool *hp,
+  __be32 daddr, __be32 saddr,
+  const struct tcphdr *th, int nbytes)
 {
-   struct tcp4_pseudohdr *bp;
+   struct tcp4_pseudohdr *bp = &hp->md5_blk.ip4;
struct scatterlist sg;
+   struct tcphdr *_th;
 
-   bp = &hp->md5_blk.ip4;
-
-   /*
-* 1. the TCP pseudo-header (in the order: source IP address,
-* destination IP address, zero-padded protocol number, and
-* segment length)
-*/
bp->saddr = saddr;
bp->daddr = daddr;
bp->pad = 0;
bp->protocol = IPPROTO_TCP;
bp->len = cpu_to_be16(nbytes);
 
-   sg_init_one(&sg, bp, sizeof(*bp));
-   ahash_request_set_crypt(hp->md5_req, &sg, NULL, sizeof(*bp));
+   _th = (struct tcphdr *)(bp + 1);
+   memcpy(_th, th, sizeof(*th));
+   _th->check = 0;
+   sg_init_one(&sg, bp, sizeof(*bp) + sizeof(*th));
+   ahash_request_set_crypt(hp->md5_req, &sg, NULL,
+   sizeof(*bp) + sizeof(*th));
return crypto_ahash_update(hp->md5_req);
 }
 
@@ -1055,9 +1054,7 @@ static int tcp_v4_md5_hash_hdr(char *md5_hash, const 
struct tcp_md5sig_key *key,
 
if (crypto_ahash_init(req))
goto clear_hash;
-   if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, th->doff << 2))
-   goto clear_hash;
-   if (tcp_md5_hash_header(hp, th))
+   if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, th->doff << 2))
goto clear_hash;
if (tcp_md5_hash_key(hp, key))
goto clear_hash;
@@ -1101,9 +1098,7 @@ int tcp_v4_md5_hash_skb(char *md5_hash, const struct 
tcp_md5sig_key *key,
if (crypto_ahash_init(req))
goto clear_hash;
 
-   if (tcp_v4_md5_hash_pseudoheader(hp, daddr, saddr, skb->len))
-   goto clear_hash;
-   if (tcp_md5_hash_header(hp, th))
+   if (tcp_v4_md5_hash_headers(hp, daddr, saddr, th, skb->len))
goto clear_hash;
if (tcp_md5_hash_skb_data(hp, skb, th->doff << 2))

Re: tcp md5: one more crypto-sg-on-the-stack instance

2016-06-24 Thread Eric Dumazet

On Sat, 2016-06-25 at 06:11 +0200, Eric Dumazet wrote:

> Simply extend tcp_md5sig_pool to contain a copy of the TCP headers ?
> 
> At most 40 bytes of extra per cpu storage is not a big problem.
> 

Correction : This is exactly 20 bytes for tcphdr, not 40.

Re: tcp md5: one more crypto-sg-on-the-stack instance

2016-06-24 Thread Eric Dumazet

On Fri, 2016-06-24 at 18:51 -0700, Andy Lutomirski wrote:
> Hi all-
> 
> tcp_md5_hash_header does crypto using an sg that points to the stack.
> This will break with virtually mapped stacks.  It also looks like it's
> probably much slower than it deserves to be (it's trying to compute
> the MD5 hash of a few tens of bytes -- going through a scatterlist is
> a lot of overhead for an otherwise very fast operation).

I guess nobody cares about TCP MD5 speed really.

> 
> I don't suppose one of you could fix it or at least advise as to how
> it should be fixed.

Simply extend tcp_md5sig_pool to contain a copy of the TCP headers ?

At most 40 bytes of extra per cpu storage is not a big problem.

[PATCH] notifier: Fix soft lockup for notifier_call_chain().

2016-06-24 Thread Ding Tianhong

The problem occurs on my system when a lot of drivers register
their own handlers on the netdev_chain notifier call chain, and
I then create 4095 vlan devices on one NIC and add several IPv6
addresses to each one of them, like this:

for i in `seq 1 4095`; do ip link add link eth0 name eth0.$i type vlan id $i; 
done
for i in `seq 1 4095`; do ip -6 addr add 2001::$i dev eth0.$i; done
for i in `seq 1 4095`; do ip -6 addr add 2002::$i dev eth0.$i; done
for i in `seq 1 4095`; do ip -6 addr add 2003::$i dev eth0.$i; done

ifconfig eth0 up
ifconfig eth0 down

The system then stalls for several seconds and a soft lockup occurs:

<0>[ 7620.364058]NMI watchdog: BUG: soft lockup - CPU#0 stuck for 23s! 
[ifconfig:19186]
<0>[ 7620.364592]Call trace:
<4>[ 7620.364599][] dump_backtrace+0x0/0x220
<4>[ 7620.364603][] show_stack+0x20/0x28
<4>[ 7620.364607][] dump_stack+0x90/0xb0
<4>[ 7620.364612][] watchdog_timer_fn+0x41c/0x460
<4>[ 7620.364617][] __run_hrtimer+0x98/0x2d8
<4>[ 7620.364620][] hrtimer_interrupt+0x110/0x288
<4>[ 7620.364624][] arch_timer_handler_phys+0x38/0x48
<4>[ 7620.364628][] handle_percpu_devid_irq+0x9c/0x190
<4>[ 7620.364632][] generic_handle_irq+0x40/0x58
<4>[ 7620.364635][] __handle_domain_irq+0x68/0xc0
<4>[ 7620.364638][] gic_handle_irq+0xc4/0x1c8
<4>[ 7620.364641]Exception stack(0xffc0309b3640 to 0xffc0309b3770)
<4>[ 7620.364644]3640: 1000  ffc0309b37c0 
ffbfa1019cf8
<4>[ 7620.364647]3660: 8145 ffc0309b3958  
ffbfa1013008
<4>[ 7620.364651]3680: 07f0 ffbfa131b770 ffd08aaadc40 
ffbfa1019cf8
<4>[ 7620.364654]36a0: ffbfa1019cc4 ffd089c2b000 ffd08eff8000 
ffc0309b3958
<4>[ 7620.364656]36c0: ffbfa101c5c0   
ffbfa101c66c
<4>[ 7620.364659]36e0: 7f7f7f7f7f7f7f7f 0030  

<4>[ 7620.364662]3700:   ffc000393d58 
007f794d67b0
<4>[ 7620.364665]3720: 007fe62215d0 ffc0309b3830 ffc00021d8e0 
ffbfa1049b68
<4>[ 7620.364668]3740: ffc000697578 ffc0006974b8 ffc0309b3958 

<4>[ 7620.364670]3760: ffbfa1013008 07f0
<4>[ 7620.364673][] el1_irq+0x80/0x100
<4>[ 7620.364692][] fib6_walk+0x3c/0x70 [ipv6]
<4>[ 7620.364710][] fib6_clean_tree+0x68/0x90 [ipv6]
<4>[ 7620.364727][] __fib6_clean_all+0x88/0xc0 [ipv6]
<4>[ 7620.364746][] fib6_clean_all+0x28/0x30 [ipv6]
<4>[ 7620.364763][] rt6_ifdown+0x64/0x148 [ipv6]
<4>[ 7620.364781][] addrconf_ifdown+0x68/0x540 [ipv6]
<4>[ 7620.364798][] addrconf_notify+0xd0/0x8b8 [ipv6]
<4>[ 7620.364801][] notifier_call_chain+0x5c/0xa0
<4>[ 7620.364804][] raw_notifier_call_chain+0x20/0x28
<4>[ 7620.364809][] call_netdevice_notifiers_info+0x4c/0x80
<4>[ 7620.364812][] dev_close_many+0xd0/0x138
<4>[ 7620.364821][] vlan_device_event+0x4a8/0x6a0 [8021q]
<4>[ 7620.364824][] notifier_call_chain+0x5c/0xa0
<4>[ 7620.364827][] raw_notifier_call_chain+0x20/0x28
<4>[ 7620.364830][] call_netdevice_notifiers_info+0x4c/0x80
<4>[ 7620.364833][] __dev_notify_flags+0xb8/0xe0
<4>[ 7620.364836][] dev_change_flags+0x54/0x68
<4>[ 7620.364840][] devinet_ioctl+0x650/0x700
<4>[ 7620.364843][] inet_ioctl+0xa4/0xc8
<4>[ 7620.364847][] sock_do_ioctl+0x44/0x88
<4>[ 7620.364850][] sock_ioctl+0x23c/0x308
<4>[ 7620.364854][] do_vfs_ioctl+0x48c/0x620
<4>[ 7620.364857][] SyS_ioctl+0x94/0xa8

=cut 
here

It looks like notifier_call_chain() has to deal with too many handlers and
will not feed the watchdog until it finishes the work, so add cond_resched()
in the loop to fix this problem; with this change it no longer panics.

Signed-off-by: Ding Tianhong 
---
 kernel/notifier.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/notifier.c b/kernel/notifier.c
index fd2c9ac..9c30411 100644
--- a/kernel/notifier.c
+++ b/kernel/notifier.c
@@ -92,6 +92,8 @@ static int notifier_call_chain(struct notifier_block **nl,
 #endif
ret = nb->notifier_call(nb, val, v);
 
+   cond_resched();
+
if (nr_calls)
(*nr_calls)++;
 
-- 
1.9.0

[RFC] tun: Support VIRTIO_NET_HDR_F_DATA_VALID in tun_get_user

2016-06-24 Thread Subash Abhinov Kasiviswanathan

Userspace applications might sometimes process packets from hardware
which has already validated checksum, perform trivial operations and
then queue them back to the network stack. By not recomputing the
checksum here, we can see significant improvement in performance.

Sample application here is CLAT which does IPv6 to IPv4 translation.
IPv6 packets for which checksum is validated in hardware are captured
in CLAT and then translated to IPv4 and then queued back to network
stack. In this case, it is expected that the application would not
corrupt the packet and recomputing the checksum would be redundant.

Pass the hint to kernel to skip checksum validation if
VIRTIO_NET_HDR_F_DATA_VALID is set from userspace.

Signed-off-by: Subash Abhinov Kasiviswanathan 
---
 drivers/net/tun.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/tun.c b/drivers/net/tun.c
index e16487c..a5828a5 100644
--- a/drivers/net/tun.c
+++ b/drivers/net/tun.c
@@ -1263,6 +1263,9 @@ static ssize_t tun_get_user(struct tun_struct *tun, 
struct tun_file *tfile,
}
}
 
+   if (gso.flags & VIRTIO_NET_HDR_F_DATA_VALID)
+   skb->ip_summed = CHECKSUM_UNNECESSARY;
+
switch (tun->flags & TUN_TYPE_MASK) {
case IFF_TUN:
if (tun->flags & IFF_NO_PI) {
-- 
1.9.1

tcp md5: one more crypto-sg-on-the-stack instance

2016-06-24 Thread Andy Lutomirski

Hi all-

tcp_md5_hash_header does crypto using an sg that points to the stack.
This will break with virtually mapped stacks.  It also looks like it's
probably much slower than it deserves to be (it's trying to compute
the MD5 hash of a few tens of bytes -- going through a scatterlist is
a lot of overhead for an otherwise very fast operation).

I don't suppose one of you could fix it or at least advise as to how
it should be fixed.

Thanks,
Andy

DMA from stack in virtio_net and virtio_console

2016-06-24 Thread Andy Lutomirski

virtio_net does DMA on the stack when it calls sg_init_one in
virtio_set_queues, virtnet_vlan_rx_add_vid, and
virtnet_vlan_rx_kill_vid.  Michael, I think these are examples we
missed somehow when fixing these issues earlier on.

virtio_console does it here:

sg_init_one(sg, &cpkt, sizeof(cpkt));

This will cause problems on some architectures (Xen at the very least,
and it'll cause more subtle problems on other architectures if they
start using the DMA API), and it will blow up horribly with virtually
mapped stacks.

Could you fix these, please?

Thanks,
Andy

[perf core] c5dfd78eb7: BUG: unable to handle kernel NULL pointer dereference at 00000c40

2016-06-24 Thread kernel test robot

extra tests on HEAD of linux-devel/devel-hourly-2016062414
git bisect  bad e8d665056895dafedd7882bfe250ff6cf7dfbc0d  # 08:10  0- 
53  0day head guard for 'devel-hourly-2016062414'
# extra tests on tree/branch linus/master
git bisect  bad 63c04ee7d3b7c8d8e2726cb7c5f8a5f6fcc1e3b2  # 08:22  0-  
3  Merge tag 'upstream-4.7-rc5' of git://git.infradead.org/linux-ubifs
# extra tests on tree/branch linus/master
git bisect  bad 63c04ee7d3b7c8d8e2726cb7c5f8a5f6fcc1e3b2  # 08:23  0-  
5  Merge tag 'upstream-4.7-rc5' of git://git.infradead.org/linux-ubifs
# extra tests on tree/branch linux-next/master
git bisect  bad 2cf991dfda8b36d2878c249bcdf492366ec24c19  # 08:29 14-  
1  Add linux-next specific files for 20160624


This script may reproduce the error.


#!/bin/bash

kernel=$1
initrd=quantal-core-i386.cgz

wget --no-clobber 
https://github.com/fengguang/reproduce-kernel-bug/raw/master/initrd/$initrd

kvm=(
qemu-system-x86_64
-enable-kvm
-cpu kvm64
-kernel $kernel
-initrd $initrd
-m 300
-smp 2
-device e1000,netdev=net0
-netdev user,id=net0
-boot order=nc
-no-reboot
-watchdog i6300esb
-rtc base=localtime
-serial stdio
-display none
-monitor null 
)

append=(
hung_task_panic=1
earlyprintk=ttyS0,115200
systemd.log_level=err
debug
apic=debug
sysrq_always_enabled
rcupdate.rcu_cpu_stall_timeout=100
panic=-1
softlockup_panic=1
nmi_watchdog=panic
oops=panic
load_ramdisk=2
prompt_ramdisk=0
console=ttyS0,115200
console=tty0
vga=normal
root=/dev/ram0
rw
drbd.minor_count=8
)

"${kvm[@]}" --append "${append[*]}"


---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/lkp  Intel Corporation


dmesg-quantal-kbuild-53:20160625075710:i386-randconfig-h0-06242012:4.6.0-rc4-00181-gc5dfd78:1.gz
Description: application/gzip
#
# Automatically generated file; DO NOT EDIT.
# Linux/i386 4.6.0-rc4 Kernel Configuration
#
# CONFIG_64BIT is not set
CONFIG_X86_32=y
CONFIG_X86=y
CONFIG_INSTRUCTION_DECODER=y
CONFIG_OUTPUT_FORMAT="elf32-i386"
CONFIG_ARCH_DEFCONFIG="arch/x86/configs/i386_defconfig"
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_MMU=y
CONFIG_ARCH_MMAP_RND_BITS_MIN=8
CONFIG_ARCH_MMAP_RND_BITS_MAX=16
CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MIN=8
CONFIG_ARCH_MMAP_RND_COMPAT_BITS_MAX=16
CONFIG_NEED_SG_DMA_LENGTH=y
CONFIG_GENERIC_ISA_DMA=y
CONFIG_GENERIC_BUG=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_GENERIC_CALIBRATE_DELAY=y
CONFIG_ARCH_HAS_CPU_RELAX=y
CONFIG_ARCH_HAS_CACHE_LINE_SIZE=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
CONFIG_NEED_PER_CPU_PAGE_FIRST_CHUNK=y
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
CONFIG_ARCH_WANT_HUGE_PMD_SHARE=y
CONFIG_ARCH_WANT_GENERAL_HUGETLB=y
CONFIG_ARCH_SUPPORTS_OPTIMIZED_INLINING=y
CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_X86_32_SMP=y
CONFIG_X86_32_LAZY_GS=y
CONFIG_ARCH_HWEIGHT_CFLAGS="-fcall-saved-ecx -fcall-saved-edx"
CONFIG_ARCH_SUPPORTS_UPROBES=y
CONFIG_FIX_EARLYCON_MEM=y
CONFIG_DEBUG_RODATA=y
CONFIG_PGTABLE_LEVELS=2
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
CONFIG_CONSTRUCTORS=y
CONFIG_IRQ_WORK=y
CONFIG_BUILDTIME_EXTABLE_SORT=y

#
# General setup
#
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_CROSS_COMPILE=""
# CONFIG_COMPILE_TEST is not set
CONFIG_LOCALVERSION=""
CONFIG_LOCALVERSION_AUTO=y
CONFIG_HAVE_KERNEL_GZIP=y
CONFIG_HAVE_KERNEL_BZIP2=y
CONFIG_HAVE_KERNEL_LZMA=y
CONFIG_HAVE_KERNEL_XZ=y
CONFIG_HAVE_KERNEL_LZO=y
CONFIG_HAVE_KERNEL_LZ4=y
# CONFIG_KERNEL_GZIP is not set
CONFIG_KERNEL_BZIP2=y
# CONFIG_KERNEL_LZMA is not set
# CONFIG_KERNEL_XZ is not set
# CONFIG_KERNEL_LZO is not set
# CONFIG_KERNEL_LZ4 is not set
CONFIG_DEFAULT_HOSTNAME="(none)"
# CONFIG_SYSVIPC is not set
# CONFIG_POSIX_MQUEUE is not set
# CONFIG_CROSS_MEMORY_ATTACH is not set
CONFIG_FHANDLE=y
# CONFIG_USELIB is not set
# CONFIG_AUDIT is not set
CONFIG_HAVE_ARCH_AUDITSYSCALL=y

#
# IRQ subsystem
#
CONFIG_GENERIC_IRQ_PROBE=y
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_PENDING_IRQ=y
CONFIG_IRQ_DOMAIN=y
CONFIG_IRQ_DOMAIN_HIERARCHY=y
# CONFIG_IRQ_DOMAIN_DEBUG is not set
CONFIG_IRQ_FORCED_THREADING=y
CONFIG_SPARSE_IRQ=y
CONFIG_CLOCKSOURCE_WATCHDOG=y
CONFIG_ARCH_CLOCKSOURCE_DATA=y
CONFIG_CLOCKSOURCE_VALIDATE_LAST_CYCLE=y
CONFIG_GENERIC_TIME_VSYSCALL=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
CONFIG_GENERIC_CLOCKEVENTS_MIN_ADJUST=y
CO

Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-24 Thread Rick Jones


On 06/24/2016 04:43 PM, Tom Herbert wrote:

Here's Christoph's slides on TFO in the wild which presents a good
summary of the middlebox problem. There is one significant difference
in that ECN needs network support whereas TFO didn't. Given that
experience, I'm doubtful other new features at L4 could ever be
productively used (like EDO or maybe TCP-ENO).

https://www.ietf.org/proceedings/94/slides/slides-94-tcpm-13.pdf


Perhaps I am being overly optimistic, but my takeaway from those slides 
is Apple were able to come-up with ways to deal with the middleboxes and 
so could indeed productively use TCP FastOpen.


"Overall, very good success-rate"
though tempered by
"But... middleboxes were a big issue in some ISPs..."

Though it doesn't get into how big (some connections, many, most, all?) 
and how many ISPs.


rick jones

Just an anecdote...  Not that I am a "power user" of my iPhone running 
9.3.2 (13F69) nor that I know that anything I am using is the Apple 
Service stated as using TFO (mostly Safari, Mail and Messages) but if it 
is, I cannot say that any troubles under the covers have been noticed by me.

[PATCH] [v6] net: emac: emac gigabit ethernet controller driver

2016-06-24 Thread Timur Tabi

Add support for the Ethernet controller HW on Qualcomm Technologies, Inc. SoCs.
This driver supports the following features:
1) Checksum offload.
2) Interrupt coalescing support.
3) SGMII phy.
4) phylib interface for external phy

Based on original work by
Niranjana Vishwanathapura 
Gilad Avidov 

Signed-off-by: Timur Tabi 
---

v6:
 - Properly ordered local variables
 - use built-in GEN_MASK instead of BITS_MASK
 - remove redundant call to emac_rx_mode_set from emac_mac_up
 - removed emac_rfd structure, use dma_addr_t directly instead
 - removed emac_mac_speed enum, replaced with macros
 - removed superfluous phy_stop from emac_mac_down(), which prevented reloading 
module
 - add missing netif_napi_del
 - set the DMA mask

v5:
 - changed author to Timur, added MAINTAINERS entry
 - use phylib, replacing internal phy code
 - added support for EMAC internal SGMII v2
 - fix ~DIS_INT warning
 - update DT bindings, including removing unused properties
 - removed interrupt handler for internal sgmii
 - removed link status check handler/state (replaced with phylib)
 - removed periodic timer handler (replaced with phylib)
 - removed power management code (will be rewritten later)
 - external phy is now required, not optional
 - removed redundant EMAC_STATUS_DOWN status flag
 - removed redundant link status and speed variables
 - removed redundant status bits (vlan strip, promiscuous, loopback, etc)
 - removed useless watchdog status
 - removed command-line parameters
 - cleaned up probe messages
 - removed redundant params from emac_sgmii_link_init()
 - always call netdev_completed_queue() (per review comment)
 - fix emac_napi_rtx() (per review comment)
 - removed max_ints loop in interrupt handler
 - removed redundant mutex around phy read/write calls
 - added lock for reading emac status (per review comment)
 - generate random MAC address if it can't be read from firmware
 - replace EMAC_DMA_ADDR_HI/LO with upper/lower_32_bits
 - don't test return value from platform_get_resource (per review comment)
 - use net_warn_ratelimited (per review comment)
 - don't set the dma masks (will be set by DT or IORT code)
 - remove unused emac_tx_tpd_ts_save()
 - removed redundant local MTU variable

v4:
 - add missing ipv6 header file
 - correct compatible string
 - fix spacing in emac_reg_write arrays
 - drop unnecessary cell-index property
 - remove unsupported DT properties from docs
 - remove GPIO initialization and update docs

v3:
 - remove most of the memory barriers by using the non xxx_relaxed() api.
 - remove RSS and WOL support.
 - correct comments from physical address to dma address.
 - rearrange structs to make them packed.
 - replace polling loops with readl_poll_timeout().
 - remove unnecessary wrapper functions from phy layer.
 - add blank line before return statements.
 - set to null clocks after clk_put().
 - use module_platform_driver() and dma_set_mask_and_coherent()
 - replace long hex bitmasks with BIT() macro.

v2:
 - replace hw bit fields to macros with bitwise operations.
 - change all iterators to unsized types (int)
 - some minor code flow improvements.
 - change return type to void for functions which return value is never
   used.
 - replace instance of l_relaxed() io followed by mb() with a
   readl()/writel().


 .../devicetree/bindings/net/qcom-emac.txt  |   63 +
 MAINTAINERS|6 +
 drivers/net/ethernet/qualcomm/Kconfig  |   11 +
 drivers/net/ethernet/qualcomm/Makefile |2 +
 drivers/net/ethernet/qualcomm/emac/Makefile|7 +
 drivers/net/ethernet/qualcomm/emac/emac-mac.c  | 1661 
 drivers/net/ethernet/qualcomm/emac/emac-mac.h  |  271 
 drivers/net/ethernet/qualcomm/emac/emac-phy.c  |  211 +++
 drivers/net/ethernet/qualcomm/emac/emac-phy.h  |   32 +
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c|  700 +
 drivers/net/ethernet/qualcomm/emac/emac-sgmii.h|   24 +
 drivers/net/ethernet/qualcomm/emac/emac.c  |  809 ++
 drivers/net/ethernet/qualcomm/emac/emac.h  |  369 +
 13 files changed, 4166 insertions(+)
 create mode 100644 Documentation/devicetree/bindings/net/qcom-emac.txt
 create mode 100644 drivers/net/ethernet/qualcomm/emac/Makefile
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-mac.c
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-mac.h
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-phy.c
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-phy.h
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-sgmii.c
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac-sgmii.h
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac.c
 create mode 100644 drivers/net/ethernet/qualcomm/emac/emac.h

diff --git a/Documentation/devicetree/bindings/net/qcom-emac.txt 
b/Documentation/devicetree/bindings/net/qcom-emac.txt
new file mode 100644
index 000..4e1a53

Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-24 Thread Tom Herbert

On Fri, Jun 24, 2016 at 3:06 PM, Rick Jones  wrote:
> On 06/24/2016 02:46 PM, Tom Herbert wrote:
>>
>> On Fri, Jun 24, 2016 at 2:36 PM, Rick Jones  wrote:
>>>
>>> How would you define "severely?"  Has it actually been more severe than
>>> for
>>> say ECN?  Or it was for say SACK or PAWS?
>>>
>> ECN is probably even a bigger disappointment in terms of seeing
>> deployment :-( From http://ecn.ethz.ch/ecn-pam15.pdf:
>>
>> "Even though ECN was standardized in 2001, and it is widely
>> implemented in end-systems, it is barely deployed. This is due to a
>> history of problems with severely broken middleboxes shortly after
>> standardization, which led to connectivity failure and guidance to
>> leave ECN disabled."
>>
>> SACK and PAWS seemed to have fared a little better I believe.
>
>
> The conclusion of that (rather interesting) paper reads:
>
> "Our analysis therefore indicates that enabling ECN by default would
> lead to connections to about five websites per thousand to suffer
> additional setup latency with RFC 3168 fallback. This represents an
> order of magnitude fewer than the about forty per thousand which
> experience transient or permanent connection failure due to other
> operational issues"
>
> Doesn't that then suggest that not enabling ECN is basically a matter of FUD
> more than remaining assumed broken middleboxes?
>
> My main point is that in the past at least, trouble with broken middleboxes
> didn't lead us to start wrapping all our TCP/transport traffic in UDP to try
> to hide it from them.  We've managed to get SACK and PAWS universal without
> having to resort to that, and it would seem we could get ECN universal if we
> could overcome our FUD.  Why would TFO for instance be any different?
>
Here's Christoph's slides on TFO in the wild which presents a good
summary of the middlebox problem. There is one significant difference
in that ECN needs network support whereas TFO didn't. Given that
experience, I'm doubtful other new features at L4 could ever be
productively used (like EDO or maybe TCP-ENO).

https://www.ietf.org/proceedings/94/slides/slides-94-tcpm-13.pdf

Tom

> There was an equally interesting second paragraph in the conclusion:
>
> "As not all websites are equally popular, failures on five per thousand
> websites does not by any means imply that five per thousand connection
> attempts will fail. While estimation of connection attempt rate by rank is
> out of scope of this work, we note that the highest ranked website
> exhibiting stable connection failure has rank 596, and only 13 such sites
> appear in the top 5000"
>
> rick jones

[PATCH] net: phy: Manage fixed PHY address space using IDA

2016-06-24 Thread Florian Fainelli

If we have a system which uses fixed PHY devices and calls
fixed_phy_register() then fixed_phy_unregister() we can exhaust the
number of fixed PHYs available after a while, since we keep incrementing
the variable phy_fixed_addr, but we never decrement it.

This patch fixes that by converting the fixed PHY allocation to using
IDA, which takes care of the allocation/deallocation of the PHY
addresses for us.

Fixes: a75951217472 ("net: phy: extend fixed driver with fixed_phy_register()")
Signed-off-by: Florian Fainelli 
---
 drivers/net/phy/fixed_phy.c | 22 +++---
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 2d2e4339f0df..9ec7f7353434 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include <linux/idr.h>
 
 #define MII_REGS_NUM 29
 
@@ -286,6 +287,8 @@ err_regs:
 }
 EXPORT_SYMBOL_GPL(fixed_phy_add);
 
+static DEFINE_IDA(phy_fixed_ida);
+
 static void fixed_phy_del(int phy_addr)
 {
struct fixed_mdio_bus *fmb = &platform_fmb;
@@ -297,14 +300,12 @@ static void fixed_phy_del(int phy_addr)
if (gpio_is_valid(fp->link_gpio))
gpio_free(fp->link_gpio);
kfree(fp);
+   ida_simple_remove(&phy_fixed_ida, phy_addr);
return;
}
}
 }
 
-static int phy_fixed_addr;
-static DEFINE_SPINLOCK(phy_fixed_addr_lock);
-
 struct phy_device *fixed_phy_register(unsigned int irq,
  struct fixed_phy_status *status,
  int link_gpio,
@@ -319,17 +320,15 @@ struct phy_device *fixed_phy_register(unsigned int irq,
return ERR_PTR(-EPROBE_DEFER);
 
/* Get the next available PHY address, up to PHY_MAX_ADDR */
-   spin_lock(&phy_fixed_addr_lock);
-   if (phy_fixed_addr == PHY_MAX_ADDR) {
-   spin_unlock(&phy_fixed_addr_lock);
-   return ERR_PTR(-ENOSPC);
-   }
-   phy_addr = phy_fixed_addr++;
-   spin_unlock(&phy_fixed_addr_lock);
+   phy_addr = ida_simple_get(&phy_fixed_ida, 0, PHY_MAX_ADDR, GFP_KERNEL);
+   if (phy_addr < 0)
+   return ERR_PTR(phy_addr);
 
ret = fixed_phy_add(irq, phy_addr, status, link_gpio);
-   if (ret < 0)
+   if (ret < 0) {
+   ida_simple_remove(&phy_fixed_ida, phy_addr);
return ERR_PTR(ret);
+   }
 
phy = get_phy_device(fmb->mii_bus, phy_addr, false);
if (IS_ERR(phy)) {
@@ -434,6 +433,7 @@ static void __exit fixed_mdio_bus_exit(void)
list_del(&fp->node);
kfree(fp);
}
+   ida_destroy(&phy_fixed_ida);
 }
 module_exit(fixed_mdio_bus_exit);
 
-- 
2.7.4

Re: [PATCH net] net: phy: Decrement phy_fixed_addr during unregister

2016-06-24 Thread Florian Fainelli

On 06/24/2016 04:06 PM, Russell King - ARM Linux wrote:
> On Fri, Jun 24, 2016 at 03:58:39PM -0700, Florian Fainelli wrote:
>> On 06/24/2016 03:55 PM, Russell King - ARM Linux wrote:
>>> On Fri, Jun 24, 2016 at 03:44:11PM -0700, Florian Fainelli wrote:
>>>> If we have a system which uses fixed PHY devices and calls
>>>> fixed_phy_register() then fixed_phy_unregister() we can exhaust the
>>>> number of fixed PHYs available after a while, since we keep incrementing
>>>> the variable phy_fixed_addr, but we never decrement it.
>>>>
>>>> This patch fixes that by decrementing phy_fixed_addr during
>>>> fixed_phy_del(), and in order to do that, we need to move the
>>>> phy_fixed_addr integer and its spinlock above that function.
>>>
>>> Is this really a good idea?
>>
>> In the sense that it is symmetrical to the register code, probably.
>>
>>>
>>> What if we have two fixed phys register, and the first one is
>>> unregistered and a new one subsequently registered?
>>>
>>> First phy registered, gets address 0, phy_fixed_addr becomes 1.
>>> Second phy registered, gets address 1, phy_fixed_addr becomes 2.
>>> First phy is unregistered, phy_fixed_addr becomes 1.
>>> Third phy registered, gets address 1, conflicts with the second phy.
>>>
>>> Obviously not a good outcome.
>>>
>>
>> What would you suggest we do instead? Would switching to IDA/IDR give us
>> better results for instance (I have not looked too closely yet)?
> 
> I would expect an IDA to be suitable, because the IDA would track which
> indexes (==addresses) are currently in-use.

OK, thanks!

> 
> If you want to go further, using an IDR would allow fixed_mdio_read() to
> find the right fixed_phy struct without needing to loop over fmb->phys.

Since I am targeting this as a bugfix, the switch to IDA seems more
appropriate to be backported, but yes, that's a good idea though.

> Whether that's worth it or not depends if you have a large number of
> fixed phys.  I suspect we're talking about small quantities here though.
> 

Yes, at the moment we are limited to 32 PHYs maximum, just like a real
MDIO bus, which in some systems could actually be not enough, but then
you run into other problems, like the need to register more than a
single fixed MDIO bus driver to get a larger address space...
-- 
Florian

Re: [PATCH net] net: phy: Decrement phy_fixed_addr during unregister

2016-06-24 Thread Russell King - ARM Linux

On Fri, Jun 24, 2016 at 03:58:39PM -0700, Florian Fainelli wrote:
> On 06/24/2016 03:55 PM, Russell King - ARM Linux wrote:
> > On Fri, Jun 24, 2016 at 03:44:11PM -0700, Florian Fainelli wrote:
> >> If we have a system which uses fixed PHY devices and calls
> >> fixed_phy_register() then fixed_phy_unregister() we can exhaust the
> >> number of fixed PHYs available after a while, since we keep incrementing
> >> the variable phy_fixed_addr, but we never decrement it.
> >>
> >> This patch fixes that by decrementing phy_fixed_addr during
> >> fixed_phy_del(), and in order to do that, we need to move the
> >> phy_fixed_addr integer and its spinlock above that function.
> > 
> > Is this really a good idea?
> 
> In the sense that it is symmetrical to the register code, probably.
> 
> > 
> > What if we have two fixed phys register, and the first one is
> > unregistered and a new one subsequently registered?
> > 
> > First phy registered, gets address 0, phy_fixed_addr becomes 1.
> > Second phy registered, gets address 1, phy_fixed_addr becomes 2.
> > First phy is unregistered, phy_fixed_addr becomes 1.
> > Third phy registered, gets address 1, conflicts with the second phy.
> > 
> > Obviously not a good outcome.
> >
> 
> What would you suggest we do instead? Would switching to IDA/IDR give us
> better results for instance (I have not looked too closely yet)?

I would expect an IDA to be suitable, because the IDA would track which
indexes (==addresses) are currently in-use.

If you want to go further, using an IDR would allow fixed_mdio_read() to
find the right fixed_phy struct without needing to loop over fmb->phys.
Whether that's worth it or not depends if you have a large number of
fixed phys.  I suspect we're talking about small quantities here though.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

Re: [PATCH net] net: phy: Decrement phy_fixed_addr during unregister

2016-06-24 Thread Florian Fainelli

On 06/24/2016 03:55 PM, Russell King - ARM Linux wrote:
> On Fri, Jun 24, 2016 at 03:44:11PM -0700, Florian Fainelli wrote:
>> If we have a system which uses fixed PHY devices and calls
>> fixed_phy_register() then fixed_phy_unregister() we can exhaust the
>> number of fixed PHYs available after a while, since we keep incrementing
>> the variable phy_fixed_addr, but we never decrement it.
>>
>> This patch fixes that by decrementing phy_fixed_addr during
>> fixed_phy_del(), and in order to do that, we need to move the
>> phy_fixed_addr integer and its spinlock above that function.
> 
> Is this really a good idea?

In the sense that it is symmetrical to the register code, probably.

> 
> What if we have two fixed phys register, and the first one is
> unregistered and a new one subsequently registered?
> 
> First phy registered, gets address 0, phy_fixed_addr becomes 1.
> Second phy registered, gets address 1, phy_fixed_addr becomes 2.
> First phy is unregistered, phy_fixed_addr becomes 1.
> Third phy registered, gets address 1, conflicts with the second phy.
> 
> Obviously not a good outcome.
>

What would you suggest we do instead? Would switching to IDA/IDR give us
better results for instance (I have not looked too closely yet)?
-- 
Florian

[PATCH] samples/bpf: set resource limit to infinity.

2016-06-24 Thread William Tu

Signed-off-by: William Tu 
---
 samples/bpf/test_maps.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/samples/bpf/test_maps.c b/samples/bpf/test_maps.c
index 47bf085..d2bc96e 100644
--- a/samples/bpf/test_maps.c
+++ b/samples/bpf/test_maps.c
@@ -16,6 +16,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "libbpf.h"
 
 static int map_flags;
@@ -483,6 +484,9 @@ static void run_all_tests(void)
 
 int main(void)
 {
+   struct rlimit r = {RLIM_INFINITY, RLIM_INFINITY};
+
+   setrlimit(RLIMIT_MEMLOCK, &r);
map_flags = 0;
run_all_tests();
map_flags = BPF_F_NO_PREALLOC;
-- 
2.5.0

Re: [PATCH net] net: phy: Decrement phy_fixed_addr during unregister

2016-06-24 Thread Russell King - ARM Linux

On Fri, Jun 24, 2016 at 03:44:11PM -0700, Florian Fainelli wrote:
> If we have a system which uses fixed PHY devices and calls
> fixed_phy_register() then fixed_phy_unregister() we can exhaust the
> number of fixed PHYs available after a while, since we keep incrementing
> the variable phy_fixed_addr, but we never decrement it.
> 
> This patch fixes that by decrementing phy_fixed_addr during
> fixed_phy_del(), and in order to do that, we need to move the
> phy_fixed_addr integer and its spinlock above that function.

Is this really a good idea?

What if we have two fixed phys registered, and the first one is
unregistered and a new one subsequently registered?

First phy registered, gets address 0, phy_fixed_addr becomes 1.
Second phy registered, gets address 1, phy_fixed_addr becomes 2.
First phy is unregistered, phy_fixed_addr becomes 1.
Third phy registered, gets address 1, conflicts with the second phy.

Obviously not a good outcome.

-- 
RMK's Patch system: http://www.armlinux.org.uk/developer/patches/
FTTC broadband for 0.8mile line: currently at 9.6Mbps down 400kbps up
according to speedtest.net.

[PATCH net] net: phy: Decrement phy_fixed_addr during unregister

2016-06-24 Thread Florian Fainelli

If we have a system which uses fixed PHY devices and calls
fixed_phy_register() then fixed_phy_unregister() we can exhaust the
number of fixed PHYs available after a while, since we keep incrementing
the variable phy_fixed_addr, but we never decrement it.

This patch fixes that by decrementing phy_fixed_addr during
fixed_phy_del(), and in order to do that, we need to move the
phy_fixed_addr integer and its spinlock above that function.

Fixes: a75951217472 ("net: phy: extend fixed driver with fixed_phy_register()")
Signed-off-by: Florian Fainelli 
---
 drivers/net/phy/fixed_phy.c | 9 ++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/drivers/net/phy/fixed_phy.c b/drivers/net/phy/fixed_phy.c
index 2d2e4339f0df..050bc5657b9d 100644
--- a/drivers/net/phy/fixed_phy.c
+++ b/drivers/net/phy/fixed_phy.c
@@ -286,6 +286,9 @@ err_regs:
 }
 EXPORT_SYMBOL_GPL(fixed_phy_add);
 
+static int phy_fixed_addr;
+static DEFINE_SPINLOCK(phy_fixed_addr_lock);
+
 static void fixed_phy_del(int phy_addr)
 {
struct fixed_mdio_bus *fmb = &platform_fmb;
@@ -297,14 +300,14 @@ static void fixed_phy_del(int phy_addr)
if (gpio_is_valid(fp->link_gpio))
gpio_free(fp->link_gpio);
kfree(fp);
+   spin_lock(&phy_fixed_addr_lock);
+   phy_fixed_addr--;
+   spin_unlock(&phy_fixed_addr_lock);
return;
}
}
 }
 
-static int phy_fixed_addr;
-static DEFINE_SPINLOCK(phy_fixed_addr_lock);
-
 struct phy_device *fixed_phy_register(unsigned int irq,
  struct fixed_phy_status *status,
  int link_gpio,
-- 
2.7.4

Re: [PATCH net] sock_diag: invert socket destroy broadcast check

2016-06-24 Thread Willem de Bruijn

On Fri, Jun 24, 2016 at 4:41 PM, Eric W. Biederman
 wrote:
> Willem de Bruijn  writes:
>
>> From: Willem de Bruijn 
>>
>> Socket destruction is only broadcast for a socket sk if a diag
>> listener is registered and sk is not a kernel socket.
>>
>> Invert the test to not even check for listeners for kernel sockets.
>>
>> The sock_diag_has_destroy_listeners invocation dereferences
>> sock_net(sk), which for kernel sockets can be invalid as they do not
>> take a reference on the network namespace.
>
> No.  That isn't so.  A kernel socket for a network namespace must be
> destroyed in the network namespace teardown.

The issue would be an skbuff with a reference to that sk and
sock_wfree as destructor escaping the network namespace with that
reference intact.

Both macvlan and veth scrub packets before passing them between
namespaces. That alone should prevent this.

I'll try to reproduce with a short dedicated test on the current net branch.

>> I observed skbs queued on a device queue in another namespace from
>> a kernel socket in SOCK_DEAD state with dangling sock_net(sk). Socket
>> refcnt is zero, but sk_wmem_alloc is not. (This was on an older
>> kernel, have not yet tried to reproduce on net).

Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-24 Thread Rick Jones


On 06/24/2016 02:46 PM, Tom Herbert wrote:

On Fri, Jun 24, 2016 at 2:36 PM, Rick Jones  wrote:

How would you define "severely?"  Has it actually been more severe than for
say ECN?  Or it was for say SACK or PAWS?


ECN is probably even a bigger disappointment in terms of seeing
deployment :-( From http://ecn.ethz.ch/ecn-pam15.pdf:

"Even though ECN was standardized in 2001, and it is widely
implemented in end-systems, it is barely deployed. This is due to a
history of problems with severely broken middleboxes shortly after
standardization, which led to connectivity failure and guidance to
leave ECN disabled."

SACK and PAWS seemed to have fared a little better I believe.


The conclusion of that (rather interesting) paper reads:

"Our analysis therefore indicates that enabling ECN by default would
lead to connections to about five websites per thousand to suffer
additional setup latency with RFC 3168 fallback. This represents an
order of magnitude fewer than the about forty per thousand which
experience transient or permanent connection failure due to other
operational issues"

Doesn't that then suggest that not enabling ECN is basically a matter of 
FUD more than remaining assumed broken middleboxes?


My main point is that in the past at least, trouble with broken 
middleboxes didn't lead us to start wrapping all our TCP/transport 
traffic in UDP to try to hide it from them.  We've managed to get SACK 
and PAWS universal without having to resort to that, and it would seem 
we could get ECN universal if we could overcome our FUD.  Why would TFO 
for instance be any different?


There was an equally interesting second paragraph in the conclusion:

"As not all websites are equally popular, failures on five per thousand
websites does not by any means imply that five per thousand connection 
attempts will fail. While estimation of connection attempt rate by rank 
is out of scope of this work, we note that the highest ranked website 
exhibiting stable connection failure has rank 596, and only 13 such 
sites appear in the top 5000"


rick jones

Re: switch / linux STP interoperation issues.

2016-06-24 Thread Michal Soltys


On 2016-06-24 23:09, Elad Raz wrote:

On Fri, Jun 24, 2016 at 10:14 PM, Michal Soltys  wrote:

Hi,
 
The switch respected BPDUs sent to it (if applicable) - for example it
complied properly if its priority was less (numerically higher) than
linux's - showing linux box as root bridge, marking one port as root, the
other as alternate/blocking.

The linux box itself was completely deaf to any BPDUs arriving to it (e.g.
if its priority was lower) and just kept pushing its own data units all
the time with little care (quickly leading to loops in some scenarios).
Whether it was builtin stp implementation, or whether it was mstpd's
stp/rstp/mstp - the behaviour was the same.



Linux is fresh stock archlinux (so vanilla 4.6.2 kernel and the most (or
almost) recent userland utils - iproute2, etc.), running on relatively
recent poweredge dell.

I'm kind of lost at this point - am I missing some basic
options/sysctls/sysfs ? Is there some known incompatibility here somewhere
between switch/linux/nic/versions/etc. ? Some by-default enabled BPDU
filtering maybe ?
 

Any suggestions / hints appreciated.


Please see Ido Schimmel's fix "[net] bridge: Fix incorrect
re-injection of STP packets",
https://patchwork.ozlabs.org/patch/629768/



Thanks !

Will test it on monday asap.

Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-24 Thread Tom Herbert

On Fri, Jun 24, 2016 at 2:36 PM, Rick Jones  wrote:
> On 06/24/2016 02:12 PM, Tom Herbert wrote:
>>
>> The client OS side is only part of the story. Middlebox intrusion at
>> L4 is also a major issue we need to address. The "failure" of TFO is a
>> good case study. Both the upgrade issues on clients and the tendency
>> for some middleboxes to drop SYN packets with data have together
>> severely hindered what otherwise should have been straightforward and
>> useful feature to deploy.
>
>
> How would you define "severely?"  Has it actually been more severe than for
> say ECN?  Or it was for say SACK or PAWS?
>
ECN is probably even a bigger disappointment in terms of seeing
deployment :-( From http://ecn.ethz.ch/ecn-pam15.pdf:

"Even though ECN was standardized in 2001, and it is widely
implemented in end-systems, it is barely deployed. This is due to a
history of problems with severely broken middleboxes shortly after
standardization, which led to connectivity failure and guidance to
leave ECN disabled."

SACK and PAWS seemed to have fared a little better I believe.

Tom

> rick jones
>

Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-24 Thread Rick Jones


On 06/24/2016 02:12 PM, Tom Herbert wrote:

The client OS side is only part of the story. Middlebox intrusion at
L4 is also a major issue we need to address. The "failure" of TFO is a
good case study. Both the upgrade issues on clients and the tendency
for some middleboxes to drop SYN packets with data have together
severely hindered what otherwise should have been straightforward and
useful feature to deploy.


How would you define "severely?"  Has it actually been more severe than 
for say ECN?  Or it was for say SACK or PAWS?


rick jones

Re: [PATCH net-next 0/8] tou: Transports over UDP - part I

2016-06-24 Thread Tom Herbert

On Thu, Jun 23, 2016 at 12:50 AM, Richard Weinberger  wrote:
> Am 23.06.2016 um 09:40 schrieb David Miller:
>> From: Richard Weinberger 
>> Date: Thu, 23 Jun 2016 00:15:04 +0200
>>
>>> On Thu, Jun 16, 2016 at 7:51 PM, Tom Herbert  wrote:
>>>> Transports over UDP is intended to encapsulate TCP and other transport
>>>> protocols directly and securely in UDP.
>>>>
>>>> The goal of this work is twofold:
>>>>
>>>> 1) Allow applications to run their own transport layer stack (i.e. from
>>>>    userspace). This eliminates dependencies on the OS (e.g. solves a
>>>>    major dependency issue for Facebook on clients).
>>>
>>> Facebook on clients would be a Facebook app on mobile devices?
>>> Does that mean that the Facebook app is so advanced and complicated
>>> that it needs a special TCP stack?!
>>
>> No, the TCP stack in the android/iOS/Windows kernel is so out of date
>> that in order to get even moderately recent TCP features it is
>> necessary to do this.
>
> I see.
> So the plan is bringing TOU into almost every kernel out there
> and then ship Apps with their own TCP stacks since vendors are unable
> to deliver decent updates.
>
> I didn't realize that the situation is *that* worse. :(
>
The client OS side is only part of the story. Middlebox intrusion at
L4 is also a major issue we need to address. The "failure" of TFO is a
good case study. Both the upgrade issues on clients and the tendency
for some middleboxes to drop SYN packets with data have together
severely hindered what otherwise should have been straightforward and
useful feature to deploy.

Tom

> //richard

Re: switch / linux STP interoperation issues.

2016-06-24 Thread Elad Raz

On Fri, Jun 24, 2016 at 10:14 PM, Michal Soltys  wrote:
> Hi,
>
> In the last week I've been trying to get STP on the linux side (both its
> builtin STP implementation as well as mstpd userspace daemon). Initially I
> started with more complex setups (vlan aware bridge, bonds, mst) and
> gradually (with identical problems on each step) ended with the most basic
> setup that can be summarized by:
>
> brctl addbr br0
> brctl addif br0 eno1
> brctl addif br0 eno2
> brctl stp br0 on
> ip li set eno1 up
> ip li set eno2 up
> ip li set br0 up
>
> The same config on switch's side (cisco 2960-x in its most basic
> incarnation) - in the other words two cables between linux machine and the
> switch, enabled stp, access ports in vlan1.
>
> The end effect of this setup (and any of the more complex previous ones):
>
> The switch respected BPDUs sent to it (if applicable) - for example it
> complied properly if its priority was less (numerically higher) than
> linux's - showing linux box as root bridge, marking one port as root, the
> other as alternate/blocking.
>
> The linux box itself was completely deaf to any BPDUs arriving to it (e.g.
> if its priority was lower) and just kept pushing its own data units all
> the time with little care (quickly leading to loops in some scenarios).
> Whether it was builtin stp implementation, or whether it was mstpd's
> stp/rstp/mstp - the behaviour was the same.
>
> With the bridge itself happily claiming to be the root (despite lower
> priority):
>
> br0
>  bridge id  a000.000af77cddc4
>  designated roota000.000af77cddc4
>  root port 0
>
> 
>
> enp8s0f0 (3)
>  port id8003state forwarding
>  designated roota000.000af77cddc4   path cost4
>  designated bridge  a000.000af77cddc4
>
> 
> (and analogous output from mstpctl tool)
>
> tcpdump looked like:
>
> 17:33:28.701425 00:0a:f7:7c:dd:c4 > 01:80:c2:00:00:00, 802.3, length 52:
> LLC, dsap STP (0x42) Individual, ssap STP (0x42) Command, ctrl 0x03
> : STP 802.1d, Config, Flags [none], bridge-id a000.00:0a:f7:7c:dd:c4.8003,
> length 35
> message-age 0.00s, max-age 20.00s, hello-time 2.00s,
> forwarding-delay 15.00s
> root-id a000.00:0a:f7:7c:dd:c4, root-pathcost 0
> 17:33:29.026185 18:8b:45:6f:38:86 > 01:80:c2:00:00:00, 802.3, length 60:
> LLC, dsap STP (0x42) Individual, ssap STP (0x42) Command, ctrl 0x03
> : STP 802.1d, Config, Flags [none], bridge-id 2001.18:8b:45:6f:38:80.8006,
> length 43
> message-age 0.00s, max-age 20.00s, hello-time 2.00s,
> forwarding-delay 15.00s
> root-id 2001.18:8b:45:6f:38:80, root-pathcost 0
>
> The first sent by linux box, the second by the switch (the above from basic
> stp scenario on both sides).
>
>
> The cards in question used:
> Ethernet controller: Broadcom Corporation NetXtreme BCM5719 Gigabit Ethernet
> PCIe (rev 01) handled by:
>
> driver: tg3
> version: 3.137
> firmware-version: FFV7.10.17 bc 5719-v1.37
>
>
> Linux is fresh stock archlinux (so vanilla 4.6.2 kernel and the most (or
> almost) recent userland utils - iproute2, etc.), running on relatively
> recent poweredge dell.
>
> I'm kind of lost at this point - am I missing some basic
> options/sysctls/sysfs ? Is there some known incompatibility here somewhere
> between switch/linux/nic/versions/etc. ? Some by-default enabled BPDU
> filtering maybe ?
>
>
> Any suggestions / hints appreciated.

Please see Ido Schimmel's fix "[net] bridge: Fix incorrect
re-injection of STP packets",
https://patchwork.ozlabs.org/patch/629768/

Re: [PATCH] mac80211_hwsim: Added vendor echo command

2016-06-24 Thread Jouni Malinen

On Fri, Jun 24, 2016 at 10:13:54AM +0200, Erik Stromdahl wrote:
> The purpose of the echo command is to provide a test
> facility for user space programs.

> diff --git a/drivers/net/wireless/mac80211_hwsim.c 
> b/drivers/net/wireless/mac80211_hwsim.c
> @@ -332,14 +332,16 @@ static const struct ieee80211_rate hwsim_rates[] = {
>  #define QCA_NL80211_SUBCMD_TEST 1
> +#define QCA_NL80211_SUBCMD_ECHO 2

NAK. That QCA vendor specific value has not been assigned nor have I
even seen a request to assign such a value.

>  enum qca_nl80211_vendor_subcmds {
>   QCA_WLAN_VENDOR_ATTR_TEST = 8,
> - QCA_WLAN_VENDOR_ATTR_MAX = QCA_WLAN_VENDOR_ATTR_TEST
> + QCA_WLAN_VENDOR_ATTR_ECHO,

And this vendor attribute value has already been assigned for another
purpose.

It is not acceptable to pick arbitrary values of identifiers without
proper request to the owner of the OUI that is used to assign the
values.

As far as the QCA vendor specific identifiers are concerned, their
assignment process is described here:
http://w1.fi/cgit/hostap/plain/src/common/qca-vendor.h

Please do not submit kernel changes that use any unassigned vendor
identifier or even worse, a value that has already been assigned for a
completely different purpose.

-- 
Jouni MalinenPGP id EFC895FA

Re: [PATCH net] sock_diag: invert socket destroy broadcast check

2016-06-24 Thread Eric W. Biederman

Willem de Bruijn  writes:

> From: Willem de Bruijn 
>
> Socket destruction is only broadcast for a socket sk if a diag
> listener is registered and sk is not a kernel socket.
>
> Invert the test to not even check for listeners for kernel sockets.
>
> The sock_diag_has_destroy_listeners invocation dereferences
> sock_net(sk), which for kernel sockets can be invalid as they do not
> take a reference on the network namespace.

No.  That isn't so.  A kernel socket for a network namespace must be
destroyed in the network namespace teardown.

Therefore sock_net(sk) should always be valid.  If sock_net(sk) is not
valid something very broken is going on.

Eric

> Fixes: b922622ec6ef ("sock_diag: don't broadcast kernel sockets")
> Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the
>   netns of kernel sockets.")
> Signed-off-by: Willem de Bruijn 
>
> ---
>
> This patch fixes this immediate codepath. A broader issue of live
> kernel sockets pointing to deleted namespaces may persist.
>
> I observed skbs queued on a device queue in another namespace from
> a kernel socket in SOCK_DEAD state with dangling sock_net(sk). Socket
> refcnt is zero, but sk_wmem_alloc is not. (This was on an older
> kernel, have not yet tried to reproduce on net).
>
> It seems that we may need to reintroduce namespace reference counting
> for kernel sockets (with two-stage deletion to avoid the circular
> reference), scrub packets between namespaces, or reparent kernel
> sockets to init_net on namespace destruction.
> ---
>  net/core/sock.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/net/core/sock.c b/net/core/sock.c
> index 08bf97e..ba082b4 100644
> --- a/net/core/sock.c
> +++ b/net/core/sock.c
> @@ -1473,7 +1473,7 @@ void sk_destruct(struct sock *sk)
>  
>  static void __sk_free(struct sock *sk)
>  {
> - if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
> + if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
>   sock_diag_broadcast_destroy(sk);
>   else
>   sk_destruct(sk);

[PATCH v3] netfilter/nflog: nflog-range does not truncate packets (userspace)

2016-06-24 Thread Vishwanath Pai

Added tests to libxt_NFLOG.t for the new option --nflog-size

--

netfilter/nflog: nflog-range does not truncate packets

The option --nflog-range has never worked, but we cannot just fix this
because users might be using this feature option and their behavior would
change. Instead add a new option --nflog-size. This option works the same
way nflog-range should have, and both of them are mutually exclusive. When
someone uses --nflog-range we print a warning message informing them that
this feature has no effect.

To indicate to the kernel that the user has set --nflog-size we have to pass a
new flag XT_NFLOG_F_COPY_LEN.

Also updated the man page to reflect the new option and added tests to
extensions/libxt_NFLOG.t

Reported-by: Joe Dollard 
Reviewed-by: Josh Hunt 
Signed-off-by: Vishwanath Pai 

diff --git a/extensions/libxt_NFLOG.c b/extensions/libxt_NFLOG.c
index f611631..8c67066 100644
--- a/extensions/libxt_NFLOG.c
+++ b/extensions/libxt_NFLOG.c
@@ -12,7 +12,10 @@ enum {
O_GROUP = 0,
O_PREFIX,
O_RANGE,
+   O_SIZE,
O_THRESHOLD,
+   F_RANGE = 1 << O_RANGE,
+   F_SIZE = 1 << O_SIZE,
 };
 
 #define s struct xt_nflog_info
@@ -22,7 +25,9 @@ static const struct xt_option_entry NFLOG_opts[] = {
{.name = "nflog-prefix", .id = O_PREFIX, .type = XTTYPE_STRING,
 .min = 1, .flags = XTOPT_PUT, XTOPT_POINTER(s, prefix)},
{.name = "nflog-range", .id = O_RANGE, .type = XTTYPE_UINT32,
-.flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
+.excl = F_SIZE, .flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
+   {.name = "nflog-size", .id = O_SIZE, .type = XTTYPE_UINT32,
+.excl = F_RANGE, .flags = XTOPT_PUT, XTOPT_POINTER(s, len)},
{.name = "nflog-threshold", .id = O_THRESHOLD, .type = XTTYPE_UINT16,
 .flags = XTOPT_PUT, XTOPT_POINTER(s, threshold)},
XTOPT_TABLEEND,
@@ -33,7 +38,8 @@ static void NFLOG_help(void)
 {
printf("NFLOG target options:\n"
   " --nflog-group NUM  NETLINK group used for 
logging\n"
-  " --nflog-range NUM  Number of byte to copy\n"
+  " --nflog-range NUM  This option has no effect, use 
--nflog-size\n"
+  " --nflog-size NUM   Number of bytes to copy\n"
   " --nflog-threshold NUM  Message threshold of in-kernel 
queue\n"
   " --nflog-prefix STRING  Prefix string for log 
messages\n");
 }
@@ -57,6 +63,18 @@ static void NFLOG_parse(struct xt_option_call *cb)
}
 }
 
+static void NFLOG_check(struct xt_fcheck_call *cb)
+{
+   struct xt_nflog_info *info = cb->data;
+
+   if (cb->xflags & F_RANGE)
+   fprintf(stderr, "warn: --nflog-range has never worked and is no"
+   " longer supported, please use --nflog-size insted\n");
+
+   if (cb->xflags & F_SIZE)
+   info->flags |= XT_NFLOG_F_COPY_LEN;
+}
+
 static void nflog_print(const struct xt_nflog_info *info, char *prefix)
 {
if (info->prefix[0] != '\0') {
@@ -65,7 +83,9 @@ static void nflog_print(const struct xt_nflog_info *info, 
char *prefix)
}
if (info->group)
printf(" %snflog-group %u", prefix, info->group);
-   if (info->len)
+   if (info->len && info->flags & XT_NFLOG_F_COPY_LEN)
+   printf(" %snflog-size %u", prefix, info->len);
+   else if (info->len)
printf(" %snflog-range %u", prefix, info->len);
if (info->threshold != XT_NFLOG_DEFAULT_THRESHOLD)
printf(" %snflog-threshold %u", prefix, info->threshold);
@@ -117,6 +137,7 @@ static struct xtables_target nflog_target = {
.userspacesize  = XT_ALIGN(sizeof(struct xt_nflog_info)),
.help   = NFLOG_help,
.init   = NFLOG_init,
+   .x6_fcheck  = NFLOG_check,
.x6_parse   = NFLOG_parse,
.print  = NFLOG_print,
.save   = NFLOG_save,
diff --git a/extensions/libxt_NFLOG.man b/extensions/libxt_NFLOG.man
index 1b6dbf1..318e630 100644
--- a/extensions/libxt_NFLOG.man
+++ b/extensions/libxt_NFLOG.man
@@ -17,6 +17,9 @@ A prefix string to include in the log message, up to 64 
characters
 long, useful for distinguishing messages in the logs.
 .TP
 \fB\-\-nflog\-range\fP \fIsize\fP
+This option has never worked, use --nflog-size instead
+.TP
+\fB\-\-nflog\-size\fP \fIsize\fP
 The number of bytes to be copied to userspace (only applicable for
 nfnetlink_log). nfnetlink_log instances may specify their own
 range, this option overrides it.
diff --git a/extensions/libxt_NFLOG.t b/extensions/libxt_NFLOG.t
index f9768aa..78076b5 100644
--- a/extensions/libxt_NFLOG.t
+++ b/extensions/libxt_NFLOG.t
@@ -7,6 +7,10 @@
 -j NFLOG --nflog-range 4294967295;=;OK
 -j NFLOG --nflog-range 4294967296;;FAIL
 -j NFLOG --nflog-range -1;;FAIL
+-j NFLOG --nflog-size 1;=;OK
+-j NFLOG --nflog-size 4294967295;=;OK
+-j NFLOG --nflog-size 4294967296;;FAIL
+-

[PATCH] netfilter: Convert FWINV<[foo]> macros and uses to NF_INVF

2016-06-24 Thread Joe Perches

netfilter uses multiple FWINV #defines with identical form that hide a
specific structure variable and dereference it with an invflags member.

$ git grep "#define FWINV"
include/linux/netfilter_bridge/ebtables.h:#define FWINV(bool,invflg) ((bool) ^ 
!!(info->invflags & invflg))
net/bridge/netfilter/ebtables.c:#define FWINV2(bool, invflg) ((bool) ^ 
!!(e->invflags & invflg))
net/ipv4/netfilter/arp_tables.c:#define FWINV(bool, invflg) ((bool) ^ 
!!(arpinfo->invflags & (invflg)))
net/ipv4/netfilter/ip_tables.c:#define FWINV(bool, invflg) ((bool) ^ 
!!(ipinfo->invflags & (invflg)))
net/ipv6/netfilter/ip6_tables.c:#define FWINV(bool, invflg) ((bool) ^ 
!!(ip6info->invflags & (invflg)))
net/netfilter/xt_tcpudp.c:#define FWINVTCP(bool, invflg) ((bool) ^ 
!!(tcpinfo->invflags & (invflg)))

Consolidate these macros into a single NF_INVF macro.

Miscellanea:

o Neaten the alignment around these uses
o A few lines are > 80 columns for intelligibility

Signed-off-by: Joe Perches 
---
 include/linux/netfilter/x_tables.h|  4 +++
 include/linux/netfilter_bridge/ebtables.h |  2 --
 net/bridge/netfilter/ebt_802_3.c  |  6 ++--
 net/bridge/netfilter/ebt_arp.c| 38 +++---
 net/bridge/netfilter/ebt_ip.c | 28 -
 net/bridge/netfilter/ebt_ip6.c| 41 +---
 net/bridge/netfilter/ebt_stp.c| 52 ---
 net/bridge/netfilter/ebtables.c   | 27 
 net/ipv4/netfilter/arp_tables.c   | 41 
 net/ipv4/netfilter/ip_tables.c| 20 ++--
 net/ipv6/netfilter/ip6_tables.c   | 16 +-
 net/netfilter/xt_tcpudp.c |  7 ++---
 12 files changed, 144 insertions(+), 138 deletions(-)

diff --git a/include/linux/netfilter/x_tables.h 
b/include/linux/netfilter/x_tables.h
index dc4f58a..e94e81a 100644
--- a/include/linux/netfilter/x_tables.h
+++ b/include/linux/netfilter/x_tables.h
@@ -6,6 +6,10 @@
 #include 
 #include 
 
+/* Test a struct->invflags and a boolean for inequality */
+#define NF_INVF(ptr, flag, boolean)\
+   ((boolean) ^ !!((ptr)->invflags & (flag)))
+
 /**
  * struct xt_action_param - parameters for matches/targets
  *
diff --git a/include/linux/netfilter_bridge/ebtables.h 
b/include/linux/netfilter_bridge/ebtables.h
index 2ea517c..984b211 100644
--- a/include/linux/netfilter_bridge/ebtables.h
+++ b/include/linux/netfilter_bridge/ebtables.h
@@ -115,8 +115,6 @@ extern unsigned int ebt_do_table(struct sk_buff *skb,
 const struct nf_hook_state *state,
 struct ebt_table *table);
 
-/* Used in the kernel match() functions */
-#define FWINV(bool,invflg) ((bool) ^ !!(info->invflags & invflg))
 /* True if the hook mask denotes that the rule is in a base chain,
  * used in the check() functions */
 #define BASE_CHAIN (par->hook_mask & (1 << NF_BR_NUMHOOKS))
diff --git a/net/bridge/netfilter/ebt_802_3.c b/net/bridge/netfilter/ebt_802_3.c
index 2a449b7..5fc4aff 100644
--- a/net/bridge/netfilter/ebt_802_3.c
+++ b/net/bridge/netfilter/ebt_802_3.c
@@ -20,16 +20,16 @@ ebt_802_3_mt(const struct sk_buff *skb, struct 
xt_action_param *par)
__be16 type = hdr->llc.ui.ctrl & IS_UI ? hdr->llc.ui.type : 
hdr->llc.ni.type;
 
if (info->bitmask & EBT_802_3_SAP) {
-   if (FWINV(info->sap != hdr->llc.ui.ssap, EBT_802_3_SAP))
+   if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.ssap))
return false;
-   if (FWINV(info->sap != hdr->llc.ui.dsap, EBT_802_3_SAP))
+   if (NF_INVF(info, EBT_802_3_SAP, info->sap != hdr->llc.ui.dsap))
return false;
}
 
if (info->bitmask & EBT_802_3_TYPE) {
if (!(hdr->llc.ui.dsap == CHECK_TYPE && hdr->llc.ui.ssap == 
CHECK_TYPE))
return false;
-   if (FWINV(info->type != type, EBT_802_3_TYPE))
+   if (NF_INVF(info, EBT_802_3_TYPE, info->type != type))
return false;
}
 
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index cca0a89..2271422 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -25,14 +25,14 @@ ebt_arp_mt(const struct sk_buff *skb, struct 
xt_action_param *par)
ah = skb_header_pointer(skb, 0, sizeof(_arph), &_arph);
if (ah == NULL)
return false;
-   if (info->bitmask & EBT_ARP_OPCODE && FWINV(info->opcode !=
-  ah->ar_op, EBT_ARP_OPCODE))
+   if ((info->bitmask & EBT_ARP_OPCODE) &&
+   NF_INVF(info, EBT_ARP_OPCODE, info->opcode != ah->ar_op))
return false;
-   if (info->bitmask & EBT_ARP_HTYPE && FWINV(info->htype !=
-  ah->ar_hrd, EBT_ARP_HTYPE))
+   if ((info->bitmask & EBT_ARP_HTYPE) &&
+   NF_INVF(info, EBT_ARP_HT

[PATCH net] sock_diag: invert socket destroy broadcast check

2016-06-24 Thread Willem de Bruijn

From: Willem de Bruijn 

Socket destruction is only broadcast for a socket sk if a diag
listener is registered and sk is not a kernel socket.

Invert the test to not even check for listeners for kernel sockets.

The sock_diag_has_destroy_listeners invocation dereferences
sock_net(sk), which for kernel sockets can be invalid as they do not
take a reference on the network namespace.

Fixes: b922622ec6ef ("sock_diag: don't broadcast kernel sockets")
Fixes: 26abe14379f8 ("net: Modify sk_alloc to not reference count the
  netns of kernel sockets.")
Signed-off-by: Willem de Bruijn 

---

This patch fixes this immediate codepath. A broader issue of live
kernel sockets pointing to deleted namespaces may persist.

I observed skbs queued on a device queue in another namespace from
a kernel socket in SOCK_DEAD state with dangling sock_net(sk). Socket
refcnt is zero, but sk_wmem_alloc is not. (This was on an older
kernel, have not yet tried to reproduce on net).

It seems that we may need to reintroduce namespace reference counting
for kernel sockets (with two-stage deletion to avoid the circular
reference), scrub packets between namespaces, or reparent kernel
sockets to init_net on namespace destruction.
---
 net/core/sock.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 08bf97e..ba082b4 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1473,7 +1473,7 @@ void sk_destruct(struct sock *sk)
 
 static void __sk_free(struct sock *sk)
 {
-   if (unlikely(sock_diag_has_destroy_listeners(sk) && sk->sk_net_refcnt))
+   if (unlikely(sk->sk_net_refcnt && sock_diag_has_destroy_listeners(sk)))
sock_diag_broadcast_destroy(sk);
else
sk_destruct(sk);
-- 
2.8.0.rc3.226.g39d4020

[PATCH net] sock_diag: do not broadcast raw socket destruction

2016-06-24 Thread Willem de Bruijn

From: Willem de Bruijn 

Diag intends to broadcast tcp_sk and udp_sk socket destruction.
Testing sk->sk_protocol for IPPROTO_TCP/IPPROTO_UDP alone is not
sufficient for this. Raw sockets can have the same type.

Add a test for sk->sk_type.

Fixes: eb4cb008529c ("sock_diag: define destruction multicast groups")
Signed-off-by: Willem de Bruijn 
---
 include/linux/sock_diag.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/linux/sock_diag.h b/include/linux/sock_diag.h
index 4018b48..a0596ca0 100644
--- a/include/linux/sock_diag.h
+++ b/include/linux/sock_diag.h
@@ -36,6 +36,9 @@ enum sknetlink_groups sock_diag_destroy_group(const struct 
sock *sk)
 {
switch (sk->sk_family) {
case AF_INET:
+   if (sk->sk_type == SOCK_RAW)
+   return SKNLGRP_NONE;
+
switch (sk->sk_protocol) {
case IPPROTO_TCP:
return SKNLGRP_INET_TCP_DESTROY;
@@ -45,6 +48,9 @@ enum sknetlink_groups sock_diag_destroy_group(const struct 
sock *sk)
return SKNLGRP_NONE;
}
case AF_INET6:
+   if (sk->sk_type == SOCK_RAW)
+   return SKNLGRP_NONE;
+
switch (sk->sk_protocol) {
case IPPROTO_TCP:
return SKNLGRP_INET6_TCP_DESTROY;
-- 
2.8.0.rc3.226.g39d4020

Re: [PATCH] Maxim/driver: Add driver for maxim ds26522

2016-06-24 Thread David Miller

From: Qiang Zhao 
Date: Fri, 24 Jun 2016 02:00:33 +

> On Thu, 2016-06-23 at 10:59PM, David Miller wrote:
>> -Original Message-
>> From: David Miller [mailto:da...@davemloft.net]
>> Sent: Thursday, June 23, 2016 10:59 PM
>> To: Qiang Zhao 
>> Cc: o...@buserror.net; linux-ker...@vger.kernel.org; netdev@vger.kernel.org;
>> Xiaobo Xie 
>> Subject: Re: [PATCH] Maxim/driver: Add driver for maxim ds26522
>> 
>> From: Zhao Qiang 
>> Date: Thu, 23 Jun 2016 09:09:45 +0800
>> 
>> > +MODULE_DESCRIPTION(DRV_DESC);
>> 
>> There is no definition of DRV_DESC, so this makes it look like you didn't 
>> even
>> compile this driver.
> 
> I really, really compiled this driver.
> Thank you for your review and comments. I will modify it in the next version.
> 
> [zhaoqiang@titan:~/upstream/linux]$ll drivers/net/wan/slic_ds26522.o
> -rw-r--r-- 1 zhaoqiang klocwork 153288 Jun 22 15:48 
> drivers/net/wan/slic_ds26522.o
> [zhaoqiang@titan:~/upstream/linux]$date

Obviously with the driver not configured as a module, and thus explicitly
not testing the failing statement at all.

switch / linux STP interoperation issues.

2016-06-24 Thread Michal Soltys


Hi,

In the last week I've been trying to get STP working on the linux side (both 
its builtin STP implementation as well as the mstpd userspace daemon). Initially 
I started with more complex setups (vlan aware bridge, bonds, mst) and 
gradually (with identical problems on each step) ended with the most 
basic setup that can be summarized by:


brctl addbr br0
brctl addif br0 eno1
brctl addif br0 eno2
brctl stp br0 on
ip li set eno1 up
ip li set eno2 up
ip li set br0 up

The same config on switch's side (cisco 2960-x in its most basic 
incarnation) - in other words, two cables between the linux machine and 
the switch, enabled stp, access ports in vlan1.


The end effect of this setup (and any of the more complex previous ones):

The switch respected BPDUs sent to it (if applicable) - for example it 
complied properly if its priority was less (numerically higher) than 
linux's - showing linux box as root bridge, marking one port as root, 
the other as alternate/blocking.


The linux box itself was completely deaf to any BPDUs arriving to it 
(e.g. if its priority was lower) and just kept pushing its own data 
units all the time with little care (quickly leading to loops in some 
scenarios). Whether it was builtin stp implementation, or whether it was 
mstpd's stp/rstp/mstp - the behaviour was the same.


With the bridge itself happily claiming to be the root (despite lower 
priority):


br0
 bridge id  a000.000af77cddc4
 designated root  a000.000af77cddc4
 root port 0



enp8s0f0 (3)
 port id  8003  state  forwarding
 designated root  a000.000af77cddc4  path cost  4

 designated bridge  a000.000af77cddc4


(and analogous output from mstpctl tool)

tcpdump looked like:

17:33:28.701425 00:0a:f7:7c:dd:c4 > 01:80:c2:00:00:00, 802.3, length 52: 
LLC, dsap STP (0x42) Individual, ssap STP (0x42) Command, ctrl 0x03
: STP 802.1d, Config, Flags [none], bridge-id 
a000.00:0a:f7:7c:dd:c4.8003, length 35
message-age 0.00s, max-age 20.00s, hello-time 2.00s, 
forwarding-delay 15.00s

root-id a000.00:0a:f7:7c:dd:c4, root-pathcost 0
17:33:29.026185 18:8b:45:6f:38:86 > 01:80:c2:00:00:00, 802.3, length 60: 
LLC, dsap STP (0x42) Individual, ssap STP (0x42) Command, ctrl 0x03
: STP 802.1d, Config, Flags [none], bridge-id 
2001.18:8b:45:6f:38:80.8006, length 43
message-age 0.00s, max-age 20.00s, hello-time 2.00s, 
forwarding-delay 15.00s

root-id 2001.18:8b:45:6f:38:80, root-pathcost 0

The first sent by linux box, the second by the switch (the above from 
basic stp scenario on both sides).



The cards in question used:
Ethernet controller: Broadcom Corporation NetXtreme BCM5719 Gigabit 
Ethernet PCIe (rev 01) handled by:


driver: tg3
version: 3.137
firmware-version: FFV7.10.17 bc 5719-v1.37


Linux is fresh stock archlinux (so vanilla 4.6.2 kernel and the most (or 
almost) recent userland utils - iproute2, etc.), running on relatively 
recent poweredge dell.


I'm kind of lost at this point - am I missing some basic 
options/sysctls/sysfs ? Is there some known incompatibility here 
somewhere between switch/linux/nic/versions/etc. ? Some by-default 
enabled BPDU filtering maybe ?



Any suggestions / hints appreciated.

Re: [PATCH] net: ethernet: ti: cpdma: switch to use genalloc

2016-06-24 Thread Lennart Sorensen

On Fri, Jun 24, 2016 at 07:58:32PM +0300, Grygorii Strashko wrote:
> Oh. nice :( So, seems, I'd need to send v3. Right?
> By the way, this code hasn't been introduced by this patch - I've
> just moved whole function from one place to another.

Well since it is moving I would think that was a handy time to fix the
coding style violation too, since it got noticed.

That leaves just one place in that file violating that part of the coding
style (the other is in cpdma_chan_dump).

Somehow it wasn't spotted when the code was put in back in 2010, and since
they were wrapped lines, they don't stand out quite as much visually.

-- 
Len Sorensen

[PATCH] etherdevice.h & bridge: netfilter: Add and use ether_addr_equal_masked

2016-06-24 Thread Joe Perches

There are code duplications of a masked ethernet address comparison here
so make it a separate function instead.

Miscellanea:

o Neaten alignment of FWINV macro uses to make it clearer for the reader

Signed-off-by: Joe Perches 
---
 include/linux/etherdevice.h | 23 +++
 net/bridge/netfilter/ebt_arp.c  | 17 +-
 net/bridge/netfilter/ebt_stp.c  | 49 ++---
 net/bridge/netfilter/ebtables.c | 17 +-
 4 files changed, 57 insertions(+), 49 deletions(-)

diff --git a/include/linux/etherdevice.h b/include/linux/etherdevice.h
index 37ff4a6..6fec9e8 100644
--- a/include/linux/etherdevice.h
+++ b/include/linux/etherdevice.h
@@ -374,6 +374,29 @@ static inline bool ether_addr_equal_unaligned(const u8 
*addr1, const u8 *addr2)
 }
 
 /**
+ * ether_addr_equal_masked - Compare two Ethernet addresses with a mask
+ * @addr1: Pointer to a six-byte array containing the 1st Ethernet address
+ * @addr2: Pointer to a six-byte array containing the 2nd Ethernet address
+ * @mask: Pointer to a six-byte array containing the Ethernet address bitmask
+ *
+ * Compare two Ethernet addresses with a mask, returns true if for every bit
+ * set in the bitmask the equivalent bits in the ethernet addresses are equal.
+ * Using a mask with all bits set is a slower ether_addr_equal.
+ */
+static inline bool ether_addr_equal_masked(const u8 *addr1, const u8 *addr2,
+  const u8 *mask)
+{
+   int i;
+
+   for (i = 0; i < ETH_ALEN; i++) {
+   if ((addr1[i] ^ addr2[i]) & mask[i])
+   return false;
+   }
+
+   return true;
+}
+
+/**
  * is_etherdev_addr - Tell if given Ethernet address belongs to the device.
  * @dev: Pointer to a device structure
  * @addr: Pointer to a six-byte array containing the Ethernet address
diff --git a/net/bridge/netfilter/ebt_arp.c b/net/bridge/netfilter/ebt_arp.c
index cd457b8..cca0a89 100644
--- a/net/bridge/netfilter/ebt_arp.c
+++ b/net/bridge/netfilter/ebt_arp.c
@@ -65,7 +65,6 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param 
*par)
if (info->bitmask & (EBT_ARP_SRC_MAC | EBT_ARP_DST_MAC)) {
const unsigned char *mp;
unsigned char _mac[ETH_ALEN];
-   uint8_t verdict, i;
 
if (ah->ar_hln != ETH_ALEN || ah->ar_hrd != htons(ARPHRD_ETHER))
return false;
@@ -74,11 +73,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param 
*par)
sizeof(_mac), &_mac);
if (mp == NULL)
return false;
-   verdict = 0;
-   for (i = 0; i < 6; i++)
-   verdict |= (mp[i] ^ info->smaddr[i]) &
-  info->smmsk[i];
-   if (FWINV(verdict != 0, EBT_ARP_SRC_MAC))
+   if (FWINV(!ether_addr_equal_masked(mp, info->smaddr,
+  info->smmsk),
+ EBT_ARP_SRC_MAC))
return false;
}
 
@@ -88,11 +85,9 @@ ebt_arp_mt(const struct sk_buff *skb, struct xt_action_param 
*par)
sizeof(_mac), &_mac);
if (mp == NULL)
return false;
-   verdict = 0;
-   for (i = 0; i < 6; i++)
-   verdict |= (mp[i] ^ info->dmaddr[i]) &
-   info->dmmsk[i];
-   if (FWINV(verdict != 0, EBT_ARP_DST_MAC))
+   if (FWINV(!ether_addr_equal_masked(mp, info->dmaddr,
+  info->dmmsk),
+ EBT_ARP_DST_MAC))
return false;
}
}
diff --git a/net/bridge/netfilter/ebt_stp.c b/net/bridge/netfilter/ebt_stp.c
index e77f90b..45f73d5 100644
--- a/net/bridge/netfilter/ebt_stp.c
+++ b/net/bridge/netfilter/ebt_stp.c
@@ -46,7 +46,6 @@ static bool ebt_filter_config(const struct ebt_stp_info *info,
const struct ebt_stp_config_info *c;
u16 v16;
u32 v32;
-   int verdict, i;
 
c = &info->config;
if ((info->bitmask & EBT_STP_FLAGS) &&
@@ -54,66 +53,62 @@ static bool ebt_filter_config(const struct ebt_stp_info 
*info,
return false;
if (info->bitmask & EBT_STP_ROOTPRIO) {
v16 = NR16(stpc->root);
-   if (FWINV(v16 < c->root_priol ||
-   v16 > c->root_priou, EBT_STP_ROOTPRIO))
+   if (FWINV(v16 < c->root_priol || v16 > c->root_priou,
+ EBT_STP_ROOTPRIO))
return false;
}
if (info->bitmask & EBT_STP_ROOTADDR) {
-

[PATCH v2 01/15] drivers: net: cpsw: fix suspend when all ethX devices are down

2016-06-24 Thread Grygorii Strashko

The cpsw_suspend() could trigger L3 error and CPSW will stop
functioning if System enters suspend when all ethX net-devices are
down - in this case CPSW could be already suspended by PM runtime, but
cpsw_suspend() will try to call soft_reset_slave() unconditionally
and access CPSW registers.

Hence, fix it by moving soft_reset_slave() from cpsw_suspend() to
cpsw_slave_stop(). This way slave ports will be reset when CPSW is
active and will be in proper state during Suspend.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index e6bb0ec..736c77a 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1244,6 +1244,7 @@ static void cpsw_slave_stop(struct cpsw_slave *slave, 
struct cpsw_priv *priv)
slave->phy = NULL;
cpsw_ale_control_set(priv->ale, slave_port,
 ALE_PORT_STATE, ALE_PORT_STATE_DISABLE);
+   soft_reset_slave(slave);
 }
 
 static int cpsw_ndo_open(struct net_device *ndev)
@@ -2558,12 +2559,10 @@ static int cpsw_suspend(struct device *dev)
for (i = 0; i < priv->data.slaves; i++) {
if (netif_running(priv->slaves[i].ndev))
cpsw_ndo_stop(priv->slaves[i].ndev);
-   soft_reset_slave(priv->slaves + i);
}
} else {
if (netif_running(ndev))
cpsw_ndo_stop(ndev);
-   for_each_slave(priv, soft_reset_slave);
}
 
pm_runtime_put_sync(&pdev->dev);
-- 
2.9.0

[PATCH v2 02/15] drivers: net: cpsw: check return code from pm runtime calls

2016-06-24 Thread Grygorii Strashko

Add missed check of return code from PM runtime get() calls.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 12 ++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 736c77a..c76f9db 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1253,7 +1253,11 @@ static int cpsw_ndo_open(struct net_device *ndev)
int i, ret;
u32 reg;
 
-   pm_runtime_get_sync(&priv->pdev->dev);
+   ret = pm_runtime_get_sync(&priv->pdev->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(&priv->pdev->dev);
+   return ret;
+   }
 
if (!cpsw_common_res_usage_state(priv))
cpsw_intr_disable(priv);
@@ -2322,7 +2326,11 @@ static int cpsw_probe(struct platform_device *pdev)
/* Need to enable clocks with runtime PM api to access module
 * registers
 */
-   pm_runtime_get_sync(&pdev->dev);
+   ret = pm_runtime_get_sync(&pdev->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(&pdev->dev);
+   goto clean_runtime_disable_ret;
+   }
priv->version = readl(&priv->regs->id_ver);
pm_runtime_put_sync(&pdev->dev);
 
-- 
2.9.0

[PATCH v2 00/15] drivers: net: cpsw: improve runtime pm

2016-06-24 Thread Grygorii Strashko

This series is intended to improve runtime PM and allow CPSW to be
RPM suspended when all ethX netdevices are down.

To achieve above goal it is required to relax runtime PM constraints for
Davinci MDIO which blocks CPSW runtime PM now, because Davinci MDIO is always
powered on during probe and powered off only when it's going to be removed.
- Patches 6-11 implement PM runtime autosuspend for Davinci MDIO, but keep it
disabled by default, because Davinci MDIO is integrated in big set of TI devices
and not all of them verified to work correctly with RPM autosuspend enabled:
 expected to work on SoCs where MDIO is defined as part of CPSW in DT
 (cpsw.c DRA7/am57x, am437x, am335x)
The CPSW needs to be fixed before RPM suspend can be allowed:
 - Patches 1-5 ensure that CPSW will not cause L3 errors while it is in RPM
   suspended state.

Davinci MDIO RPM autosuspend can be enabled through sysfs:
 echo 100 > 
/sys/devices/../48484000.ethernet/48485000.mdio/power/autosuspend_delay_ms

Patches 12 - 15: introduce new compatible string "ti,cpsw-mdio" which is used
then to enable RPM for am335x/am437x/dra7 SoCs.

Tested on am335x, am437x, am572x and k2g (on k2g with RPM disabled for Davinci 
MDIO)
These changes should not affect the errata i877 implementation on DRA7.

Power measurement on am335x GP EVM:
 Without this series:  547.60 mW total SoC power
 With this series + "ifconfig eth0 down": 477.32 mW Total Soc Power

Changes in v2:
- CPSW ethtool interface updated to use .begin()/.complete() callbacks
- kbuild failure fixed
- davinci_mdio DT updated with proper description of allowed compatible strings
  combinations

Link to v1:
 https://lkml.org/lkml/2016/6/15/362

Grygorii Strashko (15):
  drivers: net: cpsw: fix suspend when all ethX devices are down
  drivers: net: cpsw: check return code from pm runtime calls
  drivers: net: cpsw: remove pm runtime calls from suspend callbacks
  drivers: net: cpsw: ethtool: fix accessing to suspended device
  drivers: net: cpsw: ndev: fix accessing to suspended device
  drivers: net: davinci_mdio: do pm runtime initialization later in probe
  drivers: net: davinci_mdio: remove pm runtime calls from suspend callbacks
  drivers: net: davinci_mdio: drop suspended and lock fields from mdio_data
  drivers: net: davinci_mdio: split reset function on init_clk and enable
  drivers: net: davinci_mdio: add pm runtime callbacks
  drivers: net: davinci_mdio: implement pm runtime auto mode
  net: davinci_mdio: document missed "ti,am4372-mdio" compat string
  net: davinci_mdio: introduce "ti,cpsw-mdio" compat string
  drivers: net: davinci_mdio: enable pm runtime auto for ti cpsw-mdio
  ARM: dts: am335x/am437x/dra7: use new "ti,cpsw-mdio" compat string

 .../devicetree/bindings/net/davinci-mdio.txt   |   5 +-
 arch/arm/boot/dts/am33xx.dtsi  |   2 +-
 arch/arm/boot/dts/am4372.dtsi  |   2 +-
 arch/arm/boot/dts/dra7.dtsi|   2 +-
 drivers/net/ethernet/ti/cpsw.c |  79 --
 drivers/net/ethernet/ti/davinci_mdio.c | 169 +
 6 files changed, 182 insertions(+), 77 deletions(-)

-- 
2.9.0

[PATCH v2 03/15] drivers: net: cpsw: remove pm runtime calls from suspend callbacks

2016-06-24 Thread Grygorii Strashko

PM runtime is properly handled in cpsw_ndo_open/stop(), as a result it
isn't required to duplicate these calls in .suspend()/.resume()
callbacks. Moreover, it might cause unnecessary RPM resume of CPSW
during System suspend in the case it's already suspended because
all ethX interfaces are down already, before System suspend started.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index c76f9db..ba81d4e 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -2573,8 +2573,6 @@ static int cpsw_suspend(struct device *dev)
cpsw_ndo_stop(ndev);
}
 
-   pm_runtime_put_sync(&pdev->dev);
-
/* Select sleep pin state */
pinctrl_pm_select_sleep_state(&pdev->dev);
 
@@ -2587,8 +2585,6 @@ static int cpsw_resume(struct device *dev)
struct net_device   *ndev = platform_get_drvdata(pdev);
struct cpsw_priv*priv = netdev_priv(ndev);
 
-   pm_runtime_get_sync(&pdev->dev);
-
/* Select default pin state */
pinctrl_pm_select_default_state(&pdev->dev);
 
-- 
2.9.0

[PATCH v2 05/15] drivers: net: cpsw: ndev: fix accessing to suspended device

2016-06-24 Thread Grygorii Strashko

The CPSW might be suspended by RPM if all ethX interfaces are down,
but it still could be accessible through net_device_ops interface. In
this case net_device_ops operations requiring registers access will
cause L3 errors and CPSW crash.

Hence, fix it by adding RPM get/put calls in net_device_ops callbacks
which need to access CPSW registers: .ndo_set_mac_address(),
.ndo_vlan_rx_add_vid(), .ndo_vlan_rx_kill_vid().

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 33 ++---
 1 file changed, 30 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index 5fea986..33f9957 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1616,10 +1616,17 @@ static int cpsw_ndo_set_mac_address(struct net_device 
*ndev, void *p)
struct sockaddr *addr = (struct sockaddr *)p;
int flags = 0;
u16 vid = 0;
+   int ret;
 
if (!is_valid_ether_addr(addr->sa_data))
return -EADDRNOTAVAIL;
 
+   ret = pm_runtime_get_sync(&priv->pdev->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(&priv->pdev->dev);
+   return ret;
+   }
+
if (priv->data.dual_emac) {
vid = priv->slaves[priv->emac_port].port_vlan;
flags = ALE_VLAN;
@@ -1634,6 +1641,8 @@ static int cpsw_ndo_set_mac_address(struct net_device 
*ndev, void *p)
memcpy(ndev->dev_addr, priv->mac_addr, ETH_ALEN);
for_each_slave(priv, cpsw_set_slave_mac, priv);
 
+   pm_runtime_put(&priv->pdev->dev);
+
return 0;
 }
 
@@ -1698,10 +1707,17 @@ static int cpsw_ndo_vlan_rx_add_vid(struct net_device 
*ndev,
__be16 proto, u16 vid)
 {
struct cpsw_priv *priv = netdev_priv(ndev);
+   int ret;
 
if (vid == priv->data.default_vlan)
return 0;
 
+   ret = pm_runtime_get_sync(&priv->pdev->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(&priv->pdev->dev);
+   return ret;
+   }
+
if (priv->data.dual_emac) {
/* In dual EMAC, reserved VLAN id should not be used for
 * creating VLAN interfaces as this can break the dual
@@ -1716,7 +1732,10 @@ static int cpsw_ndo_vlan_rx_add_vid(struct net_device 
*ndev,
}
 
dev_info(priv->dev, "Adding vlanid %d to vlan filter\n", vid);
-   return cpsw_add_vlan_ale_entry(priv, vid);
+   ret = cpsw_add_vlan_ale_entry(priv, vid);
+
+   pm_runtime_put(&priv->pdev->dev);
+   return ret;
 }
 
 static int cpsw_ndo_vlan_rx_kill_vid(struct net_device *ndev,
@@ -1728,6 +1747,12 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device 
*ndev,
if (vid == priv->data.default_vlan)
return 0;
 
+   ret = pm_runtime_get_sync(&priv->pdev->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(&priv->pdev->dev);
+   return ret;
+   }
+
if (priv->data.dual_emac) {
int i;
 
@@ -1747,8 +1772,10 @@ static int cpsw_ndo_vlan_rx_kill_vid(struct net_device 
*ndev,
if (ret != 0)
return ret;
 
-   return cpsw_ale_del_mcast(priv->ale, priv->ndev->broadcast,
- 0, ALE_VLAN, vid);
+   ret = cpsw_ale_del_mcast(priv->ale, priv->ndev->broadcast,
+0, ALE_VLAN, vid);
+   pm_runtime_put(&priv->pdev->dev);
+   return ret;
 }
 
 static const struct net_device_ops cpsw_netdev_ops = {
-- 
2.9.0

[PATCH v2 11/15] drivers: net: davinci_mdio: implement pm runtime auto mode

2016-06-24 Thread Grygorii Strashko

Davinci MDIO is always used as slave device which services
read/write requests from MDIO/PHY core. It doesn't use IRQ also.

As a result, it's possible to relax PM runtime constraints for Davinci
MDIO and enable it on demand, instead of powering it during probe
and powering off during removal.

Hence, implement PM runtime autosuspend for Davinci MDIO, but keep it
disabled by default, because Davinci MDIO is integrated in big set of
TI devices and not all of them are expected to work correctly with RPM
 autosuspend enabled:
- expected to work on SoCs where MDIO is part of TI CPSW
(cpsw.c DRA7/am57x, am437x, am335x, dm814x)
- not verified on Keystone 2 and other SoCs where MDIO is used with TI EMAC IP
(davinci_emac.c:  dm6467-emac, am3517-emac, dm816-emac).

Davinci MDIO RPM autosuspend can be enabled through sysfs:
 echo 100 > 
/sys/devices/../48484000.ethernet/48485000.mdio/power/autosuspend_delay_ms

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/davinci_mdio.c | 48 +++---
 1 file changed, 39 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c 
b/drivers/net/ethernet/ti/davinci_mdio.c
index 13f5080..ce3ec42 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -93,6 +93,7 @@ struct davinci_mdio_data {
struct clk  *clk;
struct device   *dev;
struct mii_bus  *bus;
+   boolactive_in_suspend;
unsigned long   access_time; /* jiffies */
/* Indicates that driver shouldn't modify phy_mask in case
 * if MDIO bus is registered from DT.
@@ -141,8 +142,13 @@ static int davinci_mdio_reset(struct mii_bus *bus)
 {
struct davinci_mdio_data *data = bus->priv;
u32 phy_mask, ver;
+   int ret;
 
-   davinci_mdio_enable(data);
+   ret = pm_runtime_get_sync(data->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(data->dev);
+   return ret;
+   }
 
/* wait for scan logic to settle */
msleep(PHY_MAX_ADDR * data->access_time);
@@ -153,7 +159,7 @@ static int davinci_mdio_reset(struct mii_bus *bus)
 (ver >> 8) & 0xff, ver & 0xff);
 
if (data->skip_scan)
-   return 0;
+   goto done;
 
/* get phy mask from the alive register */
phy_mask = __raw_readl(&data->regs->alive);
@@ -168,6 +174,10 @@ static int davinci_mdio_reset(struct mii_bus *bus)
}
data->bus->phy_mask = phy_mask;
 
+done:
+   pm_runtime_mark_last_busy(data->dev);
+   pm_runtime_put_autosuspend(data->dev);
+
return 0;
 }
 
@@ -228,6 +238,12 @@ static int davinci_mdio_read(struct mii_bus *bus, int 
phy_id, int phy_reg)
if (phy_reg & ~PHY_REG_MASK || phy_id & ~PHY_ID_MASK)
return -EINVAL;
 
+   ret = pm_runtime_get_sync(data->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(data->dev);
+   return ret;
+   }
+
reg = (USERACCESS_GO | USERACCESS_READ | (phy_reg << 21) |
   (phy_id << 16));
 
@@ -251,6 +267,8 @@ static int davinci_mdio_read(struct mii_bus *bus, int 
phy_id, int phy_reg)
break;
}
 
+   pm_runtime_mark_last_busy(data->dev);
+   pm_runtime_put_autosuspend(data->dev);
return ret;
 }
 
@@ -264,6 +282,12 @@ static int davinci_mdio_write(struct mii_bus *bus, int 
phy_id,
if (phy_reg & ~PHY_REG_MASK || phy_id & ~PHY_ID_MASK)
return -EINVAL;
 
+   ret = pm_runtime_get_sync(data->dev);
+   if (ret < 0) {
+   pm_runtime_put_noidle(data->dev);
+   return ret;
+   }
+
reg = (USERACCESS_GO | USERACCESS_WRITE | (phy_reg << 21) |
   (phy_id << 16) | (phy_data & USERACCESS_DATA));
 
@@ -282,7 +306,10 @@ static int davinci_mdio_write(struct mii_bus *bus, int 
phy_id,
break;
}
 
-   return 0;
+   pm_runtime_mark_last_busy(data->dev);
+   pm_runtime_put_autosuspend(data->dev);
+
+   return ret;
 }
 
 #if IS_ENABLED(CONFIG_OF)
@@ -357,8 +384,9 @@ static int davinci_mdio_probe(struct platform_device *pdev)
 
davinci_mdio_init_clk(data);
 
+   pm_runtime_set_autosuspend_delay(&pdev->dev, -1);
+   pm_runtime_use_autosuspend(&pdev->dev);
pm_runtime_enable(&pdev->dev);
-   pm_runtime_get_sync(&pdev->dev);
 
/* register the mii bus
 * Create PHYs from DT only in case if PHY child nodes are explicitly
@@ -387,9 +415,8 @@ static int davinci_mdio_probe(struct platform_device *pdev)
return 0;
 
 bail_out:
-   pm_runtime_put_sync(&pdev->dev);
+   pm_runtime_dont_use_autosuspend(&pdev->dev);
pm_runtime_disable(&pdev->dev);
-
return ret;
 }
 
@@ -400,7 +427,7 @@ static int davinci_mdio_remove(struct platform_device *pdev)
if (data->bus)
mdiobus_unregister(data->bus);
 
-   pm_runtime_put_sync(&pdev->dev)

[PATCH v2 04/15] drivers: net: cpsw: ethtool: fix accessing to suspended device

2016-06-24 Thread Grygorii Strashko

The CPSW might be suspended by RPM if all ethX interfaces are down,
but it still could be accessible through ethtool interface. In this case
ethtool operations, requiring registers access, will cause L3 errors and
CPSW crash.

ethtool callbacks which need to access CPSW registers now:
.set_coalesce(), .get_ethtool_stats(), .set_pauseparam(), .get_regs()

Hence, fix it by adding .begin()/.complete() ethtool callbacks, which
will be called before/after each ethtool operation runs, and do CPSW
RPM handling in these callbacks. That way CPSW will be active while
handling ethtool requests.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/cpsw.c | 27 ++-
 1 file changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index ba81d4e..5fea986 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -1907,10 +1907,33 @@ static int cpsw_set_pauseparam(struct net_device *ndev,
priv->tx_pause = pause->tx_pause ? true : false;
 
for_each_slave(priv, _cpsw_adjust_link, priv, &link);
-
return 0;
 }
 
+static int cpsw_ethtool_op_begin(struct net_device *ndev)
+{
+   struct cpsw_priv *priv = netdev_priv(ndev);
+   int ret;
+
+   ret = pm_runtime_get_sync(&priv->pdev->dev);
+   if (ret < 0) {
+   cpsw_err(priv, drv, "ethtool begin failed %d\n", ret);
+   pm_runtime_put_noidle(&priv->pdev->dev);
+   }
+
+   return ret;
+}
+
+static void cpsw_ethtool_op_complete(struct net_device *ndev)
+{
+   struct cpsw_priv *priv = netdev_priv(ndev);
+   int ret;
+
+   ret = pm_runtime_put(&priv->pdev->dev);
+   if (ret < 0)
+   cpsw_err(priv, drv, "ethtool complete failed %d\n", ret);
+}
+
 static const struct ethtool_ops cpsw_ethtool_ops = {
.get_drvinfo= cpsw_get_drvinfo,
.get_msglevel   = cpsw_get_msglevel,
@@ -1930,6 +1953,8 @@ static const struct ethtool_ops cpsw_ethtool_ops = {
.set_wol= cpsw_set_wol,
.get_regs_len   = cpsw_get_regs_len,
.get_regs   = cpsw_get_regs,
+   .begin  = cpsw_ethtool_op_begin,
+   .complete   = cpsw_ethtool_op_complete,
 };
 
 static void cpsw_slave_init(struct cpsw_slave *slave, struct cpsw_priv *priv,
-- 
2.9.0

Re: [PATCH iptables 3/3] libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit

2016-06-24 Thread Vishwanath Pai

On 06/23/2016 06:25 AM, Pablo Neira Ayuso wrote:
> On Wed, Jun 01, 2016 at 08:17:59PM -0400, Vishwanath Pai wrote:
>> libxt_hashlimit: iptables-restore does not work as expected with xt_hashlimit
>>
>> Add the following iptables rule.
>>
>> $ iptables -A INPUT -m hashlimit --hashlimit-above 200/sec \
>>   --hashlimit-burst 5 --hashlimit-mode srcip --hashlimit-name hashlimit1 \
>>   --hashlimit-htable-expire 3 -j DROP
>>
>> $ iptables-save > save.txt
>>
>> Edit save.txt and change the value of --hashlimit-above to 300:
>>
>> -A INPUT -m hashlimit --hashlimit-above 300/sec --hashlimit-burst 5 \
>> --hashlimit-mode srcip --hashlimit-name hashlimit2 \
>> --hashlimit-htable-expire 3 -j DROP
>>
>> Now restore save.txt
>>
>> $ iptables-restore < save.txt
> 
> In this case, we don't end up with two rules, we actually get one
> single hashlimit rule, given the sequence you provide.
> 
> $ iptables-save > save.txt
> ... edit save.txt
> $ iptables-restore < save.txt
> 

Yes, we end up with just one rule, but the kernel data structure is not
updated. Userspace thinks the value is 300/s but in the kernel it is
still 200/s.

>> Now userspace thinks that the value of --hashlimit-above is 300 but it is
>> actually 200 in the kernel. This happens because when we add multiple
>> hash-limit rules with the same name they will share the same hashtable
>> internally. The kernel module tries to re-use the old hashtable without
>> updating the values.
>>
>> There are multiple problems here:
>> 1) We can add two iptables rules with the same name, but kernel does not
>>handle this well, one procfs file cannot work with two rules
>> 2) The second rule has no effect because the hashtable has values from
>>rule 1
>> 3) hashtable-restore does not work (as described above)
>>
>> To fix this I have made the following design change:
>> 1) If a second rule is added with the same name as an existing rule,
>>append a number when we create the procfs, for example hashlimit_1,
>>hashlimit_2 etc
>> 2) Two rules will not share the same hashtable unless they are similar in
>>every possible way
>> 3) This behavior has to be forced with a new userspace flag:
>>--hashlimit-enhanced-procfs, if this flag is not passed we default to
>>the old behavior. This is to make sure we do not break existing scripts
>>that rely on the existing behavior.
> 
> We discussed this in netdev0.1, and I think we agreed on adding a new
> option, something like --hashlimit-update that would force an update
> to the existing hashlimit internal state (that is identified by the
> hashlimit name).
> 
> I think the problem here is that you may want to update the internal
> state of an existing hashlimit object, and currently this is not
> actually happening.
> 
> With the explicit --hashlimit-update flag, from the kernel we really
> know that the user wants an update.
> 
> Let me know, thanks.
> 

Yes, I believe you had a discussion about this with Josh Hunt. This
patch does add a new option, but it is called -enhanced-procfs instead.
I am open to renaming this to something else. I chose this name because
this patch will affect the names of the procfs files when multiple rules
with the same name exist. This generally does not happen, but is a side
effect of the way we create these files. In the case of restore example
above - we get the call to "hashlimit_mt_check" for the new rule before
the old rule is deleted, so there is a short window where we have two
rules in the kernel with the same name.

Other than that, we are doing exactly what you said, but creating a new
entry in the hashtable instead of updating it. The previous entry will
automatically be removed when the old rule is flushed/deleted.

Users will see this new behavior only if the new option is passed,
otherwise we default to the old behavior. We are also doing this in rev2
so old userspace tools will not be affected.

Thanks,
Vishwanath

[PATCH v2 09/15] drivers: net: davinci_mdio: split reset function on init_clk and enable

2016-06-24 Thread Grygorii Strashko

The Davinci MDIO MDIO_CONTROL.CLKDIV can be calculated only once
during probe, hence split __davinci_mdio_reset() on
davinci_mdio_init_clk() and davinci_mdio_enable(). Initialize and
save CLKDIV in .probe(). Then just use saved value.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/davinci_mdio.c | 21 ++---
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c 
b/drivers/net/ethernet/ti/davinci_mdio.c
index b6d0059..b206fd3 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -98,9 +98,10 @@ struct davinci_mdio_data {
 * if MDIO bus is registered from DT.
 */
boolskip_scan;
+   u32 clk_div;
 };
 
-static void __davinci_mdio_reset(struct davinci_mdio_data *data)
+static void davinci_mdio_init_clk(struct davinci_mdio_data *data)
 {
u32 mdio_in, div, mdio_out_khz, access_time;
 
@@ -109,9 +110,7 @@ static void __davinci_mdio_reset(struct davinci_mdio_data 
*data)
if (div > CONTROL_MAX_DIV)
div = CONTROL_MAX_DIV;
 
-   /* set enable and clock divider */
-   __raw_writel(div | CONTROL_ENABLE, &data->regs->control);
-
+   data->clk_div = div;
/*
 * One mdio transaction consists of:
 *  32 bits of preamble
@@ -132,12 +131,18 @@ static void __davinci_mdio_reset(struct davinci_mdio_data 
*data)
data->access_time = 1;
 }
 
+static void davinci_mdio_enable(struct davinci_mdio_data *data)
+{
+   /* set enable and clock divider */
+   __raw_writel(data->clk_div | CONTROL_ENABLE, &data->regs->control);
+}
+
 static int davinci_mdio_reset(struct mii_bus *bus)
 {
struct davinci_mdio_data *data = bus->priv;
u32 phy_mask, ver;
 
-   __davinci_mdio_reset(data);
+   davinci_mdio_enable(data);
 
/* wait for scan logic to settle */
msleep(PHY_MAX_ADDR * data->access_time);
@@ -188,7 +193,7 @@ static inline int wait_for_user_access(struct 
davinci_mdio_data *data)
 * operation
 */
dev_warn(data->dev, "resetting idled controller\n");
-   __davinci_mdio_reset(data);
+   davinci_mdio_enable(data);
return -EAGAIN;
}
 
@@ -350,6 +355,8 @@ static int davinci_mdio_probe(struct platform_device *pdev)
if (IS_ERR(data->regs))
return PTR_ERR(data->regs);
 
+   davinci_mdio_init_clk(data);
+
pm_runtime_enable(&pdev->dev);
pm_runtime_get_sync(&pdev->dev);
 
@@ -425,7 +432,7 @@ static int davinci_mdio_resume(struct device *dev)
pinctrl_pm_select_default_state(dev);
 
/* restart the scan state machine */
-   __davinci_mdio_reset(data);
+   davinci_mdio_enable(data);
 
return 0;
 }
-- 
2.9.0

[PATCH v2 07/15] drivers: net: davinci_mdio: remove pm runtime calls from suspend callbacks

2016-06-24 Thread Grygorii Strashko

PM runtime is disabled when Davinci MDIO .suspend_late() and
.resume_early() callbacks are called. As a result, any PM runtime calls here will
be just a nop and can be removed.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/davinci_mdio.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c 
b/drivers/net/ethernet/ti/davinci_mdio.c
index 2e19dd1..291c42e 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -436,7 +436,6 @@ static int davinci_mdio_suspend(struct device *dev)
 
data->suspended = true;
spin_unlock(&data->lock);
-   pm_runtime_put_sync(data->dev);
 
/* Select sleep pin state */
pinctrl_pm_select_sleep_state(dev);
@@ -451,8 +450,6 @@ static int davinci_mdio_resume(struct device *dev)
/* Select default pin state */
pinctrl_pm_select_default_state(dev);
 
-   pm_runtime_get_sync(data->dev);
-
spin_lock(&data->lock);
/* restart the scan state machine */
__davinci_mdio_reset(data);
-- 
2.9.0

[PATCH v2 10/15] drivers: net: davinci_mdio: add pm runtime callbacks

2016-06-24 Thread Grygorii Strashko

Add PM runtime .runtime_suspend()/.runtime_resume() callbacks and
perform Davinci MDIO enabling/disabling from these callbacks. This
allows to reuse pm_runtime_force_suspend/resume() APIs during System
suspend and required for further implementation of PM runtime
autosuspend.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/davinci_mdio.c | 31 +++
 1 file changed, 27 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c 
b/drivers/net/ethernet/ti/davinci_mdio.c
index b206fd3..13f5080 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -406,8 +406,8 @@ static int davinci_mdio_remove(struct platform_device *pdev)
return 0;
 }
 
-#ifdef CONFIG_PM_SLEEP
-static int davinci_mdio_suspend(struct device *dev)
+#ifdef CONFIG_PM
+static int davinci_mdio_runtime_suspend(struct device *dev)
 {
struct davinci_mdio_data *data = dev_get_drvdata(dev);
u32 ctrl;
@@ -418,6 +418,28 @@ static int davinci_mdio_suspend(struct device *dev)
__raw_writel(ctrl, &data->regs->control);
wait_for_idle(data);
 
+   return 0;
+}
+
+static int davinci_mdio_runtime_resume(struct device *dev)
+{
+   struct davinci_mdio_data *data = dev_get_drvdata(dev);
+
+   davinci_mdio_enable(data);
+   return 0;
+}
+#endif
+
+#ifdef CONFIG_PM_SLEEP
+static int davinci_mdio_suspend(struct device *dev)
+{
+   struct davinci_mdio_data *data = dev_get_drvdata(dev);
+   int ret = 0;
+
+   ret = pm_runtime_force_suspend(dev);
+   if (ret < 0)
+   return ret;
+
/* Select sleep pin state */
pinctrl_pm_select_sleep_state(dev);
 
@@ -431,14 +453,15 @@ static int davinci_mdio_resume(struct device *dev)
/* Select default pin state */
pinctrl_pm_select_default_state(dev);
 
-   /* restart the scan state machine */
-   davinci_mdio_enable(data);
+   pm_runtime_force_resume(dev);
 
return 0;
 }
 #endif
 
 static const struct dev_pm_ops davinci_mdio_pm_ops = {
+   SET_RUNTIME_PM_OPS(davinci_mdio_runtime_suspend,
+  davinci_mdio_runtime_resume, NULL)
SET_LATE_SYSTEM_SLEEP_PM_OPS(davinci_mdio_suspend, davinci_mdio_resume)
 };
 
-- 
2.9.0

[PATCH v2 06/15] drivers: net: davinci_mdio: do pm runtime initialization later in probe

2016-06-24 Thread Grygorii Strashko

Do PM runtime initialization later in probe - this allows to simplify
error handling a bit.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/davinci_mdio.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c 
b/drivers/net/ethernet/ti/davinci_mdio.c
index 4e7c9b9..2e19dd1 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -356,14 +356,10 @@ static int davinci_mdio_probe(struct platform_device 
*pdev)
data->bus->parent   = dev;
data->bus->priv = data;
 
-   pm_runtime_enable(&pdev->dev);
-   pm_runtime_get_sync(&pdev->dev);
data->clk = devm_clk_get(dev, "fck");
if (IS_ERR(data->clk)) {
dev_err(dev, "failed to get device clock\n");
-   ret = PTR_ERR(data->clk);
-   data->clk = NULL;
-   goto bail_out;
+   return PTR_ERR(data->clk);
}
 
dev_set_drvdata(dev, data);
@@ -372,10 +368,11 @@ static int davinci_mdio_probe(struct platform_device 
*pdev)
 
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
data->regs = devm_ioremap_resource(dev, res);
-   if (IS_ERR(data->regs)) {
-   ret = PTR_ERR(data->regs);
-   goto bail_out;
-   }
+   if (IS_ERR(data->regs))
+   return PTR_ERR(data->regs);
+
+   pm_runtime_enable(&pdev->dev);
+   pm_runtime_get_sync(&pdev->dev);
 
/* register the mii bus
 * Create PHYs from DT only in case if PHY child nodes are explicitly
-- 
2.9.0

[PATCH v2 13/15] net: davinci_mdio: introduce "ti,cpsw-mdio" compat string

2016-06-24 Thread Grygorii Strashko

Introduce "ti,cpsw-mdio" compatible string for Davinci MDIO, because
it's required to distinguish the case when MDIO is part of TI CPSW to
enable features supported by TI CPSW (for example, enable PM
management).

Signed-off-by: Grygorii Strashko 
---
 Documentation/devicetree/bindings/net/davinci-mdio.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/Documentation/devicetree/bindings/net/davinci-mdio.txt 
b/Documentation/devicetree/bindings/net/davinci-mdio.txt
index b6a4f48..621156c 100644
--- a/Documentation/devicetree/bindings/net/davinci-mdio.txt
+++ b/Documentation/devicetree/bindings/net/davinci-mdio.txt
@@ -4,6 +4,7 @@ TI SoC Davinci/Keystone2 MDIO Controller Device Tree Bindings
 Required properties:
 - compatible   : Should be "ti,davinci_mdio"
  and "ti,keystone_mdio" for Keystone 2 SoCs
+ and "ti,cpsw-mdio" for am335x, am472x, am57xx/dra7, 
dm814x SoCs
  and "ti,am4372-mdio" for am472x SoC
 - reg  : physical base address and size of the davinci mdio
  registers map
-- 
2.9.0

[PATCH v2 08/15] drivers: net: davinci_mdio: drop suspended and lock fields from mdio_data

2016-06-24 Thread Grygorii Strashko

It's not expected Davinci MDIO to be accessible after its suspend
callbacks have been called:
 - all consumers of Davinci MDIO will stop/disconnect phys at Device
suspend stage;
 - all phys are expected to be suspended already by PHY/MDIO core;
 - MDIO locking is done by MDIO Bus code.

Hence, it's safe to drop "suspended" and "lock" fields from mdio_data.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/davinci_mdio.c | 30 --
 1 file changed, 30 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c 
b/drivers/net/ethernet/ti/davinci_mdio.c
index 291c42e..b6d0059 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -90,11 +90,9 @@ static const struct mdio_platform_data default_pdata = {
 struct davinci_mdio_data {
struct mdio_platform_data pdata;
struct davinci_mdio_regs __iomem *regs;
-   spinlock_t  lock;
struct clk  *clk;
struct device   *dev;
struct mii_bus  *bus;
-   boolsuspended;
unsigned long   access_time; /* jiffies */
/* Indicates that driver shouldn't modify phy_mask in case
 * if MDIO bus is registered from DT.
@@ -225,13 +223,6 @@ static int davinci_mdio_read(struct mii_bus *bus, int 
phy_id, int phy_reg)
if (phy_reg & ~PHY_REG_MASK || phy_id & ~PHY_ID_MASK)
return -EINVAL;
 
-   spin_lock(&data->lock);
-
-   if (data->suspended) {
-   spin_unlock(&data->lock);
-   return -ENODEV;
-   }
-
reg = (USERACCESS_GO | USERACCESS_READ | (phy_reg << 21) |
   (phy_id << 16));
 
@@ -255,8 +246,6 @@ static int davinci_mdio_read(struct mii_bus *bus, int 
phy_id, int phy_reg)
break;
}
 
-   spin_unlock(&data->lock);
-
return ret;
 }
 
@@ -270,13 +259,6 @@ static int davinci_mdio_write(struct mii_bus *bus, int 
phy_id,
if (phy_reg & ~PHY_REG_MASK || phy_id & ~PHY_ID_MASK)
return -EINVAL;
 
-   spin_lock(&data->lock);
-
-   if (data->suspended) {
-   spin_unlock(&data->lock);
-   return -ENODEV;
-   }
-
reg = (USERACCESS_GO | USERACCESS_WRITE | (phy_reg << 21) |
   (phy_id << 16) | (phy_data & USERACCESS_DATA));
 
@@ -295,8 +277,6 @@ static int davinci_mdio_write(struct mii_bus *bus, int 
phy_id,
break;
}
 
-   spin_unlock(&data->lock);
-
return 0;
 }
 
@@ -364,7 +344,6 @@ static int davinci_mdio_probe(struct platform_device *pdev)
 
dev_set_drvdata(dev, data);
data->dev = dev;
-   spin_lock_init(&data->lock);
 
res = platform_get_resource(pdev, IORESOURCE_MEM, 0);
data->regs = devm_ioremap_resource(dev, res);
@@ -426,17 +405,12 @@ static int davinci_mdio_suspend(struct device *dev)
struct davinci_mdio_data *data = dev_get_drvdata(dev);
u32 ctrl;
 
-   spin_lock(&data->lock);
-
/* shutdown the scan state machine */
ctrl = __raw_readl(&data->regs->control);
ctrl &= ~CONTROL_ENABLE;
__raw_writel(ctrl, &data->regs->control);
wait_for_idle(data);
 
-   data->suspended = true;
-   spin_unlock(&data->lock);
-
/* Select sleep pin state */
pinctrl_pm_select_sleep_state(dev);
 
@@ -450,13 +424,9 @@ static int davinci_mdio_resume(struct device *dev)
/* Select default pin state */
pinctrl_pm_select_default_state(dev);
 
-   spin_lock(&data->lock);
/* restart the scan state machine */
__davinci_mdio_reset(data);
 
-   data->suspended = false;
-   spin_unlock(&data->lock);
-
return 0;
 }
 #endif
-- 
2.9.0

[PATCH v2 12/15] net: davinci_mdio: document missed "ti,am4372-mdio" compat string

2016-06-24 Thread Grygorii Strashko

Document missed "ti,am4372-mdio" compat string used for TI am437x SoC
(am4372.dtsi).

Signed-off-by: Grygorii Strashko 
---
 Documentation/devicetree/bindings/net/davinci-mdio.txt | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/davinci-mdio.txt 
b/Documentation/devicetree/bindings/net/davinci-mdio.txt
index 0369e25..b6a4f48 100644
--- a/Documentation/devicetree/bindings/net/davinci-mdio.txt
+++ b/Documentation/devicetree/bindings/net/davinci-mdio.txt
@@ -2,7 +2,9 @@ TI SoC Davinci/Keystone2 MDIO Controller Device Tree Bindings
 ---
 
 Required properties:
-- compatible   : Should be "ti,davinci_mdio" or "ti,keystone_mdio"
+- compatible   : Should be "ti,davinci_mdio"
+ and "ti,keystone_mdio" for Keystone 2 SoCs
+ and "ti,am4372-mdio" for am472x SoC
 - reg  : physical base address and size of the davinci mdio
  registers map
 - bus_freq : Mdio Bus frequency
-- 
2.9.0

[PATCH v2 14/15] drivers: net: davinci_mdio: enable pm runtime auto for ti cpsw-mdio

2016-06-24 Thread Grygorii Strashko

Use "ti,cpsw-mdio" to enable PM runtime auto-suspend on supported
platforms, where MDIO is implemented as part of TI CPSW.

Signed-off-by: Grygorii Strashko 
---
 drivers/net/ethernet/ti/davinci_mdio.c | 45 +-
 1 file changed, 34 insertions(+), 11 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_mdio.c 
b/drivers/net/ethernet/ti/davinci_mdio.c
index ce3ec42..33df340 100644
--- a/drivers/net/ethernet/ti/davinci_mdio.c
+++ b/drivers/net/ethernet/ti/davinci_mdio.c
@@ -53,6 +53,10 @@
 
 #define DEF_OUT_FREQ   220 /* 2.2 MHz */
 
+struct davinci_mdio_of_param {
+   int autosuspend_delay_ms;
+};
+
 struct davinci_mdio_regs {
u32 version;
u32 control;
@@ -332,6 +336,19 @@ static int davinci_mdio_probe_dt(struct mdio_platform_data 
*data,
 }
 #endif
 
+#if IS_ENABLED(CONFIG_OF)
+static const struct davinci_mdio_of_param of_cpsw_mdio_data = {
+   .autosuspend_delay_ms = 100,
+};
+
+static const struct of_device_id davinci_mdio_of_mtable[] = {
+   { .compatible = "ti,davinci_mdio", },
+   { .compatible = "ti,cpsw-mdio", .data = &of_cpsw_mdio_data},
+   { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, davinci_mdio_of_mtable);
+#endif
+
 static int davinci_mdio_probe(struct platform_device *pdev)
 {
struct mdio_platform_data *pdata = dev_get_platdata(&pdev->dev);
@@ -340,6 +357,7 @@ static int davinci_mdio_probe(struct platform_device *pdev)
struct resource *res;
struct phy_device *phy;
int ret, addr;
+   int autosuspend_delay_ms = -1;
 
data = devm_kzalloc(dev, sizeof(*data), GFP_KERNEL);
if (!data)
@@ -352,9 +370,22 @@ static int davinci_mdio_probe(struct platform_device *pdev)
}
 
if (dev->of_node) {
-   if (davinci_mdio_probe_dt(&data->pdata, pdev))
-   data->pdata = default_pdata;
+   const struct of_device_id   *of_id;
+
+   ret = davinci_mdio_probe_dt(&data->pdata, pdev);
+   if (ret)
+   return ret;
snprintf(data->bus->id, MII_BUS_ID_SIZE, "%s", pdev->name);
+
+   of_id = of_match_device(davinci_mdio_of_mtable, &pdev->dev);
+   if (of_id) {
+   const struct davinci_mdio_of_param *of_mdio_data;
+
+   of_mdio_data = of_id->data;
+   if (of_mdio_data)
+   autosuspend_delay_ms =
+   of_mdio_data->autosuspend_delay_ms;
+   }
} else {
data->pdata = pdata ? (*pdata) : default_pdata;
snprintf(data->bus->id, MII_BUS_ID_SIZE, "%s-%x",
@@ -384,7 +415,7 @@ static int davinci_mdio_probe(struct platform_device *pdev)
 
davinci_mdio_init_clk(data);
 
-   pm_runtime_set_autosuspend_delay(&pdev->dev, -1);
+   pm_runtime_set_autosuspend_delay(&pdev->dev, autosuspend_delay_ms);
pm_runtime_use_autosuspend(&pdev->dev);
pm_runtime_enable(&pdev->dev);
 
@@ -495,14 +526,6 @@ static const struct dev_pm_ops davinci_mdio_pm_ops = {
SET_LATE_SYSTEM_SLEEP_PM_OPS(davinci_mdio_suspend, davinci_mdio_resume)
 };
 
-#if IS_ENABLED(CONFIG_OF)
-static const struct of_device_id davinci_mdio_of_mtable[] = {
-   { .compatible = "ti,davinci_mdio", },
-   { /* sentinel */ },
-};
-MODULE_DEVICE_TABLE(of, davinci_mdio_of_mtable);
-#endif
-
 static struct platform_driver davinci_mdio_driver = {
.driver = {
.name= "davinci_mdio",
-- 
2.9.0

[PATCH v2 15/15] ARM: dts: am335x/am437x/dra7: use new "ti,cpsw-mdio" compat string

2016-06-24 Thread Grygorii Strashko

Add "ti,cpsw-mdio" for am335x/am437x/dra7 SoCs where MDIO is
implemented as part of TI CPSW and, this way, enable PM runtime auto
suspend for Davinci MDIO driver on these platforms.

Signed-off-by: Grygorii Strashko 
---
 arch/arm/boot/dts/am33xx.dtsi | 2 +-
 arch/arm/boot/dts/am4372.dtsi | 2 +-
 arch/arm/boot/dts/dra7.dtsi   | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/arm/boot/dts/am33xx.dtsi b/arch/arm/boot/dts/am33xx.dtsi
index 52be48b..7fa9a1d 100644
--- a/arch/arm/boot/dts/am33xx.dtsi
+++ b/arch/arm/boot/dts/am33xx.dtsi
@@ -789,7 +789,7 @@
status = "disabled";
 
davinci_mdio: mdio@4a101000 {
-   compatible = "ti,davinci_mdio";
+   compatible = "ti,cpsw-mdio","ti,davinci_mdio";
#address-cells = <1>;
#size-cells = <0>;
ti,hwmods = "davinci_mdio";
diff --git a/arch/arm/boot/dts/am4372.dtsi b/arch/arm/boot/dts/am4372.dtsi
index 12fcde4..ea76fa7 100644
--- a/arch/arm/boot/dts/am4372.dtsi
+++ b/arch/arm/boot/dts/am4372.dtsi
@@ -636,7 +636,7 @@
syscon = <&scm_conf>;
 
davinci_mdio: mdio@4a101000 {
-   compatible = "ti,am4372-mdio","ti,davinci_mdio";
+   compatible = 
"ti,am4372-mdio","ti,cpsw-mdio","ti,davinci_mdio";
reg = <0x4a101000 0x100>;
#address-cells = <1>;
#size-cells = <0>;
diff --git a/arch/arm/boot/dts/dra7.dtsi b/arch/arm/boot/dts/dra7.dtsi
index 3a8f397..8275d2e 100644
--- a/arch/arm/boot/dts/dra7.dtsi
+++ b/arch/arm/boot/dts/dra7.dtsi
@@ -1663,7 +1663,7 @@
status = "disabled";
 
davinci_mdio: mdio@48485000 {
-   compatible = "ti,davinci_mdio";
+   compatible = "ti,cpsw-mdio","ti,davinci_mdio";
#address-cells = <1>;
#size-cells = <0>;
ti,hwmods = "davinci_mdio";
-- 
2.9.0

Re: [PATCH 3/8] wireless: ipw2200: fix old-style declaration

2016-06-24 Thread Stanislav Yakovlev

On 16 June 2016 at 17:52, Arnd Bergmann  wrote:
> Modern C standards expect the 'inline' keyword to come before the return
> type in a declaration, and we get a warning for this with "make W=1":
>
> drivers/net/wireless/intel/ipw2x00/ipw2200.c:4096:1: error: 'inline' is not 
> at beginning of declaration [-Werror=old-style-declaration]
>
> Signed-off-by: Arnd Bergmann 
> ---
>  drivers/net/wireless/intel/ipw2x00/ipw2200.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>

Acked-by: Stanislav Yakovlev 

Stanislav.

Re: [PATCH net-next 0/5] qed/qede: Tunnel hardware GRO support

2016-06-24 Thread Edward Cree

On 24/06/16 17:31, Tom Herbert wrote:
> Ed,
> Because you took this OT... ;-)
>
> LRO/GRO is the one of the five common offloads that has no generic
> analogue and requires protocol specific logic. For instance each
> IP-over-foo encapsulation needs kernel code for GRO, device/driver
> code for LRO. I think the answer here is to make both GRO and LRO to
> be user programmable via BPF.
I agree that the only way to make LRO generic is to go for hardware
BPF.  However, I think that's likely to cause a _lot_ of headaches to
implement and my hope is that we can instead get acceptable receive
performance from GRO, RSS, and maybe things like the skb bundling I
posted a while back.
For instance, if your 'source port hack' were to mix in the TNI as
well as the inner flow fields it already uses, I think that could
improve hash spreading and thus GRO would perform better.
Fundamentally I believe that robust, responsive hardware LRO is not
workable as the hardware would have to decide to hold onto packets in
the hope of merge candidates arriving soon after.  Whereas in the
software layer (GRO, bundling...), the packets are already coming in
bursts thanks to the way napi polling behaves.
But I'd love to be proved wrong :)  The 'hybrid' approach of using
bpf in hw to identify flows for sw to gro does seem plausible, maybe
having bpf to compute the rxhash is the answer?

-Ed

(disclaimer: definitely not speaking for my employer here, these are
my personal views only.)

Re: [PATCH] net: ethernet: ti: cpdma: switch to use genalloc

2016-06-24 Thread Grygorii Strashko

On 06/24/2016 07:15 PM, Lennart Sorensen wrote:
> On Fri, Jun 24, 2016 at 11:35:15AM +0530, Mugunthan V N wrote:
 +static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
 +{
 +if (!pool)
 +return;
 +
 +WARN_ON(pool->used_desc);
 +if (pool->cpumap) {
 +dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
 +  pool->phys);
 +} else {
 +iounmap(pool->iomap);
 +}
 +}
 +
>>> single if, brackets?
>>
>> if() has multiple line statement, so brackets are must.
> 
> It is line wrapped, it is still one statement.  And you can't argue the
> else being multiple lines, although the style does require using brackets
> for the else if the if required them.
> 
> Style says "Do not unnecessarily use braces where a single statement will do."
> It says statement, not line.  A multiline wrapped statement is still
> one statement.
> 
> I may personally hate the lack of brackets, but style wise it seems very
> clear that the linux kernel only uses brackets when required, which is
> only when there is more than one statement.  I prefer what you did,
> but not as much as I prefer consistency.
> 

Oh. nice :( So, seems, I'd need to send v3. Right?
By the way, this code hasn't been introduced by this patch - I've
just moved whole function from one place to another.

-- 
regards,
-grygorii

Re: [PATCH net-next 0/5] qed/qede: Tunnel hardware GRO support

2016-06-24 Thread Alexander Duyck

On Thu, Jun 23, 2016 at 10:20 PM, Yuval Mintz  wrote:
>>We already know of one firmware bug you guys have which makes
>>it clear that the bnx2x is not doing hardware assisted GRO it is doing
>>something else since it performs much worse than GRO if the MSS is
>>less than what it would be based on the MTU.
>
> It's a bit nitpicky, isn't it? Claiming this flaw means it's not GRO.
> I.e., you obviously wouldn't have claimed it became GRO if it
> was fixed.
>
> Not saying it makes a lot of difference, though.

The fact is LRO and GRO are two separate things.  Even without the bug
in the firmware I would still be saying they are two very different
things.  Your GRO implementation only supports a subset of the
features that GRO has in the hardware.  The way the kernel has
implemented things we should keep GRO and GSO symmetric if at all
possible.  There aren't currently any GSO hardware offloads so there
probably shouldn't be any GRO hardware offloads.  On the other hand
devices do support TSO hardware offloads so it would make sense to
match that up and support LRO as the hardware equivalent of GRO.

Anyway that is my opinion.  It may be nitpicky but I don't feel that we
should be re-purposing feature bits that were meant to be software
features to represent hardware ones.

- Alex

Re: [PATCH net-next 0/5] qed/qede: Tunnel hardware GRO support

2016-06-24 Thread Tom Herbert

On Fri, Jun 24, 2016 at 6:09 AM, Edward Cree  wrote:
> On 23/06/16 18:07, Alexander Duyck wrote:
>> I would prefer to see us extend LRO to support "close enough GRO"
>> instead of have us extend GRO to also include LRO.
> This reminds me of something I've been meaning to bring up (sorry for
> slightly OT, but it might turn out relevant after all).
> In sfc we have an (out-of-tree and likely staying that way) LRO that's
> entirely in software.  The only reason it exists is for users who want
> the 'permissive' merging behaviour of LRO, i.e. they don't need the
> guarantees of reversibility and by merging more stuff they can get
> slightly higher performance.
> I wonder if it would be a good idea for the GRO implementation to have
> some knobs to allow setting it to behave in this way.
> That would imply a scheme to define various GRO/SSR semantics, which
> then would also be a convenient interface for a driver to report the
> semantics of its hardware LRO if it has any.
> And it would make crystal clear that the difference between GRO and
> LRO is kernel vs hardware, rather than reversible vs not.
>
Ed,

Because you took this OT... ;-)

LRO/GRO is the one of the five common offloads that has no generic
analogue and requires protocol specific logic. For instance each
IP-over-foo encapsulation needs kernel code for GRO, device/driver
code for LRO. I think the answer here is to make both GRO and LRO to
be user programmable via BPF. That is, instead of needing to add code
or buy a new device to support every new protocol, we really just want
to write a program for it that runs in any environment. In the case of
LRO this becomes especially important since it resolves the "black
box" nature of LRO in devices, so design goals like ensuring LRO is
"close enough to GRO" become something we (the stack) have some
control over.

We've already moved GRO to be an attribute of UDP sockets, it is
not much of a stretch now to allow applications to define their own
GRO for their protocols (I'm thinking protocols like QUIC or TOU could
use this).

For programmable LRO I think the solution is to use XDP. For instance
protocol specific parsing would be done by the BPF program to identify
the flows, and the infrastructure would provide the backend handling.
The advantage of XDP for this is that it is not platform specific, so
programmable LRO could be implemented in the driver (probably
leveraging existing LRO solution like sfc), or it could be implemented
in HW using the exact same program (with HW support for
BPF/XDP). Since such a program allows arbitrary parsing and flow
lookup, we can match on specific n-tuples as needed to resolve the
UDP-encapsulation identification problem.

Tom

> -Ed

[RFT][PATCH 2/2] phy: bcm-ns2-pcie: Set missing .owner field in ns2_pci_phy_ops

2016-06-24 Thread Axel Lin

Add missing .owner field in ns2_pci_phy_ops, which is used for refcounting.
While at it, also make ns2_pci_phy_ops const as it's never modified.

Signed-off-by: Axel Lin 
---
 drivers/phy/phy-bcm-ns2-pcie.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/phy/phy-bcm-ns2-pcie.c b/drivers/phy/phy-bcm-ns2-pcie.c
index ee61772..4c7d11d 100644
--- a/drivers/phy/phy-bcm-ns2-pcie.c
+++ b/drivers/phy/phy-bcm-ns2-pcie.c
@@ -47,8 +47,9 @@ err:
return rc;
 }
 
-static struct phy_ops ns2_pci_phy_ops = {
+static const struct phy_ops ns2_pci_phy_ops = {
.init = ns2_pci_phy_init,
+   .owner = THIS_MODULE,
 };
 
 static int ns2_pci_phy_probe(struct mdio_device *mdiodev)
-- 
2.5.0

[RFT][PATCH 1/2] phy: bcm-ns2-pcie: Get rid of struct ns2_pci_phy

2016-06-24 Thread Axel Lin

By setting phy_set_drvdata(phy, mdiodev), struct ns2_pci_phy can be
removed.

Signed-off-by: Axel Lin 
---
I don't have this h/w. Appreciate if someone can test this patch series.
 drivers/phy/phy-bcm-ns2-pcie.c | 25 +
 1 file changed, 5 insertions(+), 20 deletions(-)

diff --git a/drivers/phy/phy-bcm-ns2-pcie.c b/drivers/phy/phy-bcm-ns2-pcie.c
index 9513f7a..ee61772 100644
--- a/drivers/phy/phy-bcm-ns2-pcie.c
+++ b/drivers/phy/phy-bcm-ns2-pcie.c
@@ -18,11 +18,6 @@
 #include 
 #include 
 
-struct ns2_pci_phy {
-   struct mdio_device *mdiodev;
-   struct phy *phy;
-};
-
 #define BLK_ADDR_REG_OFFSET0x1f
 #define PLL_AFE1_100MHZ_BLK0x2100
 #define PLL_CLK_AMP_OFFSET 0x03
@@ -30,17 +25,17 @@ struct ns2_pci_phy {
 
 static int ns2_pci_phy_init(struct phy *p)
 {
-   struct ns2_pci_phy *phy = phy_get_drvdata(p);
+   struct mdio_device *mdiodev = phy_get_drvdata(p);
int rc;
 
/* select the AFE 100MHz block page */
-   rc = mdiobus_write(phy->mdiodev->bus, phy->mdiodev->addr,
+   rc = mdiobus_write(mdiodev->bus, mdiodev->addr,
   BLK_ADDR_REG_OFFSET, PLL_AFE1_100MHZ_BLK);
if (rc)
goto err;
 
/* set the 100 MHz reference clock amplitude to 2.05 v */
-   rc = mdiobus_write(phy->mdiodev->bus, phy->mdiodev->addr,
+   rc = mdiobus_write(mdiodev->bus, mdiodev->addr,
   PLL_CLK_AMP_OFFSET, PLL_CLK_AMP_2P05V);
if (rc)
goto err;
@@ -48,7 +43,7 @@ static int ns2_pci_phy_init(struct phy *p)
return 0;
 
 err:
-   dev_err(&phy->mdiodev->dev, "Error %d writing to phy\n", rc);
+   dev_err(&mdiodev->dev, "Error %d writing to phy\n", rc);
return rc;
 }
 
@@ -60,7 +55,6 @@ static int ns2_pci_phy_probe(struct mdio_device *mdiodev)
 {
struct device *dev = &mdiodev->dev;
struct phy_provider *provider;
-   struct ns2_pci_phy *p;
struct phy *phy;
 
phy = devm_phy_create(dev, dev->of_node, &ns2_pci_phy_ops);
@@ -69,16 +63,7 @@ static int ns2_pci_phy_probe(struct mdio_device *mdiodev)
return PTR_ERR(phy);
}
 
-   p = devm_kmalloc(dev, sizeof(struct ns2_pci_phy),
-GFP_KERNEL);
-   if (!p)
-   return -ENOMEM;
-
-   p->mdiodev = mdiodev;
-   dev_set_drvdata(dev, p);
-
-   p->phy = phy;
-   phy_set_drvdata(phy, p);
+   phy_set_drvdata(phy, mdiodev);
 
provider = devm_of_phy_provider_register(&phy->dev,
 of_phy_simple_xlate);
-- 
2.5.0

Re: [REGRESSION, bisect]cxgb4 port failure with TSO traffic after commit 10d3be569243def8("tcp-tso: do not split TSO packets at retransmit time")

2016-06-24 Thread Eric Dumazet

On Fri, 2016-06-24 at 07:25 -0700, Eric Dumazet wrote:
> Please do not top post on netdev mailing list
> 
> On Fri, Jun 24, 2016 at 4:38 AM, Arjun V.  wrote:
> > Eric,
> > We are seeing skb's with length(skb->len) greater than 65536 coming into 
> > our ndo_start_xmit() callback routine.
> > We can add a check in our eth_xmit() routine to skip those packets, but it 
> > will be better if you fix this in kernel.
> >
> >
> > I have attached pcap file obtained from tcpdump. In the pcap file there are 
> > 2 such packets(I used tcpdump filter to extract out those packets).
> >
> > Let us know if you need anything else.
> >
> 
> Beats me really.
> 
> At retransmit time, we only can eventually reduce packet sizes
> (assuming GSO is used, because we might coalesce sub-mss packets in
> tcp_retrans_try_collapse())
> 
> So why are you seeing too big packets at retransmit, I really have no idea.
> 
> Might be some bug related to MSS computation, overflowing somehow ?

Could you try this ?

Thanks !

diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 
d39e9e47a26e55ad2b8f775bf9ea9dfb5b12aee5..27013056bcfb9aa49601806bb3aa55a1ac664873
 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -1177,6 +1177,7 @@ int inet_sk_rebuild_header(struct sock *sk)
 
/* Routing failed... */
sk->sk_route_caps = 0;
+   sk->sk_gso_max_segs = 1;
/*
 * Other protocols have to map its equivalent state to 
TCP_SYN_SENT.
 * DCCP maps its DCCP_REQUESTING state to TCP_SYN_SENT. -acme
diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
index 
2076c21107d07e4e78a0a29f1d374c3414b8e1bd..ecc0281acfb702b138c68ac51e3a0518052785b0
 100644
--- a/net/ipv6/af_inet6.c
+++ b/net/ipv6/af_inet6.c
@@ -685,6 +685,7 @@ int inet6_sk_rebuild_header(struct sock *sk)
dst = ip6_dst_lookup_flow(sk, &fl6, final_p);
if (IS_ERR(dst)) {
sk->sk_route_caps = 0;
+   sk->sk_gso_max_segs = 1;
sk->sk_err_soft = -PTR_ERR(dst);
return PTR_ERR(dst);
}

Re: [PATCH] net: ethernet: ti: cpdma: switch to use genalloc

2016-06-24 Thread Lennart Sorensen

On Fri, Jun 24, 2016 at 11:35:15AM +0530, Mugunthan V N wrote:
> >> +static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
> >> +{
> >> +if (!pool)
> >> +return;
> >> +
> >> +WARN_ON(pool->used_desc);
> >> +if (pool->cpumap) {
> >> +dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
> >> +  pool->phys);
> >> +} else {
> >> +iounmap(pool->iomap);
> >> +}
> >> +}
> >> +
> > single if, brackets?
> 
> if() has multiple line statement, so brackets are must.

It is line wrapped, it is still one statement.  And you can't argue the
else being multiple lines, although the style does require using brackets
for the else if the if required them.

Style says "Do not unnecessarily use braces where a single statement will do."
It says statement, not line.  A multiline wrapped statement is still
one statement.

I may personally hate the lack of brackets, but style wise it seems very
clear that the linux kernel only uses brackets when required, which is
only when there is more than one statement.  I prefer what you did,
but not as much as I prefer consistency.

-- 
Len Sorensen

Re: [PATCH v10 07/22] IB/hns: Add event queue support

2016-06-24 Thread Doug Ledford

On 06/24/2016 11:46 AM, Leon Romanovsky wrote:
> On Thu, Jun 16, 2016 at 10:35:15PM +0800, Lijun Ou wrote:

>>  /*ROCEE_REG DEFINITION/
>>  #define ROCEE_VENDOR_ID_REG 0x0
>> @@ -44,8 +93,29 @@
>>  #define ROCEE_SYS_IMAGE_GUID_L_REG  0xC
>>  #define ROCEE_SYS_IMAGE_GUID_H_REG  0x10
>>  
>> +#define ROCEE_CAEP_AEQE_CONS_IDX_REG0x3AC
>> +#define ROCEE_CAEP_CEQC_CONS_IDX_0_REG  0x3BC
>> +
>> +#define ROCEE_ECC_UCERR_ALM1_REG0xB38
>> +#define ROCEE_ECC_UCERR_ALM2_REG0xB3C
>> +#define ROCEE_ECC_CERR_ALM1_REG 0xB44
>> +#define ROCEE_ECC_CERR_ALM2_REG 0xB48
>> +
>>  #define ROCEE_ACK_DELAY_REG 0x14
>>  
>> +#define ROCEE_CAEP_CE_INTERVAL_CFG_REG  0x190
>> +#define ROCEE_CAEP_CE_BURST_NUM_CFG_REG 0x194
>> +
>>  #define ROCEE_MB1_REG   0x210
>>  
>> +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_REG  0x3A0
>> +#define ROCEE_CAEP_CEQC_SHIFT_0_REG 0x3B0
>> +#define ROCEE_CAEP_CE_IRQ_MASK_0_REG0x3C0
>> +#define ROCEE_CAEP_CEQ_ALM_OVF_0_REG0x3C4
>> +#define ROCEE_CAEP_AE_MASK_REG  0x6C8
>> +#define ROCEE_CAEP_AE_ST_REG0x6CC
>> +
>> +#define ROCEE_ECC_UCERR_ALM0_REG0xB34
>> +#define ROCEE_ECC_CERR_ALM0_REG 0xB40
> 
> Indentation

I'm pretty sure this indentation is fine.  When looking at patch file
contents, the beginning of the line is shifted one char to the right, so
when using tabs for indents, if the line would normally end at the final
char before the next tab, and it gets shifted one char by the patch
format, the contents on the other side of the indent get jumped an extra
tab spot.  You would have to look at it as a file to know for sure, but
I'm betting you can ignore all of these indent issues.


-- 
Doug Ledford 
  GPG KeyID: 0E572FDD




signature.asc
Description: OpenPGP digital signature

Re: [PATCH v10 00/22] Add HiSilicon RoCE driver

2016-06-24 Thread Leon Romanovsky

On Thu, Jun 16, 2016 at 10:35:08PM +0800, Lijun Ou wrote:
> The HiSilicon Network Subsystem is a long term evolution IP which is
> supposed to be used in HiSilicon ICT SoCs. HNS (HiSilicon Network
> Subsystem) also has hardware support for performing RDMA with
> RoCEE.
> The driver for HiSilicon RoCEE(RoCE Engine) is a platform driver and
> will support multiple versions of SOCs in future. This version of driver
> is meant to support Hip06 SoC(which conforms to RoCEv1 hardware
> specifications).
> 
> Changes v9 -> v10:

I stopped my review on patch 9.


signature.asc
Description: Digital signature

Re: [PATCH v10 09/22] IB/hns: Add hca support

2016-06-24 Thread Leon Romanovsky

On Thu, Jun 16, 2016 at 10:35:17PM +0800, Lijun Ou wrote:
> This patch mainly setup hca for RoCE. It will do a series of
> initial works, as follows:
> 1. init uar table, allocate uar resource
> 2. init pd table
> 3. init cq table
> 4. init mr table
> 5. init qp table
> 
> Signed-off-by: Wei Hu 
> Signed-off-by: Nenglong Zhao 
> Signed-off-by: Lijun Ou 
> ---
> PATCH v9:
> This fixes the comments given by Leon Romanovsky over the PATCH v8
>   Link: https://lkml.org/lkml/2016/6/9/67
> 
> PATCH v8/v7/v6:
>   - No change over the PATCH v5
> 
> PATCH v5:
> - The initial patch which was redesigned based on the second patch
>   in PATCH v4
> ---
> ---
>  drivers/infiniband/hw/hns/hns_roce_alloc.c  | 128 +
>  drivers/infiniband/hw/hns/hns_roce_cq.c |  17 +++
>  drivers/infiniband/hw/hns/hns_roce_device.h |  69 +
>  drivers/infiniband/hw/hns/hns_roce_icm.c|  88 
>  drivers/infiniband/hw/hns/hns_roce_icm.h|   7 +
>  drivers/infiniband/hw/hns/hns_roce_main.c   |  79 +++
>  drivers/infiniband/hw/hns/hns_roce_mr.c | 210 
> 
>  drivers/infiniband/hw/hns/hns_roce_pd.c |  88 
>  drivers/infiniband/hw/hns/hns_roce_qp.c |  30 
>  9 files changed, 716 insertions(+)
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_alloc.c
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_mr.c
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_pd.c
> 
> diff --git a/drivers/infiniband/hw/hns/hns_roce_alloc.c 
> b/drivers/infiniband/hw/hns/hns_roce_alloc.c
> new file mode 100644
> index 000..d2932c1
> --- /dev/null
> +++ b/drivers/infiniband/hw/hns/hns_roce_alloc.c
> @@ -0,0 +1,128 @@
> +/*
> + * Copyright (c) 2016 Hisilicon Limited.
> + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + *  - Redistributions of source code must retain the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer.
> + *
> + *  - Redistributions in binary form must reproduce the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer in the documentation and/or other materials
> + *provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "hns_roce_device.h"
> +
> +int hns_roce_bitmap_alloc(struct hns_roce_bitmap *bitmap, unsigned long *obj)
> +{
> + int ret = 0;
> +
> + spin_lock(&bitmap->lock);
> + *obj = find_next_zero_bit(bitmap->table, bitmap->max, bitmap->last);
> + if (*obj >= bitmap->max) {
> + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
> +& bitmap->mask;
> + *obj = find_first_zero_bit(bitmap->table, bitmap->max);
> + }
> +
> + if (*obj < bitmap->max) {
> + set_bit(*obj, bitmap->table);
> + bitmap->last = (*obj + 1);
> + if (bitmap->last == bitmap->max)
> + bitmap->last = 0;
> + *obj |= bitmap->top;
> + } else {
> + ret = -1;
> + }
> +
> + spin_unlock(&bitmap->lock);
> +
> + return ret;
> +}
> +
> +void hns_roce_bitmap_free(struct hns_roce_bitmap *bitmap, unsigned long obj)
> +{
> + hns_roce_bitmap_free_range(bitmap, obj, 1);
> +}
> +
> +void hns_roce_bitmap_free_range(struct hns_roce_bitmap *bitmap,
> + unsigned long obj, int cnt)
> +{
> + int i;
> +
> + obj &= bitmap->max + bitmap->reserved_top - 1;
> +
> + spin_lock(&bitmap->lock);
> + for (i = 0; i < cnt; i++)
> + clear_bit(obj + i, bitmap->table);
> +
> + bitmap->last = min(bitmap->last, obj);
> + bitmap->top = (bitmap->top + bitmap->max + bitmap->reserved_top)
> +& bitmap->mask;
> + spin_unlock(&bitmap->lock);
> +}
> +
> +int hns_roce_bitmap_init(struct hns_roce

Re: [PATCH v10 07/22] IB/hns: Add event queue support

2016-06-24 Thread Leon Romanovsky

On Thu, Jun 16, 2016 at 10:35:15PM +0800, Lijun Ou wrote:
> This patch added event queue support for RoCE driver. It is used
> for RoCE interrupt. RoCE includes 32 synchronous event irqs, 1
> asynchronous event irq and 1 common overflow irq.
> 
> Signed-off-by: Wei Hu 
> Signed-off-by: Nenglong Zhao 
> Signed-off-by: Lijun Ou 
> ---
> PATCH v9/v8:
> - No change over the PATCH v7
> 
> PATCH v7:
> This fixes the comments given by Doug Ledford over the PATCH v6:
>   Link: https://lkml.org/lkml/2016/5/13/510
> 
> PATCH v6:
> - No change over the PATCH v5
> 
> PATCH v5:
> - The initial patch which was redesigned based on the second patch
>   in PATCH v4
> ---
> ---
>  drivers/infiniband/hw/hns/hns_roce_cmd.c|  22 +
>  drivers/infiniband/hw/hns/hns_roce_common.h |  70 +++
>  drivers/infiniband/hw/hns/hns_roce_cq.c |  77 +++
>  drivers/infiniband/hw/hns/hns_roce_device.h | 135 +
>  drivers/infiniband/hw/hns/hns_roce_eq.c | 750 
> 
>  drivers/infiniband/hw/hns/hns_roce_eq.h | 130 +
>  drivers/infiniband/hw/hns/hns_roce_main.c   |  24 +
>  drivers/infiniband/hw/hns/hns_roce_qp.c |  63 +++
>  8 files changed, 1271 insertions(+)
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_cq.c
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_eq.c
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_eq.h
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_qp.c
> 
> diff --git a/drivers/infiniband/hw/hns/hns_roce_cmd.c 
> b/drivers/infiniband/hw/hns/hns_roce_cmd.c
> index 64e84fe..67b3137 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_cmd.c
> +++ b/drivers/infiniband/hw/hns/hns_roce_cmd.c
> @@ -45,6 +45,28 @@
>  
>  #define CMD_MAX_NUM  32
>  
> +static int hns_roce_status_to_errno(u8 orig_status)
> +{
> + if (orig_status == HNS_ROCE_CMD_SUCCESS)
> + return 0;
> + else
> + return -EIO;
> +}

1. Can orig_status be different from SUCCESS? You defined one enum only.
2. return (orig_status == HNS_ROCE_CMD_SUCCESS)?0:(-EIO);

> +
> +void hns_roce_cmd_event(struct hns_roce_dev *hr_dev, u16 token, u8 status,
> + u64 out_param)
> +{
> + struct hns_roce_cmd_context
> + *context = &hr_dev->cmd.context[token & hr_dev->cmd.token_mask];
> +
> + if (token != context->token)
> + return;
> +
> + context->result = hns_roce_status_to_errno(status);
> + context->out_param = out_param;
> + complete(&context->done);
> +}
> +
>  int hns_roce_cmd_init(struct hns_roce_dev *hr_dev)
>  {
>   struct device *dev = &hr_dev->pdev->dev;
> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h 
> b/drivers/infiniband/hw/hns/hns_roce_common.h
> index 595cda9..4805852 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_common.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
> @@ -33,7 +33,56 @@
>  #ifndef _HNS_ROCE_COMMON_H
>  #define _HNS_ROCE_COMMON_H
>  
> +#define roce_write(dev, reg, val)writel((val), (dev)->reg_base + (reg))
>  #define roce_read(dev, reg)  readl((dev)->reg_base + (reg))
> +#define roce_raw_write(value, addr) \
> + __raw_writel((__force u32)cpu_to_le32(value), (addr))
> +
> +#define roce_get_field(origin, mask, shift) \
> + (((origin) & (mask)) >> (shift))
> +
> +#define roce_get_bit(origin, shift) \
> + roce_get_field((origin), (1ul << (shift)), (shift))
> +
> +#define roce_set_field(origin, mask, shift, val) \
> + do { \
> + (origin) &= (~(mask)); \
> + (origin) |= (((u32)(val) << (shift)) & (mask)); \
> + } while (0)
> +
> +#define roce_set_bit(origin, shift, val) \
> + roce_set_field((origin), (1ul << (shift)), (shift), (val))
> +
> +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S 0
> +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_M   \
> + (((1UL << 2) - 1) << ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_STATE_S)
> +
> +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S 8
> +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_M   \
> + (((1UL << 4) - 1) << ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQC_AEQE_SHIFT_S)
> +
> +#define ROCEE_CAEP_AEQC_AEQE_SHIFT_CAEP_AEQ_ALM_OVF_INT_ST_S 17
> +
> +#define ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S 0
> +#define ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_M   \
> + (((1UL << 5) - 1) << ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQ_BT_H_S)
> +
> +#define ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S 16
> +#define ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_M   \
> + (((1UL << 16) - 1) << ROCEE_CAEP_AEQE_CUR_IDX_CAEP_AEQE_CUR_IDX_S)
> +
> +#define ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S 0
> +#define ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_M   \
> + (((1UL << 16) - 1) << ROCEE_CAEP_AEQE_CONS_IDX_CAEP_AEQE_CONS_IDX_S)
> +
> +#define ROCEE_CAEP_CEQC_SHIFT_CAEP_CEQ_ALM_OVF_INT_ST_S 16
> +#define ROCEE_CAEP_CE_IRQ_MASK_CAEP_CEQ_ALM_OVF_MASK_S 1
> +#define ROCEE_CAEP_CEQ_ALM_OVF_CAEP_CEQ_ALM_OVF_S 0
> +
> +#define ROCEE_CAEP_AE_MASK_CAEP_AEQ_A

Re: [PATCH v10 05/22] IB/hns: Add initial profile resource

2016-06-24 Thread Leon Romanovsky

On Thu, Jun 16, 2016 at 10:35:13PM +0800, Lijun Ou wrote:
> This patch added the operation for cmd, and added some functions
> for initializing eq table and selecting cmd mode.
> 
> Signed-off-by: Wei Hu 
> Signed-off-by: Nenglong Zhao 
> Signed-off-by: Lijun Ou 
> ---
> PATCH v9:
> This fixes the comments given by Leon Romanovsky over the PATCH v8:
>   Link: https://lkml.org/lkml/2016/6/9/65
> 
> PATCH v8/v7/v6:
> - No change over the PATCH v5
> 
> PATCH v5:
> - The initial patch which was redesigned based on the second patch
>   in PATCH v4
> ---
> ---
>  drivers/infiniband/hw/hns/hns_roce_common.h | 49 +++
>  drivers/infiniband/hw/hns/hns_roce_device.h | 55 -
>  drivers/infiniband/hw/hns/hns_roce_hw_v1.c  | 75 
> +
>  drivers/infiniband/hw/hns/hns_roce_hw_v1.h  | 36 ++
>  drivers/infiniband/hw/hns/hns_roce_main.c   |  7 +++
>  5 files changed, 221 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_common.h
> 
> diff --git a/drivers/infiniband/hw/hns/hns_roce_common.h 
> b/drivers/infiniband/hw/hns/hns_roce_common.h
> new file mode 100644
> index 000..4cc4761
> --- /dev/null
> +++ b/drivers/infiniband/hw/hns/hns_roce_common.h
> @@ -0,0 +1,49 @@
> +/*
> + * Copyright (c) 2016 Hisilicon Limited.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + *  - Redistributions of source code must retain the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer.
> + *
> + *  - Redistributions in binary form must reproduce the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer in the documentation and/or other materials
> + *provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef _HNS_ROCE_COMMON_H
> +#define _HNS_ROCE_COMMON_H
> +
> +#define roce_read(dev, reg)  readl((dev)->reg_base + (reg))
> +
> +/*ROCEE_REG DEFINITION/
> +#define ROCEE_VENDOR_ID_REG  0x0
> +#define ROCEE_VENDOR_PART_ID_REG 0x4
> +
> +#define ROCEE_HW_VERSION_REG 0x8
> +
> +#define ROCEE_SYS_IMAGE_GUID_L_REG   0xC
> +#define ROCEE_SYS_IMAGE_GUID_H_REG   0x10
> +
> +#define ROCEE_ACK_DELAY_REG  0x14
> +
> +#endif /* _HNS_ROCE_COMMON_H */
> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h 
> b/drivers/infiniband/hw/hns/hns_roce_device.h
> index b857c76..e01ea34 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
> @@ -45,6 +45,12 @@
>  #define DRV_NAME "hns_roce"
>  
>  #define HNS_ROCE_MAX_IRQ_NUM 34
> +
> +#define HNS_ROCE_COMP_VEC_NUM32
> +
> +#define HNS_ROCE_AEQE_VEC_NUM1
> +#define HNS_ROCE_AEQE_OF_VEC_NUM 1
> +
>  #define HNS_ROCE_MAX_PORTS   6
>  
>  struct hns_roce_ib_iboe {
> @@ -53,11 +59,52 @@ struct hns_roce_ib_iboe {
>  };
>  
>  struct hns_roce_caps {
> - u8  num_ports;
> + u64 fw_ver;
> + u8  num_ports;
> + int gid_table_len[HNS_ROCE_MAX_PORTS];
> + int pkey_table_len[HNS_ROCE_MAX_PORTS];
> + int local_ca_ack_delay;
> + int num_uars;
> + u32 phy_num_uars;
> + u32 max_sq_sg;  /* 2 */
> + u32 max_sq_inline;  /* 32 */
> + u32 max_rq_sg;  /* 2 */
> + int num_qps;/* 256k */
> + u32 max_wqes;   /* 16k */
> + u32 max_sq_desc_sz; /* 64 */
> + u32 max_rq_desc_sz; /* 64 */
> + int max_qp_init_rdma;
> + int max_qp_dest_rdma;
> + int sqp_start;
> + int num_cqs;
> + int max_cqes;
> + int reserved_cqs;
> + int num_aeq_vectors;

Re: [PATCH v10 04/22] IB/hns: Add RoCE engine reset function

2016-06-24 Thread Leon Romanovsky

On Thu, Jun 16, 2016 at 10:35:12PM +0800, Lijun Ou wrote:
> This patch mainly added reset flow of RoCE engine in RoCE
> driver. It is necessary when RoCE is loaded and removed.
> 
> Signed-off-by: Wei Hu 
> Signed-off-by: Nenglong Zhao 
> Signed-off-by: Lijun Ou 
> ---
> PATCH v9/v8:
> - No change over the PATCH v7
> 
> PATCH v7:
> This fixes the comments given by Leon Romanovsky over the PATCH v6:
>   Link: https://lkml.org/lkml/2016/5/3/733
> 
> PATCH v6:
> - No change over the PATCH v5
> 
> PATCH v5:
> - The initial patch which was redesigned based on the second patch
>   in PATCH v4
> ---
> ---
>  drivers/infiniband/hw/hns/hns_roce_device.h |  7 +++
>  drivers/infiniband/hw/hns/hns_roce_hw_v1.c  | 72 
> +
>  drivers/infiniband/hw/hns/hns_roce_hw_v1.h  | 40 
>  drivers/infiniband/hw/hns/hns_roce_main.c   | 17 ++-
>  4 files changed, 135 insertions(+), 1 deletion(-)
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_hw_v1.c
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_hw_v1.h
> 
> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h 
> b/drivers/infiniband/hw/hns/hns_roce_device.h
> index 946b470..b857c76 100644
> --- a/drivers/infiniband/hw/hns/hns_roce_device.h
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
> @@ -56,6 +56,10 @@ struct hns_roce_caps {
>   u8  num_ports;
>  };
>  
> +struct hns_roce_hw {
> + int (*reset)(struct hns_roce_dev *hr_dev, bool enable);
> +};
> +
>  struct hns_roce_dev {
>   struct ib_deviceib_dev;
>   struct platform_device  *pdev;
> @@ -68,6 +72,9 @@ struct hns_roce_dev {
>  
>   int cmd_mod;
>   int loop_idc;
> + struct hns_roce_hw  *hw;
>  };
>  
> +extern struct hns_roce_hw hns_roce_hw_v1;
> +
>  #endif /* _HNS_ROCE_DEVICE_H */
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.c 
> b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
> new file mode 100644
> index 000..198be3b
> --- /dev/null
> +++ b/drivers/infiniband/hw/hns/hns_roce_hw_v1.c
> @@ -0,0 +1,72 @@
> +/*
> + * Copyright (c) 2016 Hisilicon Limited.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + *  - Redistributions of source code must retain the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer.
> + *
> + *  - Redistributions in binary form must reproduce the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer in the documentation and/or other materials
> + *provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include "hns_roce_device.h"
> +#include "hns_roce_hw_v1.h"
> +
> +/**
> + * hns_roce_v1_reset - reset roce
> + * @hr_dev: roce device struct pointer
> + * @enable: true -- drop reset, false -- reset
> + * return 0 - success , negative --fail
> + */
> +int hns_roce_v1_reset(struct hns_roce_dev *hr_dev, bool enable)
> +{
> + struct device_node *dsaf_node;
> + struct device *dev = &hr_dev->pdev->dev;
> + struct device_node *np = dev->of_node;
> + int ret;
> +
> + dsaf_node = of_parse_phandle(np, "dsaf-handle", 0);
> +
> + if (!enable) {
> + ret = hns_dsaf_roce_reset(&dsaf_node->fwnode, false);
> + } else {
> + ret = hns_dsaf_roce_reset(&dsaf_node->fwnode, false);

Move this line out of if-else and leave "if (enable)" part only.

> + if (ret)
> + return ret;
> +
> + msleep(SLEEP_TIME_INTERVAL);

Nice, here you used define and in other places just hardcoded 20
(msleep(20)). Please give meaningful definition to 20.

> + ret = hns_dsaf_roce_reset(&dsaf_node->fwnode, true);
> + }
> +
> + return ret;

Indentation

> +}
> +
> +struct hns_roce_hw hns_roce_hw_v1 = {
> + .reset = hns_roce_v1_reset,
> +};
> diff --git a/drivers/infiniband/hw/hns/hns_roce_hw_v1.h 
> b/drivers/infiniband/h

Re: [ldv-project] [net] rtl8188ee: a potential race condition

2016-06-24 Thread Larry Finger


On 06/24/2016 09:17 AM, Vaishali Thakkar wrote:



On Friday 10 June 2016 01:51 PM, Pavel Andrianov wrote:

Hi!

There is a potential data race in 
drivers/net/wireless/realtek/rtlwifi/rtl8188ee/rtl8188ee.ko.

In the function rtl88ee_gpio_radio_on_off_checking the flag
ppsc->rfchange_inprogress is set under spinlock protection. In the function
rtl_ps_set_rf_state the flag is also read under a spinlock. But the function
rtl88e_dm_watchdog reads it without any locks. As a result rtl88e_dm_watchdog may
execute the succeeding code while a change is in progress (with the flag
rfchange_inprogress == true). I cannot determine the exact consequences, but they
are likely not good, given that such a check exists. Could anybody more confident
confirm this?

The function rtl_ps_set_rf_state is always called with its parameter
[protect_or_not == false]. Is this flag really necessary, if the value 'true' is
never used? The function also sets the flag ppsc->rfchange_inprogress and may
affect rtl88e_dm_watchdog as in the previous case.


I think a patch removing the parameter was sent some time ago, but I am
not sure why it was not applied.
Maybe Larry has a better idea about this.

Here is a link to the patch:
http://linux-wireless.vger.kernel.narkive.com/mu4t9xxr/patch-3-4-rtlwifi-rtl8192cu-remove-unused-parameter


The patch for rtl8192cu was applied as commit 4b9d8d67b44a on Jun 20 2011, but 
the unused parameter was reintroduced as part of an update of the power-save 
code with commit d3feae41a347 on Sep 22 2014. My recollection is that Realtek 
envisioned a driver that needed this parameter to be true. As none has yet been 
introduced, I will prepare a patch to remove it again.


I am also testing a patch to remove the race condition in rtl8188ee.

Larry

Re: [REGRESSION, bisect]cxgb4 port failure with TSO traffic after commit 10d3be569243def8("tcp-tso: do not split TSO packets at retransmit time")

2016-06-24 Thread Eric Dumazet

Please do not top post on netdev mailing list

On Fri, Jun 24, 2016 at 4:38 AM, Arjun V.  wrote:
> Eric,
> We are seeing skb's with length(skb->len) greater than 65536 coming into our 
> ndo_start_xmit() callback routine.
> We can add a check in our eth_xmit() routine to skip those packets, but it 
> will be better if you fix this in kernel.
>
>
> I have attached pcap file obtained from tcpdump. In the pcap file there are 2 
> such packets(I used tcpdump filter to extract out those packets).
>
> Let us know if you need anything else.
>

Beats me really.

At retransmit time, we only can eventually reduce packet sizes
(assuming GSO is used, because we might coalesce sub-mss packets in
tcp_retrans_try_collapse())

So why are you seeing too big packets at retransmit, I really have no idea.

Might be some bug related to MSS computation, overflowing somehow?

Re: [ldv-project] [net] rtl8188ee: a potential race condition

2016-06-24 Thread Vaishali Thakkar



On Friday 10 June 2016 01:51 PM, Pavel Andrianov wrote:
> Hi!
> 
> There is a potential data race in 
> drivers/net/wireless/realtek/rtlwifi/rtl8188ee/rtl8188ee.ko.
> 
> In the function rtl88ee_gpio_radio_on_off_checking the flag
> ppsc->rfchange_inprogress is set under spinlock protection. In the function
> rtl_ps_set_rf_state the flag is also read under a spinlock. But the function
> rtl88e_dm_watchdog reads it without any locks. As a result rtl88e_dm_watchdog
> may execute the succeeding code while a change is in progress (with the flag
> rfchange_inprogress == true). I cannot determine the exact consequences,
> but they are likely not good, given that such a check exists. Could anybody
> more confident confirm this?
> 
> The function rtl_ps_set_rf_state is always called with its parameter
> [protect_or_not == false]. Is this flag really necessary, if the value 'true'
> is never used? The function also sets the flag ppsc->rfchange_inprogress
> and may affect rtl88e_dm_watchdog as in the previous case.

I think a patch removing the parameter was sent some time ago, but I am
not sure why it was not applied.
Maybe Larry has a better idea about this.

Here is a link to the patch:
http://linux-wireless.vger.kernel.narkive.com/mu4t9xxr/patch-3-4-rtlwifi-rtl8192cu-remove-unused-parameter

-- 
Vaishali

Re: [PATCH] vsock: make listener child lock ordering explicit

2016-06-24 Thread Jorgen S. Hansen


> On Jun 23, 2016, at 5:28 PM, Stefan Hajnoczi  wrote:
> 
> There are several places where the listener and pending or accept queue
> child sockets are accessed at the same time.  Lockdep is unhappy that
> two locks from the same class are held.
> 
> Tell lockdep that it is safe and document the lock ordering.
> 
> Originally Claudio Imbrenda  sent a similar
> patch asking whether this is safe.  I have audited the code and also
> covered the vsock_pending_work() function.
> 
> Suggested-by: Claudio Imbrenda 
> Signed-off-by: Stefan Hajnoczi 
> ---
> net/vmw_vsock/af_vsock.c | 12 ++--
> 1 file changed, 10 insertions(+), 2 deletions(-)
> 
> diff --git a/net/vmw_vsock/af_vsock.c b/net/vmw_vsock/af_vsock.c
> index b5f1221..b96ac91 100644
> --- a/net/vmw_vsock/af_vsock.c
> +++ b/net/vmw_vsock/af_vsock.c
> @@ -61,6 +61,14 @@
>  * function will also cleanup rejected sockets, those that reach the connected
>  * state but leave it before they have been accepted.
>  *
> + * - Lock ordering for pending or accept queue sockets is:
> + *
> + * lock_sock(listener);
> + * lock_sock_nested(pending, SINGLE_DEPTH_NESTING);
> + *
> + * Using explicit nested locking keeps lockdep happy since normally only one
> + * lock of a given class may be taken at a time.
> + *
>  * - Sockets created by user action will be cleaned up when the user process
>  * calls close(2), causing our release implementation to be called. Our 
> release
>  * implementation will perform some cleanup then drop the last reference so 
> our
> @@ -443,7 +451,7 @@ void vsock_pending_work(struct work_struct *work)
>   cleanup = true;
> 
>   lock_sock(listener);
> - lock_sock(sk);
> + lock_sock_nested(sk, SINGLE_DEPTH_NESTING);
> 
>   if (vsock_is_pending(sk)) {
>   vsock_remove_pending(listener, sk);
> @@ -1292,7 +1300,7 @@ static int vsock_accept(struct socket *sock, struct 
> socket *newsock, int flags)
>   if (connected) {
>   listener->sk_ack_backlog--;
> 
> - lock_sock(connected);
> + lock_sock_nested(connected, SINGLE_DEPTH_NESTING);
>   vconnected = vsock_sk(connected);
> 
>   /* If the listener socket has received an error, then we should
> -- 
> 2.7.4
> 

Looks good to me - thanks for fixing this!

/jsh

[PATCH (net-next.git) 1/3] drivers: net: stmmac: reworking the PCS code.

2016-06-24 Thread Giuseppe Cavallaro

The 3.xx and 4.xx synopsys gmacs have a very similar
PCS embedded module and they share almost the same registers:
for example:
  AN_Control, AN_Status, AN_Advertisement, AN_Link_Partner_Ability,
  AN_Expansion, TBI_Extended_Status.

Just the RGMII/SMII Control/Status register differs.

So this patch aims to reorganize and enhance the PCS support.
It removes the existing support from the dwmac1000/dwmac4_core.c
moving basic PCS functions inside a new file called: stmmac_pcs.h.

The patch also reviews the available APIs to be better shared among
different hardware and easily enhanced to support new features.

Signed-off-by: Giuseppe Cavallaro 
---
 Documentation/networking/stmmac.txt|   1 +
 drivers/net/ethernet/stmicro/stmmac/common.h   |  16 ++-
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h|  56 +++-
 .../net/ethernet/stmicro/stmmac/dwmac1000_core.c   | 110 +++---
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h   |  34 +++--
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c  |  73 ++
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  54 ---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  12 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac_pcs.h   | 159 +
 9 files changed, 350 insertions(+), 165 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/stmmac_pcs.h

diff --git a/Documentation/networking/stmmac.txt 
b/Documentation/networking/stmmac.txt
index 671fe3d..e226f89 100644
--- a/Documentation/networking/stmmac.txt
+++ b/Documentation/networking/stmmac.txt
@@ -285,6 +285,7 @@ Please see the following document:
  o mmc_core.c/mmc.h: Management MAC Counters;
  o stmmac_hwtstamp.c: HW timestamp support for PTP;
  o stmmac_ptp.c: PTP 1588 clock;
+ o stmmac_pcs.h: Physical Coding Sublayer common implementation;
  o dwmac-.c: these are for the platform glue-logic file; e.g. dwmac-sti.c
for STMicroelectronics SoCs.
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h 
b/drivers/net/ethernet/stmicro/stmmac/common.h
index fc60368..86eba2a 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -232,6 +232,11 @@ struct stmmac_extra_stats {
 #define DMA_HW_FEAT_ACTPHYIF   0x7000  /* Active/selected PHY iface */
 #define DEFAULT_DMA_PBL8
 
+/* PCS status and mask defines */
+#definePCS_ANE_IRQ BIT(2)  /* PCS Auto-Negotiation */
+#definePCS_LINK_IRQBIT(1)  /* PCS Link */
+#definePCS_RGSMIIIS_IRQBIT(0)  /* RGMII or SMII Interrupt */
+
 /* Max/Min RI Watchdog Timer count value */
 #define MAX_DMA_RIWT   0xff
 #define MIN_DMA_RIWT   0x20
@@ -272,9 +277,6 @@ enum dma_irq_status {
 #defineCORE_IRQ_RX_PATH_IN_LPI_MODE(1 << 2)
 #defineCORE_IRQ_RX_PATH_EXIT_LPI_MODE  (1 << 3)
 
-#defineCORE_PCS_ANE_COMPLETE   (1 << 5)
-#defineCORE_PCS_LINK_STATUS(1 << 6)
-#defineCORE_RGMII_IRQ  (1 << 7)
 #define CORE_IRQ_MTL_RX_OVERFLOW   BIT(8)
 
 /* Physical Coding Sublayer */
@@ -469,9 +471,12 @@ struct stmmac_ops {
void (*reset_eee_mode)(struct mac_device_info *hw);
void (*set_eee_timer)(struct mac_device_info *hw, int ls, int tw);
void (*set_eee_pls)(struct mac_device_info *hw, int link);
-   void (*ctrl_ane)(struct mac_device_info *hw, bool restart);
-   void (*get_adv)(struct mac_device_info *hw, struct rgmii_adv *adv);
void (*debug)(void __iomem *ioaddr, struct stmmac_extra_stats *x);
+   /* PCS calls */
+   void (*pcs_ctrl_ane)(void __iomem *ioaddr, bool ane, bool srgmi_ral,
+bool loopback);
+   void (*pcs_rane)(void __iomem *ioaddr, bool restart);
+   void (*pcs_get_adv_lp)(void __iomem *ioaddr, struct rgmii_adv *adv);
 };
 
 /* PTP and HW Timer helpers */
@@ -546,6 +551,7 @@ void stmmac_dwmac4_get_mac_addr(void __iomem *ioaddr, 
unsigned char *addr,
 void stmmac_dwmac4_set_mac(void __iomem *ioaddr, bool enable);
 
 void dwmac_dma_flush_tx_fifo(void __iomem *ioaddr);
+
 extern const struct stmmac_mode_ops ring_mode_ops;
 extern const struct stmmac_mode_ops chain_mode_ops;
 extern const struct stmmac_desc_ops dwmac4_desc_ops;
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h 
b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index b0593a4..e671360 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -46,9 +46,6 @@ enum dwmac1000_irq_status {
mmc_rx_irq = 0x0020,
mmc_irq = 0x0010,
pmt_irq = 0x0008,
-   pcs_ane_irq = 0x0004,
-   pcs_link_irq = 0x0002,
-   rgmii_irq = 0x0001,
 };
 #define GMAC_INT_MASK  0x003c  /* interrupt mask register */
 
@@ -90,42 +87,23 @@ enum power_event {
(reg * 8))
 #define GMAC_MAX_PERFECT_ADDRESSES 1
 
-/* PCS registers (AN

[PATCH (net-next.git) 2/3] drivers: net: stmmac: rework core ISR to better manage PCS and PMT

2016-06-24 Thread Giuseppe Cavallaro

By default, all gmac cores disable the PCS block and always
enable the PMT.

Note that this is done in a different way by 3.x and 4.x cores.

With this rework, PCS and PMT interrupt masks can be driven by
parameters now moved inside the mac_device_info structure
and the settings follow what the HW capability register reports.

Signed-off-by: Giuseppe Cavallaro 
---
 drivers/net/ethernet/stmicro/stmmac/common.h   |  2 ++
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h| 30 ++
 .../net/ethernet/stmicro/stmmac/dwmac1000_core.c   | 24 -
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h   |  9 +++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c  |  8 +-
 drivers/net/ethernet/stmicro/stmmac/stmmac.h   |  1 -
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   | 11 +---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  | 29 -
 8 files changed, 79 insertions(+), 35 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h 
b/drivers/net/ethernet/stmicro/stmmac/common.h
index 86eba2a..51077a8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -529,6 +529,8 @@ struct mac_device_info {
int unicast_filter_entries;
int mcast_bits_log2;
unsigned int rx_csum;
+   unsigned int pcs;
+   unsigned int pmt;
 };
 
 struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h 
b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
index e671360..ff3e5ab 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000.h
@@ -38,16 +38,26 @@
 #define GMAC_WAKEUP_FILTER 0x0028  /* Wake-up Frame Filter */
 
 #define GMAC_INT_STATUS0x0038  /* interrupt status 
register */
-enum dwmac1000_irq_status {
-   lpiis_irq = 0x400,
-   time_stamp_irq = 0x0200,
-   mmc_rx_csum_offload_irq = 0x0080,
-   mmc_tx_irq = 0x0040,
-   mmc_rx_irq = 0x0020,
-   mmc_irq = 0x0010,
-   pmt_irq = 0x0008,
-};
-#define GMAC_INT_MASK  0x003c  /* interrupt mask register */
+#define GMAC_INT_STATUS_PMTBIT(3)
+#define GMAC_INT_STATUS_MMCIS  BIT(4)
+#define GMAC_INT_STATUS_MMCRIS BIT(5)
+#define GMAC_INT_STATUS_MMCTIS BIT(6)
+#define GMAC_INT_STATUS_MMCCSUMBIT(7)
+#define GMAC_INT_STATUS_TSTAMP BIT(9)
+#define GMAC_INT_STATUS_LPIIS  BIT(10)
+
+/* interrupt mask register */
+#defineGMAC_INT_MASK   0x003c
+#defineGMAC_INT_DISABLE_RGMII  BIT(0)
+#defineGMAC_INT_DISABLE_PCSLINKBIT(1)
+#defineGMAC_INT_DISABLE_PCSAN  BIT(2)
+#defineGMAC_INT_DISABLE_PMTBIT(3)
+#defineGMAC_INT_DISABLE_TIMESTAMP  BIT(9)
+#defineGMAC_INT_DISABLE_PCS(GMAC_INT_DISABLE_RGMII | \
+GMAC_INT_DISABLE_PCSLINK | \
+GMAC_INT_DISABLE_PCSAN)
+#defineGMAC_INT_DEFAULT_MASK   (GMAC_INT_DISABLE_TIMESTAMP | \
+GMAC_INT_DISABLE_PCS)
 
 /* PMT Control and Status */
 #define GMAC_PMT   0x002c
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index 9772a43c..0d31f2f 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -37,7 +37,10 @@ static void dwmac1000_core_init(struct mac_device_info *hw, 
int mtu)
 {
void __iomem *ioaddr = hw->pcsr;
u32 value = readl(ioaddr + GMAC_CONTROL);
+
+   /* Configure GMAC core */
value |= GMAC_CORE_INIT;
+
if (mtu > 1500)
value |= GMAC_CONTROL_2K;
if (mtu > 2000)
@@ -46,7 +49,14 @@ static void dwmac1000_core_init(struct mac_device_info *hw, 
int mtu)
writel(value, ioaddr + GMAC_CONTROL);
 
/* Mask GMAC interrupts */
-   writel(0x207, ioaddr + GMAC_INT_MASK);
+   value = GMAC_INT_DEFAULT_MASK;
+
+   if (hw->pmt)
+   value &= ~GMAC_INT_DISABLE_PMT;
+   if (hw->pcs)
+   value &= ~GMAC_INT_DISABLE_PCS;
+
+   writel(value, ioaddr + GMAC_INT_MASK);
 
 #ifdef STMMAC_VLAN_TAG_USED
/* Tag detection without filtering */
@@ -283,20 +293,20 @@ static int dwmac1000_irq_status(struct mac_device_info 
*hw,
int ret = 0;
 
/* Not used events (e.g. MMC interrupts) are not handled. */
-   if ((intr_status & mmc_tx_irq))
+   if ((intr_status & GMAC_INT_STATUS_MMCTIS))
x->mmc_tx_irq_n++;
-   if (unlikely(intr_status & mmc_rx_irq))
+   if (unlikely(intr_status & GMAC_INT_STATUS_MMCRIS))
x->mmc_rx_irq_n++;
-   if (unlikely(intr_status & mmc_rx_csum_offload_irq))
+   if (unlikely(intr_status & GMAC_INT_STATUS_MMCCSUM))
x->mmc_r

[PATCH (net-next.git) 3/3] drivers: net: stmmac: add port selection programming

2016-06-24 Thread Giuseppe Cavallaro

In case of SGMII mode, for example when a MAC2MAC connection
is needed, the port selection bits (inside the MAC configuration
registers) have to be programmed according to the link selected.
So the patch adds a new DT parameter to pass the port selection
and to program the related PCS and CORE to use it.

Signed-off-by: Giuseppe Cavallaro 
---
 Documentation/devicetree/bindings/net/stmmac.txt  |  3 +++
 drivers/net/ethernet/stmicro/stmmac/common.h  |  1 +
 drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c  | 15 +++
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c | 15 +++
 drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c  |  3 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 15 ++-
 drivers/net/ethernet/stmicro/stmmac/stmmac_platform.c |  2 ++
 include/linux/stmmac.h|  1 +
 8 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/Documentation/devicetree/bindings/net/stmmac.txt 
b/Documentation/devicetree/bindings/net/stmmac.txt
index 95816c5..41b49e6 100644
--- a/Documentation/devicetree/bindings/net/stmmac.txt
+++ b/Documentation/devicetree/bindings/net/stmmac.txt
@@ -47,6 +47,9 @@ Optional properties:
supported by this device instance
 - snps,perfect-filter-entries: Number of perfect filter entries supported
by this device instance
+- snps,ps-speed: port selection speed that can be passed to the core when
+PCS is supported. For example, this is used in case of SGMII
+and MAC2MAC connection.
 - AXI BUS Mode parameters: below the list of all the parameters to program the
   AXI register inside the DMA module:
- snps,lpi_en: enable Low Power Interface
diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h 
b/drivers/net/ethernet/stmicro/stmmac/common.h
index 51077a8..2533b91 100644
--- a/drivers/net/ethernet/stmicro/stmmac/common.h
+++ b/drivers/net/ethernet/stmicro/stmmac/common.h
@@ -531,6 +531,7 @@ struct mac_device_info {
unsigned int rx_csum;
unsigned int pcs;
unsigned int pmt;
+   unsigned int ps;
 };
 
 struct mac_device_info *dwmac1000_setup(void __iomem *ioaddr, int mcbins,
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
index 0d31f2f..cbefe9e 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac1000_core.c
@@ -46,6 +46,21 @@ static void dwmac1000_core_init(struct mac_device_info *hw, 
int mtu)
if (mtu > 2000)
value |= GMAC_CONTROL_JE;
 
+   if (hw->ps) {
+   value |= GMAC_CONTROL_TE;
+
+   if (hw->ps == SPEED_1000) {
+   value &= ~GMAC_CONTROL_PS;
+   } else {
+   value |= GMAC_CONTROL_PS;
+
+   if (hw->ps == SPEED_10)
+   value &= ~GMAC_CONTROL_FES;
+   else
+   value |= GMAC_CONTROL_FES;
+   }
+   }
+
writel(value, ioaddr + GMAC_CONTROL);
 
/* Mask GMAC interrupts */
diff --git a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c 
b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
index 747f3cf..df5580d 100644
--- a/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
+++ b/drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c
@@ -32,6 +32,21 @@ static void dwmac4_core_init(struct mac_device_info *hw, int 
mtu)
if (mtu > 2000)
value |= GMAC_CONFIG_JE;
 
+   if (hw->ps) {
+   value |= GMAC_CONFIG_TE;
+
+   if (hw->ps == SPEED_1000) {
+   value &= ~GMAC_CONFIG_PS;
+   } else {
+   value |= GMAC_CONFIG_PS;
+
+   if (hw->ps == SPEED_10)
+   value &= ~GMAC_CONFIG_FES;
+   else
+   value |= GMAC_CONFIG_FES;
+   }
+   }
+
writel(value, ioaddr + GMAC_CONFIG);
 
/* Mask GMAC interrupts */
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
index da2d9b5..1e06173 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_ethtool.c
@@ -380,7 +380,8 @@ static int stmmac_ethtool_setsettings(struct net_device 
*dev,
spin_lock(&priv->lock);
 
if (priv->hw->mac->pcs_ctrl_ane)
-   priv->hw->mac->pcs_ctrl_ane(priv->ioaddr, 1, 0, 0);
+   priv->hw->mac->pcs_ctrl_ane(priv->ioaddr, 1,
+   priv->hw->ps, 0);
 
spin_unlock(&priv->lock);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/et

[PATCH (net-next.git) 0/3] stmmac: rework and enhance the PCS support

2016-06-24 Thread Giuseppe Cavallaro

The 3.xx and 4.xx synopsys gmacs have a very similar
PCS embedded module and they share almost the same registers;
for example:
  AN_Control, AN_Status, AN_Advertisement, AN_Link_Partner_Ability,
  AN_Expansion, TBI_Extended_Status.

Just the RGMII/SMII Control/Status register differs.

So these patches aim to reorganize and enhance the PCS support;
to do that, some small inline functions have been provided and
also some rework to the PCS ISR part has been done.

In the end, the SGMII for MAC2MAC connection has been introduced.

All patches have been built on top of net-next git and, as for
the previous version, not fully tested.

Giuseppe Cavallaro (3):
  drivers: net: stmmac: reworking the PCS code.
  drivers: net: stmmac: rework core ISR to better manage PCS and PMT
  drivers: net: stmmac: add port selection programming

 Documentation/devicetree/bindings/net/stmmac.txt   |   3 +
 Documentation/networking/stmmac.txt|   1 +
 drivers/net/ethernet/stmicro/stmmac/common.h   |  19 ++-
 drivers/net/ethernet/stmicro/stmmac/dwmac1000.h|  86 +--
 .../net/ethernet/stmicro/stmmac/dwmac1000_core.c   | 147 ++-
 drivers/net/ethernet/stmicro/stmmac/dwmac4.h   |  43 --
 drivers/net/ethernet/stmicro/stmmac/dwmac4_core.c  |  96 +
 drivers/net/ethernet/stmicro/stmmac/stmmac.h   |   1 -
 .../net/ethernet/stmicro/stmmac/stmmac_ethtool.c   |  60 +---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c  |  50 +--
 drivers/net/ethernet/stmicro/stmmac/stmmac_pcs.h   | 159 +
 .../net/ethernet/stmicro/stmmac/stmmac_platform.c  |   2 +
 include/linux/stmmac.h |   1 +
 13 files changed, 474 insertions(+), 194 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/stmmac_pcs.h

-- 
2.7.4

Re: [PATCH net] Bridge: Fix ipv6 mc snooping if bridge has no ipv6 address

2016-06-24 Thread Linus Lüssing

On Fri, Jun 24, 2016 at 12:35:18PM +0200, Daniel Danzberger wrote:
> The bridge is falsely dropping ipv6 multicast packets if there is:
>  1. No ipv6 address assigned on the bridge.
>  2. No external mld querier present.
>  3. The internal querier enabled.
> 
> When the bridge fails to build mld queries, because it has no
> ipv6 address, it silently returns, but keeps the local querier enabled.
> This specific case causes confusing packet loss.
> 
> Ipv6 multicast snooping can only work if:
>  a) An external querier is present
>  OR
>  b) The bridge has an ipv6 address and is capable of sending own queries
> 
> Otherwise it has to forward/flood the ipv6 multicast traffic,
> because snooping cannot work.
> 
> This patch fixes the issue by adding a flag to the bridge struct that
> indicates that there is currently no ipv6 address assigned to the bridge
> and returns a false state for the local querier in
> __br_multicast_querier_exists().

Acked-by: Linus Lüssing

[PATCH] net: smc91x: ACPI Enable lan91x adapters

2016-06-24 Thread Jeremy Linton

Enable lan91x adapters in some ARM machines and models
when booted with an ACPI kernel.

Signed-off-by: Jeremy Linton 
---
 drivers/net/ethernet/smsc/smc91x.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/smsc/smc91x.c 
b/drivers/net/ethernet/smsc/smc91x.c
index 18ac52d..fcf69f9 100644
--- a/drivers/net/ethernet/smsc/smc91x.c
+++ b/drivers/net/ethernet/smsc/smc91x.c
@@ -2203,6 +2203,12 @@ static const struct of_device_id smc91x_match[] = {
 };
 MODULE_DEVICE_TABLE(of, smc91x_match);
 
+static const struct acpi_device_id smsc91x_acpi_match[] = {
+   { "LNRO0003", 0 },
+   { }
+};
+MODULE_DEVICE_TABLE(acpi, smsc91x_acpi_match);
+
 /**
  * of_try_set_control_gpio - configure a gpio if it exists
  */
@@ -2274,7 +2280,6 @@ static int smc_drv_probe(struct platform_device *pdev)
 #if IS_BUILTIN(CONFIG_OF)
match = of_match_device(of_match_ptr(smc91x_match), &pdev->dev);
if (match) {
-   struct device_node *np = pdev->dev.of_node;
u32 val;
 
/* Optional pwrdwn GPIO configured? */
@@ -2300,7 +2305,8 @@ static int smc_drv_probe(struct platform_device *pdev)
usleep_range(750, 1000);
 
/* Combination of IO widths supported, default to 16-bit */
-   if (!of_property_read_u32(np, "reg-io-width", &val)) {
+   if (!device_property_read_u32(&pdev->dev, "reg-io-width",
+ &val)) {
if (val & 1)
lp->cfg.flags |= SMC91X_USE_8BIT;
if ((val == 0) || (val & 2))
@@ -2479,6 +2485,7 @@ static struct platform_driver smc_driver = {
.name   = CARDNAME,
.pm = &smc_drv_pm_ops,
.of_match_table = of_match_ptr(smc91x_match),
+   .acpi_match_table = smsc91x_acpi_match,
},
 };
 
-- 
2.5.5

[PATCH] connector: fix out-of-order cn_proc netlink message delivery

2016-06-24 Thread Aaron Campbell

The proc connector messages include a sequence number, allowing userspace
programs to detect lost messages.  However, performing this detection is
currently more difficult than necessary, since netlink messages can be
delivered to the application out-of-order.  To fix this, leave pre-emption
disabled during cn_netlink_send(), and use GFP_NOWAIT.

The following was written as a test case.  Building the kernel w/ make -j32
proved a reliable way to generate out-of-order cn_proc messages.

/*
 * Test case for cn_proc message ordering.
 *
 * Subscribes to the proc connector multicast group and, for every event
 * received, compares its sequence number with the previous one seen for
 * the same CPU.  A gap or reordering is reported on stdout.
 *
 * Returns -1 on any setup failure, or once recv() fails or delivers a
 * message whose size differs from the expected one.
 */
int
main(int argc, char *argv[])
{
	/* Last sequence number seen per CPU; 0 means "nothing seen yet". */
	static uint32_t last_seq[CPU_SETSIZE];
	uint32_t seq;
	int cpu, fd;
	struct sockaddr_nl sa;
	struct __attribute__((aligned(NLMSG_ALIGNTO))) {
		struct nlmsghdr nl_hdr;
		struct __attribute__((__packed__)) {
			struct cn_msg cn_msg;
			struct proc_event cn_proc;
		};
	} rmsg;
	struct __attribute__((aligned(NLMSG_ALIGNTO))) {
		struct nlmsghdr nl_hdr;
		struct __attribute__((__packed__)) {
			struct cn_msg cn_msg;
			enum proc_cn_mcast_op cn_mcast;
		};
	} smsg;

	(void)argc;
	(void)argv;

	fd = socket(PF_NETLINK, SOCK_DGRAM, NETLINK_CONNECTOR);
	if (fd < 0) {
		perror("socket");
		return -1;
	}

	/* Zero the whole address so the padding field is not stack garbage. */
	memset(&sa, 0, sizeof(sa));
	sa.nl_family = AF_NETLINK;
	sa.nl_groups = CN_IDX_PROC;
	sa.nl_pid = getpid();
	if (bind(fd, (struct sockaddr *)&sa, sizeof(sa)) < 0) {
		perror("bind");
		return -1;
	}

	/* Ask the proc connector to start multicasting events to us. */
	memset(&smsg, 0, sizeof(smsg));
	smsg.nl_hdr.nlmsg_len = sizeof(smsg);
	smsg.nl_hdr.nlmsg_pid = getpid();
	smsg.nl_hdr.nlmsg_type = NLMSG_DONE;
	smsg.cn_msg.id.idx = CN_IDX_PROC;
	smsg.cn_msg.id.val = CN_VAL_PROC;
	smsg.cn_msg.len = sizeof(enum proc_cn_mcast_op);
	smsg.cn_mcast = PROC_CN_MCAST_LISTEN;
	if (send(fd, &smsg, sizeof(smsg), 0) != sizeof(smsg)) {
		perror("send");
		return -1;
	}

	while (recv(fd, &rmsg, sizeof(rmsg), 0) == sizeof(rmsg)) {
		cpu = rmsg.cn_proc.cpu;
		/* Skip events without a usable CPU index for last_seq[]. */
		if (cpu < 0 || cpu >= CPU_SETSIZE) {
			continue;
		}
		seq = rmsg.cn_msg.seq;
		if ((last_seq[cpu] != 0) && (seq != last_seq[cpu] + 1)) {
			printf("out-of-order seq=%u on cpu=%d\n",
			       (unsigned int)seq, cpu);
		}
		last_seq[cpu] = seq;
	}

	/* Only reached when recv() fails or returns an unexpected size. */
	perror("recv");

	return -1;
}

Signed-off-by: Aaron Campbell 
---
 drivers/connector/cn_proc.c | 43 ++-
 1 file changed, 22 insertions(+), 21 deletions(-)

diff --git a/drivers/connector/cn_proc.c b/drivers/connector/cn_proc.c
index 15d06fc..b02f9c6 100644
--- a/drivers/connector/cn_proc.c
+++ b/drivers/connector/cn_proc.c
@@ -56,11 +56,21 @@ static struct cb_id cn_proc_event_id = { CN_IDX_PROC, 
CN_VAL_PROC };
 /* proc_event_counts is used as the sequence number of the netlink message */
 static DEFINE_PER_CPU(__u32, proc_event_counts) = { 0 };
 
-static inline void get_seq(__u32 *ts, int *cpu)
+static inline void send_msg(struct cn_msg *msg)
 {
preempt_disable();
-   *ts = __this_cpu_inc_return(proc_event_counts) - 1;
-   *cpu = smp_processor_id();
+
+   msg->seq = __this_cpu_inc_return(proc_event_counts) - 1;
+   ((struct proc_event *)msg->data)->cpu = smp_processor_id();
+
+   /*
+* Preemption remains disabled during send to ensure the messages are
+* ordered according to their sequence numbers.
+*
+* If cn_netlink_send() fails, the data is not sent.
+*/
+   cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_NOWAIT);
+
preempt_enable();
 }
 
@@ -77,7 +87,6 @@ void proc_fork_connector(struct task_struct *task)
msg = buffer_to_cn_msg(buffer);
ev = (struct proc_event *)msg->data;
memset(&ev->event_data, 0, sizeof(ev->event_data));
-   get_seq(&msg->seq, &ev->cpu);
ev->timestamp_ns = ktime_get_ns();
ev->what = PROC_EVENT_FORK;
rcu_read_lock();
@@ -92,8 +101,7 @@ void proc_fork_connector(struct task_struct *task)
msg->ack = 0; /* not used */
msg->len = sizeof(*ev);
msg->flags = 0; /* not used */
-   /*  If cn_netlink_send() failed, the data is not sent */
-   cn_netlink_send(msg, 0, CN_IDX_PROC, GFP_KERNEL);
+   send_msg(msg);
 }
 
 void proc_exec_connector(struct task_struct *task)
@@ -108,7 +116,6 @@ void proc_exec_connector(struct task_struct *task)
msg = buffer_to_cn_msg(buffer);
ev = (struct proc_event *)msg->data;
memset(&ev->event_data, 0, sizeof(ev->event_data));
-   get_seq(&msg->seq, &ev->cpu);
ev->timestamp_ns = ktime_get_ns();
ev->what = PROC_EVENT_EXEC;
ev->event_data.exec.process_pid = task->pid;
@@ -118,7 +125,7 @@ void proc_exec_connector(struct task_struct *task)
msg->ack = 0; /* not used */

Re: [iproute PATCH v3 0/6] Big C99 style initializer rework

2016-06-24 Thread Nicolas Dichtel

Le 23/06/2016 19:34, Phil Sutter a écrit :
> This is v3 of my C99-style initializer related patch series. The changes
> since v2 are:
Compile-tested with a gcc 4.4.7.


Regards,
Nicolas

Re: [PATCH net-next 0/5] qed/qede: Tunnel hardware GRO support

2016-06-24 Thread Edward Cree

On 23/06/16 18:07, Alexander Duyck wrote:
> I would prefer to see us extend LRO to support "close enough GRO"
> instead of have us extend GRO to also include LRO.
This reminds me of something I've been meaning to bring up (sorry for
slightly OT, but it might turn out relevant after all).
In sfc we have an (out-of-tree and likely staying that way) LRO that's
entirely in software.  The only reason it exists is for users who want
the 'permissive' merging behaviour of LRO, i.e. they don't need the
guarantees of reversibility and by merging more stuff they can get
slightly higher performance.
I wonder if it would be a good idea for the GRO implementation to have
some knobs to allow setting it to behave in this way.
That would imply a scheme to define various GRO/SSR semantics, which
then would also be a convenient interface for a driver to report the
semantics of its hardware LRO if it has any.
And it would make crystal clear that the difference between GRO and
LRO is kernel vs hardware, rather than reversible vs not.

-Ed

Re: vmw_vsock sk_ack_backlog double decrement bug

2016-06-24 Thread Jorgen S. Hansen

Hi Stefan,

Good catch. Thanks for pointing this out. I'll take care of fixing and testing 
this.
 
 Thanks,
 Jørgen


From: Stefan Hajnoczi 
Sent: Thursday, June 23, 2016 5:40 PM
To: Jorgen S. Hansen
Cc: netdev@vger.kernel.org
Subject: vmw_vsock sk_ack_backlog double decrement bug

Hi Jorgen,
virtio-vsock doesn't use vsock_pending_work() but I may have spotted a
problem that affects the VMCI transport.  I'm not sending a patch
because I can't test it.

1. During vsock_accept() listener->sk_ack_backlog is decremented.
2. vsock_pending_work() will decrement listener->sk_ack_backlog again if
   vsk->rejected.

The result is that sk_ack_backlog can be invalid.  It only happens in
the case where the listener socket has an error.  Maybe in practice it's
not a problem because the server application will close the listener
socket if there is an error...

Stefan

Re: [PATCH] net: ethernet: ti: cpdma: switch to use genalloc

2016-06-24 Thread Afzal Mohammed

Hi,

On Fri, Jun 24, 2016 at 11:35:15AM +0530, Mugunthan V N wrote:
> On Thursday 23 June 2016 06:26 PM, Ivan Khoronzhuk wrote:

> >> +if (pool->cpumap) {
> >> +dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
> >> +  pool->phys);
> >> +} else {
> >> +iounmap(pool->iomap);
> >> +}

> > single if, brackets?
> 
> if() has multiple line statement, so brackets are must.

Another paint to the bikeshed,

seems documented coding style mentions otherwise.

Regards
afzal

Re: [iproute PATCH v3 6/6] misc/ifstat: simplify unsigned value comparison

2016-06-24 Thread Phil Sutter

On Fri, Jun 24, 2016 at 09:20:32AM +, David Laight wrote:
> From: Phil Sutter
> > Sent: 23 June 2016 18:34
> > 
> > By directly comparing the value of both unsigned variables, casting to
> > signed becomes unnecessary.
> > 
> > This also fixes for compiling with older versions of gcc (at least
> > <=3.4.6) which emit the following warning:
> > 
> > | ifstat.c: In function `update_db':
> > | ifstat.c:542: warning: comparison is always false due to limited range of 
> > data type
> > 
> > Signed-off-by: Phil Sutter 
> > ---
> >  misc/ifstat.c | 2 +-
> >  1 file changed, 1 insertion(+), 1 deletion(-)
> > 
> > diff --git a/misc/ifstat.c b/misc/ifstat.c
> > index abbb4e732fcef..9a44da487599e 100644
> > --- a/misc/ifstat.c
> > +++ b/misc/ifstat.c
> > @@ -539,7 +539,7 @@ static void update_db(int interval)
> > int i;
> > 
> > for (i = 0; i < MAXS; i++) {
> > -   if ((long)(h1->ival[i] - n->ival[i]) < 
> > 0) {
> > +   if (h1->ival[i] < n->ival[i]) {
> > memset(n->ival, 0, 
> > sizeof(n->ival));
> > break;
> 
> That isn't the same check.
> The original code is using modulo arithmetic.

Oh, right! The code behaves differently if h1->ival[i] is close to
ULONG_MAX and n->ival[i] is very small. Though I don't see where this
becomes relevant. Am I missing another scenario?

Thanks, Phil

Re: [PATCH v10 01/22] net: hns: Add reset function support for RoCE driver

2016-06-24 Thread Leon Romanovsky

On Thu, Jun 16, 2016 at 10:35:09PM +0800, Lijun Ou wrote:
> It added reset function for RoCE driver. RoCE is a feature of hns.
> In hip06 SoC, in RoCE reset process, it's needed to configure dsaf
> channel reset, port and sl map info. Reset function of RoCE is
> located in dsaf module, we only call it in RoCE driver when needed.
>
> Signed-off-by: Wei Hu 
> Signed-off-by: Nenglong Zhao 
> Signed-off-by: Lijun Ou 
> Signed-off-by: Sheng Li 
> ---
> PATCH v9/v8/v7:
> - No change over PATCH v6
>
> PATCH v6:
> This fixes the comments given by Leon Romanovsky over the PATCH v5:
>   Link: https://lkml.org/lkml/2016/5/3/733
>
> PATCH v5/v4/v3:
> - No change over PATCH v2
>
> PATCH v2:
> This fixes the comments given by Leon Romanovsky over the PATCH v1:
>   Link: https://lkml.org/lkml/2016/3/12/46
>
> PATCH v1:
> - The initial patch
> ---
> ---
>  drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c | 84 
> ++
>  drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.h | 30 
>  drivers/net/ethernet/hisilicon/hns/hns_dsaf_misc.c | 36 ++
>  drivers/net/ethernet/hisilicon/hns/hns_dsaf_reg.h  | 14 +++-
>  4 files changed, 163 insertions(+), 1 deletion(-)
>
> diff --git a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c 
> b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
> index 1c2ddb2..0c4a87c 100644
> --- a/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
> +++ b/drivers/net/ethernet/hisilicon/hns/hns_dsaf_main.c
> @@ -14,6 +14,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  #include 
>  #include 
> @@ -2685,6 +2686,89 @@ static struct platform_driver g_dsaf_driver = {
>
>  module_platform_driver(g_dsaf_driver);
>
> +/**
> + * hns_dsaf_roce_reset - reset dsaf and roce
> + * @dsaf_fwnode: Pointer to framework node for the dasf
> + * @enable: false - request reset , true - drop reset
> + * retuen 0 - success , negative -fail
> + */
> +int hns_dsaf_roce_reset(struct fwnode_handle *dsaf_fwnode, bool enable)
> +{
> + struct dsaf_device *dsaf_dev;
> + struct platform_device *pdev;
> + unsigned int mp;
> + unsigned int sl;
> + unsigned int credit;
> + int i;
> + const u32 port_map[DSAF_ROCE_CREDIT_CHN][DSAF_ROCE_CHAN_MODE_NUM] = {
> + {DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0},
> + {DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0, DSAF_ROCE_PORT_0},
> + {DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0},
> + {DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1, DSAF_ROCE_PORT_0},
> + {DSAF_ROCE_PORT_4, DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1},
> + {DSAF_ROCE_PORT_4, DSAF_ROCE_PORT_2, DSAF_ROCE_PORT_1},
> + {DSAF_ROCE_PORT_5, DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1},
> + {DSAF_ROCE_PORT_5, DSAF_ROCE_PORT_3, DSAF_ROCE_PORT_1},
> + };
> + const u32 sl_map[DSAF_ROCE_CREDIT_CHN][DSAF_ROCE_CHAN_MODE_NUM] = {
> + {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_0},
> + {DSAF_ROCE_SL_0, DSAF_ROCE_SL_1, DSAF_ROCE_SL_1},
> + {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_2},
> + {DSAF_ROCE_SL_0, DSAF_ROCE_SL_1, DSAF_ROCE_SL_3},
> + {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_0},
> + {DSAF_ROCE_SL_1, DSAF_ROCE_SL_1, DSAF_ROCE_SL_1},
> + {DSAF_ROCE_SL_0, DSAF_ROCE_SL_0, DSAF_ROCE_SL_2},
> + {DSAF_ROCE_SL_1, DSAF_ROCE_SL_1, DSAF_ROCE_SL_3},
> + };
> +
> + if (!is_of_node(dsaf_fwnode)) {
> + pr_err("hisi_dsaf: Only support DT node!\n");
> + return -EINVAL;
> + }
> + pdev = of_find_device_by_node(to_of_node(dsaf_fwnode));
> + dsaf_dev = dev_get_drvdata(&pdev->dev);
> + if (AE_IS_VER1(dsaf_dev->dsaf_ver)) {
> + dev_err(dsaf_dev->dev, "%s v1 chip do not support roce!\n",

chip don't support roce -> chip doesn't support RoCE

> + dsaf_dev->ae_dev.name);
> + return -ENODEV;
> + }
> +
> + if (!enable) {
> + /* Reset rocee-channels in dsaf and rocee */
> + hns_dsaf_srst_chns(dsaf_dev, DSAF_CHNS_MASK, false);
> + hns_dsaf_roce_srst(dsaf_dev, false);
> + } else {
> + /* Configure dsaf tx roce correspond to port map and sl map */
> + mp = dsaf_read_dev(dsaf_dev, DSAF_ROCE_PORT_MAP_REG);
> + for (i = 0; i < DSAF_ROCE_CREDIT_CHN; i++)
> + dsaf_set_field(mp, 7 << i * 3, i * 3,
> +   port_map[i][DSAF_ROCE_6PORT_MODE]);
> + dsaf_set_field(mp, 3 << i * 3, i * 3, 0);
> + dsaf_write_dev(dsaf_dev, DSAF_ROCE_PORT_MAP_REG, mp);
> +
> + sl = dsaf_read_dev(dsaf_dev, DSAF_ROCE_SL_MAP_REG);
> + for (i = 0; i < DSAF_ROCE_CREDIT_CHN; i++)
> + dsaf_set_field(sl, 3 << i * 2, i * 2,
> +   sl_map[i][DSAF_ROCE_6PORT_MODE]);
> + dsaf_write_dev(dsaf_dev, DSAF_ROCE_SL_MAP_REG, sl);
> +
> + /* De-reset rocee-channels in dsaf and rocee */
> + hns_dsaf_srst_chns(dsaf_dev, DSAF_CHNS_MASK, true);
> + msleep(20);
> + hns_dsaf_roce_srst(dsaf_dev, true);
> +
> + /* Eanble dsaf channel rocee credit */
> + credit = dsaf_read_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG);
> + dsaf_set_bit(credit, DSAF_SBM_ROCEE_CFG_CRD_EN_B, 0);
> + dsaf_write_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG, credit);
> +
> + dsaf_set_bit(credit, DSAF_SBM_ROCEE_CFG_CRD_EN_B, 1);
> + dsaf_write_dev(dsaf_dev, DSAF_SBM_ROCEE_CFG_REG_REG, credit);
> + }
> + return 0;
> +

Re: [PATCH v10 03/22] IB/hns: Add initial main frame driver and get cfg info

2016-06-24 Thread Leon Romanovsky

On Thu, Jun 16, 2016 at 10:35:11PM +0800, Lijun Ou wrote:
> This patch mainly added the initial bare main driver. It
> could get the relative configure information of net node.
> 
> Signed-off-by: Wei Hu 
> Signed-off-by: Nenglong Zhao 
> Signed-off-by: Lijun Ou 
> ---
> PATCH v9:
> This fixes comments given by Leon Romanovsky over the PATCH v8:
>   Link: https://lkml.org/lkml/2016/6/9/56
> 
> PATCH v8/v7/v6:
> - No change over the PATCH v5
> 
> PATCH v5:
> - The initial patch which was redesigned based on the second patch
>   in PATCH v4
> ---
> ---
>  drivers/infiniband/hw/hns/hns_roce_device.h |  73 ++
>  drivers/infiniband/hw/hns/hns_roce_main.c   | 200 
> 
>  2 files changed, 273 insertions(+)
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_device.h
>  create mode 100644 drivers/infiniband/hw/hns/hns_roce_main.c
> 
> diff --git a/drivers/infiniband/hw/hns/hns_roce_device.h 
> b/drivers/infiniband/hw/hns/hns_roce_device.h
> new file mode 100644
> index 000..946b470
> --- /dev/null
> +++ b/drivers/infiniband/hw/hns/hns_roce_device.h
> @@ -0,0 +1,73 @@
> +/*
> + * Copyright (c) 2016 Hisilicon Limited.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + *  - Redistributions of source code must retain the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer.
> + *
> + *  - Redistributions in binary form must reproduce the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer in the documentation and/or other materials
> + *provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
> + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
> + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
> + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
> + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
> + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
> + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
> + * SOFTWARE.
> + */
> +
> +#ifndef _HNS_ROCE_DEVICE_H
> +#define _HNS_ROCE_DEVICE_H
> +
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +#include 
> +
> +#define DRV_NAME "hns_roce"
> +
> +#define HNS_ROCE_MAX_IRQ_NUM 34
> +#define HNS_ROCE_MAX_PORTS   6
> +
> +struct hns_roce_ib_iboe {
> + struct net_device  *netdevs[HNS_ROCE_MAX_PORTS];
> + u8  phy_port[HNS_ROCE_MAX_PORTS];
> +};
> +
> +struct hns_roce_caps {
> + u8  num_ports;
> +};
> +
> +struct hns_roce_dev {
> + struct ib_deviceib_dev;
> + struct platform_device  *pdev;
> + const char  *irq_names;
> + struct hns_roce_ib_iboe iboe;
> +
> + int irq[HNS_ROCE_MAX_IRQ_NUM];
> + u8 __iomem  *reg_base;
> + struct hns_roce_capscaps;
> +
> + int cmd_mod;
> + int loop_idc;
> +};
> +
> +#endif /* _HNS_ROCE_DEVICE_H */
> diff --git a/drivers/infiniband/hw/hns/hns_roce_main.c 
> b/drivers/infiniband/hw/hns/hns_roce_main.c
> new file mode 100644
> index 000..8924ce3
> --- /dev/null
> +++ b/drivers/infiniband/hw/hns/hns_roce_main.c
> @@ -0,0 +1,200 @@
> +/*
> + * Copyright (c) 2016 Hisilicon Limited.
> + * Copyright (c) 2007, 2008 Mellanox Technologies. All rights reserved.
> + *
> + * This software is available to you under a choice of one of two
> + * licenses.  You may choose to be licensed under the terms of the GNU
> + * General Public License (GPL) Version 2, available from the file
> + * COPYING in the main directory of this source tree, or the
> + * OpenIB.org BSD license below:
> + *
> + * Redistribution and use in source and binary forms, with or
> + * without modification, are permitted provided that the following
> + * conditions are met:
> + *
> + *  - Redistributions of source code must retain the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer.
> + *
> + *  - Redistributions in binary form must reproduce the above
> + *copyright notice, this list of conditions and the following
> + *disclaimer in the documentation and/or other materials
> + *provided with the distribution.
> + *
> + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF A

Re: [iproute PATCH v3 0/6] Big C99 style initializer rework

2016-06-24 Thread Phil Sutter

Hi,

On Fri, Jun 24, 2016 at 09:17:07AM +, David Laight wrote:
> From: Phil Sutter
> > Sent: 23 June 2016 18:34
> >
> > This is v3 of my C99-style initializer related patch series.
> ...
> 
> It would be interesting to know how this affect the kernel code size?
> 
> While gcc will generate a memset() call for 'struct foo = {0}' if you
> initialise some members it might generate explicit zeroing instructions
> for all the other words of the structure.
> 
> I've seen gcc use memset() to zero the end of a structure, it may use
> memset() for large gaps earlier in the structure.
> 
> But if you initialise a byte half way down you are very unlikely to
> get a single memset() and then a write to the single location.

I did a standard build ('make distclean; make') before and after this
commit in my tree. The 'ip' binary didn't change in size at all (quite
surprising), the 'tc' binary shrunk by 48 bytes.

Cheers, Phil

[PATCH v2] net: ethernet: ti: cpdma: switch to use genalloc

2016-06-24 Thread Grygorii Strashko

TI CPDMA currently uses a bitmap for tracking descriptor
allocations, but genalloc already handles the same and can be used
both with special memory (SRAM) and with a DMA coherent memory chunk
(dma_alloc_coherent()). Hence, switch to using genalloc and add a
desc_num property for each channel to limit the max number of
allowed descriptors for each CPDMA channel. This patch does not affect
net throughput.

Tested-by: Ivan Khoronzhuk  
Signed-off-by: Grygorii Strashko 
---
Testing
TCP window: 256K, bandwidth in Mbits/sec:
 host: iperf -s
 device: iperf -c  172.22.39.17 -t600 -i5 -d -w128K

AM437x-idk, 1Gbps link
 before: : 341.60, after: 232+123=355
am57xx-beagle-x15, 1Gbps link
 before: : 1112.80, after: 814+321=1135
am335x-boneblack, 100Mbps link
 before: : 162.40, after: 72+93=165

changes in v2:
 - reverted change in desc_phys() to keep am3517 which has separate CPPI
   addresses from EMAC and CPU perspective
 - minor format changes.

link on v1:
 https://lkml.org/lkml/2016/6/23/353

 drivers/net/ethernet/ti/davinci_cpdma.c | 132 +++-
 1 file changed, 60 insertions(+), 72 deletions(-)

diff --git a/drivers/net/ethernet/ti/davinci_cpdma.c 
b/drivers/net/ethernet/ti/davinci_cpdma.c
index 63b3009..b40a402 100644
--- a/drivers/net/ethernet/ti/davinci_cpdma.c
+++ b/drivers/net/ethernet/ti/davinci_cpdma.c
@@ -21,7 +21,7 @@
 #include 
 #include 
 #include 
-
+#include 
 #include "davinci_cpdma.h"
 
 /* DMA Registers */
@@ -87,9 +87,8 @@ struct cpdma_desc_pool {
void*cpumap;/* dma_alloc map */
int desc_size, mem_size;
int num_desc, used_desc;
-   unsigned long   *bitmap;
struct device   *dev;
-   spinlock_t  lock;
+   struct gen_pool *gen_pool;
 };
 
 enum cpdma_state {
@@ -117,6 +116,7 @@ struct cpdma_chan {
int chan_num;
spinlock_t  lock;
int count;
+   u32 desc_num;
u32 mask;
cpdma_handler_fnhandler;
enum dma_data_direction dir;
@@ -145,6 +145,20 @@ struct cpdma_chan {
 (directed << CPDMA_TO_PORT_SHIFT));\
} while (0)
 
+static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
+{
+   if (!pool)
+   return;
+
+   WARN_ON(pool->used_desc);
+   if (pool->cpumap) {
+   dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
+ pool->phys);
+   } else {
+   iounmap(pool->iomap);
+   }
+}
+
 /*
  * Utility constructs for a cpdma descriptor pool.  Some devices (e.g. davinci
  * emac) have dedicated on-chip memory for these descriptors.  Some other
@@ -155,24 +169,25 @@ static struct cpdma_desc_pool *
 cpdma_desc_pool_create(struct device *dev, u32 phys, dma_addr_t hw_addr,
int size, int align)
 {
-   int bitmap_size;
struct cpdma_desc_pool *pool;
+   int ret;
 
pool = devm_kzalloc(dev, sizeof(*pool), GFP_KERNEL);
if (!pool)
-   goto fail;
-
-   spin_lock_init(&pool->lock);
+   goto gen_pool_create_fail;
 
pool->dev   = dev;
pool->mem_size  = size;
pool->desc_size = ALIGN(sizeof(struct cpdma_desc), align);
pool->num_desc  = size / pool->desc_size;
 
-   bitmap_size  = (pool->num_desc / BITS_PER_LONG) * sizeof(long);
-   pool->bitmap = devm_kzalloc(dev, bitmap_size, GFP_KERNEL);
-   if (!pool->bitmap)
-   goto fail;
+   pool->gen_pool = devm_gen_pool_create(dev, ilog2(pool->desc_size), -1,
+ "cpdma");
+   if (IS_ERR(pool->gen_pool)) {
+   dev_err(dev, "pool create failed %ld\n",
+   PTR_ERR(pool->gen_pool));
+   goto gen_pool_create_fail;
+   }
 
if (phys) {
pool->phys  = phys;
@@ -185,24 +200,22 @@ cpdma_desc_pool_create(struct device *dev, u32 phys, 
dma_addr_t hw_addr,
pool->phys = pool->hw_addr; /* assumes no IOMMU, don't use this 
value */
}
 
-   if (pool->iomap)
-   return pool;
-fail:
-   return NULL;
-}
-
-static void cpdma_desc_pool_destroy(struct cpdma_desc_pool *pool)
-{
-   if (!pool)
-   return;
+   if (!pool->iomap)
+   goto gen_pool_create_fail;
 
-   WARN_ON(pool->used_desc);
-   if (pool->cpumap) {
-   dma_free_coherent(pool->dev, pool->mem_size, pool->cpumap,
- pool->phys);
-   } else {
-   iounmap(pool->iomap);
+   ret = gen_pool_add_virt(pool->gen_pool, (unsigned long)pool->iomap,
+   pool->phys, pool->mem_size, -1);
+

[PATCH net] Bridge: Fix ipv6 mc snooping if bridge has no ipv6 address

2016-06-24 Thread Daniel Danzberger

The bridge is falsely dropping ipv6 multicast packets if there is:
 1. No ipv6 address assigned on the bridge.
 2. No external mld querier present.
 3. The internal querier enabled.

When the bridge fails to build mld queries, because it has no
ipv6 address, it silently returns, but keeps the local querier enabled.
This specific case causes confusing packet loss.

Ipv6 multicast snooping can only work if:
 a) An external querier is present
 OR
 b) The bridge has an ipv6 address and is capable of sending its own queries

Otherwise it has to forward/flood the ipv6 multicast traffic,
because snooping cannot work.

This patch fixes the issue by adding a flag to the bridge struct that
indicates that there is currently no ipv6 address assigned to the bridge
and returns a false state for the local querier in
__br_multicast_querier_exists().

Special thanks to Linus Lüssing.

Signed-off-by: Daniel Danzberger 
---
 net/bridge/br_multicast.c |  4 
 net/bridge/br_private.h   | 23 +++
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/net/bridge/br_multicast.c b/net/bridge/br_multicast.c
index 6852f3c..4384414 100644
--- a/net/bridge/br_multicast.c
+++ b/net/bridge/br_multicast.c
@@ -464,8 +464,11 @@ static struct sk_buff *br_ip6_multicast_alloc_query(struct 
net_bridge *br,
if (ipv6_dev_get_saddr(dev_net(br->dev), br->dev, &ip6h->daddr, 0,
   &ip6h->saddr)) {
kfree_skb(skb);
+   br->has_ipv6_addr = 0;
return NULL;
}
+
+   br->has_ipv6_addr = 1;
ipv6_eth_mc_map(&ip6h->daddr, eth->h_dest);
 
hopopt = (u8 *)(ip6h + 1);
@@ -1745,6 +1748,7 @@ void br_multicast_init(struct net_bridge *br)
br->ip6_other_query.delay_time = 0;
br->ip6_querier.port = NULL;
 #endif
+   br->has_ipv6_addr = 1;
 
spin_lock_init(&br->multicast_lock);
setup_timer(&br->multicast_router_timer,
diff --git a/net/bridge/br_private.h b/net/bridge/br_private.h
index c7fb5d7..52edecf 100644
--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -314,6 +314,7 @@ struct net_bridge
u8  multicast_disabled:1;
u8  multicast_querier:1;
u8  multicast_query_use_ifaddr:1;
+   u8  has_ipv6_addr:1;
 
u32 hash_elasticity;
u32 hash_max;
@@ -588,10 +589,22 @@ static inline bool br_multicast_is_router(struct 
net_bridge *br)
 
 static inline bool
 __br_multicast_querier_exists(struct net_bridge *br,
- struct bridge_mcast_other_query *querier)
+   struct bridge_mcast_other_query *querier,
+   const bool is_ipv6)
 {
+   bool own_querier_enabled;
+
+   if (br->multicast_querier) {
+   if (is_ipv6 && !br->has_ipv6_addr)
+   own_querier_enabled = false;
+   else
+   own_querier_enabled = true;
+   } else {
+   own_querier_enabled = false;
+   }
+
return time_is_before_jiffies(querier->delay_time) &&
-  (br->multicast_querier || timer_pending(&querier->timer));
+  (own_querier_enabled || timer_pending(&querier->timer));
 }
 
 static inline bool br_multicast_querier_exists(struct net_bridge *br,
@@ -599,10 +612,12 @@ static inline bool br_multicast_querier_exists(struct 
net_bridge *br,
 {
switch (eth->h_proto) {
case (htons(ETH_P_IP)):
-   return __br_multicast_querier_exists(br, &br->ip4_other_query);
+   return __br_multicast_querier_exists(br,
+   &br->ip4_other_query, false);
 #if IS_ENABLED(CONFIG_IPV6)
case (htons(ETH_P_IPV6)):
-   return __br_multicast_querier_exists(br, &br->ip6_other_query);
+   return __br_multicast_querier_exists(br,
+   &br->ip6_other_query, true);
 #endif
default:
return false;
-- 
2.1.4

[iproute PATCH] man: ip-address, ip-link: Document 'type' quirk

2016-06-24 Thread Phil Sutter

This covers the fact that calling 'ip {link|addr} show type foobar' does
not return an error.

Signed-off-by: Phil Sutter 
---
 man/man8/ip-address.8.in | 6 ++
 man/man8/ip-link.8.in| 6 ++
 2 files changed, 12 insertions(+)

diff --git a/man/man8/ip-address.8.in b/man/man8/ip-address.8.in
index ab0942d7e94e2..8d34adb336af4 100644
--- a/man/man8/ip-address.8.in
+++ b/man/man8/ip-address.8.in
@@ -280,6 +280,12 @@ only list interfaces enslaved to this master device.
 .BI type " TYPE"
 only list interfaces of the given type.
 
+Note that the type name is not checked against the list of supported types -
+instead it is sent as-is to the kernel. Later it is used to filter the returned
+interface list by comparing it with the relevant attribute in case the kernel
+didn't filter already. Therefore any string is accepted, but may lead to empty
+output.
+
 .TP
 .B up
 only list running interfaces.
diff --git a/man/man8/ip-link.8.in b/man/man8/ip-link.8.in
index d5673639d9ddf..e10f541b20bcf 100644
--- a/man/man8/ip-link.8.in
+++ b/man/man8/ip-link.8.in
@@ -1242,6 +1242,12 @@ specifies the master device which enslaves devices to 
show.
 .I TYPE
 specifies the type of devices to show.
 
+Note that the type name is not checked against the list of supported types -
+instead it is sent as-is to the kernel. Later it is used to filter the returned
+interface list by comparing it with the relevant attribute in case the kernel
+didn't filter already. Therefore any string is accepted, but may lead to empty
+output.
+
 .SS  ip link help - display help
 
 .PP
-- 
2.8.2

[PATCH V5 1/1] net: ethernet: Add TSE PCS support to dwmac-socfpga

2016-06-24 Thread thloh

From: Tien Hock Loh 

This adds support for TSE PCS that uses SGMII adapter when the phy-mode of
the dwmac is set to sgmii.

Signed-off-by: Tien Hock Loh 
Acked-by: Giuseppe Cavallaro 
Acked-by: Rob Herring 

---
v2:
- Refactored the TSE PCS out from the dwmac-socfpga.c file
- Added binding documentation for TSE PCS sgmii adapter
v3:
- Added missing license header for new source files
- Updated tse_pcs.h include headers
- Standardize if statements
v4:
- Reset SGMII adapter on speed change
- Do not enable SGMII adapter if speed is not supported
- On init, if PCS reset fails, do not enable adapter
v5:
- Fixed devicetree binding property name using _ instead of -
---
 .../devicetree/bindings/net/socfpga-dwmac.txt  |  19 ++
 drivers/net/ethernet/stmicro/stmmac/Makefile   |   2 +-
 drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c | 276 +
 drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h |  36 +++
 .../net/ethernet/stmicro/stmmac/dwmac-socfpga.c| 149 +--
 5 files changed, 460 insertions(+), 22 deletions(-)
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c
 create mode 100644 drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.h

diff --git a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt 
b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
index 72d82d6..2e68a3c 100644
--- a/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
+++ b/Documentation/devicetree/bindings/net/socfpga-dwmac.txt
@@ -17,9 +17,26 @@ Required properties:
 Optional properties:
 altr,emac-splitter: Should be the phandle to the emac splitter soft IP node if
DWMAC controller is connected emac splitter.
+phy-mode: The phy mode the ethernet operates in
+altr,gmii-to-sgmii-converter: phandle to the TSE SGMII converter
+
+This device node has additional phandle dependency, the sgmii converter:
+
+Required properties:
+ - compatible  : Should be altr,gmii-to-sgmii-2.0
+ - reg-names   : Should be "eth_tse_control_port"
 
 Example:
 
+gmii_to_sgmii_converter: phy@0x10240 {
+   compatible = "altr,gmii-to-sgmii-2.0";
+   reg = <0x0001 0x0240 0x0008>,
+   <0x0001 0x0200 0x0040>;
+   reg-names = "eth_tse_control_port";
+   clocks = <&sgmii_1_clk_0 &emac1 1 &sgmii_clk_125 &sgmii_clk_125>;
+   clock-names = "tse_pcs_ref_clk_clock_connection", "tse_rx_cdr_refclk";
+};
+
 gmac0: ethernet@ff70 {
compatible = "altr,socfpga-stmmac", "snps,dwmac-3.70a", "snps,dwmac";
altr,sysmgr-syscon = <&sysmgr 0x60 0>;
@@ -30,4 +47,6 @@ gmac0: ethernet@ff70 {
mac-address = [00 00 00 00 00 00];/* Filled in by U-Boot */
clocks = <&emac_0_clk>;
clock-names = "stmmaceth";
+   phy-mode = "sgmii";
+   altr,gmii-to-sgmii-converter = <&gmii_to_sgmii_converter>;
 };
diff --git a/drivers/net/ethernet/stmicro/stmmac/Makefile 
b/drivers/net/ethernet/stmicro/stmmac/Makefile
index 0fb362d..0ff76e8 100644
--- a/drivers/net/ethernet/stmicro/stmmac/Makefile
+++ b/drivers/net/ethernet/stmicro/stmmac/Makefile
@@ -11,7 +11,7 @@ obj-$(CONFIG_DWMAC_IPQ806X)   += dwmac-ipq806x.o
 obj-$(CONFIG_DWMAC_LPC18XX)+= dwmac-lpc18xx.o
 obj-$(CONFIG_DWMAC_MESON)  += dwmac-meson.o
 obj-$(CONFIG_DWMAC_ROCKCHIP)   += dwmac-rk.o
-obj-$(CONFIG_DWMAC_SOCFPGA)+= dwmac-socfpga.o
+obj-$(CONFIG_DWMAC_SOCFPGA)+= dwmac-socfpga.o altr_tse_pcs.o
 obj-$(CONFIG_DWMAC_STI)+= dwmac-sti.o
 obj-$(CONFIG_DWMAC_SUNXI)  += dwmac-sunxi.o
 obj-$(CONFIG_DWMAC_GENERIC)+= dwmac-generic.o
diff --git a/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c 
b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c
new file mode 100644
index 000..40bfaac
--- /dev/null
+++ b/drivers/net/ethernet/stmicro/stmmac/altr_tse_pcs.c
@@ -0,0 +1,276 @@
+/* Copyright Altera Corporation (C) 2016. All rights reserved.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program.  If not, see .
+ *
+ * Author: Tien Hock Loh 
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "stmmac.h"
+#include "stmmac_platform.h"
+#include "altr_tse_pcs.h"
+
+#define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_GMII_MII   0
+#define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_RGMII  BIT(1)
+#define SYSMGR_EMACGRP_CTRL_PHYSEL_ENUM_RMII   BIT(2)
+#define SYSMGR_EMACGRP_CTRL_PHYSEL_WIDTH   2
+#define SYSMGR_EMACGRP_CTRL_PHYSEL_MASKG

[PATCH] of_mdio: select fixed phy support unconditionally

2016-06-24 Thread Arnd Bergmann

Calling the fixed-phy functions when CONFIG_FIXED_PHY=m, as a previous
change tried, cannot work if the caller is in built-in code:

drivers/of/built-in.o: In function `of_phy_register_fixed_link':
of_reserved_mem.c:(.text+0x85e0): undefined reference to `fixed_phy_register'

Making of_mdio depend on 'FIXED_PHY || !FIXED_PHY' would solve this
dependency by enforcing that OF_MDIO itself becomes a loadable module
when FIXED_PHY=m, but that creates a different dependency as it
breaks any built-in ethernet driver that uses of_mdio.

Making FIXED_PHY a bool option also cannot work, since it depends on
PHYLIB, which again is tristate.

This version now uses 'select FIXED_PHY' to ensure that the fixed-phy
portion of of_mdio is not optional. The main downside of this is
a small increase in code size for cases that do not need fixed phy
support, but it should avoid all of the link-time problems.

Signed-off-by: Arnd Bergmann 
Fixes: d1bd330a229f ("of_mdio: Enable fixed PHY support if driver is a module")
---
 drivers/of/Kconfig  | 1 +
 drivers/of/of_mdio.c| 2 --
 include/linux/of_mdio.h | 8 ++--
 3 files changed, 3 insertions(+), 8 deletions(-)

diff --git a/drivers/of/Kconfig b/drivers/of/Kconfig
index b3bec3aaa45d..bc07ad30c9bf 100644
--- a/drivers/of/Kconfig
+++ b/drivers/of/Kconfig
@@ -74,6 +74,7 @@ config OF_NET
 config OF_MDIO
def_tristate PHYLIB
depends on PHYLIB
+   select FIXED_PHY
help
  OpenFirmware MDIO bus (Ethernet PHY) accessors
 
diff --git a/drivers/of/of_mdio.c b/drivers/of/of_mdio.c
index de68707a99c7..e2b50bc12f23 100644
--- a/drivers/of/of_mdio.c
+++ b/drivers/of/of_mdio.c
@@ -361,7 +361,6 @@ struct phy_device *of_phy_attach(struct net_device *dev,
 }
 EXPORT_SYMBOL(of_phy_attach);
 
-#if IS_ENABLED(CONFIG_FIXED_PHY)
 /*
  * of_phy_is_fixed_link() and of_phy_register_fixed_link() must
  * support two DT bindings:
@@ -451,4 +450,3 @@ int of_phy_register_fixed_link(struct device_node *np)
return -ENODEV;
 }
 EXPORT_SYMBOL(of_phy_register_fixed_link);
-#endif
diff --git a/include/linux/of_mdio.h b/include/linux/of_mdio.h
index 6c8cb9aa4c00..4b04587d0441 100644
--- a/include/linux/of_mdio.h
+++ b/include/linux/of_mdio.h
@@ -25,6 +25,8 @@ struct phy_device *of_phy_attach(struct net_device *dev,
 
 extern struct mii_bus *of_mdio_find_bus(struct device_node *mdio_np);
 extern int of_mdio_parse_addr(struct device *dev, const struct device_node 
*np);
+extern int of_phy_register_fixed_link(struct device_node *np);
+extern bool of_phy_is_fixed_link(struct device_node *np);
 
 #else /* CONFIG_OF */
 static inline int of_mdiobus_register(struct mii_bus *mdio, struct device_node 
*np)
@@ -67,12 +69,6 @@ static inline int of_mdio_parse_addr(struct device *dev,
 {
return -ENOSYS;
 }
-#endif /* CONFIG_OF */
-
-#if defined(CONFIG_OF) && IS_ENABLED(CONFIG_FIXED_PHY)
-extern int of_phy_register_fixed_link(struct device_node *np);
-extern bool of_phy_is_fixed_link(struct device_node *np);
-#else
 static inline int of_phy_register_fixed_link(struct device_node *np)
 {
return -ENOSYS;
-- 
2.9.0

RE: [iproute PATCH v3 6/6] misc/ifstat: simplify unsigned value comparison

2016-06-24 Thread David Laight

From: Phil Sutter
> Sent: 23 June 2016 18:34
> 
> By directly comparing the value of both unsigned variables, casting to
> signed becomes unnecessary.
> 
> This also fixes for compiling with older versions of gcc (at least
> <=3.4.6) which emit the following warning:
> 
> | ifstat.c: In function `update_db':
> | ifstat.c:542: warning: comparison is always false due to limited range of 
> data type
> 
> Signed-off-by: Phil Sutter 
> ---
>  misc/ifstat.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
> 
> diff --git a/misc/ifstat.c b/misc/ifstat.c
> index abbb4e732fcef..9a44da487599e 100644
> --- a/misc/ifstat.c
> +++ b/misc/ifstat.c
> @@ -539,7 +539,7 @@ static void update_db(int interval)
>   int i;
> 
>   for (i = 0; i < MAXS; i++) {
> - if ((long)(h1->ival[i] - n->ival[i]) < 
> 0) {
> + if (h1->ival[i] < n->ival[i]) {
>   memset(n->ival, 0, 
> sizeof(n->ival));
>   break;

That isn't the same check.
The original code is using modulo arithmetic.

David

RE: [iproute PATCH v3 0/6] Big C99 style initializer rework

2016-06-24 Thread David Laight

From: Phil Sutter
> Sent: 23 June 2016 18:34
>
> This is v3 of my C99-style initializer related patch series.
...

It would be interesting to know how this affects the kernel code size.

While gcc will generate a memset() call for 'struct foo = {0}' if you
initialise some members it might generate explicit zeroing instructions
for all the other words of the structure.

I've seen gcc use memset() to zero the end of a structure, it may use
memset() for large gaps earlier in the structure.

But if you initialise a byte half way down you are very unlikely to
get a single memset() and then a write to the single location.

David

[PATCH v12 net-next 1/1] hv_sock: introduce Hyper-V Sockets

2016-06-24 Thread Dexuan Cui

Hyper-V Sockets (hv_sock) supplies a byte-stream based communication
mechanism between the host and the guest. It's somewhat like TCP over
VMBus, but the transportation layer (VMBus) is much simpler than IP.

With Hyper-V Sockets, applications between the host and the guest can talk
to each other directly by the traditional BSD-style socket APIs.

Hyper-V Sockets is only available on new Windows hosts, like Windows Server
2016. More info is in this article "Make your own integration services":
https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service

The patch implements the necessary support in the guest side by introducing
a new socket address family AF_HYPERV.

Signed-off-by: Dexuan Cui 
Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Vitaly Kuznetsov 
Cc: Cathy Avery 
---

You can also get the patch here:
https://github.com/dcui/linux/commits/decui/hv_sock/net-next/20160620_v12

For the change log before v12, please see https://lkml.org/lkml/2016/5/15/31


In v12, the changes are mainly the following:

1) remove the module params as David suggested.

2) use 5 exact pages for VMBus send/recv rings, respectively.
The host side's design of the feature requires 5 exact pages for recv/send
rings respectively -- this is suboptimal considering memory consumption,
however unluckily we have to live with it, before the host comes up with
a new design in the future. :-(

3) remove the per-connection static send/recv buffers
Instead, we allocate and free the buffers dynamically only when we recv/send
data. This means: when a connection is idle, no memory is consumed as
recv/send buffers at all.

Looking forward to your comments!

 MAINTAINERS |2 +
 include/linux/hyperv.h  |   14 +
 include/linux/socket.h  |4 +-
 include/net/af_hvsock.h |   59 ++
 include/uapi/linux/hyperv.h |   25 +
 net/Kconfig |1 +
 net/Makefile|1 +
 net/hv_sock/Kconfig |   10 +
 net/hv_sock/Makefile|3 +
 net/hv_sock/af_hvsock.c | 1514 +++
 10 files changed, 1632 insertions(+), 1 deletion(-)

diff --git a/MAINTAINERS b/MAINTAINERS
index 50f69ba..6eaa26f 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5514,7 +5514,9 @@ F:drivers/pci/host/pci-hyperv.c
 F: drivers/net/hyperv/
 F: drivers/scsi/storvsc_drv.c
 F: drivers/video/fbdev/hyperv_fb.c
+F: net/hv_sock/
 F: include/linux/hyperv.h
+F: include/net/af_hvsock.h
 F: tools/hv/
 F: Documentation/ABI/stable/sysfs-bus-vmbus
 
diff --git a/include/linux/hyperv.h b/include/linux/hyperv.h
index 50f493e..95d159e 100644
--- a/include/linux/hyperv.h
+++ b/include/linux/hyperv.h
@@ -1509,4 +1509,18 @@ static inline void commit_rd_index(struct vmbus_channel 
*channel)
 }
 
 
+struct vmpipe_proto_header {
+   u32 pkt_type;
+   u32 data_size;
+};
+
+#define HVSOCK_HEADER_LEN  (sizeof(struct vmpacket_descriptor) + \
+sizeof(struct vmpipe_proto_header))
+
+/* See 'prev_indices' in hv_ringbuffer_read(), hv_ringbuffer_write() */
+#define PREV_INDICES_LEN   (sizeof(u64))
+
+#define HVSOCK_PKT_LEN(payload_len)(HVSOCK_HEADER_LEN + \
+   ALIGN((payload_len), 8) + \
+   PREV_INDICES_LEN)
 #endif /* _HYPERV_H */
diff --git a/include/linux/socket.h b/include/linux/socket.h
index b5cc5a6..0b68b58 100644
--- a/include/linux/socket.h
+++ b/include/linux/socket.h
@@ -202,8 +202,9 @@ struct ucred {
 #define AF_VSOCK   40  /* vSockets */
 #define AF_KCM 41  /* Kernel Connection Multiplexor*/
 #define AF_QIPCRTR 42  /* Qualcomm IPC Router  */
+#define AF_HYPERV  43  /* Hyper-V Sockets  */
 
-#define AF_MAX 43  /* For now.. */
+#define AF_MAX 44  /* For now.. */
 
 /* Protocol families, same as address families. */
 #define PF_UNSPEC  AF_UNSPEC
@@ -251,6 +252,7 @@ struct ucred {
 #define PF_VSOCK   AF_VSOCK
 #define PF_KCM AF_KCM
 #define PF_QIPCRTR AF_QIPCRTR
+#define PF_HYPERV  AF_HYPERV
 #define PF_MAX AF_MAX
 
 /* Maximum queue length specifiable by listen.  */
diff --git a/include/net/af_hvsock.h b/include/net/af_hvsock.h
new file mode 100644
index 000..20d23d5
--- /dev/null
+++ b/include/net/af_hvsock.h
@@ -0,0 +1,59 @@
+#ifndef __AF_HVSOCK_H__
+#define __AF_HVSOCK_H__
+
+#include 
+#include 
+#include 
+
+/* The host side's design of the feature requires 5 exact pages for recv/send
+ * rings respectively -- this is suboptimal considering memory consumption,
+ * however unluckily we have to live with it, before the host comes up with
+ * a better new design in the future.
+ */
+#define RINGBUFFER_HVSOCK_RCV_SIZE (PAGE_SIZE * 5)
+#define RINGBUFFER_HVSOCK_SND_SIZE (PAGE_SIZE * 5)
+
+#define sk_to_hvsock(__sk)   ((struct hvsock_sock *)(__sk))
+#define hvsock_to_s

Re: [PATCH] bridge: netfilter: spanning tree: Add masked_ether_addr_equal and neatening

2016-06-24 Thread Pablo Neira Ayuso

On Fri, Jun 24, 2016 at 10:51:28AM +0200, Pablo Neira Ayuso wrote:
> On Thu, Jun 23, 2016 at 12:00:00PM -0700, Joe Perches wrote:
> > On Thu, 2016-06-23 at 19:36 +0200, Pablo Neira Ayuso wrote:
> > > On Wed, Jun 15, 2016 at 01:58:45PM -0700, Joe Perches wrote:
> > > > 
> > > > There is code duplication of a masked ethernet address comparison here
> > > > so make it a separate function instead.
> > > > 
> > > > Miscellanea:
> > > > 
> > > > o Neaten alignment of FWINV macro uses to make it clearer for the reader
> > > Applied, thanks.
> > > 
> > > > 
> > > > Signed-off-by: Joe Perches 
> > > > ---
> > > > 
> > > > This masked_ether_addr_equal function could go into etherdevice.h,
> > > > but I don't see another use like it in kernel code.  Is there one?
> > >
> > > This is specific of iptables, not even nftables would use this. So I
> > > would keep this in the iptables tree.
> > 
> > Did you see the other patch that adds a generic
> > ether_addr_equal_masked() and uses it in a few
> > more files?
> 
> You mean this one:
> 
> http://patchwork.ozlabs.org/patch/636208/
> 
> OK, so I'll toss the previous and will take this one instead.
> 
> As I said my opinion is that ether_addr_equal_masked() is only
> required by netfilter, but thinking it well I don't really mind in
> what header this function is placed given that these are our internal
> headers.

git am reports patch I get from patchwork is corrupt at line 37.
Tried a couple of tricks to fix it but this didn't work.

Would you mind resubmitting this patch?

Sorry for the inconvenience.

Re: [PATCH] bridge: netfilter: spanning tree: Add masked_ether_addr_equal and neatening

2016-06-24 Thread Pablo Neira Ayuso

On Thu, Jun 23, 2016 at 12:00:00PM -0700, Joe Perches wrote:
> On Thu, 2016-06-23 at 19:36 +0200, Pablo Neira Ayuso wrote:
> > On Wed, Jun 15, 2016 at 01:58:45PM -0700, Joe Perches wrote:
> > > 
> > > There is code duplication of a masked ethernet address comparison here
> > > so make it a separate function instead.
> > > 
> > > Miscellanea:
> > > 
> > > o Neaten alignment of FWINV macro uses to make it clearer for the reader
> > Applied, thanks.
> > 
> > > 
> > > Signed-off-by: Joe Perches 
> > > ---
> > > 
> > > This masked_ether_addr_equal function could go into etherdevice.h,
> > > but I don't see another use like it in kernel code.  Is there one?
> >
> > This is specific of iptables, not even nftables would use this. So I
> > would keep this in the iptables tree.
> 
> Did you see the other patch that adds a generic
> ether_addr_equal_masked() and uses it in a few
> more files?

You mean this one:

http://patchwork.ozlabs.org/patch/636208/

OK, so I'll toss the previous and will take this one instead.

As I said, my opinion is that ether_addr_equal_masked() is only
required by netfilter, but thinking it over I don't really mind
which header this function is placed in, given that these are our
internal headers.

Thanks.

Re: [RFC PATCH] gro: Partly revert "net: gro: allow to build full sized skb"

2016-06-24 Thread Steffen Klassert

Sorry for replying to old mail, but wanted to keep the context.

On Fri, Apr 22, 2016 at 10:14:22AM -0700, Alexander Duyck wrote:
> On Fri, Apr 22, 2016 at 1:51 AM, Steffen Klassert
>  wrote:
> > On Thu, Apr 21, 2016 at 09:02:48AM -0700, Alexander Duyck wrote:
> >> On Thu, Apr 21, 2016 at 12:40 AM, Steffen Klassert
> >>  wrote:
> >> > This partly reverts the below mentioned patch because on
> >> > forwarding, such skbs can't be offloaded to a NIC.
> >> >
> >> > We need this to get IPsec GRO for forwarding to work properly,
> >> > otherwise the GRO aggregated packets get segmented again by
> >> > the GSO layer. Although discovered when implementing IPsec GRO,
> >> > this is a general problem in the forwarding path.
> >>
> >> I'm confused as to why you would need this to get IPsec GRO forwarding
> >> to work.
> >
> > It works without this, but the performance numbers are not that good
> > if we have to do GSO in software.
> 
> Well really GSO is only meant to preform better than if we didn't do
> any GRO/GSO at all.  If that isn't the case I wouldn't consider it a
> regression since as Eric points out there are other scenerios where
> you end up with a chain of buffers stuck on the fraglist.  Mostly what
> GRO/GSO gets you is fewer runs through the stack.
> 
> >> Are you having to go through a device that doesn't have
> >> NETIF_F_FRAGLIST defined?
> >
> > I don't know of any NIC that can do TSO on a skbuff with fraglist,
> > that's why I try to avoid to have a buffer with fraglist.
> >
> 
> Most of them don't.  There are only one or two NICs out there that
> support transmitting a frame that has a fraglist.
> 
> >> Also what is the issue with having to go
> >> through the GSO layer on segmentation?  It seems like we might be able
> >> to do something like what we did with GSO partial to split frames so
> >> that they are in chunks that wouldn't require NETIF_F_FRAGLIST.  Then
> >> you could get the best of both worlds in that the stack would only
> >> process one super-frame, and the transmitter could TSO a series of
> >> frames that are some fixed MSS in size.
> >
> > This could be interesting. Then we could have a buffer with
> > fraglist, GSO layer splits in skbuffs without fraglist that
> > can be TSO offloaded. Something like this might solve my
> > performance problems.
> 
> Right.  It is something to think about.  I was considering what might
> be involved to make a fraglist based skb a GSO type.  Then we might be
> able to handle it kind of like what we do for the whole
> SKB_GSO_DODGY/NETIF_F_GSO_ROBUST path.  Basically if we just need to
> break the frame at the fraglist level it probably wouldn't be that
> hard to do assuming each skb is MSS aligned in terms of size.

I've tried to implement the idea to split buffers at the frag_list
pointer and ended up with the patch below. With this patch, the
SKB_GSO_PARTIAL case is not the only case where skb_segment() can
return a gso skb. I had to adapt some gso handlers to this, not
sure if I found all places where I have to do this. Works in my
case, but needs review and maybe some more sophisticated tests.

I could not benchmark this with big packet sizes because my
10G interfaces are the limiting factor then. So I did an iperf
forwarding test with the TCP mss reduced to 536 bytes.

Result with a recent net-next tree:

net-next 6.67 Gbits/sec

net-next + patch 8.20 Gbits/sec


Subject: [RFC PATCH] gso: Support partial splitting at the frag_list pointer

Since commit 8a29111c7 ("net: gro: allow to build full sized skb")
gro may build buffers with a frag_list. This can hurt forwarding
because most NICs can't offload such packets, they need to be
segmented in software. This patch splits buffers with a frag_list
at the frag_list pointer into buffers that can be TSO offloaded.

Signed-off-by: Steffen Klassert 
---
 net/core/skbuff.c  | 90 +-
 net/ipv4/af_inet.c |  7 ++--
 net/ipv4/gre_offload.c |  7 +++-
 net/ipv4/tcp_offload.c |  3 ++
 net/ipv4/udp_offload.c |  9 +++--
 net/ipv6/ip6_offload.c |  6 +++-
 6 files changed, 115 insertions(+), 7 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index e7ec6d3..093c3cd 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3096,6 +3096,93 @@ struct sk_buff *skb_segment(struct sk_buff *head_skb,
sg = !!(features & NETIF_F_SG);
csum = !!can_checksum_protocol(features, proto);
 
+   headroom = skb_headroom(head_skb);
+
+   if (list_skb && net_gso_ok(features, skb_shinfo(head_skb)->gso_type) &&
+   csum && sg && (mss != GSO_BY_FRAGS) &&
+   !(features & NETIF_F_GSO_PARTIAL)) {
+   unsigned int lskb_segs;
+   unsigned int delta_segs, delta_len, delta_truesize;
+   struct sk_buff *nskb;
+   delta_segs = delta_len = delta_truesize = 0;
+
+   segs = __alloc_skb(skb_headlen(head_skb) + headroom,
+  GFP_ATOMIC, skb_alloc

[PATCH v12 net-next 0/1] introduce Hyper-V VM Sockets(hv_sock)

2016-06-24 Thread Dexuan Cui

Hyper-V Sockets (hv_sock) supplies a byte-stream based communication
mechanism between the host and the guest. It's somewhat like TCP over
VMBus, but the transportation layer (VMBus) is much simpler than IP.

With Hyper-V Sockets, applications between the host and the guest can talk
to each other directly by the traditional BSD-style socket APIs.

Hyper-V Sockets is only available on new Windows hosts, like Windows Server
2016. More info is in this article "Make your own integration services":
https://msdn.microsoft.com/en-us/virtualization/hyperv_on_windows/develop/make_mgmt_service

The patch implements the necessary support in the guest side by
introducing a new socket address family AF_HYPERV.

You can also get the patch by:
https://github.com/dcui/linux/commits/decui/hv_sock/net-next/20160620_v12

Note: the VMBus driver side's supporting patches have been in the mainline
tree.

I know the kernel has already had a VM Sockets driver (AF_VSOCK) based
on VMware VMCI (net/vmw_vsock/, drivers/misc/vmw_vmci), and KVM is
proposing AF_VSOCK of virtio version:
http://marc.info/?l=linux-netdev&m=145952064004765&w=2

However, though Hyper-V Sockets may seem conceptually similar to
AF_VSOCK, there are differences in the transportation layer, and IMO these
make direct code reuse impractical:

1. In AF_VSOCK, the endpoint type is: <u32 ContextID, u32 Port>, but in
AF_HYPERV, the endpoint type is: <GUID VM_ID, GUID ServiceID>. Here GUID
is 128-bit.

2. AF_VSOCK supports SOCK_DGRAM, while AF_HYPERV doesn't.

3. AF_VSOCK supports some special sock opts, like SO_VM_SOCKETS_BUFFER_SIZE,
SO_VM_SOCKETS_BUFFER_MIN/MAX_SIZE and SO_VM_SOCKETS_CONNECT_TIMEOUT.
These are meaningless to AF_HYPERV.

4. Some AF_VSOCK's VMCI transportation ops are meaningless to AF_HYPERV/VMBus,
like .notify_recv_init
.notify_recv_pre_block
.notify_recv_pre_dequeue
.notify_recv_post_dequeue
.notify_send_init
.notify_send_pre_block
.notify_send_pre_enqueue
.notify_send_post_enqueue
etc.

So I think we'd better introduce a new address family: AF_HYPERV.

Please review the patch.

Looking forward to your comments, especially comments from David. :-)

Changes since v1:
- updated "[PATCH 6/7] hvsock: introduce Hyper-V VM Sockets feature"
- added __init and __exit for the module init/exit functions
- net/hv_sock/Kconfig: "default m" -> "default m if HYPERV"
- MODULE_LICENSE: "Dual MIT/GPL" -> "Dual BSD/GPL"

Changes since v2:
- fixed various coding issues pointed out by David Miller
- fixed indentation issues
- removed pr_debug in net/hv_sock/af_hvsock.c
- used reverse-Christmas-tree style for local variables.
- EXPORT_SYMBOL -> EXPORT_SYMBOL_GPL

Changes since v3:
- fixed a few coding issues pointed out by Vitaly Kuznetsov and Dan Carpenter
- fixed the ret value in vmbus_recvpacket_hvsock on error
- fixed the style of multi-line comment: vmbus_get_hvsock_rw_status()

Changes since v4 (https://lkml.org/lkml/2015/7/28/404):
- addressed all the comments about V4.
- treat the hvsock offers/channels as special VMBus devices
- add a mechanism to pass hvsock events to the hvsock driver
- fixed some corner cases with proper locking when a connection is closed
- rebased to the latest Greg's tree

Changes since v5 (https://lkml.org/lkml/2015/12/24/103):
- addressed the coding style issues (Vitaly Kuznetsov & David Miller, thanks!)
- used a better coding for the per-channel rescind callback (Thank Vitaly!)
- avoided the introduction of new VMBUS driver APIs vmbus_sendpacket_hvsock()
and vmbus_recvpacket_hvsock() and used vmbus_sendpacket()/vmbus_recvpacket()
in the higher level (i.e., the vmsock driver). Thank Vitaly!

Changes since v6 (http://lkml.iu.edu/hypermail/linux/kernel/1601.3/01813.html)
- only a few minor changes of coding style and comments

Changes since v7
- a few minor changes of coding style: thanks, Joe Perches!
- added some lines of comments about GUID/UUID before the struct sockaddr_hv.

Changes since v8
- removed the unnecessary __packed for some definitions: thanks, David!
- hvsock_open_connection: use offer.u.pipe.user_def[0] to know the connection
direction and reorganized the function
- reorganized the code according to suggestions from Cathy Avery: split big
functions into small ones, set .setsockopt and getsockopt to
sock_no_setsockopt/sock_no_getsockopt
- inline'd some small list helper functions

Changes since v9
- minimized struct hvsock_sock by making the send/recv buffers pointers.
the buffers are allocated by kmalloc() in __hvsock_create() now.
- minimized the sizes of the send/recv buffers and the vmbus ringbuffers.

Changes since v10

1) add module params: send_ring_page, recv_ring_page. They can be used to
enlarge the ringbuffer size to get better performance, e.g.,
# modprobe hv_sock recv_ring_page=16 send_ring_page=16
By default, recv_ring_page is 3 and send_ring_page is 2.

2) add module param max_socket_number (the default is 1024).
A user can enlarge the number to create more than 1024 hv_sock sockets.
By default, 1024 sockets take about 1024 * (3+2+1+1) * 4KB = 28M bytes.
(H

[PATCH] mac80211_hwsim: Added vendor echo command

2016-06-24 Thread Erik Stromdahl

The purpose of the echo command is to provide a test
facility for user space programs.

Signed-off-by: Erik Stromdahl 
---
 drivers/net/wireless/mac80211_hwsim.c |   27 +--
 1 file changed, 25 insertions(+), 2 deletions(-)

diff --git a/drivers/net/wireless/mac80211_hwsim.c 
b/drivers/net/wireless/mac80211_hwsim.c
index 4dd5adc..2e17bf1 100644
--- a/drivers/net/wireless/mac80211_hwsim.c
+++ b/drivers/net/wireless/mac80211_hwsim.c
@@ -332,14 +332,16 @@ static const struct ieee80211_rate hwsim_rates[] = {
 
 #define OUI_QCA 0x001374
 #define QCA_NL80211_SUBCMD_TEST 1
+#define QCA_NL80211_SUBCMD_ECHO 2
 enum qca_nl80211_vendor_subcmds {
QCA_WLAN_VENDOR_ATTR_TEST = 8,
-   QCA_WLAN_VENDOR_ATTR_MAX = QCA_WLAN_VENDOR_ATTR_TEST
+   QCA_WLAN_VENDOR_ATTR_ECHO,
+   QCA_WLAN_VENDOR_ATTR_MAX = QCA_WLAN_VENDOR_ATTR_ECHO
 };
 
 static const struct nla_policy
 hwsim_vendor_test_policy[QCA_WLAN_VENDOR_ATTR_MAX + 1] = {
-   [QCA_WLAN_VENDOR_ATTR_MAX] = { .type = NLA_U32 },
+   [QCA_WLAN_VENDOR_ATTR_TEST] = { .type = NLA_U32 },
 };
 
 static int mac80211_hwsim_vendor_cmd_test(struct wiphy *wiphy,
@@ -393,12 +395,33 @@ static int mac80211_hwsim_vendor_cmd_test(struct wiphy 
*wiphy,
return cfg80211_vendor_cmd_reply(skb);
 }
 
+static int mac80211_hwsim_vendor_cmd_echo(struct wiphy *wiphy,
+ struct wireless_dev *wdev,
+ const void *data, int data_len)
+{
+   struct sk_buff *skb;
+
+   skb = cfg80211_vendor_cmd_alloc_reply_skb(wiphy, data_len);
+   if (!skb)
+   return -ENOMEM;
+
+   nla_put(skb, QCA_WLAN_VENDOR_ATTR_ECHO, data_len, data);
+
+   return cfg80211_vendor_cmd_reply(skb);
+}
+
 static struct wiphy_vendor_command mac80211_hwsim_vendor_commands[] = {
{
.info = { .vendor_id = OUI_QCA,
  .subcmd = QCA_NL80211_SUBCMD_TEST },
.flags = WIPHY_VENDOR_CMD_NEED_NETDEV,
.doit = mac80211_hwsim_vendor_cmd_test,
+   },
+   {
+   .info = { .vendor_id = OUI_QCA,
+ .subcmd = QCA_NL80211_SUBCMD_ECHO },
+   .flags = WIPHY_VENDOR_CMD_NEED_NETDEV,
+   .doit = mac80211_hwsim_vendor_cmd_echo,
}
 };
 
-- 
1.7.9.5

1 2 >

1 - 100 of 101 matches

Mail list logo