Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-10 Thread Hangbin Liu
Hi David,
On Mon, Sep 10, 2018 at 08:39:34PM -0600, David Ahern wrote:
> On 9/10/18 7:04 PM, Hangbin Liu wrote:
> 
> > diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> > index 18e00ce..62621b4 100644
> > --- a/net/ipv6/route.c
> > +++ b/net/ipv6/route.c
> > @@ -998,6 +998,21 @@ static void ip6_rt_copy_init(struct rt6_info *rt, 
> > struct fib6_info *ort)
> > rt->rt6i_prefsrc = ort->fib6_prefsrc;
> >  }
> >  
> > +static void rt6_update_info(struct rt6_info *rt)
> > +{
> > +   struct fib6_info *from;
> > +
> > +   rcu_read_lock();
> > +   from = rcu_dereference(rt->from);
> > +   fib6_info_hold(from);
> > +   rcu_read_unlock();
> > +
> > +   from->fib6_flags = rt->rt6i_flags;
> > +   from->fib6_nh.nh_gw = rt->rt6i_gateway;
> 
> As I mentioned on your last patch, redirects do *not* update fib
> entries. Exceptions, yes, but not core data of a fib entry.

Thanks for the comments, I understand that we should not update original route.
And here I know the redirect (should?) do not update fib entries.

So Xin Long's patch looks more reasonable.

Thanks
Hangbin


Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-10 Thread David Ahern
On 9/10/18 7:04 PM, Hangbin Liu wrote:

> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 18e00ce..62621b4 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -998,6 +998,21 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct 
> fib6_info *ort)
>   rt->rt6i_prefsrc = ort->fib6_prefsrc;
>  }
>  
> +static void rt6_update_info(struct rt6_info *rt)
> +{
> + struct fib6_info *from;
> +
> + rcu_read_lock();
> + from = rcu_dereference(rt->from);
> + fib6_info_hold(from);
> + rcu_read_unlock();
> +
> + from->fib6_flags = rt->rt6i_flags;
> + from->fib6_nh.nh_gw = rt->rt6i_gateway;

As I mentioned on your last patch, redirects do *not* update fib
entries. Exceptions, yes, but not core data of a fib entry.


Re: [Patch net v2] rds: fix two RCU related problems

2018-09-10 Thread Santosh Shilimkar

On 9/10/2018 6:27 PM, Cong Wang wrote:

When a rds sock is bound, it is inserted into the bind_hash_table
which is protected by RCU. But when releasing rds sock, after it
is removed from this hash table, it is freed immediately without
respecting RCU grace period. This could cause some use-after-free
as reported by syzbot.

Mark the rds sock with SOCK_RCU_FREE before inserting it into the
bind_hash_table, so that it would be always freed after a RCU grace
period.

The other problem is in rds_find_bound(), the rds sock could be
freed in between rhashtable_lookup_fast() and rds_sock_addref(),
so we need to extend RCU read lock protection in rds_find_bound()
to close this race condition.

Reported-and-tested-by: syzbot+8967084bcac563795...@syzkaller.appspotmail.com
Reported-by: syzbot+93a5839deb3555374...@syzkaller.appspotmail.com
Cc: Sowmini Varadhan 
Cc: Santosh Shilimkar 
Cc: rds-de...@oss.oracle.com
Signed-off-by: Cong Wang 
---

Thank you !!
Acked-by: Santosh Shilimkar 


[Patch net v2] rds: fix two RCU related problems

2018-09-10 Thread Cong Wang
When a rds sock is bound, it is inserted into the bind_hash_table
which is protected by RCU. But when releasing rds sock, after it
is removed from this hash table, it is freed immediately without
respecting RCU grace period. This could cause some use-after-free
as reported by syzbot.

Mark the rds sock with SOCK_RCU_FREE before inserting it into the
bind_hash_table, so that it would be always freed after a RCU grace
period.

The other problem is in rds_find_bound(), the rds sock could be
freed in between rhashtable_lookup_fast() and rds_sock_addref(),
so we need to extend RCU read lock protection in rds_find_bound()
to close this race condition.

Reported-and-tested-by: syzbot+8967084bcac563795...@syzkaller.appspotmail.com
Reported-by: syzbot+93a5839deb3555374...@syzkaller.appspotmail.com
Cc: Sowmini Varadhan 
Cc: Santosh Shilimkar 
Cc: rds-de...@oss.oracle.com
Signed-off-by: Cong Wang 
---
 net/rds/bind.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 3ab55784b637..762d2c6788a3 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -76,11 +76,13 @@ struct rds_sock *rds_find_bound(const struct in6_addr 
*addr, __be16 port,
struct rds_sock *rs;
 
__rds_create_bind_key(key, addr, port, scope_id);
-   rs = rhashtable_lookup_fast(_hash_table, key, ht_parms);
+   rcu_read_lock();
+   rs = rhashtable_lookup(_hash_table, key, ht_parms);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
+   rcu_read_unlock();
 
rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
 ntohs(port));
@@ -235,6 +237,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
goto out;
}
 
+   sock_set_flag(sk, SOCK_RCU_FREE);
ret = rds_add_bound(rs, binding_addr, , scope_id);
if (ret)
goto out;
-- 
2.14.4



[PATCHv2 iproute2] bridge/mdb: fix missing new line when show bridge mdb

2018-09-10 Thread Hangbin Liu
The bridge mdb show is broken on current iproute2. e.g.
]# bridge mdb show
34: br0  veth0_br  224.1.1.2  temp 34: br0  veth0_br  224.1.1.1  temp

After fix:
]# bridge mdb show
34: br0  veth0_br  224.1.1.2  temp
34: br0  veth0_br  224.1.1.1  temp

v2: use json print lib as Stephen suggested.

Reported-by: Ying Xu 
Fixes: c7c1a1ef51aea ("bridge: colorize output and use JSON print library")
Signed-off-by: Hangbin Liu 
---
 bridge/mdb.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/bridge/mdb.c b/bridge/mdb.c
index f38dc67..408117d 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -107,6 +107,10 @@ static void br_print_router_ports(FILE *f, struct rtattr 
*attr,
fprintf(f, "%s ", port_ifname);
}
}
+
+   if (!is_json_context() && !show_stats)
+   print_string(PRINT_FP, NULL, "\n", NULL);
+
close_json_array(PRINT_JSON, NULL);
 }
 
@@ -164,6 +168,10 @@ static void print_mdb_entry(FILE *f, int ifindex, const 
struct br_mdb_entry *e,
print_string(PRINT_ANY, "timer", " %s",
 format_timer(timer));
}
+
+   if (!is_json_context())
+   print_string(PRINT_FP, NULL, "\n", NULL);
+
close_json_object();
 }
 
-- 
2.5.5



Re: [PATCH net-next] virtio_net: ethtool tx napi configuration

2018-09-10 Thread Willem de Bruijn
> >> I cook a fixup, and it looks works in my setup:
> >>
> >> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >> index b320b6b14749..9181c3f2f832 100644
> >> --- a/drivers/net/virtio_net.c
> >> +++ b/drivers/net/virtio_net.c
> >> @@ -2204,10 +2204,17 @@ static int virtnet_set_coalesce(struct
> >> net_device *dev,
> >>   return -EINVAL;
> >>
> >>   if (napi_weight ^ vi->sq[0].napi.weight) {
> >> -   if (dev->flags & IFF_UP)
> >> -   return -EBUSY;
> >> -   for (i = 0; i < vi->max_queue_pairs; i++)
> >> +   for (i = 0; i < vi->max_queue_pairs; i++) {
> >> +   struct netdev_queue *txq =
> >> +  netdev_get_tx_queue(vi->dev, i);
> >> +
> >> + virtnet_napi_tx_disable(>sq[i].napi);
> >> +   __netif_tx_lock_bh(txq);
> >>   vi->sq[i].napi.weight = napi_weight;
> >> +   __netif_tx_unlock_bh(txq);
> >> +   virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> >> + >sq[i].napi);
> >> +   }
> >>   }
> >>
> >>   return 0;
> > Thanks! It passes my simple stress test, too. Which consists of two
> > concurrent loops, one toggling the ethtool option, another running
> > TCP_RR.
> >
> >> The only left case is the speculative tx polling in RX NAPI. I think we
> >> don't need to care in this case since it was not a must for correctness.
> > As long as the txq lock is held that will be a noop, anyway. The other
> > concurrent action is skb_xmit_done. It looks correct to me, but need
> > to think about it a bit. The tricky transition is coming out of napi without
> > having >= 2 + MAX_SKB_FRAGS clean descriptors. If the queue is
> > stopped it may deadlock transmission in no-napi mode.
>
> Yes, maybe we can enable tx queue when napi weight is zero in
> virtnet_poll_tx().

Yes, that precaution should resolve that edge case.

> Let me do more stress test on this.
>
> >
> >>> Link: https://patchwork.ozlabs.org/patch/948149/
> >>> Suggested-by: Jason Wang 
> >>> Signed-off-by: Willem de Bruijn 
> >>> ---
> >>>drivers/net/virtio_net.c | 52 
> >>>1 file changed, 52 insertions(+)
> >>>
> >>> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> >>> index 765920905226..b320b6b14749 100644
> >>> --- a/drivers/net/virtio_net.c
> >>> +++ b/drivers/net/virtio_net.c
> >>> @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
> >>>
> >>>#define VIRTNET_DRIVER_VERSION "1.0.0"
> >>>
> >>> +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
> >>> +
> >>>static const unsigned long guest_offloads[] = {
> >>>VIRTIO_NET_F_GUEST_TSO4,
> >>>VIRTIO_NET_F_GUEST_TSO6,
> >>> @@ -2181,6 +2183,54 @@ static int virtnet_get_link_ksettings(struct 
> >>> net_device *dev,
> >>>return 0;
> >>>}
> >>>
> >>> +static int virtnet_set_coalesce(struct net_device *dev,
> >>> + struct ethtool_coalesce *ec)
> >>> +{
> >>> + const struct ethtool_coalesce ec_default = {
> >>> + .cmd = ETHTOOL_SCOALESCE,
> >>> + .rx_max_coalesced_frames = 1,
> >> I think rx part is no necessary.
> > The definition of ethtool_coalesce has:
> >
> > "* It is illegal to set both usecs and max_frames to zero as this
> >   * would cause interrupts to never be generated.  To disable
> >   * coalescing, set usecs = 0 and max_frames = 1."
> >
> > I'd rather not diverge from this prescribed behavior unless there's
> > a strong reason.
>
> I get this.
>
> >
> > On the related point in the other thread:
> >
> >> Rethink about this, how about something like:
> >>
> >> - UINT_MAX: no tx interrupt
> >> - other value: tx interrupt with possible interrupt moderation
> > Okay, that will be simpler to configure.
>
> I wonder maybe we can use ethtool_coalesce definition. E.g usecs = 0 &&
> max_frame = 0 means no interrupt? Just a thought, I'm ok with either.

Come to think of it, max_frames 0 is no worse than max_frames
UINT_MAX. Sure, let's do that.


Re: [RFC PATCH bpf-next v2 0/4] Implement bpf queue/stack maps

2018-09-10 Thread Alexei Starovoitov
On Fri, Sep 7, 2018 at 1:40 PM, Mauricio Vasquez
 wrote:
>
> I read the Joe's proposal and using that for this problem looks like a nice
> solution.
>
> I think a good trade-off for now would be to go ahead with a queue/stack map
> without preallocating support (or maybe include it having always in mind
> that this issue has to be solved in the near future) and then, as a
> separated work, try to use Joe's proposal in the map helpers.
>
> What do you think?

the problem with such approach is that it would mean that
non-prealloc stack/queue api will be different from future one
after verifier will get smarter.
The alternative would be to support by-value api only.
Meaning let stack/queue support value_size = 1,2,4,8 byte only.
Then bpf_push|pop_elem() helper will be by-value
instead of returning a pointer.
No rcu callback issues and implementation on the kernel
side can be much simpler.
I think simple array of value_size elems with head/tail indices
will be enough.
Once we have Joe's verifier improvements
we can add alloc/free bpf object management facility
and use 8-byte stack/queue mapas a stack of pointers.
I think decoupling memory operations alloc/free from
stack push/pop would be additional benefit of such design.


Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-10 Thread Hangbin Liu
On Mon, Sep 10, 2018 at 01:07:11PM -0600, David Ahern wrote:
> On 9/10/18 11:55 AM, Xin Long wrote:
> > On Tue, Sep 11, 2018 at 12:13 AM David Ahern  
> > wrote:
> >>
> >> On 9/9/18 12:29 AM, Xin Long wrote:
> > diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> > index 18e00ce..e554922 100644
> > --- a/net/ipv6/route.c
> > +++ b/net/ipv6/route.c
> > @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, 
> > struct sk_buff *skb,
> >int iif, int type, u32 portid, u32 seq,
> >unsigned int flags)
> >  {
> > - struct rtmsg *rtm;
> > + struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
> > + struct rt6_info *rt6 = (struct rt6_info *)dst;
> > + u32 *pmetrics, table, fib6_flags;
> >   struct nlmsghdr *nlh;
> > + struct rtmsg *rtm;
> >   long expires = 0;
> > - u32 *pmetrics;
> > - u32 table;
> >
> >   nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
> >   if (!nlh)
> >   return -EMSGSIZE;
> >
> > + if (rt6) {
> > + fib6_dst = >rt6i_dst;
> > + fib6_src = >rt6i_src;
> > + fib6_flags = rt6->rt6i_flags;
> > + fib6_prefsrc = >rt6i_prefsrc;
> > + } else {
> > + fib6_dst = >fib6_dst;
> > + fib6_src = >fib6_src;
> > + fib6_flags = rt->fib6_flags;
> > + fib6_prefsrc = >fib6_prefsrc;
> > + }
> 
>  Unless I am missing something at the moment, an rt6_info can only have
>  the same dst, src and prefsrc as the fib6_info on which it is based.
>  Thus, only the flags is needed above. That simplifies this patch a lot.
> >>> If dst, src and prefsrc in rt6_info are always the same as these in 
> >>> fib6_info,
> >>> why do we need them in rt6_info? we could just get it by 'from'.
> >>>
> >>
> >> I just sent a patch removing rt6i_prefsrc. It is set with only 1 reader
> >> that can be converted.
> >>
> >> rt6i_src is checked against the fib6_info to invalidate a dst if the src
> >> has changed, so a valid rt will always have the same rt6i_src as the
> >> rt->from.
> >>
> >> rt6i_dst is set to the dest address / 128 in cases, so it should be used
> >> for rt6_info cases above.
> > So that means, I will use rt6i_dst and rt6i_flags when dst is set?
> > how about I use rt6i_src there as well? just to make it look clear.
> > and plus the gw/nh dump fix in rt6_fill_node():
> > -if (rt->fib6_nsiblings) {
> > +if (rt6) {
> > +if (fib6_flags & RTF_GATEWAY)
> > +if (nla_put_in6_addr(skb, RTA_GATEWAY,
> > + >rt6i_gateway) < 0)
> > +goto nla_put_failure;
> > +
> > +if (dst->dev && nla_put_u32(skb, RTA_OIF, 
> > dst->dev->ifindex))
> > +goto nla_put_failure;
> > +} else if (rt->fib6_nsiblings) {
> >  struct fib6_info *sibling, *next_sibling;
> >  struct nlattr *mp;
> > 
> > looks good to you?
> > 
> 
> sure

Hmm... Then how to deal the other nh info filled by rt6_nexthop_info()?
I was planed to fix the gw issue[1] by syncing the gw and flags info. Like

diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 18e00ce..62621b4 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -998,6 +998,21 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct 
fib6_info *ort)
rt->rt6i_prefsrc = ort->fib6_prefsrc;
 }
 
+static void rt6_update_info(struct rt6_info *rt)
+{
+   struct fib6_info *from;
+
+   rcu_read_lock();
+   from = rcu_dereference(rt->from);
+   fib6_info_hold(from);
+   rcu_read_unlock();
+
+   from->fib6_flags = rt->rt6i_flags;
+   from->fib6_nh.nh_gw = rt->rt6i_gateway;
+
+   fib6_info_release(from);
+}
+
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
struct in6_addr *saddr)
 {
@@ -3446,6 +3463,8 @@ static void rt6_do_redirect(struct dst_entry *dst, struct 
sock *sk, struct sk_bu
goto out;
}
 
+   rt6_update_info(nrt);
+
netevent.old = >dst;
netevent.new = >dst;
netevent.daddr = >dest;
-- 
2.5.5


[1] https://patchwork.ozlabs.org/patch/966972/

Thanks
Hangbin


[PATCH][net-next][v2] netlink: remove hash::nelems check in netlink_insert

2018-09-10 Thread Li RongQing
The type of hash::nelems has been changed from size_t to atom_t
which in fact is int, so not need to check if BITS_PER_LONG, that
is bit number of size_t, is bigger than 32

and rht_grow_above_max() will be called to check if hashtable is
too big, ensure it can not bigger than 1<<31

Signed-off-by: Zhang Yu 
Signed-off-by: Li RongQing 
---
 net/netlink/af_netlink.c | 5 -
 1 file changed, 5 deletions(-)

diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
index b4a29bcc33b9..e3a0538ec0be 100644
--- a/net/netlink/af_netlink.c
+++ b/net/netlink/af_netlink.c
@@ -574,11 +574,6 @@ static int netlink_insert(struct sock *sk, u32 portid)
if (nlk_sk(sk)->bound)
goto err;
 
-   err = -ENOMEM;
-   if (BITS_PER_LONG > 32 &&
-   unlikely(atomic_read(>hash.nelems) >= UINT_MAX))
-   goto err;
-
nlk_sk(sk)->portid = portid;
sock_hold(sk);
 
-- 
2.16.2



Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Cong Wang
On Mon, Sep 10, 2018 at 5:56 PM Santosh Shilimkar
 wrote:
>
> On 9/10/2018 5:45 PM, Cong Wang wrote:
> > On Mon, Sep 10, 2018 at 5:26 PM Santosh Shilimkar
> >  wrote:
> >> Would you mind posting an updated patch please with call_rcu and
> >> above extended RCU grace period with rcu_read_lock. Thanks !!
> >
> > If you prefer to fix _two_ problems in one patch, sure.
> >
> > For the record, the bug this patch fixes is NOT same with the one
> > in rds_find_bound(), because there is no rds_find_bound() in
> > the backtrace. If you want to see the full backtrace, here it is:
> >
> > https://marc.info/?l=linux-netdev=15364808962=2
> >
> > This is why I believe they are two problems.
> >
> > Whether fixing two problems in one patch or two patches is
> > merely a preference, I leave it up to you.
> >
> I had a partial fix as well in past but since it wasn't covering
> complete window, it was abandoned.
>
> Since we know the other race, it would be best to address it
> together and hence requested a combine patch. Thanks for help !!

No problem! v2 is coming shortly after passing syzbot test. :)


Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Santosh Shilimkar

On 9/10/2018 5:45 PM, Cong Wang wrote:

On Mon, Sep 10, 2018 at 5:26 PM Santosh Shilimkar
 wrote:

Would you mind posting an updated patch please with call_rcu and
above extended RCU grace period with rcu_read_lock. Thanks !!


If you prefer to fix _two_ problems in one patch, sure.

For the record, the bug this patch fixes is NOT same with the one
in rds_find_bound(), because there is no rds_find_bound() in
the backtrace. If you want to see the full backtrace, here it is:

https://marc.info/?l=linux-netdev=15364808962=2

This is why I believe they are two problems.

Whether fixing two problems in one patch or two patches is
merely a preference, I leave it up to you.


I had a partial fix as well in past but since it wasn't covering
complete window, it was abandoned.

Since we know the other race, it would be best to address it
together and hence requested a combine patch. Thanks for help !!

Regards,
Santosh


Re: [net-next, PATCH 2/2, v1] net: socionext: add AF_XDP support

2018-09-10 Thread Toshiaki Makita
On 2018/09/11 1:21, Ilias Apalodimas wrote:
>>> @@ -707,6 +731,26 @@ static int netsec_process_rx(struct netsec_priv *priv, 
>>> int budget)
>>> if (unlikely(!buf_addr))
>>> break;
>>>  
>>> +   if (xdp_prog) {
>>> +   xdp_result = netsec_run_xdp(desc, priv, xdp_prog,
>>> +   pkt_len);
>>> +   if (xdp_result != NETSEC_XDP_PASS) {
>>> +   xdp_flush |= xdp_result & NETSEC_XDP_REDIR;
>>> +
>>> +   dma_unmap_single_attrs(priv->dev,
>>> +  desc->dma_addr,
>>> +  desc->len, DMA_TO_DEVICE,
>>> +  DMA_ATTR_SKIP_CPU_SYNC);
>>> +
>>> +   desc->len = desc_len;
>>> +   desc->dma_addr = dma_handle;
>>> +   desc->addr = buf_addr;
>>> +   netsec_rx_fill(priv, idx, 1);
>>> +   nsetsec_adv_desc(>tail);
>>> +   }
>>> +   continue;
>>
>> Continue even on XDP_PASS? Is this really correct?
>>
>> Also seems there is no handling of adjust_head/tail for XDP_PASS case.
>>
> A question on this. Should XDP related frames be allocated using 1 page
> per packet?

AFAIK there is no such constraint, e.g. i40e allocates 1 page per 2 packets.

-- 
Toshiaki Makita



Re: [PATCH net-next] virtio_net: ethtool tx napi configuration

2018-09-10 Thread Jason Wang




On 2018年09月10日 21:35, Willem de Bruijn wrote:

On Mon, Sep 10, 2018 at 2:01 AM Jason Wang  wrote:



On 2018年09月10日 06:44, Willem de Bruijn wrote:

From: Willem de Bruijn 

Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
Interrupt moderation is currently not supported, so these accept and
display the default settings of 0 usec and 1 frame.

Toggle tx napi through a bit in tx-frames. So as to not interfere
with possible future interrupt moderation, use bit 10, well outside
the reasonable range of real interrupt moderation values.

Changes are not atomic. The tx IRQ, napi BH and transmit path must
be quiesced when switching modes. Only allow changing this setting
when the device is down.

I cook a fixup, and it looks works in my setup:

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index b320b6b14749..9181c3f2f832 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2204,10 +2204,17 @@ static int virtnet_set_coalesce(struct
net_device *dev,
  return -EINVAL;

  if (napi_weight ^ vi->sq[0].napi.weight) {
-   if (dev->flags & IFF_UP)
-   return -EBUSY;
-   for (i = 0; i < vi->max_queue_pairs; i++)
+   for (i = 0; i < vi->max_queue_pairs; i++) {
+   struct netdev_queue *txq =
+  netdev_get_tx_queue(vi->dev, i);
+
+ virtnet_napi_tx_disable(>sq[i].napi);
+   __netif_tx_lock_bh(txq);
  vi->sq[i].napi.weight = napi_weight;
+   __netif_tx_unlock_bh(txq);
+   virtnet_napi_tx_enable(vi, vi->sq[i].vq,
+ >sq[i].napi);
+   }
  }

  return 0;

Thanks! It passes my simple stress test, too. Which consists of two
concurrent loops, one toggling the ethtool option, another running
TCP_RR.


The only left case is the speculative tx polling in RX NAPI. I think we
don't need to care in this case since it was not a must for correctness.

As long as the txq lock is held that will be a noop, anyway. The other
concurrent action is skb_xmit_done. It looks correct to me, but need
to think about it a bit. The tricky transition is coming out of napi without
having >= 2 + MAX_SKB_FRAGS clean descriptors. If the queue is
stopped it may deadlock transmission in no-napi mode.


Yes, maybe we can enable tx queue when napi weight is zero in 
virtnet_poll_tx(). Let me do more stress test on this.





Link: https://patchwork.ozlabs.org/patch/948149/
Suggested-by: Jason Wang 
Signed-off-by: Willem de Bruijn 
---
   drivers/net/virtio_net.c | 52 
   1 file changed, 52 insertions(+)

diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
index 765920905226..b320b6b14749 100644
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)

   #define VIRTNET_DRIVER_VERSION "1.0.0"

+static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
+
   static const unsigned long guest_offloads[] = {
   VIRTIO_NET_F_GUEST_TSO4,
   VIRTIO_NET_F_GUEST_TSO6,
@@ -2181,6 +2183,54 @@ static int virtnet_get_link_ksettings(struct net_device 
*dev,
   return 0;
   }

+static int virtnet_set_coalesce(struct net_device *dev,
+ struct ethtool_coalesce *ec)
+{
+ const struct ethtool_coalesce ec_default = {
+ .cmd = ETHTOOL_SCOALESCE,
+ .rx_max_coalesced_frames = 1,

I think rx part is no necessary.

The definition of ethtool_coalesce has:

"* It is illegal to set both usecs and max_frames to zero as this
  * would cause interrupts to never be generated.  To disable
  * coalescing, set usecs = 0 and max_frames = 1."

I'd rather not diverge from this prescribed behavior unless there's
a strong reason.


I get this.



On the related point in the other thread:


Rethink about this, how about something like:

- UINT_MAX: no tx interrupt
- other value: tx interrupt with possible interrupt moderation

Okay, that will be simpler to configure.


I wonder maybe we can use ethtool_coalesce definition. E.g usecs = 0 && 
max_frame = 0 means no interrupt? Just a thought, I'm ok with either.


Thanks


Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Cong Wang
On Mon, Sep 10, 2018 at 5:26 PM Santosh Shilimkar
 wrote:
> Would you mind posting an updated patch please with call_rcu and
> above extended RCU grace period with rcu_read_lock. Thanks !!

If you prefer to fix _two_ problems in one patch, sure.

For the record, the bug this patch fixes is NOT same with the one
in rds_find_bound(), because there is no rds_find_bound() in
the backtrace. If you want to see the full backtrace, here it is:

https://marc.info/?l=linux-netdev=15364808962=2

This is why I believe they are two problems.

Whether fixing two problems in one patch or two patches is
merely a preference, I leave it up to you.


Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Cong Wang
On Mon, Sep 10, 2018 at 5:24 PM Sowmini Varadhan
 wrote:
>
> On (09/10/18 17:16), Cong Wang wrote:
> > >
> > > On (09/10/18 16:51), Cong Wang wrote:
> > > >
> > > > __rds_create_bind_key(key, addr, port, scope_id);
> > > > -   rs = rhashtable_lookup_fast(_hash_table, key, ht_parms);
> > > > +   rcu_read_lock();
> > > > +   rs = rhashtable_lookup(_hash_table, key, ht_parms);
> > > > if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
> > > > rds_sock_addref(rs);
> > > > else
> > > > rs = NULL;
> > > > +   rcu_read_unlock();
> > >
> > > aiui, the rcu_read lock/unlock is only useful if the write
> > > side doing destructive operations  does something to make sure readers
> > > are done before doing the destructive opertion. AFAIK, that does
> > > not exist for rds socket management today
> >
> > That is exactly why we need it here, right?
>
> Maybe I am confused, what exactly is the patch you are proposing?
>
> Does it have the SOCK_RCU_FREE change?

Yes, that patch is obviously on top of this patch.


> Does it have the rcu_read_lock you have above?
> Where is the call_rcu?

Sure, as it is on top of this patch, the call_rcu() is
here:

void sk_destruct(struct sock *sk)
{
if (sock_flag(sk, SOCK_RCU_FREE))
call_rcu(>sk_rcu, __sk_destruct);
else
__sk_destruct(>sk_rcu);
}


>
> > Hmm, so you are saying synchronize_rcu() is kinda more correct
> > than call_rcu()??
>
>
> I'm not saying that, I'm asking "what exactly is the patch
> you are proposing?" The only one on record is
>http://patchwork.ozlabs.org/patch/968282/
> which does not have either synchronize_rcu or call_rcu.

It is very obviously on top of this patch ($subject).


>
> > I never hear this before, would like to know why.
>
> Please post precise patches first.

Already showed you precisely what is is, just on top
of this one.


Re: unexpected GRO/veth behavior

2018-09-10 Thread Toshiaki Makita
On 2018/09/10 23:56, Eric Dumazet wrote:
> On 09/10/2018 07:44 AM, Paolo Abeni wrote:
>> hi all,
>>
>> while testing some local patches I observed that the TCP tput in the
>> following scenario:
>>
>> # the following enable napi on veth0, so that we can trigger the
>> # GRO path with namespaces
>> ip netns add test
>> ip link add type veth
>> ip link set dev veth0 netns test
>> ip -n test link set lo up
>> ip -n test link set veth0 up
>> ip -n test addr add dev veth0 172.16.1.2/24
>> ip link set dev veth1 up
>> ip addr add dev veth1 172.16.1.1/24
>> IDX=`ip netns exec test cat /sys/class/net/veth0/ifindex`
>>
>> # 'xdp_pass' is a NO-OP XDP program that simply return XDP_PASS
>> ip netns exec test ./xdp_pass $IDX &
>> taskset 0x2 ip netns exec test iperf3 -s -i 60 &
>> taskset 0x1 iperf3 -c 172.16.1.2 -t 60 -i 60
>>
>> is quite lower than expected (~800Mbps). 'perf' shows a weird topmost 
>> offender:
>>
> 
> 
> But... why GRO would even be needed in this scenario ?
> 
> GRO is really meant for physical devices, having to mess with skb->sk adds 
> extra cost
> in this already heavy cost engine.
> 
> Virtual devices should already be fed with TSO packets.

Because XDP does not have SG feature (GRO path in veth is used only when
XDP is enabled).

I have tested configuration like this:

NIC ---(XDP_REDIRECT)---> veth===veth (XDP_PASS)

GRO seems to work and improves TCP throughput in this case.


Now I noticed I did not test:

netperf -> veth===veth (XDP_PASS) -> netserver

which I think is the case where Paolo faces a problem.

I think it is not the case XDP can improve performance. I think I can
disable GRO for packets with skb->sk != NULL in veth.

-- 
Toshiaki Makita



Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Santosh Shilimkar

On 9/10/2018 5:16 PM, Cong Wang wrote:

On Mon, Sep 10, 2018 at 5:04 PM Sowmini Varadhan
 wrote:


On (09/10/18 16:51), Cong Wang wrote:


 __rds_create_bind_key(key, addr, port, scope_id);
-   rs = rhashtable_lookup_fast(_hash_table, key, ht_parms);
+   rcu_read_lock();
+   rs = rhashtable_lookup(_hash_table, key, ht_parms);
 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
 rds_sock_addref(rs);
 else
 rs = NULL;
+   rcu_read_unlock();


aiui, the rcu_read lock/unlock is only useful if the write
side doing destructive operations  does something to make sure readers
are done before doing the destructive opertion. AFAIK, that does
not exist for rds socket management today


That is exactly why we need it here, right?

As you mentioned previously, the sock could be freed after
rhashtable_lookup_fast() but before rds_sock_addref(), extending
the RCU read section after rds_sock_addref() exactly solves
the problem here.


Thats the case really.


Or is there any other destructive problem you didn't mention?
Mind to be specific?






Although sock release path is not a very hot path, but blocking
it isn't a good idea either, especially when you can use call_rcu(),
which has the same effect.

I don't see any reason we should prefer synchronize_rcu() here.


Usually correctness (making sure all readers are done, before nuking a
data structure) is a little bit more important than perforamance, aka
"safety before speed" is what I've always been taught.



Hmm, so you are saying synchronize_rcu() is kinda more correct
than call_rcu()?? I never hear this before, would like to know why.

To my best knowledge, the only difference between them is the context,
one is blocking, the other is non-blocking. Their correctness must be
equivalent.


We have burn our hands with blocking synchronize_rcu() and that was
actually the main reason we moved to rw locks from rcu to localise
the cost. call_rcu()should be just fine as long as it plugs the hole.
I don't want to add blocking in this path since this slows
down the connection shutdown path and at times lead to soft dumps.

Would you mind posting an updated patch please with call_rcu and
above extended RCU grace period with rcu_read_lock. Thanks !!

Regards,
Santosh

Regards,
Santosh


Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Sowmini Varadhan
On (09/10/18 17:16), Cong Wang wrote:
> >
> > On (09/10/18 16:51), Cong Wang wrote:
> > >
> > > __rds_create_bind_key(key, addr, port, scope_id);
> > > -   rs = rhashtable_lookup_fast(_hash_table, key, ht_parms);
> > > +   rcu_read_lock();
> > > +   rs = rhashtable_lookup(_hash_table, key, ht_parms);
> > > if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
> > > rds_sock_addref(rs);
> > > else
> > > rs = NULL;
> > > +   rcu_read_unlock();
> >
> > aiui, the rcu_read lock/unlock is only useful if the write
> > side doing destructive operations  does something to make sure readers
> > are done before doing the destructive opertion. AFAIK, that does
> > not exist for rds socket management today
> 
> That is exactly why we need it here, right?

Maybe I am confused, what exactly is the patch you are proposing?

Does it have the SOCK_RCU_FREE change? 
Does it have the rcu_read_lock you have above? 
Where is the call_rcu?

> Hmm, so you are saying synchronize_rcu() is kinda more correct
> than call_rcu()??  


I'm not saying that, I'm asking "what exactly is the patch
you are proposing?" The only one on record is 
   http://patchwork.ozlabs.org/patch/968282/
which does not have either synchronize_rcu or call_rcu.

> I never hear this before, would like to know why.

Please post precise patches first.

Thanks.



[PATCH net-next] scsi: libcxgbi: fib6_ino reference in rt6_info is rcu protected

2018-09-10 Thread dsahern
From: David Ahern 

The fib6_info reference in rt6_info is rcu protected. Add a helper
to extract prefsrc from and update cxgbi_check_route6 to use it.

Fixes: 0153167aebd0 ("net/ipv6: Remove rt6i_prefsrc")
Reported-by: kbuild test robot 
Signed-off-by: David Ahern 
---
 drivers/scsi/cxgbi/libcxgbi.c |  5 ++---
 include/net/ip6_fib.h | 19 +++
 2 files changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 6b3ea50c594e..75f876409fb9 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -784,7 +784,8 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
csk->mtu = mtu;
csk->dst = dst;
 
-   if (!rt->from || ipv6_addr_any(>from->fib6_prefsrc.addr)) {
+   rt6_get_prefsrc(rt, _saddr);
+   if (ipv6_addr_any(_saddr)) {
struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt);
 
err = ipv6_dev_get_saddr(_net, idev ? idev->dev : NULL,
@@ -794,8 +795,6 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
>sin6_addr);
goto rel_rt;
}
-   } else {
-   pref_saddr = rt->from->fib6_prefsrc.addr;
}
 
csk->csk_family = AF_INET6;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index c7496663f99a..f06e968f1992 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -412,6 +412,25 @@ int fib6_add(struct fib6_node *root, struct fib6_info *rt,
 struct nl_info *info, struct netlink_ext_ack *extack);
 int fib6_del(struct fib6_info *rt, struct nl_info *info);
 
+static inline
+void rt6_get_prefsrc(const struct rt6_info *rt, struct in6_addr *addr)
+{
+   const struct fib6_info *from;
+
+   rcu_read_lock();
+
+   from = rcu_dereference(rt->from);
+   if (from) {
+   *addr = from->fib6_prefsrc.addr;
+   } else {
+   struct in6_addr in6_zero = {};
+
+   *addr = in6_zero;
+   }
+
+   rcu_read_unlock();
+}
+
 static inline struct net_device *fib6_info_nh_dev(const struct fib6_info *f6i)
 {
return f6i->fib6_nh.nh_dev;
-- 
2.11.0



Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Cong Wang
On Mon, Sep 10, 2018 at 5:04 PM Sowmini Varadhan
 wrote:
>
> On (09/10/18 16:51), Cong Wang wrote:
> >
> > __rds_create_bind_key(key, addr, port, scope_id);
> > -   rs = rhashtable_lookup_fast(_hash_table, key, ht_parms);
> > +   rcu_read_lock();
> > +   rs = rhashtable_lookup(_hash_table, key, ht_parms);
> > if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
> > rds_sock_addref(rs);
> > else
> > rs = NULL;
> > +   rcu_read_unlock();
>
> aiui, the rcu_read lock/unlock is only useful if the write
> side doing destructive operations  does something to make sure readers
> are done before doing the destructive opertion. AFAIK, that does
> not exist for rds socket management today

That is exactly why we need it here, right?

As you mentioned previously, the sock could be freed after
rhashtable_lookup_fast() but before rds_sock_addref(), extending
the RCU read section after rds_sock_addref() exactly solves
the problem here.

Or is there any other destructive problem you didn't mention?
Mind to be specific?


>
>
> > Although sock release path is not a very hot path, but blocking
> > it isn't a good idea either, especially when you can use call_rcu(),
> > which has the same effect.
> >
> > I don't see any reason we should prefer synchronize_rcu() here.
>
> Usually correctness (making sure all readers are done, before nuking a
> data structure) is a little bit more important than perforamance, aka
> "safety before speed" is what I've always been taught.
>

Hmm, so you are saying synchronize_rcu() is kinda more correct
than call_rcu()?? I never hear this before, would like to know why.

To my best knowledge, the only difference between them is the context,
one is blocking, the other is non-blocking. Their correctness must be
equivalent.


Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Sowmini Varadhan
On (09/10/18 16:51), Cong Wang wrote:
> 
> __rds_create_bind_key(key, addr, port, scope_id);
> -   rs = rhashtable_lookup_fast(_hash_table, key, ht_parms);
> +   rcu_read_lock();
> +   rs = rhashtable_lookup(_hash_table, key, ht_parms);
> if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
> rds_sock_addref(rs);
> else
> rs = NULL;
> +   rcu_read_unlock();

aiui, the rcu_read lock/unlock is only useful if the write
side doing destructive operations  does something to make sure readers
are done before doing the destructive opertion. AFAIK, that does
not exist for rds socket management today


> Although sock release path is not a very hot path, but blocking
> it isn't a good idea either, especially when you can use call_rcu(),
> which has the same effect.
> 
> I don't see any reason we should prefer synchronize_rcu() here.

Usually correctness (making sure all readers are done, before nuking a
data structure) is a little bit more important than perforamance, aka
"safety before speed" is what I've always been taught.  

Clearly, your mileage varies. As you please.

--Sowmini




Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Cong Wang
On Mon, Sep 10, 2018 at 4:30 PM Sowmini Varadhan
 wrote:
>
> On (09/10/18 15:43), Santosh Shilimkar wrote:
> > On 9/10/2018 3:24 PM, Cong Wang wrote:
> > >When a rds sock is bound, it is inserted into the bind_hash_table
> > >which is protected by RCU. But when releasing rd sock, after it
> > >is removed from this hash table, it is freed immediately without
> > >respecting RCU grace period. This could cause some use-after-free
> > >as reported by syzbot.
> > >
> > Indeed.
> >
> > >Mark the rds sock as SOCK_RCU_FREE before inserting it into the
> > >bind_hash_table, so that it would be always freed after a RCU grace
> > >period.
>
> So I'm not sure I understand.
>
> Yes, Cong's fix may eliminate *some* of the syzbot failures, but the
> basic problem is not solved.
>
> To take one example of possible races (one that was discussed in
>   https://www.spinics.net/lists/netdev/msg475074.html)
> rds_recv_incoming->rds_find_bound is being called in rds_send_worker
> context and the rds_find_bound code is
>
>  63 rs = rhashtable_lookup_fast(_hash_table, , ht_parms);
>  64 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
>  65 rds_sock_addref(rs);
>  66 else
>  67 rs = NULL;
>  68
>
> After we find an rs at line 63, how can we be sure that the entire
> logic of rds_release does not execute on another cpu, and free the rs,
> before we hit line 64 with the bad rs?

This is a different problem. The RCU grace period should be just
extended:

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 3ab55784b637..fa7592d0760c 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -76,11 +76,13 @@ struct rds_sock *rds_find_bound(const struct
in6_addr *addr, __be16 port,
struct rds_sock *rs;

__rds_create_bind_key(key, addr, port, scope_id);
-   rs = rhashtable_lookup_fast(_hash_table, key, ht_parms);
+   rcu_read_lock();
+   rs = rhashtable_lookup(_hash_table, key, ht_parms);
if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
rds_sock_addref(rs);
else
rs = NULL;
+   rcu_read_unlock();

rdsdebug("returning rs %p for %pI6c:%u\n", rs, addr,
 ntohs(port));

>
> Normally synchronize_rcu() or synchronize_net() in rds_release() would
> ensure this. How do we ensure this with SOCK_RCU_FREE (or is the
> intention to just reduce *some* of the syzbot failures)?

Although sock release path is not a very hot path, but blocking
it isn't a good idea either, especially when you can use call_rcu(),
which has the same effect.

I don't see any reason we should prefer synchronize_rcu() here.


Re: [PATCH v2 net-next 05/12] net: ethernet: genet: Fix speed selection

2018-09-10 Thread Florian Fainelli
On 09/10/2018 02:52 PM, Andrew Lunn wrote:
> The phy supported speed is being used to determine if the MAC should
> be configured to 100 or 1G. The masking logic is broken. Instead, look
> 1G supported speeds to enable 1G MAC support.
> 
> Signed-off-by: Andrew Lunn 

Acked-by: Florian Fainelli 

Let me get the proper fix submitted at some point ;)
-- 
Florian


Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Sowmini Varadhan
On (09/10/18 15:43), Santosh Shilimkar wrote:
> On 9/10/2018 3:24 PM, Cong Wang wrote:
> >When a rds sock is bound, it is inserted into the bind_hash_table
> >which is protected by RCU. But when releasing rd sock, after it
> >is removed from this hash table, it is freed immediately without
> >respecting RCU grace period. This could cause some use-after-free
> >as reported by syzbot.
> >
> Indeed.
> 
> >Mark the rds sock as SOCK_RCU_FREE before inserting it into the
> >bind_hash_table, so that it would be always freed after a RCU grace
> >period.

So I'm not sure I understand. 

Yes, Cong's fix may eliminate *some* of the syzbot failures, but the
basic problem is not solved.

To take one example of possible races (one that was discussed in
  https://www.spinics.net/lists/netdev/msg475074.html)
rds_recv_incoming->rds_find_bound is being called in rds_send_worker
context and the rds_find_bound code is

 63 rs = rhashtable_lookup_fast(_hash_table, , ht_parms);
 64 if (rs && !sock_flag(rds_rs_to_sk(rs), SOCK_DEAD))
 65 rds_sock_addref(rs);
 66 else 
 67 rs = NULL;
 68 

After we find an rs at line 63, how can we be sure that the entire
logic of rds_release does not execute on another cpu, and free the rs,
before we hit line 64 with the bad rs?

Normally synchronize_rcu() or synchronize_net() in rds_release() would 
ensure this. How do we ensure this with SOCK_RCU_FREE (or is the
intention to just reduce *some* of the syzbot failures)?

--Sowmini




Re: [PATCH v2 net-next 11/12] net: ethernet: Add helper for set_pauseparam for Pause

2018-09-10 Thread Florian Fainelli
On 09/10/2018 02:52 PM, Andrew Lunn wrote:
> ethtool can be used to enable/disable pause. Add a helper to configure
> the PHY when Pause is supported.
> 
> Signed-off-by: Andrew Lunn 

Reviewed-by: Florian Fainelli 
-- 
Florian


Re: [PATCH v2 net-next 10/12] net: ethernet: Add helper for set_pauseparam for Asym Pause

2018-09-10 Thread Florian Fainelli
On 09/10/2018 02:52 PM, Andrew Lunn wrote:
> ethtool can be used to enable/disable pause. Add a helper to configure
> the PHY when asym pause is supported.
> 
> Signed-off-by: Andrew Lunn 

Reviewed-by: Florian Fainelli 
-- 
Florian


Re: [PATCH v2 net-next 01/12] net: phy: ste10Xp: Remove wrong SUPPORTED_Pause

2018-09-10 Thread Florian Fainelli
On 09/10/2018 02:52 PM, Andrew Lunn wrote:
> The PHY driver should not indicate that Pause is supported. It is upto
> the MAC drive enable it, if it supports Pause frames. So remove it
> from the ste10Xp driver.
> 
> Signed-off-by: Andrew Lunn 

Reviewed-by: Florian Fainelli 
-- 
Florian


Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Santosh Shilimkar

On 9/10/2018 3:24 PM, Cong Wang wrote:

When a rds sock is bound, it is inserted into the bind_hash_table
which is protected by RCU. But when releasing rd sock, after it
is removed from this hash table, it is freed immediately without
respecting RCU grace period. This could cause some use-after-free
as reported by syzbot.


Indeed.


Mark the rds sock as SOCK_RCU_FREE before inserting it into the
bind_hash_table, so that it would be always freed after a RCU grace
period.

Reported-and-tested-by: syzbot+8967084bcac563795...@syzkaller.appspotmail.com
Cc: Sowmini Varadhan 
Cc: Santosh Shilimkar 
Cc: rds-de...@oss.oracle.com
Signed-off-by: Cong Wang 
---
  net/rds/bind.c | 1 +
  1 file changed, 1 insertion(+)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 3ab55784b637..2281b34415b9 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -235,6 +235,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
goto out;
}
  
+	sock_set_flag(sk, SOCK_RCU_FREE);

ret = rds_add_bound(rs, binding_addr, , scope_id);
if (ret)
goto out;

I wasn't aware of this "SOCK_RCU_FREE" so really thanks for this patch. 
Have been scratching my head over this for a while thinking about

generic provision at sk level to synchronize. This is much
better than adding the sync at upper layer.

It does have the tax for slowing down RDS for other kernel components
rcu sync but anyway this hole needs to be plugged so am fine to go
ahead with this change. Thanks for the patch.

FWIW,
Acked-by: Santosh Shilimkar 


Re: [Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Sowmini Varadhan
On (09/10/18 15:24), Cong Wang wrote:
> 
> When a rds sock is bound, it is inserted into the bind_hash_table
> which is protected by RCU. But when releasing rd sock, after it
> is removed from this hash table, it is freed immediately without
> respecting RCU grace period. This could cause some use-after-free
> as reported by syzbot.
> 

I have no objection to the change itself, but the syzbot failures
are caused for a very simple reason: we need synchronize_net()
in rds_release before we remove the rds_sock from the bind_hash_table.

I already pointed this out in 
  https://www.spinics.net/lists/netdev/msg475074.html

I think the objection to synchronize_net() is that it can cause
perf issues (I'm told that rds_release() has been known to be held
up by other threads in rcu critical sections?) but I personally
dont see any other alternative to this (other than going back
to rwlock, instead of rcu)

--Sowmini



[Patch net] rds: mark bound socket with SOCK_RCU_FREE

2018-09-10 Thread Cong Wang
When a rds sock is bound, it is inserted into the bind_hash_table
which is protected by RCU. But when releasing rd sock, after it
is removed from this hash table, it is freed immediately without
respecting RCU grace period. This could cause some use-after-free
as reported by syzbot.

Mark the rds sock as SOCK_RCU_FREE before inserting it into the
bind_hash_table, so that it would be always freed after a RCU grace
period.

Reported-and-tested-by: syzbot+8967084bcac563795...@syzkaller.appspotmail.com
Cc: Sowmini Varadhan 
Cc: Santosh Shilimkar 
Cc: rds-de...@oss.oracle.com
Signed-off-by: Cong Wang 
---
 net/rds/bind.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/net/rds/bind.c b/net/rds/bind.c
index 3ab55784b637..2281b34415b9 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -235,6 +235,7 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
goto out;
}
 
+   sock_set_flag(sk, SOCK_RCU_FREE);
ret = rds_add_bound(rs, binding_addr, , scope_id);
if (ret)
goto out;
-- 
2.14.4



Re: [PATCH v3 net-next 5/6] dt-bindings: net: dsa: Add lantiq,xrx200-gswip DT bindings

2018-09-10 Thread Andrew Lunn
> > +See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of
> > +additional required and optional properties.
> > +
> > +

snip

> > +   #address-cells = <1>;
> > +   #size-cells = <0>;
> > +   compatible = "lantiq,xrx200-gswip";
> > +   reg = < 0xE108000 0x3000 /* switch */
> > +   0xE10B100 0x70 /* mdio */
> > +   0xE10B1D8 0x30 /* mii */
> > +   >;
> > +   dsa,member = <0 0>;
> 
> Not documented.

Hi Rob

It is documented in Documentation/devicetree/bindings/net/dsa/dsa.txt
referenced above.

   Andrew


Re: [PATCH v3 net-next 5/6] dt-bindings: net: dsa: Add lantiq,xrx200-gswip DT bindings

2018-09-10 Thread Rob Herring
On Sun, Sep 09, 2018 at 10:20:27PM +0200, Hauke Mehrtens wrote:
> This adds the binding for the GSWIP (Gigabit switch) core found in the
> xrx200 / VR9 Lantiq / Intel SoC.
> 
> This part takes care of the switch, MDIO bus, and loading the FW into
> the embedded GPHYs.
> 
> Signed-off-by: Hauke Mehrtens 
> Cc: devicet...@vger.kernel.org
> ---
>  .../devicetree/bindings/net/dsa/lantiq-gswip.txt   | 141 
> +
>  1 file changed, 141 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/net/dsa/lantiq-gswip.txt
> 
> diff --git a/Documentation/devicetree/bindings/net/dsa/lantiq-gswip.txt 
> b/Documentation/devicetree/bindings/net/dsa/lantiq-gswip.txt
> new file mode 100644
> index ..a089f5856778
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/dsa/lantiq-gswip.txt
> @@ -0,0 +1,141 @@
> +Lantiq GSWIP Ethernet switches
> +==
> +
> +Required properties for GSWIP core:
> +
> +- compatible : "lantiq,xrx200-gswip" for the embedded GSWIP in the
> +   xRX200 SoC
> +- reg: memory range of the GSWIP core registers
> + : memory range of the GSWIP MDIO registers
> + : memory range of the GSWIP MII registers
> +
> +See Documentation/devicetree/bindings/net/dsa/dsa.txt for a list of
> +additional required and optional properties.
> +
> +
> +Required properties for MDIO bus:
> +- compatible : "lantiq,xrx200-mdio" for the MDIO bus inside the GSWIP
> +   core of the xRX200 SoC and the PHYs connected to it.
> +
> +See Documentation/devicetree/bindings/net/mdio.txt for a list of additional
> +required and optional properties.
> +
> +
> +Required properties for GPHY firmware loading:
> +- compatible : "lantiq,gphy-fw" and "lantiq,xrx200-gphy-fw",
> +   "lantiq,xrx200a1x-gphy-fw", "lantiq,xrx200a2x-gphy-fw",
> +   "lantiq,xrx300-gphy-fw", or "lantiq,xrx330-gphy-fw"
> +   for the loading of the firmware into the embedded
> +   GPHY core of the SoC.

One valid combination of compatibles per line please.

> +- lantiq,rcu : reference to the rcu syscon
> +
> +The GPHY firmware loader has a list of GPHY entries, one for each
> +embedded GPHY
> +
> +- reg: Offset of the GPHY firmware register in the RCU
> +   register range

This use of reg is strange. This node should probably be a child of 
the RCU.

> +- resets : list of resets of the embedded GPHY
> +- reset-names: list of names of the resets
> +
> +Example:
> +
> +Ethernet switch on the VRX200 SoC:
> +
> +gswip: gswip@E108000 {

switch@... or ethernet-switch@...

We need a standard name here and add it to the DT spec.

> + #address-cells = <1>;
> + #size-cells = <0>;
> + compatible = "lantiq,xrx200-gswip";
> + reg = < 0xE108000 0x3000 /* switch */
> + 0xE10B100 0x70 /* mdio */
> + 0xE10B1D8 0x30 /* mii */
> + >;
> + dsa,member = <0 0>;

Not documented.

> +
> + ports {
> + #address-cells = <1>;
> + #size-cells = <0>;
> +
> + port@0 {
> + reg = <0>;
> + label = "lan3";
> + phy-mode = "rgmii";
> + phy-handle = <>;
> + };
> +
> + port@1 {
> + reg = <1>;
> + label = "lan4";
> + phy-mode = "rgmii";
> + phy-handle = <>;
> + };
> +
> + port@2 {
> + reg = <2>;
> + label = "lan2";
> + phy-mode = "internal";
> + phy-handle = <>;
> + };
> +
> + port@4 {
> + reg = <4>;
> + label = "lan1";
> + phy-mode = "internal";
> + phy-handle = <>;
> + };
> +
> + port@5 {
> + reg = <5>;
> + label = "wan";
> + phy-mode = "rgmii";
> + phy-handle = <>;
> + };
> +
> + port@6 {
> + reg = <0x6>;
> + label = "cpu";
> + ethernet = <>;
> + };
> + };
> +
> + mdio@0 {

What's the address 0 here?

> + #address-cells = <1>;
> + #size-cells = <0>;
> + compatible = "lantiq,xrx200-mdio";
> + reg = <0>;
> +
> + phy0: ethernet-phy@0 {
> + reg = <0x0>;
> + };
> + phy1: ethernet-phy@1 {
> + reg = <0x1>;
> + };
> + phy5: ethernet-phy@5 {
> + reg = <0x5>;
> + };
> + phy11: ethernet-phy@11 {
> + reg = <0x11>;
> + };
> + phy13: ethernet-phy@13 {
> + reg = 

[PATCH v2 net-next 12/12] net: ethernet: Add helper to determine if pause configuration is supported

2018-09-10 Thread Andrew Lunn
Rather than have MAC drivers open code the test, add a helper in
phylib. This will help when we change the type of phydev->supported.

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 .../ethernet/apm/xgene/xgene_enet_ethtool.c   |  4 +---
 drivers/net/ethernet/broadcom/tg3.c   |  4 +---
 .../ethernet/freescale/dpaa/dpaa_ethtool.c|  4 +---
 .../net/ethernet/freescale/gianfar_ethtool.c  |  4 +---
 drivers/net/phy/phy_device.c  | 20 +++
 include/linux/phy.h   |  2 ++
 6 files changed, 26 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
index dfe03afd00b0..78dd09b5beeb 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
@@ -312,9 +312,7 @@ static int xgene_set_pauseparam(struct net_device *ndev,
if (!phydev)
return -EINVAL;
 
-   if (!(phydev->supported & SUPPORTED_Pause) ||
-   (!(phydev->supported & SUPPORTED_Asym_Pause) &&
-pp->rx_pause != pp->tx_pause))
+   if (!phy_validate_pause(phydev, pp))
return -EINVAL;
 
pdata->pause_autoneg = pp->autoneg;
diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index b2a3d008e1df..fb0e458e25b7 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -12496,9 +12496,7 @@ static int tg3_set_pauseparam(struct net_device *dev, 
struct ethtool_pauseparam
 
phydev = mdiobus_get_phy(tp->mdio_bus, tp->phy_addr);
 
-   if (!(phydev->supported & SUPPORTED_Pause) ||
-   (!(phydev->supported & SUPPORTED_Asym_Pause) &&
-(epause->rx_pause != epause->tx_pause)))
+   if (!phy_validate_pause(phydev, epause))
return -EINVAL;
 
tp->link_config.flowctrl = 0;
diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c 
b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
index 1f8cdbc4378c..5d0fdf667b82 100644
--- a/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
+++ b/drivers/net/ethernet/freescale/dpaa/dpaa_ethtool.c
@@ -194,9 +194,7 @@ static int dpaa_set_pauseparam(struct net_device *net_dev,
return -ENODEV;
}
 
-   if (!(phydev->supported & SUPPORTED_Pause) ||
-   (!(phydev->supported & SUPPORTED_Asym_Pause) &&
-   (epause->rx_pause != epause->tx_pause)))
+   if (!phy_validate_pause(phydev, epause))
return -EINVAL;
 
/* The MAC should know how to handle PAUSE frame autonegotiation before
diff --git a/drivers/net/ethernet/freescale/gianfar_ethtool.c 
b/drivers/net/ethernet/freescale/gianfar_ethtool.c
index 3545e8f715f2..d3662965f59d 100644
--- a/drivers/net/ethernet/freescale/gianfar_ethtool.c
+++ b/drivers/net/ethernet/freescale/gianfar_ethtool.c
@@ -507,9 +507,7 @@ static int gfar_spauseparam(struct net_device *dev,
if (!phydev)
return -ENODEV;
 
-   if (!(phydev->supported & SUPPORTED_Pause) ||
-   (!(phydev->supported & SUPPORTED_Asym_Pause) &&
-(epause->rx_pause != epause->tx_pause)))
+   if (!phy_validate_pause(phydev, epause))
return -EINVAL;
 
priv->rx_pause_en = priv->tx_pause_en = 0;
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 5693013afe5e..9aa518a957f1 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1862,6 +1862,26 @@ void phy_set_asym_pause(struct phy_device *phydev, bool 
rx, bool tx)
 }
 EXPORT_SYMBOL(phy_set_asym_pause);
 
+/**
+ * phy_validate_pause - Test if the PHY/MAC support the pause configuration
+ * @phydev: phy_device struct
+ * @pp: requested pause configuration
+ *
+ * Description: Test if the PHY/MAC combination supports the Pause
+ * configuration the user is requesting. Returns True if it is
+ * supported, false otherwise.
+ */
+bool phy_validate_pause(struct phy_device *phydev,
+   struct ethtool_pauseparam *pp)
+{
+   if (!(phydev->supported & SUPPORTED_Pause) ||
+   (!(phydev->supported & SUPPORTED_Asym_Pause) &&
+pp->rx_pause != pp->tx_pause))
+   return false;
+   return true;
+}
+EXPORT_SYMBOL(phy_validate_pause);
+
 static void of_set_phy_supported(struct phy_device *phydev)
 {
struct device_node *node = phydev->mdio.dev.of_node;
diff --git a/include/linux/phy.h b/include/linux/phy.h
index 8521391ebb20..192a1fa0c73b 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1055,6 +1055,8 @@ void phy_support_asym_pause(struct phy_device *phydev);
 void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
   bool autoneg);
 void phy_set_asym_pause(struct 

[PATCH v2 net-next 10/12] net: ethernet: Add helper for set_pauseparam for Asym Pause

2018-09-10 Thread Andrew Lunn
ethtool can be used to enable/disable pause. Add a helper to configure
the PHY when asym pause is supported.

Signed-off-by: Andrew Lunn 
---
v2: Also trigger autoneg if the advertising settings have changed.
---
 .../ethernet/apm/xgene/xgene_enet_ethtool.c   | 26 ++
 drivers/net/ethernet/aurora/nb8800.c  |  9 +---
 drivers/net/ethernet/broadcom/tg3.c   | 43 +---
 drivers/net/ethernet/faraday/ftgmac100.c  | 17 ++-
 .../ethernet/freescale/dpaa/dpaa_ethtool.c| 23 +
 .../net/ethernet/freescale/gianfar_ethtool.c  | 49 ++-
 .../hisilicon/hns3/hns3pf/hclge_main.c|  8 +--
 drivers/net/ethernet/socionext/sni_ave.c  | 11 +
 drivers/net/phy/phy_device.c  | 30 
 include/linux/phy.h   |  1 +
 10 files changed, 69 insertions(+), 148 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
index 4f50f11718f4..dfe03afd00b0 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_ethtool.c
@@ -306,7 +306,6 @@ static int xgene_set_pauseparam(struct net_device *ndev,
 {
struct xgene_enet_pdata *pdata = netdev_priv(ndev);
struct phy_device *phydev = ndev->phydev;
-   u32 oldadv, newadv;
 
if (phy_interface_mode_is_rgmii(pdata->phy_mode) ||
pdata->phy_mode == PHY_INTERFACE_MODE_SGMII) {
@@ -322,29 +321,12 @@ static int xgene_set_pauseparam(struct net_device *ndev,
pdata->tx_pause = pp->tx_pause;
pdata->rx_pause = pp->rx_pause;
 
-   oldadv = phydev->advertising;
-   newadv = oldadv & ~(ADVERTISED_Pause | ADVERTISED_Asym_Pause);
+   phy_set_asym_pause(phydev, pp->rx_pause,  pp->tx_pause);
 
-   if (pp->rx_pause)
-   newadv |= ADVERTISED_Pause | ADVERTISED_Asym_Pause;
-
-   if (pp->tx_pause)
-   newadv ^= ADVERTISED_Asym_Pause;
-
-   if (oldadv ^ newadv) {
-   phydev->advertising = newadv;
-
-   if (phydev->autoneg)
-   return phy_start_aneg(phydev);
-
-   if (!pp->autoneg) {
-   pdata->mac_ops->flowctl_tx(pdata,
-  pdata->tx_pause);
-   pdata->mac_ops->flowctl_rx(pdata,
-  pdata->rx_pause);
-   }
+   if (!pp->autoneg) {
+   pdata->mac_ops->flowctl_tx(pdata, pdata->tx_pause);
+   pdata->mac_ops->flowctl_rx(pdata, pdata->rx_pause);
}
-
} else {
if (pp->autoneg)
return -EINVAL;
diff --git a/drivers/net/ethernet/aurora/nb8800.c 
b/drivers/net/ethernet/aurora/nb8800.c
index c8d1f8fa4713..6f56276015a4 100644
--- a/drivers/net/ethernet/aurora/nb8800.c
+++ b/drivers/net/ethernet/aurora/nb8800.c
@@ -935,18 +935,11 @@ static void nb8800_pause_adv(struct net_device *dev)
 {
struct nb8800_priv *priv = netdev_priv(dev);
struct phy_device *phydev = dev->phydev;
-   u32 adv = 0;
 
if (!phydev)
return;
 
-   if (priv->pause_rx)
-   adv |= ADVERTISED_Pause | ADVERTISED_Asym_Pause;
-   if (priv->pause_tx)
-   adv ^= ADVERTISED_Asym_Pause;
-
-   phydev->supported |= adv;
-   phydev->advertising |= adv;
+   phy_set_asym_pause(phydev, priv->pause_rx, priv->pause_tx);
 }
 
 static int nb8800_open(struct net_device *dev)
diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index 193e990fac7a..b2a3d008e1df 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -12492,7 +12492,6 @@ static int tg3_set_pauseparam(struct net_device *dev, 
struct ethtool_pauseparam
tg3_warn_mgmt_link_flap(tp);
 
if (tg3_flag(tp, USE_PHYLIB)) {
-   u32 newadv;
struct phy_device *phydev;
 
phydev = mdiobus_get_phy(tp->mdio_bus, tp->phy_addr);
@@ -12503,20 +12502,16 @@ static int tg3_set_pauseparam(struct net_device *dev, 
struct ethtool_pauseparam
return -EINVAL;
 
tp->link_config.flowctrl = 0;
+   phy_set_asym_pause(phydev, epause->rx_pause, epause->tx_pause);
if (epause->rx_pause) {
tp->link_config.flowctrl |= FLOW_CTRL_RX;
 
if (epause->tx_pause) {
tp->link_config.flowctrl |= FLOW_CTRL_TX;
-   newadv = ADVERTISED_Pause;
-   } else
-   newadv = ADVERTISED_Pause |
-  

[PATCH v2 net-next 11/12] net: ethernet: Add helper for set_pauseparam for Pause

2018-09-10 Thread Andrew Lunn
ethtool can be used to enable/disable pause. Add a helper to configure
the PHY when Pause is supported.

Signed-off-by: Andrew Lunn 
---
v2:
  Rename phy_set_pause() to phy_set_sym_pause()
  Use the bcm63xx_enet.c logic, not fec_main.c
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c |  7 ++-
 drivers/net/ethernet/freescale/fec_main.c|  9 ++--
 drivers/net/phy/phy_device.c | 22 
 include/linux/phy.h  |  2 ++
 4 files changed, 28 insertions(+), 12 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 9f25667c38e6..02e7dfc1a2ef 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -892,11 +892,8 @@ static int bcm_enet_open(struct net_device *dev)
/* mask with MAC supported features */
phy_support_sym_pause(phydev);
phy_set_max_speed(phydev, SPEED_100);
-
-   if (priv->pause_auto && priv->pause_rx && priv->pause_tx)
-   phydev->advertising |= SUPPORTED_Pause;
-   else
-   phydev->advertising &= ~SUPPORTED_Pause;
+   phy_set_sym_pause(phydev, priv->pause_rx, priv->pause_rx,
+ priv->pause_auto);
 
phy_attached_info(phydev);
 
diff --git a/drivers/net/ethernet/freescale/fec_main.c 
b/drivers/net/ethernet/freescale/fec_main.c
index 05ce0903391a..2e0bb90131b6 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -2229,13 +2229,8 @@ static int fec_enet_set_pauseparam(struct net_device 
*ndev,
fep->pause_flag |= pause->rx_pause ? FEC_PAUSE_FLAG_ENABLE : 0;
fep->pause_flag |= pause->autoneg ? FEC_PAUSE_FLAG_AUTONEG : 0;
 
-   if (pause->rx_pause || pause->autoneg) {
-   ndev->phydev->supported |= ADVERTISED_Pause;
-   ndev->phydev->advertising |= ADVERTISED_Pause;
-   } else {
-   ndev->phydev->supported &= ~ADVERTISED_Pause;
-   ndev->phydev->advertising &= ~ADVERTISED_Pause;
-   }
+   phy_set_sym_pause(ndev->phydev, pause->rx_pause, pause->tx_pause,
+ pause->autoneg);
 
if (pause->autoneg) {
if (netif_running(ndev))
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index 66173e148768..5693013afe5e 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1810,6 +1810,28 @@ void phy_support_asym_pause(struct phy_device *phydev)
 }
 EXPORT_SYMBOL(phy_support_asym_pause);
 
+/**
+ * phy_set_sym_pause - Configure symmetric Pause
+ * @phydev: target phy_device struct
+ * @rx: Receiver Pause is supported
+ * @autoneg: Auto neg should be used
+ *
+ * Description: Configure advertised Pause support depending on if
+ * receiver pause and pause auto neg is supported. Generally called
+ * from the set_pauseparam .ndo.
+ */
+void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
+  bool autoneg)
+{
+   phydev->supported &= ~SUPPORTED_Pause;
+
+   if (rx && tx && autoneg)
+   phydev->supported |= SUPPORTED_Pause;
+
+   phydev->advertising = phydev->supported;
+}
+EXPORT_SYMBOL(phy_set_sym_pause);
+
 /**
  * phy_set_asym_pause - Configure Pause and Asym Pause
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e4062ba7472f..8521391ebb20 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1052,6 +1052,8 @@ int phy_set_max_speed(struct phy_device *phydev, u32 
max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
 void phy_support_sym_pause(struct phy_device *phydev);
 void phy_support_asym_pause(struct phy_device *phydev);
+void phy_set_sym_pause(struct phy_device *phydev, bool rx, bool tx,
+  bool autoneg);
 void phy_set_asym_pause(struct phy_device *phydev, bool rx, bool tx);
 
 int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
-- 
2.19.0.rc1



[PATCH v2 net-next 02/12] net: phy: et1011c: Remove incorrect missing 1000 Half

2018-09-10 Thread Andrew Lunn
The driver indicates it can do 10/100 full and half duplex, plus 1G
Full. The datasheet indicates 1G half is also supported. So make use
of the standard PHY_GBIT_FEATURES.

It could be, this was added because there is a MAC which does not
support 1G half. Bit this is the wrong place to enforce this.

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 drivers/net/phy/et1011c.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/et1011c.c b/drivers/net/phy/et1011c.c
index a9a4edfa23c8..ab541c9c56fb 100644
--- a/drivers/net/phy/et1011c.c
+++ b/drivers/net/phy/et1011c.c
@@ -91,7 +91,7 @@ static struct phy_driver et1011c_driver[] = { {
.phy_id = 0x0282f014,
.name   = "ET1011C",
.phy_id_mask= 0xfff0,
-   .features   = (PHY_BASIC_FEATURES | SUPPORTED_1000baseT_Full),
+   .features   = PHY_GBIT_FEATURES,
.flags  = PHY_POLL,
.config_aneg= et1011c_config_aneg,
.read_status= et1011c_read_status,
-- 
2.19.0.rc1



[PATCH v2 net-next 08/12] net: ethernet: Add helper for MACs which support asym pause

2018-09-10 Thread Andrew Lunn
Rather than have the MAC drivers manipulate phydev members to indicate
they support Asym Pause, add a helper function.

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
v2: Fixup bad indentation in tg3.c
---
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c |  4 ++--
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c  |  4 +---
 drivers/net/ethernet/broadcom/sb1250-mac.c  |  5 +
 drivers/net/ethernet/broadcom/tg3.c |  8 ++--
 drivers/net/ethernet/cortina/gemini.c   |  3 +--
 drivers/net/ethernet/dnet.c |  4 +---
 drivers/net/ethernet/faraday/ftgmac100.c|  3 +--
 drivers/net/ethernet/freescale/dpaa/dpaa_eth.c  |  3 +--
 drivers/net/ethernet/freescale/gianfar.c|  4 ++--
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c |  4 +---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c |  6 +-
 drivers/net/ethernet/microchip/lan743x_main.c   |  5 +
 drivers/net/ethernet/smsc/smsc911x.c|  3 +--
 drivers/net/ethernet/smsc/smsc9420.c|  3 +--
 drivers/net/ethernet/socionext/sni_ave.c|  3 ++-
 drivers/net/phy/phy_device.c| 13 +
 include/linux/phy.h |  1 +
 17 files changed, 33 insertions(+), 43 deletions(-)

diff --git a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c 
b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
index 3ceb4f95ca7c..289129011b9f 100644
--- a/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
+++ b/drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c
@@ -879,8 +879,8 @@ static bool xgbe_phy_finisar_phy_quirks(struct 
xgbe_prv_data *pdata)
phy_write(phy_data->phydev, 0x00, 0x9140);
 
phy_data->phydev->supported = PHY_GBIT_FEATURES;
-   phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
phy_data->phydev->advertising = phy_data->phydev->supported;
+   phy_support_asym_pause(phy_data->phydev);
 
netif_dbg(pdata, drv, pdata->netdev,
  "Finisar PHY quirk in place\n");
@@ -951,8 +951,8 @@ static bool xgbe_phy_belfuse_phy_quirks(struct 
xgbe_prv_data *pdata)
phy_write(phy_data->phydev, 0x00, reg & ~0x00800);
 
phy_data->phydev->supported = PHY_GBIT_FEATURES;
-   phy_data->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
phy_data->phydev->advertising = phy_data->phydev->supported;
+   phy_support_asym_pause(phy_data->phydev);
 
netif_dbg(pdata, drv, pdata->netdev,
  "BelFuse PHY quirk in place\n");
diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 4831f9de5945..e3560311711a 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -898,9 +898,7 @@ int xgene_enet_phy_connect(struct net_device *ndev)
phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_10baseT_Half_BIT);
phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_100baseT_Half_BIT);
phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
-   phy_dev->supported |= SUPPORTED_Pause |
- SUPPORTED_Asym_Pause;
-   phy_dev->advertising = phy_dev->supported;
+   phy_support_asym_pause(phy_dev);
 
return 0;
 }
diff --git a/drivers/net/ethernet/broadcom/sb1250-mac.c 
b/drivers/net/ethernet/broadcom/sb1250-mac.c
index 4ce4b097ec05..53acbbb36637 100644
--- a/drivers/net/ethernet/broadcom/sb1250-mac.c
+++ b/drivers/net/ethernet/broadcom/sb1250-mac.c
@@ -2358,13 +2358,10 @@ static int sbmac_mii_probe(struct net_device *dev)
 
/* Remove any features not supported by the controller */
phy_set_max_speed(phy_dev, SPEED_1000);
-   phy_dev->supported |= SUPPORTED_Pause |
- SUPPORTED_Asym_Pause;
+   phy_support_asym_pause(phy_dev);
 
phy_attached_info(phy_dev);
 
-   phy_dev->advertising = phy_dev->supported;
-
sc->phy_dev = phy_dev;
 
return 0;
diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index eab00239a47a..193e990fac7a 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -2123,15 +2123,13 @@ static int tg3_phy_init(struct tg3 *tp)
case PHY_INTERFACE_MODE_RGMII:
if (!(tp->phy_flags & TG3_PHYFLG_10_100_ONLY)) {
phy_set_max_speed(phydev, SPEED_1000);
-   phydev->supported |= (SUPPORTED_Pause |
- SUPPORTED_Asym_Pause);
+   phy_support_asym_pause(phydev);
break;
}
/* fallthru */
case PHY_INTERFACE_MODE_MII:
phy_set_max_speed(phydev, SPEED_100);
-   phydev->supported |= (SUPPORTED_Pause |
- 

[PATCH v2 net-next 01/12] net: phy: ste10Xp: Remove wrong SUPPORTED_Pause

2018-09-10 Thread Andrew Lunn
The PHY driver should not indicate that Pause is supported. It is upto
the MAC drive enable it, if it supports Pause frames. So remove it
from the ste10Xp driver.

Signed-off-by: Andrew Lunn 
---
 drivers/net/phy/ste10Xp.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/net/phy/ste10Xp.c b/drivers/net/phy/ste10Xp.c
index fbd548a1ad84..2fe9a87b55b5 100644
--- a/drivers/net/phy/ste10Xp.c
+++ b/drivers/net/phy/ste10Xp.c
@@ -86,7 +86,7 @@ static struct phy_driver ste10xp_pdriver[] = {
.phy_id = STE101P_PHY_ID,
.phy_id_mask = 0xfff0,
.name = "STe101p",
-   .features = PHY_BASIC_FEATURES | SUPPORTED_Pause,
+   .features = PHY_BASIC_FEATURES,
.flags = PHY_HAS_INTERRUPT,
.config_init = ste10Xp_config_init,
.ack_interrupt = ste10Xp_ack_interrupt,
@@ -97,7 +97,7 @@ static struct phy_driver ste10xp_pdriver[] = {
.phy_id = STE100P_PHY_ID,
.phy_id_mask = 0x,
.name = "STe100p",
-   .features = PHY_BASIC_FEATURES | SUPPORTED_Pause,
+   .features = PHY_BASIC_FEATURES,
.flags = PHY_HAS_INTERRUPT,
.config_init = ste10Xp_config_init,
.ack_interrupt = ste10Xp_ack_interrupt,
-- 
2.19.0.rc1



[PATCH v2 net-next 09/12] net: ethernet: Add helper for MACs which support pause

2018-09-10 Thread Andrew Lunn
Rather than have the MAC drivers manipulate phydev members, add a
helper function for MACs supporting Pause, but not Asym Pause.

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
v2: rename phy_support_pause() to phy_support_sym_pause()
---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c |  2 +-
 drivers/net/ethernet/freescale/fec_main.c|  4 +---
 drivers/net/phy/phy_device.c | 14 ++
 include/linux/phy.h  |  1 +
 4 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/bcm63xx_enet.c 
b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
index 2eee9459c2cf..9f25667c38e6 100644
--- a/drivers/net/ethernet/broadcom/bcm63xx_enet.c
+++ b/drivers/net/ethernet/broadcom/bcm63xx_enet.c
@@ -890,7 +890,7 @@ static int bcm_enet_open(struct net_device *dev)
}
 
/* mask with MAC supported features */
-   phydev->supported |= SUPPORTED_Pause;
+   phy_support_sym_pause(phydev);
phy_set_max_speed(phydev, SPEED_100);
 
if (priv->pause_auto && priv->pause_rx && priv->pause_tx)
diff --git a/drivers/net/ethernet/freescale/fec_main.c 
b/drivers/net/ethernet/freescale/fec_main.c
index 0c6fd77b6599..05ce0903391a 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -1950,14 +1950,12 @@ static int fec_enet_mii_probe(struct net_device *ndev)
phy_remove_link_mode(phy_dev,
 ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 #if !defined(CONFIG_M5272)
-   phy_dev->supported |= SUPPORTED_Pause;
+   phy_support_sym_pause(phy_dev);
 #endif
}
else
phy_set_max_speed(phy_dev, 100);
 
-   phy_dev->advertising = phy_dev->supported;
-
fep->link = 0;
fep->full_duplex = 0;
 
diff --git a/drivers/net/phy/phy_device.c b/drivers/net/phy/phy_device.c
index a0646a66f005..e657d5ae2ab8 100644
--- a/drivers/net/phy/phy_device.c
+++ b/drivers/net/phy/phy_device.c
@@ -1783,6 +1783,20 @@ void phy_remove_link_mode(struct phy_device *phydev, u32 
link_mode)
 }
 EXPORT_SYMBOL(phy_remove_link_mode);
 
+/**
+ * phy_support_sym_pause - Enable support of symmetrical pause
+ * @phydev: target phy_device struct
+ *
+ * Description: Called by the MAC to indicate is supports symmetrical
+ * Pause, but not asym pause.
+ */
+void phy_support_sym_pause(struct phy_device *phydev)
+{
+   phydev->supported |= SUPPORTED_Pause;
+   phydev->advertising = phydev->supported;
+}
+EXPORT_SYMBOL(phy_support_sym_pause);
+
 /**
  * phy_support_asym_pause - Enable support of asym pause
  * @phydev: target phy_device struct
diff --git a/include/linux/phy.h b/include/linux/phy.h
index e2db819807c1..bc5d6c3f1388 100644
--- a/include/linux/phy.h
+++ b/include/linux/phy.h
@@ -1050,6 +1050,7 @@ int phy_start_interrupts(struct phy_device *phydev);
 void phy_print_status(struct phy_device *phydev);
 int phy_set_max_speed(struct phy_device *phydev, u32 max_speed);
 void phy_remove_link_mode(struct phy_device *phydev, u32 link_mode);
+void phy_support_sym_pause(struct phy_device *phydev);
 void phy_support_asym_pause(struct phy_device *phydev);
 
 int phy_register_fixup(const char *bus_id, u32 phy_uid, u32 phy_uid_mask,
-- 
2.19.0.rc1



[PATCH v2 net-next 05/12] net: ethernet: genet: Fix speed selection

2018-09-10 Thread Andrew Lunn
The phy supported speed is being used to determine if the MAC should
be configured to 100 or 1G. The masking logic is broken. Instead, look
1G supported speeds to enable 1G MAC support.

Signed-off-by: Andrew Lunn 
---
 drivers/net/ethernet/broadcom/genet/bcmmii.c | 7 +++
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/genet/bcmmii.c 
b/drivers/net/ethernet/broadcom/genet/bcmmii.c
index b11d58f1bf45..b756fc79424e 100644
--- a/drivers/net/ethernet/broadcom/genet/bcmmii.c
+++ b/drivers/net/ethernet/broadcom/genet/bcmmii.c
@@ -226,11 +226,10 @@ int bcmgenet_mii_config(struct net_device *dev, bool init)
 * capabilities, use that knowledge to also configure the
 * Reverse MII interface correctly.
 */
-   if ((dev->phydev->supported & PHY_BASIC_FEATURES) ==
-   PHY_BASIC_FEATURES)
-   port_ctrl = PORT_MODE_EXT_RVMII_25;
-   else
+   if (dev->phydev->supported & PHY_1000BT_FEATURES)
port_ctrl = PORT_MODE_EXT_RVMII_50;
+   else
+   port_ctrl = PORT_MODE_EXT_RVMII_25;
bcmgenet_sys_writel(priv, port_ctrl, SYS_PORT_CTRL);
break;
 
-- 
2.19.0.rc1



[PATCH v2 net-next 04/12] net: ethernet: Use phy_set_max_speed() to limit advertised speed

2018-09-10 Thread Andrew Lunn
Many Ethernet MAC drivers want to limit the PHY to only advertise a
maximum speed of 100Mbs or 1Gbps. Rather than using a mask, make use
of the helper function phy_set_max_speed().

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 drivers/net/ethernet/8390/ax88796.c   |  4 +---
 drivers/net/ethernet/aeroflex/greth.c |  4 ++--
 drivers/net/ethernet/agere/et131x.c   | 12 ++--
 drivers/net/ethernet/allwinner/sun4i-emac.c   |  3 +--
 drivers/net/ethernet/altera/altera_tse_main.c |  5 +
 drivers/net/ethernet/amd/au1000_eth.c | 12 +---
 drivers/net/ethernet/broadcom/bcm63xx_enet.c  | 10 ++
 drivers/net/ethernet/broadcom/genet/bcmmii.c  |  2 +-
 drivers/net/ethernet/broadcom/sb1250-mac.c| 11 ++-
 drivers/net/ethernet/broadcom/tg3.c   |  8 
 drivers/net/ethernet/cadence/macb_main.c  |  4 ++--
 drivers/net/ethernet/cortina/gemini.c |  2 +-
 drivers/net/ethernet/dnet.c   |  4 ++--
 drivers/net/ethernet/ethoc.c  |  5 +
 drivers/net/ethernet/freescale/fec_main.c |  4 ++--
 drivers/net/ethernet/freescale/ucc_geth.c |  7 +--
 drivers/net/ethernet/lantiq_etop.c| 11 ++-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c   |  4 ++--
 drivers/net/ethernet/nxp/lpc_eth.c|  3 +--
 drivers/net/ethernet/rdc/r6040.c  | 12 ++--
 drivers/net/ethernet/samsung/sxgbe/sxgbe_main.c   |  4 ++--
 drivers/net/ethernet/smsc/smsc911x.c  |  5 +++--
 drivers/net/ethernet/smsc/smsc9420.c  |  5 +++--
 drivers/net/ethernet/socionext/sni_ave.c  |  6 ++
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c |  3 +--
 drivers/net/ethernet/toshiba/tc35815.c|  2 +-
 drivers/net/ethernet/xilinx/xilinx_emaclite.c |  3 +--
 drivers/staging/mt7621-eth/mdio.c |  2 +-
 28 files changed, 47 insertions(+), 110 deletions(-)

diff --git a/drivers/net/ethernet/8390/ax88796.c 
b/drivers/net/ethernet/8390/ax88796.c
index 2a0ddec1dd56..3dcc61821ed5 100644
--- a/drivers/net/ethernet/8390/ax88796.c
+++ b/drivers/net/ethernet/8390/ax88796.c
@@ -377,9 +377,7 @@ static int ax_mii_probe(struct net_device *dev)
return ret;
}
 
-   /* mask with MAC supported features */
-   phy_dev->supported &= PHY_BASIC_FEATURES;
-   phy_dev->advertising = phy_dev->supported;
+   phy_set_max_speed(phy_dev, SPEED_100);
 
netdev_info(dev, "PHY driver [%s] (mii_bus:phy_addr=%s, irq=%d)\n",
phy_dev->drv->name, phydev_name(phy_dev), phy_dev->irq);
diff --git a/drivers/net/ethernet/aeroflex/greth.c 
b/drivers/net/ethernet/aeroflex/greth.c
index 4309be3724ad..7c9348a26cbb 100644
--- a/drivers/net/ethernet/aeroflex/greth.c
+++ b/drivers/net/ethernet/aeroflex/greth.c
@@ -1279,9 +1279,9 @@ static int greth_mdio_probe(struct net_device *dev)
}
 
if (greth->gbit_mac)
-   phy->supported &= PHY_GBIT_FEATURES;
+   phy_set_max_speed(phy, SPEED_1000);
else
-   phy->supported &= PHY_BASIC_FEATURES;
+   phy_set_max_speed(phy, SPEED_100);
 
phy->advertising = phy->supported;
 
diff --git a/drivers/net/ethernet/agere/et131x.c 
b/drivers/net/ethernet/agere/et131x.c
index 48220b6c600d..ea34bcb868b5 100644
--- a/drivers/net/ethernet/agere/et131x.c
+++ b/drivers/net/ethernet/agere/et131x.c
@@ -3258,19 +3258,11 @@ static int et131x_mii_probe(struct net_device *netdev)
return PTR_ERR(phydev);
}
 
-   phydev->supported &= (SUPPORTED_10baseT_Half |
- SUPPORTED_10baseT_Full |
- SUPPORTED_100baseT_Half |
- SUPPORTED_100baseT_Full |
- SUPPORTED_Autoneg |
- SUPPORTED_MII |
- SUPPORTED_TP);
+   phy_set_max_speed(phydev, SPEED_100);
 
if (adapter->pdev->device != ET131X_PCI_DEVICE_ID_FAST)
-   phydev->supported |= SUPPORTED_1000baseT_Half |
-SUPPORTED_1000baseT_Full;
+   phy_set_max_speed(phydev, SPEED_1000);
 
-   phydev->advertising = phydev->supported;
phydev->autoneg = AUTONEG_ENABLE;
 
phy_attached_info(phydev);
diff --git a/drivers/net/ethernet/allwinner/sun4i-emac.c 
b/drivers/net/ethernet/allwinner/sun4i-emac.c
index 3143de45baaa..e1acafa82214 100644
--- a/drivers/net/ethernet/allwinner/sun4i-emac.c
+++ b/drivers/net/ethernet/allwinner/sun4i-emac.c
@@ -172,8 +172,7 @@ static int emac_mdio_probe(struct net_device *dev)
}
 
/* mask with MAC supported features */
-   phydev->supported &= PHY_BASIC_FEATURES;
-   phydev->advertising = phydev->supported;
+   phy_set_max_speed(phydev, SPEED_100);
 
db->link = 

[PATCH v2 net-next 06/12] net: ethernet: Fix up drivers masking pause support

2018-09-10 Thread Andrew Lunn
PHY drivers don't indicate they support pause. They expect MAC drivers
to enable its support if the MAC has the needed hardware. Thus MAC
drivers should not mask Pause support, but enable it.

Change a few ANDs to ORs.

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 drivers/net/ethernet/broadcom/tg3.c | 4 ++--
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c | 4 ++--
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 2 +-
 drivers/net/ethernet/smsc/smsc911x.c| 2 +-
 drivers/net/ethernet/smsc/smsc9420.c| 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/net/ethernet/broadcom/tg3.c 
b/drivers/net/ethernet/broadcom/tg3.c
index cdc32724c9d9..eab00239a47a 100644
--- a/drivers/net/ethernet/broadcom/tg3.c
+++ b/drivers/net/ethernet/broadcom/tg3.c
@@ -2123,14 +2123,14 @@ static int tg3_phy_init(struct tg3 *tp)
case PHY_INTERFACE_MODE_RGMII:
if (!(tp->phy_flags & TG3_PHYFLG_10_100_ONLY)) {
phy_set_max_speed(phydev, SPEED_1000);
-   phydev->supported &= (SUPPORTED_Pause |
+   phydev->supported |= (SUPPORTED_Pause |
  SUPPORTED_Asym_Pause);
break;
}
/* fallthru */
case PHY_INTERFACE_MODE_MII:
phy_set_max_speed(phydev, SPEED_100);
-   phydev->supported &= (SUPPORTED_Pause |
+   phydev->supported |= (SUPPORTED_Pause |
  SUPPORTED_Asym_Pause);
break;
default:
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
index 398971a062f4..05b15d254e32 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_mdio.c
@@ -10,8 +10,6 @@
 
 #define HCLGE_PHY_SUPPORTED_FEATURES   (SUPPORTED_Autoneg | \
 SUPPORTED_TP | \
-SUPPORTED_Pause | \
-SUPPORTED_Asym_Pause | \
 PHY_10BT_FEATURES | \
 PHY_100BT_FEATURES | \
 PHY_1000BT_FEATURES)
@@ -213,6 +211,8 @@ int hclge_mac_connect_phy(struct hclge_dev *hdev)
}
 
phydev->supported &= HCLGE_PHY_SUPPORTED_FEATURES;
+   phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+
phydev->advertising = phydev->supported;
 
return 0;
diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c 
b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index e93b5375504b..db231bda7c2a 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -360,7 +360,7 @@ static int mtk_phy_connect(struct net_device *dev)
SUPPORTED_Pause | SUPPORTED_Asym_Pause;
 
phy_set_max_speed(dev->phydev, SPEED_1000);
-   dev->phydev->supported &= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
+   dev->phydev->supported |= SUPPORTED_Pause | SUPPORTED_Asym_Pause;
dev->phydev->advertising = dev->phydev->supported |
ADVERTISED_Autoneg;
phy_start_aneg(dev->phydev);
diff --git a/drivers/net/ethernet/smsc/smsc911x.c 
b/drivers/net/ethernet/smsc/smsc911x.c
index f84dbd0beb8e..3e34bf53f055 100644
--- a/drivers/net/ethernet/smsc/smsc911x.c
+++ b/drivers/net/ethernet/smsc/smsc911x.c
@@ -1051,7 +1051,7 @@ static int smsc911x_mii_probe(struct net_device *dev)
phy_set_max_speed(phydev, SPEED_100);
 
/* mask with MAC supported features */
-   phydev->supported &= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
+   phydev->supported |= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
phydev->advertising = phydev->supported;
 
pdata->last_duplex = -1;
diff --git a/drivers/net/ethernet/smsc/smsc9420.c 
b/drivers/net/ethernet/smsc/smsc9420.c
index 795f60d92611..326177384544 100644
--- a/drivers/net/ethernet/smsc/smsc9420.c
+++ b/drivers/net/ethernet/smsc/smsc9420.c
@@ -1138,7 +1138,7 @@ static int smsc9420_mii_probe(struct net_device *dev)
phy_set_max_speed(phydev, SPEED_100);
 
/* mask with MAC supported features */
-   phydev->supported &= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
+   phydev->supported |= (SUPPORTED_Pause | SUPPORTED_Asym_Pause);
phydev->advertising = phydev->supported;
 
phy_attached_info(phydev);
-- 
2.19.0.rc1



[PATCH v2 net-next 00/12] Preparing for phylib limkmodes

2018-09-10 Thread Andrew Lunn
phylib currently makes us of a u32 bitmap for advertising, supported,
and link partner capabilities. For a long time, this has been
sufficient, for devices up to 1Gbps. With more MAC/PHY combinations
now supporting speeds greater than 1Gbps, we have run out of
bits. There is the need to replace this u32 with an
__ETHTOOL_DECLARE_LINK_MODE_MASK, which makes use of linux's generic
bitmaps.

This patchset does some of the work preparing for this change. A few
cleanups are applied to PHY drivers. Some MAC drivers directly access
members of phydev which are going to change type. These patches adds
some helpers and swaps MAC drivers to use them, mostly dealing with
Pause configuration.

v2:
Reviewed-by: Florian Fainelli 
Fixup bad indentation in tg3.c
Rename phy_support_pause() to phy_support_sym_pause()
Also trigger autoneg if the advertising settings have changed.
Rename phy_set_pause() to phy_set_sym_pause()
Use the bcm63xx_enet.c logic, not fec_main.c for validating pause

Andrew Lunn (12):
  net: phy: ste10Xp: Remove wrong SUPPORTED_Pause
  net: phy: et1011c: Remove incorrect missing 1000 Half
  net: phy: bcm63xx: Allow to be built with COMPILE_TEST
  net: ethernet: Use phy_set_max_speed() to limit advertised speed
  net: ethernet: genet: Fix speed selection
  net: ethernet: Fix up drivers masking pause support
  net: ethernet: Add helper to remove a supported link mode
  net: ethernet: Add helper for MACs which support asym pause
  net: ethernet: Add helper for MACs which support pause
  net: ethernet: Add helper for set_pauseparam for Asym Pause
  net: ethernet: Add helper for set_pauseparam for Pause
  net: ethernet: Add helper to determine if pause configuration is
supported

 drivers/net/ethernet/8390/ax88796.c   |   4 +-
 drivers/net/ethernet/aeroflex/greth.c |   4 +-
 drivers/net/ethernet/agere/et131x.c   |  12 +-
 drivers/net/ethernet/allwinner/sun4i-emac.c   |   3 +-
 drivers/net/ethernet/altera/altera_tse_main.c |   5 +-
 drivers/net/ethernet/amd/au1000_eth.c |  12 +-
 drivers/net/ethernet/amd/xgbe/xgbe-phy-v2.c   |   4 +-
 .../ethernet/apm/xgene/xgene_enet_ethtool.c   |  30 +
 .../net/ethernet/apm/xgene/xgene_enet_hw.c|  10 +-
 drivers/net/ethernet/aurora/nb8800.c  |   9 +-
 drivers/net/ethernet/broadcom/bcm63xx_enet.c  |  17 +--
 drivers/net/ethernet/broadcom/genet/bcmmii.c  |   9 +-
 drivers/net/ethernet/broadcom/sb1250-mac.c|  14 +--
 drivers/net/ethernet/broadcom/tg3.c   |  59 +++--
 drivers/net/ethernet/cadence/macb_main.c  |   9 +-
 drivers/net/ethernet/cortina/gemini.c |   5 +-
 drivers/net/ethernet/dnet.c   |   8 +-
 drivers/net/ethernet/ethoc.c  |   5 +-
 drivers/net/ethernet/faraday/ftgmac100.c  |  20 +--
 .../net/ethernet/freescale/dpaa/dpaa_eth.c|   3 +-
 .../ethernet/freescale/dpaa/dpaa_ethtool.c|  27 +---
 drivers/net/ethernet/freescale/fec_main.c |  20 ++-
 drivers/net/ethernet/freescale/gianfar.c  |   4 +-
 .../net/ethernet/freescale/gianfar_ethtool.c  |  53 +++-
 drivers/net/ethernet/freescale/ucc_geth.c |   7 +-
 .../hisilicon/hns3/hns3pf/hclge_main.c|   8 +-
 .../hisilicon/hns3/hns3pf/hclge_mdio.c|   4 +-
 drivers/net/ethernet/lantiq_etop.c|  11 +-
 drivers/net/ethernet/mediatek/mtk_eth_soc.c   |   8 +-
 drivers/net/ethernet/microchip/lan743x_main.c |   7 +-
 drivers/net/ethernet/nxp/lpc_eth.c|   3 +-
 drivers/net/ethernet/rdc/r6040.c  |  12 +-
 drivers/net/ethernet/renesas/ravb_main.c  |   3 +-
 .../net/ethernet/samsung/sxgbe/sxgbe_main.c   |   4 +-
 drivers/net/ethernet/smsc/smsc911x.c  |   6 +-
 drivers/net/ethernet/smsc/smsc9420.c  |   6 +-
 drivers/net/ethernet/socionext/sni_ave.c  |  20 +--
 .../net/ethernet/stmicro/stmmac/stmmac_main.c |  15 ++-
 drivers/net/ethernet/toshiba/tc35815.c|   2 +-
 drivers/net/ethernet/xilinx/xilinx_emaclite.c |   3 +-
 drivers/net/phy/Kconfig   |   2 +-
 drivers/net/phy/et1011c.c |   2 +-
 drivers/net/phy/phy_device.c  | 117 ++
 drivers/net/phy/ste10Xp.c |   4 +-
 drivers/net/usb/lan78xx.c |   2 +-
 drivers/staging/mt7621-eth/mdio.c |   2 +-
 include/linux/phy.h   |   8 ++
 47 files changed, 258 insertions(+), 344 deletions(-)

-- 
2.19.0.rc1



[PATCH v2 net-next 07/12] net: ethernet: Add helper to remove a supported link mode

2018-09-10 Thread Andrew Lunn
Some MAC hardware cannot support a subset of link modes. e.g. often
1Gbps Full duplex is supported, but Half duplex is not. Add a helper
to remove such a link mode.

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 drivers/net/ethernet/apm/xgene/xgene_enet_hw.c |  6 +++---
 drivers/net/ethernet/cadence/macb_main.c   |  5 ++---
 drivers/net/ethernet/freescale/fec_main.c  |  3 ++-
 drivers/net/ethernet/microchip/lan743x_main.c  |  2 +-
 drivers/net/ethernet/renesas/ravb_main.c   |  3 ++-
 .../net/ethernet/stmicro/stmmac/stmmac_main.c  | 12 
 drivers/net/phy/phy_device.c   | 18 ++
 drivers/net/usb/lan78xx.c  |  2 +-
 include/linux/phy.h|  1 +
 9 files changed, 38 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c 
b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
index 078a04dc1182..4831f9de5945 100644
--- a/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
+++ b/drivers/net/ethernet/apm/xgene/xgene_enet_hw.c
@@ -895,9 +895,9 @@ int xgene_enet_phy_connect(struct net_device *ndev)
}
 
pdata->phy_speed = SPEED_UNKNOWN;
-   phy_dev->supported &= ~SUPPORTED_10baseT_Half &
- ~SUPPORTED_100baseT_Half &
- ~SUPPORTED_1000baseT_Half;
+   phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_10baseT_Half_BIT);
+   phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_100baseT_Half_BIT);
+   phy_remove_link_mode(phy_dev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
phy_dev->supported |= SUPPORTED_Pause |
  SUPPORTED_Asym_Pause;
phy_dev->advertising = phy_dev->supported;
diff --git a/drivers/net/ethernet/cadence/macb_main.c 
b/drivers/net/ethernet/cadence/macb_main.c
index bd4095c3a031..96ae8c992810 100644
--- a/drivers/net/ethernet/cadence/macb_main.c
+++ b/drivers/net/ethernet/cadence/macb_main.c
@@ -549,9 +549,8 @@ static int macb_mii_probe(struct net_device *dev)
phy_set_max_speed(phydev, SPEED_100);
 
if (bp->caps & MACB_CAPS_NO_GIGABIT_HALF)
-   phydev->supported &= ~SUPPORTED_1000baseT_Half;
-
-   phydev->advertising = phydev->supported;
+   phy_remove_link_mode(phydev,
+ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 
bp->link = 0;
bp->speed = 0;
diff --git a/drivers/net/ethernet/freescale/fec_main.c 
b/drivers/net/ethernet/freescale/fec_main.c
index 5e849510c689..0c6fd77b6599 100644
--- a/drivers/net/ethernet/freescale/fec_main.c
+++ b/drivers/net/ethernet/freescale/fec_main.c
@@ -1947,7 +1947,8 @@ static int fec_enet_mii_probe(struct net_device *ndev)
/* mask with MAC supported features */
if (fep->quirks & FEC_QUIRK_HAS_GBIT) {
phy_set_max_speed(phy_dev, 1000);
-   phy_dev->supported &= ~SUPPORTED_1000baseT_Half;
+   phy_remove_link_mode(phy_dev,
+ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 #if !defined(CONFIG_M5272)
phy_dev->supported |= SUPPORTED_Pause;
 #endif
diff --git a/drivers/net/ethernet/microchip/lan743x_main.c 
b/drivers/net/ethernet/microchip/lan743x_main.c
index e7dce79ff2c9..048307959c01 100644
--- a/drivers/net/ethernet/microchip/lan743x_main.c
+++ b/drivers/net/ethernet/microchip/lan743x_main.c
@@ -1013,7 +1013,7 @@ static int lan743x_phy_open(struct lan743x_adapter 
*adapter)
goto return_error;
 
/* MAC doesn't support 1000T Half */
-   phydev->supported &= ~SUPPORTED_1000baseT_Half;
+   phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_1000baseT_Half_BIT);
 
/* support both flow controls */
phy->fc_request_control = (FLOW_CTRL_RX | FLOW_CTRL_TX);
diff --git a/drivers/net/ethernet/renesas/ravb_main.c 
b/drivers/net/ethernet/renesas/ravb_main.c
index aff5516b781e..fb2a1125780d 100644
--- a/drivers/net/ethernet/renesas/ravb_main.c
+++ b/drivers/net/ethernet/renesas/ravb_main.c
@@ -1074,7 +1074,8 @@ static int ravb_phy_init(struct net_device *ndev)
}
 
/* 10BASE is not supported */
-   phydev->supported &= ~PHY_10BT_FEATURES;
+   phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_10baseT_Half_BIT);
+   phy_remove_link_mode(phydev, ETHTOOL_LINK_MODE_10baseT_Full_BIT);
 
phy_attached_info(phydev);
 
diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 3d7aec7a050b..3715a0a4af3c 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -993,10 +993,14 @@ static int stmmac_init_phy(struct net_device *dev)
 * Half-duplex mode not supported with multiqueue
 * half-duplex can only works with single queue
 */
-   if (tx_cnt > 1)
-   phydev->supported &= ~(SUPPORTED_1000baseT_Half |
-   

[PATCH v2 net-next 03/12] net: phy: bcm63xx: Allow to be built with COMPILE_TEST

2018-09-10 Thread Andrew Lunn
There is nothing in this driver which prevents it to be compiled for
other architectures. Add COMPILE_TEST so we get better compile test
coverage.

Signed-off-by: Andrew Lunn 
Reviewed-by: Florian Fainelli 
---
 drivers/net/phy/Kconfig | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/phy/Kconfig b/drivers/net/phy/Kconfig
index 82070792edbb..3d187cd50eb0 100644
--- a/drivers/net/phy/Kconfig
+++ b/drivers/net/phy/Kconfig
@@ -240,7 +240,7 @@ config AT803X_PHY
 
 config BCM63XX_PHY
tristate "Broadcom 63xx SOCs internal PHY"
-   depends on BCM63XX
+   depends on BCM63XX || COMPILE_TEST
select BCM_NET_PHYLIB
---help---
  Currently supports the 6348 and 6358 PHYs.
-- 
2.19.0.rc1



Re: [PATCH v3 net-next 3/6] dt-bindings: net: Add lantiq,xrx200-net DT bindings

2018-09-10 Thread Rob Herring
On Sun, Sep 09, 2018 at 10:16:44PM +0200, Hauke Mehrtens wrote:
> This adds the binding for the PMAC core between the CPU and the GSWIP
> switch found on the xrx200 / VR9 Lantiq / Intel SoC.
> 
> Signed-off-by: Hauke Mehrtens 
> Cc: devicet...@vger.kernel.org
> ---
>  .../devicetree/bindings/net/lantiq,xrx200-net.txt   | 21 
> +
>  1 file changed, 21 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
> 
> diff --git a/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt 
> b/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
> new file mode 100644
> index ..8a2fe5200cdc
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
> @@ -0,0 +1,21 @@
> +Lantiq xRX200 GSWIP PMAC Ethernet driver
> +==
> +
> +Required properties:
> +
> +- compatible : "lantiq,xrx200-net" for the PMAC of the embedded
> + : GSWIP in the xXR200
> +- reg: memory range of the PMAC core inside of the GSWIP core
> +- interrupts : TX and RX DMA interrupts. Use interrupt-names "tx" for
> + : the TX interrupt and "rx" for the RX interrupt.
> +
> +Example:
> +
> +eth0: eth@E10B308 {

ethernet@e10b308

> + #address-cells = <1>;
> + #size-cells = <0>;
> + compatible = "lantiq,xrx200-net";
> + reg = <0xE10B308 0x30>;

lowercase hex please.

> + interrupts = <73>, <72>;
> + interrupt-names = "tx", "rx";
> +};
> -- 
> 2.11.0
> 


[net-next:master 89/93] drivers/scsi/cxgbi/libcxgbi.c:787:43: sparse: incorrect type in argument 1 (different address spaces)

2018-09-10 Thread kbuild test robot
tree:   https://git.kernel.org/pub/scm/linux/kernel/git/davem/net-next.git 
master
head:   11957be20ff68d7670cb979a1c3ea5482a44b370
commit: 0153167aebd0808fb90031dba07d4e696557474c [89/93] net/ipv6: Remove 
rt6i_prefsrc
reproduce:
# apt-get install sparse
git checkout 0153167aebd0808fb90031dba07d4e696557474c
make ARCH=x86_64 allmodconfig
make C=1 CF=-D__CHECK_ENDIAN__


sparse warnings: (new ones prefixed by >>)

>> drivers/scsi/cxgbi/libcxgbi.c:787:43: sparse: incorrect type in argument 1 
>> (different address spaces) @@expected struct in6_addr const *a @@got 
>> struct in6_struct in6_addr const *a @@
   drivers/scsi/cxgbi/libcxgbi.c:787:43:expected struct in6_addr const *a
   drivers/scsi/cxgbi/libcxgbi.c:787:43:got struct in6_addr [noderef] 
*
   drivers/scsi/cxgbi/libcxgbi.c:1040:21: sparse: restricted __wsum degrades to 
integer
   drivers/scsi/cxgbi/libcxgbi.c:1045:33: sparse: bad assignment (-=) to 
restricted __wsum
   drivers/scsi/cxgbi/libcxgbi.c:1049:33: sparse: invalid assignment: -=
   drivers/scsi/cxgbi/libcxgbi.c:1049:33:left side has type unsigned int
   drivers/scsi/cxgbi/libcxgbi.c:1049:33:right side has type restricted 
__wsum
   drivers/scsi/cxgbi/libcxgbi.h:396:19: sparse: invalid assignment: +=
   drivers/scsi/cxgbi/libcxgbi.h:396:19:left side has type int
   drivers/scsi/cxgbi/libcxgbi.h:396:19:right side has type restricted 
__wsum
   drivers/scsi/cxgbi/libcxgbi.c:1426:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1426:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1426:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1426:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1426:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1426:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1502:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1502:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1502:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1502:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1502:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1502:19: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1663:17: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1856:24: sparse: expression using sizeof(void)
   drivers/scsi/cxgbi/libcxgbi.c:1856:24: sparse: expression using sizeof(void)
   drivers/scsi/cxgbi/libcxgbi.c:1902:29: sparse: expression using sizeof(void)
   drivers/scsi/cxgbi/libcxgbi.c:1902:29: sparse: expression using sizeof(void)
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1931:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1964:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1964:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1964:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1964:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1964:9: sparse: cast to restricted __be32
   drivers/scsi/cxgbi/libcxgbi.c:1964:9: sparse: cast from restricted itt_t
   drivers/scsi/cxgbi/libcxgbi.c:1964:9: 

Re: [PATCH net-next 02/15] sch_netem: Move private queue handler to generic location.

2018-09-10 Thread Stephen Hemminger
On Sat, 08 Sep 2018 13:10:01 -0700 (PDT)
David Miller  wrote:

> By hand copies of SKB list handlers do not belong in individual packet
> schedulers.
> 
> Signed-off-by: David S. Miller 

Thanks for cleaning this up.

Signed-off-by: Stephen Hemminger 



ksoftirqd takes 100% of a core with ixgbe and netconsole (netpoll)

2018-09-10 Thread Song Liu


We are debugging an issue with netconsole and ixgbe, that ksoftirqd takes 100%
of a core. It happens with both current net and net-next.

To reproduce the issue:

  1. Setup server with ixgbe and netconsole. We bind each queue to a separate
 core via smp_affinity;
  2. Start simple netperf job from client, like:
./super_netperf 201 -P 0 -t TCP_RR -p  -H  -l 7200 -- -r 
300,300 -o -s 1M,1M -S 1M,1M
  3. On server, write to /dev/kmsg in a loop (to send netconsole):
for x in {1..7200} ; do echo aa >> /dev/kmsg ; sleep 1; done
  4. On server, monitor ksoftirqd in top

Within a few minutes, top will show one ksoftirqd take 100% of the core for many
seconds in a row. 

When the ksoftirqd takes 100% of a core, the driver hits "clean_complete=false"
path below, so this napi stays in polling mode. 

ixgbe_for_each_ring(ring, q_vector->rx) {
int cleaned = ixgbe_clean_rx_irq(q_vector, ring,
 per_ring_budget);

work_done += cleaned;
if (cleaned >= per_ring_budget)
clean_complete = false;
}

/* If all work not completed, return budget and keep polling */
if (!clean_complete)
return budget;

We didn't see this issue on a 4.6 based kernel.

We are still debugging the issue. But we would like to check whether there is
known solution for it. Any comments and suggestions are highly appreciated.

Best,
Song

Re: [PATCH net-next v2 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Florian Fainelli
On 09/10/2018 02:14 AM, Jose Abreu wrote:
> This follows David Miller advice and tries to fix coalesce timer in
> multi-queue scenarios.
> 
> We are now using per-queue coalesce values and per-queue TX timer.
> 
> Coalesce timer default values was changed to 1ms and the coalesce frames
> to 25.
> 
> Tested in B2B setup between XGMAC2 and GMAC5.

Why not revert the entire features for this merge window and work on
getting it to work over the next weeks/merge windows?

The idea of using a timer to coalesce TX path when there is not a HW
timer is a good idea and if this is made robust enough, you could even
promote that as being a network stack library/feature that could be used
by other drivers. In fact, this could be a great addition to the net DIM
library (Tal, what do you think?)

Here's a quick drive by review of things that appear wrong in the
current driver (without your patches):

- in stmmac_xmit(), in case we hit the !is_jumbo branch and we fail the
DMA mapping, there is no timer cancellation, don't we want to abort the
whole transmission?

- stmmac_tx_clean() should probably use netif_lock_bh() to guard against
the timer (soft IRQ context) and the the NAPI context (also soft IRQ)
running in parallel on two different CPUs. This may not explain all
problems, but these two things are fundamentally exclusive, because the
timer is meant to emulate the interrupt after N packets, while NAPI
executes when such a thing did actually occur

- stmmac_poll() should cancel pending timer(s) if it was able to reclaim
packets, likewise stmmac_tx_timer() should re-enable TX interrupts if it
reclaimed packets, since TX interrupts could have been left disabled
from a prior NAPI run. These could be considered optimizations, since
you could leave the TX timer running all the time, just adjust the
deadline (based on line rate, MTU, IPG, number of fragments and their
respective length), worst case, both NAPI and the timer clean up your TX
ring, so you should always have room to push more packets

> 
> Signed-off-by: Jose Abreu 
> Cc: Jerome Brunet 
> Cc: Martin Blumenstingl 
> Cc: David S. Miller 
> Cc: Joao Pinto 
> Cc: Giuseppe Cavallaro 
> Cc: Alexandre Torgue 
> ---
> Jerome,
> 
> Can you please test if this one is okay ?
> 
> Thanks and Best Regards,
> Jose Miguel Abreu
> ---
>  drivers/net/ethernet/stmicro/stmmac/common.h  |   4 +-
>  drivers/net/ethernet/stmicro/stmmac/stmmac.h  |   6 +-
>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 207 
> ++
>  3 files changed, 135 insertions(+), 82 deletions(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h 
> b/drivers/net/ethernet/stmicro/stmmac/common.h
> index 1854f270ad66..b1b305f8f414 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/common.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/common.h
> @@ -258,10 +258,10 @@ struct stmmac_safety_stats {
>  #define MAX_DMA_RIWT 0xff
>  #define MIN_DMA_RIWT 0x20
>  /* Tx coalesce parameters */
> -#define STMMAC_COAL_TX_TIMER 4
> +#define STMMAC_COAL_TX_TIMER 1000
>  #define STMMAC_MAX_COAL_TX_TICK  10
>  #define STMMAC_TX_MAX_FRAMES 256
> -#define STMMAC_TX_FRAMES 64
> +#define STMMAC_TX_FRAMES 25
>  
>  /* Packets types */
>  enum packets_types {
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h 
> b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> index c0a855b7ab3b..957030cfb833 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> @@ -48,6 +48,9 @@ struct stmmac_tx_info {
>  
>  /* Frequently used values are kept adjacent for cache effect */
>  struct stmmac_tx_queue {
> + u32 tx_count_frames;
> + int tx_timer_active;
> + struct timer_list txtimer;
>   u32 queue_index;
>   struct stmmac_priv *priv_data;
>   struct dma_extended_desc *dma_etx cacheline_aligned_in_smp;
> @@ -59,6 +62,7 @@ struct stmmac_tx_queue {
>   dma_addr_t dma_tx_phy;
>   u32 tx_tail_addr;
>   u32 mss;
> + struct napi_struct napi cacheline_aligned_in_smp;
>  };
>  
>  struct stmmac_rx_queue {
> @@ -109,14 +113,12 @@ struct stmmac_pps_cfg {
>  
>  struct stmmac_priv {
>   /* Frequently used values are kept adjacent for cache effect */
> - u32 tx_count_frames;
>   u32 tx_coal_frames;
>   u32 tx_coal_timer;
>  
>   int tx_coalesce;
>   int hwts_tx_en;
>   bool tx_path_in_lpi_mode;
> - struct timer_list txtimer;
>   bool tso;
>  
>   unsigned int dma_buf_sz;
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
> b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> index 9f458bb16f2a..9809c2b319fe 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
> @@ -148,6 +148,7 @@ static void stmmac_verify_args(void)
>  static void stmmac_disable_all_queues(struct stmmac_priv *priv)
>  {
>   u32 rx_queues_cnt = priv->plat->rx_queues_to_use;
> +  

Re: [iproute PATCH v2] ip-route: Fix segfault with many nexthops

2018-09-10 Thread Stephen Hemminger
On Thu,  6 Sep 2018 15:31:51 +0200
Phil Sutter  wrote:

> It was possible to crash ip-route by adding an IPv6 route with 37
> nexthop statements. A simple reproducer is:
> 
> | for i in `seq 37`; do
> | nhs="nexthop via ::$i "$nhs
> | done
> | ip -6 route add ::/64 $nhs
> 
> The related code was broken in multiple ways:
> 
> * parse_one_nh() assumed that rta points to 4kB of storage but caller
>   provided just 1kB. Fixed by passing 'len' parameter with the correct
>   value.
> 
> * Error checking of rta_addattr*() calls in parse_one_nh() and called
>   functions was completely absent, so with above fix in place output
>   flood would occur due to parser looping forever.
> 
> While being at it, increase message buffer sizes to 4k. This allows for
> at most 144 nexthops.
> 
> Signed-off-by: Phil Sutter 

Thanks for fixing this.
Shows where more test cases are needed.




Re: [PATCH iproute2 v2] tc/mqprio: Print extra info on invalid args.

2018-09-10 Thread Stephen Hemminger
On Thu,  6 Sep 2018 14:01:17 -0700
Caleb Raitto  wrote:

> From: Caleb Raitto 
> 
> Print the name of the argument that wasn't understood.
> 
> Signed-off-by: Caleb Raitto 

That is simpler, thanks. Applied


Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-10 Thread David Ahern
On 9/10/18 11:55 AM, Xin Long wrote:
> On Tue, Sep 11, 2018 at 12:13 AM David Ahern  wrote:
>>
>> On 9/9/18 12:29 AM, Xin Long wrote:
> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> index 18e00ce..e554922 100644
> --- a/net/ipv6/route.c
> +++ b/net/ipv6/route.c
> @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct 
> sk_buff *skb,
>int iif, int type, u32 portid, u32 seq,
>unsigned int flags)
>  {
> - struct rtmsg *rtm;
> + struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
> + struct rt6_info *rt6 = (struct rt6_info *)dst;
> + u32 *pmetrics, table, fib6_flags;
>   struct nlmsghdr *nlh;
> + struct rtmsg *rtm;
>   long expires = 0;
> - u32 *pmetrics;
> - u32 table;
>
>   nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
>   if (!nlh)
>   return -EMSGSIZE;
>
> + if (rt6) {
> + fib6_dst = >rt6i_dst;
> + fib6_src = >rt6i_src;
> + fib6_flags = rt6->rt6i_flags;
> + fib6_prefsrc = >rt6i_prefsrc;
> + } else {
> + fib6_dst = >fib6_dst;
> + fib6_src = >fib6_src;
> + fib6_flags = rt->fib6_flags;
> + fib6_prefsrc = >fib6_prefsrc;
> + }

 Unless I am missing something at the moment, an rt6_info can only have
 the same dst, src and prefsrc as the fib6_info on which it is based.
 Thus, only the flags is needed above. That simplifies this patch a lot.
>>> If dst, src and prefsrc in rt6_info are always the same as these in 
>>> fib6_info,
>>> why do we need them in rt6_info? we could just get it by 'from'.
>>>
>>
>> I just sent a patch removing rt6i_prefsrc. It is set with only 1 reader
>> that can be converted.
>>
>> rt6i_src is checked against the fib6_info to invalidate a dst if the src
>> has changed, so a valid rt will always have the same rt6i_src as the
>> rt->from.
>>
>> rt6i_dst is set to the dest address / 128 in cases, so it should be used
>> for rt6_info cases above.
> So that means, I will use rt6i_dst and rt6i_flags when dst is set?
> how about I use rt6i_src there as well? just to make it look clear.
> and plus the gw/nh dump fix in rt6_fill_node():
> -if (rt->fib6_nsiblings) {
> +if (rt6) {
> +if (fib6_flags & RTF_GATEWAY)
> +if (nla_put_in6_addr(skb, RTA_GATEWAY,
> + >rt6i_gateway) < 0)
> +goto nla_put_failure;
> +
> +if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
> +goto nla_put_failure;
> +} else if (rt->fib6_nsiblings) {
>  struct fib6_info *sibling, *next_sibling;
>  struct nlattr *mp;
> 
> looks good to you?
> 

sure


Fw: [Bug 201063] New: kernel panic on heavy network use

2018-09-10 Thread Stephen Hemminger



Begin forwarded message:

Date: Sun, 09 Sep 2018 13:45:28 +
From: bugzilla-dae...@bugzilla.kernel.org
To: step...@networkplumber.org
Subject: [Bug 201063] New: kernel panic on heavy network use


https://bugzilla.kernel.org/show_bug.cgi?id=201063

Bug ID: 201063
   Summary: kernel panic on heavy network use
   Product: Networking
   Version: 2.5
Kernel Version: 4.19rc2
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: Other
  Assignee: step...@networkplumber.org
  Reporter: oyvi...@everdot.org
Regression: No

Created attachment 278379
  --> https://bugzilla.kernel.org/attachment.cgi?id=278379=edit  
RIP: native_smp_send_rechedule, what did they mean by this

kernel panics, it seems to happen when there is heavy network traffic going
through that box. no nothing in logs, took picture of screen with kernel panic,
it is attached

-- 
You are receiving this mail because:
You are the assignee for the bug.


Fw: [Bug 201071] New: Creating a vxlan in state 'up' does not give proper RTM_NEWLINK message

2018-09-10 Thread Stephen Hemminger



Begin forwarded message:

Date: Mon, 10 Sep 2018 04:04:37 +
From: bugzilla-dae...@bugzilla.kernel.org
To: step...@networkplumber.org
Subject: [Bug 201071] New: Creating a vxlan in state 'up' does not give proper 
RTM_NEWLINK message


https://bugzilla.kernel.org/show_bug.cgi?id=201071

Bug ID: 201071
   Summary: Creating a vxlan in state 'up' does not give proper
RTM_NEWLINK message
   Product: Networking
   Version: 2.5
Kernel Version: 4.19-rc1
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: normal
  Priority: P1
 Component: Other
  Assignee: step...@networkplumber.org
  Reporter: liam.mcbir...@boeing.com
Regression: Yes

If a vxlan is created with state 'up', the RTM_NEWLINK message shows the state
as down, and there no other netlink messages are sent.
As a result, processes listening to netlink are never notified that the vxlan
link is up.

eg.
# ip link add test up type vxlan id 8 group 224.224.224.224 dev eth0

Output of ip monitor link
# 4: test:  mtu 1450 qdisc noop state DOWN group default
  link/ether ee:cd:97:1a:cf:91 brd ff:ff:ff:ff:ff:ff

Output of ip link show (expected from netlink message)
# 4: test:  mtu 1450 qdisc noqueue state
UNKNOWN group default qlen 1000
  link/ether ee:cd:97:1a:cf:91 brd ff:ff:ff:ff:ff:ff

This is a regression introduced by the following patch series.
https://patchwork.ozlabs.org/patch/947181/

-- 
You are receiving this mail because:
You are the assignee for the bug.


Re: [PATCH net-next v2 2/2] net: stmmac: Fixup the tail addr setting in xmit path

2018-09-10 Thread Florian Fainelli
On 09/10/2018 02:14 AM, Jose Abreu wrote:
> Currently we are always setting the tail address of descriptor list to
> the end of the pre-allocated list.
> 
> According to databook this is not correct. Tail address should point to
> the last available descriptor + 1, which means we have to update the
> tail address everytime we call the xmit function.
> 
> This should make no impact in older versions of MAC but in newer
> versions there are some DMA features which allows the IP to fetch
> descriptors in advance and in a non sequential order so its critical
> that we set the tail address correctly.


Can you include the appropriate Fixes tag here so this can easily be
backported to relevant stable branches?
-- 
Florian


Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Neil Armstrong
Hi Jose,

On 10/09/2018 18:21, Jose Abreu wrote:
> On 10-09-2018 16:49, Neil Armstrong wrote:
>> Hi Jose,
>>
>> On 10/09/2018 16:44, Jose Abreu wrote:
>>> On 10-09-2018 14:46, Neil Armstrong wrote:
 hi Jose,

 On 10/09/2018 14:55, Jose Abreu wrote:
> On 10-09-2018 13:52, Jose Abreu wrote:
>> Can you please try attached follow-up patch ? 
> Oh, please apply the whole series otherwise this will not apply
> cleanly.
 Indeed, it helps!

 With the fixups, it fails later, around 15s instead of 3, in RX and TX.
>>> Thanks for testing Neil. What if we keep rearming the timer
>>> whilst there are pending packets ? Something like in the attach.
>>> (applies on top of previous one).
>> It fixes RX, but TX fails after ~13s.
> 
> Ok :(
> 
> Can you please try attached follow-up patch ?

RX is still ok but now TX fails almost immediately...

With 100ms report :

$ iperf3 -c 192.168.1.47 -t 0 -p 5202 -R -i 0.1
Connecting to host 192.168.1.47, port 5202
Reverse mode, remote host 192.168.1.47 is sending
[  4] local 192.168.1.45 port 45900 connected to 192.168.1.47 port 5202
[ ID] Interval   Transfer Bandwidth
[  4]   0.00-0.10   sec  10.9 MBytes   913 Mbits/sec
[  4]   0.10-0.20   sec  11.0 MBytes   923 Mbits/sec
[  4]   0.20-0.30   sec  6.34 MBytes   532 Mbits/sec
[  4]   0.30-0.40   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.40-0.50   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.50-0.60   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.60-0.70   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.70-0.80   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.80-0.90   sec  0.00 Bytes  0.00 bits/sec
[  4]   0.90-1.00   sec  0.00 Bytes  0.00 bits/sec
[  4]   1.00-1.10   sec  0.00 Bytes  0.00 bits/sec
^C[  4]   1.10-1.10   sec  0.00 Bytes  0.00 bits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval   Transfer Bandwidth
[  4]   0.00-1.10   sec  0.00 Bytes  0.00 bits/sec  sender
[  4]   0.00-1.10   sec  28.2 MBytes   214 Mbits/sec  receiver
iperf3: interrupt - the client has terminated

Neil

> 
> I'm so sorry about this back and forth and I appreciate all your
> help .
> 
> Thanks and Best Regards,
> Jose Miguel Abreu
> 
> 
>>
>> Neil
>>
>>> Thanks and Best Regards,
>>> Jose Miguel Abreu
>>>
> 



Re: [PATCH can-next] can: ucan: remove duplicated include from ucan.c

2018-09-10 Thread Martin Elshuber
Thank you for the fix

Reviewed-by: Martin Elshuber 

Am 29.08.18 um 03:25 schrieb YueHaibing:
> Remove duplicated include.
> 
> Signed-off-by: YueHaibing 
> ---
>  drivers/net/can/usb/ucan.c | 4 
>  1 file changed, 4 deletions(-)
> 
> diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c
> index 0678a38..c6f4b41 100644
> --- a/drivers/net/can/usb/ucan.c
> +++ b/drivers/net/can/usb/ucan.c
> @@ -35,10 +35,6 @@
>  #include 
>  #include 
>  
> -#include 
> -#include 
> -#include 
> -
>  #define UCAN_DRIVER_NAME "ucan"
>  #define UCAN_MAX_RX_URBS 8
>  /* the CAN controller needs a while to enable/disable the bus */
> 




signature.asc
Description: OpenPGP digital signature


Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-10 Thread Xin Long
On Tue, Sep 11, 2018 at 12:13 AM David Ahern  wrote:
>
> On 9/9/18 12:29 AM, Xin Long wrote:
> >>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
> >>> index 18e00ce..e554922 100644
> >>> --- a/net/ipv6/route.c
> >>> +++ b/net/ipv6/route.c
> >>> @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct 
> >>> sk_buff *skb,
> >>>int iif, int type, u32 portid, u32 seq,
> >>>unsigned int flags)
> >>>  {
> >>> - struct rtmsg *rtm;
> >>> + struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
> >>> + struct rt6_info *rt6 = (struct rt6_info *)dst;
> >>> + u32 *pmetrics, table, fib6_flags;
> >>>   struct nlmsghdr *nlh;
> >>> + struct rtmsg *rtm;
> >>>   long expires = 0;
> >>> - u32 *pmetrics;
> >>> - u32 table;
> >>>
> >>>   nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
> >>>   if (!nlh)
> >>>   return -EMSGSIZE;
> >>>
> >>> + if (rt6) {
> >>> + fib6_dst = >rt6i_dst;
> >>> + fib6_src = >rt6i_src;
> >>> + fib6_flags = rt6->rt6i_flags;
> >>> + fib6_prefsrc = >rt6i_prefsrc;
> >>> + } else {
> >>> + fib6_dst = >fib6_dst;
> >>> + fib6_src = >fib6_src;
> >>> + fib6_flags = rt->fib6_flags;
> >>> + fib6_prefsrc = >fib6_prefsrc;
> >>> + }
> >>
> >> Unless I am missing something at the moment, an rt6_info can only have
> >> the same dst, src and prefsrc as the fib6_info on which it is based.
> >> Thus, only the flags is needed above. That simplifies this patch a lot.
> > If dst, src and prefsrc in rt6_info are always the same as these in 
> > fib6_info,
> > why do we need them in rt6_info? we could just get it by 'from'.
> >
>
> I just sent a patch removing rt6i_prefsrc. It is set with only 1 reader
> that can be converted.
>
> rt6i_src is checked against the fib6_info to invalidate a dst if the src
> has changed, so a valid rt will always have the same rt6i_src as the
> rt->from.
>
> rt6i_dst is set to the dest address / 128 in cases, so it should be used
> for rt6_info cases above.
So that means, I will use rt6i_dst and rt6i_flags when dst is set?
how about I use rt6i_src there as well? just to make it look clear.
and plus the gw/nh dump fix in rt6_fill_node():
-if (rt->fib6_nsiblings) {
+if (rt6) {
+if (fib6_flags & RTF_GATEWAY)
+if (nla_put_in6_addr(skb, RTA_GATEWAY,
+ >rt6i_gateway) < 0)
+goto nla_put_failure;
+
+if (dst->dev && nla_put_u32(skb, RTA_OIF, dst->dev->ifindex))
+goto nla_put_failure;
+} else if (rt->fib6_nsiblings) {
 struct fib6_info *sibling, *next_sibling;
 struct nlattr *mp;

looks good to you?


Re: [PATCH net] qmi_wwan: Support dynamic config on Quectel EP06

2018-09-10 Thread David Miller
From: Kristian Evensen 
Date: Sat,  8 Sep 2018 13:50:48 +0200

> Quectel EP06 (and EM06/EG06) supports dynamic configuration of USB
> interfaces, without the device changing VID/PID or configuration number.
> When the configuration is updated and interfaces are added/removed, the
> interface numbers change. This means that the current code for matching
> EP06 does not work.
> 
> This patch removes the current EP06 interface number match, and replaces
> it with a match on class, subclass and protocol. Unfortunately, matching
> on those three alone is not enough, as the diag interface exports the
> same values as QMI. The other serial interfaces + adb export different
> values and do not match.
> 
> The diag interface only has two endpoints, while the QMI interface has
> three. I have therefore added a check for number of interfaces, and we
> ignore the interface if the number of endpoints equals two.
> 
> Signed-off-by: Kristian Evensen 

Applied, thanks.


Re: [Patch net-next] htb: use anonymous union for simplicity

2018-09-10 Thread David Miller
From: Cong Wang 
Date: Fri,  7 Sep 2018 13:29:14 -0700

> cl->leaf.q is slightly more readable than cl->un.leaf.q.
> 
> Cc: Jamal Hadi Salim 
> Signed-off-by: Cong Wang 

Applied.


Re: [Patch net-next] net_sched: remove redundant qdisc lock classes

2018-09-10 Thread David Miller
From: Cong Wang 
Date: Fri,  7 Sep 2018 13:29:13 -0700

> We no longer take any spinlock on RX path for ingress qdisc,
> so this lockdep annotation is no longer needed.
> 
> Cc: Jamal Hadi Salim 
> Signed-off-by: Cong Wang 

Applied.


Re: [PATCH net-next v2] net: sched: cls_flower: dump offload count value

2018-09-10 Thread David Miller
From: Vlad Buslov 
Date: Fri,  7 Sep 2018 17:22:21 +0300

> Change flower in_hw_count type to fixed-size u32 and dump it as
> TCA_FLOWER_IN_HW_COUNT. This change is necessary to properly test shared
> blocks and re-offload functionality.
> 
> Signed-off-by: Vlad Buslov 
> Acked-by: Jiri Pirko 

Applied, thank you.


[PATCH net-next v1] net/tls: Fixed return value when tls_complete_pending_work() fails

2018-09-10 Thread Vakul Garg
In tls_sw_sendmsg() and tls_sw_sendpage(), the variable 'ret' has
been set to return value of tls_complete_pending_work(). This allows
return of proper error code if tls_complete_pending_work() fails.

Fixes: 3c4d7559159b ("tls: kernel TLS support")
Signed-off-by: Vakul Garg 
---
 net/tls/tls_sw.c | 10 ++
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/net/tls/tls_sw.c b/net/tls/tls_sw.c
index be4f2e990f9f..adab598bd6db 100644
--- a/net/tls/tls_sw.c
+++ b/net/tls/tls_sw.c
@@ -486,7 +486,7 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t size)
 {
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-   int ret = 0;
+   int ret;
int required_size;
long timeo = sock_sndtimeo(sk, msg->msg_flags & MSG_DONTWAIT);
bool eor = !(msg->msg_flags & MSG_MORE);
@@ -502,7 +502,8 @@ int tls_sw_sendmsg(struct sock *sk, struct msghdr *msg, 
size_t size)
 
lock_sock(sk);
 
-   if (tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, ))
+   ret = tls_complete_pending_work(sk, tls_ctx, msg->msg_flags, );
+   if (ret)
goto send_end;
 
if (unlikely(msg->msg_controllen)) {
@@ -637,7 +638,7 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 {
struct tls_context *tls_ctx = tls_get_ctx(sk);
struct tls_sw_context_tx *ctx = tls_sw_ctx_tx(tls_ctx);
-   int ret = 0;
+   int ret;
long timeo = sock_sndtimeo(sk, flags & MSG_DONTWAIT);
bool eor;
size_t orig_size = size;
@@ -657,7 +658,8 @@ int tls_sw_sendpage(struct sock *sk, struct page *page,
 
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
 
-   if (tls_complete_pending_work(sk, tls_ctx, flags, ))
+   ret = tls_complete_pending_work(sk, tls_ctx, flags, );
+   if (ret)
goto sendpage_end;
 
/* Call the sk_stream functions to manage the sndbuf mem. */
-- 
2.13.6



Re: unexpected GRO/veth behavior

2018-09-10 Thread Eric Dumazet



On 09/10/2018 08:22 AM, Paolo Abeni wrote:
 in this already heavy cost engine.
> 
> Yup, even if I do not see any measurable cost added by the posted code.

Sure, micro bench marks wont show anything.

Now, if GRO receives one packet every 100 usec, as many hosts in the wild do,
there is an additional cost because of icache being wasted.





Re: [PATCH net-next] net/ipv6: Remove rt6i_prefsrc

2018-09-10 Thread David Miller
From: dsah...@kernel.org
Date: Mon, 10 Sep 2018 09:11:28 -0700

> From: David Ahern 
> 
> After the conversion to fib6_info, rt6i_prefsrc has a single user that
> reads the value and otherwise it is only set. The one reader can be
> converted to use rt->from so rt6i_prefsrc can be removed, reducing
> rt6_info by another 20 bytes.
> 
> Signed-off-by: David Ahern 

Applied, thanks David.


Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Jose Abreu
On 10-09-2018 16:49, Neil Armstrong wrote:
> Hi Jose,
>
> On 10/09/2018 16:44, Jose Abreu wrote:
>> On 10-09-2018 14:46, Neil Armstrong wrote:
>>> hi Jose,
>>>
>>> On 10/09/2018 14:55, Jose Abreu wrote:
 On 10-09-2018 13:52, Jose Abreu wrote:
> Can you please try attached follow-up patch ? 
 Oh, please apply the whole series otherwise this will not apply
 cleanly.
>>> Indeed, it helps!
>>>
>>> With the fixups, it fails later, around 15s instead of 3, in RX and TX.
>> Thanks for testing Neil. What if we keep rearming the timer
>> whilst there are pending packets ? Something like in the attach.
>> (applies on top of previous one).
> It fixes RX, but TX fails after ~13s.

Ok :(

Can you please try attached follow-up patch ?

I'm so sorry about this back and forth and I appreciate all your
help .

Thanks and Best Regards,
Jose Miguel Abreu


>
> Neil
>
>> Thanks and Best Regards,
>> Jose Miguel Abreu
>>

>From 4f2ba5fca6c8858cfe640f3d466fd01904c451e3 Mon Sep 17 00:00:00 2001
Message-Id: <4f2ba5fca6c8858cfe640f3d466fd01904c451e3.1536596296.git.joab...@synopsys.com>
From: Jose Abreu 
Date: Mon, 10 Sep 2018 18:18:10 +0200
Subject: [PATCH] fixup_coalesce_3

Signed-off-by: Jose Abreu 
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 25 ++-
 1 file changed, 6 insertions(+), 19 deletions(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 76a6196b3263..f6587ee372ab 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2245,11 +2245,7 @@ static void stmmac_tx_timer_arm(struct stmmac_priv *priv, u32 queue)
 {
 	struct stmmac_tx_queue *tx_q = >tx_queue[queue];
 
-	if (tx_q->tx_timer_active)
-		return;
-
 	mod_timer(_q->txtimer, STMMAC_COAL_TIMER(priv->tx_coal_timer));
-	tx_q->tx_timer_active = true;
 }
 
 /**
@@ -2264,10 +2260,7 @@ static void stmmac_tx_timer(struct timer_list *t)
 	struct stmmac_priv *priv = tx_q->priv_data;
 	bool more;
 
-	tx_q->tx_timer_active = false;
 	stmmac_tx_clean(priv, ~0, tx_q->queue_index, );
-	if (more)
-		stmmac_tx_timer_arm(priv, tx_q->queue_index);
 }
 
 /**
@@ -2866,9 +2859,6 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 	/* Compute header lengths */
 	proto_hdr_len = skb_transport_offset(skb) + tcp_hdrlen(skb);
 
-	/* Start coalesce timer earlier in case TX Queue is stopped */
-	stmmac_tx_timer_arm(priv, queue);
-
 	/* Desc availability based on threshold should be enough safe */
 	if (unlikely(stmmac_tx_avail(priv, queue) <
 		(((skb->len - proto_hdr_len) / TSO_MAX_BUFF_SIZE + 1 {
@@ -2975,6 +2965,8 @@ static netdev_tx_t stmmac_tso_xmit(struct sk_buff *skb, struct net_device *dev)
 		stmmac_set_tx_ic(priv, desc);
 		priv->xstats.tx_set_ic_bit++;
 		tx_q->tx_count_frames = 0;
+	} else {
+		stmmac_tx_timer_arm(priv, queue);
 	}
 
 	skb_tx_timestamp(skb);
@@ -3065,9 +3057,6 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 			return stmmac_tso_xmit(skb, dev);
 	}
 
-	/* Start coalesce timer earlier in case TX Queue is stopped */
-	stmmac_tx_timer_arm(priv, queue);
-
 	if (unlikely(stmmac_tx_avail(priv, queue) < nfrags + 1)) {
 		if (!netif_tx_queue_stopped(netdev_get_tx_queue(dev, queue))) {
 			netif_tx_stop_queue(netdev_get_tx_queue(priv->dev,
@@ -3186,6 +3175,8 @@ static netdev_tx_t stmmac_xmit(struct sk_buff *skb, struct net_device *dev)
 		stmmac_set_tx_ic(priv, desc);
 		priv->xstats.tx_set_ic_bit++;
 		tx_q->tx_count_frames = 0;
+	} else {
+		stmmac_tx_timer_arm(priv, queue);
 	}
 
 	skb_tx_timestamp(skb);
@@ -3572,16 +3563,12 @@ static int stmmac_tx_poll(struct napi_struct *napi, int budget)
 	struct stmmac_priv *priv = tx_q->priv_data;
 	u32 chan = tx_q->queue_index;
 	int work_done = 0;
-	bool more;
 
 	priv->xstats.napi_poll++;
 
-	work_done = stmmac_tx_clean(priv, budget, chan, );
-	if (work_done < budget) {
+	work_done = stmmac_tx_clean(priv, budget, chan, NULL);
+	if (work_done < budget)
 		napi_complete_done(napi, work_done);
-		if (more)
-			napi_reschedule(napi);
-	}
 
 	return min(work_done, budget);
 }
-- 
2.7.4



Re: [net-next, PATCH 2/2, v1] net: socionext: add AF_XDP support

2018-09-10 Thread Ilias Apalodimas
> > @@ -707,6 +731,26 @@ static int netsec_process_rx(struct netsec_priv *priv, 
> > int budget)
> > if (unlikely(!buf_addr))
> > break;
> >  
> > +   if (xdp_prog) {
> > +   xdp_result = netsec_run_xdp(desc, priv, xdp_prog,
> > +   pkt_len);
> > +   if (xdp_result != NETSEC_XDP_PASS) {
> > +   xdp_flush |= xdp_result & NETSEC_XDP_REDIR;
> > +
> > +   dma_unmap_single_attrs(priv->dev,
> > +  desc->dma_addr,
> > +  desc->len, DMA_TO_DEVICE,
> > +  DMA_ATTR_SKIP_CPU_SYNC);
> > +
> > +   desc->len = desc_len;
> > +   desc->dma_addr = dma_handle;
> > +   desc->addr = buf_addr;
> > +   netsec_rx_fill(priv, idx, 1);
> > +   nsetsec_adv_desc(>tail);
> > +   }
> > +   continue;
> 
> Continue even on XDP_PASS? Is this really correct?
> 
> Also seems there is no handling of adjust_head/tail for XDP_PASS case.
> 
A question on this. Should XDP related frames be allocated using 1 page
per packet?

Thanks

Ilias


Re: [PATCH net] ipv6: use rt6_info members when dst is set in rt6_fill_node

2018-09-10 Thread David Ahern
On 9/9/18 12:29 AM, Xin Long wrote:
>>> diff --git a/net/ipv6/route.c b/net/ipv6/route.c
>>> index 18e00ce..e554922 100644
>>> --- a/net/ipv6/route.c
>>> +++ b/net/ipv6/route.c
>>> @@ -4670,20 +4670,33 @@ static int rt6_fill_node(struct net *net, struct 
>>> sk_buff *skb,
>>>int iif, int type, u32 portid, u32 seq,
>>>unsigned int flags)
>>>  {
>>> - struct rtmsg *rtm;
>>> + struct rt6key *fib6_prefsrc, *fib6_dst, *fib6_src;
>>> + struct rt6_info *rt6 = (struct rt6_info *)dst;
>>> + u32 *pmetrics, table, fib6_flags;
>>>   struct nlmsghdr *nlh;
>>> + struct rtmsg *rtm;
>>>   long expires = 0;
>>> - u32 *pmetrics;
>>> - u32 table;
>>>
>>>   nlh = nlmsg_put(skb, portid, seq, type, sizeof(*rtm), flags);
>>>   if (!nlh)
>>>   return -EMSGSIZE;
>>>
>>> + if (rt6) {
>>> + fib6_dst = >rt6i_dst;
>>> + fib6_src = >rt6i_src;
>>> + fib6_flags = rt6->rt6i_flags;
>>> + fib6_prefsrc = >rt6i_prefsrc;
>>> + } else {
>>> + fib6_dst = >fib6_dst;
>>> + fib6_src = >fib6_src;
>>> + fib6_flags = rt->fib6_flags;
>>> + fib6_prefsrc = >fib6_prefsrc;
>>> + }
>>
>> Unless I am missing something at the moment, an rt6_info can only have
>> the same dst, src and prefsrc as the fib6_info on which it is based.
>> Thus, only the flags is needed above. That simplifies this patch a lot.
> If dst, src and prefsrc in rt6_info are always the same as these in fib6_info,
> why do we need them in rt6_info? we could just get it by 'from'.
> 

I just sent a patch removing rt6i_prefsrc. It is set with only 1 reader
that can be converted.

rt6i_src is checked against the fib6_info to invalidate a dst if the src
has changed, so a valid rt will always have the same rt6i_src as the
rt->from.

rt6i_dst is set to the dest address / 128 in cases, so it should be used
for rt6_info cases above.


[PATCH net-next] net/ipv6: Remove rt6i_prefsrc

2018-09-10 Thread dsahern
From: David Ahern 

After the conversion to fib6_info, rt6i_prefsrc has a single user that
reads the value and otherwise it is only set. The one reader can be
converted to use rt->from so rt6i_prefsrc can be removed, reducing
rt6_info by another 20 bytes.

Signed-off-by: David Ahern 
---
 drivers/scsi/cxgbi/libcxgbi.c |  4 ++--
 include/net/ip6_fib.h |  1 -
 net/ipv6/route.c  | 27 ---
 3 files changed, 2 insertions(+), 30 deletions(-)

diff --git a/drivers/scsi/cxgbi/libcxgbi.c b/drivers/scsi/cxgbi/libcxgbi.c
index 3f3af5e74a07..6b3ea50c594e 100644
--- a/drivers/scsi/cxgbi/libcxgbi.c
+++ b/drivers/scsi/cxgbi/libcxgbi.c
@@ -784,7 +784,7 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
csk->mtu = mtu;
csk->dst = dst;
 
-   if (ipv6_addr_any(>rt6i_prefsrc.addr)) {
+   if (!rt->from || ipv6_addr_any(>from->fib6_prefsrc.addr)) {
struct inet6_dev *idev = ip6_dst_idev((struct dst_entry *)rt);
 
err = ipv6_dev_get_saddr(_net, idev ? idev->dev : NULL,
@@ -795,7 +795,7 @@ cxgbi_check_route6(struct sockaddr *dst_addr, int ifindex)
goto rel_rt;
}
} else {
-   pref_saddr = rt->rt6i_prefsrc.addr;
+   pref_saddr = rt->from->fib6_prefsrc.addr;
}
 
csk->csk_family = AF_INET6;
diff --git a/include/net/ip6_fib.h b/include/net/ip6_fib.h
index 3d4930528db0..c7496663f99a 100644
--- a/include/net/ip6_fib.h
+++ b/include/net/ip6_fib.h
@@ -182,7 +182,6 @@ struct rt6_info {
struct in6_addr rt6i_gateway;
struct inet6_dev*rt6i_idev;
u32 rt6i_flags;
-   struct rt6key   rt6i_prefsrc;
 
struct list_headrt6i_uncached;
struct uncached_list*rt6i_uncached_list;
diff --git a/net/ipv6/route.c b/net/ipv6/route.c
index 18e00ce1719a..41f04b966008 100644
--- a/net/ipv6/route.c
+++ b/net/ipv6/route.c
@@ -995,7 +995,6 @@ static void ip6_rt_copy_init(struct rt6_info *rt, struct 
fib6_info *ort)
 #ifdef CONFIG_IPV6_SUBTREES
rt->rt6i_src = ort->fib6_src;
 #endif
-   rt->rt6i_prefsrc = ort->fib6_prefsrc;
 }
 
 static struct fib6_node* fib6_backtrack(struct fib6_node *fn,
@@ -1449,11 +1448,6 @@ static int rt6_insert_exception(struct rt6_info *nrt,
if (ort->fib6_src.plen)
src_key = >rt6i_src.addr;
 #endif
-
-   /* Update rt6i_prefsrc as it could be changed
-* in rt6_remove_prefsrc()
-*/
-   nrt->rt6i_prefsrc = ort->fib6_prefsrc;
/* rt6_mtu_change() might lower mtu on ort.
 * Only insert this exception route if its mtu
 * is less than ort's mtu value.
@@ -1635,25 +1629,6 @@ static void rt6_update_exception_stamp_rt(struct 
rt6_info *rt)
rcu_read_unlock();
 }
 
-static void rt6_exceptions_remove_prefsrc(struct fib6_info *rt)
-{
-   struct rt6_exception_bucket *bucket;
-   struct rt6_exception *rt6_ex;
-   int i;
-
-   bucket = rcu_dereference_protected(rt->rt6i_exception_bucket,
-   lockdep_is_held(_exception_lock));
-
-   if (bucket) {
-   for (i = 0; i < FIB6_EXCEPTION_BUCKET_SIZE; i++) {
-   hlist_for_each_entry(rt6_ex, >chain, hlist) {
-   rt6_ex->rt6i->rt6i_prefsrc.plen = 0;
-   }
-   bucket++;
-   }
-   }
-}
-
 static bool rt6_mtu_change_route_allowed(struct inet6_dev *idev,
 struct rt6_info *rt, int mtu)
 {
@@ -3795,8 +3770,6 @@ static int fib6_remove_prefsrc(struct fib6_info *rt, void 
*arg)
spin_lock_bh(_exception_lock);
/* remove prefsrc entry */
rt->fib6_prefsrc.plen = 0;
-   /* need to update cache as well */
-   rt6_exceptions_remove_prefsrc(rt);
spin_unlock_bh(_exception_lock);
}
return 0;
-- 
2.11.0



Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Neil Armstrong
Hi Jose,

On 10/09/2018 16:44, Jose Abreu wrote:
> On 10-09-2018 14:46, Neil Armstrong wrote:
>> hi Jose,
>>
>> On 10/09/2018 14:55, Jose Abreu wrote:
>>> On 10-09-2018 13:52, Jose Abreu wrote:
 Can you please try attached follow-up patch ? 
>>> Oh, please apply the whole series otherwise this will not apply
>>> cleanly.
>> Indeed, it helps!
>>
>> With the fixups, it fails later, around 15s instead of 3, in RX and TX.
> 
> Thanks for testing Neil. What if we keep rearming the timer
> whilst there are pending packets ? Something like in the attach.
> (applies on top of previous one).

It fixes RX, but TX fails after ~13s.

Neil

> 
> Thanks and Best Regards,
> Jose Miguel Abreu
> 



Re: unexpected GRO/veth behavior

2018-09-10 Thread Paolo Abeni
On Mon, 2018-09-10 at 07:56 -0700, Eric Dumazet wrote:
> 
> On 09/10/2018 07:44 AM, Paolo Abeni wrote:
> > hi all,
> > 
> > while testing some local patches I observed that the TCP tput in the
> > following scenario:
> > 
> > # the following enable napi on veth0, so that we can trigger the
> > # GRO path with namespaces
> > ip netns add test
> > ip link add type veth
> > ip link set dev veth0 netns test
> > ip -n test link set lo up
> > ip -n test link set veth0 up
> > ip -n test addr add dev veth0 172.16.1.2/24
> > ip link set dev veth1 up
> > ip addr add dev veth1 172.16.1.1/24
> > IDX=`ip netns exec test cat /sys/class/net/veth0/ifindex`
> > 
> > # 'xdp_pass' is a NO-OP XDP program that simply return XDP_PASS
> > ip netns exec test ./xdp_pass $IDX &
> > taskset 0x2 ip netns exec test iperf3 -s -i 60 &
> > taskset 0x1 iperf3 -c 172.16.1.2 -t 60 -i 60
> > 
> > is quite lower than expected (~800Mbps). 'perf' shows a weird topmost 
> > offender:
> > 
> 
> 
> But... why GRO would even be needed in this scenario ?

AFAICS, attaching an XDP program to a veth device makes TCP flows over
such veth unconditionally hit this code path since:

commit 948d4f214fde43743c57aae0c708bff44f6345f2
Author: Toshiaki Makita 
Date:   Fri Aug 3 16:58:10 2018 +0900

veth: Add driver XDP

I'm personally looking for some way to hit the GRO code path with
selftest/namespaces.

> GRO is really meant for physical devices, having to mess with skb->sk adds 
> extra cost
> in this already heavy cost engine.

Yup, even if I do not see any measurable cost added by the posted code.

Cheers,

Paolo



Re: [PATCH] cxgb4: fix abort_req_rss6 struct

2018-09-10 Thread Steve Wise


(sorry for the late reply, I was out all last week)


On 9/5/2018 5:55 PM, Jason Gunthorpe wrote:
> On Fri, Aug 31, 2018 at 11:52:00AM -0700, Steve Wise wrote:
>> Remove the incorrect WR_HDR field which can cause a misinterpretation
>> of this CPL by ULDs.
> 
> What does that mean?
> 

It means iw_cxgb4 doesn't correctly handle incoming ABORT CPL messages
due to this incorrect struct.


> Is this an -rc patch?
> 

Yes. Since it fixes a3cdaa69e4ae ("cxgb4: Adds CPL support for Shared
Receive Queues") which went into 4.19.

Steve.




Re: unexpected GRO/veth behavior

2018-09-10 Thread Eric Dumazet



On 09/10/2018 07:44 AM, Paolo Abeni wrote:
> hi all,
> 
> while testing some local patches I observed that the TCP tput in the
> following scenario:
> 
> # the following enable napi on veth0, so that we can trigger the
> # GRO path with namespaces
> ip netns add test
> ip link add type veth
> ip link set dev veth0 netns test
> ip -n test link set lo up
> ip -n test link set veth0 up
> ip -n test addr add dev veth0 172.16.1.2/24
> ip link set dev veth1 up
> ip addr add dev veth1 172.16.1.1/24
> IDX=`ip netns exec test cat /sys/class/net/veth0/ifindex`
> 
> # 'xdp_pass' is a NO-OP XDP program that simply return XDP_PASS
> ip netns exec test ./xdp_pass $IDX &
> taskset 0x2 ip netns exec test iperf3 -s -i 60 &
> taskset 0x1 iperf3 -c 172.16.1.2 -t 60 -i 60
> 
> is quite lower than expected (~800Mbps). 'perf' shows a weird topmost 
> offender:
>


But... why GRO would even be needed in this scenario ?

GRO is really meant for physical devices, having to mess with skb->sk adds 
extra cost
in this already heavy cost engine.

Virtual devices should already be fed with TSO packets.






unexpected GRO/veth behavior

2018-09-10 Thread Paolo Abeni
hi all,

while testing some local patches I observed that the TCP tput in the
following scenario:

# the following enable napi on veth0, so that we can trigger the
# GRO path with namespaces
ip netns add test
ip link add type veth
ip link set dev veth0 netns test
ip -n test link set lo up
ip -n test link set veth0 up
ip -n test addr add dev veth0 172.16.1.2/24
ip link set dev veth1 up
ip addr add dev veth1 172.16.1.1/24
IDX=`ip netns exec test cat /sys/class/net/veth0/ifindex`

# 'xdp_pass' is a NO-OP XDP program that simply return XDP_PASS
ip netns exec test ./xdp_pass $IDX &
taskset 0x2 ip netns exec test iperf3 -s -i 60 &
taskset 0x1 iperf3 -c 172.16.1.2 -t 60 -i 60

is quite lower than expected (~800Mbps). 'perf' shows a weird topmost 
offender:

  80.42%  [kernel]   [k] find_bug

I *think* skb_gro_receive() does not behave correctly if !skb->sk, and
I experimented with the following patch, so far with success (in the
above scenario tput is now ~11Gbps). Am I missing something? it that
overkill?

Thank you,

Paolo

---
diff --git a/net/core/dev.c b/net/core/dev.c
index ca78dc5..94723b1 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -5554,6 +5554,11 @@ struct packet_offload *gro_find_complete_by_type(__be16 
type)
 
 static void napi_skb_free_stolen_head(struct sk_buff *skb)
 {
+   if (skb->destructor) {
+   WARN_ON(in_irq());
+   skb->destructor(skb);
+   }
+
skb_dst_drop(skb);
secpath_reset(skb);
kmem_cache_free(skbuff_head_cache, skb);
diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index c996c09..19f2fd9 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3827,6 +3827,7 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff 
*skb)
unsigned int offset = skb_gro_offset(skb);
unsigned int headlen = skb_headlen(skb);
unsigned int len = skb_gro_len(skb);
+   unsigned int new_skb_truesize;
unsigned int delta_truesize;
struct sk_buff *lp;
 
@@ -3858,11 +3859,13 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff 
*skb)
frag->page_offset += offset;
skb_frag_size_sub(frag, offset);
 
-   /* all fragments truesize : remove (head size + sk_buff) */
-   delta_truesize = skb->truesize -
-SKB_TRUESIZE(skb_end_offset(skb));
+   /* all fragments truesize : remove (head size + sk_buff);
+* keep unchanged the amount of memory accounted to skb->sk
+*/
+   new_skb_truesize = SKB_TRUESIZE(skb_end_offset(skb));
+   delta_truesize = skb->truesize - new_skb_truesize;
 
-   skb->truesize -= skb->data_len;
+   skb->truesize -= new_skb_truesize;
skb->len -= skb->data_len;
skb->data_len = 0;
 
@@ -3891,12 +3894,24 @@ int skb_gro_receive(struct sk_buff *p, struct sk_buff 
*skb)
memcpy(frag + 1, skbinfo->frags, sizeof(*frag) * 
skbinfo->nr_frags);
/* We dont need to clear skbinfo->nr_frags here */
 
-   delta_truesize = skb->truesize - SKB_DATA_ALIGN(sizeof(struct 
sk_buff));
+   /* keep unchanged the amount of memory accounted to skb->sk */
+   new_skb_truesize = SKB_DATA_ALIGN(sizeof(struct sk_buff));
+   delta_truesize = skb->truesize - new_skb_truesize;
+   skb->truesize = new_skb_truesize;
NAPI_GRO_CB(skb)->free = NAPI_GRO_FREE_STOLEN_HEAD;
goto done;
}
 
 merge:
+   if (skb->destructor) {
+   /* skb's truesize onwership is transferred to p, avoid
+* releasing it twice when p is freed
+*/
+   WARN_ON_ONCE(p->sk != skb->sk || p->destructor != 
skb->destructor);
+   skb->sk = 0;
+   skb->destructor = NULL;
+   }
+
delta_truesize = skb->truesize;
if (offset > headlen) {
unsigned int eat = offset - headlen;





Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Jose Abreu
On 10-09-2018 14:46, Neil Armstrong wrote:
> hi Jose,
>
> On 10/09/2018 14:55, Jose Abreu wrote:
>> On 10-09-2018 13:52, Jose Abreu wrote:
>>> Can you please try attached follow-up patch ? 
>> Oh, please apply the whole series otherwise this will not apply
>> cleanly.
> Indeed, it helps!
>
> With the fixups, it fails later, around 15s instead of 3, in RX and TX.

Thanks for testing Neil. What if we keep rearming the timer
whilst there are pending packets ? Something like in the attach.
(applies on top of previous one).

Thanks and Best Regards,
Jose Miguel Abreu
>From 5d5a6bd882006f14c59b351f4324160115f818c0 Mon Sep 17 00:00:00 2001
Message-Id: <5d5a6bd882006f14c59b351f4324160115f818c0.1536590220.git.joab...@synopsys.com>
From: Jose Abreu 
Date: Mon, 10 Sep 2018 16:36:37 +0200
Subject: [PATCH] fixup_coalesce_2

Signed-off-by: Jose Abreu 
---
 drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 5 -
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
index 7875e81966fb..76a6196b3263 100644
--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
@@ -2262,9 +2262,12 @@ static void stmmac_tx_timer(struct timer_list *t)
 {
 	struct stmmac_tx_queue *tx_q = from_timer(tx_q, t, txtimer);
 	struct stmmac_priv *priv = tx_q->priv_data;
+	bool more;
 
-	stmmac_tx_clean(priv, ~0, tx_q->queue_index, NULL);
 	tx_q->tx_timer_active = false;
+	stmmac_tx_clean(priv, ~0, tx_q->queue_index, );
+	if (more)
+		stmmac_tx_timer_arm(priv, tx_q->queue_index);
 }
 
 /**
-- 
2.7.4



Re: [PATCH net] qmi_wwan: Support dynamic config on Quectel EP06

2018-09-10 Thread Dan Williams
On Sat, 2018-09-08 at 16:12 +0200, Bjørn Mork wrote:
> Kristian Evensen  writes:
> 
> > Quectel EP06 (and EM06/EG06) supports dynamic configuration of USB
> > interfaces, without the device changing VID/PID or configuration
> > number.
> > When the configuration is updated and interfaces are added/removed,
> > the
> > interface numbers change. This means that the current code for
> > matching
> > EP06 does not work.
> 
> That's annoying, but hardly surprising. They obviously try to make
> life
> as hard as possible for the drivers.  I wonder what the Windows
> drivers
> do here, if there are any?  Or are these modules only used in
> embedded
> Linux devices?
> 
> > This patch removes the current EP06 interface number match, and
> > replaces
> > it with a match on class, subclass and protocol. Unfortunately,
> > matching
> > on those three alone is not enough, as the diag interface exports
> > the
> > same values as QMI. The other serial interfaces + adb export
> > different
> > values and do not match.
> > 
> > The diag interface only has two endpoints, while the QMI interface
> > has
> > three. I have therefore added a check for number of interfaces, and
> > we
> > ignore the interface if the number of endpoints equals two.
> 
> Could this break if more/other functions are enabled?  Are you sure
> there can't be any other type of serial function with 3 endpoints and
> ff/ff/ff?  Well, I guess no one knows for sure... And this is more
> than
> good enough until it breaks. Thanks for solving the puzzle.  Looks
> good
> to me.

I'm sure they could add another serial interface with ff/ff/ff and 3
endpoints, but I don't know what else we can do to make this device
work correctly.  I double-checked the permutations & logic and it makes
sense to me.

Acked-by: Dan Williams 

Dan


Re: [PATCH net-next v3 4/7] net: aquantia: implement EEE support

2018-09-10 Thread Andrew Lunn
On Mon, Sep 10, 2018 at 12:39:31PM +0300, Igor Russkikh wrote:
> From: Yana Esina 
> 
> Support of Energy-Efficient Ethernet to aQuantia NIC's via ethtool
> (according to the IEEE 802.3az specifications)
> 
> Signed-off-by: Yana Esina 
> Signed-off-by: Nikita Danilov 
> Tested-by: Nikita Danilov 
> Signed-off-by: Igor Russkikh 

Reviewed-by: Andrew Lunn 

Andrew


Re: [PATCH net-next v3 3/7] net: aquantia: implement WOL support

2018-09-10 Thread Andrew Lunn
On Mon, Sep 10, 2018 at 12:39:30PM +0300, Igor Russkikh wrote:
> From: Yana Esina 
> 
> Add WOL support. Currently only magic packet
> (ethtool -s  wol g) feature is implemented.
> 
> Remove hw_set_power and move that to FW_OPS set_power:
> because WOL configuration behaves differently on 1x and 2x
> firmwares
> 
> Signed-off-by: Yana Esina 
> Signed-off-by: Nikita Danilov 
> Tested-by: Nikita Danilov 
> Signed-off-by: Igor Russkikh 

Reviewed-by: Andrew Lunn 

Andrew


Re: [PATCH net-next v3 2/7] net: aquantia: definitions for WOL

2018-09-10 Thread Andrew Lunn
On Mon, Sep 10, 2018 at 12:39:29PM +0300, Igor Russkikh wrote:
> From: Yana Esina 
> 
> Added definitions and structures needed to support WOL.
> 
> Signed-off-by: Yana Esina 
> Signed-off-by: Nikita Danilov 
> Tested-by: Nikita Danilov 
> Signed-off-by: Igor Russkikh 

Reviewed-by: Andrew Lunn 

Andrew


Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Neil Armstrong
hi Jose,

On 10/09/2018 14:55, Jose Abreu wrote:
> On 10-09-2018 13:52, Jose Abreu wrote:
>>
>> Can you please try attached follow-up patch ? 
> 
> Oh, please apply the whole series otherwise this will not apply
> cleanly.

Indeed, it helps!

With the fixups, it fails later, around 15s instead of 3, in RX and TX.

Neil

> 
> Thanks and Best Regards,
> Jose Miguel Abreu
> 
> 



Re: [PATCH can-next] can: ucan: remove set but not used variable 'udev'

2018-09-10 Thread Martin Elshuber
Thank you for the fix

Reviewed-by: Martin Elshuber 

> Fixes gcc '-Wunused-but-set-variable' warning:
> 
> drivers/net/can/usb/ucan.c: In function 'ucan_disconnect':
> drivers/net/can/usb/ucan.c:1578:21: warning:
>  variable 'udev' set but not used [-Wunused-but-set-variable]
>   struct usb_device *udev;
> 
> Signed-off-by: YueHaibing 
> ---
>  drivers/net/can/usb/ucan.c | 3 ---
>  1 file changed, 3 deletions(-)
> 
> diff --git a/drivers/net/can/usb/ucan.c b/drivers/net/can/usb/ucan.c
> index 0678a38..c9fd83e 100644
> --- a/drivers/net/can/usb/ucan.c
> +++ b/drivers/net/can/usb/ucan.c
> @@ -1575,11 +1575,8 @@ static int ucan_probe(struct usb_interface *intf,
>  /* disconnect the device */
>  static void ucan_disconnect(struct usb_interface *intf)
>  {
> - struct usb_device *udev;
>   struct ucan_priv *up = usb_get_intfdata(intf);
>  
> - udev = interface_to_usbdev(intf);
> -
>   usb_set_intfdata(intf, NULL);
>  
>   if (up) {
> 




signature.asc
Description: OpenPGP digital signature


Re: [PATCH net-next] virtio_net: ethtool tx napi configuration

2018-09-10 Thread Willem de Bruijn
On Mon, Sep 10, 2018 at 2:01 AM Jason Wang  wrote:
>
>
>
> On 2018年09月10日 06:44, Willem de Bruijn wrote:
> > From: Willem de Bruijn 
> >
> > Implement ethtool .set_coalesce (-C) and .get_coalesce (-c) handlers.
> > Interrupt moderation is currently not supported, so these accept and
> > display the default settings of 0 usec and 1 frame.
> >
> > Toggle tx napi through a bit in tx-frames. So as to not interfere
> > with possible future interrupt moderation, use bit 10, well outside
> > the reasonable range of real interrupt moderation values.
> >
> > Changes are not atomic. The tx IRQ, napi BH and transmit path must
> > be quiesced when switching modes. Only allow changing this setting
> > when the device is down.
>
> I cook a fixup, and it looks works in my setup:
>
> diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> index b320b6b14749..9181c3f2f832 100644
> --- a/drivers/net/virtio_net.c
> +++ b/drivers/net/virtio_net.c
> @@ -2204,10 +2204,17 @@ static int virtnet_set_coalesce(struct
> net_device *dev,
>  return -EINVAL;
>
>  if (napi_weight ^ vi->sq[0].napi.weight) {
> -   if (dev->flags & IFF_UP)
> -   return -EBUSY;
> -   for (i = 0; i < vi->max_queue_pairs; i++)
> +   for (i = 0; i < vi->max_queue_pairs; i++) {
> +   struct netdev_queue *txq =
> +  netdev_get_tx_queue(vi->dev, i);
> +
> + virtnet_napi_tx_disable(>sq[i].napi);
> +   __netif_tx_lock_bh(txq);
>  vi->sq[i].napi.weight = napi_weight;
> +   __netif_tx_unlock_bh(txq);
> +   virtnet_napi_tx_enable(vi, vi->sq[i].vq,
> + >sq[i].napi);
> +   }
>  }
>
>  return 0;

Thanks! It passes my simple stress test, too. Which consists of two
concurrent loops, one toggling the ethtool option, another running
TCP_RR.

> The only left case is the speculative tx polling in RX NAPI. I think we
> don't need to care in this case since it was not a must for correctness.

As long as the txq lock is held that will be a noop, anyway. The other
concurrent action is skb_xmit_done. It looks correct to me, but need
to think about it a bit. The tricky transition is coming out of napi without
having >= 2 + MAX_SKB_FRAGS clean descriptors. If the queue is
stopped it may deadlock transmission in no-napi mode.

> >
> > Link: https://patchwork.ozlabs.org/patch/948149/
> > Suggested-by: Jason Wang 
> > Signed-off-by: Willem de Bruijn 
> > ---
> >   drivers/net/virtio_net.c | 52 
> >   1 file changed, 52 insertions(+)
> >
> > diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c
> > index 765920905226..b320b6b14749 100644
> > --- a/drivers/net/virtio_net.c
> > +++ b/drivers/net/virtio_net.c
> > @@ -66,6 +66,8 @@ DECLARE_EWMA(pkt_len, 0, 64)
> >
> >   #define VIRTNET_DRIVER_VERSION "1.0.0"
> >
> > +static const u32 ethtool_coalesce_napi_mask = (1UL << 10);
> > +
> >   static const unsigned long guest_offloads[] = {
> >   VIRTIO_NET_F_GUEST_TSO4,
> >   VIRTIO_NET_F_GUEST_TSO6,
> > @@ -2181,6 +2183,54 @@ static int virtnet_get_link_ksettings(struct 
> > net_device *dev,
> >   return 0;
> >   }
> >
> > +static int virtnet_set_coalesce(struct net_device *dev,
> > + struct ethtool_coalesce *ec)
> > +{
> > + const struct ethtool_coalesce ec_default = {
> > + .cmd = ETHTOOL_SCOALESCE,
> > + .rx_max_coalesced_frames = 1,
>
> I think rx part is no necessary.

The definition of ethtool_coalesce has:

"* It is illegal to set both usecs and max_frames to zero as this
 * would cause interrupts to never be generated.  To disable
 * coalescing, set usecs = 0 and max_frames = 1."

I'd rather not diverge from this prescribed behavior unless there's
a strong reason.

On the related point in the other thread:

> Rethink about this, how about something like:
>
> - UINT_MAX: no tx interrupt
> - other value: tx interrupt with possible interrupt moderation

Okay, that will be simpler to configure.


Re: [PATCH v3 net-next 6/6] net: dsa: Add Lantiq / Intel DSA driver for vrx200

2018-09-10 Thread Andrew Lunn
On Sun, Sep 09, 2018 at 10:20:39PM +0200, Hauke Mehrtens wrote:
> +static void gswip_phylink_validate(struct dsa_switch *ds, int port,
> +unsigned long *supported,
> +struct phylink_link_state *state)
> +{
> + __ETHTOOL_DECLARE_LINK_MODE_MASK(mask) = { 0, };
> +
> + switch (port) {
> + case 0:
> + case 1:
> + if (!phy_interface_mode_is_rgmii(state->interface) &&
> + state->interface != PHY_INTERFACE_MODE_MII &&
> + state->interface != PHY_INTERFACE_MODE_REVMII &&
> + state->interface != PHY_INTERFACE_MODE_RMII) {
> + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
> + dev_err(ds->dev,
> + "Unsupported interface: %d\n", state->interface);
> + return;
> + }
> + break;
> + case 2:
> + case 3:
> + case 4:
> + if (state->interface != PHY_INTERFACE_MODE_INTERNAL) {
> + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
> + dev_err(ds->dev,
> + "Unsupported interface: %d\n", state->interface);
> + return;
> + }
> + break;
> + case 5:
> + if (!phy_interface_mode_is_rgmii(state->interface) &&
> + state->interface != PHY_INTERFACE_MODE_INTERNAL) {
> + bitmap_zero(supported, __ETHTOOL_LINK_MODE_MASK_NBITS);
> + dev_err(ds->dev,
> + "Unsupported interface: %d\n", state->interface);
> + return;

Hi Hauke

Minor nit. You have the same thing repeated three times. Maybe change
it to a goto out; and have the error block only once at the out:
label.

> +static int gswip_gphy_fw_list(struct gswip_priv *priv,
> +   struct device_node *gphy_fw_list_np, u32 version)
> +{
> + struct device *dev = priv->dev;
> + struct device_node *gphy_fw_np;
> + const struct of_device_id *match;
> + int err;
> + int i = 0;
> +
> + /* The The VRX200 rev 1.1 uses the GSWIP 2.0 and needs the older

Double The.

> +
> + /* bring up the mdio bus */
> + mdio_np = of_find_compatible_node(pdev->dev.of_node, NULL,
> +   "lantiq,xrx200-mdio");
> + if (mdio_np) {
> + err = gswip_mdio(priv, mdio_np);
> + if (err) {
> + dev_err(dev, "mdio probe failed\n");
> + goto gphy_fw;
> + }
> + }
> +
> + err = dsa_register_switch(priv->ds);
> + if (err) {
> + dev_err(dev, "dsa switch register failed: %i\n", err);
> + goto mdio_bus;
> + }

> + if (priv->ds->dst->cpu_dp->index != priv->hw_info->cpu_port) {

I think that can be simplified to

if (!dsa_is_cpu_port(ds, priv->hw_info->cpu_port))

which is probably more readable.

Florian was also considering that we should move this test into the
DSA core. But for the moment, doing it here is O.K.

This is otherwise looking good.

Thanks

Andrew


Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Jose Abreu
On 10-09-2018 13:52, Jose Abreu wrote:
>
> Can you please try attached follow-up patch ? 

Oh, please apply the whole series otherwise this will not apply
cleanly.

Thanks and Best Regards,
Jose Miguel Abreu




Re: [PATCH v3 net-next 3/6] dt-bindings: net: Add lantiq,xrx200-net DT bindings

2018-09-10 Thread Andrew Lunn
On Sun, Sep 09, 2018 at 10:16:44PM +0200, Hauke Mehrtens wrote:
> This adds the binding for the PMAC core between the CPU and the GSWIP
> switch found on the xrx200 / VR9 Lantiq / Intel SoC.
> 
> Signed-off-by: Hauke Mehrtens 
> Cc: devicet...@vger.kernel.org
> ---
>  .../devicetree/bindings/net/lantiq,xrx200-net.txt   | 21 
> +
>  1 file changed, 21 insertions(+)
>  create mode 100644 
> Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
> 
> diff --git a/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt 
> b/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
> new file mode 100644
> index ..8a2fe5200cdc
> --- /dev/null
> +++ b/Documentation/devicetree/bindings/net/lantiq,xrx200-net.txt
> @@ -0,0 +1,21 @@
> +Lantiq xRX200 GSWIP PMAC Ethernet driver
> +==
> +
> +Required properties:
> +
> +- compatible : "lantiq,xrx200-net" for the PMAC of the embedded
> + : GSWIP in the xXR200
> +- reg: memory range of the PMAC core inside of the GSWIP core
> +- interrupts : TX and RX DMA interrupts. Use interrupt-names "tx" for
> + : the TX interrupt and "rx" for the RX interrupt.
> +
> +Example:
> +
> +eth0: eth@E10B308 {
> + #address-cells = <1>;
> + #size-cells = <0>;
> + compatible = "lantiq,xrx200-net";
> + reg = <0xE10B308 0x30>;

Hi Hauke

This binding itself looks fine. I just find this address range a bit
odd. What are 0xe10b300-0xe10b307 used for? Are all 0x30 bytes used in
the range? The address range ending at 0xe10b338 seems a bit
odd. 0xe10b33f would be more typical.

I'm asking because it can be messy when you find out you need to
change the address range, and not break backwards compatibility.

 Andrew


Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Jose Abreu
Hi Neil,

On 10-09-2018 12:43, Neil Armstrong wrote:
> Hi Jose,
>
> On 10/09/2018 11:14, Jose Abreu wrote:
>> This follows David Miller advice and tries to fix coalesce timer in
>> multi-queue scenarios.
>>
>> We are now using per-queue coalesce values and per-queue TX timer.
>>
>> Coalesce timer default values was changed to 1ms and the coalesce frames
>> to 25.
>>
>> Tested in B2B setup between XGMAC2 and GMAC5.
>>
>> Signed-off-by: Jose Abreu 
>> Cc: Jerome Brunet 
>> Cc: Martin Blumenstingl 
>> Cc: David S. Miller 
>> Cc: Joao Pinto 
>> Cc: Giuseppe Cavallaro 
>> Cc: Alexandre Torgue 
>> ---
>> Jerome,
> Jerome is in holidays...
>
>> Can you please test if this one is okay ?
>
> I tested this patch on top of 4.18.7 with the previous patch (4ae0169fd1b3) 
> reverted.
>
> The TX or RX stopped a few seconds after iperf3 started :
> (iperf3 is running in server mode on the Amlogic A113D board)
>
> $ iperf3 -c 10.1.4.95
> Connecting to host 10.1.4.95, port 5201
> [  4] local 10.1.2.12 port 39906 connected to 10.1.4.95 port 5201
> [ ID] Interval   Transfer Bandwidth   Retr  Cwnd
> [  4]   0.00-1.00   sec  56.2 MBytes   472 Mbits/sec0660 KBytes
> [  4]   1.00-2.00   sec  56.2 MBytes   472 Mbits/sec0660 KBytes
> [  4]   2.00-3.00   sec  38.8 MBytes   325 Mbits/sec1   1.41 KBytes
> [  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec1   1.41 KBytes
> [  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec1   1.41 KBytes
> ^C[  4]   5.00-5.61   sec  0.00 Bytes  0.00 bits/sec0   1.41 KBytes
> - - - - - - - - - - - - - - - - - - - - - - - - -
> [ ID] Interval   Transfer Bandwidth   Retr
> [  4]   0.00-5.61   sec   151 MBytes   226 Mbits/sec3 sender
> [  4]   0.00-5.61   sec  0.00 Bytes  0.00 bits/sec  receiver
> iperf3: interrupt - the client has terminated
>
> $ iperf3 -c 10.1.4.95 -R
> Connecting to host 10.1.4.95, port 5201
> Reverse mode, remote host 10.1.4.95 is sending
> [  4] local 10.1.2.12 port 39894 connected to 10.1.4.95 port 5201
> [ ID] Interval   Transfer Bandwidth
> [  4]   0.00-1.00   sec  82.2 MBytes   690 Mbits/sec
> [  4]   1.00-2.00   sec  82.6 MBytes   693 Mbits/sec
> [  4]   2.00-3.00   sec  24.2 MBytes   203 Mbits/sec
> [  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec
> [  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec
> ^C[  4]   5.00-5.53   sec  0.00 Bytes  0.00 bits/sec
> - - - - - - - - - - - - - - - - - - - - - - - - -
> [ ID] Interval   Transfer Bandwidth
> [  4]   0.00-5.53   sec  0.00 Bytes  0.00 bits/sec  sender
> [  4]   0.00-5.53   sec   189 MBytes   287 Mbits/sec  receiver
> iperf3: interrupt - the client has terminated
>
> These works for hours without this patch applied.

Damn... I'm able to replicate the issue if I turn SMP off but
it's not consistent ...

Can you please try attached follow-up patch ? It's been running
for 10min now and I'm getting ~700Mbps on the same GMAC as you
have (3.71).

Thanks and Best Regards,
Jose Miguel Abreu

>
> Neil
>
>> Thanks and Best Regards,
>> Jose Miguel Abreu
>> ---
>>  drivers/net/ethernet/stmicro/stmmac/common.h  |   4 +-
>>  drivers/net/ethernet/stmicro/stmmac/stmmac.h  |   6 +-
>>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 207 
>> ++
>>  3 files changed, 135 insertions(+), 82 deletions(-)
>>
>> diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h 
>> b/drivers/net/ethernet/stmicro/stmmac/common.h
>> index 1854f270ad66..b1b305f8f414 100644
>> --- a/drivers/net/ethernet/stmicro/stmmac/common.h
>> +++ b/drivers/net/ethernet/stmicro/stmmac/common.h
>> @@ -258,10 +258,10 @@ struct stmmac_safety_stats {
>>  #define MAX_DMA_RIWT0xff
>>  #define MIN_DMA_RIWT0x20
>>  /* Tx coalesce parameters */
>> -#define STMMAC_COAL_TX_TIMER4
>> +#define STMMAC_COAL_TX_TIMER1000
>>  #define STMMAC_MAX_COAL_TX_TICK 10
>>  #define STMMAC_TX_MAX_FRAMES256
>> -#define STMMAC_TX_FRAMES64
>> +#define STMMAC_TX_FRAMES25
>>  
>>  /* Packets types */
>>  enum packets_types {
>> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h 
>> b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
>> index c0a855b7ab3b..957030cfb833 100644
>> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
>> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
>> @@ -48,6 +48,9 @@ struct stmmac_tx_info {
>>  
>>  /* Frequently used values are kept adjacent for cache effect */
>>  struct stmmac_tx_queue {
>> +u32 tx_count_frames;
>> +int tx_timer_active;
>> +struct timer_list txtimer;
>>  u32 queue_index;
>>  struct stmmac_priv *priv_data;
>>  struct dma_extended_desc *dma_etx cacheline_aligned_in_smp;
>> @@ -59,6 +62,7 @@ struct stmmac_tx_queue {
>>  dma_addr_t dma_tx_phy;
>>  u32 tx_tail_addr;
>>  u32 mss;
>> +struct napi_struct napi cacheline_aligned_in_smp;
>>  };
>>  
>>  struct 

Re: [PATCH v2 net] MIPS: lantiq: dma: add dev pointer

2018-09-10 Thread Andrew Lunn
On Sun, Sep 09, 2018 at 09:26:23PM +0200, Hauke Mehrtens wrote:
> dma_zalloc_coherent() now crashes if no dev pointer is given.
> Add a dev pointer to the ltq_dma_channel structure and fill it in the
> driver using it.
> 
> This fixes a bug introduced in kernel 4.19.
> 
> Signed-off-by: Hauke Mehrtens 
> ---
> 
> no changes since v1.
> 
> This should go into kernel 4.19 and I have some other patches adding new 
> features for kernel 4.20 which are depending on this, so I would prefer 
> if this goes through the net tree. 

Hi Hauke

Is this a build time dependency, or a runtime dependency?

What we don't want to do is add the switch driver to net-next and find
it does not compile because this change is not in net-next yet.

   Andrew


Re: Offloaded u32 classifier tables WAS (Re: [PATCH net 00/13] cls_u32 cleanups and fixes.

2018-09-10 Thread Jamal Hadi Salim

On 2018-09-10 8:25 a.m., Jamal Hadi Salim wrote:

On 2018-09-09 11:48 a.m., Al Viro wrote:



BTW, shouldn't we issue u32_clear_hw_hnode() every time
we destroy an hnode?  It's done on u32_delete(), it's
done (for root ht) on u32_destroy(), but it's not done
for any other hnodes when you remove the entire (not shared)
filter.  Looks fishy...



What you are saying makes sense, but that doesnt seem
like a new thing.
All hardware offload examples I have seen use root tables
only[1]. So I am not sure if they use any other tables
although the intel hardware at least seems very capable.
Look at ixgbe_main.c for example. Theres an explicit assumption
that root is always 0x800 but oresence of



"..but presence of other tables can be handled"

cheers,
jamal


Offloaded u32 classifier tables WAS (Re: [PATCH net 00/13] cls_u32 cleanups and fixes.

2018-09-10 Thread Jamal Hadi Salim

On 2018-09-09 11:48 a.m., Al Viro wrote:



BTW, shouldn't we issue u32_clear_hw_hnode() every time
we destroy an hnode?  It's done on u32_delete(), it's
done (for root ht) on u32_destroy(), but it's not done
for any other hnodes when you remove the entire (not shared)
filter.  Looks fishy...



What you are saying makes sense, but that doesnt seem
like a new thing.
All hardware offload examples I have seen use root tables
only[1]. So I am not sure if they use any other tables
although the intel hardware at least seems very capable.
Look at ixgbe_main.c for example. Theres an explicit assumption
that root is always 0x800 but oresence of

+Cc some of the NIC vendor folks..



cheers,
jamal

[1]
Here's a script posted by someone at Intel(Sridhar?) a while back
that adds 2 filters, one with skip-sw and the other with skip-hw
flag.

---
   # add ingress qdisc
   tc qdisc add dev p4p1 ingress

   # enable hw tc offload.
   ethtool -K p4p1 hw-tc-offload on

   # add u32 filter with skip-sw flag.
   tc filter add dev p4p1 parent : protocol ip prio 99 \
  handle 800:0:1 u32 ht 800: flowid 800:1 \
  skip-sw \
  match ip src 192.168.1.0/24 \
  action drop

   # add u32 filter with skip-hw flag.
   tc filter add dev p4p1 parent : protocol ip prio 99 \
  handle 800:0:2 u32 ht 800: flowid 800:2 \
  skip-hw \
  match ip src 192.168.2.0/24 \
  action drop



Re: [PATCH iproute2 3/3] bridge: fix vlan show formatting

2018-09-10 Thread Tobias Jungel
Hi Stephen,

I just have seen this patch, hence please ignore the "bridge: Correct
json output" I sent. This one solves the issue in a way more elegant
manor.

I tested the JSON output in this series and it works as intended.

Thanks


Reviewed-by: Tobias Jungel 


On Thu, 2018-09-06 at 16:30 +0100, Stephen Hemminger wrote:
> The output of vlan show was broken previous change to use json_print.
> Clean the code up and return to original format.
> 
> Note: the JSON syntax has changed to make the bridge vlan
> show more like other outputs (e.g. ip -j li show).
> 
> Signed-off-by: Stephen Hemminger 
> ---
>  bridge/br_common.h |  2 +-
>  bridge/link.c  |  6 ++---
>  bridge/vlan.c  | 61 +++-
> --
>  3 files changed, 40 insertions(+), 29 deletions(-)
> 
> diff --git a/bridge/br_common.h b/bridge/br_common.h
> index 2f1cb8fd9f3d..69665fde32b6 100644
> --- a/bridge/br_common.h
> +++ b/bridge/br_common.h
> @@ -6,7 +6,7 @@
>  #define MDB_RTR_RTA(r) \
>   ((struct rtattr *)(((char *)(r)) +
> RTA_ALIGN(sizeof(__u32
>  
> -extern void print_vlan_info(FILE *fp, struct rtattr *tb);
> +extern void print_vlan_info(struct rtattr *tb, int ifindex);

nitpick, this does not apply to master.

>  extern int print_linkinfo(const struct sockaddr_nl *who,
> struct nlmsghdr *n,
> void *arg);
> diff --git a/bridge/link.c b/bridge/link.c
> index 8d89aca2e638..a5ee9a5c58e6 100644
> --- a/bridge/link.c
> +++ b/bridge/link.c
> @@ -161,7 +161,7 @@ static void print_protinfo(FILE *fp, struct
> rtattr *attr)
>   * This is reported by HW devices that have some bridging
>   * capabilities.
>   */
> -static void print_af_spec(FILE *fp, struct rtattr *attr)
> +static void print_af_spec(struct rtattr *attr, int ifindex)
>  {
>   struct rtattr *aftb[IFLA_BRIDGE_MAX+1];
>  
> @@ -174,7 +174,7 @@ static void print_af_spec(FILE *fp, struct rtattr
> *attr)
>   return;
>  
>   if (aftb[IFLA_BRIDGE_VLAN_INFO])
> - print_vlan_info(fp, aftb[IFLA_BRIDGE_VLAN_INFO]);
> + print_vlan_info(aftb[IFLA_BRIDGE_VLAN_INFO], ifindex);
>  }
>  
>  int print_linkinfo(const struct sockaddr_nl *who,
> @@ -229,7 +229,7 @@ int print_linkinfo(const struct sockaddr_nl *who,
>   print_protinfo(fp, tb[IFLA_PROTINFO]);
>  
>   if (tb[IFLA_AF_SPEC])
> - print_af_spec(fp, tb[IFLA_AF_SPEC]);
> + print_af_spec(tb[IFLA_AF_SPEC], ifi->ifi_index);
>  
>   print_string(PRINT_FP, NULL, "%s", "\n");
>   close_json_object();
> diff --git a/bridge/vlan.c b/bridge/vlan.c
> index 19a36b804069..bdce55ae4e14 100644
> --- a/bridge/vlan.c
> +++ b/bridge/vlan.c
> @@ -252,10 +252,18 @@ static int filter_vlan_check(__u16 vid, __u16
> flags)
>   return 1;
>  }
>  
> -static void print_vlan_port(FILE *fp, int ifi_index)
> +static void open_vlan_port(int ifi_index)
>  {
> - print_string(PRINT_ANY, NULL, "%s",
> + open_json_object(NULL);
> + print_string(PRINT_ANY, "ifname", "%s",
>ll_index_to_name(ifi_index));
> + open_json_array(PRINT_JSON, "vlans");
> +}
> +
> +static void close_vlan_port(void)
> +{
> + close_json_array(PRINT_JSON, NULL);
> + close_json_object();
>  }
>  
>  static void print_range(const char *name, __u16 start, __u16 id)
> @@ -278,7 +286,7 @@ static void print_vlan_tunnel_info(FILE *fp,
> struct rtattr *tb, int ifindex)
>   __u32 last_tunid_start = 0;
>  
>   if (!filter_vlan)
> - print_vlan_port(fp, ifindex);
> + open_vlan_port(ifindex);
>  
>   open_json_array(PRINT_JSON, "tunnel");
>   for (i = RTA_DATA(list); RTA_OK(i, rem); i = RTA_NEXT(i, rem))
> {
> @@ -323,18 +331,20 @@ static void print_vlan_tunnel_info(FILE *fp,
> struct rtattr *tb, int ifindex)
>   continue;
>  
>   if (filter_vlan)
> - print_vlan_port(fp, ifindex);
> + open_vlan_port(ifindex);
>  
>   open_json_object(NULL);
>   print_range("vlan", last_vid_start, tunnel_vid);
>   print_range("tunid", last_tunid_start, tunnel_id);
>   close_json_object();
>  
> - if (!is_json_context())
> - fprintf(fp, "\n");
> -
> + print_string(PRINT_FP, NULL, "%s", _SL_);
> + if (filter_vlan)
> + close_vlan_port();
>   }
> - close_json_array(PRINT_JSON, NULL);
> +
> + if (!filter_vlan)
> + close_vlan_port();
>  }
>  
>  static int print_vlan_tunnel(const struct sockaddr_nl *who,
> @@ -421,8 +431,8 @@ static int print_vlan(const struct sockaddr_nl
> *who,
>   return 0;
>   }
>  
> - print_vlan_port(fp, ifm->ifi_index);
> - print_vlan_info(fp, tb[IFLA_AF_SPEC]);
> + print_vlan_info(tb[IFLA_AF_SPEC], ifm->ifi_index);
> + print_string(PRINT_FP, NULL, "%s", _SL_);
>  
>   fflush(fp);
>   return 

Re: [net-next, v2, 1/2] net: stmmac: Rework coalesce timer and fix multi-queue races

2018-09-10 Thread Neil Armstrong
Hi Jose,

On 10/09/2018 11:14, Jose Abreu wrote:
> This follows David Miller advice and tries to fix coalesce timer in
> multi-queue scenarios.
> 
> We are now using per-queue coalesce values and per-queue TX timer.
> 
> Coalesce timer default values was changed to 1ms and the coalesce frames
> to 25.
> 
> Tested in B2B setup between XGMAC2 and GMAC5.
> 
> Signed-off-by: Jose Abreu 
> Cc: Jerome Brunet 
> Cc: Martin Blumenstingl 
> Cc: David S. Miller 
> Cc: Joao Pinto 
> Cc: Giuseppe Cavallaro 
> Cc: Alexandre Torgue 
> ---
> Jerome,

Jerome is in holidays...

> 
> Can you please test if this one is okay ?


I tested this patch on top of 4.18.7 with the previous patch (4ae0169fd1b3) 
reverted.

The TX or RX stopped a few seconds after iperf3 started :
(iperf3 is running in server mode on the Amlogic A113D board)

$ iperf3 -c 10.1.4.95
Connecting to host 10.1.4.95, port 5201
[  4] local 10.1.2.12 port 39906 connected to 10.1.4.95 port 5201
[ ID] Interval   Transfer Bandwidth   Retr  Cwnd
[  4]   0.00-1.00   sec  56.2 MBytes   472 Mbits/sec0660 KBytes
[  4]   1.00-2.00   sec  56.2 MBytes   472 Mbits/sec0660 KBytes
[  4]   2.00-3.00   sec  38.8 MBytes   325 Mbits/sec1   1.41 KBytes
[  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec1   1.41 KBytes
[  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec1   1.41 KBytes
^C[  4]   5.00-5.61   sec  0.00 Bytes  0.00 bits/sec0   1.41 KBytes
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval   Transfer Bandwidth   Retr
[  4]   0.00-5.61   sec   151 MBytes   226 Mbits/sec3 sender
[  4]   0.00-5.61   sec  0.00 Bytes  0.00 bits/sec  receiver
iperf3: interrupt - the client has terminated

$ iperf3 -c 10.1.4.95 -R
Connecting to host 10.1.4.95, port 5201
Reverse mode, remote host 10.1.4.95 is sending
[  4] local 10.1.2.12 port 39894 connected to 10.1.4.95 port 5201
[ ID] Interval   Transfer Bandwidth
[  4]   0.00-1.00   sec  82.2 MBytes   690 Mbits/sec
[  4]   1.00-2.00   sec  82.6 MBytes   693 Mbits/sec
[  4]   2.00-3.00   sec  24.2 MBytes   203 Mbits/sec
[  4]   3.00-4.00   sec  0.00 Bytes  0.00 bits/sec
[  4]   4.00-5.00   sec  0.00 Bytes  0.00 bits/sec
^C[  4]   5.00-5.53   sec  0.00 Bytes  0.00 bits/sec
- - - - - - - - - - - - - - - - - - - - - - - - -
[ ID] Interval   Transfer Bandwidth
[  4]   0.00-5.53   sec  0.00 Bytes  0.00 bits/sec  sender
[  4]   0.00-5.53   sec   189 MBytes   287 Mbits/sec  receiver
iperf3: interrupt - the client has terminated

These works for hours without this patch applied.

Neil

> 
> Thanks and Best Regards,
> Jose Miguel Abreu
> ---
>  drivers/net/ethernet/stmicro/stmmac/common.h  |   4 +-
>  drivers/net/ethernet/stmicro/stmmac/stmmac.h  |   6 +-
>  drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 207 
> ++
>  3 files changed, 135 insertions(+), 82 deletions(-)
> 
> diff --git a/drivers/net/ethernet/stmicro/stmmac/common.h 
> b/drivers/net/ethernet/stmicro/stmmac/common.h
> index 1854f270ad66..b1b305f8f414 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/common.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/common.h
> @@ -258,10 +258,10 @@ struct stmmac_safety_stats {
>  #define MAX_DMA_RIWT 0xff
>  #define MIN_DMA_RIWT 0x20
>  /* Tx coalesce parameters */
> -#define STMMAC_COAL_TX_TIMER 4
> +#define STMMAC_COAL_TX_TIMER 1000
>  #define STMMAC_MAX_COAL_TX_TICK  10
>  #define STMMAC_TX_MAX_FRAMES 256
> -#define STMMAC_TX_FRAMES 64
> +#define STMMAC_TX_FRAMES 25
>  
>  /* Packets types */
>  enum packets_types {
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac.h 
> b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> index c0a855b7ab3b..957030cfb833 100644
> --- a/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> +++ b/drivers/net/ethernet/stmicro/stmmac/stmmac.h
> @@ -48,6 +48,9 @@ struct stmmac_tx_info {
>  
>  /* Frequently used values are kept adjacent for cache effect */
>  struct stmmac_tx_queue {
> + u32 tx_count_frames;
> + int tx_timer_active;
> + struct timer_list txtimer;
>   u32 queue_index;
>   struct stmmac_priv *priv_data;
>   struct dma_extended_desc *dma_etx cacheline_aligned_in_smp;
> @@ -59,6 +62,7 @@ struct stmmac_tx_queue {
>   dma_addr_t dma_tx_phy;
>   u32 tx_tail_addr;
>   u32 mss;
> + struct napi_struct napi cacheline_aligned_in_smp;
>  };
>  
>  struct stmmac_rx_queue {
> @@ -109,14 +113,12 @@ struct stmmac_pps_cfg {
>  
>  struct stmmac_priv {
>   /* Frequently used values are kept adjacent for cache effect */
> - u32 tx_count_frames;
>   u32 tx_coal_frames;
>   u32 tx_coal_timer;
>  
>   int tx_coalesce;
>   int hwts_tx_en;
>   bool tx_path_in_lpi_mode;
> - struct timer_list txtimer;
>   bool tso;
>  
>   unsigned int dma_buf_sz;
> diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c 
> 

Re: [PATCH net 00/13] cls_u32 cleanups and fixes.

2018-09-10 Thread Jamal Hadi Salim

On 2018-09-09 10:15 a.m., Al Viro wrote:

[..]


Umm...  Interesting - TCA_U32_SEL is not the only thing that
gets ignored there; TCA_U32_MARK gets the same treatment.
And then there's a lovely question what to do with n->pf -
it's an array of n->sel.nkeys counters, and apparently we
want (at least in common cases) to avoid resetting those.



*If* we declare that ->nkeys mismatch means failure, it's
all relatively easy to implement.  Alternatively, we could
declare that selector change means resetting the stats.
Preferences?


I am now conflicted. I have sample scripts which showed
that at one point that worked. All of them seem to be
indicating the nkeys and stats never changed. So that
could be a starting point; however, if a policy changes
the match tuples then it is no longer the same rule imo.

Note: you can change the "actions" - of which the most
primitive is "set classid x:y" without changing what is
being matched. i.e changing the classid in that example
would work.

cheers,
jamal




Re: [net-next, PATCH 2/2, v1] net: socionext: add AF_XDP support

2018-09-10 Thread Ilias Apalodimas
On Mon, Sep 10, 2018 at 07:56:49PM +0900, Toshiaki Makita wrote:
> On 2018/09/10 17:24, Ilias Apalodimas wrote:
> > Add basic AF_XDP support without zero-copy
> > 
> > Signed-off-by: Ilias Apalodimas 
> > ---
> ...
> > @@ -707,6 +731,26 @@ static int netsec_process_rx(struct netsec_priv *priv, 
> > int budget)
> > if (unlikely(!buf_addr))
> > break;
> >  
> > +   if (xdp_prog) {
> > +   xdp_result = netsec_run_xdp(desc, priv, xdp_prog,
> > +   pkt_len);
> > +   if (xdp_result != NETSEC_XDP_PASS) {
> > +   xdp_flush |= xdp_result & NETSEC_XDP_REDIR;
> > +
> > +   dma_unmap_single_attrs(priv->dev,
> > +  desc->dma_addr,
> > +  desc->len, DMA_TO_DEVICE,
> > +  DMA_ATTR_SKIP_CPU_SYNC);
> > +
> > +   desc->len = desc_len;
> > +   desc->dma_addr = dma_handle;
> > +   desc->addr = buf_addr;
> > +   netsec_rx_fill(priv, idx, 1);
> > +   nsetsec_adv_desc(>tail);
> > +   }
> > +   continue;
> 
> Continue even on XDP_PASS? Is this really correct?
> 
> Also seems there is no handling of adjust_head/tail for XDP_PASS case.
No continue is wrong there, thanks for the catch. This should return the packet
to the network stack. I'll fix it on v2.

> 
> > +   }
> > +
> > skb = build_skb(desc->addr, desc->len);
> > if (unlikely(!skb)) {
> > dma_unmap_single(priv->dev, dma_handle, desc_len,
> > @@ -740,6 +784,9 @@ static int netsec_process_rx(struct netsec_priv *priv, 
> > int budget)
> > nsetsec_adv_desc(>tail);
> > }
> >  
> > +   if (xdp_flush & NETSEC_XDP_REDIR)
> > +   xdp_do_flush_map();
> > +
> > return done;
> >  }
> ...
> > +static u32 netsec_run_xdp(struct netsec_desc *desc, struct netsec_priv 
> > *priv,
> > + struct bpf_prog *prog, u16 len)
> > +
> > +{
> > +   struct netsec_desc_ring *dring = >desc_ring[NETSEC_RING_RX];
> > +   struct xdp_buff xdp;
> > +   u32 ret = NETSEC_XDP_PASS;
> > +   int err;
> > +   u32 act;
> > +
> > +   xdp.data_hard_start = desc->addr;
> > +   xdp.data = desc->addr;
> 
> There is no headroom. REDIRECT using devmap/cpumap will fail due to
> this. Generally we need XDP_PACKET_HEADROOM.
> 
Ok, i'll modify the patch to include XDP_PACKET_HEADROOM
> > +   xdp_set_data_meta_invalid();
> > +   xdp.data_end = xdp.data + len;
> > +   xdp.rxq = >xdp_rxq;
> > +
> > +   rcu_read_lock();
> > +   act = bpf_prog_run_xdp(prog, );
> > +
> > +   switch (act) {
> > +   case XDP_PASS:
> > +   ret = NETSEC_XDP_PASS;
> > +   break;
> > +   case XDP_TX:
> > +   ret = netsec_xmit_xdp(priv, , desc);
> > +   break;
> > +   case XDP_REDIRECT:
> > +   err = xdp_do_redirect(priv->ndev, , prog);
> > +   if (!err) {
> > +   ret = NETSEC_XDP_REDIR;
> > +   } else {
> > +   ret = NETSEC_XDP_CONSUMED;
> > +   xdp_return_buff();
> > +   }
> > +   break;
> > +   default:
> > +   bpf_warn_invalid_xdp_action(act);
> > +   /* fall through */
> > +   case XDP_ABORTED:
> > +   trace_xdp_exception(priv->ndev, prog, act);
> > +   /* fall through -- handle aborts by dropping packet */
> > +   case XDP_DROP:
> > +   ret = NETSEC_XDP_CONSUMED;
> > +   break;
> > +   }
> > +
> > +   rcu_read_unlock();
> > +
> > +   return ret;
> > +}
> > +
> > +static int netsec_xdp_setup(struct netsec_priv *priv, struct bpf_prog 
> > *prog)
> > +{
> > +   struct net_device *dev = priv->ndev;
> > +   struct bpf_prog *old_prog;
> > +
> > +   /* For now just support only the usual MTU sized frames */
> > +   if (prog && dev->mtu > 1500) {
> > +   netdev_warn(dev, "Jumbo frames not yet supported with XDP\n");
> 
> Why not using extack?
> 
Wasn't aware that this should use the extack for errors. I just saw the same
check on an existing driver and thought that was the way to go
> > +   return -EOPNOTSUPP;
> > +   }
> > +

Thanks for having a look!

/Ilias


Re: [net-next, PATCH 2/2, v1] net: socionext: add AF_XDP support

2018-09-10 Thread Toshiaki Makita
On 2018/09/10 17:24, Ilias Apalodimas wrote:
> Add basic AF_XDP support without zero-copy
> 
> Signed-off-by: Ilias Apalodimas 
> ---
...
> @@ -707,6 +731,26 @@ static int netsec_process_rx(struct netsec_priv *priv, 
> int budget)
>   if (unlikely(!buf_addr))
>   break;
>  
> + if (xdp_prog) {
> + xdp_result = netsec_run_xdp(desc, priv, xdp_prog,
> + pkt_len);
> + if (xdp_result != NETSEC_XDP_PASS) {
> + xdp_flush |= xdp_result & NETSEC_XDP_REDIR;
> +
> + dma_unmap_single_attrs(priv->dev,
> +desc->dma_addr,
> +desc->len, DMA_TO_DEVICE,
> +DMA_ATTR_SKIP_CPU_SYNC);
> +
> + desc->len = desc_len;
> + desc->dma_addr = dma_handle;
> + desc->addr = buf_addr;
> + netsec_rx_fill(priv, idx, 1);
> + nsetsec_adv_desc(>tail);
> + }
> + continue;

Continue even on XDP_PASS? Is this really correct?

Also seems there is no handling of adjust_head/tail for XDP_PASS case.

> + }
> +
>   skb = build_skb(desc->addr, desc->len);
>   if (unlikely(!skb)) {
>   dma_unmap_single(priv->dev, dma_handle, desc_len,
> @@ -740,6 +784,9 @@ static int netsec_process_rx(struct netsec_priv *priv, 
> int budget)
>   nsetsec_adv_desc(>tail);
>   }
>  
> + if (xdp_flush & NETSEC_XDP_REDIR)
> + xdp_do_flush_map();
> +
>   return done;
>  }
...
> +static u32 netsec_run_xdp(struct netsec_desc *desc, struct netsec_priv *priv,
> +   struct bpf_prog *prog, u16 len)
> +
> +{
> + struct netsec_desc_ring *dring = >desc_ring[NETSEC_RING_RX];
> + struct xdp_buff xdp;
> + u32 ret = NETSEC_XDP_PASS;
> + int err;
> + u32 act;
> +
> + xdp.data_hard_start = desc->addr;
> + xdp.data = desc->addr;

There is no headroom. REDIRECT using devmap/cpumap will fail due to
this. Generally we need XDP_PACKET_HEADROOM.

> + xdp_set_data_meta_invalid();
> + xdp.data_end = xdp.data + len;
> + xdp.rxq = >xdp_rxq;
> +
> + rcu_read_lock();
> + act = bpf_prog_run_xdp(prog, );
> +
> + switch (act) {
> + case XDP_PASS:
> + ret = NETSEC_XDP_PASS;
> + break;
> + case XDP_TX:
> + ret = netsec_xmit_xdp(priv, , desc);
> + break;
> + case XDP_REDIRECT:
> + err = xdp_do_redirect(priv->ndev, , prog);
> + if (!err) {
> + ret = NETSEC_XDP_REDIR;
> + } else {
> + ret = NETSEC_XDP_CONSUMED;
> + xdp_return_buff();
> + }
> + break;
> + default:
> + bpf_warn_invalid_xdp_action(act);
> + /* fall through */
> + case XDP_ABORTED:
> + trace_xdp_exception(priv->ndev, prog, act);
> + /* fall through -- handle aborts by dropping packet */
> + case XDP_DROP:
> + ret = NETSEC_XDP_CONSUMED;
> + break;
> + }
> +
> + rcu_read_unlock();
> +
> + return ret;
> +}
> +
> +static int netsec_xdp_setup(struct netsec_priv *priv, struct bpf_prog *prog)
> +{
> + struct net_device *dev = priv->ndev;
> + struct bpf_prog *old_prog;
> +
> + /* For now just support only the usual MTU sized frames */
> + if (prog && dev->mtu > 1500) {
> + netdev_warn(dev, "Jumbo frames not yet supported with XDP\n");

Why not using extack?

> + return -EOPNOTSUPP;
> + }
> +

-- 
Toshiaki Makita



Re: kernels > v4.12 oops/crash with ipsec-traffic: bisected to b838d5e1c5b6e57b10ec8af2268824041e3ea911: ipv4: mark DST_NOGC and remove the operation of dst_free()

2018-09-10 Thread Wolfgang Walter
Am Montag, 10. September 2018, 10:18:47 schrieb Kristian Evensen:
> Hi,
> 
> Thanks everyone for all the effort in debugging this issue.
> 
> On Mon, Sep 10, 2018 at 8:39 AM Steffen Klassert
> 
>  wrote:
> > The easy fix that could be backported to stable would be
> > to check skb->dst for NULL and drop the packet in that case.
> 
> Thought I should just chime in and say that we deployed this
> work-around when we started observing the error back in June. Since
> then we have not seen any crashes. Also, we have instrumented some of
> our kernels to count the number of times the error is hit (overall +
> consecutive). Compared to the overall number of packets, the error
> happens very rarely. With our workloads, we on average see the error
> once every couple of days.
> 

Would you mind send us yout patch (with the accounting) so that we can check 
how often that happens here?

Regards,
-- 
Wolfgang Walter
Studentenwerk München
Anstalt des öffentlichen Rechts


Re: [net-next, PATCH 0/2, v1] net: socionext: add AF_XDP support

2018-09-10 Thread Björn Töpel
Den mån 10 sep. 2018 kl 10:26 skrev Ilias Apalodimas
:
>
> This patch series adds AF_XDP support socionext netsec driver
>
> - patch [1/2]: Use a different allocation scheme for Rx DMA buffers to prepare
> the driver for AF_XDP support
> - patch [2/2]: Add AF_XDP support without zero-copy
>
> Ilias Apalodimas (2):
>   net: socionext: different approach on DMA
>   net: socionext: add AF_XDP support
>

You should probably rephrase patch #2. You are adding XDP support, and
AF_XDP just follows from that. Nice to see more XDP support!


Björn

>  drivers/net/ethernet/socionext/netsec.c | 444 
> +++-
>  1 file changed, 329 insertions(+), 115 deletions(-)
>
> --
> 2.7.4
>


[afaerber:lora-next 14445/14606] net//smc/smc_ib.c:152:2: warning: 'ib_query_gid' is deprecated (declared at include/rdma/ib_cache.h:139)

2018-09-10 Thread kbuild test robot
tree:   git://github.com/afaerber/linux lora-next
head:   51325f8cd6dd80cba7ac6e881fe523ad0475c927
commit: 6a991b35a2fa0b75d99f03c564b40b6b9a5d2aed [14445/14606] Merge 
remote-tracking branch 'net-next/master'
config: i386-randconfig-j0-09101243 (attached as .config)
compiler: gcc-4.9 (Debian 4.9.4-2) 4.9.4
reproduce:
git checkout 6a991b35a2fa0b75d99f03c564b40b6b9a5d2aed
# save the attached .config to linux build tree
make ARCH=i386 

All warnings (new ones prefixed by >>):

   net//smc/smc_ib.c: In function 'smc_ib_fill_mac':
>> net//smc/smc_ib.c:152:2: warning: 'ib_query_gid' is deprecated (declared at 
>> include/rdma/ib_cache.h:139) [-Wdeprecated-declarations]
 rc = ib_query_gid(smcibdev->ibdev, ibport, 0, , );
 ^
   net//smc/smc_ib.c: In function 'smc_ib_determine_gid':
   net//smc/smc_ib.c:190:3: warning: 'ib_query_gid' is deprecated (declared at 
include/rdma/ib_cache.h:139) [-Wdeprecated-declarations]
  if (ib_query_gid(smcibdev->ibdev, ibport, i, &_gid, ))
  ^
   net//smc/smc_ib.c:190:3: warning: 'ib_query_gid' is deprecated (declared at 
include/rdma/ib_cache.h:139) [-Wdeprecated-declarations]
   In file included from include/linux/kernel.h:10:0,
from include/linux/list.h:9,
from include/linux/random.h:10,
from net//smc/smc_ib.c:15:
   include/linux/compiler.h:61:17: warning: 'ib_query_gid' is deprecated 
(declared at include/rdma/ib_cache.h:139) [-Wdeprecated-declarations]
  static struct ftrace_branch_data   \
^
   include/linux/compiler.h:56:23: note: in expansion of macro '__trace_if'
#define if(cond, ...) __trace_if( (cond , ## __VA_ARGS__) )
  ^
   net//smc/smc_ib.c:190:3: note: in expansion of macro 'if'
  if (ib_query_gid(smcibdev->ibdev, ibport, i, &_gid, ))
  ^

vim +/ib_query_gid +152 net//smc/smc_ib.c

bd4ad577 Ursula Braun 2017-01-09  145  
7005ada6 Ursula Braun 2018-07-25  146  static int smc_ib_fill_mac(struct 
smc_ib_device *smcibdev, u8 ibport)
be6a3f38 Ursula Braun 2018-06-28  147  {
be6a3f38 Ursula Braun 2018-06-28  148   struct ib_gid_attr gattr;
7005ada6 Ursula Braun 2018-07-25  149   union ib_gid gid;
be6a3f38 Ursula Braun 2018-06-28  150   int rc;
be6a3f38 Ursula Braun 2018-06-28  151  
7005ada6 Ursula Braun 2018-07-25 @152   rc = ib_query_gid(smcibdev->ibdev, 
ibport, 0, , );
be6a3f38 Ursula Braun 2018-06-28  153   if (rc || !gattr.ndev)
be6a3f38 Ursula Braun 2018-06-28  154   return -ENODEV;
be6a3f38 Ursula Braun 2018-06-28  155  
be6a3f38 Ursula Braun 2018-06-28  156   memcpy(smcibdev->mac[ibport - 1], 
gattr.ndev->dev_addr, ETH_ALEN);
be6a3f38 Ursula Braun 2018-06-28  157   dev_put(gattr.ndev);
be6a3f38 Ursula Braun 2018-06-28  158   return 0;
be6a3f38 Ursula Braun 2018-06-28  159  }
be6a3f38 Ursula Braun 2018-06-28  160  

:: The code at line 152 was first introduced by commit
:: 7005ada68d1774d7c1109deaba0c2cd8e46f5091 net/smc: use correct vlan gid 
of RoCE device

:: TO: Ursula Braun 
:: CC: David S. Miller 

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


[PATCH net-next v3 0/7] net: aquantia: implement WOL and EEE support

2018-09-10 Thread Igor Russkikh
This is v3 of WOL/EEE functionality patch for atlantic driver.

In this patchset Yana Esina and Nikita Danilov implemented:

- Upload function to interact with FW memory
- Definitions and structures necessary for the correct operation of Wake ON Lan
- The functionality Wake On Lan via ethtool (Magic packet is supported)
- The functionality for Energy-Efficient Ethernet configuration via ethtool

Version 3:
- use ETH_ALEN instead of raw number

Version 2 has the following fixes:
- patchset reorganized to extract renaming and whitespace fixes into separate
  patches
- some of magic numbers replaced with defines
- reverse christmas tree applied

Discussion outcome regarding driver version bumps was not finished
(here https://patchwork.ozlabs.org/patch/954905/)
David, could you suggest the best way to proceed on this?

Igor Russkikh (1):
  net: aquantia: bump driver version

Nikita Danilov (2):
  net: aquantia: whitespace changes
  net: aquantia: renaming for better visibility

Yana Esina (4):
  net: aquantia: fix hw_atl_utils_fw_upload_dwords
  net: aquantia: definitions for WOL
  net: aquantia: implement WOL support
  net: aquantia: implement EEE support

 drivers/net/ethernet/aquantia/atlantic/aq_common.h |   5 +
 .../net/ethernet/aquantia/atlantic/aq_ethtool.c| 113 +-
 drivers/net/ethernet/aquantia/atlantic/aq_hw.h |  13 +-
 drivers/net/ethernet/aquantia/atlantic/aq_nic.c|  24 ++-
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h|   4 +
 drivers/net/ethernet/aquantia/atlantic/aq_ring.c   |   4 +-
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_a0.c  |  41 ++--
 .../aquantia/atlantic/hw_atl/hw_atl_a0_internal.h  |   6 -
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_b0.c  |  35 ++--
 .../aquantia/atlantic/hw_atl/hw_atl_b0_internal.h  |   6 -
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.c |   8 +
 .../ethernet/aquantia/atlantic/hw_atl/hw_atl_llh.h |   3 +
 .../aquantia/atlantic/hw_atl/hw_atl_llh_internal.h |  13 ++
 .../aquantia/atlantic/hw_atl/hw_atl_utils.c| 163 ++
 .../aquantia/atlantic/hw_atl/hw_atl_utils.h| 130 +++-
 .../aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c   | 233 -
 drivers/net/ethernet/aquantia/atlantic/ver.h   |   2 +-
 17 files changed, 675 insertions(+), 128 deletions(-)

-- 
2.7.4



[PATCH net-next v3 2/7] net: aquantia: definitions for WOL

2018-09-10 Thread Igor Russkikh
From: Yana Esina 

Added definitions and structures needed to support WOL.

Signed-off-by: Yana Esina 
Signed-off-by: Nikita Danilov 
Tested-by: Nikita Danilov 
Signed-off-by: Igor Russkikh 
---
 drivers/net/ethernet/aquantia/atlantic/aq_nic.h|  3 +
 .../aquantia/atlantic/hw_atl/hw_atl_utils.h| 94 --
 .../aquantia/atlantic/hw_atl/hw_atl_utils_fw2x.c   | 32 
 3 files changed, 124 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h 
b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
index fecfc401f95d..2069cbb6e1a1 100644
--- a/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
+++ b/drivers/net/ethernet/aquantia/atlantic/aq_nic.h
@@ -36,6 +36,7 @@ struct aq_nic_cfg_s {
u32 flow_control;
u32 link_speed_msk;
u32 vlan_id;
+   u32 wol;
u16 is_mc_list_enabled;
u16 mc_list_count;
bool is_autoneg;
@@ -54,6 +55,8 @@ struct aq_nic_cfg_s {
 #define AQ_NIC_FLAG_ERR_UNPLUG  0x4000U
 #define AQ_NIC_FLAG_ERR_HW  0x8000U
 
+#define AQ_NIC_WOL_ENABLED BIT(0)
+
 #define AQ_NIC_TCVEC2RING(_NIC_, _TC_, _VEC_) \
((_TC_) * AQ_CFG_TCS_MAX + (_VEC_))
 
diff --git a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h 
b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
index 505c8a2abd9c..beec0775f1c1 100644
--- a/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
+++ b/drivers/net/ethernet/aquantia/atlantic/hw_atl/hw_atl_utils.h
@@ -101,8 +101,6 @@ struct __packed hw_aq_atl_utils_fw_rpc {
struct {
u32 priority;
u32 wol_packet_type;
-   u16 friendly_name_len;
-   u16 friendly_name[65];
u32 pattern_id;
u32 next_wol_pattern_offset;
 
@@ -134,13 +132,36 @@ struct __packed hw_aq_atl_utils_fw_rpc {
u32 pattern_offset;
u32 pattern_size;
} wol_bit_map_pattern;
+
+   struct {
+   u8 mac_addr[ETH_ALEN];
+   } wol_magic_packet_patter;
} wol_pattern;
} msg_wol;
 
struct {
-   u32 is_wake_on_link_down;
-   u32 is_wake_on_link_up;
-   } msg_wolink;
+   union {
+   u32 pattern_mask;
+
+   struct {
+   u32 reason_arp_v4_pkt : 1;
+   u32 reason_ipv4_ping_pkt : 1;
+   u32 reason_ipv6_ns_pkt : 1;
+   u32 reason_ipv6_ping_pkt : 1;
+   u32 reason_link_up : 1;
+   u32 reason_link_down : 1;
+   u32 reason_maximum : 1;
+   };
+   };
+
+   union {
+   u32 offload_mask;
+   };
+   } msg_enable_wakeup;
+
+   struct {
+   u32 id;
+   } msg_del_id;
};
 };
 
@@ -155,6 +176,57 @@ struct __packed hw_aq_atl_utils_mbox {
struct hw_atl_stats_s stats;
 };
 
+/* fw2x */
+typedef u32fw_offset_t;
+
+struct __packed offload_ip_info {
+   u8 v4_local_addr_count;
+   u8 v4_addr_count;
+   u8 v6_local_addr_count;
+   u8 v6_addr_count;
+   fw_offset_t v4_addr;
+   fw_offset_t v4_prefix;
+   fw_offset_t v6_addr;
+   fw_offset_t v6_prefix;
+};
+
+struct __packed offload_port_info {
+   u16 udp_port_count;
+   u16 tcp_port_count;
+   fw_offset_t udp_port;
+   fw_offset_t tcp_port;
+};
+
+struct __packed offload_ka_info {
+   u16 v4_ka_count;
+   u16 v6_ka_count;
+   u32 retry_count;
+   u32 retry_interval;
+   fw_offset_t v4_ka;
+   fw_offset_t v6_ka;
+};
+
+struct __packed offload_rr_info {
+   u32 rr_count;
+   u32 rr_buf_len;
+   fw_offset_t rr_id_x;
+   fw_offset_t rr_buf;
+};
+
+struct __packed offload_info {
+   u32 version;
+   u32 len;
+   u8 mac_addr[ETH_ALEN];
+
+   u8 reserved[2];
+
+   struct offload_ip_info ips;
+   struct offload_port_info ports;
+   struct offload_ka_info kas;
+   struct offload_rr_info rrs;
+   u8 buf[0];
+};
+
 #define HAL_ATLANTIC_UTILS_CHIP_MIPS 0x0001U
 #define HAL_ATLANTIC_UTILS_CHIP_TPO2 0x0002U
 #define HAL_ATLANTIC_UTILS_CHIP_RPF2 0x0004U
@@ -181,6 +253,18 @@ enum hal_atl_utils_fw_state_e {
 #define HAL_ATLANTIC_RATE_100M   BIT(5)
 #define HAL_ATLANTIC_RATE_INVALIDBIT(6)
 
+#define HAL_ATLANTIC_UTILS_FW_MSG_PING  0x1U

  1   2   >