Re: [PATCH V2 3/3] vhost_net: basic polling support

2016-01-20 Thread Yang Zhang

On 2016/1/20 22:35, Michael S. Tsirkin wrote:

On Tue, Dec 01, 2015 at 02:39:45PM +0800, Jason Wang wrote:

This patch tries to poll for new added tx buffer or socket receive
queue for a while at the end of tx/rx processing. The maximum time
spent on polling is specified through a new kind of vring ioctl.

Signed-off-by: Jason Wang 
---
  drivers/vhost/net.c| 72 ++
  drivers/vhost/vhost.c  | 15 ++
  drivers/vhost/vhost.h  |  1 +
  include/uapi/linux/vhost.h | 11 +++
  4 files changed, 94 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9eda69e..ce6da77 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -287,6 +287,41 @@ static void vhost_zerocopy_callback(struct ubuf_info 
*ubuf, bool success)
rcu_read_unlock_bh();
  }

+static inline unsigned long busy_clock(void)
+{
+   return local_clock() >> 10;
+}
+
+static bool vhost_can_busy_poll(struct vhost_dev *dev,
+   unsigned long endtime)
+{
+   return likely(!need_resched()) &&
+  likely(!time_after(busy_clock(), endtime)) &&
+  likely(!signal_pending(current)) &&
+  !vhost_has_work(dev) &&
+  single_task_running();
+}
+
+static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
+   struct vhost_virtqueue *vq,
+   struct iovec iov[], unsigned int iov_size,
+   unsigned int *out_num, unsigned int *in_num)
+{
+   unsigned long uninitialized_var(endtime);
+
+   if (vq->busyloop_timeout) {
+   preempt_disable();
+   endtime = busy_clock() + vq->busyloop_timeout;
+   while (vhost_can_busy_poll(vq->dev, endtime) &&
+  !vhost_vq_more_avail(vq->dev, vq))
+   cpu_relax();
+   preempt_enable();
+   }


Isn't there a way to call all this after vhost_get_vq_desc?
First, this will reduce the good path overhead as you
won't have to play with timers and preemption.

Second, this will reduce the chance of a pagefault on avail ring read.


+
+   return vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+out_num, in_num, NULL, NULL);
+}
+
  /* Expects to be always run from workqueue - which acts as
   * read-size critical section for our kind of RCU. */
  static void handle_tx(struct vhost_net *net)
@@ -331,10 +366,9 @@ static void handle_tx(struct vhost_net *net)
  % UIO_MAXIOV == nvq->done_idx))
break;

-   head = vhost_get_vq_desc(vq, vq->iov,
-ARRAY_SIZE(vq->iov),
-&out, &in,
-NULL, NULL);
+   head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
@@ -435,6 +469,34 @@ static int peek_head_len(struct sock *sk)
return len;
  }

+static int vhost_net_peek_head_len(struct vhost_net *net, struct sock *sk)


Need a hint that it's rx related in the name.


+{
+   struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+   struct vhost_virtqueue *vq = &nvq->vq;
+   unsigned long uninitialized_var(endtime);
+
+   if (vq->busyloop_timeout) {
+   mutex_lock(&vq->mutex);


This appears to be called under vq mutex in handle_rx.
So how does this work then?



+   vhost_disable_notify(&net->dev, vq);


This appears to be called after disable notify
in handle_rx - so why disable here again?


+
+   preempt_disable();
+   endtime = busy_clock() + vq->busyloop_timeout;
+
+   while (vhost_can_busy_poll(&net->dev, endtime) &&
+  skb_queue_empty(&sk->sk_receive_queue) &&
+  !vhost_vq_more_avail(&net->dev, vq))
+   cpu_relax();


This seems to mix in several items.
RX queue is normally not empty. I don't think
we need to poll for that.


I have seen the RX queue is easy to be empty under some extreme 
conditions like lots of small packet. So maybe the check is useful here.


--
best regards
yang


Re: [PATCH net v2] gro: Make GRO aware of lightweight tunnels.

2016-01-20 Thread David Miller
From: Jesse Gross 
Date: Wed, 20 Jan 2016 17:59:49 -0800

> GRO is currently not aware of tunnel metadata generated by lightweight
> tunnels and stored in the dst. This leads to two possible problems:
>  * Incorrectly merging two frames that have different metadata.
>  * Leaking of allocated metadata from merged frames.
> 
> This avoids those problems by comparing the tunnel information before
> merging, similar to how we handle other metadata (such as vlan tags),
> and releasing any state when we are done.
> 
> Reported-by: John 
> Fixes: 2e15ea39 ("ip_gre: Add support to collect tunnel metadata.")
> Signed-off-by: Jesse Gross 
> ---
> v2: Remove branches to optimize for common case where there is no dst.

Applied and queued up for -stable, thanks.


Re: [PATCH net] gro: Make GRO aware of lightweight tunnels.

2016-01-20 Thread Jesse Gross
On Wed, Jan 20, 2016 at 4:48 PM, Eric Dumazet  wrote:
> On Wed, 2016-01-20 at 16:27 -0800, Jesse Gross wrote:
>> GRO is currently not aware of tunnel metadata generated by lightweight
>> tunnels and stored in the dst. This leads to two possible problems:
>>  * Incorrectly merging two frames that have different metadata.
>>  * Leaking of allocated metadata from merged frames.
>>
>> This avoids those problems by comparing the tunnel information before
>> merging, similar to how we handle other metadata (such as vlan tags),
>> and releasing any state when we are done.
>>
>> Reported-by: John 
>> Fixes: 2e15ea39 ("ip_gre: Add support to collect tunnel metadata.")
>> Signed-off-by: Jesse Gross 
>> ---
>>  include/net/dst_metadata.h | 23 +++
>>  net/core/dev.c |  9 +++--
>>  2 files changed, 30 insertions(+), 2 deletions(-)
>>
>> diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
>> index 6816f0f..c3de935 100644
>> --- a/include/net/dst_metadata.h
>> +++ b/include/net/dst_metadata.h
>> @@ -44,6 +44,29 @@ static inline bool skb_valid_dst(const struct sk_buff 
>> *skb)
>>   return dst && !(dst->flags & DST_METADATA);
>>  }
>>
>> +static inline int skb_metadata_dst_cmp(struct sk_buff *skb_a,
>> +struct sk_buff *skb_b)
>> +{
>> + const struct metadata_dst *a = skb_metadata_dst(skb_a);
>> + const struct metadata_dst *b = skb_metadata_dst(skb_b);
>> +
>> + if (!a != !b)
>> + return 1;
>> +
>> + if (!a)
>> + return 0;
>> +
>
> It is adding 4 conditional test per flow in GRO engine for the fast
> path...
>
> With up to 8 flows in GRO (per RX queue), it is a total of 32 added
> conditional tests.
>
> You should shortcut to one test :
>
> if (!(skb_a->_skb_refdst | skb_b->_skb_refdst))
> return 0;

Thanks, that's a good idea. I'll send out a v2 soon.

Just to merge the two threads together, all of protocols that would be
affected by this also have "normal" GRO handlers that will run when
the packet is first received. There's no argument that that is
preferable if it is available. However, GRO cells do provide a
performance benefit in other situations so it would be nice to avoid
disabling it if possible.


Re: [PATCH net-next 2/2] tg3: Fix for tg3 transmit queue 0 timed out when too many gso_segs.

2016-01-20 Thread Michael Chan
On Wed, 2016-01-20 at 20:36 -0800, Eric Dumazet wrote: 
> On Wed, 2016-01-20 at 17:16 +0530, skallam wrote:
> > From: Siva Reddy Kallam 
> > 
> > There is an issue on the GSO path inside tg3_tso_bug() when we don't
> > have sufficient descriptors available, we stop the queue. This queue
> > may never get started again as there are no Tx completions pending.
> > 
> > For example if the tcp segment size is as low as 88 bytes and TSO packet
> > from the stack is quite big(<64 K), gso_segs exceeds the limit of
> > descriptors available.
> > 
> > tg3_tso_bug_gso_check() is implemented to verify if the total no. of
> > descriptors available for gso are sufficient or not. If not sufficient
> > we simply linearize the skb and transmit it once again or drop the
> > skb.
> 
> I find this changelog misleading.
> 
> linearize won't change gso_segs
> 
> You are in fact segmenting the GSO packet, which is very different than
> linearizing it.

He is referring to linearizing it for the hardware to perform TSO.

There are 2 cases that this code is trying to handle:

1. The hardware has TSO bugs.  The code detects the conditions and then
falls back to GSO.

2. The hardware has DMA bugs (such as short DMA length, 4G DMA
boundaries, etc).  In this case, the hardware can still do TSO but
requires the SKB to be linearized.  Linearizing is expensive, so we also
try to do GSO if possible.  This patch will check whether it is possible
to do GSO or not.  If not, it will linearize the SKB and have the
hardware do TSO.

I will ask Siva to try to clarify the descriptions.




Re: [PATCH net-next 2/2] tg3: Fix for tg3 transmit queue 0 timed out when too many gso_segs.

2016-01-20 Thread Eric Dumazet
On Wed, 2016-01-20 at 17:16 +0530, skallam wrote:
> From: Siva Reddy Kallam 
> 
> There is an issue on the GSO path inside tg3_tso_bug() when we don't
> have sufficient descriptors available, we stop the queue. This queue
> may never get started again as there are no Tx completions pending.
> 
> For example if the tcp segment size is as low as 88 bytes and TSO packet
> from the stack is quite big(<64 K), gso_segs exceeds the limit of
> descriptors available.
> 
> tg3_tso_bug_gso_check() is implemented to verify if the total no. of
> descriptors available for gso are sufficient or not. If not sufficient
> we simply linearize the skb and transmit it once again or drop the
> skb.

I find this changelog misleading.

linearize won't change gso_segs

You are in fact segmenting the GSO packet, which is very different than
linearizing it.





Re: [PATCH] net: take care of bonding in build_skb_flow_key (v3)

2016-01-20 Thread Wengang Wang



在 2016年01月21日 12:05, Jay Vosburgh 写道:

Wengang Wang  wrote:
[...]

For ipip,  yes seems update_pmtu is called in line for each call of
queue_xmit.  Do you know if it's a good configuration for ipip + bonding?

Yes, it is.


Other's comment and suggestion?

I agree with Sabrina Dubroca 's suggestions
from yesterday.


Thank you! I will follow.

thanks,
wengang

-J

---
-Jay Vosburgh, jay.vosbu...@canonical.com




Re: [PATCH V2 3/3] vhost_net: basic polling support

2016-01-20 Thread Michael S. Tsirkin
On Thu, Jan 21, 2016 at 10:11:35AM +0800, Yang Zhang wrote:
> On 2016/1/20 22:35, Michael S. Tsirkin wrote:
> >On Tue, Dec 01, 2015 at 02:39:45PM +0800, Jason Wang wrote:
> >>This patch tries to poll for new added tx buffer or socket receive
> >>queue for a while at the end of tx/rx processing. The maximum time
> >>spent on polling is specified through a new kind of vring ioctl.
> >>
> >>Signed-off-by: Jason Wang 
> >>---
> >>  drivers/vhost/net.c| 72 
> >> ++
> >>  drivers/vhost/vhost.c  | 15 ++
> >>  drivers/vhost/vhost.h  |  1 +
> >>  include/uapi/linux/vhost.h | 11 +++
> >>  4 files changed, 94 insertions(+), 5 deletions(-)
> >>
> >>diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
> >>index 9eda69e..ce6da77 100644
> >>--- a/drivers/vhost/net.c
> >>+++ b/drivers/vhost/net.c
> >>@@ -287,6 +287,41 @@ static void vhost_zerocopy_callback(struct ubuf_info 
> >>*ubuf, bool success)
> >>rcu_read_unlock_bh();
> >>  }
> >>
> >>+static inline unsigned long busy_clock(void)
> >>+{
> >>+   return local_clock() >> 10;
> >>+}
> >>+
> >>+static bool vhost_can_busy_poll(struct vhost_dev *dev,
> >>+   unsigned long endtime)
> >>+{
> >>+   return likely(!need_resched()) &&
> >>+  likely(!time_after(busy_clock(), endtime)) &&
> >>+  likely(!signal_pending(current)) &&
> >>+  !vhost_has_work(dev) &&
> >>+  single_task_running();
> >>+}
> >>+
> >>+static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
> >>+   struct vhost_virtqueue *vq,
> >>+   struct iovec iov[], unsigned int iov_size,
> >>+   unsigned int *out_num, unsigned int *in_num)
> >>+{
> >>+   unsigned long uninitialized_var(endtime);
> >>+
> >>+   if (vq->busyloop_timeout) {
> >>+   preempt_disable();
> >>+   endtime = busy_clock() + vq->busyloop_timeout;
> >>+   while (vhost_can_busy_poll(vq->dev, endtime) &&
> >>+  !vhost_vq_more_avail(vq->dev, vq))
> >>+   cpu_relax();
> >>+   preempt_enable();
> >>+   }
> >
> >Isn't there a way to call all this after vhost_get_vq_desc?
> >First, this will reduce the good path overhead as you
> >won't have to play with timers and preemption.
> >
> >Second, this will reduce the chance of a pagefault on avail ring read.
> >
> >>+
> >>+   return vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
> >>+out_num, in_num, NULL, NULL);
> >>+}
> >>+
> >>  /* Expects to be always run from workqueue - which acts as
> >>   * read-size critical section for our kind of RCU. */
> >>  static void handle_tx(struct vhost_net *net)
> >>@@ -331,10 +366,9 @@ static void handle_tx(struct vhost_net *net)
> >>  % UIO_MAXIOV == nvq->done_idx))
> >>break;
> >>
> >>-   head = vhost_get_vq_desc(vq, vq->iov,
> >>-ARRAY_SIZE(vq->iov),
> >>-&out, &in,
> >>-NULL, NULL);
> >>+   head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
> >>+   ARRAY_SIZE(vq->iov),
> >>+   &out, &in);
> >>/* On error, stop handling until the next kick. */
> >>if (unlikely(head < 0))
> >>break;
> >>@@ -435,6 +469,34 @@ static int peek_head_len(struct sock *sk)
> >>return len;
> >>  }
> >>
> >>+static int vhost_net_peek_head_len(struct vhost_net *net, struct sock *sk)
> >
> >Need a hint that it's rx related in the name.
> >
> >>+{
> >>+   struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
> >>+   struct vhost_virtqueue *vq = &nvq->vq;
> >>+   unsigned long uninitialized_var(endtime);
> >>+
> >>+   if (vq->busyloop_timeout) {
> >>+   mutex_lock(&vq->mutex);
> >
> >This appears to be called under vq mutex in handle_rx.
> >So how does this work then?
> >
> >
> >>+   vhost_disable_notify(&net->dev, vq);
> >
> >This appears to be called after disable notify
> >in handle_rx - so why disable here again?
> >
> >>+
> >>+   preempt_disable();
> >>+   endtime = busy_clock() + vq->busyloop_timeout;
> >>+
> >>+   while (vhost_can_busy_poll(&net->dev, endtime) &&
> >>+  skb_queue_empty(&sk->sk_receive_queue) &&
> >>+  !vhost_vq_more_avail(&net->dev, vq))
> >>+   cpu_relax();
> >
> >This seems to mix in several items.
> >RX queue is normally not empty. I don't think
> >we need to poll for that.
> 
> I have seen the RX queue is easy to be empty under some extreme conditions
> like lots of small packet. So maybe the check is useful here.

It's not useful *here*.
If you have an rx packet but no space in the ring,
this will exit immediately.

It might be useful elsewhere but I doubt it -
if rx ring is out of buffers, you are better off
backing out and giving guest some breathing space.

Re: question about samsung/sxgbe/sxgbe_xpcs.c

2016-01-20 Thread Jεan Sacren
From: David Miller 
Date: Tue, 19 Jan 2016 14:36:28 -0500
>
> From: Julia Lawall 
> Date: Tue, 19 Jan 2016 19:54:20 +0100 (CET)

[...]

> > I just wondered.  I was looking at dependencies between networking files.
> > This one stands out because nothing is dependent on it, even the files it
> > is compiled with, and it doesn't contain the usual functions,
> > register_netdev, etc.
> 
> Even with that explanation, this is a bogus situation.
> 
> There are no in-tree callers of this code.  It should be removed until there
> are in-tree users.
> 
> Nobody can figure out if the interface for this is done properly without 
> seeing
> the call sites and how they work.  It is therefore impossible to review this
> code and judge its design.
> 
> If someone doesn't send me a removal patch, I will remove this code myself.

I have the patch ready.

Do you want me to submit it now during the merge window or wait till
net-next opens up again?


Re: [PATCH] net: take care of bonding in build_skb_flow_key (v3)

2016-01-20 Thread Wengang Wang



在 2016年01月20日 23:18, Sabrina Dubroca 写道:

2016-01-20, 13:32:13 +0800, Wengang Wang wrote:

In a bonding setting, we determine fragment size according to MTU and
PMTU associated to the bonding master. If the slave finds the fragment
size is too big, it drops the fragment and calls ip_rt_update_pmtu(),
passing _skb_ and _pmtu_, trying to update the path MTU.
Problem is that the target device that function ip_rt_update_pmtu actually
tries to update is the slave (skb->dev), not the master. Thus since no
PMTU change happens on master, the fragment size for later packets doesn't
change so all later fragments/packets are dropped too.

The fix is letting build_skb_flow_key() take care of the transition of
device index from bonding slave to the master. That makes the master become
the target device that ip_rt_update_pmtu tries to update PMTU to.

Signed-off-by: Wengang Wang 
---
  net/ipv4/route.c | 13 -
  1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 85f184e..c59fb0d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -523,10 +523,21 @@ static void build_skb_flow_key(struct flowi4 *fl4, const 
struct sk_buff *skb,
   const struct sock *sk)
  {
const struct iphdr *iph = ip_hdr(skb);
-   int oif = skb->dev->ifindex;
+   struct net_device *master = NULL;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
+   int oif;
+
+   if (skb->dev->flags & IFF_SLAVE) {

Maybe use netif_is_bond_slave here instead, since you have this
problem with bonding slaves?



+   rtnl_lock();
+   master = netdev_master_upper_dev_get(skb->dev);
+   rtnl_unlock();
+   }

As zhuyj said, this is called from dev_queue_xmit, so you cannot take
rtnl_lock here.


+   if (master)
+   oif = master->ifindex;

You cannot dereference master after you release the rtnl lock.

So it would probably be best to use netdev_master_upper_dev_get_rcu,
as zhuyj suggested earlier, and make sure that you only use the result
between rcu_read_lock()/rcu_read_unlock():

 rcu_read_lock();
 master = netdev_master_upper_dev_get_rcu(skb->dev);
 if (master)
 oif = master->ifindex;
 rcu_read_unlock();


OK, thanks for advising.

thanks,
wengang


Thanks,





Re: [PATCH V2 3/3] vhost_net: basic polling support

2016-01-20 Thread Yang Zhang

On 2016/1/21 13:13, Michael S. Tsirkin wrote:

On Thu, Jan 21, 2016 at 10:11:35AM +0800, Yang Zhang wrote:

On 2016/1/20 22:35, Michael S. Tsirkin wrote:

On Tue, Dec 01, 2015 at 02:39:45PM +0800, Jason Wang wrote:

This patch tries to poll for new added tx buffer or socket receive
queue for a while at the end of tx/rx processing. The maximum time
spent on polling is specified through a new kind of vring ioctl.

Signed-off-by: Jason Wang 
---
  drivers/vhost/net.c| 72 ++
  drivers/vhost/vhost.c  | 15 ++
  drivers/vhost/vhost.h  |  1 +
  include/uapi/linux/vhost.h | 11 +++
  4 files changed, 94 insertions(+), 5 deletions(-)

diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c
index 9eda69e..ce6da77 100644
--- a/drivers/vhost/net.c
+++ b/drivers/vhost/net.c
@@ -287,6 +287,41 @@ static void vhost_zerocopy_callback(struct ubuf_info 
*ubuf, bool success)
rcu_read_unlock_bh();
  }

+static inline unsigned long busy_clock(void)
+{
+   return local_clock() >> 10;
+}
+
+static bool vhost_can_busy_poll(struct vhost_dev *dev,
+   unsigned long endtime)
+{
+   return likely(!need_resched()) &&
+  likely(!time_after(busy_clock(), endtime)) &&
+  likely(!signal_pending(current)) &&
+  !vhost_has_work(dev) &&
+  single_task_running();
+}
+
+static int vhost_net_tx_get_vq_desc(struct vhost_net *net,
+   struct vhost_virtqueue *vq,
+   struct iovec iov[], unsigned int iov_size,
+   unsigned int *out_num, unsigned int *in_num)
+{
+   unsigned long uninitialized_var(endtime);
+
+   if (vq->busyloop_timeout) {
+   preempt_disable();
+   endtime = busy_clock() + vq->busyloop_timeout;
+   while (vhost_can_busy_poll(vq->dev, endtime) &&
+  !vhost_vq_more_avail(vq->dev, vq))
+   cpu_relax();
+   preempt_enable();
+   }


Isn't there a way to call all this after vhost_get_vq_desc?
First, this will reduce the good path overhead as you
won't have to play with timers and preemption.

Second, this will reduce the chance of a pagefault on avail ring read.


+
+   return vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+out_num, in_num, NULL, NULL);
+}
+
  /* Expects to be always run from workqueue - which acts as
   * read-size critical section for our kind of RCU. */
  static void handle_tx(struct vhost_net *net)
@@ -331,10 +366,9 @@ static void handle_tx(struct vhost_net *net)
  % UIO_MAXIOV == nvq->done_idx))
break;

-   head = vhost_get_vq_desc(vq, vq->iov,
-ARRAY_SIZE(vq->iov),
-&out, &in,
-NULL, NULL);
+   head = vhost_net_tx_get_vq_desc(net, vq, vq->iov,
+   ARRAY_SIZE(vq->iov),
+   &out, &in);
/* On error, stop handling until the next kick. */
if (unlikely(head < 0))
break;
@@ -435,6 +469,34 @@ static int peek_head_len(struct sock *sk)
return len;
  }

+static int vhost_net_peek_head_len(struct vhost_net *net, struct sock *sk)


Need a hint that it's rx related in the name.


+{
+   struct vhost_net_virtqueue *nvq = &net->vqs[VHOST_NET_VQ_TX];
+   struct vhost_virtqueue *vq = &nvq->vq;
+   unsigned long uninitialized_var(endtime);
+
+   if (vq->busyloop_timeout) {
+   mutex_lock(&vq->mutex);


This appears to be called under vq mutex in handle_rx.
So how does this work then?



+   vhost_disable_notify(&net->dev, vq);


This appears to be called after disable notify
in handle_rx - so why disable here again?


+
+   preempt_disable();
+   endtime = busy_clock() + vq->busyloop_timeout;
+
+   while (vhost_can_busy_poll(&net->dev, endtime) &&
+  skb_queue_empty(&sk->sk_receive_queue) &&
+  !vhost_vq_more_avail(&net->dev, vq))
+   cpu_relax();


This seems to mix in several items.
RX queue is normally not empty. I don't think
we need to poll for that.


I have seen the RX queue is easy to be empty under some extreme conditions
like lots of small packet. So maybe the check is useful here.


It's not useful *here*.
If you have an rx packet but no space in the ring,
this will exit immediately.


Indeed!



It might be useful elsewhere but I doubt it -
if rx ring is out of buffers, you are better off
backing out and giving guest some breathing space.


--
best regards
yang



--
best regards
yang


[PATCH net v2] gro: Make GRO aware of lightweight tunnels.

2016-01-20 Thread Jesse Gross
GRO is currently not aware of tunnel metadata generated by lightweight
tunnels and stored in the dst. This leads to two possible problems:
 * Incorrectly merging two frames that have different metadata.
 * Leaking of allocated metadata from merged frames.

This avoids those problems by comparing the tunnel information before
merging, similar to how we handle other metadata (such as vlan tags),
and releasing any state when we are done.

Reported-by: John 
Fixes: 2e15ea39 ("ip_gre: Add support to collect tunnel metadata.")
Signed-off-by: Jesse Gross 
---
v2: Remove branches to optimize for common case where there is no dst.
---
 include/net/dst_metadata.h | 18 ++
 net/core/dev.c |  7 +--
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/include/net/dst_metadata.h b/include/net/dst_metadata.h
index 6816f0f..30a56ab 100644
--- a/include/net/dst_metadata.h
+++ b/include/net/dst_metadata.h
@@ -44,6 +44,24 @@ static inline bool skb_valid_dst(const struct sk_buff *skb)
return dst && !(dst->flags & DST_METADATA);
 }
 
+static inline int skb_metadata_dst_cmp(const struct sk_buff *skb_a,
+  const struct sk_buff *skb_b)
+{
+   const struct metadata_dst *a, *b;
+
+   if (!(skb_a->_skb_refdst | skb_b->_skb_refdst))
+   return 0;
+
+   a = (const struct metadata_dst *) skb_dst(skb_a);
+   b = (const struct metadata_dst *) skb_dst(skb_b);
+
+   if (!a != !b || a->u.tun_info.options_len != b->u.tun_info.options_len)
+   return 1;
+
+   return memcmp(&a->u.tun_info, &b->u.tun_info,
+ sizeof(a->u.tun_info) + a->u.tun_info.options_len);
+}
+
 struct metadata_dst *metadata_dst_alloc(u8 optslen, gfp_t flags);
 struct metadata_dst __percpu *metadata_dst_alloc_percpu(u8 optslen, gfp_t 
flags);
 
diff --git a/net/core/dev.c b/net/core/dev.c
index cc9e365..8cba3d8 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4351,6 +4351,7 @@ static void gro_list_prepare(struct napi_struct *napi, 
struct sk_buff *skb)
 
diffs = (unsigned long)p->dev ^ (unsigned long)skb->dev;
diffs |= p->vlan_tci ^ skb->vlan_tci;
+   diffs |= skb_metadata_dst_cmp(p, skb);
if (maclen == ETH_HLEN)
diffs |= compare_ether_header(skb_mac_header(p),
  skb_mac_header(skb));
@@ -4548,10 +4549,12 @@ static gro_result_t napi_skb_finish(gro_result_t ret, 
struct sk_buff *skb)
break;
 
case GRO_MERGED_FREE:
-   if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD)
+   if (NAPI_GRO_CB(skb)->free == NAPI_GRO_FREE_STOLEN_HEAD) {
+   skb_dst_drop(skb);
kmem_cache_free(skbuff_head_cache, skb);
-   else
+   } else {
__kfree_skb(skb);
+   }
break;
 
case GRO_HELD:
-- 
2.5.0



Re: [PATCH net] gro: Make GRO aware of lightweight tunnels.

2016-01-20 Thread Eric Dumazet
On Wed, 2016-01-20 at 17:47 -0800, Jesse Gross wrote:

> Just to merge the two threads together, all of protocols that would be
> affected by this also have "normal" GRO handlers that will run when
> the packet is first received. There's no argument that that is
> preferable if it is available. However, GRO cells do provide a
> performance benefit in other situations so it would be nice to avoid
> disabling it if possible.

Note that having a second stage GRO can introduce packet reorders,
because GRO packets given to the second layer simply bypass GRO engine.

Say sender sends P1,P2,P3

Receiver gets P1 alone, put P1 in the GRO cell (2nd layer)

Then we get P2 and P3, aggregated by first layer.

We decap tunnel header, then give P2-P3 to 2nd GRO engine, P2-P3 is
directly given to upper stack. [1]

Then P1 will follow later.

-> packets received out of order. Slow paths on both senders and
receiver, extra sack, ...

[1]

static enum gro_result dev_gro_receive(struct napi_struct *napi, struct
sk_buff *skb)
{
...
if (!(skb->dev->features & NETIF_F_GRO))
goto normal;

if (skb_is_gso(skb) || skb_has_frag_list(skb) || skb->csum_bad)
goto normal;





Re: [PATCH] net: take care of bonding in build_skb_flow_key (v3)

2016-01-20 Thread Wengang Wang



在 2016年01月20日 17:56, zhuyj 写道:

On 01/20/2016 05:47 PM, Wengang Wang wrote:



在 2016年01月20日 15:54, zhuyj 写道:

On 01/20/2016 03:38 PM, Wengang Wang wrote:



在 2016年01月20日 14:24, zhuyj 写道:

On 01/20/2016 01:32 PM, Wengang Wang wrote:
In a bonding setting, we determine fragment size according to
MTU and
PMTU associated to the bonding master. If the slave finds the 
fragment
size is too big, it drops the fragment and calls 
ip_rt_update_pmtu(),

passing _skb_ and _pmtu_, trying to update the path MTU.
Problem is that the target device that function ip_rt_update_pmtu 
actually
tries to update is the slave (skb->dev), not the master. Thus 
since no
PMTU change happens on master, the fragment size for later 
packets doesn't

change so all later fragments/packets are dropped too.

The fix is letting build_skb_flow_key() take care of the 
transition of
device index from bonding slave to the master. That makes the 
master become

the target device that ip_rt_update_pmtu tries to update PMTU to.

Signed-off-by: Wengang Wang 
---
  net/ipv4/route.c | 13 -
  1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 85f184e..c59fb0d 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -523,10 +523,21 @@ static void build_skb_flow_key(struct 
flowi4 *fl4, const struct sk_buff *skb,

 const struct sock *sk)
  {
  const struct iphdr *iph = ip_hdr(skb);
-int oif = skb->dev->ifindex;
+struct net_device *master = NULL;
  u8 tos = RT_TOS(iph->tos);
  u8 prot = iph->protocol;
  u32 mark = skb->mark;
+int oif;
+
+if (skb->dev->flags & IFF_SLAVE) {
+rtnl_lock();
+master = netdev_master_upper_dev_get(skb->dev);
+rtnl_unlock();
update_pmtu is called very frequently. Is it appropriate to use 
rtnl_lock here?
By "very frequently", how frequently it is expected? And what 
situation can cause that?

For my case, the update_pmtu is called only once.

ip_tunnel_xmit


Can you please explain with more details?


dev_queue_xmit->ipip_tunnel_xmit->ip_tunnel_xmit->tnl_update_pmtu-> 
skb_dst(skb)->ops->update_pmtu
For ipip,  yes seems update_pmtu is called in line for each call of 
queue_xmit.  Do you know if it's a good configuration for ipip + bonding?

Other's comment and suggestion?

thanks,
wengang


Re: [PATCH net] net: diag: support v4mapped sockets in inet_diag_find_one_icsk()

2016-01-20 Thread Lorenzo Colitti
On Wed, Jan 20, 2016 at 4:25 PM, Eric Dumazet  wrote:
> Lorenzo reported that we could not properly find v4mapped sockets
> in inet_diag_find_one_icsk(). This patch fixes the issue.
>
> Reported-by: Lorenzo Colitti 
> Signed-off-by: Eric Dumazet 

This fixes the problem for me. I tested that SOCK_DESTROY (which uses
inet_diag_find_one_icsk) can successfully find/close the following
AF_INET6 sockets:

- Listen sockets bound to either :: or :::0.0.0.0
- Accepted IPv4 connections on sockets bound to either :: or :::0.0.0.0
- Active IPv4 connections

Acked-by: Lorenzo Colitti 


Re: [PATCH net 0/3] net: phy: Finally fix PHY_IGNORE_INTERRUPTS

2016-01-20 Thread David Miller
From: 
Date: Thu, 21 Jan 2016 00:59:42 +

>  > I'm experiencing misbehavior after restart system.
>> Can you wait applying the patch?
>> 
>> Sorry about it.
>> Woojung
> 
> Sorry for spam. Version mismatch happened. :(
> It works as expected on USB-to-Ethernet point of view.

So Florian, can I apply this series now?


Re: [PATCH net-next 0/2] tg3: update on maintainer and fix for tx timeout

2016-01-20 Thread David Miller
From: skallam 
Date: Wed, 20 Jan 2016 17:16:56 +0530

> From: Siva Reddy Kallam 
> 
> First patch:
>   This patch includes additional maintainer for tg3
> 
> Second patch:
>   Fix for tg3 transmit queue 0 timed out when too many gso_segs

Why are you targeting bug fixes at the 'net-next' tree?


Re: [PATCH net] gro: Make GRO aware of lightweight tunnels.

2016-01-20 Thread Thomas Graf
On 01/20/16 at 05:47pm, Jesse Gross wrote:
> Just to merge the two threads together, all of protocols that would be
> affected by this also have "normal" GRO handlers that will run when
> the packet is first received. There's no argument that that is
> preferable if it is available. However, GRO cells do provide a
> performance benefit in other situations so it would be nice to avoid
> disabling it if possible.

I missed this thread when I replied to the other one.

What are these situations? It seems like there are specific
scenarios where this helps. Is it varying TLVs in the encap header
for otherwise mergeable inner headers?


[PATCH iproute2] ip-link: remove warning message

2016-01-20 Thread Zhang Shengju
the warning was:
iproute.c:301:12: warning: 'val' may be used uninitialized in this
function [-Wmaybe-uninitialized]
   features &= ~RTAX_FEATURE_ECN;
^
iproute.c:575:10: note: 'val' was declared here
   __u32 val;
  ^

Signed-off-by: Zhang Shengju 
---
 ip/iproute.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ip/iproute.c b/ip/iproute.c
index d5e3ebe..afe70e1 100644
--- a/ip/iproute.c
+++ b/ip/iproute.c
@@ -572,7 +572,7 @@ int print_route(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
mxlock = *(unsigned*)RTA_DATA(mxrta[RTAX_LOCK]);
 
for (i=2; i<= RTAX_MAX; i++) {
-   __u32 val;
+   __u32 val = 0U;
 
if (mxrta[i] == NULL)
continue;
-- 
1.8.3.1





Re: [PATCH 0/6] Netfilter fixes for net

2016-01-20 Thread David Miller
From: Pablo Neira Ayuso 
Date: Wed, 20 Jan 2016 18:03:58 +0100

> The following patchset contains Netfilter fixes for your net tree, they
> are:
> 
> 1) Fix accidental 3-times le/be conversion for 64-bits in nft_byteorder,
>from Florian Westphal.
> 
> 2) Get rid of defensive cidr = 0 check in the ipset hash:netiface set
>type which doesn't allow valid 0.0.0.0/0 elements, also from Florian.
> 
> 3) Relocate #endif in nft_ct counter support, this doesn't have any
>relation with labels.
> 
> 4) Fix TCPMSS target for IPv6 when skb has CHECKSUM_COMPLETE, from
>Eric Dumazet.
> 
> 5) Fix netdevice notifier leak from the error path of nf_tables_netdev.
> 
> 6) Safe conntrack hashtable resizing by introducing a global lock and
>synchronize all buckets to avoid going over the maximum number of
>preemption levels, from Sasha Levin.
> 
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Pulled, thanks Pablo.


Re: [PATCH net] gro: Make GRO aware of lightweight tunnels.

2016-01-20 Thread Jesse Gross
On Wed, Jan 20, 2016 at 6:31 PM, Thomas Graf  wrote:
> On 01/20/16 at 05:47pm, Jesse Gross wrote:
>> Just to merge the two threads together, all of protocols that would be
>> affected by this also have "normal" GRO handlers that will run when
>> the packet is first received. There's no argument that that is
>> preferable if it is available. However, GRO cells do provide a
>> performance benefit in other situations so it would be nice to avoid
>> disabling it if possible.
>
> I missed this thread when I replied to the other one.
>
> What are these situations? It seems like there are specific
> scenarios where this helps. Is it varying TLVs in the encap header
> for otherwise meregable inner headers?

It's nothing really fancy or tunnel type specific.

It's obviously preferable to merge things as soon as the packet is
received but if we don't (for example, if we don't have a checksum
provided by hardware) then we take another crack at it after
decapsulation. There's still enough stack left after decapsulation for
it to make a difference, particularly if you are going up to a VM. I
guess this shouldn't be surprising because it's basically equivalent
to GRO when there is no tunnel at all.

There was some previous discussion about this a little while ago:
https://www.mail-archive.com/netdev@vger.kernel.org/msg68880.html


Re: [PATCH net] net: change tcp_syn_retries documentation

2016-01-20 Thread David Miller
From: Xin Long 
Date: Wed, 20 Jan 2016 16:12:33 +0800

> Documentation should be kept consistent with the code:
> 
>  static int tcp_syn_retries_max = MAX_TCP_SYNCNT;
>  #define MAX_TCP_SYNCNT  127
> 
> Signed-off-by: Xin Long 

Applied.


[PATCH] net: take care of bonding in build_skb_flow_key (v4)

2016-01-20 Thread Wengang Wang
In a bonding setting, we determine fragment size according to MTU and
PMTU associated to the bonding master. If the slave finds the fragment
size is too big, it drops the fragment and calls ip_rt_update_pmtu(),
passing _skb_ and _pmtu_, trying to update the path MTU.
Problem is that the target device that function ip_rt_update_pmtu actually
tries to update is the slave (skb->dev), not the master. Thus since no
PMTU change happens on master, the fragment size for later packets doesn't
change so all later fragments/packets are dropped too.

The fix is letting build_skb_flow_key() take care of the transition of
device index from bonding slave to the master. That makes the master become
the target device that ip_rt_update_pmtu tries to update PMTU to.

Signed-off-by: Wengang Wang 
---
 net/ipv4/route.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 85f184e..7e766b5 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -524,10 +524,19 @@ static void build_skb_flow_key(struct flowi4 *fl4, const 
struct sk_buff *skb,
 {
const struct iphdr *iph = ip_hdr(skb);
int oif = skb->dev->ifindex;
+   struct net_device *master;
u8 tos = RT_TOS(iph->tos);
u8 prot = iph->protocol;
u32 mark = skb->mark;
 
+   if (netif_is_bond_slave(skb->dev)) {
+   rcu_read_lock();
+   master = netdev_master_upper_dev_get_rcu(skb->dev);
+   if (master)
+   oif = master->ifindex;
+   rcu_read_unlock();
+   }
+
__build_flow_key(fl4, sk, iph, oif, tos, prot, mark, 0);
 }
 
-- 
2.1.0



[PATCH] net:mac80211:mesh_plink: remove redundant sta_info check

2016-01-20 Thread Sunil Shahu
Remove unnecessary "if" statement and club it with previous "if" block.

Signed-off-by: Sunil Shahu 
---
 net/mac80211/mesh_plink.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/net/mac80211/mesh_plink.c b/net/mac80211/mesh_plink.c
index bd3d55e..e5851ae 100644
--- a/net/mac80211/mesh_plink.c
+++ b/net/mac80211/mesh_plink.c
@@ -976,6 +976,9 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
mpl_dbg(sdata, "Mesh plink error: no more free 
plinks\n");
goto out;
}
+   /* new matching peer */
+   event = OPN_ACPT;
+   goto out;
} else {
if (!test_sta_flag(sta, WLAN_STA_AUTH)) {
mpl_dbg(sdata, "Mesh plink: Action frame from 
non-authed peer\n");
@@ -985,12 +988,6 @@ mesh_plink_get_event(struct ieee80211_sub_if_data *sdata,
goto out;
}
 
-   /* new matching peer */
-   if (!sta) {
-   event = OPN_ACPT;
-   goto out;
-   }
-
switch (ftype) {
case WLAN_SP_MESH_PEERING_OPEN:
if (!matches_local)
-- 
1.9.1



Re: Kernel memory leak in bnx2x driver with vxlan tunnel

2016-01-20 Thread Thomas Graf
On 01/20/16 at 04:34pm, Eric Dumazet wrote:
> On Wed, 2016-01-20 at 16:19 -0800, Jesse Gross wrote:
> 
> > I have a patch that implements the comparison between dsts. For
> > packets without a dst, there isn't really a cost and if we do have a
> > dst then GRO is still a benefit. So it seems like it is worth doing,
> > even if it is more expensive than is ideal.
> 
> You guys really want to kill GRO performance.
> 
> Really the aggregation should happen at the first layer (ethernet
> device), instead of doing it after tunnel decap.
> 
> This is already done for GRE, IPIP, SIT, ...
> 
> GRO having to access metadata looks wrong, if you think about trying to
> do the same function in hardware (offload)

If I understand Jesse correctly then the added cost is only for
metadata enabled packets. Though I agree, what's the benefit of
doing GRO after decap?

It seems like it's way too late and we've already paid the cost
by going through the stack for each outer header packet.


Re: [PATCH] net: dp83640: Fix tx timestamp overflow handling.

2016-01-20 Thread David Miller
From: Richard Cochran 
Date: Wed, 20 Jan 2016 14:25:45 +0100

> On Wed, Jan 20, 2016 at 11:22:28AM +0100, Manfred Rudigier wrote:
>> PHY status frames are not reliable, the PHY may not be able to send them
>> during heavy receive traffic. This overflow condition is signaled by the
>> PHY in the next status frame, but the driver did not make use of it.
>> Instead it always reported wrong tx timestamps to user space after an
>> overflow happened because it assigned newly received tx timestamps to old
>> packets in the queue.
>> 
>> This commit fixes this issue by clearing the tx timestamp queue every time
>> an overflow happens, so that no timestamps are delivered for overflow
>> packets. This way time stamping will continue correctly after an overflow.
> 
> Hi Manfred,
> 
> Thanks for finding and fixing this bug.
> 
> Dave, this patch should also go into stable.
> 
> Acked-by: Richard Cochran 

Applied and queued up for -stable, thanks everyone.


Re: [PATCH RESEND v2] ixgbe: on recv increment rx.ring->stats.yields

2016-01-20 Thread Jeff Kirsher
On Wed, 2016-01-20 at 11:08 +0300, Pavel Tikhomirov wrote:
> It seem to be non intentionaly changed to tx in
> commit adc810900a70 ("ixgbe: Refactor busy poll socket code to
> address
> multiple issues")
> 
> Lock is taken from ixgbe_low_latency_recv, and there under this
> lock we use ixgbe_clean_rx_irq so it looks wrong for me to increment
> tx counter.
> 
> Yield stats can be shown through ethtool:
> ethtool -S enp129s0 | grep yield
> 
> v2: follow commit citing style
> 
> Signed-off-by: Pavel Tikhomirov 
> ---
>  drivers/net/ethernet/intel/ixgbe/ixgbe.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)

Why are you re-sending this patch?  I already have it queued up for
testing in my tree.  This just causes problems and extra work for me.

signature.asc
Description: This is a digitally signed message part


Re: [PATCH net 0/3] net: phy: Finally fix PHY_IGNORE_INTERRUPTS

2016-01-20 Thread Florian Fainelli
On January 20, 2016 6:58:09 PM PST, David Miller  wrote:
>From: 
>Date: Thu, 21 Jan 2016 00:59:42 +
>
>>  > I'm experiencing misbehavior after restart system.
>>> Can you wait applying the patch?
>>> 
>>> Sorry about it.
>>> Woojung
>> 
>> Sorry for spam. Version mismatch happened. :(
>> It works as expected on USB-to-Ethernet point of view.
>
>So Florian, can I apply this series now?

Yes, sorry for the lag and thanks for your patience.


-- 
Florian


Re: [PATCH net v2] gro: Make GRO aware of lightweight tunnels.

2016-01-20 Thread Thomas Graf
On 01/20/16 at 05:59pm, Jesse Gross wrote:
> GRO is currently not aware of tunnel metadata generated by lightweight
> tunnels and stored in the dst. This leads to two possible problems:
>  * Incorrectly merging two frames that have different metadata.
>  * Leaking of allocated metadata from merged frames.
> 
> This avoids those problems by comparing the tunnel information before
> merging, similar to how we handle other metadata (such as vlan tags),
> and releasing any state when we are done.
> 
> Reported-by: John 
> Fixes: 2e15ea39 ("ip_gre: Add support to collect tunnel metadata.")
> Signed-off-by: Jesse Gross 

Acked-by: Thomas Graf 

Thanks Jesse


[PATCH] net/irda: bfin_sir: remove duplicate defines

2016-01-20 Thread Sudip Mukherjee
The defconfig build of blackfin is failing with the error:

arch/blackfin/include/asm/bfin_serial.h:269:0: warning: "port_membase" redefined
drivers/net/irda/bfin_sir.h:85:0: note: this is the location of the previous 
definition
arch/blackfin/include/asm/bfin_serial.h:382:0: warning: "get_lsr_cache" 
redefined
drivers/net/irda/bfin_sir.h:86:0: note: this is the location of the previous 
definition
arch/blackfin/include/asm/bfin_serial.h:383:0: warning: "put_lsr_cache" 
redefined
drivers/net/irda/bfin_sir.h:87:0: note: this is the location of the previous 
definition

port_membase, get_lsr_cache, put_lsr_cache are already defined in the
architecture files, no need to define them again in the driver.

Signed-off-by: Sudip Mukherjee 
---
 drivers/net/irda/bfin_sir.h | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/drivers/net/irda/bfin_sir.h b/drivers/net/irda/bfin_sir.h
index 29cbde8..d47cf14 100644
--- a/drivers/net/irda/bfin_sir.h
+++ b/drivers/net/irda/bfin_sir.h
@@ -82,9 +82,6 @@ struct bfin_sir_self {
 
 #define DRIVER_NAME "bfin_sir"
 
-#define port_membase(port) (((struct bfin_sir_port *)(port))->membase)
-#define get_lsr_cache(port)(((struct bfin_sir_port *)(port))->lsr)
-#define put_lsr_cache(port, v) (((struct bfin_sir_port *)(port))->lsr = (v))
 #include 
 
 static const unsigned short per[][4] = {
-- 
1.9.1



Re: [PATCH net] net: diag: support v4mapped sockets in inet_diag_find_one_icsk()

2016-01-20 Thread David Miller
From: Eric Dumazet 
Date: Wed, 20 Jan 2016 16:25:01 -0800

> From: Eric Dumazet 
> 
> Lorenzo reported that we could not properly find v4mapped sockets
> in inet_diag_find_one_icsk(). This patch fixes the issue.
> 
> Reported-by: Lorenzo Colitti 
> Signed-off-by: Eric Dumazet 

Applied, thanks Eric.


Re: [PATCH net v2] gro: Make GRO aware of lightweight tunnels.

2016-01-20 Thread Eric Dumazet
On Wed, 2016-01-20 at 17:59 -0800, Jesse Gross wrote:
> GRO is currently not aware of tunnel metadata generated by lightweight
> tunnels and stored in the dst. This leads to two possible problems:
>  * Incorrectly merging two frames that have different metadata.
>  * Leaking of allocated metadata from merged frames.
> 
> This avoids those problems by comparing the tunnel information before
> merging, similar to how we handle other metadata (such as vlan tags),
> and releasing any state when we are done.
> 
> Reported-by: John 
> Fixes: 2e15ea39 ("ip_gre: Add support to collect tunnel metadata.")
> Signed-off-by: Jesse Gross 
> ---

Acked-by: Eric Dumazet 




Re: [PATCH] net: take care of bonding in build_skb_flow_key (v3)

2016-01-20 Thread Jay Vosburgh
Wengang Wang  wrote:
[...]
>For ipip,  yes seems update_pmtu is called in line for each call of
>queue_xmit.  Do you know if it's a good configuration for ipip + bonding?

Yes, it is.

>Other's comment and suggestion?

I agree with Sabrina Dubroca 's suggestions
from yesterday.

-J

---
-Jay Vosburgh, jay.vosbu...@canonical.com


Re: [PATCH v3 00/16] add Intel X722 iWARP driver

2016-01-20 Thread Or Gerlitz

On 1/21/2016 12:57 AM, Steve Wise wrote:

I also asked you why the port mapper code has to be present in each
iwarp driver and not part of the IB core stack, and you responded
"i40iw iwarp driver registers with port mapper and uses its services.
Beside that it is not the scope of the patch series"  -- well, it is
in the scope of upstream review to pose such questions, please
address.


There is a common service/API in the IB core for iWarp port mapping.  See 
drivers/infiniband/core/iwpm*.c and include/rdma/iw_portmap.h.




Steve,

Okay, but I still don't follow why each vendor needs an RDMA_NL_YYY ID 
(RDMA_NL_C4IW, RDMA_NL_NES and now a new one  for this driver) of their own.


I see that the two current drivers actually use the same callbacks 
towards user-space, does each vendor use a different user-space daemon, 
why?


Or.

net.git]# cd drivers/infiniband/hw/

hw]# git grep RDMA_NL_IWPM .
cxgb4/device.c: [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
cxgb4/device.c: [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
cxgb4/device.c: [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = 
iwpm_add_and_query_mapping_cb},

cxgb4/device.c: [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
cxgb4/device.c: [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
cxgb4/device.c: [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
cxgb4/device.c: [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = 
iwpm_ack_mapping_info_cb}

cxgb4/device.c: if (ibnl_add_client(RDMA_NL_C4IW, RDMA_NL_IWPM_NUM_OPS,

nes/nes.c:  [RDMA_NL_IWPM_REG_PID] = {.dump = iwpm_register_pid_cb},
nes/nes.c:  [RDMA_NL_IWPM_ADD_MAPPING] = {.dump = iwpm_add_mapping_cb},
nes/nes.c:  [RDMA_NL_IWPM_QUERY_MAPPING] = {.dump = 
iwpm_add_and_query_mapping_cb},

nes/nes.c:  [RDMA_NL_IWPM_REMOTE_INFO] = {.dump = iwpm_remote_info_cb},
nes/nes.c:  [RDMA_NL_IWPM_HANDLE_ERR] = {.dump = iwpm_mapping_error_cb},
nes/nes.c:  [RDMA_NL_IWPM_MAPINFO] = {.dump = iwpm_mapping_info_cb},
nes/nes.c:  [RDMA_NL_IWPM_MAPINFO_NUM] = {.dump = 
iwpm_ack_mapping_info_cb}
nes/nes.c:  if (ibnl_add_client(RDMA_NL_NES, RDMA_NL_IWPM_NUM_OPS, 
nes_nl_cb_table))




RE: [PATCH net 0/3] net: phy: Finally fix PHY_IGNORE_INTERRUPTS

2016-01-20 Thread Woojung.Huh
 > I'm experiencing misbehavior after restart system.
> Can you wait applying the patch?
> 
> Sorry about it.
> Woojung

Sorry for spam. Version mismatch happened. :(
It works as expected on USB-to-Ethernet point of view.

Woojung


Re: net: GPF in __netlink_ns_capable

2016-01-20 Thread Herbert Xu
On Wed, Jan 20, 2016 at 02:35:59PM +, Wan, Kaike wrote:
> >From the code (netlink_dump() in net/netlink/af_netlink.c), it shows that a 
> >skb is allocated without initializing the skb->cb[] field, which will cause 
> >oops if netlink_capable() is called with the duplicate skb. This will happen 
> >if the netlink_dump_start() path is followed (in ibnl_rcv_msg() in 
> >drivers/infiniband/core/netlink.c). However, for the IB netlink local 
> >service, we handle only request RDMA_NL_LS_OP_SET_TIMEOUT and response to 
> >RDMA_NL_LS_OP_RESOLVE, which directly call the registered dump function 
> >(ib_nl_handle_set_timeout() and ib_nl_handle_resolve_resp()). See the 
> >following snippet:

You'll find a reproducer in the original email:

http://lkml.iu.edu/hypermail/linux/kernel/1601.1/06505.html

Cheers,
-- 
Email: Herbert Xu 
Home Page: http://gondor.apana.org.au/~herbert/
PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH V3 5/5] net: can: ifi: Add IFI CANFD IP support

2016-01-20 Thread Marek Vasut
On Wednesday, January 20, 2016 at 03:46:58 PM, Marc Kleine-Budde wrote:
> On 01/20/2016 03:41 PM, Marek Vasut wrote:
> > On Wednesday, January 20, 2016 at 03:39:51 PM, Marc Kleine-Budde wrote:
> >> On 01/20/2016 03:33 PM, Marek Vasut wrote:
> >>> The patch adds support for IFI CAN/FD controller [1]. This driver
> >>> currently supports sending and receiving both standard CAN and new
> >>> CAN/FD frames. Both ISO and BOSCH variant of CAN/FD is supported.
> >>> 
> >>> [1] http://www.ifi-pld.de/IP/CANFD/canfd.html
> >>> 
> >>> Signed-off-by: Marek Vasut 
> >>> Cc: Marc Kleine-Budde 
> >>> Cc: Mark Rutland 
> >>> Cc: Oliver Hartkopp 
> >>> Cc: Wolfgang Grandegger 
> >>> ---
> >>> V2: - Move request_irq()/free_irq() into
> >>> ifi_canfd_open()/ifi_canfd_close()
> >>> 
> >>>   just like other drivers do it to prevent crash when reloading
> >>>   module.
> >>> 
> >>> - Fix Woverflow complains on x86_64 and itanium, exactly the same
> >>> way
> >>> 
> >>>   as in commit dec23dca5a9ca4b9eb2fb66926f567889028b904 .
> >>> 
> >>> V3: - Hopefully fix all problems with BIT(31) by adding more u32 casts.
> >>> 
> >>> - Drop struct device from struct ifi_canfd_priv .
> >>> 
> >>> NOTE: The driver is surprisingly similar to m_can, but the register
> >>> 
> >>>   layout of the IFI core is completely different, so it's clear
> >>>   that those are two different IP cores.
> >>> 
> >>> ---
> > 
> > [...]
> > 
> > Hi,
> > 
> >>> +static irqreturn_t ifi_canfd_isr(int irq, void *dev_id)
> >>> +{
> >>> + struct net_device *ndev = (struct net_device *)dev_id;
> >>> + struct ifi_canfd_priv *priv = netdev_priv(ndev);
> >>> + struct net_device_stats *stats = &ndev->stats;
> >>> + const u32 rx_irq_mask = IFI_CANFD_INTERRUPT_RXFIFO_NEMPTY |
> >>> + IFI_CANFD_INTERRUPT_RXFIFO_NEMPTY_PER;
> >>> + const u32 tx_irq_mask = IFI_CANFD_INTERRUPT_TXFIFO_EMPTY |
> >>> + IFI_CANFD_INTERRUPT_TXFIFO_REMOVE;
> >>> + const u32 clr_irq_mask = (u32)(~(IFI_CANFD_INTERRUPT_SET_IRQ |
> >>> +  IFI_CANFD_INTERRUPT_ERROR_WARNING));
> >> 
> >> I've squashed:
>  -   const u32 clr_irq_mask = (u32)(~(IFI_CANFD_INTERRUPT_SET_IRQ |
>  -
>  IFI_CANFD_INTERRUPT_ERROR_WARNING)); +   const u32 clr_irq_mask =
>  ~(IFI_CANFD_INTERRUPT_SET_IRQ | +
>  IFI_CANFD_INTERRUPT_ERROR_WARNING);
> >> 
> >> and the driver compiles without warnings.
> > 
> > It doesn't , try with x86_64_defconfig, it will complain with -Woverflow
> > on gcc 4.9 or newer. That's what the kernel robot complained about in V1
> > of the patch too.
> 
> Doh! Right, let's try this:
> >> -#define IFI_CANFD_INTERRUPT_ERROR_WARNING  BIT(1)
> >> +#define IFI_CANFD_INTERRUPT_ERROR_WARNING  ((u32)BIT(1))
> > 
> > I'd be happy to be proven wrong though.
> 
> /me too

I think that will do the trick too. Do you want a V4 patch or will you fix it?


Re: [PATCH] net: Fix potential NULL pointer dereference in __skb_try_recv_datagram

2016-01-20 Thread Jacob Siverskog
On Tue, Jan 5, 2016 at 3:39 PM, Eric Dumazet  wrote:
> On Tue, 2016-01-05 at 15:34 +0100, Jacob Siverskog wrote:
>> On Tue, Jan 5, 2016 at 3:14 PM, Eric Dumazet  wrote:
>
>> >
>> > You might build a kernel with KASAN support to get maybe more chances to
>> > trigger the bug.
>> >
>> > ( https://www.kernel.org/doc/Documentation/kasan.txt )
>> >
>>
>> Ah. Doesn't seem to be supported on arm(32) unfortunately.
>
> Then you could at least use standard debugging features :
>
> CONFIG_SLAB=y
> CONFIG_SLABINFO=y
> CONFIG_DEBUG_SLAB=y
> CONFIG_DEBUG_SLAB_LEAK=y
>
> (Or equivalent SLUB options)
>
> and
>
> CONFIG_DEBUG_PAGEALLOC=y
>
> (If arm(32) has CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y)

I tried with those enabled and while toggling power on the Bluetooth
interface I usually get this after a few iterations:
kernel: Bluetooth: Unable to push skb to HCI core(-6)
kernel: (stc):  proto stack 4's ->recv failed
kernel: Slab corruption (Not tainted): skbuff_head_cache start=c08a8a00, len=176
kernel: 0a0: 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6b 6a 6b 6b a5  jkk.
kernel: Prev obj: start=c08a8940, len=176
kernel: 000: 00 00 00 00 00 00 00 00 31 73 52 00 43 17 2b 14  1sR.C.+.
kernel: 010: 00 00 00 00 00 00 00 00 04 00 00 00 01 00 00 00  
kernel: Next obj: start=c08a8ac0, len=176
kernel: 000: 00 00 00 00 00 00 00 00 01 42 f6 50 36 17 2b 14  .B.P6.+.
kernel: 010: 00 00 00 00 00 00 00 00 04 00 00 00 01 00 00 00  

The "Unable to push skb" and "recv failed" lines always appear before
the corruption.

Unfortunately, the corruptions occur also with your patch.


Re: ethtool NFC/ntuple API questions

2016-01-20 Thread Ben Hutchings
On Wed, 2016-01-20 at 09:53 -0800, Alexander Duyck wrote:
> On Wed, Jan 20, 2016 at 9:10 AM, Edward Cree  wrote:
> > I'm looking into adding IPv6 support to the ethtool flow steering API.  But,
> > I don't know "the unfortunate history of and subtle differences between the
> > RX n-tuple versus RX NFC commands".  In particular, would I need to add IPv6
> > support to both of them, or only one?  If one would be sufficient, which one
> > is preferred?
> 
> I'd say just focus on Rx NFC.  The NTUPLE interface is only really
> used for legacy support on ixgbe if I recall correctly.

sfc also supported it for a while.

> The original
> implementation was badly broken, but because it went out we are stuck
> supporting it.  Any new features can be added to the Rx NFC since that
> is the interface that has the ability to both set and get filters.

Right.  In fact maybe the ntuple stuff could be removed from the UAPI
headers given it's no longer part of the actual UAPI.

> > Also, is it necessary to duplicate the profusion of variants that the IPv4
> > flow specs have (3x struct ethtool_tcpip4_spec, 2x struct
> > ethtool_ah_espip4_spec, and struct ethtool_usrip4_spec), or should I just
> > make one struct that contains all the fields from those (I would say "the
> > union of their fields", but that might be confusing), and rely on flow_type
> > to indicate which fields are meaningful?
> 
> I'd say just stick with the approach taken for IPv4 since it makes it
> much more readable.  There were only really 4 types in use, but we
> named each to make certain it was clear which one should be used for
> each type.  To some extent the approach of relying on the flow_type is
> already in use, it is just made clearer by specifying which union to
> use for each type.

I don't mind one way or the other.

> > And, what exactly are the hdata fields in ethtool_flow_union and the
> > anonymous union in ethtool_rx_ntuple_flow_spec (they're not documented) and
> > why are they different sizes?
> 
> The hdata is essentially just padding that defines the entire region
> size.  For the user interface we have to reserve some amount of space,
> and in order to make the flow definitions compatible with NTUPLE we
> added extensions so that we could provide the information about the
> VLAN header if needed.
> 
> The reason for the sizing difference is that the ethtool_flow_union is
> half of the flow definition, the other half is stored in
> ethtool_flow_ext.  We shimmed ethtool_flow_ext into Rx NFC in order to
> work around the limitations of the NTUPLE filters.  The structure you
> probably should be looking at for comparison to NTUPLE is
> ethtool_rx_flow_spec, not ethtool_flow_union as that helps to tell the
> whole story.

Right.  Further, we can extend ethtool_flow_ext *downwards* so long as
we shrink ethtool_flow_union by the same amount (and add a type flag
for the extension).

I already checked that ethtool_flow_union remained large enough to hold
IPv6 flow specs because I expected sfc would support them some day. :-)

Ben.

-- 
Ben Hutchings
The program is absolutely right; therefore, the computer must be wrong.

signature.asc
Description: This is a digitally signed message part


Re: [PATCH] net: take care of bonding in build_skb_flow_key (v3)

2016-01-20 Thread Sabrina Dubroca
2016-01-20, 13:32:13 +0800, Wengang Wang wrote:
> In a bonding setting, we determines fragment size according to MTU and
> PMTU associated to the bonding master. If the slave finds the fragment
> size is too big, it drops the fragment and calls ip_rt_update_pmtu(),
> passing _skb_ and _pmtu_, trying to update the path MTU.
> Problem is that the target device that function ip_rt_update_pmtu actually
> tries to update is the slave (skb->dev), not the master. Thus since no
> PMTU change happens on master, the fragment size for later packets doesn't
> change so all later fragments/packets are dropped too.
> 
> The fix is letting build_skb_flow_key() take care of the transition of
> device index from bonding slave to the master. That makes the master become
> the target device that ip_rt_update_pmtu tries to update PMTU to.
> 
> Signed-off-by: Wengang Wang 
> ---
>  net/ipv4/route.c | 13 -
>  1 file changed, 12 insertions(+), 1 deletion(-)
> 
> diff --git a/net/ipv4/route.c b/net/ipv4/route.c
> index 85f184e..c59fb0d 100644
> --- a/net/ipv4/route.c
> +++ b/net/ipv4/route.c
> @@ -523,10 +523,21 @@ static void build_skb_flow_key(struct flowi4 *fl4, 
> const struct sk_buff *skb,
>  const struct sock *sk)
>  {
>   const struct iphdr *iph = ip_hdr(skb);
> - int oif = skb->dev->ifindex;
> + struct net_device *master = NULL;
>   u8 tos = RT_TOS(iph->tos);
>   u8 prot = iph->protocol;
>   u32 mark = skb->mark;
> + int oif;
> +
> + if (skb->dev->flags & IFF_SLAVE) {

Maybe use netif_is_bond_slave here instead, since you have this
problem with bonding slaves?


> + rtnl_lock();
> + master = netdev_master_upper_dev_get(skb->dev);
> + rtnl_unlock();
> + }

As zhuyj said, this is called from dev_queue_xmit, so you cannot take
rtnl_lock here.

> + if (master)
> + oif = master->ifindex;

You cannot dereference master after you release the rtnl lock.

So it would probably be best to use netdev_master_upper_dev_get_rcu,
as zhuyj suggested earlier, and make sure that you only use the result
between rcu_read_lock()/rcu_read_unlock():

rcu_read_lock();
master = netdev_master_upper_dev_get_rcu(skb->dev);
if (master)
oif = master->ifindex;
rcu_read_unlock();


Thanks,

-- 
Sabrina


Re: [PATCH v5 1/3] can: sja1000: of: add per-compatible init hook

2016-01-20 Thread Damien Riegel
Hi Marc,


On Wed, Jan 20, 2016 at 11:23:49AM +0100, Marc Kleine-Budde wrote:
> From: Damien Riegel 
> 
> This commit adds the capability to allocate and init private data
> embedded in the sja1000_priv structure on a per-compatible basis. The
> device node is passed as a parameter of the init callback to allow
> parsing of custom device tree properties.
> 
> Signed-off-by: Damien Riegel 
> Signed-off-by: Marc Kleine-Budde 
> ---
>  drivers/net/can/sja1000/sja1000_platform.c | 39 
> +++---
>  1 file changed, 30 insertions(+), 9 deletions(-)
> 
> diff --git a/drivers/net/can/sja1000/sja1000_platform.c 
> b/drivers/net/can/sja1000/sja1000_platform.c
> index 0552ed46a206..d398561b48ae 100644
> --- a/drivers/net/can/sja1000/sja1000_platform.c
> +++ b/drivers/net/can/sja1000/sja1000_platform.c
> @@ -27,6 +27,7 @@
>  #include 
>  #include 
>  #include 
> +#include 
>  #include 
>  
>  #include "sja1000.h"
> @@ -40,6 +41,11 @@ MODULE_DESCRIPTION("Socket-CAN driver for SJA1000 on the 
> platform bus");
>  MODULE_ALIAS("platform:" DRV_NAME);
>  MODULE_LICENSE("GPL v2");
>  
> +struct sja1000_of_data {
> + size_t  priv_sz;
> + int (*init)(struct sja1000_priv *priv, struct device_node *of);
> +};
> +
>  static u8 sp_read_reg8(const struct sja1000_priv *priv, int reg)
>  {
>   return ioread8(priv->reg_base + reg);
> @@ -154,6 +160,11 @@ static void sp_populate_of(struct sja1000_priv *priv, 
> struct device_node *of)
>   priv->cdr |= CDR_CBP; /* default */
>  }
>  
> +static const struct of_device_id sp_of_table[] = {
> + { /* sentinel */ },
> +};
> +MODULE_DEVICE_TABLE(of, sp_of_table);
> +

I will give this series a try, but it seems that

{.compatible = "nxp,sja1000"},

has disappeared from sp_of_table in v5.


Damien

>  static int sp_probe(struct platform_device *pdev)
>  {
>   int err, irq = 0;
> @@ -163,6 +174,9 @@ static int sp_probe(struct platform_device *pdev)
>   struct resource *res_mem, *res_irq = NULL;
>   struct sja1000_platform_data *pdata;
>   struct device_node *of = pdev->dev.of_node;
> + const struct of_device_id *of_id;
> + const struct sja1000_of_data *of_data = NULL;
> + size_t priv_sz = 0;
>  
>   pdata = dev_get_platdata(&pdev->dev);
>   if (!pdata && !of) {
> @@ -191,7 +205,13 @@ static int sp_probe(struct platform_device *pdev)
>   if (!irq && !res_irq)
>   return -ENODEV;
>  
> - dev = alloc_sja1000dev(0);
> + of_id = of_match_device(sp_of_table, &pdev->dev);
> + if (of_id && of_id->data) {
> + of_data = of_id->data;
> + priv_sz = of_data->priv_sz;
> + }
> +
> + dev = alloc_sja1000dev(priv_sz);
>   if (!dev)
>   return -ENOMEM;
>   priv = netdev_priv(dev);
> @@ -208,10 +228,17 @@ static int sp_probe(struct platform_device *pdev)
>   dev->irq = irq;
>   priv->reg_base = addr;
>  
> - if (of)
> + if (of) {
>   sp_populate_of(priv, of);
> - else
> +
> + if (of_data && of_data->init) {
> + err = of_data->init(priv, of);
> + if (err)
> + goto exit_free;
> + }
> + } else {
>   sp_populate(priv, pdata, res_mem->flags);
> + }
>  
>   platform_set_drvdata(pdev, dev);
>   SET_NETDEV_DEV(dev, &pdev->dev);
> @@ -242,12 +269,6 @@ static int sp_remove(struct platform_device *pdev)
>   return 0;
>  }
>  
> -static const struct of_device_id sp_of_table[] = {
> - {.compatible = "nxp,sja1000"},
> - {},
> -};
> -MODULE_DEVICE_TABLE(of, sp_of_table);
> -
>  static struct platform_driver sp_driver = {
>   .probe = sp_probe,
>   .remove = sp_remove,
> -- 
> 2.7.0.rc3
> 


Re: mlx4 fails ("go bit not cleared") without MSI

2016-01-20 Thread Bjorn Helgaas
On Wed, Jan 20, 2016 at 3:42 AM, Yishai Hadas  wrote:
>> From: Bjorn Helgaas [mailto:bhelg...@google.com]
>> I assume you're tried booting with "pci=nomsi" or "mlx_core.msi_x=0"
>> and it works for you?  Note that my board might be an internal design,
>> not a Mellanox board, so if it works for you, we can probably just
>> ignore this as being an artifact of the internal design.
>
> Yes, it works for me (module parameter msi_x=0), as you pointed out above
> you are not using Mellanox board but some internal design and
> it might be some artifact of.

OK, let's assume that's the case.  I haven't seen the actual board, so
I can't be certain what I'm using.  Thanks for checking into it.

Bjorn


Re: ethtool NFC/ntuple API questions

2016-01-20 Thread Alexander Duyck
On Wed, Jan 20, 2016 at 9:10 AM, Edward Cree  wrote:
> I'm looking into adding IPv6 support to the ethtool flow steering API.  But,
> I don't know "the unfortunate history of and subtle differences between the
> RX n-tuple versus RX NFC commands".  In particular, would I need to add IPv6
> support to both of them, or only one?  If one would be sufficient, which one
> is preferred?

I'd say just focus on Rx NFC.  The NTUPLE interface is only really
used for legacy support on ixgbe if I recall correctly.  The original
implementation was badly broken, but because it went out we are stuck
supporting it.  Any new features can be added to the Rx NFC since that
is the interface that has the ability to both set and get filters.

> Also, is it necessary to duplicate the profusion of variants that the IPv4
> flow specs have (3x struct ethtool_tcpip4_spec, 2x struct
> ethtool_ah_espip4_spec, and struct ethtool_usrip4_spec), or should I just
> make one struct that contains all the fields from those (I would say "the
> union of their fields", but that might be confusing), and rely on flow_type
> to indicate which fields are meaningful?

I'd say just stick with the approach taken for IPv4 since it makes it
much more readable.  There were only really 4 types in use, but we
named each to make certain it was clear which one should be used for
each type.  To some extent the approach of relying on the flow_type is
already in use, it is just made clearer by specifying which union to
use for each type.

> And, what exactly are the hdata fields in ethtool_flow_union and the
> anonymous union in ethtool_rx_ntuple_flow_spec (they're not documented) and
> why are they different sizes?

The hdata is essentially just padding that defines the entire region
size.  For the user interface we have to reserve some amount of space,
and in order to make the flow definitions compatible with NTUPLE we
added extensions so that we could provide the information about the
VLAN header if needed.

The reason for the sizing difference is that the ethtool_flow_union is
half of the flow definition, the other half is stored in
ethtool_flow_ext.  We shimmed ethtool_flow_ext into Rx NFC in order to
work around the limitations of the NTUPLE filters.  The structure you
probably should be looking at for comparison to NTUPLE is
ethtool_rx_flow_spec, not ethtool_flow_union as that helps to tell the
whole story.

- Alex


RE: net: GPF in __netlink_ns_capable

2016-01-20 Thread Wan, Kaike
The problem was caused by the RDMA_NL_LS_OP_RESOLVE request (not response) 
packet sent by the user application, which falls through the netlink_dump path 
and eventually calls ib_nl_handle_resp() with a new skb with uninitialized 
control block. Checking the NETLINK_CB(skb).sk before calling netlink_capable() 
will fix the problem.

I will submit a patch soon.

Kaike

> -Original Message-
> From: Herbert Xu [mailto:herb...@gondor.apana.org.au]
> Sent: Wednesday, January 20, 2016 10:00 AM
> To: Wan, Kaike
> Cc: Eric W. Biederman; Richard Weinberger; David S. Miller; Thomas Graf;
> Daniel Borkmann; Ken-ichirou MATSUZAWA; Nicolas Dichtel; Florian
> Westphal; netdev; LKML; syzkaller; Kostya Serebryany; Alexander Potapenko;
> Sasha Levin; Eric Dumazet; Dmitry Vyukov; Fleck, John; Weiny, Ira; Doug
> Ledford; Jason Gunthorpe
> Subject: Re: net: GPF in __netlink_ns_capable
> 
> On Wed, Jan 20, 2016 at 02:35:59PM +, Wan, Kaike wrote:
> > >From the code (netlink_dump() in net/netlink/af_netlink.c), it shows that a
> skb is allocated without initializing the skb->cb[] field, which will cause 
> oops
> if netlink_capable() is called with the duplicate skb. This will happen if the
> netlink_dump_start() path is followed (in ibnl_rcv_msg() in
> drivers/infiniband/core/netlink.c). However, for the IB netlink local service,
> we handle only request RDMA_NL_LS_OP_SET_TIMEOUT and response to
> RDMA_NL_LS_OP_RESOLVE, which directly call the registered dump function
> (ib_nl_handle_resolve_resp() and ib_nl_handle_resolve_resp()). See the
> following snippet:
> 
> You'll find a reproducer in the original email:
> 
> http://lkml.iu.edu/hypermail/linux/kernel/1601.1/06505.html
> 
> Cheers,
> --
> Email: Herbert Xu  Home Page:
> http://gondor.apana.org.au/~herbert/
> PGP Key: http://gondor.apana.org.au/~herbert/pubkey.txt


Re: [PATCH V3 5/5] net: can: ifi: Add IFI CANFD IP support

2016-01-20 Thread Marc Kleine-Budde
On 01/20/2016 04:03 PM, Marek Vasut wrote:
>>
>> Doh! Right, let's try this:
 -#define IFI_CANFD_INTERRUPT_ERROR_WARNING  BIT(1)
 +#define IFI_CANFD_INTERRUPT_ERROR_WARNING  ((u32)BIT(1))
>>>
>>> I'd be happy to be proven wrong though.
>>
>> /me too
> 
> I think that will do the trick too. Do you want a V4 patch or will you fix it?

I'll fix this.

Marc
-- 
Pengutronix e.K.  | Marc Kleine-Budde   |
Industrial Linux Solutions| Phone: +49-231-2826-924 |
Vertretung West/Dortmund  | Fax:   +49-5121-206917- |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |



signature.asc
Description: OpenPGP digital signature


Re: [PATCH v5 1/3] can: sja1000: of: add per-compatible init hook

2016-01-20 Thread Marc Kleine-Budde
On 01/20/2016 04:30 PM, Damien Riegel wrote:
>> +static const struct of_device_id sp_of_table[] = {
>> +{ /* sentinel */ },
>> +};
>> +MODULE_DEVICE_TABLE(of, sp_of_table);
>> +
> 
> I will give a try to this serie but it seems that
> 
>   {.compatible = "nxp,sja1000"},
> 
> has disappeared of sp_of_table in v5.

Doh! This is of course broken. Wait for v6, please.

Marc

-- 
Pengutronix e.K.  | Marc Kleine-Budde   |
Industrial Linux Solutions| Phone: +49-231-2826-924 |
Vertretung West/Dortmund  | Fax:   +49-5121-206917- |
Amtsgericht Hildesheim, HRA 2686  | http://www.pengutronix.de   |



signature.asc
Description: OpenPGP digital signature


[PATCH RT 1/2] net: provide a way to delegate processing a softirq to ksoftirqd

2016-01-20 Thread Sebastian Andrzej Siewior
If the NET_RX uses up all of its budget it moves the following NAPI
invocations into the `ksoftirqd`. On -RT it does not do so. Instead it
raises the NET_RX softirq in its current context again.

In order to get closer to mainline's behaviour this patch provides
__raise_softirq_irqoff_ksoft() which raises the softirq in the ksoftirqd.

Cc: stable...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior 
---
 include/linux/interrupt.h |  8 
 kernel/softirq.c  | 13 +
 net/core/dev.c|  2 +-
 3 files changed, 22 insertions(+), 1 deletion(-)

diff --git a/include/linux/interrupt.h b/include/linux/interrupt.h
index 79a9622b5a38..655cee096aed 100644
--- a/include/linux/interrupt.h
+++ b/include/linux/interrupt.h
@@ -465,6 +465,14 @@ extern void thread_do_softirq(void);
 extern void open_softirq(int nr, void (*action)(struct softirq_action *));
 extern void softirq_init(void);
 extern void __raise_softirq_irqoff(unsigned int nr);
+#ifdef CONFIG_PREEMPT_RT_FULL
+extern void __raise_softirq_irqoff_ksoft(unsigned int nr);
+#else
+static inline void __raise_softirq_irqoff_ksoft(unsigned int nr)
+{
+   __raise_softirq_irqoff(nr);
+}
+#endif
 
 extern void raise_softirq_irqoff(unsigned int nr);
 extern void raise_softirq(unsigned int nr);
diff --git a/kernel/softirq.c b/kernel/softirq.c
index f4c2e679a7d7..e83f38cf 100644
--- a/kernel/softirq.c
+++ b/kernel/softirq.c
@@ -641,6 +641,19 @@ void __raise_softirq_irqoff(unsigned int nr)
 }
 
 /*
+ * Same as __raise_softirq_irqoff() but will process them in ksoftirqd
+ */
+void __raise_softirq_irqoff_ksoft(unsigned int nr)
+{
+   if (WARN_ON_ONCE(!__this_cpu_read(ksoftirqd)))
+   return;
+   trace_softirq_raise(nr);
+   or_softirq_pending(1UL << nr);
+   __this_cpu_read(ksoftirqd)->softirqs_raised |= (1U << nr);
+   wakeup_softirqd();
+}
+
+/*
  * This function must run with irqs disabled!
  */
 void raise_softirq_irqoff(unsigned int nr)
diff --git a/net/core/dev.c b/net/core/dev.c
index f4475ccbc19b..13a55d0df151 100644
--- a/net/core/dev.c
+++ b/net/core/dev.c
@@ -4955,7 +4955,7 @@ static void net_rx_action(struct softirq_action *h)
list_splice_tail(, );
list_splice(, >poll_list);
if (!list_empty(>poll_list))
-   __raise_softirq_irqoff(NET_RX_SOFTIRQ);
+   __raise_softirq_irqoff_ksoft(NET_RX_SOFTIRQ);
 
net_rps_action_and_irq_enable(sd);
 }
-- 
2.7.0.rc3



Re: regression kernel 4.4: stops routing packets with a GRE-payload

2016-01-20 Thread Nicolas Dichtel

Le 20/01/2016 15:00, Wolfgang Walter a écrit :

Hello,

we tried 4.4 on our routers. We found one problem: 4.4 stops routing GRE
packets (ipv4 in GRE/ipv4) here. 4.4.15 works fine.

4.4.15 does not exist. Is it 4.1.15?


[PATCH 2/6] netfilter: ipset: allow a 0 netmask with hash_netiface type

2016-01-20 Thread Pablo Neira Ayuso
From: Florian Westphal 

Jozsef says:
 The correct behaviour is that if we have
 ipset create test1 hash:net,iface
 ipset add test1 0.0.0.0/0,eth0
 iptables -A INPUT -m set --match-set test1 src,src

 then the rule should match for any traffic coming in through eth0.

This removes the -EINVAL runtime test to make matching work
in case packet arrived via the specified interface.

Bugzilla: https://bugzilla.redhat.com/show_bug.cgi?id=1297092
Signed-off-by: Florian Westphal 
Acked-by: Jozsef Kadlecsik 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/ipset/ip_set_hash_netiface.c | 4 
 1 file changed, 4 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_hash_netiface.c 
b/net/netfilter/ipset/ip_set_hash_netiface.c
index 43d8c98..f0f688d 100644
--- a/net/netfilter/ipset/ip_set_hash_netiface.c
+++ b/net/netfilter/ipset/ip_set_hash_netiface.c
@@ -164,8 +164,6 @@ hash_netiface4_kadt(struct ip_set *set, const struct 
sk_buff *skb,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
-   if (e.cidr == 0)
-   return -EINVAL;
if (adt == IPSET_TEST)
e.cidr = HOST_MASK;
 
@@ -377,8 +375,6 @@ hash_netiface6_kadt(struct ip_set *set, const struct 
sk_buff *skb,
};
struct ip_set_ext ext = IP_SET_INIT_KEXT(skb, opt, set);
 
-   if (e.cidr == 0)
-   return -EINVAL;
if (adt == IPSET_TEST)
e.cidr = HOST_MASK;
 
-- 
2.1.4



[PATCH v6 2/3] can: sja1000: add documentation for Technologic Systems version

2016-01-20 Thread Marc Kleine-Budde
From: Damien Riegel 

This commit adds documentation for the Technologic Systems version of
SJA1000. The difference with the NXP version is in the way the registers
are accessed.

Signed-off-by: Damien Riegel 
Signed-off-by: Marc Kleine-Budde 
---
 Documentation/devicetree/bindings/net/can/sja1000.txt | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/Documentation/devicetree/bindings/net/can/sja1000.txt 
b/Documentation/devicetree/bindings/net/can/sja1000.txt
index b4a6d53fb01a..ac3160eca96a 100644
--- a/Documentation/devicetree/bindings/net/can/sja1000.txt
+++ b/Documentation/devicetree/bindings/net/can/sja1000.txt
@@ -2,7 +2,7 @@ Memory mapped SJA1000 CAN controller from NXP (formerly Philips)
 
 Required properties:
 
-- compatible : should be "nxp,sja1000".
+- compatible : should be one of "nxp,sja1000", "technologic,sja1000".
 
 - reg : should specify the chip select, address offset and size required
to map the registers of the SJA1000. The size is usually 0x80.
@@ -14,6 +14,7 @@ Optional properties:
 
 - reg-io-width : Specify the size (in bytes) of the IO accesses that
should be performed on the device.  Valid value is 1, 2 or 4.
+   This property is ignored for technologic version.
Default to 1 (8 bits).
 
 - nxp,external-clock-frequency : Frequency of the external oscillator
-- 
2.7.0.rc3



[PATCH v6 3/3] can: sja1000: of: add compatibility with Technologic Systems version

2016-01-20 Thread Marc Kleine-Budde
From: Damien Riegel 

Technologic Systems provides an IP compatible with the SJA1000,
instantiated in an FPGA. Because of some bus width issue, access to
registers is made through a "window" that works like this:

base + 0x0: address to read/write
base + 0x2: 8-bit register value

This commit adds a new compatible device, "technologic,sja1000", with
read and write functions using the window mechanism.

Signed-off-by: Damien Riegel 
Signed-off-by: Marc Kleine-Budde 
---
 drivers/net/can/sja1000/sja1000_platform.c | 47 ++
 1 file changed, 47 insertions(+)

diff --git a/drivers/net/can/sja1000/sja1000_platform.c 
b/drivers/net/can/sja1000/sja1000_platform.c
index 777d312f1779..dc9c6db96c3c 100644
--- a/drivers/net/can/sja1000/sja1000_platform.c
+++ b/drivers/net/can/sja1000/sja1000_platform.c
@@ -46,6 +46,10 @@ struct sja1000_of_data {
int (*init)(struct sja1000_priv *priv, struct device_node *of);
 };
 
+struct technologic_priv {
+   spinlock_t  io_lock;
+};
+
 static u8 sp_read_reg8(const struct sja1000_priv *priv, int reg)
 {
return ioread8(priv->reg_base + reg);
@@ -76,6 +80,43 @@ static void sp_write_reg32(const struct sja1000_priv *priv, 
int reg, u8 val)
iowrite8(val, priv->reg_base + reg * 4);
 }
 
+static u8 sp_technologic_read_reg16(const struct sja1000_priv *priv, int reg)
+{
+   struct technologic_priv *tp = priv->priv;
+   unsigned long flags;
+   u8 val;
+
+   spin_lock_irqsave(&tp->io_lock, flags);
+   iowrite16(reg, priv->reg_base + 0);
+   val = ioread16(priv->reg_base + 2);
+   spin_unlock_irqrestore(&tp->io_lock, flags);
+
+   return val;
+}
+
+static void sp_technologic_write_reg16(const struct sja1000_priv *priv,
+  int reg, u8 val)
+{
+   struct technologic_priv *tp = priv->priv;
+   unsigned long flags;
+
+   spin_lock_irqsave(&tp->io_lock, flags);
+   iowrite16(reg, priv->reg_base + 0);
+   iowrite16(val, priv->reg_base + 2);
+   spin_unlock_irqrestore(&tp->io_lock, flags);
+}
+
+static int sp_technologic_init(struct sja1000_priv *priv, struct device_node 
*of)
+{
+   struct technologic_priv *tp = priv->priv;
+
+   priv->read_reg = sp_technologic_read_reg16;
+   priv->write_reg = sp_technologic_write_reg16;
+   spin_lock_init(&tp->io_lock);
+
+   return 0;
+}
+
 static void sp_populate(struct sja1000_priv *priv,
struct sja1000_platform_data *pdata,
unsigned long resource_mem_flags)
@@ -160,8 +201,14 @@ static void sp_populate_of(struct sja1000_priv *priv, 
struct device_node *of)
priv->cdr |= CDR_CBP; /* default */
 }
 
+static struct sja1000_of_data technologic_data = {
+   .priv_sz = sizeof(struct technologic_priv),
+   .init = sp_technologic_init,
+};
+
 static const struct of_device_id sp_of_table[] = {
{ .compatible = "nxp,sja1000", .data = NULL, },
+   { .compatible = "technologic,sja1000", .data = &technologic_data, },
{ /* sentinel */ },
 };
 MODULE_DEVICE_TABLE(of, sp_of_table);
-- 
2.7.0.rc3



Re: [PATCH V3 5/5] net: can: ifi: Add IFI CANFD IP support

2016-01-20 Thread Marek Vasut
On Wednesday, January 20, 2016 at 04:09:48 PM, Marc Kleine-Budde wrote:
> On 01/20/2016 04:03 PM, Marek Vasut wrote:
> >> Doh! Right, let's try this:
>  -#define IFI_CANFD_INTERRUPT_ERROR_WARNING  BIT(1)
>  +#define IFI_CANFD_INTERRUPT_ERROR_WARNING  ((u32)BIT(1))
> >>> 
> >>> I'd be happy to be proven wrong though.
> >> 
> >> /me too
> > 
> > I think that will do the trick too. Do you want a V4 patch or will you
> > fix it?
> 
> I'll fix this.

Thanks :)

Best regards,
Marek Vasut


Re: [PATCH] net: Fix potential NULL pointer dereference in __skb_try_recv_datagram

2016-01-20 Thread Eric Dumazet
On Wed, 2016-01-20 at 17:17 +0100, Jacob Siverskog wrote:
> On Wed, Jan 20, 2016 at 4:48 PM, Eric Dumazet  wrote:
> > On Wed, 2016-01-20 at 16:06 +0100, Jacob Siverskog wrote:
> >> On Tue, Jan 5, 2016 at 3:39 PM, Eric Dumazet  
> >> wrote:
> >> > On Tue, 2016-01-05 at 15:34 +0100, Jacob Siverskog wrote:
> >> >> On Tue, Jan 5, 2016 at 3:14 PM, Eric Dumazet  
> >> >> wrote:
> >> >
> >> >> >
> >> >> > You might build a kernel with KASAN support to get maybe more chances 
> >> >> > to
> >> >> > trigger the bug.
> >> >> >
> >> >> > ( https://www.kernel.org/doc/Documentation/kasan.txt )
> >> >> >
> >> >>
> >> >> Ah. Doesn't seem to be supported on arm(32) unfortunately.
> >> >
> >> > Then you could at least use standard debugging features :
> >> >
> >> > CONFIG_SLAB=y
> >> > CONFIG_SLABINFO=y
> >> > CONFIG_DEBUG_SLAB=y
> >> > CONFIG_DEBUG_SLAB_LEAK=y
> >> >
> >> > (Or equivalent SLUB options)
> >> >
> >> > and
> >> >
> >> > CONFIG_DEBUG_PAGEALLOC=y
> >> >
> >> > (If arm(32) has CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y)
> >>
> >> I tried with those enabled and while toggling power on the Bluetooth
> >> interface I usually get this after a few iterations:
> >> kernel: Bluetooth: Unable to push skb to HCI core(-6)
> >
> > Well, this code seems to be quite buggy.
> >
> > I do not have time to audit it, but 5 minutes are enough to spot 2
> > issues.
> >
> > skb, once given to another queue/layer should not be accessed anymore.
> >
> 
> Ok. Unfortunately I still see the slab corruption even with your changes.

Patch was only showing potential _reads_ after free, which do not
generally corrupt memory.

As I said, a full audit is needed, and I don't have time for this.





[PATCH 1/6] netfilter: nft_byteorder: avoid unneeded le/be conversion steps

2016-01-20 Thread Pablo Neira Ayuso
From: Florian Westphal 

David points out that we do three le/be conversions instead
of just one.  Doesn't matter on x86_64 w. gcc, but other
architectures might be less lucky.

Since it also simplifies code just follow his advice.

Fixes: c0f3275f5cb ("nftables: byteorder: provide le/be 64 bit conversion 
helper")
Suggested-by: David Laight 
Signed-off-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nft_byteorder.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nft_byteorder.c b/net/netfilter/nft_byteorder.c
index 383c171..b78c28b 100644
--- a/net/netfilter/nft_byteorder.c
+++ b/net/netfilter/nft_byteorder.c
@@ -46,16 +46,14 @@ static void nft_byteorder_eval(const struct nft_expr *expr,
switch (priv->op) {
case NFT_BYTEORDER_NTOH:
for (i = 0; i < priv->len / 8; i++) {
-   src64 = get_unaligned_be64([i]);
-   src64 = be64_to_cpu((__force __be64)src64);
+   src64 = get_unaligned((u64 *)[i]);
put_unaligned_be64(src64, [i]);
}
break;
case NFT_BYTEORDER_HTON:
for (i = 0; i < priv->len / 8; i++) {
src64 = get_unaligned_be64([i]);
-   src64 = (__force u64)cpu_to_be64(src64);
-   put_unaligned_be64(src64, [i]);
+   put_unaligned(src64, (u64 *)[i]);
}
break;
}
-- 
2.1.4



Re: [PATCH] net: Fix potential NULL pointer dereference in __skb_try_recv_datagram

2016-01-20 Thread Peter Hurley
Hi Jacob,

On 01/05/2016 06:34 AM, Jacob Siverskog wrote:
> On Tue, Jan 5, 2016 at 3:14 PM, Eric Dumazet  wrote:
>> On Tue, 2016-01-05 at 12:07 +0100, Jacob Siverskog wrote:
>>> On Mon, Jan 4, 2016 at 4:25 PM, Eric Dumazet  wrote:
 On Mon, 2016-01-04 at 10:10 +0100, Jacob Siverskog wrote:
> On Wed, Dec 30, 2015 at 11:30 PM, Cong Wang  
> wrote:
>> On Wed, Dec 30, 2015 at 6:30 AM, Jacob Siverskog
>>  wrote:
>>> On Wed, Dec 30, 2015 at 2:26 PM, Eric Dumazet  
>>> wrote:
 How often can you trigger this bug ?
>>>
>>> Ok. I don't have a good repro to trigger it unfortunately, I've seen it 
>>> just a
>>> few times when bringing up/down network interfaces. Does the trace
>>> give any clue?
>>>
>>
>> A little bit. You need to help people to narrow down the problem
>> because there are too many places using skb->next and skb->prev.
>>
>> Since you mentioned it seems related to network interface flip,
>> what network interfaces are you using? What's is your TC setup?
>>
>> Thanks.
>
> The system contains only one physical network interface (TI WL1837,
> wl18xx module).
> The state prior to the crash was as follows:
> - One virtual network interface active (as STA, associated with access 
> point)
> - Bluetooth (BLE only) active (same physical chip, co-existence,
> btwilink/st_drv modules)
>
> Actions made around the time of the crash:
> - Bluetooth disabled
> - One additional virtual network interface brought up (also as STA)
>
> I believe the crash occurred between these two actions. I just saw
> that there are some interesting events in the log prior to the crash:
> kernel: Bluetooth: Unable to push skb to HCI core(-6)
> kernel: (stc):  proto stack 4's ->recv failed
> kernel: (stc): remove_channel_from_table: id 3
> kernel: (stc): remove_channel_from_table: id 2
> kernel: (stc): remove_channel_from_table: id 4
> kernel: (stc):  all chnl_ids unregistered
> kernel: (stk) :ldisc_install = 0(stc): st_tty_close
>
> The first print is from btwilink.c. However, I can't see the
> connection between Bluetooth (BLE) and UDP/IPv6 (we're not using
> 6LoWPAN or anything similar).
>
> Thanks, Jacob

 Definitely these details are useful ;)

 Could you try :

 diff --git a/drivers/misc/ti-st/st_core.c b/drivers/misc/ti-st/st_core.c
 index 6e3af8b42cdd..0c99a74fb895 100644
 --- a/drivers/misc/ti-st/st_core.c
 +++ b/drivers/misc/ti-st/st_core.c
 @@ -912,7 +912,9 @@ void st_core_exit(struct st_data_s *st_gdata)
 skb_queue_purge(&st_gdata->txq);
 skb_queue_purge(&st_gdata->tx_waitq);
 kfree_skb(st_gdata->rx_skb);
 +   st_gdata->rx_skb = NULL;
 kfree_skb(st_gdata->tx_skb);
 +   st_gdata->tx_skb = NULL;
 /* TTY ldisc cleanup */
 err = tty_unregister_ldisc(N_TI_WL);
 if (err)

FWIW,

You don't need that ti-st junk to get the WL1837 working; the WL1837 only
has BT channels. Unfortunately, that's really all I can say about it; sorry.

Regards,
Peter Hurley




Re: [PATCH] net: Fix potential NULL pointer dereference in __skb_try_recv_datagram

2016-01-20 Thread Eric Dumazet
On Wed, 2016-01-20 at 16:06 +0100, Jacob Siverskog wrote:
> On Tue, Jan 5, 2016 at 3:39 PM, Eric Dumazet  wrote:
> > On Tue, 2016-01-05 at 15:34 +0100, Jacob Siverskog wrote:
> >> On Tue, Jan 5, 2016 at 3:14 PM, Eric Dumazet  
> >> wrote:
> >
> >> >
> >> > You might build a kernel with KASAN support to get maybe more chances to
> >> > trigger the bug.
> >> >
> >> > ( https://www.kernel.org/doc/Documentation/kasan.txt )
> >> >
> >>
> >> Ah. Doesn't seem to be supported on arm(32) unfortunately.
> >
> > Then you could at least use standard debugging features :
> >
> > CONFIG_SLAB=y
> > CONFIG_SLABINFO=y
> > CONFIG_DEBUG_SLAB=y
> > CONFIG_DEBUG_SLAB_LEAK=y
> >
> > (Or equivalent SLUB options)
> >
> > and
> >
> > CONFIG_DEBUG_PAGEALLOC=y
> >
> > (If arm(32) has CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y)
> 
> I tried with those enabled and while toggling power on the Bluetooth
> interface I usually get this after a few iterations:
> kernel: Bluetooth: Unable to push skb to HCI core(-6)

Well, this code seems to be quite buggy.

I do not have time to audit it, but 5 minutes are enough to spot 2
issues.

skb, once given to another queue/layer should not be accessed anymore.

diff --git a/drivers/bluetooth/btwilink.c b/drivers/bluetooth/btwilink.c
index 24a652f9252b..2d3092aa6cfe 100644
--- a/drivers/bluetooth/btwilink.c
+++ b/drivers/bluetooth/btwilink.c
@@ -98,6 +98,7 @@ static void st_reg_completion_cb(void *priv_data, char data)
 static long st_receive(void *priv_data, struct sk_buff *skb)
 {
struct ti_st *lhst = priv_data;
+   unsigned int len;
int err;
 
if (!skb)
@@ -109,13 +110,14 @@ static long st_receive(void *priv_data, struct sk_buff 
*skb)
}
 
/* Forward skb to HCI core layer */
+   len = skb->len;
err = hci_recv_frame(lhst->hdev, skb);
if (err < 0) {
BT_ERR("Unable to push skb to HCI core(%d)", err);
return err;
}
 
-   lhst->hdev->stat.byte_rx += skb->len;
+   lhst->hdev->stat.byte_rx += len;
 
return 0;
 }
@@ -245,6 +247,7 @@ static int ti_st_send_frame(struct hci_dev *hdev, struct 
sk_buff *skb)
 {
struct ti_st *hst;
long len;
+   u8 pkt_type;
 
hst = hci_get_drvdata(hdev);
 
@@ -258,6 +261,7 @@ static int ti_st_send_frame(struct hci_dev *hdev, struct 
sk_buff *skb)
 * Freeing skb memory is taken care in shared transport layer,
 * so don't free skb memory here.
 */
+   pkt_type = hci_skb_pkt_type(skb);
len = hst->st_write(skb);
if (len < 0) {
kfree_skb(skb);
@@ -268,7 +272,7 @@ static int ti_st_send_frame(struct hci_dev *hdev, struct 
sk_buff *skb)
 
/* ST accepted our skb. So, Go ahead and do rest */
hdev->stat.byte_tx += len;
-   ti_st_tx_complete(hst, hci_skb_pkt_type(skb));
+   ti_st_tx_complete(hst, pkt_type);
 
return 0;
 }





ethtool NFC/ntuple API questions

2016-01-20 Thread Edward Cree
I'm looking into adding IPv6 support to the ethtool flow steering API.  But,
I don't know "the unfortunate history of and subtle differences between the
RX n-tuple versus RX NFC commands".  In particular, would I need to add IPv6
support to both of them, or only one?  If one would be sufficient, which one
is preferred?
Also, is it necessary to duplicate the profusion of variants that the IPv4
flow specs have (3x struct ethtool_tcpip4_spec, 2x struct
ethtool_ah_espip4_spec, and struct ethtool_usrip4_spec), or should I just
make one struct that contains all the fields from those (I would say "the
union of their fields", but that might be confusing), and rely on flow_type
to indicate which fields are meaningful?
And, what exactly are the hdata fields in ethtool_flow_union and the
anonymous union in ethtool_rx_ntuple_flow_spec (they're not documented) and
why are they different sizes?

-Ed


Re: [PATCH] net: Fix potential NULL pointer dereference in __skb_try_recv_datagram

2016-01-20 Thread Jacob Siverskog
On Wed, Jan 20, 2016 at 4:48 PM, Eric Dumazet  wrote:
> On Wed, 2016-01-20 at 16:06 +0100, Jacob Siverskog wrote:
>> On Tue, Jan 5, 2016 at 3:39 PM, Eric Dumazet  wrote:
>> > On Tue, 2016-01-05 at 15:34 +0100, Jacob Siverskog wrote:
>> >> On Tue, Jan 5, 2016 at 3:14 PM, Eric Dumazet  
>> >> wrote:
>> >
>> >> >
>> >> > You might build a kernel with KASAN support to get maybe more chances to
>> >> > trigger the bug.
>> >> >
>> >> > ( https://www.kernel.org/doc/Documentation/kasan.txt )
>> >> >
>> >>
>> >> Ah. Doesn't seem to be supported on arm(32) unfortunately.
>> >
>> > Then you could at least use standard debugging features :
>> >
>> > CONFIG_SLAB=y
>> > CONFIG_SLABINFO=y
>> > CONFIG_DEBUG_SLAB=y
>> > CONFIG_DEBUG_SLAB_LEAK=y
>> >
>> > (Or equivalent SLUB options)
>> >
>> > and
>> >
>> > CONFIG_DEBUG_PAGEALLOC=y
>> >
>> > (If arm(32) has CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y)
>>
>> I tried with those enabled and while toggling power on the Bluetooth
>> interface I usually get this after a few iterations:
>> kernel: Bluetooth: Unable to push skb to HCI core(-6)
>
> Well, this code seems to be quite buggy.
>
> I do not have time to audit it, but 5 minutes are enough to spot 2
> issues.
>
> skb, once given to another queue/layer should not be accessed anymore.
>

Ok. Unfortunately I still see the slab corruption even with your changes.


Re: [PATCH v3 4/4] net: mvneta: update clocks property and document additional clock-names

2016-01-20 Thread Rob Herring
On Wed, Jan 20, 2016 at 07:27:25PM +0800, Jisheng Zhang wrote:
> Signed-off-by: Jisheng Zhang 
> ---
>  Documentation/devicetree/bindings/net/marvell-armada-370-neta.txt | 7 ++-
>  1 file changed, 6 insertions(+), 1 deletion(-)

Acked-by: Rob Herring 


[PATCH v3 14/16] i40iw: virtual channel handling files

2016-01-20 Thread Faisal Latif
i40iw_vf.[ch] and i40iw_virtchnl.[ch] are used for virtual
channel support for iWARP VF module.

Changes since v2:
code cleanup

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_vf.c   |  85 +++
 drivers/infiniband/hw/i40iw/i40iw_vf.h   |  62 +++
 drivers/infiniband/hw/i40iw/i40iw_virtchnl.c | 748 +++
 drivers/infiniband/hw/i40iw/i40iw_virtchnl.h | 124 +
 4 files changed, 1019 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_vf.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_vf.h
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_virtchnl.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_virtchnl.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.c 
b/drivers/infiniband/hw/i40iw/i40iw_vf.c
new file mode 100644
index 000..cb0f183
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_vf.c
@@ -0,0 +1,85 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#include "i40iw_osdep.h"
+#include "i40iw_register.h"
+#include "i40iw_status.h"
+#include "i40iw_hmc.h"
+#include "i40iw_d.h"
+#include "i40iw_type.h"
+#include "i40iw_p.h"
+#include "i40iw_vf.h"
+
+/**
+ * i40iw_manage_vf_pble_bp - manage vf pble
+ * @cqp: cqp for cqp' sq wqe
+ * @info: pble info
+ * @scratch: pointer for completion
+ * @post_sq: to post and ring
+ */
+enum i40iw_status_code i40iw_manage_vf_pble_bp(struct i40iw_sc_cqp *cqp,
+  struct i40iw_manage_vf_pble_info 
*info,
+  u64 scratch,
+  bool post_sq)
+{
+   u64 *wqe;
+   u64 temp, header, pd_pl_pba = 0;
+
+   wqe = i40iw_sc_cqp_get_next_send_wqe(cqp, scratch);
+   if (!wqe)
+   return I40IW_ERR_RING_FULL;
+
+   temp = LS_64(info->pd_entry_cnt, I40IW_CQPSQ_MVPBP_PD_ENTRY_CNT) |
+   LS_64(info->first_pd_index, I40IW_CQPSQ_MVPBP_FIRST_PD_INX) |
+   LS_64(info->sd_index, I40IW_CQPSQ_MVPBP_SD_INX);
+   set_64bit_val(wqe, 16, temp);
+
+   header = LS_64((info->inv_pd_ent ? 1 : 0), 
I40IW_CQPSQ_MVPBP_INV_PD_ENT) |
+   LS_64(I40IW_CQP_OP_MANAGE_VF_PBLE_BP, I40IW_CQPSQ_OPCODE) |
+   LS_64(cqp->polarity, I40IW_CQPSQ_WQEVALID);
+   set_64bit_val(wqe, 24, header);
+
+   pd_pl_pba = LS_64(info->pd_pl_pba >> 3, I40IW_CQPSQ_MVPBP_PD_PLPBA);
+   set_64bit_val(wqe, 32, pd_pl_pba);
+
+   i40iw_debug_buf(cqp->dev, I40IW_DEBUG_WQE, "MANAGE VF_PBLE_BP WQE", 
wqe, I40IW_CQP_WQE_SIZE * 8);
+
+   if (post_sq)
+   i40iw_sc_cqp_post_sq(cqp);
+   return 0;
+}
+
+struct i40iw_vf_cqp_ops iw_vf_cqp_ops = {
+   i40iw_manage_vf_pble_bp
+};
diff --git a/drivers/infiniband/hw/i40iw/i40iw_vf.h 
b/drivers/infiniband/hw/i40iw/i40iw_vf.h
new file mode 100644
index 000..f649f3a
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_vf.h
@@ -0,0 +1,62 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this 

[PATCH v3 08/16] i40iw: add hw and utils files

2016-01-20 Thread Faisal Latif
i40iw_hw.c, i40iw_utils.c and i40iw_osdep.h are files to handle
interrupts and processing.

Changes since v1:
Cleanup/removed some macros reported by Christoph Hellwig.

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_hw.c|  730 +
 drivers/infiniband/hw/i40iw/i40iw_osdep.h |  214 +
 drivers/infiniband/hw/i40iw/i40iw_utils.c | 1256 +
 3 files changed, 2200 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_hw.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_osdep.h
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_utils.c

diff --git a/drivers/infiniband/hw/i40iw/i40iw_hw.c 
b/drivers/infiniband/hw/i40iw/i40iw_hw.c
new file mode 100644
index 000..bee9ba6
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_hw.c
@@ -0,0 +1,730 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "i40iw.h"
+
+/**
+ * i40iw_initialize_hw_resources - initialize hw resource during open
+ * @iwdev: iwarp device
+ */
+u32 i40iw_initialize_hw_resources(struct i40iw_device *iwdev)
+{
+   unsigned long num_pds;
+   u32 resources_size;
+   u32 max_mr;
+   u32 max_qp;
+   u32 max_cq;
+   u32 arp_table_size;
+   u32 mrdrvbits;
+   void *resource_ptr;
+
+   max_qp = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_QP].cnt;
+   max_cq = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_CQ].cnt;
+   max_mr = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_MR].cnt;
+   arp_table_size = iwdev->sc_dev.hmc_info->hmc_obj[I40IW_HMC_IW_ARP].cnt;
+   iwdev->max_cqe = 0xF;
+   num_pds = max_qp * 4;
+   resources_size = sizeof(struct i40iw_arp_entry) * arp_table_size;
+   resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_qp);
+   resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_mr);
+   resources_size += sizeof(unsigned long) * BITS_TO_LONGS(max_cq);
+   resources_size += sizeof(unsigned long) * BITS_TO_LONGS(num_pds);
+   resources_size += sizeof(unsigned long) * BITS_TO_LONGS(arp_table_size);
+   resources_size += sizeof(struct i40iw_qp **) * max_qp;
+   iwdev->mem_resources = kzalloc(resources_size, GFP_KERNEL);
+
+   if (!iwdev->mem_resources)
+   return -ENOMEM;
+
+   iwdev->max_qp = max_qp;
+   iwdev->max_mr = max_mr;
+   iwdev->max_cq = max_cq;
+   iwdev->max_pd = num_pds;
+   iwdev->arp_table_size = arp_table_size;
+   iwdev->arp_table = (struct i40iw_arp_entry *)iwdev->mem_resources;
+   resource_ptr = iwdev->mem_resources + (sizeof(struct i40iw_arp_entry) * 
arp_table_size);
+
+   iwdev->device_cap_flags = IB_DEVICE_LOCAL_DMA_LKEY |
+   IB_DEVICE_MEM_WINDOW | IB_DEVICE_MEM_MGT_EXTENSIONS;
+
+   iwdev->allocated_qps = resource_ptr;
+   iwdev->allocated_cqs = >allocated_qps[BITS_TO_LONGS(max_qp)];
+   iwdev->allocated_mrs = >allocated_cqs[BITS_TO_LONGS(max_cq)];
+   iwdev->allocated_pds = >allocated_mrs[BITS_TO_LONGS(max_mr)];
+   iwdev->allocated_arps = >allocated_pds[BITS_TO_LONGS(num_pds)];
+   iwdev->qp_table = (struct i40iw_qp 
**)(>allocated_arps[BITS_TO_LONGS(arp_table_size)]);
+   set_bit(0, iwdev->allocated_mrs);
+   

[PATCH v3 09/16] i40iw: add files for iwarp interface

2016-01-20 Thread Faisal Latif
i40iw_verbs.[ch] are to handle iwarp interface.

Changes since v2:
Made infiniband interface changes for 4.5
removed i40iw_reg_phys_mr() for 4.5
applied to i40iw_get_dma_mr() the same changes Christoph Hellwig
made for nes.

Changes since v1:
Following modification based on Christoph Hellwig's feedback
 -remove kmap() calls and moved to i40iw_cm.c.
 -cleanup some of casts

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_ucontext.h |  107 ++
 drivers/infiniband/hw/i40iw/i40iw_verbs.c| 2434 ++
 drivers/infiniband/hw/i40iw/i40iw_verbs.h|  173 ++
 3 files changed, 2714 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_ucontext.h
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_verbs.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_verbs.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_ucontext.h 
b/drivers/infiniband/hw/i40iw/i40iw_ucontext.h
new file mode 100644
index 000..12acd68
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_ucontext.h
@@ -0,0 +1,107 @@
+/*
+ * Copyright (c) 2006 - 2016 Intel Corporation.  All rights reserved.
+ * Copyright (c) 2005 Topspin Communications.  All rights reserved.
+ * Copyright (c) 2005 Cisco Systems.  All rights reserved.
+ * Copyright (c) 2005 Open Grid Computing, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ *
+ */
+
+#ifndef I40IW_USER_CONTEXT_H
+#define I40IW_USER_CONTEXT_H
+
+#include 
+
+/* ABI revision numbers for the user-space library and the kernel driver. */
+#define I40IW_ABI_USERSPACE_VER 4
+#define I40IW_ABI_KERNEL_VER    4
+struct i40iw_alloc_ucontext_req { /* 8 bytes: 4 reserved, 1 version byte, 3 reserved */
+   __u32 reserved32;
+   __u8 userspace_ver; /* NOTE(review): presumably checked against I40IW_ABI_USERSPACE_VER - confirm in i40iw_verbs.c */
+   __u8 reserved8[3];
+};
+
+struct i40iw_alloc_ucontext_resp {
+   __u32 max_pds;  /* maximum pds allowed for this user process */
+   __u32 max_qps;  /* maximum qps allowed for this user process */
+   __u32 wq_size;  /* size of the WQs (sq+rq) allocated to the mmapped area */
+   __u8 kernel_ver;
+   __u8 reserved[3];
+};
+
+struct i40iw_alloc_pd_resp { /* 8 bytes: pd_id followed by 4 reserved bytes */
+   __u32 pd_id;
+   __u8 reserved[4];
+};
+
+struct i40iw_create_cq_req { /* two 64-bit fields; NOTE(review): presumably user virtual addresses - confirm */
+   __u64 user_cq_buffer;
+   __u64 user_shadow_area;
+};
+
+struct i40iw_create_qp_req { /* four 64-bit fields, 32 bytes */
+   __u64 user_wqe_buffers;
+   __u64 user_compl_ctx;
+
+   /* UDA QP PHB */
+   __u64 user_sq_phb;  /* place for VA of the sq phb buff */
+   __u64 user_rq_phb;  /* place for VA of the rq phb buff */
+};
+
+/* Kinds of memory registration: plain memory, QP or CQ. */
+enum i40iw_memreg_type {
+	IW_MEMREG_TYPE_MEM = 0x0000,
+	IW_MEMREG_TYPE_QP = 0x0001,
+	IW_MEMREG_TYPE_CQ = 0x0002,
+};
+
+struct i40iw_mem_reg_req { /* four __u16 fields, 8 bytes */
+   __u16 reg_type; /* Memory, QP or CQ (see enum i40iw_memreg_type) */
+   __u16 cq_pages;
+   __u16 rq_pages;
+   __u16 sq_pages;
+};
+
+struct i40iw_create_cq_resp { /* four __u32 fields, 16 bytes */
+   __u32 cq_id;
+   __u32 cq_size;
+   __u32 mmap_db_index;
+   __u32 reserved;
+};
+
+struct i40iw_create_qp_resp { /* 20 bytes: four __u32, one __u16, two __u8 */
+   __u32 qp_id;
+   __u32 actual_sq_size;
+   __u32 actual_rq_size;
+   __u32 i40iw_drv_opt;
+   __u16 push_idx;
+   __u8  lsmm;
+   __u8  rsvd2;
+};
+
+#endif
diff --git a/drivers/infiniband/hw/i40iw/i40iw_verbs.c 
b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
new file mode 100644
index 000..c5c9805
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_verbs.c
@@ -0,0 +1,2434 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All 

[PATCH v3 12/16] i40iw: add X722 register file

2016-01-20 Thread Faisal Latif
X722 Hardware registers defines for iWARP component.

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_register.h | 1030 ++
 1 file changed, 1030 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_register.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_register.h 
b/drivers/infiniband/hw/i40iw/i40iw_register.h
new file mode 100644
index 000..5776818
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_register.h
@@ -0,0 +1,1030 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#ifndef I40IW_REGISTER_H
+#define I40IW_REGISTER_H
+
+/*
+ * X722 register offsets and field shift/mask pairs used by the iWARP
+ * component. Every #define is kept on a single line: a macro body that
+ * wraps onto the next line without a continuation does not compile.
+ */
+#define I40E_GLGEN_STAT   0x000B612C /* Reset: POR */
+
+#define I40E_PFHMC_PDINV   0x000C0300 /* Reset: PFR */
+#define I40E_PFHMC_PDINV_PMSDIDX_SHIFT 0
+#define I40E_PFHMC_PDINV_PMSDIDX_MASK  (0xFFF << I40E_PFHMC_PDINV_PMSDIDX_SHIFT)
+#define I40E_PFHMC_PDINV_PMPDIDX_SHIFT 16
+#define I40E_PFHMC_PDINV_PMPDIDX_MASK  (0x1FF << I40E_PFHMC_PDINV_PMPDIDX_SHIFT)
+#define I40E_PFHMC_SDCMD_PMSDWR_SHIFT  31
+#define I40E_PFHMC_SDCMD_PMSDWR_MASK   (0x1 << I40E_PFHMC_SDCMD_PMSDWR_SHIFT)
+#define I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT   0
+#define I40E_PFHMC_SDDATALOW_PMSDVALID_MASK    (0x1 << I40E_PFHMC_SDDATALOW_PMSDVALID_SHIFT)
+#define I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT    1
+#define I40E_PFHMC_SDDATALOW_PMSDTYPE_MASK     (0x1 << I40E_PFHMC_SDDATALOW_PMSDTYPE_SHIFT)
+#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT 2
+#define I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_MASK  (0x3FF << I40E_PFHMC_SDDATALOW_PMSDBPCOUNT_SHIFT)
+
+#define I40E_PFINT_DYN_CTLN(_INTPF) (0x00034800 + ((_INTPF) * 4)) /* _i=0...511 */ /* Reset: PFR */
+#define I40E_PFINT_DYN_CTLN_INTENA_SHIFT   0
+#define I40E_PFINT_DYN_CTLN_INTENA_MASK    (0x1 << I40E_PFINT_DYN_CTLN_INTENA_SHIFT)
+#define I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT 1
+#define I40E_PFINT_DYN_CTLN_CLEARPBA_MASK  (0x1 << I40E_PFINT_DYN_CTLN_CLEARPBA_SHIFT)
+#define I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT 3
+#define I40E_PFINT_DYN_CTLN_ITR_INDX_MASK  (0x3 << I40E_PFINT_DYN_CTLN_ITR_INDX_SHIFT)
+
+#define I40E_VFINT_DYN_CTLN1(_INTVF)   (0x3800 + ((_INTVF) * 4)) /* _i=0...15 */ /* Reset: VFR */
+#define I40E_GLHMC_VFPDINV(_i)   (0x000C8300 + ((_i) * 4)) /* _i=0...31 */ /* Reset: CORER */
+
+#define I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT 15
+#define I40E_PFHMC_PDINV_PMSDPARTSEL_MASK  (0x1 << I40E_PFHMC_PDINV_PMSDPARTSEL_SHIFT)
+#define I40E_GLPCI_LBARCTRL    0x000BE484 /* Reset: POR */
+#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT    4
+#define I40E_GLPCI_LBARCTRL_PE_DB_SIZE_MASK (0x3 << I40E_GLPCI_LBARCTRL_PE_DB_SIZE_SHIFT)
+#define I40E_GLPCI_DREVID  0x0009C480 /* Reset: PCIR */
+#define I40E_GLPCI_DREVID_DEFAULT_REVID_SHIFT 0
+#define I40E_GLPCI_DREVID_DEFAULT_REVID_MASK 0xFF
+
+/*
+ * NOTE(review): both masks had been reduced to a bare "0x" in this copy.
+ * 0xFFFFFFFF matches the full-width field at shift 0 used by the i40e
+ * register definitions - confirm against the posted patch.
+ */
+#define I40E_PFPE_AEQALLOC   0x00131180 /* Reset: PFR */
+#define I40E_PFPE_AEQALLOC_AECOUNT_SHIFT 0
+#define I40E_PFPE_AEQALLOC_AECOUNT_MASK  (0xFFFFFFFF << I40E_PFPE_AEQALLOC_AECOUNT_SHIFT)
+#define I40E_PFPE_CCQPHIGH  0x8200 /* Reset: PFR */
+#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT 0
+#define I40E_PFPE_CCQPHIGH_PECCQPHIGH_MASK  (0xFFFFFFFF << I40E_PFPE_CCQPHIGH_PECCQPHIGH_SHIFT)
+#define 

[PATCH v3 10/16] i40iw: add file to handle cqp calls

2016-01-20 Thread Faisal Latif
i40iw_ctrl.c provides for hardware wqe support and cqp.

Changes since v2:
cleanup coccinelle error reported by Julia Lawall

Changes since v1:
Following modification based on Christoph Hellwig's review
 -remove unnecessary casts

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_ctrl.c | 4743 ++
 1 file changed, 4743 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_ctrl.c

diff --git a/drivers/infiniband/hw/i40iw/i40iw_ctrl.c 
b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
new file mode 100644
index 000..f05802b
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_ctrl.c
@@ -0,0 +1,4743 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#include "i40iw_osdep.h"
+#include "i40iw_register.h"
+#include "i40iw_status.h"
+#include "i40iw_hmc.h"
+
+#include "i40iw_d.h"
+#include "i40iw_type.h"
+#include "i40iw_p.h"
+#include "i40iw_vf.h"
+#include "i40iw_virtchnl.h"
+
+/**
+ * i40iw_insert_wqe_hdr - write wqe header
+ * @wqe: cqp wqe for header
+ * @header: header for the cqp wqe
+ *
+ * Issues a write barrier and only then stores @header through
+ * set_64bit_val() with offset argument 24, so the header is written
+ * after every earlier store to the WQE.
+ */
+static inline void i40iw_insert_wqe_hdr(u64 *wqe, u64 header)
+{
+   wmb();/* make sure WQE is populated before polarity is set */
+   set_64bit_val(wqe, 24, header);
+}
+
+/**
+ * i40iw_get_cqp_reg_info - read the cqp tail register and split its fields
+ * @cqp: struct for cqp hw
+ * @val: receives the raw cqp tail register value
+ * @tail: receives the WQTAIL field extracted from @val
+ * @error: receives the CQP_OP_ERR field extracted from @val
+ *
+ * The PF or VF flavour of the register is chosen from cqp->dev->is_pf.
+ */
+static inline void i40iw_get_cqp_reg_info(struct i40iw_sc_cqp *cqp,
+					  u32 *val,
+					  u32 *tail,
+					  u32 *error)
+{
+	if (!cqp->dev->is_pf) {
+		/* virtual function: VF copy of the tail register */
+		*val = i40iw_rd32(cqp->dev->hw, I40E_VFPE_CQPTAIL1);
+		*tail = RS_32(*val, I40E_VFPE_CQPTAIL_WQTAIL);
+		*error = RS_32(*val, I40E_VFPE_CQPTAIL_CQP_OP_ERR);
+		return;
+	}
+
+	/* physical function */
+	*val = i40iw_rd32(cqp->dev->hw, I40E_PFPE_CQPTAIL);
+	*tail = RS_32(*val, I40E_PFPE_CQPTAIL_WQTAIL);
+	*error = RS_32(*val, I40E_PFPE_CQPTAIL_CQP_OP_ERR);
+}
+
+/**
+ * i40iw_cqp_poll_registers - poll cqp registers
+ * @cqp: struct for cqp hw
+ * @tail:wqtail register value
+ * @count: how many times to try for completion
+ */
+static enum i40iw_status_code i40iw_cqp_poll_registers(
+   struct i40iw_sc_cqp *cqp,
+   u32 tail,
+   u32 count)
+{
+   u32 i = 0;
+   u32 newtail, error, val;
+
+   while (i < count) {
+   i++;
+   i40iw_get_cqp_reg_info(cqp, , , );
+   if (error) {
+   error = (cqp->dev->is_pf) ?
+i40iw_rd32(cqp->dev->hw, 
I40E_PFPE_CQPERRCODES) :
+i40iw_rd32(cqp->dev->hw, 
I40E_VFPE_CQPERRCODES1);
+   return I40IW_ERR_CQP_COMPL_ERROR;
+   }
+   if (newtail != tail) {
+   /* SUCCESS */
+   I40IW_RING_MOVE_TAIL(cqp->sq_ring);
+   return 0;
+   }
+   udelay(I40IW_SLEEP_COUNT);
+   }
+   return 

[PATCH v3 16/16] i40iw: changes for build of i40iw module

2016-01-20 Thread Faisal Latif
MAINTAINERS, Kconfig, and Makefile to build i40iw module

Signed-off-by: Faisal Latif 
---
 MAINTAINERS| 10 ++
 drivers/infiniband/Kconfig |  1 +
 drivers/infiniband/hw/Makefile |  1 +
 3 files changed, 12 insertions(+)

diff --git a/MAINTAINERS b/MAINTAINERS
index 978526c..480e0a2 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -5601,6 +5601,16 @@ F:   Documentation/networking/i40evf.txt
 F: drivers/net/ethernet/intel/
 F: drivers/net/ethernet/intel/*/
 
+INTEL RDMA RNIC DRIVER
+M: Faisal Latif 
+R: Chien Tin Tung 
+R: Mustafa Ismail 
+R: Shiraz Saleem 
+R: Tatyana Nikolova 
+L: linux-r...@vger.kernel.org
+S: Supported
+F: drivers/infiniband/hw/i40iw/
+
 INTEL-MID GPIO DRIVER
 M: David Cohen 
 L: linux-g...@vger.kernel.org
diff --git a/drivers/infiniband/Kconfig b/drivers/infiniband/Kconfig
index 8a8440c..0434760 100644
--- a/drivers/infiniband/Kconfig
+++ b/drivers/infiniband/Kconfig
@@ -68,6 +68,7 @@ source "drivers/infiniband/hw/mthca/Kconfig"
 source "drivers/infiniband/hw/qib/Kconfig"
 source "drivers/infiniband/hw/cxgb3/Kconfig"
 source "drivers/infiniband/hw/cxgb4/Kconfig"
+source "drivers/infiniband/hw/i40iw/Kconfig"
 source "drivers/infiniband/hw/mlx4/Kconfig"
 source "drivers/infiniband/hw/mlx5/Kconfig"
 source "drivers/infiniband/hw/nes/Kconfig"
diff --git a/drivers/infiniband/hw/Makefile b/drivers/infiniband/hw/Makefile
index aded2a5..c7ad0a4 100644
--- a/drivers/infiniband/hw/Makefile
+++ b/drivers/infiniband/hw/Makefile
@@ -2,6 +2,7 @@ obj-$(CONFIG_INFINIBAND_MTHCA)  += mthca/
 obj-$(CONFIG_INFINIBAND_QIB)   += qib/
 obj-$(CONFIG_INFINIBAND_CXGB3) += cxgb3/
 obj-$(CONFIG_INFINIBAND_CXGB4) += cxgb4/
+obj-$(CONFIG_INFINIBAND_I40IW) += i40iw/
 obj-$(CONFIG_MLX4_INFINIBAND)  += mlx4/
 obj-$(CONFIG_MLX5_INFINIBAND)  += mlx5/
 obj-$(CONFIG_INFINIBAND_NES)   += nes/
-- 
2.5.3



[PATCH v3 05/16] i40iw: add puda code

2016-01-20 Thread Faisal Latif
i40iw_puda.[ch] are files to handle iwarp connection packets as
well as exception packets over multiple privilege mode uda queues.

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_puda.c | 1436 ++
 drivers/infiniband/hw/i40iw/i40iw_puda.h |  183 
 2 files changed, 1619 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_puda.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_puda.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_puda.c 
b/drivers/infiniband/hw/i40iw/i40iw_puda.c
new file mode 100644
index 000..ae9971f
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_puda.c
@@ -0,0 +1,1436 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#include "i40iw_osdep.h"
+#include "i40iw_register.h"
+#include "i40iw_status.h"
+#include "i40iw_hmc.h"
+
+#include "i40iw_d.h"
+#include "i40iw_type.h"
+#include "i40iw_p.h"
+#include "i40iw_puda.h"
+
+static void i40iw_ieq_receive(struct i40iw_sc_dev *dev,
+ struct i40iw_puda_buf *buf);
+static void i40iw_ieq_tx_compl(struct i40iw_sc_dev *dev, void *sqwrid);
+static void i40iw_ilq_putback_rcvbuf(struct i40iw_sc_qp *qp, u32 wqe_idx);
+static enum i40iw_status_code i40iw_puda_replenish_rq(struct i40iw_puda_rsrc
+ *rsrc, bool initial);
+/**
+ * i40iw_puda_get_listbuf - get buffer from puda list
+ * @list: list to use for buffers (ILQ or IEQ)
+ *
+ * Unlinks the first buffer from @list and returns it, or returns NULL
+ * when the list is empty. No locking is done here.
+ */
+static struct i40iw_puda_buf *i40iw_puda_get_listbuf(struct list_head *list)
+{
+	struct i40iw_puda_buf *buf;
+
+	if (list_empty(list))
+		return NULL;
+
+	/* resolve the entry through its list member, not by casting the node */
+	buf = list_first_entry(list, struct i40iw_puda_buf, list);
+	list_del(&buf->list);
+	return buf;
+}
+
+/**
+ * i40iw_puda_get_bufpool - take a buffer from the resource's pool
+ * @rsrc: resource to use for buffer
+ *
+ * Runs under rsrc->bufpool_lock with interrupts saved. On success
+ * avail_buf_count is decremented; when the pool is empty
+ * stats_buf_alloc_fail is incremented and NULL is returned.
+ */
+struct i40iw_puda_buf *i40iw_puda_get_bufpool(struct i40iw_puda_rsrc *rsrc)
+{
+	struct i40iw_puda_buf *buf;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
+	buf = i40iw_puda_get_listbuf(&rsrc->bufpool);
+	if (buf)
+		rsrc->avail_buf_count--;
+	else
+		rsrc->stats_buf_alloc_fail++;
+	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
+	return buf;
+}
+
+/**
+ * i40iw_puda_ret_bufpool - return buffer to rsrc list
+ * @rsrc: resource to use for buffer
+ * @buf: buffer to return to resource
+ *
+ * Adds @buf back to rsrc->bufpool and bumps avail_buf_count, both under
+ * rsrc->bufpool_lock.
+ */
+void i40iw_puda_ret_bufpool(struct i40iw_puda_rsrc *rsrc,
+			    struct i40iw_puda_buf *buf)
+{
+	unsigned long flags;
+
+	spin_lock_irqsave(&rsrc->bufpool_lock, flags);
+	list_add(&buf->list, &rsrc->bufpool);
+	/*
+	 * i40iw_puda_get_bufpool() decrements this counter while holding the
+	 * lock; incrementing it outside the lock would race with that.
+	 */
+	rsrc->avail_buf_count++;
+	spin_unlock_irqrestore(&rsrc->bufpool_lock, flags);
+}
+
+/**
+ * i40iw_puda_post_recvbuf - set wqe for rcv buffer
+ * @rsrc: resource ptr
+ * @wqe_idx: wqe index to use
+ * @buf: puda buffer for rcv q
+ * @initial: flag if during init time
+ */
+static void i40iw_puda_post_recvbuf(struct i40iw_puda_rsrc *rsrc, u32 wqe_idx,
+   struct i40iw_puda_buf *buf, bool initial)
+{
+   u64 *wqe;
+   struct i40iw_sc_qp *qp = >qp;
+   u64 offset24 = 0;
+
+   qp->qp_uk.rq_wrid_array[wqe_idx] = (uintptr_t)buf;
+   

[PATCH v3 03/16] i40iw: add main, hdr, status

2016-01-20 Thread Faisal Latif
i40iw_main.c contains routines for i40e <=> i40iw interface and setup.
i40iw.h is header file for main device data structures.
i40iw_status.h is for return status codes.

Changes from v2:
more cast improvement
fixed timing issue during unload
added parameter change call from i40e

Changes from v1:
improved casting issues
do not print error using pr_err
change from bits to bool in i40iw_cqp_request{}

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw.h|  569 
 drivers/infiniband/hw/i40iw/i40iw_main.c   | 1930 
 drivers/infiniband/hw/i40iw/i40iw_status.h |  100 ++
 3 files changed, 2599 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw.h
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_main.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_status.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw.h 
b/drivers/infiniband/hw/i40iw/i40iw.h
new file mode 100644
index 000..19f6651
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw.h
@@ -0,0 +1,569 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#ifndef I40IW_IW_H
+#define I40IW_IW_H
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "i40iw_status.h"
+#include "i40iw_osdep.h"
+#include "i40iw_d.h"
+#include "i40iw_hmc.h"
+
+#include 
+#include "i40iw_type.h"
+#include "i40iw_p.h"
+#include "i40iw_ucontext.h"
+#include "i40iw_pble.h"
+#include "i40iw_verbs.h"
+#include "i40iw_cm.h"
+#include "i40iw_user.h"
+#include "i40iw_puda.h"
+
+/*
+ * Driver-wide constants.
+ *
+ * NOTE(review): this copy of the patch lost runs of repeated characters,
+ * which fused several macro names to their values and emptied some
+ * constants. Names and values are separated again below. Entries carrying
+ * a NOTE(review) comment must be checked against the posted patch.
+ */
+#define I40IW_FW_VERSION  2
+#define I40IW_HW_VERSION  2
+
+#define I40IW_ARP_ADD     1
+#define I40IW_ARP_DELETE  2
+#define I40IW_ARP_RESOLVE 3
+
+#define I40IW_MACIP_ADD     1
+#define I40IW_MACIP_DELETE  2
+
+#define IW_CCQ_SIZE         (I40IW_CQP_SW_SQSIZE_2048 + 1)
+#define IW_CEQ_SIZE         2048
+#define IW_AEQ_SIZE         2048
+
+#define RX_BUF_SIZE            (1536 + 8)
+#define IW_REG0_SIZE           (4 * 1024)
+#define IW_TX_TIMEOUT          (6 * HZ)
+#define IW_FIRST_QPN           1
+#define IW_SW_CONTEXT_ALIGN    1024
+
+#define MAX_DPC_ITERATIONS     128
+
+#define I40IW_EVENT_TIMEOUT        10 /* NOTE(review): digits may have been dropped (possibly 100000) - confirm */
+#define I40IW_VCHNL_EVENT_TIMEOUT  10 /* NOTE(review): digits may have been dropped (possibly 100000) - confirm */
+
+#define I40IW_NO_VLAN          0xffff /* NOTE(review): value was a bare "0x" - confirm */
+#define I40IW_NO_QSET          0xffff /* NOTE(review): value was a bare "0x" - confirm */
+
+/* access to mcast filter list */
+#define IW_ADD_MCAST false
+#define IW_DEL_MCAST true
+
+#define I40IW_DRV_OPT_ENABLE_MPA_VER_0     0x0001
+#define I40IW_DRV_OPT_DISABLE_MPA_CRC      0x0002
+#define I40IW_DRV_OPT_DISABLE_FIRST_WRITE  0x0004
+#define I40IW_DRV_OPT_DISABLE_INTF         0x0008
+#define I40IW_DRV_OPT_ENABLE_MSI           0x0010
+#define I40IW_DRV_OPT_DUAL_LOGICAL_PORT    0x0020
+#define I40IW_DRV_OPT_NO_INLINE_DATA       0x0080
+#define I40IW_DRV_OPT_DISABLE_INT_MOD      0x0100
+#define I40IW_DRV_OPT_DISABLE_VIRT_WQ      0x0200
+#define I40IW_DRV_OPT_ENABLE_PAU           0x0400
+#define I40IW_DRV_OPT_MCAST_LOGPORT_MAP    0x0800
+
+#define IW_HMC_OBJ_TYPE_NUM ARRAY_SIZE(iw_hmc_obj_types)
+#define IW_CFG_FPM_QP_COUNT    32768
+
+#define 

[PATCH v3 15/16] i40iw: Kconfig and Makefile for iwarp module

2016-01-20 Thread Faisal Latif
Kconfig and Makefile needed to build iwarp module.

Changes since v2:
moved from Kbuild to Makefile

Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/Kconfig  | 7 +++
 drivers/infiniband/hw/i40iw/Makefile | 9 +
 2 files changed, 16 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/Kconfig
 create mode 100644 drivers/infiniband/hw/i40iw/Makefile

diff --git a/drivers/infiniband/hw/i40iw/Kconfig 
b/drivers/infiniband/hw/i40iw/Kconfig
new file mode 100644
index 000..6e7d27a
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/Kconfig
@@ -0,0 +1,7 @@
+config INFINIBAND_I40IW
+   tristate "Intel(R) Ethernet X722 iWARP Driver"
+   depends on INET && I40E
+   select GENERIC_ALLOCATOR
+   ---help---
+   Intel(R) Ethernet X722 iWARP Driver
+   INET && I40IW && INFINIBAND && I40E
diff --git a/drivers/infiniband/hw/i40iw/Makefile 
b/drivers/infiniband/hw/i40iw/Makefile
new file mode 100644
index 000..90068c0
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/Makefile
@@ -0,0 +1,9 @@
+ccflags-y :=  -Idrivers/net/ethernet/intel/i40e
+
+obj-$(CONFIG_INFINIBAND_I40IW) += i40iw.o
+
+i40iw-objs :=\
+   i40iw_cm.o i40iw_ctrl.o \
+   i40iw_hmc.o i40iw_hw.o i40iw_main.o  \
+   i40iw_pble.o i40iw_puda.o i40iw_uk.o i40iw_utils.o \
+   i40iw_verbs.o i40iw_virtchnl.o i40iw_vf.o
-- 
2.5.3



[PATCH v3 07/16] i40iw: add hmc resource files

2016-01-20 Thread Faisal Latif
i40iw_hmc.[ch] are to manage hmc for the device.

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_hmc.c | 821 
 drivers/infiniband/hw/i40iw/i40iw_hmc.h | 241 ++
 2 files changed, 1062 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_hmc.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_hmc.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_hmc.c 
b/drivers/infiniband/hw/i40iw/i40iw_hmc.c
new file mode 100644
index 000..5484cbf
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_hmc.c
@@ -0,0 +1,821 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#include "i40iw_osdep.h"
+#include "i40iw_register.h"
+#include "i40iw_status.h"
+#include "i40iw_hmc.h"
+#include "i40iw_d.h"
+#include "i40iw_type.h"
+#include "i40iw_p.h"
+#include "i40iw_vf.h"
+#include "i40iw_virtchnl.h"
+
+/**
+ * i40iw_find_sd_index_limit - finds segment descriptor index limit
+ * @hmc_info: pointer to the HMC configuration information structure
+ * @type: type of HMC resources we're searching
+ * @idx: starting index for the object
+ * @cnt: number of objects we're trying to create
+ * @sd_idx: pointer to return index of the segment descriptor in question
+ * @sd_limit: pointer to return the maximum number of segment descriptors
+ *
+ * This function calculates the segment descriptor index and index limit
+ * for the resource defined by i40iw_hmc_rsrc_type.
+ */
+
+static inline void i40iw_find_sd_index_limit(struct i40iw_hmc_info *hmc_info,
+u32 type,
+u32 idx,
+u32 cnt,
+u32 *sd_idx,
+u32 *sd_limit)
+{
+   u64 fpm_addr, fpm_limit;
+
+   fpm_addr = hmc_info->hmc_obj[(type)].base +
+   hmc_info->hmc_obj[type].size * idx;
+   fpm_limit = fpm_addr + hmc_info->hmc_obj[type].size * cnt;
+   *sd_idx = (u32)(fpm_addr / I40IW_HMC_DIRECT_BP_SIZE);
+   *sd_limit = (u32)((fpm_limit - 1) / I40IW_HMC_DIRECT_BP_SIZE);
+   *sd_limit += 1;
+}
+
+/**
+ * i40iw_find_pd_index_limit - finds page descriptor index limit
+ * @hmc_info: pointer to the HMC configuration information struct
+ * @type: HMC resource type we're examining
+ * @idx: starting index for the object
+ * @cnt: number of objects we're trying to create
+ * @pd_idx: pointer to return page descriptor index
+ * @pd_limit: pointer to return page descriptor index limit
+ *
+ * Calculates the page descriptor index and index limit for the resource
+ * defined by i40iw_hmc_rsrc_type.
+ */
+
+static inline void i40iw_find_pd_index_limit(struct i40iw_hmc_info *hmc_info,
+u32 type,
+u32 idx,
+u32 cnt,
+u32 *pd_idx,
+u32 *pd_limit)
+{
+   u64 fpm_adr, fpm_limit;
+
+   fpm_adr = hmc_info->hmc_obj[type].base +
+   hmc_info->hmc_obj[type].size * idx;
+   fpm_limit = fpm_adr + (hmc_info)->hmc_obj[(type)].size * (cnt);
+   *(pd_idx) = (u32)(fpm_adr / 

[PATCH v3 06/16] i40iw: add pble resource files

2016-01-20 Thread Faisal Latif
i40iw_pble.[ch] to manage pble resource for iwarp clients.

Changes since v2:
remove unnecessary casts

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_pble.c | 618 +++
 drivers/infiniband/hw/i40iw/i40iw_pble.h | 131 +++
 2 files changed, 749 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_pble.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_pble.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_pble.c 
b/drivers/infiniband/hw/i40iw/i40iw_pble.c
new file mode 100644
index 000..ded853d
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_pble.c
@@ -0,0 +1,618 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#include "i40iw_status.h"
+#include "i40iw_osdep.h"
+#include "i40iw_register.h"
+#include "i40iw_hmc.h"
+
+#include "i40iw_d.h"
+#include "i40iw_type.h"
+#include "i40iw_p.h"
+
+#include 
+#include 
+#include 
+#include "i40iw_pble.h"
+#include "i40iw.h"
+
+struct i40iw_device;
+static enum i40iw_status_code add_pble_pool(struct i40iw_sc_dev *dev,
+   struct i40iw_hmc_pble_rsrc 
*pble_rsrc);
+static void i40iw_free_vmalloc_mem(struct i40iw_hw *hw, struct i40iw_chunk 
*chunk);
+
+/**
+ * i40iw_destroy_pble_pool - destroy pool during module unload
+ * @pble_rsrc: pble resources
+ */
+void i40iw_destroy_pble_pool(struct i40iw_sc_dev *dev, struct 
i40iw_hmc_pble_rsrc *pble_rsrc)
+{
+   struct list_head *clist;
+   struct list_head *tlist;
+   struct i40iw_chunk *chunk;
+   struct i40iw_pble_pool *pinfo = &pble_rsrc->pinfo;
+
+   if (pinfo->pool) {
+   list_for_each_safe(clist, tlist, &pinfo->clist) {
+   chunk = list_entry(clist, struct i40iw_chunk, list);
+   if (chunk->type == I40IW_VMALLOC)
+   i40iw_free_vmalloc_mem(dev->hw, chunk);
+   kfree(chunk);
+   }
+   gen_pool_destroy(pinfo->pool);
+   }
+}
+
+/**
+ * i40iw_hmc_init_pble - Initialize pble resources during module load
+ * @dev: i40iw_sc_dev struct
+ * @pble_rsrc: pble resources
+ */
+enum i40iw_status_code i40iw_hmc_init_pble(struct i40iw_sc_dev *dev,
+  struct i40iw_hmc_pble_rsrc 
*pble_rsrc)
+{
+   struct i40iw_hmc_info *hmc_info;
+   u32 fpm_idx = 0;
+
+   hmc_info = dev->hmc_info;
+   pble_rsrc->fpm_base_addr = hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].base;
+   /* Now start the pbles on a 4k boundary */
+   if (pble_rsrc->fpm_base_addr & 0xfff)
+   fpm_idx = (PAGE_SIZE - (pble_rsrc->fpm_base_addr & 0xfff)) >> 3;
+
+   pble_rsrc->unallocated_pble =
+   hmc_info->hmc_obj[I40IW_HMC_IW_PBLE].cnt - fpm_idx;
+   pble_rsrc->next_fpm_addr = pble_rsrc->fpm_base_addr + (fpm_idx << 3);
+
+   pble_rsrc->pinfo.pool_shift = POOL_SHIFT;
+   pble_rsrc->pinfo.pool = gen_pool_create(pble_rsrc->pinfo.pool_shift, 
-1);
+   INIT_LIST_HEAD(&pble_rsrc->pinfo.clist);
+   if (!pble_rsrc->pinfo.pool)
+   goto error;
+
+   if (add_pble_pool(dev, pble_rsrc))
+   goto error;
+
+   return 0;
+
+ error:i40iw_destroy_pble_pool(dev, pble_rsrc);
+   return I40IW_ERR_NO_MEMORY;
+}
+
+/**
+ * get_sd_pd_idx -  Returns sd index, pd index and 

[PATCH v3 11/16] i40iw: add hardware related header files

2016-01-20 Thread Faisal Latif
header files for hardware accesses

Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_d.h| 1713 ++
 drivers/infiniband/hw/i40iw/i40iw_p.h|  106 ++
 drivers/infiniband/hw/i40iw/i40iw_type.h | 1312 +++
 3 files changed, 3131 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_d.h
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_p.h
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_type.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_d.h 
b/drivers/infiniband/hw/i40iw/i40iw_d.h
new file mode 100644
index 000..aab88d6
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_d.h
@@ -0,0 +1,1713 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#ifndef I40IW_D_H
+#define I40IW_D_H
+
+#define I40IW_DB_ADDR_OFFSET    (4 * 1024 * 1024 - 64 * 1024)
+#define I40IW_VF_DB_ADDR_OFFSET (64 * 1024)
+
+#define I40IW_PUSH_OFFSET   (4 * 1024 * 1024)
+#define I40IW_PF_FIRST_PUSH_PAGE_INDEX 16
+#define I40IW_VF_PUSH_OFFSET    ((8 + 64) * 1024)
+#define I40IW_VF_FIRST_PUSH_PAGE_INDEX 2
+
+#define I40IW_PE_DB_SIZE_4M 1
+#define I40IW_PE_DB_SIZE_8M 2
+
+#define I40IW_DDP_VER 1
+#define I40IW_RDMAP_VER 1
+
+#define I40IW_RDMA_MODE_RDMAC 0
+#define I40IW_RDMA_MODE_IETF  1
+
+#define I40IW_QP_STATE_INVALID 0
+#define I40IW_QP_STATE_IDLE 1
+#define I40IW_QP_STATE_RTS 2
+#define I40IW_QP_STATE_CLOSING 3
+#define I40IW_QP_STATE_RESERVED 4
+#define I40IW_QP_STATE_TERMINATE 5
+#define I40IW_QP_STATE_ERROR 6
+
+#define I40IW_STAG_STATE_INVALID 0
+#define I40IW_STAG_STATE_VALID 1
+
+#define I40IW_STAG_TYPE_SHARED 0
+#define I40IW_STAG_TYPE_NONSHARED 1
+
+#define I40IW_MAX_USER_PRIORITY 8
+
+#define LS_64_1(val, bits)  ((u64)(uintptr_t)val << bits)
+#define RS_64_1(val, bits)  ((u64)(uintptr_t)val >> bits)
+#define LS_32_1(val, bits)  (u32)(val << bits)
+#define RS_32_1(val, bits)  (u32)(val >> bits)
+#define I40E_HI_DWORD(x)    ((u32)((((x) >> 16) >> 16) & 0xFFFFFFFF))
+
+#define LS_64(val, field) (((u64)val << field ## _SHIFT) & (field ## _MASK))
+
+#define RS_64(val, field) ((u64)(val & field ## _MASK) >> field ## _SHIFT)
+#define LS_32(val, field) ((val << field ## _SHIFT) & (field ## _MASK))
+#define RS_32(val, field) ((val & field ## _MASK) >> field ## _SHIFT)
+
+#define TERM_DDP_LEN_TAGGED 14
+#define TERM_DDP_LEN_UNTAGGED   18
+#define TERM_RDMA_LEN   28
+#define RDMA_OPCODE_MASK        0x0f
+#define RDMA_READ_REQ_OPCODE    1
+#define Q2_BAD_FRAME_OFFSET 72
+#define CQE_MAJOR_DRV   0x8000
+
+#define I40IW_TERM_SENT 0x01
+#define I40IW_TERM_RCVD 0x02
+#define I40IW_TERM_DONE 0x04
+#define I40IW_MAC_HLEN  14
+
+#define I40IW_INVALID_WQE_INDEX 0xffffffff
+
+#define I40IW_CQP_WAIT_POLL_REGS 1
+#define I40IW_CQP_WAIT_POLL_CQ 2
+#define I40IW_CQP_WAIT_EVENT 3
+
+#define I40IW_CQP_INIT_WQE(wqe) memset(wqe, 0, 64)
+
+#define I40IW_GET_CURRENT_CQ_ELEMENT(_cq) \
+   ( \
+   &((_cq)->cq_base[I40IW_RING_GETCURRENT_HEAD((_cq)->cq_ring)])  \
+   )
+#define I40IW_GET_CURRENT_EXTENDED_CQ_ELEMENT(_cq) \
+   ( \
+   &(((struct i40iw_extended_cqe *)\
+  
((_cq)->cq_base))[I40IW_RING_GETCURRENT_HEAD((_cq)->cq_ring)]) \
+   )
+
+#define I40IW_GET_CURRENT_AEQ_ELEMENT(_aeq) \
+   ( \
+   &_aeq->aeqe_base[I40IW_RING_GETCURRENT_TAIL(_aeq->aeq_ring)]   \
+   )
+
+#define 

[PATCH v3 13/16] i40iw: user kernel shared files

2016-01-20 Thread Faisal Latif
i40iw_user.h and i40iw_uk.c are used by both user library as well as
kernel requests.

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_uk.c   | 1204 ++
 drivers/infiniband/hw/i40iw/i40iw_user.h |  442 +++
 2 files changed, 1646 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_uk.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_user.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_uk.c 
b/drivers/infiniband/hw/i40iw/i40iw_uk.c
new file mode 100644
index 000..f78c3dc
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_uk.c
@@ -0,0 +1,1204 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#include "i40iw_osdep.h"
+#include "i40iw_status.h"
+#include "i40iw_d.h"
+#include "i40iw_user.h"
+#include "i40iw_register.h"
+
+static u32 nop_signature = 0x55555555;
+
+/**
+ * i40iw_nop_1 - insert a nop wqe and move head. no post work
+ * @qp: hw qp ptr
+ */
+static enum i40iw_status_code i40iw_nop_1(struct i40iw_qp_uk *qp)
+{
+   u64 header, *wqe;
+   u64 *wqe_0 = NULL;
+   u32 wqe_idx, peek_head;
+   bool signaled = false;
+
+   if (!qp->sq_ring.head)
+   return I40IW_ERR_PARAM;
+
+   wqe_idx = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
+   wqe = qp->sq_base[wqe_idx].elem;
+   peek_head = (qp->sq_ring.head + 1) % qp->sq_ring.size;
+   wqe_0 = qp->sq_base[peek_head].elem;
+   if (peek_head)
+   wqe_0[3] = LS_64(!qp->swqe_polarity, I40IWQPSQ_VALID);
+   else
+   wqe_0[3] = LS_64(qp->swqe_polarity, I40IWQPSQ_VALID);
+
+   set_64bit_val(wqe, 0, 0);
+   set_64bit_val(wqe, 8, 0);
+   set_64bit_val(wqe, 16, 0);
+
+   header = LS_64(I40IWQP_OP_NOP, I40IWQPSQ_OPCODE) |
+   LS_64(signaled, I40IWQPSQ_SIGCOMPL) |
+   LS_64(qp->swqe_polarity, I40IWQPSQ_VALID) | nop_signature++;
+
+   wmb();  /* Memory barrier to ensure data is written before valid bit is 
set */
+
+   set_64bit_val(wqe, 24, header);
+   return 0;
+}
+
+/**
+ * i40iw_qp_post_wr - post wr to hardware
+ * @qp: hw qp ptr
+ */
+void i40iw_qp_post_wr(struct i40iw_qp_uk *qp)
+{
+   u64 temp;
+   u32 hw_sq_tail;
+   u32 sw_sq_head;
+
+   mb(); /* valid bit is written and loads completed before reading shadow 
*/
+
+   /* read the doorbell shadow area */
+   get_64bit_val(qp->shadow_area, 0, &temp);
+
+   hw_sq_tail = (u32)RS_64(temp, I40IW_QP_DBSA_HW_SQ_TAIL);
+   sw_sq_head = I40IW_RING_GETCURRENT_HEAD(qp->sq_ring);
+   if (sw_sq_head != hw_sq_tail) {
+   if (sw_sq_head > qp->initial_ring.head) {
+   if ((hw_sq_tail >= qp->initial_ring.head) &&
+   (hw_sq_tail < sw_sq_head)) {
+   writel(qp->qp_id, qp->wqe_alloc_reg);
+   }
+   } else if (sw_sq_head != qp->initial_ring.head) {
+   if ((hw_sq_tail >= qp->initial_ring.head) ||
+   (hw_sq_tail < sw_sq_head)) {
+   writel(qp->qp_id, qp->wqe_alloc_reg);
+   }
+   }
+   }
+
+   qp->initial_ring.head = qp->sq_ring.head;
+}
+
+/**
+ * i40iw_qp_ring_push_db -  ring qp doorbell
+ * @qp: hw qp ptr
+ * @wqe_idx: wqe index
+ */

Re: ethtool NFC/ntuple API questions

2016-01-20 Thread Ben Hutchings
On Wed, 2016-01-20 at 19:12 +, Edward Cree wrote:
> Thanks both, it's making more sense now.
> One thing I'm still unclear about: why does struct ethtool_usrip4_spec have
> the ip_ver field?  The struct can't be extended to cover ipv6, because the
> address fields aren't big enough.  So what's it for?

It's also defined to always have the same value and mask!  It's a
design bug.

> Also, would it be appropriate to use struct in6_addr for IPv6 addresses, or
> should I use __be32[4]?

I think for consistency with the IPv4 structures it should be __be32[4].

Ben.

-- 
Ben Hutchings
The program is absolutely right; therefore, the computer must be wrong.

signature.asc
Description: This is a digitally signed message part


[PATCH v3 02/16] i40iw: add entry in rdma_netlink

2016-01-20 Thread Faisal Latif
Add entry for port mapper services.

Changes since v2:
moved this patch before being used

Changes since v1:
moved I40IW as last element

Signed-off-by: Faisal Latif 
---
 include/uapi/rdma/rdma_netlink.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/rdma/rdma_netlink.h b/include/uapi/rdma/rdma_netlink.h
index c19a5dc..4fa418d 100644
--- a/include/uapi/rdma/rdma_netlink.h
+++ b/include/uapi/rdma/rdma_netlink.h
@@ -8,6 +8,7 @@ enum {
RDMA_NL_NES,
RDMA_NL_C4IW,
RDMA_NL_LS, /* RDMA Local Services */
+   RDMA_NL_I40IW,
RDMA_NL_NUM_CLIENTS
 };
 
-- 
2.5.3



[PATCH v3 01/16] i40e: Add support for client interface for IWARP driver

2016-01-20 Thread Faisal Latif
From: Anjali Singhai Jain 

This patch adds a Client interface for i40iw driver
support. Also expands the Virtchannel to support messages
from i40evf driver on behalf of i40iwvf driver.

This client API is used by the i40iw and i40iwvf driver
to access the core driver resources brokered by the i40e driver.

Signed-off-by: Anjali Singhai Jain 
---
 drivers/net/ethernet/intel/i40e/Makefile   |1 +
 drivers/net/ethernet/intel/i40e/i40e.h |   22 +
 drivers/net/ethernet/intel/i40e/i40e_client.c  | 1012 
 drivers/net/ethernet/intel/i40e/i40e_client.h  |  232 +
 drivers/net/ethernet/intel/i40e/i40e_main.c|  115 ++-
 drivers/net/ethernet/intel/i40e/i40e_type.h|3 +-
 drivers/net/ethernet/intel/i40e/i40e_virtchnl.h|   34 +
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.c |  247 -
 drivers/net/ethernet/intel/i40e/i40e_virtchnl_pf.h |4 +
 9 files changed, 1657 insertions(+), 13 deletions(-)
 create mode 100644 drivers/net/ethernet/intel/i40e/i40e_client.c
 create mode 100644 drivers/net/ethernet/intel/i40e/i40e_client.h

diff --git a/drivers/net/ethernet/intel/i40e/Makefile 
b/drivers/net/ethernet/intel/i40e/Makefile
index b4729ba..3b3c63e 100644
--- a/drivers/net/ethernet/intel/i40e/Makefile
+++ b/drivers/net/ethernet/intel/i40e/Makefile
@@ -41,6 +41,7 @@ i40e-objs := i40e_main.o \
i40e_diag.o \
i40e_txrx.o \
i40e_ptp.o  \
+   i40e_client.o   \
i40e_virtchnl_pf.o
 
 i40e-$(CONFIG_I40E_DCB) += i40e_dcb.o i40e_dcb_nl.o
diff --git a/drivers/net/ethernet/intel/i40e/i40e.h 
b/drivers/net/ethernet/intel/i40e/i40e.h
index 4dd3e26..1417ae8 100644
--- a/drivers/net/ethernet/intel/i40e/i40e.h
+++ b/drivers/net/ethernet/intel/i40e/i40e.h
@@ -59,6 +59,7 @@
 #ifdef I40E_FCOE
 #include "i40e_fcoe.h"
 #endif
+#include "i40e_client.h"
 #include "i40e_virtchnl.h"
 #include "i40e_virtchnl_pf.h"
 #include "i40e_txrx.h"
@@ -178,6 +179,7 @@ struct i40e_lump_tracking {
u16 search_hint;
u16 list[0];
 #define I40E_PILE_VALID_BIT  0x8000
+#define I40E_IWARP_IRQ_PILE_ID  (I40E_PILE_VALID_BIT - 2)
 };
 
 #define I40E_DEFAULT_ATR_SAMPLE_RATE   20
@@ -264,6 +266,8 @@ struct i40e_pf {
 #endif /* I40E_FCOE */
u16 num_lan_qps;   /* num lan queues this PF has set up */
u16 num_lan_msix;  /* num queue vectors for the base PF vsi */
+   u16 num_iwarp_msix;/* num of iwarp vectors for this PF */
+   int iwarp_base_vector;
int queues_left;   /* queues left unclaimed */
u16 rss_size;  /* num queues in the RSS array */
u16 rss_size_max;  /* HW defined max RSS queues */
@@ -313,6 +317,7 @@ struct i40e_pf {
 #define I40E_FLAG_16BYTE_RX_DESC_ENABLED   BIT_ULL(13)
 #define I40E_FLAG_CLEAN_ADMINQ BIT_ULL(14)
 #define I40E_FLAG_FILTER_SYNC  BIT_ULL(15)
+#define I40E_FLAG_SERVICE_CLIENT_REQUESTED BIT_ULL(16)
 #define I40E_FLAG_PROCESS_MDD_EVENTBIT_ULL(17)
 #define I40E_FLAG_PROCESS_VFLR_EVENT   BIT_ULL(18)
 #define I40E_FLAG_SRIOV_ENABLEDBIT_ULL(19)
@@ -550,6 +555,8 @@ struct i40e_vsi {
struct kobject *kobj;  /* sysfs object */
bool current_isup; /* Sync 'link up' logging */
 
+   void *priv; /* client driver data reference. */
+
/* VSI specific handlers */
irqreturn_t (*irq_handler)(int irq, void *data);
 
@@ -702,6 +709,10 @@ void i40e_vsi_setup_queue_map(struct i40e_vsi *vsi,
  struct i40e_vsi_context *ctxt,
  u8 enabled_tc, bool is_add);
 #endif
+void i40e_service_event_schedule(struct i40e_pf *pf);
+void i40e_notify_client_of_vf_msg(struct i40e_vsi *vsi, u32 vf_id,
+ u8 *msg, u16 len);
+
 int i40e_vsi_control_rings(struct i40e_vsi *vsi, bool enable);
 int i40e_reconfig_rss_queues(struct i40e_pf *pf, int queue_count);
 struct i40e_veb *i40e_veb_setup(struct i40e_pf *pf, u16 flags, u16 uplink_seid,
@@ -724,6 +735,17 @@ static inline void i40e_dbg_pf_exit(struct i40e_pf *pf) {}
 static inline void i40e_dbg_init(void) {}
 static inline void i40e_dbg_exit(void) {}
 #endif /* CONFIG_DEBUG_FS*/
+/* needed by client drivers */
+int i40e_lan_add_device(struct i40e_pf *pf);
+int i40e_lan_del_device(struct i40e_pf *pf);
+void i40e_client_subtask(struct i40e_pf *pf);
+void i40e_notify_client_of_l2_param_changes(struct i40e_vsi *vsi);
+void i40e_notify_client_of_netdev_open(struct i40e_vsi *vsi);
+void i40e_notify_client_of_netdev_close(struct i40e_vsi *vsi, bool reset);
+void i40e_notify_client_of_vf_enable(struct i40e_pf *pf, u32 num_vfs);
+void i40e_notify_client_of_vf_reset(struct i40e_pf *pf, u32 vf_id);
+int i40e_vf_client_capable(struct i40e_pf *pf, u32 vf_id,
+  enum i40e_client_type type);
 /**
  * i40e_irq_dynamic_enable - 

[PATCH v3 00/16] add Intel X722 iWARP driver

2016-01-20 Thread Faisal Latif
This driver provides iWARP RDMA functionality for the Intel(R) X722 Ethernet
controller for PCI Physical Functions. It is in early product cycle
and having the driver in the kernel will allow users to have hardware support
when available for purchase.


i40iw cooperates with the Intel(R) X722 base driver (i40e.ko) to allocate
resources and program the controller.

It has support for Virtual Function driver (i40iwvf.ko), which will
be part of separate patch series. The VF driver (i40iwvf) requires i40iw 
for resource management.


This series includes 1 patch to i40e.ko to provide interface support to
i40iw.ko. The interface provides a driver registration mechanism, resource
allocations, and device reset coordination mechanisms.


This patch series is based on Doug Ledford's k.o/for-4.5.

Changes since v2:

Incorporated following comments from Or Gerlitz
*full git cover letter 
*0-day testing complete
*remove unused memtypes
*move netlink patch up

Changes made as part of 4.5 merge
*ported interface changes done for infiniband interface for 4.5
*some changes made by Christoph Hellwig in nes were also
 ported to i40iw

copyright changed to 2015-2016
moved from Kbuild to Makefile
Added i40e param change
fixes including crash during unload


Changes since v1:

Following review comments from Joe Perches are implemented.
*No need to print error for i40e_print errpr )
*Change from bits in cqp to bool in i40iw_cqp_request()

Following review comments from Christoph Hellwig are implemented.
*move down the I40IW enum
*remove pointless braces (all over the code)
*change usage of kmap() to make it short lived.
*remove unnecessary casts
*remove routine stubs
*do not set unused fields to zero as those are already zero

Anjali Singhai Jain (1):
  i40e: Add support for client interface for IWARP driver

Faisal Latif (15):
  i40iw: add entry in rdma_netlink
  i40iw: add main, hdr, status
  i40iw: add connection management code
  i40iw: add puda code
  i40iw: add pble resource files
  i40iw: add hmc resource files
  i40iw: add hw and utils files
  i40iw: add files for iwarp interface
  i40iw: add file to handle cqp calls
  i40iw: add hardware related header files
  i40iw: add X722 register file
  i40iw: user kernel shared files
  i40iw: virtual channel handling files
  i40iw: Kconfig and Makefile for iwarp module
  i40iw: changes for build of i40iw module

 MAINTAINERS|   10 +
 drivers/infiniband/Kconfig |1 +
 drivers/infiniband/hw/Makefile |1 +
 drivers/infiniband/hw/i40iw/Kconfig|7 +
 drivers/infiniband/hw/i40iw/Makefile   |9 +
 drivers/infiniband/hw/i40iw/i40iw.h|  569 +++
 drivers/infiniband/hw/i40iw/i40iw_cm.c | 4442 ++
 drivers/infiniband/hw/i40iw/i40iw_cm.h |  456 ++
 drivers/infiniband/hw/i40iw/i40iw_ctrl.c   | 4743 
 drivers/infiniband/hw/i40iw/i40iw_d.h  | 1713 +++
 drivers/infiniband/hw/i40iw/i40iw_hmc.c|  821 
 drivers/infiniband/hw/i40iw/i40iw_hmc.h|  241 +
 drivers/infiniband/hw/i40iw/i40iw_hw.c |  730 +++
 drivers/infiniband/hw/i40iw/i40iw_main.c   | 1930 
 drivers/infiniband/hw/i40iw/i40iw_osdep.h  |  214 +
 drivers/infiniband/hw/i40iw/i40iw_p.h  |  106 +
 drivers/infiniband/hw/i40iw/i40iw_pble.c   |  618 +++
 drivers/infiniband/hw/i40iw/i40iw_pble.h   |  131 +
 drivers/infiniband/hw/i40iw/i40iw_puda.c   | 1436 ++
 drivers/infiniband/hw/i40iw/i40iw_puda.h   |  183 +
 drivers/infiniband/hw/i40iw/i40iw_register.h   | 1030 +
 drivers/infiniband/hw/i40iw/i40iw_status.h |  100 +
 drivers/infiniband/hw/i40iw/i40iw_type.h   | 1312 ++
 drivers/infiniband/hw/i40iw/i40iw_ucontext.h   |  107 +
 drivers/infiniband/hw/i40iw/i40iw_uk.c | 1204 +
 drivers/infiniband/hw/i40iw/i40iw_user.h   |  442 ++
 drivers/infiniband/hw/i40iw/i40iw_utils.c  | 1256 ++
 drivers/infiniband/hw/i40iw/i40iw_verbs.c  | 2434 ++
 drivers/infiniband/hw/i40iw/i40iw_verbs.h  |  173 +
 drivers/infiniband/hw/i40iw/i40iw_vf.c |   85 +
 drivers/infiniband/hw/i40iw/i40iw_vf.h |   62 +
 drivers/infiniband/hw/i40iw/i40iw_virtchnl.c   |  748 +++
 drivers/infiniband/hw/i40iw/i40iw_virtchnl.h   |  124 +
 drivers/net/ethernet/intel/i40e/Makefile   |1 +
 drivers/net/ethernet/intel/i40e/i40e.h |   22 +
 drivers/net/ethernet/intel/i40e/i40e_client.c  | 1012 +
 

[PATCH v3 04/16] i40iw: add connection management code

2016-01-20 Thread Faisal Latif
i40iw_cm.c i40iw_cm.h are used for connection management.

changes since v2:
Implemented interface changes, as reg_phys_mr() is
not part of the infiniband interface. Done as
Christoph Hellwig did for nes.

Changes since v1:
improved casts
moved kmap() from i40iw_verbs.c to make them short
lived.

Acked-by: Anjali Singhai Jain 
Acked-by: Shannon Nelson 
Signed-off-by: Faisal Latif 
---
 drivers/infiniband/hw/i40iw/i40iw_cm.c | 4442 
 drivers/infiniband/hw/i40iw/i40iw_cm.h |  456 
 2 files changed, 4898 insertions(+)
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_cm.c
 create mode 100644 drivers/infiniband/hw/i40iw/i40iw_cm.h

diff --git a/drivers/infiniband/hw/i40iw/i40iw_cm.c 
b/drivers/infiniband/hw/i40iw/i40iw_cm.c
new file mode 100644
index 000..0589bd1
--- /dev/null
+++ b/drivers/infiniband/hw/i40iw/i40iw_cm.c
@@ -0,0 +1,4442 @@
+/***
+*
+* Copyright (c) 2015-2016 Intel Corporation.  All rights reserved.
+*
+* This software is available to you under a choice of one of two
+* licenses.  You may choose to be licensed under the terms of the GNU
+* General Public License (GPL) Version 2, available from the file
+* COPYING in the main directory of this source tree, or the
+* OpenFabrics.org BSD license below:
+*
+*   Redistribution and use in source and binary forms, with or
+*   without modification, are permitted provided that the following
+*   conditions are met:
+*
+*- Redistributions of source code must retain the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer.
+*
+*- Redistributions in binary form must reproduce the above
+*  copyright notice, this list of conditions and the following
+*  disclaimer in the documentation and/or other materials
+*  provided with the distribution.
+*
+* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+* EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+* MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+* NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+* BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+* ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+* CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+* SOFTWARE.
+*
+***/
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "i40iw.h"
+
+static void i40iw_rem_ref_cm_node(struct i40iw_cm_node *);
+static void i40iw_cm_post_event(struct i40iw_cm_event *event);
+static void i40iw_disconnect_worker(struct work_struct *work);
+
+/**
+ * i40iw_free_sqbuf - put back puda buffer if refcount = 0
+ * @dev: FPK device
+ * @buf: puda buffer to free
+ */
+void i40iw_free_sqbuf(struct i40iw_sc_dev *dev, void *bufp)
+{
+   struct i40iw_puda_buf *buf = (struct i40iw_puda_buf *)bufp;
+   struct i40iw_puda_rsrc *ilq = dev->ilq;
+
+   if (!atomic_dec_return(>refcount))
+   i40iw_puda_ret_bufpool(ilq, buf);
+}
+
+/**
+ * i40iw_derive_hw_ird_setting - Calculate IRD
+ *
+ * @cm_ird: IRD of connection's node
+ *
+ * The ird from the connection is rounded to a supported HW
+ * setting (2,8,32,64) and then encoded for ird_size field of
+ * qp_ctx
+ */
+static u8 i40iw_derive_hw_ird_setting(u16 cm_ird)
+{
+   u8 encoded_ird_size;
+   u8 pof2_cm_ird = 1;
+
+   /* round-off to next powerof2 */
+   while (pof2_cm_ird < cm_ird)
+   pof2_cm_ird *= 2;
+
+   /* ird_size field is encoded in qp_ctx */
+   switch (pof2_cm_ird) {
+   case I40IW_HW_IRD_SETTING_64:
+   encoded_ird_size = 3;
+   break;
+   case I40IW_HW_IRD_SETTING_32:
+   case I40IW_HW_IRD_SETTING_16:
+   encoded_ird_size = 2;
+   break;
+   case I40IW_HW_IRD_SETTING_8:
+   case I40IW_HW_IRD_SETTING_4:
+   encoded_ird_size = 1;
+   break;
+   case I40IW_HW_IRD_SETTING_2:
+   default:
+   encoded_ird_size = 0;
+   break;
+   }
+   return encoded_ird_size;
+}
+
+/**
+ * i40iw_record_ird_ord - Record IRD/ORD passed in
+ * @cm_node: connection's node
+ * @conn_ird: connection IRD
+ * @conn_ord: connection ORD
+ */
+static void i40iw_record_ird_ord(struct i40iw_cm_node *cm_node, u16 conn_ird, 
u16 conn_ord)
+{
+   if (conn_ird > I40IW_MAX_IRD_SIZE)
+   conn_ird = I40IW_MAX_IRD_SIZE;
+
+   if (conn_ord > I40IW_MAX_ORD_SIZE)
+ 

Re: [PATCH net-next 6/8] net: gre: Implement LCO for GRE over IPv4

2016-01-20 Thread Tom Herbert
On Wed, Jan 20, 2016 at 11:35 AM, Alexander Duyck
 wrote:
> On Wed, Jan 20, 2016 at 11:11 AM, Rustad, Mark D
>  wrote:
>> Alexander Duyck  wrote:
>>
>>> Actually you may want to go the other way on that.  If they weren't
>>> flipping the checksum value for GRE before why should we start doing
>>> that now?  I'm pretty sure the checksum mangling is a very UDP centric
>>> thing.  There is no need to introduce it to other protocols.
>>
>>
>> If different checksum representations are needed, then there really should
>> be an explicit indication of whether it is a UDP-style checksum or other in
>> the skb I would think rather than guessing it based on the offset. Of course
>> it would be convenient if all the protocols that use a one's complement
>> checksum would tolerate the UDP representation. I have a long (and now old)
>> history working with real one's complement machines, and so I would want to
>> believe that any correct implementation would tolerate it, but I don't know
>> for a fact that they do.
>
> The only reason why UDP does the bit flip is because it has reserved a
> checksum of 0 as a special value.  For the checksum math itself either
> 0xFFFF or 0 are interchangeable.  The only time they would make any
> difference would be if we had a value of 0 that we were checksumming,
> but since that is not the case the values will always end up
> converging back onto 0xFFFF as the final result in the case of a
> correct checksum.
>
0xFFFF is mathematically equivalent to 0x0 for checksum. I would
rather always flip 0 to 0xFFFF in LCO rather than adding an explicit
indication (i.e. another flag) in SKB that it has a UDP checksum.

Tom

> - Alex


Re: ethtool NFC/ntuple API questions

2016-01-20 Thread Edward Cree
Thanks both, it's making more sense now.
One thing I'm still unclear about: why does struct ethtool_usrip4_spec have
the ip_ver field?  The struct can't be extended to cover ipv6, because the
address fields aren't big enough.  So what's it for?

Also, would it be appropriate to use struct in6_addr for IPv6 addresses, or
should I use __be32[4]?

-Ed


Re: [PATCH net-next 6/8] net: gre: Implement LCO for GRE over IPv4

2016-01-20 Thread Alexander Duyck
On Wed, Jan 20, 2016 at 11:11 AM, Rustad, Mark D
 wrote:
> Alexander Duyck  wrote:
>
>> Actually you may want to go the other way on that.  If they weren't
>> flipping the checksum value for GRE before why should we start doing
>> that now?  I'm pretty sure the checksum mangling is a very UDP centric
>> thing.  There is no need to introduce it to other protocols.
>
>
> If different checksum representations are needed, then there really should
> be an explicit indication of whether it is a UDP-style checksum or other in
> the skb I would think rather than guessing it based on the offset. Of course
> it would be convenient if all the protocols that use a one's complement
> checksum would tolerate the UDP representation. I have a long (and now old)
> history working with real one's complement machines, and so I would want to
> believe that any correct implementation would tolerate it, but I don't know
> for a fact that they do.

The only reason why UDP does the bit flip is because it has reserved a
checksum of 0 as a special value.  For the checksum math itself either
0xFFFF or 0 are interchangeable.  The only time they would make any
difference would be if we had a value of 0 that we were checksumming,
but since that is not the case the values will always end up
converging back onto 0xFFFF as the final result in the case of a
correct checksum.

- Alex


Re: [PATCH net-next 6/8] net: gre: Implement LCO for GRE over IPv4

2016-01-20 Thread Rustad, Mark D

Alexander Duyck  wrote:


Actually you may want to go the other way on that.  If they weren't
flipping the checksum value for GRE before why should we start doing
that now?  I'm pretty sure the checksum mangling is a very UDP centric
thing.  There is no need to introduce it to other protocols.


If different checksum representations are needed, then there really should  
be an explicit indication of whether it is a UDP-style checksum or other in  
the skb I would think rather than guessing it based on the offset. Of  
course it would be convenient if all the protocols that use a one's  
complement checksum would tolerate the UDP representation. I have a long  
(and now old) history working with real one's complement machines, and so I  
would want to believe that any correct implementation would tolerate it,  
but I don't know for a fact that they do.


--
Mark Rustad, Networking Division, Intel Corporation


signature.asc
Description: Message signed with OpenPGP using GPGMail


Re: [PATCH net-next 6/8] net: gre: Implement LCO for GRE over IPv4

2016-01-20 Thread Alexander Duyck
On Wed, Jan 20, 2016 at 11:58 AM, Tom Herbert  wrote:
> On Wed, Jan 20, 2016 at 11:35 AM, Alexander Duyck
>  wrote:
>> On Wed, Jan 20, 2016 at 11:11 AM, Rustad, Mark D
>>  wrote:
>>> Alexander Duyck  wrote:
>>>
>>>> Actually you may want to go the other way on that.  If they weren't
>>>> flipping the checksum value for GRE before why should we start doing
>>>> that now?  I'm pretty sure the checksum mangling is a very UDP centric
>>>> thing.  There is no need to introduce it to other protocols.
>>>
>>>
>>> If different checksum representations are needed, then there really should
>>> be an explicit indication of whether it is a UDP-style checksum or other in
>>> the skb I would think rather than guessing it based on the offset. Of course
>>> it would be convenient if all the protocols that use a one's complement
>>> checksum would tolerate the UDP representation. I have a long (and now old)
>>> history working with real one's complement machines, and so I would want to
>>> believe that any correct implementation would tolerate it, but I don't know
>>> for a fact that they do.
>>
>> The only reason why UDP does the bit flip is because it has reserved a
>> checksum of 0 as a special value.  For the checksum math itself either
>> 0xFFFF or 0 are interchangeable.  The only time they would make any
>> difference would be if we had a value of 0 that we were checksumming,
>> but since that is not the case the values will always end up
>> converging back onto 0xFFFF as the final result in the case of a
>> correct checksum.
>>
> 0xFFFF is mathematically equivalent to 0x0 for checksum. I would
> rather always flip 0 to 0xFFFF in LCO rather than adding an explicit
> indication (i.e. another flag) in SKB that it has a UDP checksum.

There isn't any need to add such an indication, nor do we need to
start bitflipping the return value from csum_fold in all cases.  I
think there was just some confusion about UDP checksums vs GRE or TCP
checksums.

I'd say we are better off keeping this simple.  The original patch
just needs to drop the check for the resultant checksum being 0 since
that is not needed for GRE.

- Alex


RE: [PATCH net 0/3] net: phy: Finally fix PHY_IGNORE_INTERRUPTS

2016-01-20 Thread Woojung.Huh
> > Targetting the "net" tree since these are bugfixes, but I would like
> > Woojun and Andrew to take a look and test that on their respective
> > HW setups as well.
> 
> Ok I'll wait for Woojun and Andrew to give feedback.

This patch fixes periodic phy read_status access when phy is configured as 
PHY_IGNORE_INTERRUPTS.
Tested and confirmed with LAN78xx USB-to-Ethernet driver except following 
checkpatch.pl warnings.

WARNING: line over 80 characters
#54: FILE: drivers/net/phy/phy.c:1008:
+   queue_delayed_work(system_power_efficient_wq, 
>state_queue,

WARNING: Comparisons should place the constant on the right side of the test
#68: FILE: drivers/net/phy/phy.c:714:
+   if (PHY_HALTED != phydev->state &&


Re: [PATCH net 0/3] net: phy: Finally fix PHY_IGNORE_INTERRUPTS

2016-01-20 Thread Florian Fainelli
Le 20/01/2016 13:20, woojung@microchip.com a écrit :
>>> Targetting the "net" tree since these are bugfixes, but I would like
>>> Woojun and Andrew to take a look and test that on their respective
>>> HW setups as well.
>>
>> Ok I'll wait for Woojun and Andrew to give feedback.
> 
> This patch fixes periodic phy read_status access when phy is configured as 
> PHY_IGNORE_INTERRUPTS.

Great, thanks for the feedback!

> Tested and confirmed with LAN78xx USB-to-Ethernet driver except following 
> checkpatch.pl warnings.
> 
> WARNING: line over 80 characters
> #54: FILE: drivers/net/phy/phy.c:1008:
> + queue_delayed_work(system_power_efficient_wq, 
> >state_queue,

This one is definitively added by the patch

> 
> WARNING: Comparisons should place the constant on the right side of the test
> #68: FILE: drivers/net/phy/phy.c:714:
> + if (PHY_HALTED != phydev->state &&
> 

This one is an existing warning.

David, let me know if you consider the over 80 columns issue worth a
resubmission or not, either way is fine with me.

Thanks!
-- 
Florian


RE: [PATCH v3 00/16] add Intel X722 iWARP driver

2016-01-20 Thread Steve Wise


> -Original Message-
> From: linux-rdma-ow...@vger.kernel.org 
> [mailto:linux-rdma-ow...@vger.kernel.org] On Behalf Of Or Gerlitz
> Sent: Wednesday, January 20, 2016 4:25 PM
> To: Faisal Latif
> Cc: Doug Ledford; linux-r...@vger.kernel.org; Linux Netdev List; Jeff 
> Kirsher; e1000-r...@lists.sourceforge.net
> Subject: Re: [PATCH v3 00/16] add Intel X722 iWARP driver
> 
> On Wed, Jan 20, 2016 at 9:40 PM, Faisal Latif  wrote:
> 
> > Changes since v2:
> [...]
> > *move netlink patch up
> 
> I also asked you why the port mapper code has to be present in each
> iwarp driver and not part of the IB core stack, and you responded
> "i40iw iwarp driver registers with port mapper and uses its services.
> Beside that it is not the scope of the patch series"  -- well, it is
> in the scope of upstream review to pose such questions, please
> address.
> 
> Or.

Hey Or, 

There is a common service/API in the IB core for iWarp port mapping.  See 
drivers/infiniband/core/iwpm*.c and include/rdma/iw_portmap.h.

Steve.




Re: Optimizing instruction-cache, more packets at each stage

2016-01-20 Thread Tom Herbert
On Wed, Jan 20, 2016 at 3:02 PM, Eric Dumazet  wrote:
> On Thu, 2016-01-21 at 00:20 +0200, Or Gerlitz wrote:
>
>> Dave, I assume you refer to the RSS hash result which is written by
>> NIC HWs to the completion descriptor and then fed to the stack by the
>> driver calling skb_set_hash(.)? Well, this can be taken even further.
>>
>> Suppose the NIC can be programmed by the kernel to provide a unique
>> flow tag on the completion descriptor per a given 5/12 tuple which
>> represents a TCP (or other logical) stream a higher level in the stack
>> is identifying to be in progress, and the driver plants that in
>> skb->mark before calling into the stack.
>>
>> I guess this could yield nice speed up for the GRO stack -- matching
>> based on single 32 bit value instead of per protocol (eth, vlan, ip,
>> tcp) checks [1] - or hint which packets from the current window of
>> "ready" completion descriptor could be grouped together for upper
>> processing?
>
> We already use the RSS hash (skb->hash) in GRO engine to speedup the
> parsing : If skb->hash differs, then there is no point trying to
> aggregate two packets.
>
> Note that if we had a l4 hash for all provided packets, GRO could use a
> hash table instead of one single list of skbs.
>
Besides that, GRO requires parsing the packet anyway so I don't see
much value in trying to optimize GRO by using the hash.

Unfortunately, the hardware hash from devices hasn't really lived up
to its potential. The original intent of getting the hash from device
was to be able to do packet steering (RPS and RFS) without touching
the header. But this never was implemented. eth_type_trans touches
headers and GRO is best when done before steering. Given the
weaknesses of Toeplitz we talked about recently and the fact that
Jenkins is really fast to compute, I am starting to think maybe we
should always do a software hash and not rely on HW for it...

>
>


Re: [PATCH net-next 6/8] net: gre: Implement LCO for GRE over IPv4

2016-01-20 Thread Rustad, Mark D

Alexander Duyck  wrote:


There isn't any need to add such an indication, nor do we need to
start bitflipping the return value from csum_fold in all cases.  I
think there was just some confusion about UDP checksums vs GRE or TCP
checksums.


Yeah. I think I finally got there. The naive software methods will never  
generate a true 0 unless everything was zero. Real one's complement  
machines did addition in terms of subtraction so that 1 + -1 would never  
produce a -0, only a normal 0. Of course a simple adder would produce a -0,  
making it impossible to get back to a normal 0.



I'd say we are better off keeping this simple.  The original patch
just needs to drop the check for the resultant checksum being 0 since
that is not needed for GRE.


I'm all in favor of simple. I had just started to worry about a possible  
change in behavior that might have interoperability problems with some  
implementations. I wonder if any implementation ever did the addition by  
subtraction, but also failed to make 0 compare equal to -0? I guess if they  
knew enough to do the former, they should not have blown the latter.


--
Mark Rustad, Networking Division, Intel Corporation


signature.asc
Description: Message signed with OpenPGP using GPGMail


Re: [PATCH] net: i40e: avoid unused function warnings

2016-01-20 Thread Arnd Bergmann
On Wednesday 20 January 2016 14:44:45 Jeff Kirsher wrote:
> Yeah, I have a fix for that as well.
> 
> You can confirm by pulling my next-queue tree (dev-queue branch).
> git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-queue.git d
> ev-queue
> 

I checked out that branch, but still see both warnings in that one, plus
a new build error in igb, which I don't see in linux-next:

drivers/net/ethernet/intel/igb/igb_cdev.c: In function 'igb_mapring':
drivers/net/ethernet/intel/igb/igb_cdev.c:150:2: error: implicit declaration of 
function 'set_pages_uc' [-Werror=implicit-function-declaration]
  set_pages_uc(virt_to_page(ring->desc), ring->size >> PAGE_SHIFT);
  ^
drivers/net/ethernet/intel/igb/igb_cdev.c: In function 'igb_unmapring':
drivers/net/ethernet/intel/igb/igb_cdev.c:275:2: error: implicit declaration of 
function 'set_pages_wb' [-Werror=implicit-function-declaration]
  set_pages_wb(virt_to_page(ring->desc), ring->size >> PAGE_SHIFT);

Arnd


Re: Optimizing instruction-cache, more packets at each stage

2016-01-20 Thread Eric Dumazet
On Thu, 2016-01-21 at 00:20 +0200, Or Gerlitz wrote:

> Dave, I assume you refer to the RSS hash result which is written by
> NIC HWs to the completion descriptor and then fed to the stack by the
> driver calling skb_set_hash(.)? Well, this can be taken even further.
> 
> Suppose the NIC can be programmed by the kernel to provide a unique
> flow tag on the completion descriptor per a given 5/12 tuple which
> represents a TCP (or other logical) stream a higher level in the stack
> is identifying to be in progress, and the driver plants that in
> skb->mark before calling into the stack.
> 
> I guess this could yield nice speed up for the GRO stack -- matching
> based on single 32 bit value instead of per protocol (eth, vlan, ip,
> tcp) checks [1] - or hint which packets from the current window of
> "ready" completion descriptor could be grouped together for upper
> processing?

We already use the RSS hash (skb->hash) in GRO engine to speedup the
parsing : If skb->hash differs, then there is no point trying to
aggregate two packets.

Note that if we had a l4 hash for all provided packets, GRO could use a
hash table instead of one single list of skbs.





Re: out of bounds in pptp_connect.

2016-01-20 Thread Dave Jones
On Sun, Jan 17, 2016 at 12:06:58PM -0500, Dave Jones wrote:
 > I've managed to trigger this a few times the last few days, on Linus' tree.
 > 
 > ==
 > BUG: KASAN: slab-out-of-bounds in pptp_connect+0xb7b/0xc70 [pptp] at addr 
 > ffff8800242da0d0
 > Read of size 2 by task trinity-c14/13664
 > =
 > BUG kmalloc-8192 (Not tainted): kasan: bad access detected
 > -
 > 
 > Disabling lock debugging due to kernel taint
 > INFO: Allocated in copy_thread_tls+0x6b3/0x8d0 age=5483091 cpu=1 pid=18329
 >  ___slab_alloc.constprop.66+0x4de/0x580
 >  __slab_alloc.isra.63.constprop.65+0x48/0x80
 >  __kmalloc_track_caller+0x2a2/0x2f0
 >  kmemdup+0x20/0x50
 >  copy_thread_tls+0x6b3/0x8d0
 >  copy_process.part.40+0x3679/0x57b0
 >  _do_fork+0x16c/0xba0
 >  SyS_clone+0x19/0x20
 >  tracesys_phase2+0x84/0x89
 > INFO: Freed in x86_pmu_event_init+0x477/0x550 age=5483145 cpu=1 pid=18329
 >  __slab_free+0x18b/0x2b0
 >  kfree+0x272/0x290
 >  x86_pmu_event_init+0x477/0x550
 >  perf_try_init_event+0x164/0x1c0
 >  perf_event_alloc+0x1235/0x18c0
 >  inherit_event.isra.88+0xd4/0x6c0
 >  inherit_task_group.isra.90.part.91+0x68/0x200
 >  perf_event_init_task+0x41f/0x830
 >  copy_process.part.40+0x15d6/0x57b0
 >  _do_fork+0x16c/0xba0
 >  SyS_clone+0x19/0x20
 >  tracesys_phase2+0x84/0x89

I'm now seeing different bug type, with similar traces.
Instead of an out of bounds, it's now a use-after-free, but
it's interesting that it's complaining about memory that used
to belong to perf again.  Could the bug be in perf ?

Dave


BUG: KASAN: use-after-free in pptp_connect+0x19f/0x5e0 [pptp] at addr 
ffff8804632ba0d0
Read of size 2 by task trinity-c4/18013
=
BUG kmalloc-2048 (Tainted: GW  ): kasan: bad access detected
-

INFO: Allocated in perf_event_alloc+0x72/0xd60 age=5653 cpu=0 pid=17555
___slab_alloc.constprop.71+0x523/0x5c0
__slab_alloc.isra.67.constprop.70+0x48/0x80
kmem_cache_alloc_trace+0x24c/0x2e0
perf_event_alloc+0x72/0xd60
inherit_event.isra.90+0x82/0x3a0
inherit_task_group.isra.92.part.93+0x55/0x120
perf_event_init_task+0x35a/0x530
copy_process.part.40+0xb3d/0x2db0
_do_fork+0x164/0x880
SyS_clone+0x19/0x20
tracesys_phase2+0x84/0x89
INFO: Freed in free_event_rcu+0x38/0x40 age=5635 cpu=0 pid=17555
__slab_free+0x19e/0x2d0
kfree+0x25c/0x280
free_event_rcu+0x38/0x40
rcu_process_callbacks+0xbac/0x1200
__do_softirq+0x1a4/0x590
irq_exit+0xf5/0x100
smp_apic_timer_interrupt+0x5c/0x70
apic_timer_interrupt+0x90/0xa0
context_tracking_exit+0x1d/0x20
enter_from_user_mode+0x1f/0x50
syscall_trace_enter_phase1+0x1cb/0x260
tracesys+0xd/0x44
INFO: Slab 0xea00118cae00 objects=13 used=9 fp=0x8804632bae68 
flags=0x80004080
INFO: Object 0x8804632b9bd8 @offset=7128 fp=0x8804632be618



Re: [PATCH] net: i40e: avoid unused function warnings

2016-01-20 Thread Jeff Kirsher
On Wed, 2016-01-20 at 23:54 +0100, Arnd Bergmann wrote:
> On Wednesday 20 January 2016 14:44:45 Jeff Kirsher wrote:
> > Yeah, I have a fix for that as well.
> > 
> > You can confirm by pulling my next-queue tree (dev-queue branch).
> > git://git.kernel.org/pub/scm/linux/kernel/git/jkirsher/next-
> queue.git d
> > ev-queue
> > 
> 
> I checked out that branch, but still see both warnings in that one,
> plus
> a new build error in igb, which I don't see in linux-next:
> 
> drivers/net/ethernet/intel/igb/igb_cdev.c: In function 'igb_mapring':
> drivers/net/ethernet/intel/igb/igb_cdev.c:150:2: error: implicit
> declaration of function 'set_pages_uc' [-Werror=implicit-function-
> declaration]
>   set_pages_uc(virt_to_page(ring->desc), ring->size >> PAGE_SHIFT);
>   ^
> drivers/net/ethernet/intel/igb/igb_cdev.c: In function
> 'igb_unmapring':
> drivers/net/ethernet/intel/igb/igb_cdev.c:275:2: error: implicit
> declaration of function 'set_pages_wb' [-Werror=implicit-function-
> declaration]
>   set_pages_wb(virt_to_page(ring->desc), ring->size >> PAGE_SHIFT);

Oops, I just realized I had not pushed my latest tree to kernel.org.

The igb issue still remains, I am working with the developer who
introduced the issue.  Looks like the i40e issue about possible
uninitialized variables still exists.  I thought we had resolved that
issue, but apparently not.

You should see Eric Dumazet's patch on the tree to resolve the other
i40e build warnings.

I can add your second patch to resolve the uninitialized variables to
my tree.

signature.asc
Description: This is a digitally signed message part


[PATCH v6 0/3] can: sja1000: support for technologic version

2016-01-20 Thread Marc Kleine-Budde
This is Damien Riegel's series with minor changes.

regards,
Marc

---

This patchset introduces support for the technologic version of the
SJA1000. Access to IP's registers are proxied through a window,
requiring two bus accesses to read or write a register. These accesses
must be protected by a spinlock to prevent race conditions. Currently,
there is no easy way to allocate and initialize this spinlock.

SJA1000 already provides a way to allocate private data, but
sja1000_platform.c makes no use of it.

Patch 1 adds the capability to allocate and initialize private data on a
per-compatible basis in sja1000_platform.c.

Patch 2 updates device tree documentation to add the technologic
version.

Patch 3 updates the driver to implement the technologic version

Changes in v6:
 - fix static const struct of_device_id sp_of_table

Changes in v5:
 - remove empty "static struct sja1000_of_data nxp_data", again
 - add additional check for of_id->data

Changes in v4:
 - add sp_ prefix to technologic functions
 - add empty "static struct sja1000_of_data nxp_data"
 - make "struct sja1000_of_data technologic_data" static
 - get rid of "?" operator in sp_probe()

Changes in v3:
 - moved sp_of_table above sp_probe as it is used in this function
 - removed functions added in v2 and do everything in sp_probe

Changes in v2:
 - added a patch to allocate and initialize private data
 - changed device tree documentation
 - added a spinlock to protect bus accesses
 - changed sp_{read,write}_reg16 to io{read,write}16






[PATCH v6 1/3] can: sja1000: of: add per-compatible init hook

2016-01-20 Thread Marc Kleine-Budde
From: Damien Riegel 

This commit adds the capability to allocate and init private data
embedded in the sja1000_priv structure on a per-compatible basis. The
device node is passed as a parameter of the init callback to allow
parsing of custom device tree properties.

Signed-off-by: Damien Riegel 
Signed-off-by: Marc Kleine-Budde 
---
 drivers/net/can/sja1000/sja1000_platform.c | 40 +++---
 1 file changed, 31 insertions(+), 9 deletions(-)

diff --git a/drivers/net/can/sja1000/sja1000_platform.c 
b/drivers/net/can/sja1000/sja1000_platform.c
index 0552ed46a206..777d312f1779 100644
--- a/drivers/net/can/sja1000/sja1000_platform.c
+++ b/drivers/net/can/sja1000/sja1000_platform.c
@@ -27,6 +27,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "sja1000.h"
@@ -40,6 +41,11 @@ MODULE_DESCRIPTION("Socket-CAN driver for SJA1000 on the 
platform bus");
 MODULE_ALIAS("platform:" DRV_NAME);
 MODULE_LICENSE("GPL v2");
 
+struct sja1000_of_data {
+   size_t  priv_sz;
+   int (*init)(struct sja1000_priv *priv, struct device_node *of);
+};
+
 static u8 sp_read_reg8(const struct sja1000_priv *priv, int reg)
 {
return ioread8(priv->reg_base + reg);
@@ -154,6 +160,12 @@ static void sp_populate_of(struct sja1000_priv *priv, 
struct device_node *of)
priv->cdr |= CDR_CBP; /* default */
 }
 
+static const struct of_device_id sp_of_table[] = {
+   { .compatible = "nxp,sja1000", .data = NULL, },
+   { /* sentinel */ },
+};
+MODULE_DEVICE_TABLE(of, sp_of_table);
+
 static int sp_probe(struct platform_device *pdev)
 {
int err, irq = 0;
@@ -163,6 +175,9 @@ static int sp_probe(struct platform_device *pdev)
struct resource *res_mem, *res_irq = NULL;
struct sja1000_platform_data *pdata;
struct device_node *of = pdev->dev.of_node;
+   const struct of_device_id *of_id;
+   const struct sja1000_of_data *of_data = NULL;
+   size_t priv_sz = 0;
 
pdata = dev_get_platdata(>dev);
if (!pdata && !of) {
@@ -191,7 +206,13 @@ static int sp_probe(struct platform_device *pdev)
if (!irq && !res_irq)
return -ENODEV;
 
-   dev = alloc_sja1000dev(0);
+   of_id = of_match_device(sp_of_table, >dev);
+   if (of_id && of_id->data) {
+   of_data = of_id->data;
+   priv_sz = of_data->priv_sz;
+   }
+
+   dev = alloc_sja1000dev(priv_sz);
if (!dev)
return -ENOMEM;
priv = netdev_priv(dev);
@@ -208,10 +229,17 @@ static int sp_probe(struct platform_device *pdev)
dev->irq = irq;
priv->reg_base = addr;
 
-   if (of)
+   if (of) {
sp_populate_of(priv, of);
-   else
+
+   if (of_data && of_data->init) {
+   err = of_data->init(priv, of);
+   if (err)
+   goto exit_free;
+   }
+   } else {
sp_populate(priv, pdata, res_mem->flags);
+   }
 
platform_set_drvdata(pdev, dev);
SET_NETDEV_DEV(dev, >dev);
@@ -242,12 +270,6 @@ static int sp_remove(struct platform_device *pdev)
return 0;
 }
 
-static const struct of_device_id sp_of_table[] = {
-   {.compatible = "nxp,sja1000"},
-   {},
-};
-MODULE_DEVICE_TABLE(of, sp_of_table);
-
 static struct platform_driver sp_driver = {
.probe = sp_probe,
.remove = sp_remove,
-- 
2.7.0.rc3



Re: [PATCH v2] can: c_can: add xceiver enable/disable support

2016-01-20 Thread Bjørn Mork
Michael Grzeschik  writes:

> @@ -1263,6 +1271,10 @@ int register_c_can_dev(struct net_device *dev)
>*/
>   pinctrl_pm_select_sleep_state(dev->dev.parent);
>  
> + priv->reg_xceiver = devm_regulator_get(priv->device, "xceiver");
> + if (IS_ERR(priv->reg_xceiver))
> + return PTR_ERR(priv->reg_xceiver);
> +
>   c_can_pm_runtime_enable(priv);
>  
>   dev->flags |= IFF_ECHO; /* we support local echo */

Do you really want to leave priv->reg_xceiver pointing to an ERR_PTR in
case of error?



Bjørn


Re: [PATCH v2] can: c_can: add xceiver enable/disable support

2016-01-20 Thread Michael Grzeschik
Hi,

On Wed, Jan 20, 2016 at 05:19:18PM +0100, Bjørn Mork wrote:
> Michael Grzeschik  writes:
> 
> > @@ -1263,6 +1271,10 @@ int register_c_can_dev(struct net_device *dev)
> >  */
> > pinctrl_pm_select_sleep_state(dev->dev.parent);
> >  
> > +   priv->reg_xceiver = devm_regulator_get(priv->device, "xceiver");
> > +   if (IS_ERR(priv->reg_xceiver))
> > +   return PTR_ERR(priv->reg_xceiver);
> > +
> > c_can_pm_runtime_enable(priv);
> >  
> > dev->flags |= IFF_ECHO; /* we support local echo */
> 
> Do you really want to leave priv->reg_xceiver pointing to an ERR_PTR in
> case of error?

No, therefore the priv->reg_xceiver will be returned in case of error.
This codepath is called once on device registration.

Michael

-- 
Pengutronix e.K.   | |
Industrial Linux Solutions | http://www.pengutronix.de/  |
Peiner Str. 6-8, 31137 Hildesheim, Germany | Phone: +49-5121-206917-0|
Amtsgericht Hildesheim, HRA 2686   | Fax:   +49-5121-206917-5555 |


[PATCH 4/6] netfilter: xt_TCPMSS: handle CHECKSUM_COMPLETE in tcpmss_tg6()

2016-01-20 Thread Pablo Neira Ayuso
From: Eric Dumazet 

In case MSS option is added in TCP options, skb length increases by 4.
IPv6 needs to update skb->csum if skb has CHECKSUM_COMPLETE,
otherwise kernel complains loudly in netdev_rx_csum_fault() with a
stack dump.

Signed-off-by: Eric Dumazet 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/xt_TCPMSS.c | 9 +++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/net/netfilter/xt_TCPMSS.c b/net/netfilter/xt_TCPMSS.c
index b7c43de..e118397 100644
--- a/net/netfilter/xt_TCPMSS.c
+++ b/net/netfilter/xt_TCPMSS.c
@@ -228,7 +228,7 @@ tcpmss_tg6(struct sk_buff *skb, const struct 
xt_action_param *par)
 {
struct ipv6hdr *ipv6h = ipv6_hdr(skb);
u8 nexthdr;
-   __be16 frag_off;
+   __be16 frag_off, oldlen, newlen;
int tcphoff;
int ret;
 
@@ -244,7 +244,12 @@ tcpmss_tg6(struct sk_buff *skb, const struct 
xt_action_param *par)
return NF_DROP;
if (ret > 0) {
ipv6h = ipv6_hdr(skb);
-   ipv6h->payload_len = htons(ntohs(ipv6h->payload_len) + ret);
+   oldlen = ipv6h->payload_len;
+   newlen = htons(ntohs(oldlen) + ret);
+   if (skb->ip_summed == CHECKSUM_COMPLETE)
+   skb->csum = csum_add(csum_sub(skb->csum, oldlen),
+newlen);
+   ipv6h->payload_len = newlen;
}
return XT_CONTINUE;
 }
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 5/6] netfilter: nf_tables_netdev: fix error path in module initialization

2016-01-20 Thread Pablo Neira Ayuso
Unregister the chain type and return error, otherwise this leaks the
subscription to the netdevice notifier call chain.

Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_netdev.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/net/netfilter/nf_tables_netdev.c b/net/netfilter/nf_tables_netdev.c
index b6605e0..5eefe4a 100644
--- a/net/netfilter/nf_tables_netdev.c
+++ b/net/netfilter/nf_tables_netdev.c
@@ -224,12 +224,12 @@ static int __init nf_tables_netdev_init(void)
 
nft_register_chain_type(&nft_filter_chain_netdev);
ret = register_pernet_subsys(&nf_tables_netdev_net_ops);
-   if (ret < 0)
+   if (ret < 0) {
nft_unregister_chain_type(&nft_filter_chain_netdev);
-
+   return ret;
+   }
register_netdevice_notifier(&nf_tables_netdev_notifier);
-
-   return ret;
+   return 0;
 }
 
 static void __exit nf_tables_netdev_exit(void)
-- 
2.1.4

--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/6] Netfilter fixes for net

2016-01-20 Thread Pablo Neira Ayuso
Hi David,

The following patchset contains Netfilter fixes for your net tree, they
are:

1) Fix accidental 3-times le/be conversion for 64-bits in nft_byteorder,
   from Florian Westphal.

2) Get rid of defensive cidr = 0 check in the ipset hash:netiface set
   type which doesn't allow valid 0.0.0.0/0 elements, also from Florian.

3) Relocate #endif in nft_ct counter support, this doesn't have any
   relation with labels.

4) Fix TCPMSS target for IPv6 when skb has CHECKSUM_COMPLETE, from
   Eric Dumazet.

5) Fix netdevice notifier leak from the error path of nf_tables_netdev.

6) Safer conntrack hashtable resizing by introducing a global lock and
   synchronizing all buckets on it, to avoid going over the maximum
   number of preemption levels, from Sasha Levin.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit f1640c3ddeec12804bc9a21feee85fc15aca95f6:

  bgmac: fix a missing check for build_skb (2016-01-13 00:24:14 -0500)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to b16c29191dc89bd877af99a7b04ce4866728a3e0:

  netfilter: nf_conntrack: use safer way to lock all buckets (2016-01-20 14:15:31 +0100)


Eric Dumazet (1):
  netfilter: xt_TCPMSS: handle CHECKSUM_COMPLETE in tcpmss_tg6()

Florian Westphal (2):
  netfilter: nft_byteorder: avoid unneeded le/be conversion steps
  netfilter: ipset: allow a 0 netmask with hash_netiface type

Pablo Neira Ayuso (2):
  netfilter: nft_ct: keep counters away from CONFIG_NF_CONNTRACK_LABELS
  netfilter: nf_tables_netdev: fix error path in module initialization

Sasha Levin (1):
  netfilter: nf_conntrack: use safer way to lock all buckets

 include/net/netfilter/nf_conntrack_core.h  |  8 +++
 net/netfilter/ipset/ip_set_hash_netiface.c |  4 
 net/netfilter/nf_conntrack_core.c  | 38 ++
 net/netfilter/nf_conntrack_helper.c|  2 +-
 net/netfilter/nf_conntrack_netlink.c   |  2 +-
 net/netfilter/nf_tables_netdev.c   |  8 +++
 net/netfilter/nfnetlink_cttimeout.c|  4 ++--
 net/netfilter/nft_byteorder.c  |  6 ++---
 net/netfilter/nft_ct.c |  2 +-
 net/netfilter/xt_TCPMSS.c  |  9 +--
 10 files changed, 49 insertions(+), 34 deletions(-)
--
To unsubscribe from this list: send the line "unsubscribe netfilter-devel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 3/6] netfilter: nft_ct: keep counters away from CONFIG_NF_CONNTRACK_LABELS

2016-01-20 Thread Pablo Neira Ayuso
This is accidental, they don't depend on the label infrastructure.

Fixes: 48f66c905a97 ("netfilter: nft_ct: add byte/packet counter support")
Reported-by: Arnd Bergmann 
Signed-off-by: Pablo Neira Ayuso 
Acked-by: Florian Westphal 
---
 net/netfilter/nft_ct.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nft_ct.c b/net/netfilter/nft_ct.c
index a0eb216..d4a4619 100644
--- a/net/netfilter/nft_ct.c
+++ b/net/netfilter/nft_ct.c
@@ -127,6 +127,7 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
   NF_CT_LABELS_MAX_SIZE - size);
return;
}
+#endif
case NFT_CT_BYTES: /* fallthrough */
case NFT_CT_PKTS: {
const struct nf_conn_acct *acct = nf_conn_acct_find(ct);
@@ -138,7 +139,6 @@ static void nft_ct_get_eval(const struct nft_expr *expr,
memcpy(dest, &count, sizeof(count));
return;
}
-#endif
default:
break;
}
-- 
2.1.4



[PATCH 6/6] netfilter: nf_conntrack: use safer way to lock all buckets

2016-01-20 Thread Pablo Neira Ayuso
From: Sasha Levin 

When we need to lock all buckets in the connection hashtable we'd attempt to
lock 1024 spinlocks, which is way more preemption levels than supported by
the kernel. Furthermore, this behavior was hidden by checking if lockdep is
enabled, and if it was - use only 8 buckets(!).

Fix this by using a global lock and synchronizing all buckets on it when we
need to lock them all. This is pretty heavyweight, but is only done when we
need to resize the hashtable, and that doesn't happen often (if at all).

Signed-off-by: Sasha Levin 
Acked-by: Jesper Dangaard Brouer 
Reviewed-by: Florian Westphal 
Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_conntrack_core.h |  8 +++
 net/netfilter/nf_conntrack_core.c | 38 +++
 net/netfilter/nf_conntrack_helper.c   |  2 +-
 net/netfilter/nf_conntrack_netlink.c  |  2 +-
 net/netfilter/nfnetlink_cttimeout.c   |  4 ++--
 5 files changed, 35 insertions(+), 19 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_core.h b/include/net/netfilter/nf_conntrack_core.h
index 788ef58..62e17d1 100644
--- a/include/net/netfilter/nf_conntrack_core.h
+++ b/include/net/netfilter/nf_conntrack_core.h
@@ -79,12 +79,10 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
 const struct nf_conntrack_l3proto *l3proto,
 const struct nf_conntrack_l4proto *proto);
 
-#ifdef CONFIG_LOCKDEP
-# define CONNTRACK_LOCKS 8
-#else
-# define CONNTRACK_LOCKS 1024
-#endif
+#define CONNTRACK_LOCKS 1024
+
 extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
+void nf_conntrack_lock(spinlock_t *lock);
 
 extern spinlock_t nf_conntrack_expect_lock;
 
diff --git a/net/netfilter/nf_conntrack_core.c b/net/netfilter/nf_conntrack_core.c
index 3cb3cb8..58882de 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -66,6 +66,21 @@ EXPORT_SYMBOL_GPL(nf_conntrack_locks);
 __cacheline_aligned_in_smp DEFINE_SPINLOCK(nf_conntrack_expect_lock);
 EXPORT_SYMBOL_GPL(nf_conntrack_expect_lock);
 
+static __read_mostly spinlock_t nf_conntrack_locks_all_lock;
+static __read_mostly bool nf_conntrack_locks_all;
+
+void nf_conntrack_lock(spinlock_t *lock) __acquires(lock)
+{
+   spin_lock(lock);
+   while (unlikely(nf_conntrack_locks_all)) {
+   spin_unlock(lock);
+   spin_lock(&nf_conntrack_locks_all_lock);
+   spin_unlock(&nf_conntrack_locks_all_lock);
+   spin_lock(lock);
+   }
+}
+EXPORT_SYMBOL_GPL(nf_conntrack_lock);
+
 static void nf_conntrack_double_unlock(unsigned int h1, unsigned int h2)
 {
h1 %= CONNTRACK_LOCKS;
@@ -82,12 +97,12 @@ static bool nf_conntrack_double_lock(struct net *net, unsigned int h1,
h1 %= CONNTRACK_LOCKS;
h2 %= CONNTRACK_LOCKS;
if (h1 <= h2) {
-   spin_lock(&nf_conntrack_locks[h1]);
+   nf_conntrack_lock(&nf_conntrack_locks[h1]);
if (h1 != h2)
spin_lock_nested(&nf_conntrack_locks[h2],
 SINGLE_DEPTH_NESTING);
} else {
-   spin_lock(&nf_conntrack_locks[h2]);
+   nf_conntrack_lock(&nf_conntrack_locks[h2]);
spin_lock_nested(&nf_conntrack_locks[h1],
 SINGLE_DEPTH_NESTING);
}
@@ -102,16 +117,19 @@ static void nf_conntrack_all_lock(void)
 {
int i;
 
-   for (i = 0; i < CONNTRACK_LOCKS; i++)
-   spin_lock_nested(&nf_conntrack_locks[i], i);
+   spin_lock(&nf_conntrack_locks_all_lock);
+   nf_conntrack_locks_all = true;
+
+   for (i = 0; i < CONNTRACK_LOCKS; i++) {
+   spin_lock(&nf_conntrack_locks[i]);
+   spin_unlock(&nf_conntrack_locks[i]);
+   }
 }
 
 static void nf_conntrack_all_unlock(void)
 {
-   int i;
-
-   for (i = 0; i < CONNTRACK_LOCKS; i++)
-   spin_unlock(&nf_conntrack_locks[i]);
+   nf_conntrack_locks_all = false;
+   spin_unlock(&nf_conntrack_locks_all_lock);
 }
 
 unsigned int nf_conntrack_htable_size __read_mostly;
@@ -757,7 +775,7 @@ restart:
hash = hash_bucket(_hash, net);
for (; i < net->ct.htable_size; i++) {
lockp = &nf_conntrack_locks[hash % CONNTRACK_LOCKS];
-   spin_lock(lockp);
+   nf_conntrack_lock(lockp);
if (read_seqcount_retry(&net->ct.generation, sequence)) {
spin_unlock(lockp);
goto restart;
@@ -1382,7 +1400,7 @@ get_next_corpse(struct net *net, int (*iter)(struct nf_conn *i, void *data),
for (; *bucket < net->ct.htable_size; (*bucket)++) {
lockp = &nf_conntrack_locks[*bucket % CONNTRACK_LOCKS];
local_bh_disable();
-   spin_lock(lockp);
+   nf_conntrack_lock(lockp);
if (*bucket < 

Re: [PATCH v3 00/16] add Intel X722 iWARP driver

2016-01-20 Thread Or Gerlitz
On Wed, Jan 20, 2016 at 9:40 PM, Faisal Latif  wrote:

> Changes since v2:
[...]
> *move netlink patch up

I also asked you why the port mapper code has to be present in each
iwarp driver and not part of the IB core stack, and you responded
"i40iw iwarp driver registers with port mapper and uses its services.
Beside that it is not the scope of the patch series"  -- well, it is
in the scope of upstream review to pose such questions, please
address.

Or.


  1   2   >