date:20160913

[PATCH net-next 2/2] net: mvneta: add BQL support

2016-09-13 Thread Marcin Wojtas

Tests showed that when the whole bandwidth is consumed, the latency for
various kinds of traffic can reach high values. With a saturated
link (e.g. with iperf from target to host), a simple ping could take
a significant amount of time. BQL proved to improve this situation
when implemented in the mvneta driver. Measurements of ping latency
for 3 link speeds:
Speed | Latency w/o BQL | Latency with BQL
10    |  7-14 ms        | 3.5 ms
100   |  2-12 ms        | 0.6 ms
1000  |  often timeout  | up to 2 ms

Decreasing latency as above results in a slight performance cost - 4kpps
(-1.4%) when pushing 64B packets via two bridged interfaces of Armada 38x.
For 1500B packets in the same setup, the mpstat tool showed +8% of
CPU occupation (default affinity, second CPU idle). Even so, this
cost seems reasonable to take, considering the other improvements.

This commit adds byte queue limit mechanism for the mvneta driver.

Signed-off-by: Marcin Wojtas 
---
 drivers/net/ethernet/marvell/mvneta.c | 22 +++---
 1 file changed, 19 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c 
b/drivers/net/ethernet/marvell/mvneta.c
index b9dccea..bb5df35 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -1719,8 +1719,10 @@ static struct mvneta_tx_queue 
*mvneta_tx_done_policy(struct mvneta_port *pp,
 
 /* Free tx queue skbuffs */
 static void mvneta_txq_bufs_free(struct mvneta_port *pp,
-struct mvneta_tx_queue *txq, int num)
+struct mvneta_tx_queue *txq, int num,
+struct netdev_queue *nq)
 {
+   unsigned int bytes_compl = 0, pkts_compl = 0;
int i;
 
for (i = 0; i < num; i++) {
@@ -1728,6 +1730,11 @@ static void mvneta_txq_bufs_free(struct mvneta_port *pp,
txq->txq_get_index;
struct sk_buff *skb = txq->tx_skb[txq->txq_get_index];
 
+   if (skb) {
+   bytes_compl += skb->len;
+   pkts_compl++;
+   }
+
mvneta_txq_inc_get(txq);
 
if (!IS_TSO_HEADER(txq, tx_desc->buf_phys_addr))
@@ -1738,6 +1745,8 @@ static void mvneta_txq_bufs_free(struct mvneta_port *pp,
continue;
dev_kfree_skb_any(skb);
}
+
+   netdev_tx_completed_queue(nq, pkts_compl, bytes_compl);
 }
 
 /* Handle end of transmission */
@@ -1751,7 +1760,7 @@ static void mvneta_txq_done(struct mvneta_port *pp,
if (!tx_done)
return;
 
-   mvneta_txq_bufs_free(pp, txq, tx_done);
+   mvneta_txq_bufs_free(pp, txq, tx_done, nq);
 
txq->count -= tx_done;
 
@@ -2358,6 +2367,8 @@ out:
struct mvneta_pcpu_stats *stats = this_cpu_ptr(pp->stats);
struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
 
+   netdev_tx_sent_queue(nq, len);
+
txq->count += frags;
if (txq->count >= txq->tx_stop_threshold)
netif_tx_stop_queue(nq);
@@ -2385,9 +2396,10 @@ static void mvneta_txq_done_force(struct mvneta_port *pp,
  struct mvneta_tx_queue *txq)
 
 {
+   struct netdev_queue *nq = netdev_get_tx_queue(pp->dev, txq->id);
int tx_done = txq->count;
 
-   mvneta_txq_bufs_free(pp, txq, tx_done);
+   mvneta_txq_bufs_free(pp, txq, tx_done, nq);
 
/* reset txq */
txq->count = 0;
@@ -2884,6 +2896,8 @@ static int mvneta_txq_init(struct mvneta_port *pp,
 static void mvneta_txq_deinit(struct mvneta_port *pp,
  struct mvneta_tx_queue *txq)
 {
+   struct netdev_queue *nq = netdev_get_tx_queue(pp->dev, txq->id);
+
kfree(txq->tx_skb);
 
if (txq->tso_hdrs)
@@ -2895,6 +2909,8 @@ static void mvneta_txq_deinit(struct mvneta_port *pp,
  txq->size * MVNETA_DESC_ALIGNED_SIZE,
  txq->descs, txq->descs_phys);
 
+   netdev_tx_reset_queue(nq);
+
txq->descs = NULL;
txq->last_desc = 0;
txq->next_desc_to_proc = 0;
-- 
1.8.3.1

[PATCH net-next 1/2] net: mvneta: add xmit_more support

2016-09-13 Thread Marcin Wojtas

From: Simon Guinot 

Based on the xmit_more flag of the skb, TX descriptors can be concatenated
before flushing. This commit delays the Tx descriptor flush if the queue is
running and there are more skbs to send.

Signed-off-by: Simon Guinot 
---
 drivers/net/ethernet/marvell/mvneta.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/marvell/mvneta.c 
b/drivers/net/ethernet/marvell/mvneta.c
index d41c28d..b9dccea 100644
--- a/drivers/net/ethernet/marvell/mvneta.c
+++ b/drivers/net/ethernet/marvell/mvneta.c
@@ -512,6 +512,7 @@ struct mvneta_tx_queue {
 * descriptor ring
 */
int count;
+   int pending;
int tx_stop_threshold;
int tx_wake_threshold;
 
@@ -802,8 +803,9 @@ static void mvneta_txq_pend_desc_add(struct mvneta_port *pp,
/* Only 255 descriptors can be added at once ; Assume caller
 * process TX desriptors in quanta less than 256
 */
-   val = pend_desc;
+   val = pend_desc + txq->pending;
mvreg_write(pp, MVNETA_TXQ_UPDATE_REG(txq->id), val);
+   txq->pending = 0;
 }
 
 /* Get pointer to next TX descriptor to be processed (send) by HW */
@@ -2357,11 +2359,14 @@ out:
struct netdev_queue *nq = netdev_get_tx_queue(dev, txq_id);
 
txq->count += frags;
-   mvneta_txq_pend_desc_add(pp, txq, frags);
-
if (txq->count >= txq->tx_stop_threshold)
netif_tx_stop_queue(nq);
 
+   if (!skb->xmit_more || netif_xmit_stopped(nq))
+   mvneta_txq_pend_desc_add(pp, txq, frags);
+   else
+   txq->pending += frags;
+
u64_stats_update_begin(&stats->syncp);
stats->tx_packets++;
stats->tx_bytes  += len;
-- 
1.8.3.1

[PATCH net-next 0/2] mvneta xmit_more and bql support

2016-09-13 Thread Marcin Wojtas

Hi,

This short patchset introduces two enhancements to the mvneta driver:
TX packet concatenation support using the xmit_more mechanism, and
byte queue limits in order to decrease latency on saturated links.

Any comments or feedback would be welcome.

Best regards,
Marcin

Marcin Wojtas (1):
  net: mvneta: add BQL support

Simon Guinot (1):
  net: mvneta: add xmit_more support

 drivers/net/ethernet/marvell/mvneta.c | 33 +++--
 1 file changed, 27 insertions(+), 6 deletions(-)

-- 
1.8.3.1

Re: [PATCH net-next] ipv4: accept u8 in IP_TOS ancillary data

2016-09-13 Thread Jesper Dangaard Brouer

On Thu, 08 Sep 2016 06:31:14 -0700
Eric Dumazet  wrote:

> On Thu, 2016-09-08 at 11:15 +0200, Jesper Dangaard Brouer wrote:
> > On Wed, 07 Sep 2016 21:52:56 -0700
> > Eric Dumazet  wrote:
> >   
> > > From: Eric Dumazet 
> > > 
> > > In commit f02db315b8d8 ("ipv4: IP_TOS and IP_TTL can be specified as
> > > ancillary data") Francesco added IP_TOS values specified as integer.
> > > 
> > > However, kernel sends to userspace (at recvmsg() time) an IP_TOS value
> > > in a single byte, when IP_RECVTOS is set on the socket.
> > > 
> > > It can be very useful to reflect all ancillary options as given by the
> > > kernel in a subsequent sendmsg(), instead of aborting the sendmsg() with
> > > EINVAL after Francesco patch.
> > > 
> > > So this patch extends IP_TOS ancillary to accept an u8, so that an UDP
> > > server can simply reuse same ancillary block without having to mangle
> > > it.
> > > 
> > > Jesper can then augment
> > > https://github.com/netoptimizer/network-testing/blob/master/src/udp_example02.c
> > > to add TOS reflection ;)  
> > 
> > This is actually your old program ;-)
> > Do I need to change anything, as I'm just bouncing the packet back with 
> > sendmsg() ?  
> 
> I guess you want to add an option and if this option is requested by the
> user, add :
> 
>   setsockopt(fd, SOL_IP, IP_PKTINFO, &on, sizeof(on));
> + if (tos_reflect)
> + setsockopt(fd, SOL_IP, IP_RECVTOS, &on, sizeof(on));
> 
> before the loop doing the recvmsg()/sendmsg() calls.

Hi Eric,

I've implemented what you suggested:
 https://github.com/netoptimizer/network-testing/commit/0758ad77a96ecb1

Now QA can use this tool to verify the kernel commit ;-)

-- 
Best regards,
  Jesper Dangaard Brouer
  MSc.CS, Principal Kernel Engineer at Red Hat
  Author of http://www.iptv-analyzer.org
  LinkedIn: http://www.linkedin.com/in/brouer

[PATCH net-next] cxgb4vf: don't offload Rx checksums for IPv6 fragments

2016-09-13 Thread Hariprasad Shenai

The checksum provided by the device doesn't include the L3 headers,
as IPv6 expects

Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c 
b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index c8fd4f8fe1fa..4d4b94a8969a 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -1648,14 +1648,15 @@ int t4vf_ethrx_handler(struct sge_rspq *rspq, const 
__be64 *rsp,
 
if (csum_ok && !pkt->err_vec &&
(be32_to_cpu(pkt->l2info) & (RXF_UDP_F | RXF_TCP_F))) {
-   if (!pkt->ip_frag)
+   if (!pkt->ip_frag) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
-   else {
+   rxq->stats.rx_cso++;
+   } else if (pkt->l2info & htonl(F_RXF_IP)) {
__sum16 c = (__force __sum16)pkt->csum;
skb->csum = csum_unfold(c);
skb->ip_summed = CHECKSUM_COMPLETE;
+   rxq->stats.rx_cso++;
}
-   rxq->stats.rx_cso++;
} else
skb_checksum_none_assert(skb);
 
-- 
2.3.4

Re: [RFC 00/11] QLogic RDMA Driver (qedr) RFC

2016-09-13 Thread Mintz, Yuval

>> While this might work, I personally dislike it as I find it
>> counter-intuitive when going over the code -
>> I don't expect driver to locally modify the inclusion path.
>> Besides, we're going to [eventually] have a whole suite of drivers based
>> on the qed module, some of which would reside under drivers/scsi;
>> Not sure it's best to have 3 or 4 different drivers privately include the
>> same directory under a different subsystem.

> I agree with you that ocrdma's way can be valuable for small drivers.

> Ocrdma has a small set of shared headers and doesn't need to change them
> rapidly to support different devices.

> I thought to place them in similar directory to include/soc/* and remove
> from include/linux/. We have include/rdma/ and it looks like a good
> candidate.

I'm perfectly fine with relocating those to a different directory under 
include/,
although using 'rdma' doesn't sound like a good fit [as the headers would be
included by ethernet, scsi and rdma drivers]. 
Are there good existing alternatives?

Regardless, I don't believe this should be part of the initial submission,
as it would involve relocating existing networking headers as well.
I think we can move those at leisure later on.

[We're in the middle of transitioning our e-mails from qlogic -> cavium,
so sorry if things become corrupted]

Re: [RFC 02/11] Add RoCE driver framework

2016-09-13 Thread Mintz, Yuval

>> >> +uint debug;
>> >> +module_param(debug, uint, 0);
> >>> +MODULE_PARM_DESC(debug, "Default debug msglevel");
>>
>> >Why are you adding this as a module parameter?
>>
>>  I believe this is mostly to follow same line as qede which also defines
> > 'debug' module parameter for allowing easy user control of debug
> > prints [& specifically for probe prints, which can't be controlled
> > otherwise].

> Can you give us an example where dynamic debug and tracing infrastructures
> are not enough?

> AFAIK, most of these debug module parameters are legacy copy/paste
> code which is useless in real life scenarios.

Define 'enough'; Using dynamic debug you can provide all the necessary
information, and at an even better granularity than that achieved by the
suggested infrastructure, but it is harder for an end-user to use. Same goes
for tracing.

The 'debug' option provides an easy grouping for prints related to a specific
area in the driver.

RE: [v11, 5/8] soc: fsl: add GUTS driver for QorIQ platforms

2016-09-13 Thread Y.B. Lu

> -Original Message-
> From: linux-mmc-ow...@vger.kernel.org [mailto:linux-mmc-
> ow...@vger.kernel.org] On Behalf Of Scott Wood
> Sent: Tuesday, September 13, 2016 7:25 AM
> To: Y.B. Lu; linux-...@vger.kernel.org; ulf.hans...@linaro.org; Arnd
> Bergmann
> Cc: linuxppc-...@lists.ozlabs.org; devicet...@vger.kernel.org; linux-arm-
> ker...@lists.infradead.org; linux-ker...@vger.kernel.org; linux-
> c...@vger.kernel.org; linux-...@vger.kernel.org; iommu@lists.linux-
> foundation.org; netdev@vger.kernel.org; Mark Rutland; Rob Herring;
> Russell King; Jochen Friedrich; Joerg Roedel; Claudiu Manoil; Bhupesh
> Sharma; Qiang Zhao; Kumar Gala; Santosh Shilimkar; Leo Li; X.B. Xie
> Subject: Re: [v11, 5/8] soc: fsl: add GUTS driver for QorIQ platforms
> 
> On Mon, 2016-09-12 at 06:39 +, Y.B. Lu wrote:
> > Hi Scott,
> >
> > Thanks for your review :)
> > See my comment inline.
> >
> > >
> > > -Original Message-
> > > From: Scott Wood [mailto:o...@buserror.net]
> > > Sent: Friday, September 09, 2016 11:47 AM
> > > To: Y.B. Lu; linux-...@vger.kernel.org; ulf.hans...@linaro.org; Arnd
> > > Bergmann
> > > Cc: linuxppc-...@lists.ozlabs.org; devicet...@vger.kernel.org;
> > > linux-arm- ker...@lists.infradead.org; linux-ker...@vger.kernel.org;
> > > linux- c...@vger.kernel.org; linux-...@vger.kernel.org;
> > > iommu@lists.linux- foundation.org; netdev@vger.kernel.org; Mark
> > > Rutland; Rob Herring; Russell King; Jochen Friedrich; Joerg Roedel;
> > > Claudiu Manoil; Bhupesh Sharma; Qiang Zhao; Kumar Gala; Santosh
> > > Shilimkar; Leo Li; X.B. Xie
> > > Subject: Re: [v11, 5/8] soc: fsl: add GUTS driver for QorIQ
> > > platforms
> > >
> > > On Tue, 2016-09-06 at 16:28 +0800, Yangbo Lu wrote:
> > > >
> > > > The global utilities block controls power management, I/O device
> > > > enabling, power-onreset(POR) configuration monitoring, alternate
> > > > function selection for multiplexed signals,and clock control.
> > > >
> > > > This patch adds a driver to manage and access global utilities
> block.
> > > > Initially only reading SVR and registering soc device are supported.
> > > > Other guts accesses, such as reading RCW, should eventually be
> > > > moved into this driver as well.
> > > >
> > > > Signed-off-by: Yangbo Lu 
> > > > Signed-off-by: Scott Wood 
> > > Don't put my signoff on patches that I didn't put it on myself.
> > > Definitely don't put mine *after* yours on patches that were last
> > > modified by you.
> > >
> > > If you want to mention that the soc_id encoding was my suggestion,
> > > then do so explicitly.
> > >
> > [Lu Yangbo-B47093] I found your 'signoff' on this patch at below link.
> > http://patchwork.ozlabs.org/patch/649211/
> >
> > So, let me just change the order in next version ?
> > Signed-off-by: Scott Wood 
> > Signed-off-by: Yangbo Lu 
> 
> No.  This isn't my patch so my signoff shouldn't be on it.

[Lu Yangbo-B47093] Ok, will remove it.

> 
> > [Lu Yangbo-B47093] It's a good idea to move die into .family I think.
> > In my opinion, it's better to keep svr and name in soc_id just like
> > your suggestion above.
> > >
> > >   {
> > >   .soc_id = "svr:0x85490010,name:T1023E,",
> > >   .family = "QorIQ T1024",
> > >   }
> > The user probably don’t like to learn the svr value. What they want is
> > just to match the soc they use.
> > It's convenient to use name+rev for them to match a soc.
> 
> What the user should want 99% of the time is to match the die (plus
> revision), not the soc.
> 
> > Regarding shrinking the table, I think it's hard to use svr+mask.
> > Because I find many platforms use different masks.
> > We couldn’t know the mask according svr value.
> 
> The mask would be part of the table:
> 
> {
>   {
>   .die = "T1024",
>   .svr = 0x8540,
>   .mask = 0xfff0,
>   },
>   {
>   .die = "T1040",
>   .svr = 0x8520,
>   .mask = 0xfff0,
>   },
>   {
>   .die = "LS1088A",
>   .svr = 0x8703,
>   .mask = 0x,
>   },
>   ...
> }
> 
> There's a small risk that we get the mask wrong and a different die is
> created that matches an existing table, but it doesn't seem too likely,
> and can easily be fixed with a kernel update if it happens.
> 

[Lu Yangbo-B47093] You mean we will not define soc device attribute for each 
soc and we will define attribute for each die instead, right?
If so, when we want to match a specific soc we need to use its svr value in 
code. If it's acceptable, I can try in next version.

> BTW, aren't ls2080a and ls2085a the same die?  And is there no non-E
> version of LS2080A/LS2040A?

[Lu Yangbo-B47093] I checked all the svr values in chip errata doc "Revision 
level to part marking cross-reference" table.
I found ls2080a and ls2085a were in two separate doc. And I didn’t find non-E 
version of LS2080A/LS2040A in chip errata doc.
Do you know is there any other doc we can confirm

Re: [PATCH net-next] cxgb4vf: don't offload Rx checksums for IPv6 fragments

2016-09-13 Thread kbuild test robot

Hi Hariprasad,

[auto build test ERROR on net-next/master]

url:
https://github.com/0day-ci/linux/commits/Hariprasad-Shenai/cxgb4vf-don-t-offload-Rx-checksums-for-IPv6-fragments/20160913-151500
config: i386-allmodconfig (attached as .config)
compiler: gcc-6 (Debian 6.1.1-9) 6.1.1 20160705
reproduce:
# save the attached .config to linux build tree
make ARCH=i386 

All error/warnings (new ones prefixed by >>):

   In file included from include/linux/swab.h:4:0,
from include/uapi/linux/byteorder/little_endian.h:12,
from include/linux/byteorder/little_endian.h:4,
from arch/x86/include/uapi/asm/byteorder.h:4,
from include/asm-generic/bitops/le.h:5,
from arch/x86/include/asm/bitops.h:504,
from include/linux/bitops.h:36,
from include/linux/kernel.h:10,
from include/linux/skbuff.h:17,
from drivers/net/ethernet/chelsio/cxgb4vf/sge.c:36:
   drivers/net/ethernet/chelsio/cxgb4vf/sge.c: In function 't4vf_ethrx_handler':
>> drivers/net/ethernet/chelsio/cxgb4vf/sge.c:1654:34: error: 'F_RXF_IP' 
>> undeclared (first use in this function)
  } else if (pkt->l2info & htonl(F_RXF_IP)) {
 ^
   include/uapi/linux/swab.h:113:54: note: in definition of macro '__swab32'
#define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
 ^
   include/linux/byteorder/generic.h:133:21: note: in expansion of macro 
'__cpu_to_be32'
#define ___htonl(x) __cpu_to_be32(x)
^
>> include/linux/byteorder/generic.h:138:18: note: in expansion of macro 
>> '___htonl'
#define htonl(x) ___htonl(x)
 ^~~~
>> drivers/net/ethernet/chelsio/cxgb4vf/sge.c:1654:28: note: in expansion of 
>> macro 'htonl'
  } else if (pkt->l2info & htonl(F_RXF_IP)) {
   ^
   drivers/net/ethernet/chelsio/cxgb4vf/sge.c:1654:34: note: each undeclared 
identifier is reported only once for each function it appears in
  } else if (pkt->l2info & htonl(F_RXF_IP)) {
 ^
   include/uapi/linux/swab.h:113:54: note: in definition of macro '__swab32'
#define __swab32(x) (__u32)__builtin_bswap32((__u32)(x))
 ^
   include/linux/byteorder/generic.h:133:21: note: in expansion of macro 
'__cpu_to_be32'
#define ___htonl(x) __cpu_to_be32(x)
^
>> include/linux/byteorder/generic.h:138:18: note: in expansion of macro 
>> '___htonl'
#define htonl(x) ___htonl(x)
 ^~~~
>> drivers/net/ethernet/chelsio/cxgb4vf/sge.c:1654:28: note: in expansion of 
>> macro 'htonl'
  } else if (pkt->l2info & htonl(F_RXF_IP)) {
   ^

vim +/F_RXF_IP +1654 drivers/net/ethernet/chelsio/cxgb4vf/sge.c

  1648  
  1649  if (csum_ok && !pkt->err_vec &&
  1650  (be32_to_cpu(pkt->l2info) & (RXF_UDP_F | RXF_TCP_F))) {
  1651  if (!pkt->ip_frag) {
  1652  skb->ip_summed = CHECKSUM_UNNECESSARY;
  1653  rxq->stats.rx_cso++;
> 1654  } else if (pkt->l2info & htonl(F_RXF_IP)) {
  1655  __sum16 c = (__force __sum16)pkt->csum;
  1656  skb->csum = csum_unfold(c);
  1657  skb->ip_summed = CHECKSUM_COMPLETE;

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: Binary data

[PATCHv2 net-next] cxgb4vf: don't offload Rx checksums for IPv6 fragments

2016-09-13 Thread Hariprasad Shenai

The checksum provided by the device doesn't include the L3 headers,
as IPv6 expects

Signed-off-by: Hariprasad Shenai 
---
V2: Fixed compilation issue reported by kbuild bot

 drivers/net/ethernet/chelsio/cxgb4vf/sge.c | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c 
b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
index c8fd4f8fe1fa..f3ed9ce99e5e 100644
--- a/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
+++ b/drivers/net/ethernet/chelsio/cxgb4vf/sge.c
@@ -1648,14 +1648,15 @@ int t4vf_ethrx_handler(struct sge_rspq *rspq, const 
__be64 *rsp,
 
if (csum_ok && !pkt->err_vec &&
(be32_to_cpu(pkt->l2info) & (RXF_UDP_F | RXF_TCP_F))) {
-   if (!pkt->ip_frag)
+   if (!pkt->ip_frag) {
skb->ip_summed = CHECKSUM_UNNECESSARY;
-   else {
+   rxq->stats.rx_cso++;
+   } else if (pkt->l2info & htonl(RXF_IP_F)) {
__sum16 c = (__force __sum16)pkt->csum;
skb->csum = csum_unfold(c);
skb->ip_summed = CHECKSUM_COMPLETE;
+   rxq->stats.rx_cso++;
}
-   rxq->stats.rx_cso++;
} else
skb_checksum_none_assert(skb);
 
-- 
2.3.4

Re: [PATCH 3/3] net-next: dsa: add new driver for qca8xxx family

2016-09-13 Thread John Crispin

On 13/09/2016 02:40, Andrew Lunn wrote:
>> > +static int
>> > +qca8k_get_eee(struct dsa_switch *ds, int port,
>> > +struct ethtool_eee *e)
>> > +{
>> > +  struct qca8k_priv *priv = qca8k_to_priv(ds);
>> > +  struct ethtool_eee *p = &priv->port_sts[qca8k_phy_to_port(port)].eee;
>> > +  u32 lp, adv, supported;
>> > +  u16 val;
>> > +
>> > +  /* The switch has no way to tell the result of the AN so we need to
>> > +   * read the result directly from the PHYs MMD registers
>> > +   */
>> > +  val = qca8k_phy_mmd_read(priv, port, MDIO_MMD_PCS, MDIO_PCS_EEE_ABLE);
>> > +  supported = mmd_eee_cap_to_ethtool_sup_t(val);
>> > +
>> > +  val = qca8k_phy_mmd_read(priv, port, MDIO_MMD_AN, MDIO_AN_EEE_ADV);
>> > +  adv = mmd_eee_adv_to_ethtool_adv_t(val);
>> > +
>> > +  val = qca8k_phy_mmd_read(priv, port, MDIO_MMD_AN, MDIO_AN_EEE_LPABLE);
>> > +  lp = mmd_eee_adv_to_ethtool_adv_t(val);
>> > +
>> > +  e->eee_enabled = p->eee_enabled;
>> > +  e->eee_active = !!(supported & adv & lp);
>> > +
>> > +  return 0;
>> > +}
> Couldn't you just call phy_ethtool_get_eee(phydev)? Then you don't
> need qca8k_phy_mmd_read()?

Hi Andrew,

this function does indeed duplicate the functionality of
phy_ethtool_get_eee(), with the small difference that e->eee_active is
also set, which phy_ethtool_get_eee() does not do.

dsa_slave_get_eee() will call phy_ethtool_get_eee() right after the
get_eee() op has been called. Would it be OK to move the code setting
eee_active to phy_ethtool_get_eee()? If that's possible, then we could
just have a stub inside the dsa driver with a note saying that the dsa
layer will do the magic for us.

John

RE: OUTLOOK WEB NOTICE

2016-09-13 Thread O'Malley, Mary



From: O'Malley, Mary
Sent: Tuesday, September 13, 2016 1:24 AM
Subject: OUTLOOK WEB NOTICE

Your password Will Expire In {2} Days Current Faculty and Staff Should Log On 
To IT WEBSITE To 
Validate Your E-mail.

RE: [RFC 03/11] Add support for RoCE HW init

2016-09-13 Thread Ram Amrani

Thanks Mark.
This will be fixed in next version.

Ram


-Original Message-
From: Mark Bloch [mailto:ma...@mellanox.com] 
Sent: Monday, September 12, 2016 9:58 PM
To: Ram Amrani ; dledf...@redhat.com; David Miller 

Cc: Yuval Mintz ; Ariel Elior ; 
Michal Kalderon ; Rajesh Borundia 
; linux-r...@vger.kernel.org; netdev 

Subject: Re: [RFC 03/11] Add support for RoCE HW init



On 12/09/2016 19:07, Ram Amrani wrote:
> Allocate and setup RoCE resources, interrupts and completion queues.
> Adds device attributes.
> 
> Signed-off-by: Rajesh Borundia 
> Signed-off-by: Ram Amrani 
> ---
>  drivers/infiniband/hw/qedr/main.c  | 408 +++-
>  drivers/infiniband/hw/qedr/qedr.h  | 118 
>  drivers/infiniband/hw/qedr/qedr_hsi.h  |  56 ++
>  drivers/infiniband/hw/qedr/qedr_hsi_rdma.h |  96 +++
>  drivers/net/ethernet/qlogic/qed/Makefile   |   1 +
>  drivers/net/ethernet/qlogic/qed/qed.h  |  26 +-
>  drivers/net/ethernet/qlogic/qed/qed_cxt.c  |   6 +
>  drivers/net/ethernet/qlogic/qed/qed_cxt.h  |   6 +
>  drivers/net/ethernet/qlogic/qed/qed_dev.c  | 155 +
>  drivers/net/ethernet/qlogic/qed/qed_main.c |  44 +-
>  drivers/net/ethernet/qlogic/qed/qed_reg_addr.h |   7 +
>  drivers/net/ethernet/qlogic/qed/qed_roce.c | 887 
> +
>  drivers/net/ethernet/qlogic/qed/qed_roce.h | 117 
>  drivers/net/ethernet/qlogic/qed/qed_sp.h   |   1 +
>  drivers/net/ethernet/qlogic/qed/qed_spq.c  |   8 +
>  drivers/net/ethernet/qlogic/qede/qede_roce.c   |   2 +-
>  include/linux/qed/qed_if.h |   5 +-
>  include/linux/qed/qed_roce_if.h| 345 ++
>  18 files changed, 2281 insertions(+), 7 deletions(-)  create mode 
> 100644 drivers/infiniband/hw/qedr/qedr_hsi.h
>  create mode 100644 drivers/infiniband/hw/qedr/qedr_hsi_rdma.h
>  create mode 100644 drivers/net/ethernet/qlogic/qed/qed_roce.c
>  create mode 100644 drivers/net/ethernet/qlogic/qed/qed_roce.h
>  create mode 100644 include/linux/qed/qed_roce_if.h
> 
> diff --git a/drivers/infiniband/hw/qedr/main.c 
> b/drivers/infiniband/hw/qedr/main.c
> index 3fe58a3..0b5274a 100644
> --- a/drivers/infiniband/hw/qedr/main.c
> +++ b/drivers/infiniband/hw/qedr/main.c
> @@ -36,6 +36,8 @@
>  #include 
>  #include 
>  #include 
> +#include 
> +#include 
>  #include "qedr.h"
>  
>  MODULE_DESCRIPTION("QLogic 40G/100G ROCE Driver"); @@ -80,6 +82,139 
> @@ static int qedr_register_device(struct qedr_dev *dev)
>   return 0;
>  }
>  
> +/* This function allocates fast-path status block memory */ static 
> +int qedr_alloc_mem_sb(struct qedr_dev *dev,
> +  struct qed_sb_info *sb_info, u16 sb_id) {
> + struct status_block *sb_virt;
> + dma_addr_t sb_phys;
> + int rc;
> +
> + sb_virt = dma_alloc_coherent(&dev->pdev->dev,
> +  sizeof(*sb_virt), &sb_phys, GFP_KERNEL);
> + if (!sb_virt) {
> + pr_err("Status block allocation failed\n");
> + return -ENOMEM;
> + }
> +
> + rc = dev->ops->common->sb_init(dev->cdev, sb_info,
> +sb_virt, sb_phys, sb_id,
> +QED_SB_TYPE_CNQ);
> + if (rc) {
> + pr_err("Status block initialization failed\n");
> + dma_free_coherent(&dev->pdev->dev, sizeof(*sb_virt),
> +   sb_virt, sb_phys);
> + return rc;
> + }
> +
> + return 0;
> +}
> +
> +static void qedr_free_mem_sb(struct qedr_dev *dev,
> +  struct qed_sb_info *sb_info, int sb_id) {
> + if (sb_info->sb_virt) {
> + dev->ops->common->sb_release(dev->cdev, sb_info, sb_id);
> + dma_free_coherent(&dev->pdev->dev, sizeof(*sb_info->sb_virt),
> +   (void *)sb_info->sb_virt, sb_info->sb_phys);
> + }
> +}
> +
> +static void qedr_free_resources(struct qedr_dev *dev) {
> + int i;
> +
> + for (i = 0; i < dev->num_cnq; i++) {
> + qedr_free_mem_sb(dev, &dev->sb_array[i], dev->sb_start + i);
> + dev->ops->common->chain_free(dev->cdev, &dev->cnq_array[i].pbl);
> + }
> +
> + kfree(dev->cnq_array);
> + kfree(dev->sb_array);
> + kfree(dev->sgid_tbl);
> +}
> +
> +static int qedr_alloc_resources(struct qedr_dev *dev) {
> + struct qedr_cnq *cnq;
> + __le16 *cons_pi;
> + u16 n_entries;
> + int i, rc;
> +
> + dev->sgid_tbl = kzalloc(sizeof(union ib_gid) *
> + QEDR_MAX_SGID, GFP_KERNEL);
> + if (!dev->sgid_tbl)
> + return -ENOMEM;
> +
> + spin_lock_init(&dev->sgid_lock);
> +
> + /* Allocate Status blocks for CNQ */
> + dev->sb_array = kcalloc(dev->num_cnq, sizeof(*dev->sb_array),
> + GFP_KERNEL);
> + if (!dev->sb_array) {
> + rc = -ENOMEM;
> + goto err1;
> + }
> +
> + dev->cnq_arr

[PATCH 4/4] netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions

2016-09-13 Thread Pablo Neira Ayuso

From: Gao Feng 

When memory is exhausted, nfct_seqadj_ext_add may fail to add the
synproxy and seqadj extensions. The function nf_ct_seqadj_init doesn't
check whether nfct_seqadj returned a valid seqadj pointer.

Now drop the packet directly when adding the seqadj extension fails, to
avoid dereferencing a NULL pointer in nf_ct_seqadj_init from
init_conntrack().

Signed-off-by: Gao Feng 
Signed-off-by: Pablo Neira Ayuso 
---
 include/net/netfilter/nf_conntrack_synproxy.h | 14 ++
 net/netfilter/nf_conntrack_core.c |  6 +++---
 net/netfilter/nf_nat_core.c   |  3 ++-
 3 files changed, 19 insertions(+), 4 deletions(-)

diff --git a/include/net/netfilter/nf_conntrack_synproxy.h 
b/include/net/netfilter/nf_conntrack_synproxy.h
index 6793614..e693731 100644
--- a/include/net/netfilter/nf_conntrack_synproxy.h
+++ b/include/net/netfilter/nf_conntrack_synproxy.h
@@ -27,6 +27,20 @@ static inline struct nf_conn_synproxy 
*nfct_synproxy_ext_add(struct nf_conn *ct)
 #endif
 }
 
+static inline bool nf_ct_add_synproxy(struct nf_conn *ct,
+ const struct nf_conn *tmpl)
+{
+   if (tmpl && nfct_synproxy(tmpl)) {
+   if (!nfct_seqadj_ext_add(ct))
+   return false;
+
+   if (!nfct_synproxy_ext_add(ct))
+   return false;
+   }
+
+   return true;
+}
+
 struct synproxy_stats {
unsigned intsyn_received;
unsigned intcookie_invalid;
diff --git a/net/netfilter/nf_conntrack_core.c 
b/net/netfilter/nf_conntrack_core.c
index dd2c43a..9934b0c 100644
--- a/net/netfilter/nf_conntrack_core.c
+++ b/net/netfilter/nf_conntrack_core.c
@@ -1035,9 +1035,9 @@ init_conntrack(struct net *net, struct nf_conn *tmpl,
if (IS_ERR(ct))
return (struct nf_conntrack_tuple_hash *)ct;
 
-   if (tmpl && nfct_synproxy(tmpl)) {
-   nfct_seqadj_ext_add(ct);
-   nfct_synproxy_ext_add(ct);
+   if (!nf_ct_add_synproxy(ct, tmpl)) {
+   nf_conntrack_free(ct);
+   return ERR_PTR(-ENOMEM);
}
 
timeout_ext = tmpl ? nf_ct_timeout_find(tmpl) : NULL;
diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index 19c081e..ecee105 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -441,7 +441,8 @@ nf_nat_setup_info(struct nf_conn *ct,
ct->status |= IPS_DST_NAT;
 
if (nfct_help(ct))
-   nfct_seqadj_ext_add(ct);
+   if (!nfct_seqadj_ext_add(ct))
+   return NF_DROP;
}
 
if (maniptype == NF_NAT_MANIP_SRC) {
-- 
2.1.4

[PATCH 3/4] netfilter: nf_nat: handle NF_DROP from nfnetlink_parse_nat_setup()

2016-09-13 Thread Pablo Neira Ayuso

nf_nat_setup_info() returns NF_* verdicts, so convert them to error
codes, which is what ctnetlink expects. This has been overlooked without
having any impact, since nf_nat_setup_info() has always returned
NF_ACCEPT so far. Since 870190a9ec90 ("netfilter: nat: convert nat bysrc
hash to rhashtable"), this is a problem.

Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_nat_core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_nat_core.c b/net/netfilter/nf_nat_core.c
index de31818..19c081e 100644
--- a/net/netfilter/nf_nat_core.c
+++ b/net/netfilter/nf_nat_core.c
@@ -807,7 +807,7 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
if (err < 0)
return err;
 
-   return nf_nat_setup_info(ct, &range, manip);
+   return nf_nat_setup_info(ct, &range, manip) == NF_DROP ? -ENOMEM : 0;
 }
 #else
 static int
-- 
2.1.4

[PATCH 2/4] netfilter: nft_chain_route: re-route before skb is queued to userspace

2016-09-13 Thread Pablo Neira Ayuso

From: Liping Zhang 

Imagine the following situation: the user adds the following nft rules
and queues the packets to userspace for further checking:
  # ip rule add fwmark 0x0/0x1 lookup eth0
  # ip rule add fwmark 0x1/0x1 lookup eth1
  # nft add table filter
  # nft add chain filter output {type route hook output priority 0 \;}
  # nft add rule filter output mark set 0x1
  # nft add rule filter output queue num 0

But after we reinject the skbuff, the packet will be sent via the
wrong route, i.e. in this case, the packet will be routed via the eth0
table, not the eth1 table, because we skip the re-route when the verdict
is NF_QUEUE, even if the mark was changed.

Actually, we should not touch the sk_buff if the verdict is NF_DROP or
NF_STOLEN, and when the re-route fails, return NF_DROP with an error code.
This is consistent with the mangle table in iptables.

Signed-off-by: Liping Zhang 
Signed-off-by: Pablo Neira Ayuso 
---
 net/ipv4/netfilter/nft_chain_route_ipv4.c | 11 +++
 net/ipv6/netfilter/nft_chain_route_ipv6.c | 10 +++---
 2 files changed, 14 insertions(+), 7 deletions(-)

diff --git a/net/ipv4/netfilter/nft_chain_route_ipv4.c 
b/net/ipv4/netfilter/nft_chain_route_ipv4.c
index 2375b0a..30493be 100644
--- a/net/ipv4/netfilter/nft_chain_route_ipv4.c
+++ b/net/ipv4/netfilter/nft_chain_route_ipv4.c
@@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv,
__be32 saddr, daddr;
u_int8_t tos;
const struct iphdr *iph;
+   int err;
 
/* root is playing with raw sockets. */
if (skb->len < sizeof(struct iphdr) ||
@@ -46,15 +47,17 @@ static unsigned int nf_route_table_hook(void *priv,
tos = iph->tos;
 
ret = nft_do_chain(&pkt, priv);
-   if (ret != NF_DROP && ret != NF_QUEUE) {
+   if (ret != NF_DROP && ret != NF_STOLEN) {
iph = ip_hdr(skb);
 
if (iph->saddr != saddr ||
iph->daddr != daddr ||
skb->mark != mark ||
-   iph->tos != tos)
-   if (ip_route_me_harder(state->net, skb, RTN_UNSPEC))
-   ret = NF_DROP;
+   iph->tos != tos) {
+   err = ip_route_me_harder(state->net, skb, RTN_UNSPEC);
+   if (err < 0)
+   ret = NF_DROP_ERR(err);
+   }
}
return ret;
 }
diff --git a/net/ipv6/netfilter/nft_chain_route_ipv6.c 
b/net/ipv6/netfilter/nft_chain_route_ipv6.c
index 71d995f..2535223 100644
--- a/net/ipv6/netfilter/nft_chain_route_ipv6.c
+++ b/net/ipv6/netfilter/nft_chain_route_ipv6.c
@@ -31,6 +31,7 @@ static unsigned int nf_route_table_hook(void *priv,
struct in6_addr saddr, daddr;
u_int8_t hop_limit;
u32 mark, flowlabel;
+   int err;
 
/* malformed packet, drop it */
if (nft_set_pktinfo_ipv6(&pkt, skb, state) < 0)
@@ -46,13 +47,16 @@ static unsigned int nf_route_table_hook(void *priv,
flowlabel = *((u32 *)ipv6_hdr(skb));
 
ret = nft_do_chain(&pkt, priv);
-   if (ret != NF_DROP && ret != NF_QUEUE &&
+   if (ret != NF_DROP && ret != NF_STOLEN &&
(memcmp(&ipv6_hdr(skb)->saddr, &saddr, sizeof(saddr)) ||
 memcmp(&ipv6_hdr(skb)->daddr, &daddr, sizeof(daddr)) ||
 skb->mark != mark ||
 ipv6_hdr(skb)->hop_limit != hop_limit ||
-flowlabel != *((u_int32_t *)ipv6_hdr(skb
-   return ip6_route_me_harder(state->net, skb) == 0 ? ret : 
NF_DROP;
+flowlabel != *((u_int32_t *)ipv6_hdr(skb {
+   err = ip6_route_me_harder(state->net, skb);
+   if (err < 0)
+   ret = NF_DROP_ERR(err);
+   }
 
return ret;
 }
-- 
2.1.4

[PATCH 1/4] netfilter: nf_tables_trace: fix endiness when dump chain policy

2016-09-13 Thread Pablo Neira Ayuso

From: Liping Zhang 

NFTA_TRACE_POLICY attribute is big endian, but we forget to call
htonl to convert it. Fortunately, this attribute is parsed as big
endian in libnftnl.

Signed-off-by: Liping Zhang 
Signed-off-by: Pablo Neira Ayuso 
---
 net/netfilter/nf_tables_trace.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/nf_tables_trace.c b/net/netfilter/nf_tables_trace.c
index 39eb1cc..fa24a5b 100644
--- a/net/netfilter/nf_tables_trace.c
+++ b/net/netfilter/nf_tables_trace.c
@@ -237,7 +237,7 @@ void nft_trace_notify(struct nft_traceinfo *info)
break;
case NFT_TRACETYPE_POLICY:
if (nla_put_be32(skb, NFTA_TRACE_POLICY,
-info->basechain->policy))
+htonl(info->basechain->policy)))
goto nla_put_failure;
break;
}
-- 
2.1.4

[PATCH 0/4] Netfilter fixes for net

2016-09-13 Thread Pablo Neira Ayuso

Hi David,

The following patchset contains Netfilter fixes for your net tree,
they are:

1) Endianness fix for the new nf_tables netlink trace infrastructure,
   NFTA_TRACE_POLICY endianness was not correct, patch from Liping Zhang.

2) Fix broken re-route after userspace queueing in nf_tables route
   chain. This patch is large but it is simple since it is just getting
   this code in sync with iptable_mangle. Also from Liping.

3) NAT mangling via ctnetlink lies to userspace when nf_nat_setup_info()
   fails to set up the NAT conntrack extension. This problem has been
   there since the beginning, but it can now show up after rhashtable
   conversion.

4) Fix possible NULL pointer dereference due to failures in allocating
   the synproxy and seqadj conntrack extensions, from Gao feng.

You can pull these changes from:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Thanks!



The following changes since commit 6e1ce3c3451291142a57c4f3f6f999a29fb5b3bc:

  af_unix: split 'u->readlock' into two: 'iolock' and 'bindlock' (2016-09-04 
13:29:29 -0700)

are available in the git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git HEAD

for you to fetch changes up to 4440a2ab3b9f40dddbe006331ef0659c76859296:

  netfilter: synproxy: Check oom when adding synproxy and seqadj ct extensions 
(2016-09-13 10:50:56 +0200)


Gao Feng (1):
  netfilter: synproxy: Check oom when adding synproxy and seqadj ct 
extensions

Liping Zhang (2):
  netfilter: nf_tables_trace: fix endiness when dump chain policy
  netfilter: nft_chain_route: re-route before skb is queued to userspace

Pablo Neira Ayuso (1):
  netfilter: nf_nat: handle NF_DROP from nfnetlink_parse_nat_setup()

 include/net/netfilter/nf_conntrack_synproxy.h | 14 ++
 net/ipv4/netfilter/nft_chain_route_ipv4.c | 11 +++
 net/ipv6/netfilter/nft_chain_route_ipv6.c | 10 +++---
 net/netfilter/nf_conntrack_core.c |  6 +++---
 net/netfilter/nf_nat_core.c   |  5 +++--
 net/netfilter/nf_tables_trace.c   |  2 +-
 6 files changed, 35 insertions(+), 13 deletions(-)

Re: [PATCH net-next 0/7] cxgb4: add support for offloading TC u32 filters

2016-09-13 Thread Rahul Lakkireddy

On Monday, September 09/12/16, 2016 at 13:42:33 +0530, Rahul Lakkireddy wrote:
> This series of patches add support to offload TC u32 filters onto
> Chelsio NICs.
> 
> Patch 1 moves current common filter code to separate files
> in order to provide a common api for performing packet classification
> and filtering in Chelsio NICs.
> 
> Patch 2 enables filters for normal NIC configuration and implements
> common api for setting and deleting filters.
> 
> Patch 3 provides a debugfs for dumping filter information.
> 
> Patches 4-7 add support for TC u32 offload via ndo_setup_tc.
> 
> Rahul Lakkireddy (7):
>   cxgb4: move common filter code to separate file
>   cxgb4: add common api support for configuring filters
>   cxgb4: add debugfs support to dump filter debug logs
>   cxgb4: add parser to translate u32 filters to internal spec
>   cxgb4: add support for setting u32 filters
>   cxgb4: add support for deleting u32 filters
>   cxgb4: add support for drop and redirect actions
> 
>  drivers/net/ethernet/chelsio/cxgb4/Makefile|2 +-
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |   30 +
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_debugfs.c |4 +-
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c  | 1086 
> 
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h  |   50 +
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c|  338 ++
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c  |  498 +
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h  |   57 +
>  .../ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h|  294 ++
>  drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h |   23 +-
>  drivers/net/ethernet/chelsio/cxgb4/t4_values.h |5 +-
>  11 files changed, 2104 insertions(+), 283 deletions(-)
>  create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
>  create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h
>  create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
>  create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h
>  create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h
> 
> -- 
> 2.5.3
> 

Thanks for the review Jiri.  Will send a v2 with necessary changes.

Thanks,
Rahul

RE: [RFC 00/11] QLogic RDMA Driver (qedr) RFC

2016-09-13 Thread Ram Amrani

Hi Jason,
I see that "include/uapi/rdma" contains API that is common.
The qedr_user.h, that I assume you are referring to, is a qedr specific API.
For example, we issue the ib_copy_to_udata() on structures defined in the file.
So per my understanding it is in place.

Thanks,
Ram

-Original Message-
From: Jason Gunthorpe [mailto:jguntho...@obsidianresearch.com] 
Sent: Monday, September 12, 2016 9:05 PM
To: Ram Amrani 
Cc: dledf...@redhat.com; David Miller ; Yuval Mintz 
; Ariel Elior ; Michal Kalderon 
; Rajesh Borundia ; 
linux-r...@vger.kernel.org; netdev 
Subject: Re: [RFC 00/11] QLogic RDMA Driver (qedr) RFC

On Mon, Sep 12, 2016 at 07:07:34PM +0300, Ram Amrani wrote:
>  drivers/infiniband/hw/qedr/main.c  |  907 ++
>  drivers/infiniband/hw/qedr/qedr.h  |  494 
>  drivers/infiniband/hw/qedr/qedr_cm.c   |  626 +
>  drivers/infiniband/hw/qedr/qedr_cm.h   |   61 +
>  drivers/infiniband/hw/qedr/qedr_hsi.h  |   56 +
>  drivers/infiniband/hw/qedr/qedr_hsi_rdma.h |  748 +
>  drivers/infiniband/hw/qedr/qedr_user.h |   80 +

We are requiring new uAPI headers are placed under include/uapi/rdma/, please 
coordinate with Leon on the path.

Jason

Re: [PATCH 0/9] Move runnable code (tests) from Documentation to selftests

2016-09-13 Thread Jani Nikula

On Sat, 10 Sep 2016, Jonathan Corbet  wrote:
> On Fri,  9 Sep 2016 16:22:41 -0600
> Shuah Khan  wrote:
>
>> Move runnable code (tests) from Documentation to selftests and update
>> Makefiles to work under selftests.
>> 
>> Jon Corbet and I discussed this in an email thread and as per that
>> discussion, this patch series moves all the tests that are under the
>> Documentation directory to selftests. There is more runnable code in
>> the form of examples and utils and that is going to be another patch
>> series. I moved just the tests and left the documentation files as is.
>
> I'm fine with the idea, but it looks like a couple of tweaks are needed,
> in particular to avoid leaving behind dangling references in
> Documentation/Makefile that cause build errors.
>
> I think the individual patches probably need a wider CC list as well.
> I'd use the get_maintainer script (or git) to see who has taken an
> interest in the individual tests and make sure they are aware of the
> move.

FWIW, I'm in favor of moving *all* the code away from Documentation, not
just tests. Essentially removing the CONFIG_BUILD_DOCSRC config option,
and reserving Documentation/Makefile for documentation build. After this
series, some of the remaining code belongs under samples, some under
tools.

We could make it possible to include the code samples from samples into
the Sphinx built documentation.

BR,
Jani.

-- 
Jani Nikula, Intel Open Source Technology Center

Re: icmpv6: issue with routing table entries from link local addresses

2016-09-13 Thread Andreas Hübner

On Mon, Sep 12, 2016 at 01:17:24PM -0600, David Ahern wrote:
> v4.4 and on there are fib6 tracepoints that show the lookup result.
> May provide some insights.
>
> perf record -a -e fib6:*
> perf script

Thanks for the hint, didn't know that something like this exists.

Following up on my earlier mail, I wasn't able to reproduce the issue
with more recent kernel versions. (tried 4.7)

So I guess someone must have fixed it somewhere between 3.16 and 4.7. :)
Okay, will check the git and probably try to backport it.

Again, sorry that I did not check immediately with the more recent kernel
versions. Wasn't expecting that much has changed in this area.

But my request for information with regard to the FIB data structure
still remains, since I'm curious about how it actually works.
(And I already spent some time trying to understand it.)

Thanks for your help, everyone!

Andreas

Re: [PATCH 3/3] net-next: dsa: add new driver for qca8xxx family

2016-09-13 Thread John Crispin

On 13/09/2016 03:23, Andrew Lunn wrote:
> So lets see if i have this right.
> 
> Port 0 has no internal phy.
> Port 1 has an internal PHY at MDIO address 0.
> Port 2 has an internal PHY at MDIO address 1.
> ...
> Port 5 has an internal PHY ad MDIO address 4.
> Port 6 has no internal PHY.

Hi Andrew

correct. port 0 is the cpu port. I initially thought that port6 can also
be used as the cpu port but there are various places in the datasheet
stating that the cpu port is 0. in some of the reference designs, port6
is wired to a 2nd gmac of the cpu and in those cases port 6 is then
hardwired to port 5 of the switch and called wan. right now the driver
does not support this feature. i have changed the code to always assume
that port 0 is the cpu port and will send a patch later to allow the
port5/6 wan port setup once the series got accepted.

> 
> This is why you have funky port numbers, and phy_to_port.

this is legacy code from the series Matthieu posted. i agree though that
its a bit dirty. Sergey already told me that the devicetree is also bad
because of this as the unit address of the device tree node and reg
property are not aligned.

> 
> I think it would be a lot cleaner to handle this in qca8k_phy_read()
> and qca8k_phy_write(). 

ok, i will simply subtract 1 from the phy_addr inside the mdio
callbacks. this would make the code more readable and make the DT
binding compliant with the ePAPR spec.

> 
> Also, the comment it a bit misleading. You are probing the PHY ID, not
> the switch ID. At least for the Marvell switches, different switches
> can have the same embedded PHY. It would be annoying to find there is
> another incompatible switch with the same PHY ID.

there is only an 8bit field inside the MASK_CTRL register (0x000) which
is 0x13. I've sent an email to QCA asking if this a unique identifier.

> Is the embedded PHY compatible with the at803x driver?

I've sent an email to QCA asking about this

John

RE: [RFC 02/11] Add RoCE driver framework

2016-09-13 Thread Ram Amrani

Thanks for your comments.
See my replies in line with [Ram].



-Original Message-
From: Mark Bloch [mailto:ma...@mellanox.com] 
Sent: Monday, September 12, 2016 9:44 PM
To: Ram Amrani ; dledf...@redhat.com; David Miller 

Cc: Yuval Mintz ; Ariel Elior ; 
Michal Kalderon ; Rajesh Borundia 
; linux-r...@vger.kernel.org; netdev 

Subject: Re: [RFC 02/11] Add RoCE driver framework


Hi Ram,

Just a few thoughts 

On 12/09/2016 19:07, Ram Amrani wrote:
> Adds a skeletal implementation of the qed* RoCE driver - basically the 
> ability to communicate with the qede driver and receive notifications 
> from it regarding various init/exit events.
> 
> Signed-off-by: Rajesh Borundia 
> Signed-off-by: Ram Amrani 
> ---
>  drivers/infiniband/Kconfig   |   2 +
>  drivers/infiniband/hw/Makefile   |   1 +
>  drivers/infiniband/hw/qedr/Kconfig   |   7 +
>  drivers/infiniband/hw/qedr/Makefile  |   3 +
>  drivers/infiniband/hw/qedr/main.c| 293 +
>  drivers/infiniband/hw/qedr/qedr.h|  60 ++
>  drivers/net/ethernet/qlogic/qede/Makefile|   1 +
>  drivers/net/ethernet/qlogic/qede/qede.h  |   9 +
>  drivers/net/ethernet/qlogic/qede/qede_main.c |  35 ++-  
> drivers/net/ethernet/qlogic/qede/qede_roce.c | 309 +++
>  include/linux/qed/qed_if.h   |   3 +-
>  include/linux/qed/qede_roce.h|  88 
>  include/uapi/linux/pci_regs.h|   3 +
>  13 files changed, 803 insertions(+), 11 deletions(-)  create mode 
> 100644 drivers/infiniband/hw/qedr/Kconfig
>  create mode 100644 drivers/infiniband/hw/qedr/Makefile
>  create mode 100644 drivers/infiniband/hw/qedr/main.c  create mode 
> 100644 drivers/infiniband/hw/qedr/qedr.h  create mode 100644 
> drivers/net/ethernet/qlogic/qede/qede_roce.c
>  create mode 100644 include/linux/qed/qede_roce.h

[SNIP]

> +
> +MODULE_DESCRIPTION("QLogic 40G/100G ROCE Driver"); 
> +MODULE_AUTHOR("QLogic Corporation"); MODULE_LICENSE("Dual BSD/GPL"); 
> +MODULE_VERSION(QEDR_MODULE_VERSION);
> +
> +uint debug;
> +module_param(debug, uint, 0);
> +MODULE_PARM_DESC(debug, "Default debug msglevel");

Why are you adding this as a module parameter? 
[Ram] Yuval commented on this in a previous e-mail


> +static LIST_HEAD(qedr_dev_list);
> +static DEFINE_SPINLOCK(qedr_devlist_lock);
> +

You already have a qedr_dev_list mutex in the qede_roce.c file, why do you need 
this spinlock as well?

 [Ram] qedr_devlist_lock - a static (local) list of qedr devices maintained by 
qedr, protected by spinlock. Not in use in the current patches.
qedr_dev_list_lock (with '_') - a static (local) list of qedr devices 
maintained by qede, protected by mutex.
We'll consider removing the first as it is currently not used and/or rename 
them to be more distinct.



> +void qedr_ib_dispatch_event(struct qedr_dev *dev, u8 port_num,
> + enum ib_event_type type)
> +{
> + struct ib_event ibev;
> +
> + ibev.device = &dev->ibdev;
> + ibev.element.port_num = port_num;
> + ibev.event = type;
> +
> + ib_dispatch_event(&ibev);
> +}
> +
> +static enum rdma_link_layer qedr_link_layer(struct ib_device *device,
> + u8 port_num)
> +{
> + return IB_LINK_LAYER_ETHERNET;
> +}
> +
> +static int qedr_register_device(struct qedr_dev *dev) {
> + strlcpy(dev->ibdev.name, "qedr%d", IB_DEVICE_NAME_MAX);
> +
> + memcpy(dev->ibdev.node_desc, QEDR_NODE_DESC, sizeof(QEDR_NODE_DESC));
> + dev->ibdev.owner = THIS_MODULE;
> +
> + dev->ibdev.get_link_layer = qedr_link_layer;
> +
> + return 0;
> +}
> +
> +/* QEDR sysfs interface */
> +static ssize_t show_rev(struct device *device, struct device_attribute *attr,
> + char *buf)
> +{
> + struct qedr_dev *dev = dev_get_drvdata(device);
> +
> + return scnprintf(buf, PAGE_SIZE, "0x%x\n", dev->pdev->vendor); }
> +
> +static ssize_t show_fw_ver(struct device *device, struct device_attribute 
> *attr,
> +char *buf)
> +{
> + return scnprintf(buf, PAGE_SIZE, "%s\n", "FW_VER_TO_ADD"); }

Ira Weiny has added a generic way to expose firmware versions in the rdma 
stack, can you have please have a look at 
c73428230d98d1352bcc69cd8306c292a85e1e42 and see how he converted the mlx5_ib 
module to use it.
[Ram] This is replaced in patch 0004 with the approach you describe. I'll see
if I can move it into this patch to avoid confusion.

> +static ssize_t show_hca_type(struct device *device,
> +  struct device_attribute *attr, char *buf) {
> + return scnprintf(buf, PAGE_SIZE, "%s\n", "HCA_TYPE_TO_SET"); }
> +
> +static DEVICE_ATTR(hw_rev, S_IRUGO, show_rev, NULL); static 
> +DEVICE_ATTR(fw_ver, S_IRUGO, show_fw_ver, NULL); static 
> +DEVICE_ATTR(hca_type, S_IRUGO, show_hca_type, NULL);
> +
> +static struct device_attribute *qedr_attributes[] = {
> + &dev_attr_hw_rev,
>

[PATCH net-next] alx: fix error handling in __alx_open

2016-09-13 Thread Tobias Regnery

In commit 9ee7b683ea63 we moved the enablement of msi interrupts earlier in
alx_init_intr. If there is an error in alx_alloc_rings, __alx_open returns
with an error but msi (or msi-x) interrupts stay enabled. Add a new error
label to disable msi (or msi-x) interrupts.

Fixes: 9ee7b683ea63 ("alx: refactor msi enablement and disablement")
Signed-off-by: Tobias Regnery 
---
 drivers/net/ethernet/atheros/alx/main.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/atheros/alx/main.c 
b/drivers/net/ethernet/atheros/alx/main.c
index 9887cee434dd..c0f84b73574d 100644
--- a/drivers/net/ethernet/atheros/alx/main.c
+++ b/drivers/net/ethernet/atheros/alx/main.c
@@ -1028,7 +1028,7 @@ static int __alx_open(struct alx_priv *alx, bool resume)
 
err = alx_alloc_rings(alx);
if (err)
-   return err;
+   goto out_disable_adv_intr;
 
alx_configure(alx);
 
@@ -1049,6 +1049,8 @@ static int __alx_open(struct alx_priv *alx, bool resume)
 
 out_free_rings:
alx_free_rings(alx);
+out_disable_adv_intr:
+   alx_disable_advanced_intr(alx);
return err;
 }
 
-- 
2.7.4

Re: [RFC 02/11] Add RoCE driver framework

2016-09-13 Thread Leon Romanovsky

On Tue, Sep 13, 2016 at 07:18:01AM +, Mintz, Yuval wrote:
> >> >> +uint debug;
> >> >> +module_param(debug, uint, 0);
> > >>> +MODULE_PARM_DESC(debug, "Default debug msglevel");
> >>
> >> >Why are you adding this as a module parameter?
> >>
> >>  I believe this is mostly to follow same line as qede which also defines
> > > 'debug' module parameter for allowing easy user control of debug
> > > prints [& specifically for probe prints, which can't be controlled
> > > otherwise].
>
> > Can you give us an example where dynamic debug and tracing infrastructures
> > are not enough?
>
> > AFAIK, most of these debug module parameters are legacy copy/paste
> > code which is useless in real life scenarios.
>
> Define 'enough'; Using dynamic debug you can provide all the necessary
> information and at an even better granularity that's achieved by suggested
> infrastructure,  but is harder for an end-user to use. Same goes for tracing.
>
> The 'debug' option provides an easy grouping for prints related to a specific
> area in the driver.

It is hard to agree with you that a user who knows how to load modules
with parameters won't succeed in enabling debug prints.

In addition, a global increase in debug level for the whole driver will create
a printk storm in dmesg and add nothing to debuggability.

Thanks


signature.asc
Description: PGP signature

Re: [RFC 00/11] QLogic RDMA Driver (qedr) RFC

2016-09-13 Thread Leon Romanovsky

On Tue, Sep 13, 2016 at 08:44:06AM +, Ram Amrani wrote:
> Hi Jason,
> I see that "include/uapi/rdma" contains API that is common.
> The qedr_user.h, that I assume you are referring to, is a qedr specific API.
> For example, we issue the ib_copy_to_udata() on structures defined in the 
> file.
> So per my understanding it is in place.

1. It would be great if you avoided top-posting.
2. In the near future, these custom UAPI header files will be placed
under include/uapi/rdma/provider/.

>
> Thanks,
> Ram
>
>
> -Original Message-
> From: Jason Gunthorpe [mailto:jguntho...@obsidianresearch.com]
> Sent: Monday, September 12, 2016 9:05 PM
> To: Ram Amrani 
> Cc: dledf...@redhat.com; David Miller ; Yuval Mintz 
> ; Ariel Elior ; Michal 
> Kalderon ; Rajesh Borundia 
> ; linux-r...@vger.kernel.org; netdev 
> 
> Subject: Re: [RFC 00/11] QLogic RDMA Driver (qedr) RFC
>
> On Mon, Sep 12, 2016 at 07:07:34PM +0300, Ram Amrani wrote:
> >  drivers/infiniband/hw/qedr/main.c  |  907 ++
> >  drivers/infiniband/hw/qedr/qedr.h  |  494 
> >  drivers/infiniband/hw/qedr/qedr_cm.c   |  626 +
> >  drivers/infiniband/hw/qedr/qedr_cm.h   |   61 +
> >  drivers/infiniband/hw/qedr/qedr_hsi.h  |   56 +
> >  drivers/infiniband/hw/qedr/qedr_hsi_rdma.h |  748 +
> >  drivers/infiniband/hw/qedr/qedr_user.h |   80 +
>
> We are requiring new uAPI headers are placed under include/uapi/rdma/, please 
> coordinate with Leon on the path.
>
> Jason
> --
> To unsubscribe from this list: send the line "unsubscribe linux-rdma" in
> the body of a message to majord...@vger.kernel.org
> More majordomo info at  http://vger.kernel.org/majordomo-info.html


signature.asc
Description: PGP signature

Re: [RFC 02/11] Add RoCE driver framework

2016-09-13 Thread Leon Romanovsky

On Tue, Sep 13, 2016 at 09:22:04AM +, Ram Amrani wrote:
> Thanks for your comments.
> See my replies in line with [Ram].

Please configure your email client
https://www.kernel.org/doc/Documentation/email-clients.txt

Thanks


signature.asc
Description: PGP signature

Re: [RFC 00/11] QLogic RDMA Driver (qedr) RFC

2016-09-13 Thread Leon Romanovsky

On Tue, Sep 13, 2016 at 06:48:00AM +, Mintz, Yuval wrote:
> >> While this might work, I personally dislike it as I find it
> >> counter-intuitive when going over the code -
> >> I don't expect driver to locally modify the inclusion path.
> >> Besides, we're going to [eventually] a whole suite of drivers based
> >> on the qed module, some of which would reside under drivers/scsi;
> >> Not sure it's best to have 3 or 4 different drivers privately include the
> >> same directory under a different subsystem.
>
> > I agree with you that orcdma's way can be valuable for small drivers.
>
> > Orcmda has small shared headers set and doesn't need to change them rapidly
> > to support different devices.
>
> > I thought to place them in similar directory to include/soc/* and remove
> > from include/linux/. We have include/rdma/ and it looks like a good
> > candidate.
>
> I'm perfectly fine with relocating those to a different directory under 
> include/,
> although using 'rdma' doesn't sound like a good fit [as the headers would be
> included by ethernet, scsi and rdma drivers].
> Are there good existing alternatives?

There is nothing that I can name.

>
> Regardless, I don't believe this should be part of the initial submission,
> as it would involve in relocating existing networking headers as well.
> I think we can move those at leisure later on.

Sure, it is a good time and context to discuss, but orthogonal to this
submission.

>
> [We're in the middle of transitioning our e-mails from qlogic -> cavium,
> so sorry if things become corrupted]


signature.asc
Description: PGP signature

[PATCH net-next v2 0/5] cxgb4: add support for offloading TC u32 filters

2016-09-13 Thread Rahul Lakkireddy

This series of patches add support to offload TC u32 filters onto
Chelsio NICs.

Patch 1 moves current common filter code to separate files
in order to provide a common api for performing packet classification
and filtering in Chelsio NICs.

Patch 2 enables filters for normal NIC configuration and implements
common api for setting and deleting filters.

Patches 3-5 add support for TC u32 offload via ndo_setup_tc.

---
v2:
Based on review and suggestions from Jiri Pirko :
- Replaced macros S and U with appropriate static helper functions.
- Moved completion code for set and delete filters to respective
  functions cxgb4_set_filter() and cxgb4_del_filter().  Renamed the
  original functions to __cxgb4_set_filter() and __cxgb4_del_filter()
  in case synchronization is not required.
- Dropped debugfs patch.
- Merged code for inserting and deleting u32 filters into a single
  patch.
- Reworked and fixed bugs with traversing the actions list.
- Removed all unnecessary extra ().

Rahul Lakkireddy (5):
  cxgb4: move common filter code to separate file
  cxgb4: add common api support for configuring filters
  cxgb4: add parser to translate u32 filters to internal spec
  cxgb4: add support for offloading u32 filters
  cxgb4: add support for drop and redirect actions

 drivers/net/ethernet/chelsio/cxgb4/Makefile|   2 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |  29 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c  | 722 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h  |  48 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c| 338 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c  | 485 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h  |  57 ++
 .../ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h| 294 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h |  26 +-
 9 files changed, 1720 insertions(+), 281 deletions(-)
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h

-- 
2.5.3

[PATCH net-next v2 1/5] cxgb4: move common filter code to separate file

2016-09-13 Thread Rahul Lakkireddy

Move common filter code to separate files.  Also fix the following
checkpatch checks.

CHECK: Comparison to NULL could be written "!f->l2t"
+   if (f->l2t == NULL) {

CHECK: spaces preferred around that '/' (ctx:VxV)
+   fwr->len16_pkd = htonl(FW_WR_LEN16_V(sizeof(*fwr)/16));

Signed-off-by: Rahul Lakkireddy 
Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4/Makefile   |   2 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h|  23 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 274 ++
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h |  47 
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   | 264 +
 5 files changed, 346 insertions(+), 264 deletions(-)
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h

diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile 
b/drivers/net/ethernet/chelsio/cxgb4/Makefile
index 2461296..da88981 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/Makefile
+++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_CHELSIO_T4) += cxgb4.o
 
-cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o 
cxgb4_uld.o sched.o
+cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o 
cxgb4_uld.o sched.o cxgb4_filter.o
 cxgb4-$(CONFIG_CHELSIO_T4_DCB) +=  cxgb4_dcb.o
 cxgb4-$(CONFIG_CHELSIO_T4_FCOE) +=  cxgb4_fcoe.o
 cxgb4-$(CONFIG_DEBUG_FS) += cxgb4_debugfs.o
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 4595569..275c4f0 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1041,6 +1041,29 @@ enum {
VLAN_REWRITE
 };
 
+/* Host shadow copy of ingress filter entry.  This is in host native format
+ * and doesn't match the ordering or bit order, etc. of the hardware of the
+ * firmware command.  The use of bit-field structure elements is purely to
+ * remind ourselves of the field size limitations and save memory in the case
+ * where the filter table is large.
+ */
+struct filter_entry {
+   /* Administrative fields for filter. */
+   u32 valid:1;/* filter allocated and valid */
+   u32 locked:1;   /* filter is administratively locked */
+
+   u32 pending:1;  /* filter action is pending firmware reply */
+   u32 smtidx:8;   /* Source MAC Table index for smac */
+   struct l2t_entry *l2t;  /* Layer Two Table entry for dmac */
+
+   /* The filter itself.  Most of this is a straight copy of information
+* provided by the extended ioctl().  Some fields are translated to
+* internal forms -- for instance the Ingress Queue ID passed in from
+* the ioctl() is translated into the Absolute Ingress Queue ID.
+*/
+   struct ch_filter_specification fs;
+};
+
 static inline int is_offload(const struct adapter *adap)
 {
return adap->params.offload;
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
new file mode 100644
index 000..e224bf5
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -0,0 +1,274 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2003-2016 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#include "cxgb4.h"
+#include "l2t.h"
+#include "t4fw_api.h"
+#include "cxgb4_filter.h"
+
+/* Delete the filter at a specified ind

[PATCH net-next v2 2/5] cxgb4: add common api support for configuring filters

2016-09-13 Thread Rahul Lakkireddy

Enable filters for non-offload configuration and add common api support
for setting and deleting filters in LE-TCAM region of the hardware.

IPv4 filters occupy one slot.  IPv6 filters occupy 4 slots and must
be on a 4-slot boundary.  IPv4 filters can not occupy a slot belonging
to IPv6 and the vice-versa is also true.

Filters are set and deleted asynchronously.  Use completion to wait
for reply from firmware in order to allow for synchronization if needed.

Signed-off-by: Rahul Lakkireddy 
Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h|   3 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c | 478 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.h |   1 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c   |  33 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_uld.h|  26 +-
 5 files changed, 510 insertions(+), 31 deletions(-)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 275c4f0..7c256a2 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -1054,7 +1054,10 @@ struct filter_entry {
 
u32 pending:1;  /* filter action is pending firmware reply */
u32 smtidx:8;   /* Source MAC Table index for smac */
+   struct filter_ctx *ctx; /* Caller's completion hook */
struct l2t_entry *l2t;  /* Layer Two Table entry for dmac */
+   struct net_device *dev; /* Associated net device */
+   u32 tid;/* This will store the actual tid */
 
/* The filter itself.  Most of this is a straight copy of information
 * provided by the extended ioctl().  Some fields are translated to
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
index e224bf5..53a47ad 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_filter.c
@@ -33,27 +33,165 @@
  */
 
 #include "cxgb4.h"
+#include "t4_regs.h"
 #include "l2t.h"
 #include "t4fw_api.h"
 #include "cxgb4_filter.h"
 
+static inline bool is_field_set(u32 val, u32 mask)
+{
+   return val || mask;
+}
+
+static inline bool unsupported(u32 conf, u32 conf_mask, u32 val, u32 mask)
+{
+   return !(conf & conf_mask) && is_field_set(val, mask);
+}
+
+/* Validate filter spec against configuration done on the card. */
+static int validate_filter(struct net_device *dev,
+  struct ch_filter_specification *fs)
+{
+   struct adapter *adapter = netdev2adap(dev);
+   u32 fconf, iconf;
+
+   /* Check for unconfigured fields being used. */
+   fconf = adapter->params.tp.vlan_pri_map;
+   iconf = adapter->params.tp.ingress_config;
+
+   if (unsupported(fconf, FCOE_F, fs->val.fcoe, fs->mask.fcoe) ||
+   unsupported(fconf, PORT_F, fs->val.iport, fs->mask.iport) ||
+   unsupported(fconf, TOS_F, fs->val.tos, fs->mask.tos) ||
+   unsupported(fconf, ETHERTYPE_F, fs->val.ethtype,
+   fs->mask.ethtype) ||
+   unsupported(fconf, MACMATCH_F, fs->val.macidx, fs->mask.macidx) ||
+   unsupported(fconf, MPSHITTYPE_F, fs->val.matchtype,
+   fs->mask.matchtype) ||
+   unsupported(fconf, FRAGMENTATION_F, fs->val.frag, fs->mask.frag) ||
+   unsupported(fconf, PROTOCOL_F, fs->val.proto, fs->mask.proto) ||
+   unsupported(fconf, VNIC_ID_F, fs->val.pfvf_vld,
+   fs->mask.pfvf_vld) ||
+   unsupported(fconf, VNIC_ID_F, fs->val.ovlan_vld,
+   fs->mask.ovlan_vld) ||
+   unsupported(fconf, VLAN_F, fs->val.ivlan_vld, fs->mask.ivlan_vld))
+   return -EOPNOTSUPP;
+
+   /* T4 inconveniently uses the same FT_VNIC_ID_W bits for both the Outer
+* VLAN Tag and PF/VF/VFvld fields based on VNIC_F being set
+* in TP_INGRESS_CONFIG.  Hence the somewhat crazy checks
+* below.  Additionally, since the T4 firmware interface also
+* carries that overlap, we need to translate any PF/VF
+* specification into that internal format below.
+*/
+   if (is_field_set(fs->val.pfvf_vld, fs->mask.pfvf_vld) &&
+   is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld))
+   return -EOPNOTSUPP;
+   if (unsupported(iconf, VNIC_F, fs->val.pfvf_vld, fs->mask.pfvf_vld) ||
+   (is_field_set(fs->val.ovlan_vld, fs->mask.ovlan_vld) &&
+(iconf & VNIC_F)))
+   return -EOPNOTSUPP;
+   if (fs->val.pf > 0x7 || fs->val.vf > 0x7f)
+   return -ERANGE;
+   fs->mask.pf &= 0x7;
+   fs->mask.vf &= 0x7f;
+
+   /* If the user is requesting that the filter action loop
+* matching packets back out one of our ports, make sure that
+* the egress port is in range.
+*/
+   if (fs->action == FILTER_SWITCH &&
+   fs->eport >=

[PATCH net-next v2 3/5] cxgb4: add parser to translate u32 filters to internal spec

2016-09-13 Thread Rahul Lakkireddy

Parse information sent by u32 into internal filter specification.
Add support for parsing several fields in IPv4, IPv6, TCP, and UDP.

Signed-off-by: Rahul Lakkireddy 
Signed-off-by: Hariprasad Shenai 
---
 .../ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h| 282 +
 1 file changed, 282 insertions(+)
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h
new file mode 100644
index 000..261aa4a
--- /dev/null
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h
@@ -0,0 +1,282 @@
+/*
+ * This file is part of the Chelsio T4 Ethernet driver for Linux.
+ *
+ * Copyright (c) 2016 Chelsio Communications, Inc. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses.  You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * OpenIB.org BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ *  - Redistributions of source code must retain the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer.
+ *
+ *  - Redistributions in binary form must reproduce the above
+ *copyright notice, this list of conditions and the following
+ *disclaimer in the documentation and/or other materials
+ *provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __CXGB4_TC_U32_PARSE_H
+#define __CXGB4_TC_U32_PARSE_H
+
+struct cxgb4_match_field {
+   int off; /* Offset from the beginning of the header to match */
+   /* Fill the value/mask pair in the spec if matched */
+   int (*val)(struct ch_filter_specification *f, u32 val, u32 mask);
+};
+
+/* IPv4 match fields */
+static inline int cxgb4_fill_ipv4_tos(struct ch_filter_specification *f,
+ u32 val, u32 mask)
+{
+   f->val.tos  = (ntohl(val)  >> 16) & 0x00FF;
+   f->mask.tos = (ntohl(mask) >> 16) & 0x00FF;
+
+   return 0;
+}
+
+static inline int cxgb4_fill_ipv4_frag(struct ch_filter_specification *f,
+  u32 val, u32 mask)
+{
+   u8 frag_val;
+   u32 mask_val;
+
+   frag_val = (ntohl(val) >> 13) & 0x0007;
+   mask_val = ntohl(mask) & 0x0000FFFF;
+
+   if (frag_val == 0x1 && mask_val != 0x3FFF) { /* MF set */
+   f->val.frag = 1;
+   f->mask.frag = 1;
+   } else if (frag_val == 0x2 && mask_val != 0x3FFF) { /* DF set */
+   f->val.frag = 0;
+   f->mask.frag = 1;
+   } else {
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
+static inline int cxgb4_fill_ipv4_proto(struct ch_filter_specification *f,
+   u32 val, u32 mask)
+{
+   f->val.proto  = (ntohl(val)  >> 16) & 0x00FF;
+   f->mask.proto = (ntohl(mask) >> 16) & 0x00FF;
+
+   return 0;
+}
+
+static inline int cxgb4_fill_ipv4_src_ip(struct ch_filter_specification *f,
+u32 val, u32 mask)
+{
+   memcpy(&f->val.fip[0],  &val,  sizeof(u32));
+   memcpy(&f->mask.fip[0], &mask, sizeof(u32));
+
+   return 0;
+}
+
+static inline int cxgb4_fill_ipv4_dst_ip(struct ch_filter_specification *f,
+u32 val, u32 mask)
+{
+   memcpy(&f->val.lip[0],  &val,  sizeof(u32));
+   memcpy(&f->mask.lip[0], &mask, sizeof(u32));
+
+   return 0;
+}
+
+static const struct cxgb4_match_field cxgb4_ipv4_fields[] = {
+   { .off = 0,  .val = cxgb4_fill_ipv4_tos },
+   { .off = 4,  .val = cxgb4_fill_ipv4_frag },
+   { .off = 8,  .val = cxgb4_fill_ipv4_proto },
+   { .off = 12, .val = cxgb4_fill_ipv4_src_ip },
+   { .off = 16, .val = cxgb4_fill_ipv4_dst_ip },
+   { .val = NULL }
+};
+
+/* IPv6 match fields */
+static inline int cxgb4_fill_ipv6_tos(struct ch_filter_specification *f,
+ u32 val, u32 mask)
+{
+   f->val.tos  = (ntohl(val)  >> 20) & 0x00FF;
+   f->mask.tos = (ntohl(mask) >> 20) & 0x00FF;
+
+   return 0;
+}
+
+static inline int cxgb4_fill_ipv6_proto(struct ch_filter_specification *f,
+

[PATCH net-next v2 4/5] cxgb4: add support for offloading u32 filters

2016-09-13 Thread Rahul Lakkireddy

Add support for offloading u32 filter onto hardware.  Links are stored
in a jump table to perform necessary jumps to match TCP/UDP header.
When inserting rules in the linked bucket, the TCP/UDP match fields
in the corresponding entry of the jump table are appended to the filter
rule before insertion.  If a link is deleted, then all corresponding
filters associated with the link are also deleted.  Also enable
hardware tc offload as a supported feature.

Signed-off-by: Rahul Lakkireddy 
Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4/Makefile|   2 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4.h |   3 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c|  41 +-
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c  | 414 +
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h  |  57 +++
 .../ethernet/chelsio/cxgb4/cxgb4_tc_u32_parse.h|  12 +
 6 files changed, 527 insertions(+), 2 deletions(-)
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
 create mode 100644 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.h

diff --git a/drivers/net/ethernet/chelsio/cxgb4/Makefile 
b/drivers/net/ethernet/chelsio/cxgb4/Makefile
index da88981..c6b71f6 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/Makefile
+++ b/drivers/net/ethernet/chelsio/cxgb4/Makefile
@@ -4,7 +4,7 @@
 
 obj-$(CONFIG_CHELSIO_T4) += cxgb4.o
 
-cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o 
cxgb4_uld.o sched.o cxgb4_filter.o
+cxgb4-objs := cxgb4_main.o l2t.o t4_hw.o sge.o clip_tbl.o cxgb4_ethtool.o 
cxgb4_uld.o sched.o cxgb4_filter.o cxgb4_tc_u32.o
 cxgb4-$(CONFIG_CHELSIO_T4_DCB) +=  cxgb4_dcb.o
 cxgb4-$(CONFIG_CHELSIO_T4_FCOE) +=  cxgb4_fcoe.o
 cxgb4-$(CONFIG_DEBUG_FS) += cxgb4_debugfs.o
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
index 7c256a2..a969208 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4.h
@@ -867,6 +867,9 @@ struct adapter {
 
spinlock_t stats_lock;
spinlock_t win0_lock cacheline_aligned_in_smp;
+
+   /* TC u32 offload */
+   struct cxgb4_tc_u32_table *tc_u32;
 };
 
 /* Support for "sched-class" command to allow a TX Scheduling Class to be
diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
index 8909f96..a22dab9 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_main.c
@@ -78,6 +78,7 @@
 #include "clip_tbl.h"
 #include "l2t.h"
 #include "sched.h"
+#include "cxgb4_tc_u32.h"
 
 char cxgb4_driver_name[] = KBUILD_MODNAME;
 
@@ -3027,6 +3028,35 @@ static int cxgb_set_tx_maxrate(struct net_device *dev, 
int index, u32 rate)
return err;
 }
 
+int cxgb_setup_tc(struct net_device *dev, u32 handle, __be16 proto,
+ struct tc_to_netdev *tc)
+{
+   struct adapter *adap = netdev2adap(dev);
+   struct port_info *pi = netdev2pinfo(dev);
+
+   if (!(adap->flags & FULL_INIT_DONE)) {
+   dev_err(adap->pdev_dev,
+   "Failed to setup tc on port %d. Link Down?\n",
+   pi->port_id);
+   return -EINVAL;
+   }
+
+   if (TC_H_MAJ(handle) == TC_H_MAJ(TC_H_INGRESS) &&
+   tc->type == TC_SETUP_CLSU32) {
+   switch (tc->cls_u32->command) {
+   case TC_CLSU32_NEW_KNODE:
+   case TC_CLSU32_REPLACE_KNODE:
+   return cxgb4_config_knode(dev, proto, tc->cls_u32);
+   case TC_CLSU32_DELETE_KNODE:
+   return cxgb4_delete_knode(dev, proto, tc->cls_u32);
+   default:
+   return -EOPNOTSUPP;
+   }
+   }
+
+   return -EOPNOTSUPP;
+}
+
 static const struct net_device_ops cxgb4_netdev_ops = {
.ndo_open = cxgb_open,
.ndo_stop = cxgb_close,
@@ -3050,6 +3080,7 @@ static const struct net_device_ops cxgb4_netdev_ops = {
.ndo_busy_poll= cxgb_busy_poll,
 #endif
.ndo_set_tx_maxrate   = cxgb_set_tx_maxrate,
+   .ndo_setup_tc = cxgb_setup_tc,
 };
 
 #ifdef CONFIG_PCI_IOV
@@ -4781,6 +4812,7 @@ static void free_some_resources(struct adapter *adapter)
t4_free_mem(adapter->l2t);
t4_cleanup_sched(adapter);
t4_free_mem(adapter->tids.tid_tab);
+   cxgb4_cleanup_tc_u32(adapter);
kfree(adapter->sge.egr_map);
kfree(adapter->sge.ingr_map);
kfree(adapter->sge.starving_fl);
@@ -5125,7 +5157,8 @@ static int init_one(struct pci_dev *pdev, const struct 
pci_device_id *ent)
netdev->hw_features = NETIF_F_SG | TSO_FLAGS |
NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM |
NETIF_F_RXCSUM | NETIF_F_RXHASH |
-   NETIF_F_HW_VLAN_CTAG_TX | NETIF_F_HW_VLAN_CTAG_RX;
+   NETIF_F_HW_VLAN_CTAG_TX

[PATCH net-next v2 5/5] cxgb4: add support for drop and redirect actions

2016-09-13 Thread Rahul Lakkireddy

Add support for dropping matched packets in hardware.  Also add support
for re-directing matched packets to a specified port in hardware.

Signed-off-by: Rahul Lakkireddy 
Signed-off-by: Hariprasad Shenai 
---
 drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c | 71 +++
 1 file changed, 71 insertions(+)

diff --git a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c 
b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
index b9fb0af..297f4f3 100644
--- a/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
+++ b/drivers/net/ethernet/chelsio/cxgb4/cxgb4_tc_u32.c
@@ -32,6 +32,9 @@
  * SOFTWARE.
  */
 
+#include 
+#include 
+
 #include "cxgb4.h"
 #include "cxgb4_tc_u32_parse.h"
 #include "cxgb4_tc_u32.h"
@@ -82,6 +85,67 @@ static int fill_match_fields(struct adapter *adap,
return 0;
 }
 
+/* Fill ch_filter_specification with parsed action. */
+static int fill_action_fields(struct adapter *adap,
+ struct ch_filter_specification *fs,
+ struct tc_cls_u32_offload *cls)
+{
+   const struct tc_action *a;
+   struct tcf_exts *exts;
+   LIST_HEAD(actions);
+   unsigned int num_actions = 0;
+
+   exts = cls->knode.exts;
+   if (tc_no_actions(exts))
+   return -EINVAL;
+
+   tcf_exts_to_list(exts, &actions);
+   list_for_each_entry(a, &actions, list) {
+   /* Don't allow more than one action per rule. */
+   if (num_actions)
+   return -EINVAL;
+
+   /* Drop in hardware. */
+   if (is_tcf_gact_shot(a)) {
+   fs->action = FILTER_DROP;
+   num_actions++;
+   continue;
+   }
+
+   /* Re-direct to specified port in hardware. */
+   if (is_tcf_mirred_redirect(a)) {
+   struct net_device *n_dev;
+   unsigned int i, index;
+   bool found = false;
+
+   index = tcf_mirred_ifindex(a);
+   for_each_port(adap, i) {
+   n_dev = adap->port[i];
+   if (index == n_dev->ifindex) {
+   fs->action = FILTER_SWITCH;
+   fs->eport = i;
+   found = true;
+   break;
+   }
+   }
+
+   /* Interface doesn't belong to any port of
+* the underlying hardware.
+*/
+   if (!found)
+   return -EINVAL;
+
+   num_actions++;
+   continue;
+   }
+
+   /* Un-supported action. */
+   return -EINVAL;
+   }
+
+   return 0;
+}
+
 int cxgb4_config_knode(struct net_device *dev, __be16 protocol,
   struct tc_cls_u32_offload *cls)
 {
@@ -234,6 +298,13 @@ int cxgb4_config_knode(struct net_device *dev, __be16 
protocol,
if (ret)
goto out;
 
+   /* Fill ch_filter_specification action fields to be shipped to
+* hardware.
+*/
+   ret = fill_action_fields(adapter, &fs, cls);
+   if (ret)
+   goto out;
+
/* The filter spec has been completely built from the info
 * provided from u32.  We now set some default fields in the
 * spec for sanity.
-- 
2.5.3

Re: [PATCH 2/9] selftests: update filesystems Makefile to work under selftests

2016-09-13 Thread Michael Ellerman

Shuah Khan  writes:

> Update to work under selftests. dnotify_test will not be run as part of
> selftests suite and will not included in install targets. It can be built
> separately for now.
>
> Signed-off-by: Shuah Khan 
> ---
>  tools/testing/selftests/filesystems/Makefile | 10 ++
>  1 file changed, 6 insertions(+), 4 deletions(-)
>
> diff --git a/tools/testing/selftests/filesystems/Makefile 
> b/tools/testing/selftests/filesystems/Makefile
> index 883010c..f1dce5c 100644
> --- a/tools/testing/selftests/filesystems/Makefile
> +++ b/tools/testing/selftests/filesystems/Makefile
> @@ -1,5 +1,7 @@
> -# List of programs to build
> -hostprogs-y := dnotify_test
> +TEST_PROGS := dnotify_test
> +all: $(TEST_PROGS)
>  
> -# Tell kbuild to always build the programs
> -always := $(hostprogs-y)
> +include ../lib.mk
> +
> +clean:
> + rm -fr dnotify_test

That's a complete rewrite of the Makefile, so I don't think there's any
value in bringing its content across from Documentation.

Better IMHO would be to squash this with the previous patch, so we get a
working test under selftests in a single commit.

cheers

Re: [PATCH v5 0/6] Add eBPF hooks for cgroups

2016-09-13 Thread Pablo Neira Ayuso

Hi,

On Mon, Sep 12, 2016 at 06:12:09PM +0200, Daniel Mack wrote:
> This is v5 of the patch set to allow eBPF programs for network
> filtering and accounting to be attached to cgroups, so that they apply
> to all sockets of all tasks placed in that cgroup. The logic can also
> be extended for other cgroup-based eBPF logic.

1) This infrastructure can only be useful to systemd, or any similar
   orchestration daemon. Look, you can only apply filtering policies
   to processes that are launched by systemd, so this only works
   for server processes. For client processes this infrastructure is
   *racy*: you have to add new processes to the cgroup at runtime,
   so there will be a short window of time in which no filtering policy
   will be applied. For quality of service, this may be an acceptable
   race, but this is aiming to deploy a filtering policy.

2) This approach looks uninfrastructured to me. This provides a hook
   to push a bpf blob at a place in the stack that deploys a filtering
   policy that is not visible to others. We have interfaces that allows
   us to dump the filtering policy that is being applied, report events
   to enable cooperation between several processes with similar
   capabilities and so on.  For the XDP thing, this ability to push
   blobs may be fine as long as it will not interfere with the stack so
   we can provide an alternative to DPDK in Linux. For tracing, that's
   fine too since it is innocuous. And it is likely a good fit for other
   applications. But I don't think that is the case here.

> After chatting with Daniel Borkmann and Alexei off-list, we concluded
> that __dev_queue_xmit() is the place where the egress hooks should live
> when eBPF programs need access to the L2 bits of the skb.

3) This egress hook is coming very late, the only reason I find to
   place it at __dev_queue_xmit() is that bpf naturally works with
   layer 2 information in place. But this new hook is placed in
   _everyone's output path_ that only works for the very specific
   usecase I exposed above.

The main concern during the workshop was that a hook only for cgroups
is too specific, but this is actually even more specific than this.

I have nothing against systemd or the needs for more
programmability/flexibility in the stack, but I think this needs to
fulfill some requirements to fit into the infrastructure that we have
in the right way.

Re: [PATCH net 1/6] sctp: remove the unnecessary state check in sctp_outq_tail

2016-09-13 Thread Neil Horman

On Sat, Sep 10, 2016 at 12:03:53AM +0800, Xin Long wrote:
> > I don't know, I still don't feel safe about it.  I agree the socket lock 
> > keeps
> > the state from changing during a single transmission, which makes the use 
> > case
> > you are focused on correct.
> ok, :-)
> 
> >
> > That said, have you considered the retransmit case?  That is to say, if you
> > queue and flush the outq, and some packets fail delivery, and in the time
> > between the initial send and the expiration of the RTX timer (during which 
> > the
> > socket lock will have been released), an event may occur which changes the
> > transport state, which will then be ignored with your patch.
> Sorry, I'm not sure if I got it.
> 
> You mean "during which changes q->asoc->state", right ?
> 
> This patch removes the check of q->asoc->state in sctp_outq_tail().
> 
> sctp_outq_tail() is called for data only in:
> sctp_primitive_SEND -> sctp_do_sm -> sctp_cmd_send_msg ->
> sctp_cmd_interpreter -> sctp_cmd_send_msg() -> sctp_outq_tail()
> 
> before calling sctp_primitive_SEND, hold sock lock first.
> then sctp_primitive_SEND choose FUNC according:
> 
> #define TYPE_SCTP_PRIMITIVE_SEND  {
> 
> 
> if asoc->state is unavailable, FUNC can't be sctp_cmd_send_msg,
> but sctp_sf_error_closed/sctp_sf_error_shutdown,  sctp_outq_tail
> can't be called, either.
> I mean sctp_primitive_SEND do the same check for asoc->state
> already actually.
> 
> so the code in sctp_outq_tail is redundant actually.
> 
> 
> 
> 
> >
> > Neil
> >
> 

Ok, you've convinced me, thanks for taking the time to go through it

Acked-by: Neil Horman

Re: icmpv6: issue with routing table entries from link local addresses

2016-09-13 Thread Andreas Hübner

I think I found the relevant fixes:

First and foremost it's 741a11d9e410.
(net: ipv6: Add RT6_LOOKUP_F_IFACE flag if oif is set)

This seems to have already solved my problem, however there were two
followup fixes that I should probably also apply:

d46a9d678e4c net: ipv6: Dont add RT6_LOOKUP_F_IFACE flag if saddr set
6f21c96a78b8 ipv6: enforce flowi6_oif usage in ip6_dst_lookup_tail()


So again, sorry for the noise and thanks for your help!

Andreas

RE: [RFC V3 PATCH 18/26] net/netpolicy: set tx queues according to policy

2016-09-13 Thread Liang, Kan



> -Original Message-
> From: Tom Herbert [mailto:t...@herbertland.com]
> Sent: Monday, September 12, 2016 4:23 PM
> To: Liang, Kan 
> Cc: David S. Miller ; LKML  ker...@vger.kernel.org>; Linux Kernel Network Developers
> ; Kirsher, Jeffrey T ;
> Ingo Molnar ; pet...@infradead.org; Alexey Kuznetsov
> ; James Morris ; Hideaki
> YOSHIFUJI ; Patrick McHardy ;
> a...@linux-foundation.org; Kees Cook ;
> v...@zeniv.linux.org.uk; gorcu...@openvz.org; John Stultz
> ; Alexander Duyck ; Ben
> Hutchings ; David Decotigny ;
> Florian Westphal ; Alexander Duyck
> ; Daniel Borkmann ;
> rdun...@infradead.org; Cong Wang ; Hannes
> Frederic Sowa ; Stephen Hemminger
> ; Alexei Starovoitov
> ; Brandeburg, Jesse
> ; Andi Kleen 
> Subject: Re: [RFC V3 PATCH 18/26] net/netpolicy: set tx queues according to
> policy
> 
> On Mon, Sep 12, 2016 at 7:55 AM,   wrote:
> > From: Kan Liang 
> >
> > When the device tries to transmit a packet, netdev_pick_tx is called
> > to find the available tx queues. If the net policy is applied, it
> > picks up the assigned tx queue from net policy subsystem, and redirect
> > the traffic to the assigned queue.
> >
> > Signed-off-by: Kan Liang 
> > ---
> >  include/net/sock.h |  9 +
> >  net/core/dev.c | 20 ++--
> >  2 files changed, 27 insertions(+), 2 deletions(-)
> >
> > diff --git a/include/net/sock.h b/include/net/sock.h index
> > e1e9e3d..ca97f35 100644
> > --- a/include/net/sock.h
> > +++ b/include/net/sock.h
> > @@ -2280,4 +2280,13 @@ extern int sysctl_optmem_max;  extern __u32
> > sysctl_wmem_default;  extern __u32 sysctl_rmem_default;
> >
> > +/* Return netpolicy instance information from socket. */ static
> > +inline struct netpolicy_instance *netpolicy_find_instance(struct sock
> > +*sk) { #ifdef CONFIG_NETPOLICY
> > +   if (is_net_policy_valid(sk->sk_netpolicy.policy))
> > +   return &sk->sk_netpolicy; #endif
> > +   return NULL;
> > +}
> >  #endif /* _SOCK_H */
> > diff --git a/net/core/dev.c b/net/core/dev.c index 34b5322..b9a8044
> > 100644
> > --- a/net/core/dev.c
> > +++ b/net/core/dev.c
> > @@ -3266,6 +3266,7 @@ struct netdev_queue *netdev_pick_tx(struct
> net_device *dev,
> > struct sk_buff *skb,
> > void *accel_priv)  {
> > +   struct sock *sk = skb->sk;
> > int queue_index = 0;
> >
> >  #ifdef CONFIG_XPS
> > @@ -3280,8 +3281,23 @@ struct netdev_queue *netdev_pick_tx(struct
> net_device *dev,
> > if (ops->ndo_select_queue)
> > queue_index = ops->ndo_select_queue(dev, skb, 
> > accel_priv,
> > 
> > __netdev_pick_tx);
> > -   else
> > -   queue_index = __netdev_pick_tx(dev, skb);
> > +   else {
> > +#ifdef CONFIG_NETPOLICY
> > +   struct netpolicy_instance *instance;
> > +
> > +   queue_index = -1;
> > +   if (dev->netpolicy && sk) {
> > +   instance = netpolicy_find_instance(sk);
> > +   if (instance) {
> > +   if (!instance->dev)
> > +   instance->dev = dev;
> > +   queue_index = 
> > netpolicy_pick_queue(instance, false);
> > +   }
> > +   }
> > +   if (queue_index < 0) #endif
> 
> I doubt this produces the intended effect. Several drivers use
> ndo_select_queue (such as mlx4) where they might do something special
> for a few packets but end up calling the default handler, which is
> __netdev_pick_tx, for most packets. So in such cases the netpolicy path would
> be routinely bypassed. Maybe this code should be in __netdev_pick_tx.

I will move the code to __netdev_pick_tx in next version.

Thanks,
Kan

> 
> Tom
> 
> > +   queue_index = __netdev_pick_tx(dev, skb);
> > +   }
> >
> > if (!accel_priv)
> > queue_index = netdev_cap_txqueue(dev,
> > queue_index);
> > --
> > 2.5.5
> >

RE: [RFC V3 PATCH 03/26] net/netpolicy: get device queue irq information

2016-09-13 Thread Liang, Kan

> 
> Hello.
> 
> On 09/12/2016 05:55 PM, kan.li...@intel.com wrote:
> 
> > From: Kan Liang 
> >
> > Net policy needs to know device information. Currently, it's enough to
> > only get irq information of rx and tx queues.
> >
> > This patch introduces ndo ops to do so, not ethtool ops.
> > Because there are already several ways to get irq information in
> > userspace. It's not necessory to extend the ethtool.
> 
> Necessary.

OK. I will extend the ethtool in next version.

Thanks,
Kan

> 
> > Signed-off-by: Kan Liang 
> 
> [...]
> 
> MBR, Sergei

[RFC PATCH] xen-netback: fix error handling on netback_probe()

2016-09-13 Thread Filipe Manco

In case of error during netback_probe() (e.g. an entry missing on the
xenstore) netback_remove() is called on the new device, which will set
the device backend state to XenbusStateClosed by calling
set_backend_state(). However, the backend state wasn't initialized by
netback_probe() at this point, which will cause an invalid state
transition and set_backend_state() to BUG().

Initialize the backend state at the beginning of netback_probe() to
XenbusStateInitialising, and add a new valid state transition in
set_backend_state(), from XenbusStateInitialising to XenbusStateClosed.

Signed-off-by: Filipe Manco 
---
 drivers/net/xen-netback/xenbus.c | 10 ++
 1 file changed, 10 insertions(+)

diff --git a/drivers/net/xen-netback/xenbus.c b/drivers/net/xen-netback/xenbus.c
index 6a31f2610c23..c0e5f6994d01 100644
--- a/drivers/net/xen-netback/xenbus.c
+++ b/drivers/net/xen-netback/xenbus.c
@@ -270,6 +270,7 @@ static int netback_probe(struct xenbus_device *dev,
 
be->dev = dev;
dev_set_drvdata(&dev->dev, be);
+   be->state = XenbusStateInitialising;
 
sg = 1;
 
@@ -515,6 +516,15 @@ static void set_backend_state(struct backend_info *be,
 {
while (be->state != state) {
switch (be->state) {
+   case XenbusStateInitialising:
+   switch (state) {
+   case XenbusStateClosed:
+   backend_switch_state(be, XenbusStateClosed);
+   break;
+   default:
+   BUG();
+   }
+   break;
case XenbusStateClosed:
switch (state) {
case XenbusStateInitWait:
-- 
2.7.4

Re: [V2] ath10k: fix memory leak on caldata on error exit path

2016-09-13 Thread Kalle Valo

Colin Ian King  wrote:
> From: Colin Ian King 
> 
> caldata is not being free'd on the error exit path, causing
> a memory leak and data definitely should not be freed. Free
> caldata instead of data.
> 
> Thanks to Kalle Valo for spotting that data should not be
> free'd.
> 
> Signed-off-by: Colin Ian King 

Thanks, 1 patch applied to ath-next branch of ath.git:

5f4761dda2ba ath10k: fix memory leak on caldata on error exit path

-- 
Sent by pwcli
https://patchwork.kernel.org/patch/9312163/

Re: ath10k: remove unused variable ar_pci

2016-09-13 Thread Kalle Valo

Chaehyun Lim  wrote:
> Trivial fix to remove unused variable ar_pci in ath10k_pci_tx_pipe_cleanup
> when building with W=1:
> drivers/net/wireless/ath/ath10k/pci.c:1696:21: warning: variable
> 'ar_pci' set but not used [-Wunused-but-set-variable]
> 
> Signed-off-by: Chaehyun Lim 

Thanks, 1 patch applied to ath-next branch of ath.git:

214d55394481 ath10k: remove unused variable ar_pci

-- 
Sent by pwcli
https://patchwork.kernel.org/patch/9313963/

[ANNOUNCE] netdev 1.2 tokyo weekly update (13th September, 2016)

2016-09-13 Thread Hajime Tazaki


Hello folks,

I hope you're doing well and are ready for your trip to Tokyo.

Here is a weekly update on Netdev 1.2 Tokyo.

== Keynote talk ==

We confirmed that David Miller will give a keynote titled
"Fast Programmable Networks & Encapsulated Protocols".


== Newly accepted sessions ==

We also accepted one additional talk in the last week.

- Network interface configuration on a Linux NOS
  by Roopa Prabhu

Full list of accepted sessions is available here.

http://netdevconf.org/1.2/accepted-sessions.html


== Our sponsors ==

We have new sponsors: Cisco as a Gold sponsor and LWN.net as a
Media sponsor.  Many thanks for your support.

- Platinum
Verizon, Facebook, Cumulus Networks
- Gold
Mojatatu Networks, VMWare, Google, NTT, LinkedIn, Cisco (new)
- Silver
NetApp, IIJ, Netronome, SolarFlare, Mellanox, Sophos
- Bronze
Zen Load Balancer
- Media Sponsor
lwn.net (new)

Twitter: https://twitter.com/netdev01
Web: http://netdevconf.org/1.2/


== Others ==

Be prepared for your travel. Hotel and travel information
are available on the web pages.

http://netdevconf.org/1.2/travel.html
http://netdevconf.org/1.2/hotel.html

Early bird tickets are still available until the 15th.
Please register - early registration always
helps us organize a great conference.

http://netdevconf.org/1.2/registration.html


Looking forward to seeing you in Tokyo very soon.

-- Hajime

Re: [PATCH] ath10k: Spelling and miscellaneous neatening

2016-09-13 Thread Valo, Kalle

Joe Perches  writes:

> Correct some trivial comment typos.
> Remove unnecessary parentheses in a long line.
> Convert a return; before the end of a void function definition to just ;
>
> Signed-off-by: Joe Perches 

[...]

> --- a/drivers/net/wireless/ath/ath10k/core.c
> +++ b/drivers/net/wireless/ath/ath10k/core.c
> @@ -2118,7 +2118,7 @@ err:
>   /* TODO: It's probably a good idea to release device from the driver
>* but calling device_release_driver() here will cause a deadlock.
>*/
> - return;
> + ;
>  }

I don't think this improves anything, I dropped this part from the patch
in my pending branch.

-- 
Kalle Valo

Re: [RFC V3 PATCH 03/26] net/netpolicy: get device queue irq information

2016-09-13 Thread Alexander Duyck

On Tue, Sep 13, 2016 at 5:23 AM, Liang, Kan  wrote:
>>
>> Hello.
>>
>> On 09/12/2016 05:55 PM, kan.li...@intel.com wrote:
>>
>> > From: Kan Liang 
>> >
>> > Net policy needs to know device information. Currently, it's enough to
>> > only get irq information of rx and tx queues.
>> >
>> > This patch introduces ndo ops to do so, not ethtool ops.
>> > Because there are already several ways to get irq information in
>> > userspace. It's not necessory to extend the ethtool.
>>
>> Necessary.
>
> OK. I will extend the ethtool in next version.
>
> Thanks,
> Kan

Kan, I don't think Sergei was saying you have to extend the ethtool.
Your spelling of necessary was incorrect in your patch description.

Sergei, please feel free to tell me I am wrong if my assumption on
that is incorrect.

- Alex

Re: [PATCH 3/3] net-next: dsa: add new driver for qca8xxx family

2016-09-13 Thread Andrew Lunn

On Tue, Sep 13, 2016 at 11:40:43AM +0200, John Crispin wrote:
> 
> 
> On 13/09/2016 03:23, Andrew Lunn wrote:
> > So lets see if i have this right.
> > 
> > Port 0 has no internal phy.
> > Port 1 has an internal PHY at MDIO address 0.
> > Port 2 has an internal PHY at MDIO address 1.
> > ...
> > Port 5 has an internal PHY at MDIO address 4.
> > Port 6 has no internal PHY.
> 
> Hi Andrew
> 
> correct. port 0 is the cpu port. I initially thought that port6 can also
> be used as the cpu port but there are various places in the datasheet
> stating that the cpu port is 0.

O.K, please correct the comments in the code, and make this clear in
the device tree binding.

> in some of the reference designs, port6
> is wired to a 2nd gmac of the cpu and in those cases port 6 is then
> hardwired to port 5 of the switch and called wan. right now the driver
> does not support this feature.

Hum, why not?

brctl addbr br1
brctl addif br1 port5
brctl addif br1 port6

They are just ports on a switch, so bridge them together.

> ok, i will simply subtract 1 from the phy_addr inside the mdio
> callbacks. this would make the code more readable and make the DT
> binding compliant with the ePAPR spec.

It does, however, need to be well commented. It is setting a trap for anybody
who puts an external PHY on port 6. If they access that PHY via these
functions, the address is off by one.

This is the first silicon vendor who made their MDIO addresses for
PHYs illogical. So i'm thinking we maybe should add a new function to
dsa_switch_ops.

/* Return the MDIO address for the PHY for this port. */
int (*phy_port_map)(struct dsa_switch *ds, int port);

This should return the MDIO address for integrated PHYs only, or
-ENODEV if the port does not have an integrated PHY. For an external
PHY, a phy-handle should be used. This phy_port_map() is used in
dsa_slave_phy_setup(). But dsa_slave_phy_setup() is already too
complex, so it needs doing with care.

 Andrew

Re: [Intel-wired-lan] [net-next PATCH v3 1/3] e1000: track BQL bytes regardless of skb or not

2016-09-13 Thread Alexander Duyck

On Mon, Sep 12, 2016 at 9:25 PM, Tom Herbert  wrote:
> On Mon, Sep 12, 2016 at 8:00 PM, Alexander Duyck
>  wrote:
>> On Mon, Sep 12, 2016 at 3:13 PM, John Fastabend
>>  wrote:
>>> The BQL API does not reference the sk_buff nor does the driver need to
>>> reference the sk_buff to calculate the length of a transmitted frame.
>>> This patch removes an sk_buff reference from the xmit irq path and
>>> also allows packets sent from XDP to use BQL.
>>>
>>> Signed-off-by: John Fastabend 
>>> ---
>>>  drivers/net/ethernet/intel/e1000/e1000_main.c |7 ++-
>>>  1 file changed, 2 insertions(+), 5 deletions(-)
>>>
>>> diff --git a/drivers/net/ethernet/intel/e1000/e1000_main.c 
>>> b/drivers/net/ethernet/intel/e1000/e1000_main.c
>>> index f42129d..62a7f8d 100644
>>> --- a/drivers/net/ethernet/intel/e1000/e1000_main.c
>>> +++ b/drivers/net/ethernet/intel/e1000/e1000_main.c
>>> @@ -3882,11 +3882,8 @@ static bool e1000_clean_tx_irq(struct e1000_adapter 
>>> *adapter,
>>> if (cleaned) {
>>> total_tx_packets += buffer_info->segs;
>>> total_tx_bytes += buffer_info->bytecount;
>>> -   if (buffer_info->skb) {
>>> -   bytes_compl += 
>>> buffer_info->skb->len;
>>> -   pkts_compl++;
>>> -   }
>>> -
>>> +   bytes_compl += buffer_info->length;
>>> +   pkts_compl++;
>>> }
>>> e1000_unmap_and_free_tx_resource(adapter, 
>>> buffer_info);
>>> tx_desc->upper.data = 0;
>>
>> Actually it might be worth looking into why we have two different
>> stats for tracking bytecount and segs.  From what I can tell the
>> pkts_compl value is never actually used.  The function doesn't even
>> use the value so it is just wasted cycles.  And as far as the bytes go
>
> Transmit flow steering which I posted and is pending on some testing
> uses the pkt count BQL to track inflight packets.
>
>> the accounting would be more accurate if you were to use bytecount
>> instead of buffer_info->skb->len.  You would just need to update the
>> xmit function to use that on the other side so that they match.

Okay, that makes sense.

But as I was saying we might be better off using the segs and
bytecount values instead of the skb->len in the xmit and cleanup paths
to get more accurate accounting for the total bytes/packets coming and
going from the interface.  That way we can avoid any significant
change in behavior between TSO and GSO.

- Alex

Re: [PATCH 2/9] selftests: update filesystems Makefile to work under selftests

2016-09-13 Thread Shuah Khan

On 09/13/2016 05:56 AM, Michael Ellerman wrote:
> Shuah Khan  writes:
> 
>> Update to work under selftests. dnotify_test will not be run as part of
>> selftests suite and will not be included in install targets. It can be built
>> separately for now.
>>
>> Signed-off-by: Shuah Khan 
>> ---
>>  tools/testing/selftests/filesystems/Makefile | 10 ++
>>  1 file changed, 6 insertions(+), 4 deletions(-)
>>
>> diff --git a/tools/testing/selftests/filesystems/Makefile 
>> b/tools/testing/selftests/filesystems/Makefile
>> index 883010c..f1dce5c 100644
>> --- a/tools/testing/selftests/filesystems/Makefile
>> +++ b/tools/testing/selftests/filesystems/Makefile
>> @@ -1,5 +1,7 @@
>> -# List of programs to build
>> -hostprogs-y := dnotify_test
>> +TEST_PROGS := dnotify_test
>> +all: $(TEST_PROGS)
>>  
>> -# Tell kbuild to always build the programs
>> -always := $(hostprogs-y)
>> +include ../lib.mk
>> +
>> +clean:
>> +rm -fr dnotify_test
> 
> That's a complete rewrite of the Makefile, so I don't think there's any
> value in bringing its content across from Documentation.

Moving Makefile accomplishes delete at the same time. I can combine
the move and updating Makefile into one single patch.

> 
> Better IMHO would be to squash this with the previous patch, so we get a
> working test under selftests in a single commit.
> 
> cheers
> 

thanks,
-- Shuah

RE: [RFC V3 PATCH 03/26] net/netpolicy: get device queue irq information

2016-09-13 Thread Liang, Kan



> On Tue, Sep 13, 2016 at 5:23 AM, Liang, Kan  wrote:
> >>
> >> Hello.
> >>
> >> On 09/12/2016 05:55 PM, kan.li...@intel.com wrote:
> >>
> >> > From: Kan Liang 
> >> >
> >> > Net policy needs to know device information. Currently, it's enough
> >> > to only get irq information of rx and tx queues.
> >> >
> >> > This patch introduces ndo ops to do so, not ethtool ops.
> >> > Because there are already several ways to get irq information in
> >> > userspace. It's not necessory to extend the ethtool.
> >>
> >> Necessary.
> >
> > OK. I will extend the ethtool in next version.
> >
> > Thanks,
> > Kan
> 
> Kan, I don't think Sergei was saying you have to extend the ethtool.
> Your spelling of necessary was incorrect in your patch description.
> 
> Sergei, please feel free to tell me I am wrong if my assumption on that is
> incorrect.
> 
> - Alex

Oh, I see. Thanks Alex. :)

Kan

Re: [PATCH 0/9] Move runnable code (tests) from Documentation to selftests

2016-09-13 Thread Shuah Khan

On 09/13/2016 03:20 AM, Jani Nikula wrote:
> On Sat, 10 Sep 2016, Jonathan Corbet  wrote:
>> On Fri,  9 Sep 2016 16:22:41 -0600
>> Shuah Khan  wrote:
>>
>>> Move runnable code (tests) from Documentation to selftests and update
>>> Makefiles to work under selftests.
>>>
>>> Jon Corbet and I discussed this in an email thread and as per that
>>> discussion, this patch series moves all the tests that are under the
>>> Documentation directory to selftests. There is more runnable code in
>>> the form of examples and utils and that is going to be another patch
>>> series. I moved just the tests and left the documentation files as is.
>>
>> I'm fine with the idea, but it looks like a couple of tweaks are needed,
>> in particular to avoid leaving behind dangling references in
>> Documentation/Makefile that cause build errors.
>>
>> I think the individual patches probably need a wider CC list as well.
>> I'd use the get_maintainer script (or git) to see who has taken an
>> interest in the individual tests and make sure they are aware of the
>> move.
> 
> FWIW, I'm in favor of moving *all* the code away from Documentation, not
> just tests. Essentially removing the CONFIG_BUILD_DOCSRC config option,
> and reserving Documentation/Makefile for documentation build. After this
> series, some of the remaining code belongs under samples, some under
> tools.

I am planning another patch series to move all the examples and samples
and tools to their right location.

> 
> We could make it possible to include the code samples from samples into
> the Sphinx built documentation.
> 
> BR,
> Jani.
> 

I can't say I understand Sphinx, however, it might make sense to include
samples into Sphinx build. Is this approach different from the way they
are built under Documentation via Doc Makefiles now?

thanks,
-- Shuah

Re: [PATCH v5 0/6] Add eBPF hooks for cgroups

2016-09-13 Thread Daniel Mack

Hi,

On 09/13/2016 01:56 PM, Pablo Neira Ayuso wrote:
> On Mon, Sep 12, 2016 at 06:12:09PM +0200, Daniel Mack wrote:
>> This is v5 of the patch set to allow eBPF programs for network
>> filtering and accounting to be attached to cgroups, so that they apply
>> to all sockets of all tasks placed in that cgroup. The logic also
>> allows to be extended for other cgroup based eBPF logic.
> 
> 1) This infrastructure can only be useful to systemd, or any similar
>orchestration daemon. Look, you can only apply filtering policies
>to processes that are launched by systemd, so this only works
>for server processes.

Sorry, but both statements aren't true. The eBPF policies apply to every
process that is placed in a cgroup, and my example program in 6/6 shows
how that can be done from the command line. Also, systemd is able to
control userspace processes just fine, and it is not limited to 'server
processes'.

> For client processes this infrastructure is
>*racy*, you have to add new processes in runtime to the cgroup,
>thus there will be some little time where no filtering policy
>will be applied. For quality of service, this may be an acceptable
>race, but this is aiming to deploy a filtering policy.

That's a limitation that applies to many more control mechanisms in the
kernel, and it's something that can easily be solved with fork+exec.

> 2) This approach looks uninfrastructured to me. This provides a hook
>to push a bpf blob at a place in the stack that deploys a filtering
>policy that is not visible to others.

That's just as transparent as SO_ATTACH_FILTER. What kind of
introspection mechanism do you have in mind?

> We have interfaces that allows
>us to dump the filtering policy that is being applied, report events
>to enable cooperation between several processes with similar
>capabilities and so on.

Well, in practice, for netfilter, there can only be one instance in the
system that acts as central authoritative, otherwise you'll end up with
orphaned entries or with situation where some client deletes rules
behind the back of the one that originally installed it. So I really
think there is nothing wrong with demanding a single, privileged
controller to manage things.

>> After chatting with Daniel Borkmann and Alexei off-list, we concluded
>> that __dev_queue_xmit() is the place where the egress hooks should live
>> when eBPF programs need access to the L2 bits of the skb.
> 
> 3) This egress hook is coming very late, the only reason I find to
>place it at __dev_queue_xmit() is that bpf naturally works with
>layer 2 information in place. But this new hook is placed in
>_everyone's output path_ that only works for the very specific
>usecase I exposed above.

It's about filtering outgoing network packets of applications, and
providing them with L2 information for filtering purposes. I don't think
that's a very specific use-case.

When the feature is not used at all, the added costs on the output path
are close to zero, due to the use of static branches. If used somewhere
in the system but not for the packet in flight, costs are slightly
higher but acceptable. In fact, it's not even measurable in my tests
here. How is that different from the netfilter OUTPUT hook, btw?

That said, limiting it to L3 is still an option. It's just that we need
ingress and egress to be in sync, so both would be L3 then. So far, the
possible advantages for future use-cases having access to L2 outweighed
the concerns of putting the hook to dev_queue_xmit(), but I'm open to
discussing that.

> The main concern during the workshop was that a hook only for cgroups
> is too specific, but this is actually even more specific than this.

This patch set merely implements an infrastructure that can accommodate
many more things as well in the future. We could, in theory, even add
hooks for forwarded packets specifically, or other eBPF programs, not
even for network filtering etc.

> I have nothing against systemd or the needs for more
> programmability/flexibility in the stack, but I think this needs to
> fulfill some requirements to fit into the infrastructure that we have
> in the right way.

Well, as I explained already, this patch set results from endless
discussions that went nowhere, about how such a thing can be achieved
with netfilter.

Thanks,
Daniel

Re: [PATCH v3 1/9] ethernet: add sun8i-emac driver

2016-09-13 Thread LABBE Corentin

On Fri, Sep 09, 2016 at 04:15:27PM +0200, Andrew Lunn wrote:
> Hi Corentin
> 
> > +static int sun8i_emac_mdio_register(struct net_device *ndev)
> > +{
> > +   struct sun8i_emac_priv *priv = netdev_priv(ndev);
> > +   struct mii_bus *bus;
> > +   int ret;
> > +
> > +   bus = mdiobus_alloc();
> 
> You can use devm_mdiobus_alloc() which will simplify your error
> handling and unregister code.
> 
>Andrew

Hello

Since the mdio bus is allocated on ndev/open, it needs to be removed when 
ndev/stop is called.
So devm_mdiobus_alloc cannot be used.

Regards

Corentin Labbe

Re: [PATCH v3 3/9] ARM: sun8i: dt: Add DT bindings documentation for Allwinner sun8i-emac

2016-09-13 Thread LABBE Corentin

On Fri, Sep 09, 2016 at 04:17:10PM +0200, Andrew Lunn wrote:
> > +Optional properties:
> > +- allwinner,tx-delay: TX clock delay chain value. Range value is 0-0x07. 
> > Default is 0)
> > +- allwinner,rx-delay: RX clock delay chain value. Range value is 0-0x1F. 
> > Default is 0)
> 
> What are the units? pS? nS?
> 
>  Andrew

No units, only raw number.
I will add a comment for this.

Regards

Corentin Labbe

Re: [PATCH v3 4/9] ARM: dts: sun8i-h3: Add dt node for the syscon control module

2016-09-13 Thread LABBE Corentin

On Mon, Sep 12, 2016 at 09:28:12AM +0200, Maxime Ripard wrote:
> On Fri, Sep 09, 2016 at 02:45:12PM +0200, Corentin Labbe wrote:
> > This patch adds the dt node for the syscon register present on the
> > Allwinner H3.
> > 
> > Only two registers are present in this syscon and the only one useful is
> > the one dedicated to EMAC clock.
> > 
> > Signed-off-by: Corentin Labbe 
> > ---
> >  arch/arm/boot/dts/sun8i-h3.dtsi | 5 +
> >  1 file changed, 5 insertions(+)
> > 
> > diff --git a/arch/arm/boot/dts/sun8i-h3.dtsi 
> > b/arch/arm/boot/dts/sun8i-h3.dtsi
> > index fdf9fdb..a39da6f 100644
> > --- a/arch/arm/boot/dts/sun8i-h3.dtsi
> > +++ b/arch/arm/boot/dts/sun8i-h3.dtsi
> > @@ -140,6 +140,11 @@
> > #size-cells = <1>;
> > ranges;
> >  
> > +   syscon: syscon@01c0 {
> > +   compatible = "syscon";
> 
> Having our compatible would be nice here. syscon doesn't mean anything
> by itself.
> 

Since no driver handles it, I followed what I saw in other DTs.
At your choice, I can add a sun8i-syscon, but it will be unused.

> > +   reg = <0x01c0 0x34>;
> 
> And the size of our system controller is 0x1000
> 

I put the real size used, but I can put what datasheet said.

Regards

Corentin Labbe

Re: [PATCH v3 8/9] ARM: sunxi: Enable sun8i-emac driver on sunxi_defconfig

2016-09-13 Thread LABBE Corentin

On Mon, Sep 12, 2016 at 09:30:08AM +0200, Maxime Ripard wrote:
> Hi,
> 
> On Fri, Sep 09, 2016 at 02:45:16PM +0200, Corentin Labbe wrote:
> > Enable the sun8i-emac driver in the sunxi default configuration
> > 
> > Signed-off-by: Corentin Labbe 
> 
> Could you make the same patch for multi_v7 ?
> 

I will

Thanks

Re: [PATCH 0/9] Move runnable code (tests) from Documentation to selftests

2016-09-13 Thread Jani Nikula

On Tue, 13 Sep 2016, Shuah Khan  wrote:
> On 09/13/2016 03:20 AM, Jani Nikula wrote:
>> FWIW, I'm in favor of moving *all* the code away from Documentation, not
>> just tests. Essentially removing the CONFIG_BUILD_DOCSRC config option,
>> and reserving Documentation/Makefile for documentation build. After this
>> series, some of the remaining code belongs under samples, some under
>> tools.
>
> I am planning another patch series to move all the examples and samples
> and tools to their right location.

Great!

>> We could make it possible to include the code samples from samples into
>> the Sphinx built documentation.
>
> I can't say I understand Sphinx, however, it might make sense to include
> samples into Sphinx build. Is this approach different from the way they
> are built under Documentation via Doc Makefiles now?

It's just that by default Sphinx won't allow including files outside of
its root directory, which is Documentation in this case. It just needs
an extension for this.

BR,
Jani.


-- 
Jani Nikula, Intel Open Source Technology Center

Re: [PATCH RFC 03/11] net/mlx5e: Implement RX mapped page cache for page recycle

2016-09-13 Thread Tariq Toukan



On 07/09/2016 9:45 PM, Jesper Dangaard Brouer wrote:

On Wed,  7 Sep 2016 15:42:24 +0300 Saeed Mahameed  wrote:


From: Tariq Toukan 

Instead of reallocating and mapping pages for RX data-path,
recycle already used pages in a per ring cache.

We ran pktgen single-stream benchmarks, with iptables-raw-drop:

Single stride, 64 bytes:
* 4,739,057 - baseline
* 4,749,550 - order0 no cache
* 4,786,899 - order0 with cache
1% gain

Larger packets, no page cross, 1024 bytes:
* 3,982,361 - baseline
* 3,845,682 - order0 no cache
* 4,127,852 - order0 with cache
3.7% gain

Larger packets, every 3rd packet crosses a page, 1500 bytes:
* 3,731,189 - baseline
* 3,579,414 - order0 no cache
* 3,931,708 - order0 with cache
5.4% gain

Signed-off-by: Tariq Toukan 
Signed-off-by: Saeed Mahameed 
---
  drivers/net/ethernet/mellanox/mlx5/core/en.h   | 16 ++
  drivers/net/ethernet/mellanox/mlx5/core/en_main.c  | 15 ++
  drivers/net/ethernet/mellanox/mlx5/core/en_rx.c| 57 --
  drivers/net/ethernet/mellanox/mlx5/core/en_stats.h | 16 ++
  4 files changed, 99 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en.h 
b/drivers/net/ethernet/mellanox/mlx5/core/en.h
index 075cdfc..afbdf70 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en.h
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en.h
@@ -287,6 +287,18 @@ struct mlx5e_rx_am { /* Adaptive Moderation */
u8  tired;
  };
  
+/* a single cache unit is capable to serve one napi call (for non-striding rq)

+ * or a MPWQE (for striding rq).
+ */
+#define MLX5E_CACHE_UNIT   (MLX5_MPWRQ_PAGES_PER_WQE > NAPI_POLL_WEIGHT ? \
+MLX5_MPWRQ_PAGES_PER_WQE : NAPI_POLL_WEIGHT)
+#define MLX5E_CACHE_SIZE   (2 * roundup_pow_of_two(MLX5E_CACHE_UNIT))
+struct mlx5e_page_cache {
+   u32 head;
+   u32 tail;
+   struct mlx5e_dma_info page_cache[MLX5E_CACHE_SIZE];
+};
+

[...]
  
diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c

index c1cb510..8e02af3 100644
--- a/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx5/core/en_rx.c
@@ -305,11 +305,55 @@ static inline void mlx5e_post_umr_wqe(struct mlx5e_rq 
*rq, u16 ix)
mlx5e_tx_notify_hw(sq, &wqe->ctrl, 0);
  }
  
+static inline bool mlx5e_rx_cache_put(struct mlx5e_rq *rq,

+ struct mlx5e_dma_info *dma_info)
+{
+   struct mlx5e_page_cache *cache = &rq->page_cache;
+   u32 tail_next = (cache->tail + 1) & (MLX5E_CACHE_SIZE - 1);
+
+   if (tail_next == cache->head) {
+   rq->stats.cache_full++;
+   return false;
+   }
+
+   cache->page_cache[cache->tail] = *dma_info;
+   cache->tail = tail_next;
+   return true;
+}
+
+static inline bool mlx5e_rx_cache_get(struct mlx5e_rq *rq,
+ struct mlx5e_dma_info *dma_info)
+{
+   struct mlx5e_page_cache *cache = &rq->page_cache;
+
+   if (unlikely(cache->head == cache->tail)) {
+   rq->stats.cache_empty++;
+   return false;
+   }
+
+   if (page_ref_count(cache->page_cache[cache->head].page) != 1) {
+   rq->stats.cache_busy++;
+   return false;
+   }

Hmmm... doesn't this cause "blocking" of the page_cache recycle
facility until the page at the head of the queue gets (page) refcnt
decremented?  Real use-case could fairly easily block/cause this...

Hi Jesper,

That's right. We are aware of this issue.
We considered ways of solving this, but decided to keep current 
implementation for now.

One way of solving this is to look deeper in the cache.
Cons:
- this will consume time, and the chance of finding an available page is 
not that high: if the page in head of queue is busy then there's a good 
chance that all the others are too (because of FIFO).
in other words, you already checked all pages and anyway you're going to 
allocate a new one (higher penalty for same decision).
- this will make holes in the array causing complex accounting when 
looking for an available page (this can easily be fixed by swapping 
between the page in head and the available one).


Another way is sharing pages between different RQs.
- For now we're not doing this for simplicity and to keep 
synchronization away.


What do you think?

Anyway, we're looking forward to using your page-pool API which solves 
these issues.


Regards,
Tariq



+
+   *dma_info = cache->page_cache[cache->head];
+   cache->head = (cache->head + 1) & (MLX5E_CACHE_SIZE - 1);
+   rq->stats.cache_reuse++;
+
+   dma_sync_single_for_device(rq->pdev, dma_info->addr, PAGE_SIZE,
+  DMA_FROM_DEVICE);
+   return true;
+}
+
  static inline int mlx5e_page_alloc_mapped(struct mlx5e_rq *rq,
  struct mlx5e_dma_info *dma_info)
  {
-   str

[PATCH net-next 0/3] net: ethernet: mediatek: add HW LRO functions

2016-09-13 Thread Nelson Chang

The patches add the large receive offload (LRO) functions by hardware and
the ethtool functions to configure RX flows of HW LRO.

Nelson Chang (3):
  net: ethernet: mediatek: add HW LRO functions of PDMA RX rings
  net: ethernet: mediatek: add ethtool functions to configure RX flows
of HW LRO
  net: ethernet: mediatek: add dts configuration to enable HW LRO

 .../devicetree/bindings/net/mediatek-net.txt   |   2 +
 drivers/net/ethernet/mediatek/mtk_eth_soc.c| 433 +++--
 drivers/net/ethernet/mediatek/mtk_eth_soc.h|  75 +++-
 3 files changed, 485 insertions(+), 25 deletions(-)

-- 
1.9.1

[PATCH net-next 2/3] net: ethernet: mediatek: add ethtool functions to configure RX flows of HW LRO

2016-09-13 Thread Nelson Chang

The codes add ethtool functions to set RX flows for HW LRO. Because the
HW LRO hardware can only recognize the destination IP of TCP/IP RX flows,
the ethtool command to add HW LRO flow is as below:
ethtool -N [devname] flow-type tcp4 dst-ip [ip_addr] loc [0~1]

Also, because the hardware supports a total of four destination IPs, each
GMAC (GMAC1/GMAC2) can set at most two IPs.

Signed-off-by: Nelson Chang 
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 218 
 1 file changed, 218 insertions(+)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c 
b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index ed35e0f..8245841 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -1344,6 +1344,165 @@ static void mtk_hwlro_rx_uninit(struct mtk_eth *eth)
mtk_w32(eth, 0, MTK_PDMA_LRO_CTRL_DW0);
 }
 
+static void mtk_hwlro_val_ipaddr(struct mtk_eth *eth, int idx, __be32 ip)
+{
+   u32 reg_val;
+
+   reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx));
+
+   /* invalidate the IP setting */
+   mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx));
+
+   mtk_w32(eth, ip, MTK_LRO_DIP_DW0_CFG(idx));
+
+   /* validate the IP setting */
+   mtk_w32(eth, (reg_val | MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx));
+}
+
+static void mtk_hwlro_inval_ipaddr(struct mtk_eth *eth, int idx)
+{
+   u32 reg_val;
+
+   reg_val = mtk_r32(eth, MTK_LRO_CTRL_DW2_CFG(idx));
+
+   /* invalidate the IP setting */
+   mtk_w32(eth, (reg_val & ~MTK_RING_MYIP_VLD), MTK_LRO_CTRL_DW2_CFG(idx));
+
+   mtk_w32(eth, 0, MTK_LRO_DIP_DW0_CFG(idx));
+}
+
+static int mtk_hwlro_get_ip_cnt(struct mtk_mac *mac)
+{
+   int cnt = 0;
+   int i;
+
+   for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) {
+   if (mac->hwlro_ip[i])
+   cnt++;
+   }
+
+   return cnt;
+}
+
+static int mtk_hwlro_add_ipaddr(struct net_device *dev,
+   struct ethtool_rxnfc *cmd)
+{
+   struct ethtool_rx_flow_spec *fsp =
+   (struct ethtool_rx_flow_spec *)&cmd->fs;
+   struct mtk_mac *mac = netdev_priv(dev);
+   struct mtk_eth *eth = mac->hw;
+   int hwlro_idx;
+
+   if ((fsp->flow_type != TCP_V4_FLOW) ||
+   (!fsp->h_u.tcp_ip4_spec.ip4dst) ||
+   (fsp->location > 1))
+   return -EINVAL;
+
+   mac->hwlro_ip[fsp->location] = htonl(fsp->h_u.tcp_ip4_spec.ip4dst);
+   hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location;
+
+   mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac);
+
+   mtk_hwlro_val_ipaddr(eth, hwlro_idx, mac->hwlro_ip[fsp->location]);
+
+   return 0;
+}
+
+static int mtk_hwlro_del_ipaddr(struct net_device *dev,
+   struct ethtool_rxnfc *cmd)
+{
+   struct ethtool_rx_flow_spec *fsp =
+   (struct ethtool_rx_flow_spec *)&cmd->fs;
+   struct mtk_mac *mac = netdev_priv(dev);
+   struct mtk_eth *eth = mac->hw;
+   int hwlro_idx;
+
+   if (fsp->location > 1)
+   return -EINVAL;
+
+   mac->hwlro_ip[fsp->location] = 0;
+   hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + fsp->location;
+
+   mac->hwlro_ip_cnt = mtk_hwlro_get_ip_cnt(mac);
+
+   mtk_hwlro_inval_ipaddr(eth, hwlro_idx);
+
+   return 0;
+}
+
+static void mtk_hwlro_netdev_disable(struct net_device *dev)
+{
+   struct mtk_mac *mac = netdev_priv(dev);
+   struct mtk_eth *eth = mac->hw;
+   int i, hwlro_idx;
+
+   for (i = 0; i < MTK_MAX_LRO_IP_CNT; i++) {
+   mac->hwlro_ip[i] = 0;
+   hwlro_idx = (mac->id * MTK_MAX_LRO_IP_CNT) + i;
+
+   mtk_hwlro_inval_ipaddr(eth, hwlro_idx);
+   }
+
+   mac->hwlro_ip_cnt = 0;
+}
+
+static int mtk_hwlro_get_fdir_entry(struct net_device *dev,
+   struct ethtool_rxnfc *cmd)
+{
+   struct mtk_mac *mac = netdev_priv(dev);
+   struct ethtool_rx_flow_spec *fsp =
+   (struct ethtool_rx_flow_spec *)&cmd->fs;
+
+   /* only tcp dst ipv4 is meaningful, others are meaningless */
+   fsp->flow_type = TCP_V4_FLOW;
+   fsp->h_u.tcp_ip4_spec.ip4dst = ntohl(mac->hwlro_ip[fsp->location]);
+   fsp->m_u.tcp_ip4_spec.ip4dst = 0;
+
+   fsp->h_u.tcp_ip4_spec.ip4src = 0;
+   fsp->m_u.tcp_ip4_spec.ip4src = 0xffffffff;
+   fsp->h_u.tcp_ip4_spec.psrc = 0;
+   fsp->m_u.tcp_ip4_spec.psrc = 0xffff;
+   fsp->h_u.tcp_ip4_spec.pdst = 0;
+   fsp->m_u.tcp_ip4_spec.pdst = 0xffff;
+   fsp->h_u.tcp_ip4_spec.tos = 0;
+   fsp->m_u.tcp_ip4_spec.tos = 0xff;
+
+   return 0;
+}
+
+static int mtk_hwlro_get_fdir_all(struct net_device *dev,
+ struct ethtool_rxnfc *cmd,
+ u32 *rule_locs)
+{
+   struct mtk_mac *mac = netdev_priv(dev);
+   int cnt = 0;
+   int i;
+
+   for (i = 0; i < MTK_MAX_LRO_IP_CNT; i

[PATCH net-next 3/3] net: ethernet: mediatek: add dts configuration to enable HW LRO

2016-09-13 Thread Nelson Chang

Add the configuration of HW LRO in the binding document.

Signed-off-by: Nelson Chang 
---
 Documentation/devicetree/bindings/net/mediatek-net.txt | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/Documentation/devicetree/bindings/net/mediatek-net.txt 
b/Documentation/devicetree/bindings/net/mediatek-net.txt
index 32eaaca..f43c0d1 100644
--- a/Documentation/devicetree/bindings/net/mediatek-net.txt
+++ b/Documentation/devicetree/bindings/net/mediatek-net.txt
@@ -20,6 +20,7 @@ Required properties:
 - mediatek,ethsys: phandle to the syscon node that handles the port setup
 - mediatek,pctl: phandle to the syscon node that handles the ports slew rate
and driver current
+- mediatek,hwlro: set to enable HW LRO functions of PDMA rx rings
 
 Optional properties:
 - interrupt-parent: Should be the phandle for the interrupt controller
@@ -51,6 +52,7 @@ eth: ethernet@1b10 {
reset-names = "eth";
mediatek,ethsys = <&ethsys>;
mediatek,pctl = <&syscfg_pctl_a>;
+   mediatek,hwlro;
#address-cells = <1>;
#size-cells = <0>;
 
-- 
1.9.1

[PATCH net-next 1/3] net: ethernet: mediatek: add HW LRO functions of PDMA RX rings

2016-09-13 Thread Nelson Chang

The codes add the large receive offload (LRO) functions by hardware as below:
1) PDMA has total four RX rings that one is the normal ring, and others can
   be configured as LRO rings.
2) Only TCP/IP RX flows can be offloaded. The hardware can set four IP
   addresses at most, if the destination IP of the RX flow matches one of
   them, it has the chance to be offloaded.
3) Three RX flows can be offloaded at most, and one flow is mapped to
   one RX ring.
4) If there are more than three candidate RX flows, the hardware can
   choose three of them by throughput comparison results.

Signed-off-by: Nelson Chang 
---
 drivers/net/ethernet/mediatek/mtk_eth_soc.c | 215 +---
 drivers/net/ethernet/mediatek/mtk_eth_soc.h |  75 +-
 2 files changed, 265 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/mediatek/mtk_eth_soc.c 
b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
index 6e01f1f..ed35e0f 100644
--- a/drivers/net/ethernet/mediatek/mtk_eth_soc.c
+++ b/drivers/net/ethernet/mediatek/mtk_eth_soc.c
@@ -810,11 +810,51 @@ drop:
return NETDEV_TX_OK;
 }
 
+static struct mtk_rx_ring *mtk_get_rx_ring(struct mtk_eth *eth)
+{
+   int i;
+   struct mtk_rx_ring *ring;
+   int idx;
+
+   if (!eth->hwlro)
+   return &eth->rx_ring[0];
+
+   for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) {
+   ring = &eth->rx_ring[i];
+   idx = NEXT_RX_DESP_IDX(ring->calc_idx, ring->dma_size);
+   if (ring->dma[idx].rxd2 & RX_DMA_DONE) {
+   ring->calc_idx_update = true;
+   return ring;
+   }
+   }
+
+   return NULL;
+}
+
+static void mtk_update_rx_cpu_idx(struct mtk_eth *eth)
+{
+   struct mtk_rx_ring *ring;
+   int i;
+
+   if (!eth->hwlro) {
+   ring = &eth->rx_ring[0];
+   mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg);
+   } else {
+   for (i = 0; i < MTK_MAX_RX_RING_NUM; i++) {
+   ring = &eth->rx_ring[i];
+   if (ring->calc_idx_update) {
+   ring->calc_idx_update = false;
+   mtk_w32(eth, ring->calc_idx, ring->crx_idx_reg);
+   }
+   }
+   }
+}
+
 static int mtk_poll_rx(struct napi_struct *napi, int budget,
   struct mtk_eth *eth)
 {
-   struct mtk_rx_ring *ring = &eth->rx_ring;
-   int idx = ring->calc_idx;
+   struct mtk_rx_ring *ring;
+   int idx;
struct sk_buff *skb;
u8 *data, *new_data;
struct mtk_rx_dma *rxd, trxd;
@@ -826,7 +866,11 @@ static int mtk_poll_rx(struct napi_struct *napi, int 
budget,
dma_addr_t dma_addr;
int mac = 0;
 
-   idx = NEXT_RX_DESP_IDX(idx);
+   ring = mtk_get_rx_ring(eth);
+   if (unlikely(!ring))
+   goto rx_done;
+
+   idx = NEXT_RX_DESP_IDX(ring->calc_idx, ring->dma_size);
rxd = &ring->dma[idx];
data = ring->data[idx];
 
@@ -894,12 +938,13 @@ release_desc:
done++;
}
 
+rx_done:
if (done) {
/* make sure that all changes to the dma ring are flushed before
 * we continue
 */
wmb();
-   mtk_w32(eth, ring->calc_idx, MTK_PRX_CRX_IDX0);
+   mtk_update_rx_cpu_idx(eth);
}
 
return done;
@@ -1122,32 +1167,41 @@ static void mtk_tx_clean(struct mtk_eth *eth)
}
 }
 
-static int mtk_rx_alloc(struct mtk_eth *eth)
+static int mtk_rx_alloc(struct mtk_eth *eth, int ring_no, int rx_flag)
 {
-   struct mtk_rx_ring *ring = &eth->rx_ring;
+   struct mtk_rx_ring *ring = &eth->rx_ring[ring_no];
+   int rx_data_len, rx_dma_size;
int i;
 
-   ring->frag_size = mtk_max_frag_size(ETH_DATA_LEN);
+   if (rx_flag == MTK_RX_FLAGS_HWLRO) {
+   rx_data_len = MTK_MAX_LRO_RX_LENGTH;
+   rx_dma_size = MTK_HW_LRO_DMA_SIZE;
+   } else {
+   rx_data_len = ETH_DATA_LEN;
+   rx_dma_size = MTK_DMA_SIZE;
+   }
+
+   ring->frag_size = mtk_max_frag_size(rx_data_len);
ring->buf_size = mtk_max_buf_size(ring->frag_size);
-   ring->data = kcalloc(MTK_DMA_SIZE, sizeof(*ring->data),
+   ring->data = kcalloc(rx_dma_size, sizeof(*ring->data),
 GFP_KERNEL);
if (!ring->data)
return -ENOMEM;
 
-   for (i = 0; i < MTK_DMA_SIZE; i++) {
+   for (i = 0; i < rx_dma_size; i++) {
ring->data[i] = netdev_alloc_frag(ring->frag_size);
if (!ring->data[i])
return -ENOMEM;
}
 
ring->dma = dma_alloc_coherent(eth->dev,
-  MTK_DMA_SIZE * sizeof(*ring->dma),
+  rx_dma_size * sizeof(*ring->dma),

[PATCH net-next V2 0/3] net/sched: cls_flower: Add ports masks

2016-09-13 Thread Or Gerlitz

This series adds the ability to specify tcp/udp ports masks 
for TC/flower filter matches.

I also removed an unused field from the flower keys struct 
and clarified the format of the recently added vlan attributes.

Or.

Or Gerlitz (3):
  net/sched: cls_flower: Support masking for matching on tcp/udp ports
  net/sched: cls_flower: Remove an unsed field from the filter key structure
  net/sched: cls_flower: Specify vlan attributes format in the UAPI header

 include/uapi/linux/pkt_cls.h | 10 +++---
 net/sched/cls_flower.c   | 21 -
 2 files changed, 19 insertions(+), 12 deletions(-)

-- 
2.3.7

[PATCH net-next 3/3] net/sched: cls_flower: Specify vlan attributes format in the UAPI header

2016-09-13 Thread Or Gerlitz

Specify the format (size and endianness) for the vlan attributes.

Signed-off-by: Or Gerlitz 
---
 include/uapi/linux/pkt_cls.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index 60ea2a0..8915b61 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -428,9 +428,9 @@ enum {
TCA_FLOWER_KEY_UDP_DST, /* be16 */
 
TCA_FLOWER_FLAGS,
-   TCA_FLOWER_KEY_VLAN_ID,
-   TCA_FLOWER_KEY_VLAN_PRIO,
-   TCA_FLOWER_KEY_VLAN_ETH_TYPE,
+   TCA_FLOWER_KEY_VLAN_ID, /* be16 */
+   TCA_FLOWER_KEY_VLAN_PRIO,   /* u8   */
+   TCA_FLOWER_KEY_VLAN_ETH_TYPE,   /* be16 */
 
TCA_FLOWER_KEY_ENC_KEY_ID,  /* be32 */
TCA_FLOWER_KEY_ENC_IPV4_SRC,/* be32 */
-- 
2.3.7

[PATCH net-next 2/3] net/sched: cls_flower: Remove an unsed field from the filter key structure

2016-09-13 Thread Or Gerlitz

Commit c3f8324188fa "net: Add full IPv6 addresses to flow_keys" added an
unused instance of struct flow_dissector_key_addrs into struct fl_flow_key,
remove it.

Signed-off-by: Or Gerlitz 
Reported-by: Hadar Hen Zion 
---
 net/sched/cls_flower.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index 027523c..a3f4c70 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -33,7 +33,6 @@ struct fl_flow_key {
struct flow_dissector_key_basic basic;
struct flow_dissector_key_eth_addrs eth;
struct flow_dissector_key_vlan vlan;
-   struct flow_dissector_key_addrs ipaddrs;
union {
struct flow_dissector_key_ipv4_addrs ipv4;
struct flow_dissector_key_ipv6_addrs ipv6;
-- 
2.3.7

[PATCH net-next 1/3] net/sched: cls_flower: Support masking for matching on tcp/udp ports

2016-09-13 Thread Or Gerlitz

Add the definitions for src/dst udp/tcp port masks and use
them when setting && dumping the relevant keys.

Signed-off-by: Or Gerlitz 
Signed-off-by: Paul Blakey 
---
 include/uapi/linux/pkt_cls.h |  4 
 net/sched/cls_flower.c   | 20 
 2 files changed, 16 insertions(+), 8 deletions(-)

diff --git a/include/uapi/linux/pkt_cls.h b/include/uapi/linux/pkt_cls.h
index f9c287c..60ea2a0 100644
--- a/include/uapi/linux/pkt_cls.h
+++ b/include/uapi/linux/pkt_cls.h
@@ -442,6 +442,10 @@ enum {
TCA_FLOWER_KEY_ENC_IPV6_DST,/* struct in6_addr */
TCA_FLOWER_KEY_ENC_IPV6_DST_MASK,/* struct in6_addr */
 
+   TCA_FLOWER_KEY_TCP_SRC_MASK,/* be16 */
+   TCA_FLOWER_KEY_TCP_DST_MASK,/* be16 */
+   TCA_FLOWER_KEY_UDP_SRC_MASK,/* be16 */
+   TCA_FLOWER_KEY_UDP_DST_MASK,/* be16 */
__TCA_FLOWER_MAX,
 };
 
diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
index b084b2a..027523c 100644
--- a/net/sched/cls_flower.c
+++ b/net/sched/cls_flower.c
@@ -335,6 +335,10 @@ static const struct nla_policy fl_policy[TCA_FLOWER_MAX + 
1] = {
[TCA_FLOWER_KEY_ENC_IPV6_SRC_MASK] = { .len = sizeof(struct in6_addr) },
[TCA_FLOWER_KEY_ENC_IPV6_DST]   = { .len = sizeof(struct in6_addr) },
[TCA_FLOWER_KEY_ENC_IPV6_DST_MASK] = { .len = sizeof(struct in6_addr) },
+   [TCA_FLOWER_KEY_TCP_SRC_MASK]   = { .type = NLA_U16 },
+   [TCA_FLOWER_KEY_TCP_DST_MASK]   = { .type = NLA_U16 },
+   [TCA_FLOWER_KEY_UDP_SRC_MASK]   = { .type = NLA_U16 },
+   [TCA_FLOWER_KEY_UDP_DST_MASK]   = { .type = NLA_U16 },
 };
 
 static void fl_set_key_val(struct nlattr **tb,
@@ -432,17 +436,17 @@ static int fl_set_key(struct net *net, struct nlattr **tb,
 
if (key->basic.ip_proto == IPPROTO_TCP) {
fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
-  &mask->tp.src, TCA_FLOWER_UNSPEC,
+  &mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
   sizeof(key->tp.src));
fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
-  &mask->tp.dst, TCA_FLOWER_UNSPEC,
+  &mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
   sizeof(key->tp.dst));
} else if (key->basic.ip_proto == IPPROTO_UDP) {
fl_set_key_val(tb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
-  &mask->tp.src, TCA_FLOWER_UNSPEC,
+  &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
   sizeof(key->tp.src));
fl_set_key_val(tb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
-  &mask->tp.dst, TCA_FLOWER_UNSPEC,
+  &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
   sizeof(key->tp.dst));
}
 
@@ -877,18 +881,18 @@ static int fl_dump(struct net *net, struct tcf_proto *tp, 
unsigned long fh,
 
if (key->basic.ip_proto == IPPROTO_TCP &&
(fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_TCP_SRC,
-&mask->tp.src, TCA_FLOWER_UNSPEC,
+&mask->tp.src, TCA_FLOWER_KEY_TCP_SRC_MASK,
 sizeof(key->tp.src)) ||
 fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_TCP_DST,
-&mask->tp.dst, TCA_FLOWER_UNSPEC,
+&mask->tp.dst, TCA_FLOWER_KEY_TCP_DST_MASK,
 sizeof(key->tp.dst
goto nla_put_failure;
else if (key->basic.ip_proto == IPPROTO_UDP &&
 (fl_dump_key_val(skb, &key->tp.src, TCA_FLOWER_KEY_UDP_SRC,
- &mask->tp.src, TCA_FLOWER_UNSPEC,
+ &mask->tp.src, TCA_FLOWER_KEY_UDP_SRC_MASK,
  sizeof(key->tp.src)) ||
  fl_dump_key_val(skb, &key->tp.dst, TCA_FLOWER_KEY_UDP_DST,
- &mask->tp.dst, TCA_FLOWER_UNSPEC,
+ &mask->tp.dst, TCA_FLOWER_KEY_UDP_DST_MASK,
  sizeof(key->tp.dst
goto nla_put_failure;
 
-- 
2.3.7

[PATCH net-next] openvswitch: avoid deferred execution of recirc actions

2016-09-13 Thread Lance Richardson

The ovs kernel data path currently defers the execution of all
recirc actions until stack utilization is at a minimum.
This is too limiting for some packet forwarding scenarios due to
the small size of the deferred action FIFO (10 entries). For
example, broadcast traffic sent out more than 10 ports with
recirculation results in packet drops when the deferred action
FIFO becomes full, as reported here:

 http://openvswitch.org/pipermail/dev/2016-March/067672.html

Since the current recursion depth is available (it is already tracked
by the exec_actions_level pcpu variable), we can use it to determine
whether to execute recirculation actions immediately (safe when
recursion depth is low) or defer execution until more stack space is
available.

With this change, the deferred action fifo size becomes a non-issue
for currently failing scenarios because it is no longer used when
there are three or fewer recursions through ovs_execute_actions().

Suggested-by: Pravin Shelar 
Signed-off-by: Lance Richardson 
---
 net/openvswitch/actions.c | 30 --
 1 file changed, 28 insertions(+), 2 deletions(-)

diff --git a/net/openvswitch/actions.c b/net/openvswitch/actions.c
index 6eb5261..ef7cc6c 100644
--- a/net/openvswitch/actions.c
+++ b/net/openvswitch/actions.c
@@ -71,6 +71,8 @@ struct ovs_frag_data {
 static DEFINE_PER_CPU(struct ovs_frag_data, ovs_frag_data_storage);
 
 #define DEFERRED_ACTION_FIFO_SIZE 10
+#define OVS_RECURSION_LIMIT 5
+#define OVS_DEFERRED_ACTION_THRESHOLD (OVS_RECURSION_LIMIT - 2)
 struct action_fifo {
int head;
int tail;
@@ -78,7 +80,12 @@ struct action_fifo {
struct deferred_action fifo[DEFERRED_ACTION_FIFO_SIZE];
 };
 
+struct recirc_keys {
+   struct sw_flow_key key[OVS_DEFERRED_ACTION_THRESHOLD];
+};
+
 static struct action_fifo __percpu *action_fifos;
+static struct recirc_keys __percpu *recirc_keys;
 static DEFINE_PER_CPU(int, exec_actions_level);
 
 static void action_fifo_init(struct action_fifo *fifo)
@@ -1020,6 +1027,7 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
  const struct nlattr *a, int rem)
 {
struct deferred_action *da;
+   int level;
 
if (!is_flow_key_valid(key)) {
int err;
@@ -1043,6 +1051,18 @@ static int execute_recirc(struct datapath *dp, struct 
sk_buff *skb,
return 0;
}
 
+   level = this_cpu_read(exec_actions_level);
+   if (level <= OVS_DEFERRED_ACTION_THRESHOLD) {
+   struct recirc_keys *rks = this_cpu_ptr(recirc_keys);
+   struct sw_flow_key *recirc_key = &rks->key[level - 1];
+
+   *recirc_key = *key;
+   recirc_key->recirc_id = nla_get_u32(a);
+   ovs_dp_process_packet(skb, recirc_key);
+
+   return 0;
+   }
+
da = add_deferred_actions(skb, key, NULL);
if (da) {
da->pkt_key.recirc_id = nla_get_u32(a);
@@ -1209,11 +1229,10 @@ int ovs_execute_actions(struct datapath *dp, struct 
sk_buff *skb,
const struct sw_flow_actions *acts,
struct sw_flow_key *key)
 {
-   static const int ovs_recursion_limit = 5;
int err, level;
 
level = __this_cpu_inc_return(exec_actions_level);
-   if (unlikely(level > ovs_recursion_limit)) {
+   if (unlikely(level > OVS_RECURSION_LIMIT)) {
net_crit_ratelimited("ovs: recursion limit reached on datapath 
%s, probable configuration error\n",
 ovs_dp_name(dp));
kfree_skb(skb);
@@ -1238,10 +1257,17 @@ int action_fifos_init(void)
if (!action_fifos)
return -ENOMEM;
 
+   recirc_keys = alloc_percpu(struct recirc_keys);
+   if (!recirc_keys) {
+   free_percpu(action_fifos);
+   return -ENOMEM;
+   }
+
return 0;
 }
 
 void action_fifos_exit(void)
 {
free_percpu(action_fifos);
+   free_percpu(recirc_keys);
 }
-- 
2.5.5

Re: [PATCH net-next 1/3] net/sched: cls_flower: Support masking for matching on tcp/udp ports

2016-09-13 Thread Jiri Pirko

Tue, Sep 13, 2016 at 04:02:39PM CEST, ogerl...@mellanox.com wrote:
>Add the definitions for src/dst udp/tcp port masks and use
>them when setting && dumping the relevant keys.
>
>Signed-off-by: Or Gerlitz 
>Signed-off-by: Paul Blakey 

Acked-by: Jiri Pirko

Re: [PATCH v3 3/9] ARM: sun8i: dt: Add DT bindings documentation for Allwinner sun8i-emac

2016-09-13 Thread Andrew Lunn

On Tue, Sep 13, 2016 at 03:34:17PM +0200, LABBE Corentin wrote:
> On Fri, Sep 09, 2016 at 04:17:10PM +0200, Andrew Lunn wrote:
> > > +Optional properties:
> > > +- allwinner,tx-delay: TX clock delay chain value. Range value is 0-0x07. 
> > > Default is 0)
> > > +- allwinner,rx-delay: RX clock delay chain value. Range value is 0-0x1F. 
> > > Default is 0)
> > 
> > What are the units? pS? nS?
> > 
> >  Andrew
> 
> No units, only raw number.
> I will add a comment for this.

And it is likely it will get NACKed by the device tree
maintainers. You should use real unit here.

   Andrew

Re: [PATCH v5 0/6] Add eBPF hooks for cgroups

2016-09-13 Thread Daniel Borkmann


On 09/13/2016 03:31 PM, Daniel Mack wrote:

On 09/13/2016 01:56 PM, Pablo Neira Ayuso wrote:

On Mon, Sep 12, 2016 at 06:12:09PM +0200, Daniel Mack wrote:

This is v5 of the patch set to allow eBPF programs for network
filtering and accounting to be attached to cgroups, so that they apply
to all sockets of all tasks placed in that cgroup. The logic can also
be extended for other cgroup-based eBPF logic.


1) This infrastructure can only be useful to systemd, or any similar
orchestration daemon. Look, you can only apply filtering policies
to processes that are launched by systemd, so this only works
for server processes.


Sorry, but both statements aren't true. The eBPF policies apply to every
process that is placed in a cgroup, and my example program in 6/6 shows
how that can be done from the command line. Also, systemd is able to
control userspace processes just fine, and it is not limited to 'server
processes'.


For client processes this infrastructure is
*racy*, you have to add new processes in runtime to the cgroup,
thus there will be some little time where no filtering policy
will be applied. For quality of service, this may be an acceptable
race, but this is aiming to deploy a filtering policy.


That's a limitation that applies to many more control mechanisms in the
kernel, and it's something that can easily be solved with fork+exec.


2) This approach looks uninfrastructured to me. This provides a hook
to push a bpf blob at a place in the stack that deploys a filtering
policy that is not visible to others.


That's just as transparent as SO_ATTACH_FILTER. What kind of
introspection mechanism do you have in mind?


We have interfaces that allows
us to dump the filtering policy that is being applied, report events
to enable cooperation between several processes with similar
capabilities and so on.


Well, in practice, for netfilter, there can only be one instance in the
system that acts as central authoritative, otherwise you'll end up with
orphaned entries or with situation where some client deletes rules
behind the back of the one that originally installed it. So I really
think there is nothing wrong with demanding a single, privileged
controller to manage things.


After chatting with Daniel Borkmann and Alexei off-list, we concluded
that __dev_queue_xmit() is the place where the egress hooks should live
when eBPF programs need access to the L2 bits of the skb.


3) This egress hook is coming very late, the only reason I find to
place it at __dev_queue_xmit() is that bpf naturally works with
layer 2 information in place. But this new hook is placed in
_everyone's output path_ that only works for the very specific
usecase I exposed above.


It's about filtering outgoing network packets of applications, and
providing them with L2 information for filtering purposes. I don't think
that's a very specific use-case.

When the feature is not used at all, the added costs on the output path
are close to zero, due to the use of static branches. If used somewhere
in the system but not for the packet in flight, costs are slightly
higher but acceptable. In fact, it's not even measurable in my tests
here. How is that different from the netfilter OUTPUT hook, btw?

That said, limiting it to L3 is still an option. It's just that we need
ingress and egress to be in sync, so both would be L3 then. So far, the
possible advantages for future use-cases having access to L2 outweighed
the concerns of putting the hook to dev_queue_xmit(), but I'm open to
discussing that.


While I fully disagree with Pablo's point 1) and 2), in the last set I
raised a similar concern as in point 3) wrt __dev_queue_xmit(). The set
as-is would indeed need the L2 info, since a filter could do a load via
LLVM built-ins such as asm("llvm.bpf.load.byte") et al, with BPF_LL_OFF,
where we're forced to do a load relative to skb_mac_header(). As stated
by Daniel already, it would be nice to see the full frame, so it comes
down to a trade-off, but the option of L3 onwards also exists and BPF can
work just fine with it, too. This just means it's placed in the local
output path and the verifier would need to disallow these built-ins during
bpf(2) load time. They are a rather cumbersome legacy anyway, so
bpf_skb_load_bytes() helper can be used instead, which is also easier
to use.


The main concern during the workshop was that a hook only for cgroups
is too specific, but this is actually even more specific than this.


This patch set merely implements an infrastructure that can accommodate
many more things as well in the future. We could, in theory, even add
hooks for forwarded packets specifically, or other eBPF programs, not
even for network filtering etc.


I have nothing against systemd or the needs for more
programmability/flexibility in the stack, but I think this needs to
fulfill some requirements to fit into the infrastructure that we have
in the right way.


Well, as I

Re: [PATCH net-next 3/3] net/sched: cls_flower: Specify vlan attributes format in the UAPI header

2016-09-13 Thread Jiri Pirko

Tue, Sep 13, 2016 at 04:02:41PM CEST, ogerl...@mellanox.com wrote:
>Specify the format (size and endianess) for the vlan attributes.
>
>Signed-off-by: Or Gerlitz 

Acked-by: Jiri Pirko

Re: [PATCH net-next 2/3] net/sched: cls_flower: Remove an unsed field from the filter key structure

2016-09-13 Thread Jiri Pirko

Tue, Sep 13, 2016 at 04:02:40PM CEST, ogerl...@mellanox.com wrote:
>Commit c3f8324188fa "net: Add full IPv6 addresses to flow_keys" added an
>unsed instance of struct flow_dissector_key_addrs into struct fl_flow_key,
>remove it.
>
>Signed-off-by: Or Gerlitz 
>Reported-by: Hadar Hen Zion 

Acked-by: Jiri Pirko

Re: [RFC 00/11] QLogic RDMA Driver (qedr) RFC

2016-09-13 Thread Sagi Grimberg


Hey Ram and Co,


This series introduces RoCE RDMA driver for the 579xx RDMA products by Qlogic.
The RDMA support is added as an additional loadable module (qedr) over the 
Ethernet qede driver.
The qedr module will support both RoCE and iWarp, although this series only 
adds RoCE support.
The qed and qede drivers are enhanced with functionality required for RDMA 
support.

Any review/comment is appreciated.


Was this driver tested with any of our kernel consumers (nfs/iser/nvmf)?

If so, are there known issues?

Thanks,
Sagi.

Re: [RFC 07/11] Add support for memory registeration verbs

2016-09-13 Thread Sagi Grimberg




+static inline struct qedr_ah *get_qedr_ah(struct ib_ah *ibah)
+{
+   return container_of(ibah, struct qedr_ah, ibah);
+}


Little surprising to find that here... how is the ah related
to this patch?

Re: [RFC 03/11] Add support for RoCE HW init

2016-09-13 Thread Sagi Grimberg




+   dev->max_sge = min_t(u32, RDMA_MAX_SGE_PER_SQ_WQE,
+RDMA_MAX_SGE_PER_RQ_WQE);


Our kernel target mode consumers sort of rely on max_sge_rd, you need
to make sure to set it too.

Re: [PATCH] [RFC] proc connector: add namespace events

2016-09-13 Thread Alban Crequy

On 12 September 2016 at 23:39, Evgeniy Polyakov  wrote:
> Hi everyone
>
> 08.09.2016, 18:39, "Alban Crequy" :
>> The act of a process creating or joining a namespace via clone(),
>> unshare() or setns() is a useful signal for monitoring applications.
>
>> + if (old_ns->mnt_ns != new_ns->mnt_ns)
>> + proc_ns_connector(tsk, CLONE_NEWNS, PROC_NM_REASON_CLONE, old_mntns_inum, 
>> new_mntns_inum);
>> +
>> + if (old_ns->uts_ns != new_ns->uts_ns)
>> + proc_ns_connector(tsk, CLONE_NEWUTS, PROC_NM_REASON_CLONE, 
>> old_ns->uts_ns->ns.inum, new_ns->uts_ns->ns.inum);
>> +
>> + if (old_ns->ipc_ns != new_ns->ipc_ns)
>> + proc_ns_connector(tsk, CLONE_NEWIPC, PROC_NM_REASON_CLONE, 
>> old_ns->ipc_ns->ns.inum, new_ns->ipc_ns->ns.inum);
>> +
>> + if (old_ns->net_ns != new_ns->net_ns)
>> + proc_ns_connector(tsk, CLONE_NEWNET, PROC_NM_REASON_CLONE, 
>> old_ns->net_ns->ns.inum, new_ns->net_ns->ns.inum);
>> +
>> + if (old_ns->cgroup_ns != new_ns->cgroup_ns)
>> + proc_ns_connector(tsk, CLONE_NEWCGROUP, PROC_NM_REASON_CLONE, 
>> old_ns->cgroup_ns->ns.inum, new_ns->cgroup_ns->ns.inum);
>> +
>> + if (old_ns->pid_ns_for_children != new_ns->pid_ns_for_children)
>> + proc_ns_connector(tsk, CLONE_NEWPID, PROC_NM_REASON_CLONE, 
>> old_ns->pid_ns_for_children->ns.inum, new_ns->pid_ns_for_children->ns.inum);
>> + }
>> +
>
> Patch looks good to me from technical/connector point of view, but this event 
> multiplication is a bit weird imho.
>
> I'm not against it, but did you consider sending just 2 serialized ns 
> structures via single message, and client
> would check all ns bits himself?

I have not considered it, thanks for the suggestion. Should we offer
the guarantee to userspace that it will always be sent in one single
message? If we want to give the information about the userns change
too, it will be a bit more complicated to write the patch because it
is not done in the same function.

Note that I will probably not have the chance to spend more time on
this patch soon because Iago will explore other methods with
eBPF+kprobes instead. eBPF+kprobes would not have the same API
stability though. I was curious to see if anyone would find the
namespace addition to proc connector interesting for other projects.

Re: [RFC 08/11] Add support for data path

2016-09-13 Thread Sagi Grimberg





+   pbe = (struct regpair *)pbl_table->va;
+   num_pbes = 0;
+
+   for (i = 0; i < mr->npages &&
+(total_num_pbes != mr->info.pbl_info.num_pbes); i++) {
+   u64 buf_addr = mr->pages[i];
+
+   pbe->lo = cpu_to_le32((u32)buf_addr);
+   pbe->hi = cpu_to_le32((u32)upper_32_bits(buf_addr));


That's a shame... you could have easily set the buf_addr correctly
in qedr_set_page...

I think you could have also set the pbe directly from set_page if you
have access to pbl_table from your mr context
(and if I understand correctly I think you do, mr->info.pbl_table)...

RE: [RFC 02/11] Add RoCE driver framework

2016-09-13 Thread Steve Wise

> Adds a skeletal implementation of the qed* RoCE driver -
> basically the ability to communicate with the qede driver and
> receive notifications from it regarding various init/exit events.
> 
> Signed-off-by: Rajesh Borundia 
> Signed-off-by: Ram Amrani 
> ---
>  drivers/infiniband/Kconfig   |   2 +
>  drivers/infiniband/hw/Makefile   |   1 +
>  drivers/infiniband/hw/qedr/Kconfig   |   7 +
>  drivers/infiniband/hw/qedr/Makefile  |   3 +
>  drivers/infiniband/hw/qedr/main.c| 293 +
>  drivers/infiniband/hw/qedr/qedr.h|  60 ++
>  drivers/net/ethernet/qlogic/qede/Makefile|   1 +
>  drivers/net/ethernet/qlogic/qede/qede.h  |   9 +
>  drivers/net/ethernet/qlogic/qede/qede_main.c |  35 ++-
>  drivers/net/ethernet/qlogic/qede/qede_roce.c | 309
> +++
>  include/linux/qed/qed_if.h   |   3 +-
>  include/linux/qed/qede_roce.h|  88 
>  include/uapi/linux/pci_regs.h|   3 +
>  13 files changed, 803 insertions(+), 11 deletions(-)
>  create mode 100644 drivers/infiniband/hw/qedr/Kconfig
>  create mode 100644 drivers/infiniband/hw/qedr/Makefile
>  create mode 100644 drivers/infiniband/hw/qedr/main.c
>  create mode 100644 drivers/infiniband/hw/qedr/qedr.h
>  create mode 100644 drivers/net/ethernet/qlogic/qede/qede_roce.c
>  create mode 100644 include/linux/qed/qede_roce.h



> @@ -189,8 +189,7 @@ static int qede_netdev_event(struct notifier_block *this,
> unsigned long event,
>   struct ethtool_drvinfo drvinfo;
>   struct qede_dev *edev;
> 
> - /* Currently only support name change */
> - if (event != NETDEV_CHANGENAME)
> + if ((event != NETDEV_CHANGENAME) && (event !=
> NETDEV_CHANGEADDR))

nit: You don't really need the extra parens here.

Re: [RFC 07/11] Add support for memory registeration verbs

2016-09-13 Thread Sagi Grimberg




+struct qedr_mr *__qedr_alloc_mr(struct ib_pd *ibpd, int max_page_list_len)
+{
+   struct qedr_pd *pd = get_qedr_pd(ibpd);
+   struct qedr_dev *dev = get_qedr_dev(ibpd->device);
+   struct qedr_mr *mr;
+   int rc = -ENOMEM;
+
+   DP_VERBOSE(dev, QEDR_MSG_MR,
+  "qedr_alloc_frmr pd = %d max_page_list_len= %d\n", pd->pd_id,
+  max_page_list_len);
+
+   mr = kzalloc(sizeof(*mr), GFP_KERNEL);
+   if (!mr)
+   return ERR_PTR(rc);
+
+   mr->dev = dev;
+   mr->type = QEDR_MR_FRMR;
+
+   rc = init_mr_info(dev, &mr->info, max_page_list_len, 1);
+   if (rc)
+   goto err0;
+
+   rc = dev->ops->rdma_alloc_tid(dev->rdma_ctx, &mr->hw_mr.itid);
+   if (rc) {
+   DP_ERR(dev, "roce alloc tid returned an error %d\n", rc);
+   goto err0;
+   }
+
+   /* Index only, 18 bit long, lkey = itid << 8 | key */
+   mr->hw_mr.tid_type = QED_RDMA_TID_FMR;
+   mr->hw_mr.key = 0;
+   mr->hw_mr.pd = pd->pd_id;


Do you have a real MR<->PD association in HW? If so, can you point
me to the code that binds it? If not, any reason not to expose
the local_dma_lkey?


+struct ib_mr *qedr_get_dma_mr(struct ib_pd *ibpd, int acc)
+{
+   struct qedr_dev *dev = get_qedr_dev(ibpd->device);
+   struct qedr_pd *pd = get_qedr_pd(ibpd);
+   struct qedr_mr *mr;
+   int rc;
+
+   if (acc & IB_ACCESS_MW_BIND) {
+   DP_ERR(dev, "Unsupported access flags received for dma mr\n");
+   return ERR_PTR(-EINVAL);
+   }


This check looks like it really belongs in the core, it would help
everyone if you move it...

Although I know Christoph is trying to get rid of this API altogether...

Re: [PATCH net-next 1/2] net: mvneta: add xmit_more support

2016-09-13 Thread Eric Dumazet

On Tue, 2016-09-13 at 07:33 -0700, Eric Dumazet wrote:

> Hi Marcin
> 
> Well, given the above comment, and fact that MVNETA_MAX_TXD == 532, it
> looks like you might add a bug if more than 256 skb are given to your
> ndo_start_xmit() with skb->xmit_more = 1
> 
> I therefore suggest you make sure it does not happen.
> 
> txq->pending += frags;
> if (!skb->xmit_more ||
> txq->pending > 256 - MVNETA_MAX_SKB_DESCS ||
> netif_xmit_stopped(nq))
>   mvneta_txq_pend_desc_add(pp, txq)
> 

Another solution would be to test the potential overflow in mvneta_tx()
and force a mvneta_txq_pend_desc_add(pp, txq) _before_ adding the desc
of the "about to be cooked" TSO packet.

(This is because MVNETA_MAX_SKB_DESCS is 217, so 255-217 leaves little room
for xmit_more to show its power)

Re: [PATCH net-next 1/2] net: mvneta: add xmit_more support

2016-09-13 Thread Eric Dumazet

On Tue, 2016-09-13 at 09:00 +0200, Marcin Wojtas wrote:
> From: Simon Guinot 
> 
> Based on the xmit_more flag of the skb, TX descriptors can be concatenated
> before flushing. This commit delays the Tx descriptor flush if the queue is
> running and if there are more skbs to send.
> 
> Signed-off-by: Simon Guinot 
> ---
>  drivers/net/ethernet/marvell/mvneta.c | 11 ---
>  1 file changed, 8 insertions(+), 3 deletions(-)
> 
> diff --git a/drivers/net/ethernet/marvell/mvneta.c 
> b/drivers/net/ethernet/marvell/mvneta.c
> index d41c28d..b9dccea 100644
> --- a/drivers/net/ethernet/marvell/mvneta.c
> +++ b/drivers/net/ethernet/marvell/mvneta.c
> @@ -512,6 +512,7 @@ struct mvneta_tx_queue {
>* descriptor ring
>*/
>   int count;
> + int pending;
>   int tx_stop_threshold;
>   int tx_wake_threshold;
>  
> @@ -802,8 +803,9 @@ static void mvneta_txq_pend_desc_add(struct mvneta_port 
> *pp,
>   /* Only 255 descriptors can be added at once ; Assume caller
>* process TX desriptors in quanta less than 256
>*/

Hi Marcin

Well, given the above comment, and fact that MVNETA_MAX_TXD == 532, it
looks like you might add a bug if more than 256 skb are given to your
ndo_start_xmit() with skb->xmit_more = 1

I therefore suggest you make sure it does not happen.

txq->pending += frags;
if (!skb->xmit_more ||
txq->pending > 256 - MVNETA_MAX_SKB_DESCS ||
netif_xmit_stopped(nq))
mvneta_txq_pend_desc_add(pp, txq)

Re: [PATCH 0/4] Netfilter fixes for net

2016-09-13 Thread David Miller

From: Pablo Neira Ayuso 
Date: Tue, 13 Sep 2016 11:05:13 +0200

> The following patchset contains Netfilter fixes for your net tree,
> they are:
 ...
> You can pull these changes from:
> 
>   git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf.git

Pulled, thanks Pablo.

Re: [iovisor-dev] README: [PATCH RFC 11/11] net/mlx5e: XDP TX xmit more

2016-09-13 Thread Edward Cree

On 12/09/16 11:15, Jesper Dangaard Brouer wrote:
> I'm reacting so loudly, because this is a mental model switch, that
> need to be applied to the full drivers RX path. Also for normal stack
> delivery of SKBs. As both Edward Cree[1] and I[2] have demonstrated,
> there is between 10%-25% perf gain here.
>
> [1] http://lists.openwall.net/netdev/2016/04/19/89
> [2] http://lists.openwall.net/netdev/2016/01/15/51
BTW, I'd also still rather like to see that happen, I never really
understood the objections people had to those patches when I posted them.  I
still believe that dealing in skb-lists instead of skbs, and thus
'automatically' bulking similar packets, is better than trying to categorise
packets into flows early on based on some set of keys.  The problem with the
latter approach is that there are now two definitions of "similar":
1) the set of fields used to index the flow
2) what will actually cause the stack's behaviour to differ if not using the
cached values.
Quite apart from the possibility of bugs if one changes but not the other,
this forces (1) to be conservative, only considering things "similar" if the
entire stack will.  Whereas with bundling, the stack can keep packets
together until they reach a layer at which they are no longer "similar"
enough.  Thus, for instance, packets with the same IP 3-tuple but different
port numbers can be grouped together for IP layer processing, then split
apart for L4.

-Ed

Re: [PATCH net-next] net: bridge: add helper to call /sbin/bridge-stp

2016-09-13 Thread David Miller

From: Vivien Didelot 
Date: Thu,  8 Sep 2016 12:50:43 -0400

> If /sbin/bridge-stp is available on the system, bridge tries to execute
> it instead of the kernel implementation when starting/stopping STP.
> 
> If anything goes wrong with /sbin/bridge-stp, bridge silently falls back
> to kernel STP, making it hard to debug userspace STP.
> 
> This patch adds a br_stp_call_user helper to start/stop userspace STP
> and debug errors from the program: abnormal exit status is stored in the
> lower byte and normal exit status is stored in higher byte.
> 
> Below is a simple example on a kernel with dynamic debug enabled:
> 
> # ln -s /bin/false /sbin/bridge-stp
> # brctl stp br0 on
> br0: failed to start userspace STP (256)
> # dmesg
> br0: /sbin/bridge-stp exited with code 1
> br0: failed to start userspace STP (256)
> br0: using kernel STP
> 
> Signed-off-by: Vivien Didelot 

Applied.

Re: [PATCH v2] bnx2: Reset device during driver initialization

2016-09-13 Thread David Miller


Just to be clear, I did actually apply this v2 of the patch
rather than the initial version.:)

Re: [PATCH] bnx2: Reset device during driver initialization

2016-09-13 Thread David Miller

From: Baoquan He 
Date: Fri,  9 Sep 2016 16:11:07 +0800

> When system enters into kdump kernel because of kernel panic, it won't
> shutdown devices. On-flight DMA will continue transferring data until
> device driver initializes. All devices are supposed to reset during
> driver initialization. And this property is used to fix the kdump
> failure in system with intel iommu. Other systems with hardware iommu
> should be similar. Please check commit 091d42e ("iommu/vt-d: Copy
> translation tables from old kernel") and those commits around it.
> 
> But bnx2 driver doesn't reset device during driver initialization. The
> device resetting is deferred to net device up stage. This will cause
> hardware iommu handling failure on bnx2 device. And its resetting relies
> on firmware. So in this patch move the firmware requesting code to earlier
> bnx2_init_one(), then next call bnx2_reset_chip to reset device.
> 
> Signed-off-by: Baoquan He 

Applied, thanks.

Re: [PATCH -next] tipc: fix possible memory leak in tipc_udp_enable()

2016-09-13 Thread David Miller

From: Wei Yongjun 
Date: Sat, 10 Sep 2016 00:56:55 +

> From: Wei Yongjun 
> 
> 'ub' is malloced in tipc_udp_enable() and should be freed before
> leaving from the error handling cases, otherwise it will cause
> memory leak.
> 
> Fixes: ba5aa84a2d22 ("tipc: split UDP nl address parsing")
> Signed-off-by: Wei Yongjun 

Applied, thanks.

Re: net/bluetooth: workqueue destruction WARNING in hci_unregister_dev

2016-09-13 Thread Tejun Heo

Hello,

On Sat, Sep 10, 2016 at 11:33:48AM +0200, Dmitry Vyukov wrote:
> Hit the WARNING with the patch. It showed "Showing busy workqueues and
> worker pools:" after the WARNING, but then no queue info. Was it
> already destroyed and removed from the list?...

Hmm...  It either means that the work item which was in flight when
WARN_ON() ran finished by the time the debug printout got to it or
that it's something unrelated to busy work items.

> [ 198.113838] WARNING: CPU: 2 PID: 26691 at kernel/workqueue.c:4042
> destroy_workqueue+0x17b/0x630

I don't seem to have the same source code that you have.  Which exact
WARN_ON() is this?

Thanks.

-- 
tejun

Re: [RFC 00/11] QLogic RDMA Driver (qedr) RFC

2016-09-13 Thread Jason Gunthorpe

On Tue, Sep 13, 2016 at 08:44:06AM +, Ram Amrani wrote:
> Hi Jason,
> I see that "include/uapi/rdma" contains API that is common.

It is intended to contain *all* the uapi.

> The qedr_user.h, that I assume you are referring to, is a qedr specific API.
> For example, we issue the ib_copy_to_udata() on structures defined in the 
> file.
> So per my understanding it is in place.

Anything that is used with copy_to/from_user, ib_copy_to/from_udata,
etc, etc must be in a include/uapi header.

Any constant you might want to copy into your userspace provider must
be in include/uapi.

Avoid copying headers in your user space and use the standard kernel
names to access your driver's uapi.

Jason

Re: [PATCH] net: Remove NO_IRQ from powerpc-only network drivers

2016-09-13 Thread David Miller

From: Michael Ellerman 
Date: Sat, 10 Sep 2016 19:59:05 +1000

> We'd like to eventually remove NO_IRQ on powerpc, so remove usages of it
> from powerpc-only drivers.
> 
> Signed-off-by: Michael Ellerman 

Applied to net-next, thanks.

Re: [PATCH -next] net: macb: fix missing unlock on error in macb_start_xmit()

2016-09-13 Thread David Miller

From: Wei Yongjun 
Date: Sat, 10 Sep 2016 11:17:57 +

> From: Wei Yongjun 
> 
> Fix missing unlock before return from function macb_start_xmit()
> in the error handling case.
> 
> Fixes: 007e4ba3ee13 ("net: macb: initialize checksum when using
> checksum offloading")
> Signed-off-by: Wei Yongjun 

Applied.

Re: [PATCH -next] net: ethernet: dwmac: fix non static symbol warning

2016-09-13 Thread David Miller

From: Wei Yongjun 
Date: Sat, 10 Sep 2016 12:31:30 +

> From: Wei Yongjun 
> 
> Fixes the following sparse warning:
> 
> drivers/net/ethernet/stmicro/stmmac/dwmac-stm32.c:172:1: warning:
>  symbol 'stm32_dwmac_pm_ops' was not declared. Should it be static?
> 
> Signed-off-by: Wei Yongjun 

Applied.

Re: [PATCHv2 net] sctp: hold the transport before using it in sctp_hash_cmp

2016-09-13 Thread David Miller

From: Xin Long 
Date: Sat, 10 Sep 2016 23:11:23 +0800

> Since commit 4f0087812648 ("sctp: apply rhashtable api to send/recv
> path"), sctp uses transport rhashtable with .obj_cmpfn sctp_hash_cmp,
> in which it compares the members of the transport with the rhashtable
> args to check if it's the right transport.
> 
> But sctp uses the transport without holding it in sctp_hash_cmp, it can
> cause a use-after-free panic. As after it gets transport from hashtable,
> another CPU may close the sk and free the asoc. In sctp_association_free,
> it frees all the transports, meanwhile, the assoc's refcnt may be reduced
> to 0, assoc can be destroyed by sctp_association_destroy.
> 
> So after that, transport->assoc is actually an unavailable memory address
> in sctp_hash_cmp. Although sctp_hash_cmp is under rcu_read_lock, it still
> can not avoid this, as assoc is not freed by RCU.
> 
> This patch is to hold the transport before checking its members with
> sctp_transport_hold, in which it checks the refcnt first, holds it if
> it's not 0.
> 
> Fixes: 4f0087812648 ("sctp: apply rhashtable api to send/recv path")
> Signed-off-by: Xin Long 

Applied and queued up for -stable.

Re: [PATCH 1/2] net: ethernet: apm: xgene: use phydev from struct net_device

2016-09-13 Thread David Miller

From: Philippe Reynes 
Date: Sun, 11 Sep 2016 17:54:03 +0200

> The private structure contains a pointer to phydev, but the structure
> net_device already contains such a pointer. So we can remove the pointer
> phy_dev in the private structure, and update the driver to use the
> one contained in struct net_device.
> 
> Signed-off-by: Philippe Reynes 

Applied.

Re: [PATCH 2/2] net: ethernet: apm: xgene: use new api ethtool_{get|set}_link_ksettings

2016-09-13 Thread David Miller

From: Philippe Reynes 
Date: Sun, 11 Sep 2016 17:54:04 +0200

> The ethtool api {get|set}_settings is deprecated.
> We move this driver to new api {get|set}_link_ksettings.
> 
> Signed-off-by: Philippe Reynes 

Applied.

[net-next PATCH 05/11] libcxgb,iw_cxgb4,cxgbit: add cxgb_best_mtu()

2016-09-13 Thread Varun Prakash

Add cxgb_best_mtu() in libcxgb_cm.h to remove
its duplicate definitions from cxgb4/cm.c and
cxgbit/cxgbit_cm.c

Signed-off-by: Varun Prakash 
---
 drivers/infiniband/hw/cxgb4/cm.c  | 32 +++
 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h | 18 +
 drivers/target/iscsi/cxgbit/cxgbit_cm.c   | 20 +++---
 3 files changed, 30 insertions(+), 40 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index b35fdc0..c3c678f 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -677,20 +677,6 @@ static int send_abort(struct c4iw_ep *ep)
return c4iw_l2t_send(&ep->com.dev->rdev, req_skb, ep->l2t);
 }
 
-static void best_mtu(const unsigned short *mtus, unsigned short mtu,
-unsigned int *idx, int use_ts, int ipv6)
-{
-   unsigned short hdr_size = (ipv6 ?
-  sizeof(struct ipv6hdr) :
-  sizeof(struct iphdr)) +
- sizeof(struct tcphdr) +
- (use_ts ?
-  round_up(TCPOLEN_TIMESTAMP, 4) : 0);
-   unsigned short data_size = mtu - hdr_size;
-
-   cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx);
-}
-
 static int send_connect(struct c4iw_ep *ep)
 {
struct cpl_act_open_req *req = NULL;
@@ -750,9 +736,9 @@ static int send_connect(struct c4iw_ep *ep)
}
set_wr_txq(skb, CPL_PRIORITY_SETUP, ep->ctrlq_idx);
 
-   best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
-enable_tcp_timestamps,
-(AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+   cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+ enable_tcp_timestamps,
+ (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
wscale = compute_wscale(rcv_win);
 
/*
@@ -1930,9 +1916,9 @@ static int send_fw_act_open_req(struct c4iw_ep *ep, 
unsigned int atid)
htons(FW_OFLD_CONNECTION_WR_CPLRXDATAACK_F);
req->tcb.tx_max = (__force __be32) jiffies;
req->tcb.rcv_adv = htons(1);
-   best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
-enable_tcp_timestamps,
-(AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+   cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+ enable_tcp_timestamps,
+ (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
wscale = compute_wscale(rcv_win);
 
/*
@@ -2374,9 +2360,9 @@ static int accept_cr(struct c4iw_ep *ep, struct sk_buff 
*skb,
OPCODE_TID(rpl) = cpu_to_be32(MK_OPCODE_TID(CPL_PASS_ACCEPT_RPL,
ep->hwtid));
 
-   best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
-enable_tcp_timestamps && req->tcpopt.tstamp,
-(AF_INET == ep->com.remote_addr.ss_family) ? 0 : 1);
+   cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
+ enable_tcp_timestamps && req->tcpopt.tstamp,
+ (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
wscale = compute_wscale(rcv_win);
 
/*
diff --git a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h 
b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h
index 57fcc98..7fb4feb 100644
--- a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h
+++ b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h
@@ -33,6 +33,9 @@
 #ifndef __LIBCXGB_CM_H__
 #define __LIBCXGB_CM_H__
 
+
+#include 
+
 #include 
 #include 
 
@@ -56,4 +59,19 @@ static inline bool cxgb_is_neg_adv(unsigned int status)
   status == CPL_ERR_PERSIST_NEG_ADVICE ||
   status == CPL_ERR_KEEPALV_NEG_ADVICE;
 }
+
+static inline void
+cxgb_best_mtu(const unsigned short *mtus, unsigned short mtu,
+ unsigned int *idx, int use_ts, int ipv6)
+{
+   unsigned short hdr_size = (ipv6 ?
+  sizeof(struct ipv6hdr) :
+  sizeof(struct iphdr)) +
+ sizeof(struct tcphdr) +
+ (use_ts ?
+  round_up(TCPOLEN_TIMESTAMP, 4) : 0);
+   unsigned short data_size = mtu - hdr_size;
+
+   cxgb4_best_aligned_mtu(mtus, hdr_size, data_size, 8, idx);
+}
 #endif
diff --git a/drivers/target/iscsi/cxgbit/cxgbit_cm.c 
b/drivers/target/iscsi/cxgbit/cxgbit_cm.c
index c46bdd5..b09c09b 100644
--- a/drivers/target/iscsi/cxgbit/cxgbit_cm.c
+++ b/drivers/target/iscsi/cxgbit/cxgbit_cm.c
@@ -997,20 +997,6 @@ cxgbit_l2t_send(struct cxgbit_device *cdev, struct sk_buff 
*skb,
return ret < 0 ? ret : 0;
 }
 
-static void
-cxgbit_best_mtu(const unsigned short *mtus, unsigned short mtu,
-   unsigned int *idx, int use_ts, int ipv6)
-{
-

[net-next PATCH 01/11] libcxgb,iw_cxgb4,cxgbit: add cxgb_get_4tuple()

2016-09-13 Thread Varun Prakash

Add cxgb_get_4tuple() in libcxgb_cm.c to remove
its duplicate definitions from cxgb4/cm.c and
cxgbit/cxgbit_cm.c.

Signed-off-by: Varun Prakash 
---
 drivers/infiniband/hw/cxgb4/Kconfig   |  1 +
 drivers/infiniband/hw/cxgb4/Makefile  |  1 +
 drivers/infiniband/hw/cxgb4/cm.c  | 41 +
 drivers/net/ethernet/chelsio/libcxgb/Makefile |  4 +-
 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c | 72 +++
 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h | 42 +
 drivers/target/iscsi/cxgbit/cxgbit_cm.c   | 41 +
 7 files changed, 125 insertions(+), 77 deletions(-)
 create mode 100644 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c
 create mode 100644 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h

diff --git a/drivers/infiniband/hw/cxgb4/Kconfig 
b/drivers/infiniband/hw/cxgb4/Kconfig
index 23f38cf..afe8b28 100644
--- a/drivers/infiniband/hw/cxgb4/Kconfig
+++ b/drivers/infiniband/hw/cxgb4/Kconfig
@@ -1,6 +1,7 @@
 config INFINIBAND_CXGB4
tristate "Chelsio T4/T5 RDMA Driver"
depends on CHELSIO_T4 && INET && (IPV6 || IPV6=n)
+   select CHELSIO_LIB
select GENERIC_ALLOCATOR
---help---
  This is an iWARP/RDMA driver for the Chelsio T4 and T5
diff --git a/drivers/infiniband/hw/cxgb4/Makefile 
b/drivers/infiniband/hw/cxgb4/Makefile
index e11cf72..fa40b68 100644
--- a/drivers/infiniband/hw/cxgb4/Makefile
+++ b/drivers/infiniband/hw/cxgb4/Makefile
@@ -1,4 +1,5 @@
 ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
+ccflags-y += -Idrivers/net/ethernet/chelsio/libcxgb
 
 obj-$(CONFIG_INFINIBAND_CXGB4) += iw_cxgb4.o
 
diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index b6a953a..e591f61 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -49,6 +49,7 @@
 
 #include 
 
+#include 
 #include "iw_cxgb4.h"
 #include "clip_tbl.h"
 
@@ -2518,42 +2519,6 @@ static void reject_cr(struct c4iw_dev *dev, u32 hwtid, 
struct sk_buff *skb)
return;
 }
 
-static void get_4tuple(struct cpl_pass_accept_req *req, enum chip_type type,
-  int *iptype, __u8 *local_ip, __u8 *peer_ip,
-  __be16 *local_port, __be16 *peer_port)
-{
-   int eth_len = (CHELSIO_CHIP_VERSION(type) <= CHELSIO_T5) ?
- ETH_HDR_LEN_G(be32_to_cpu(req->hdr_len)) :
- T6_ETH_HDR_LEN_G(be32_to_cpu(req->hdr_len));
-   int ip_len = (CHELSIO_CHIP_VERSION(type) <= CHELSIO_T5) ?
-IP_HDR_LEN_G(be32_to_cpu(req->hdr_len)) :
-T6_IP_HDR_LEN_G(be32_to_cpu(req->hdr_len));
-   struct iphdr *ip = (struct iphdr *)((u8 *)(req + 1) + eth_len);
-   struct ipv6hdr *ip6 = (struct ipv6hdr *)((u8 *)(req + 1) + eth_len);
-   struct tcphdr *tcp = (struct tcphdr *)
-((u8 *)(req + 1) + eth_len + ip_len);
-
-   if (ip->version == 4) {
-   PDBG("%s saddr 0x%x daddr 0x%x sport %u dport %u\n", __func__,
-ntohl(ip->saddr), ntohl(ip->daddr), ntohs(tcp->source),
-ntohs(tcp->dest));
-   *iptype = 4;
-   memcpy(peer_ip, &ip->saddr, 4);
-   memcpy(local_ip, &ip->daddr, 4);
-   } else {
-   PDBG("%s saddr %pI6 daddr %pI6 sport %u dport %u\n", __func__,
-ip6->saddr.s6_addr, ip6->daddr.s6_addr, ntohs(tcp->source),
-ntohs(tcp->dest));
-   *iptype = 6;
-   memcpy(peer_ip, ip6->saddr.s6_addr, 16);
-   memcpy(local_ip, ip6->daddr.s6_addr, 16);
-   }
-   *peer_port = tcp->source;
-   *local_port = tcp->dest;
-
-   return;
-}
-
 static int pass_accept_req(struct c4iw_dev *dev, struct sk_buff *skb)
 {
struct c4iw_ep *child_ep = NULL, *parent_ep;
@@ -2582,8 +2547,8 @@ static int pass_accept_req(struct c4iw_dev *dev, struct 
sk_buff *skb)
goto reject;
}
 
-   get_4tuple(req, parent_ep->com.dev->rdev.lldi.adapter_type, &iptype,
-  local_ip, peer_ip, &local_port, &peer_port);
+   cxgb_get_4tuple(req, parent_ep->com.dev->rdev.lldi.adapter_type,
+   &iptype, local_ip, peer_ip, &local_port, &peer_port);
 
/* Find output route */
if (iptype == 4)  {
diff --git a/drivers/net/ethernet/chelsio/libcxgb/Makefile 
b/drivers/net/ethernet/chelsio/libcxgb/Makefile
index 2362230..2534e30 100644
--- a/drivers/net/ethernet/chelsio/libcxgb/Makefile
+++ b/drivers/net/ethernet/chelsio/libcxgb/Makefile
@@ -1,3 +1,5 @@
+ccflags-y := -Idrivers/net/ethernet/chelsio/cxgb4
+
 obj-$(CONFIG_CHELSIO_LIB) += libcxgb.o
 
-libcxgb-y := libcxgb_ppm.o
+libcxgb-y := libcxgb_ppm.o libcxgb_cm.o
diff --git a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c 
b/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c
new file mode 100644
index 000..d7342bb
--- /dev/null
+++ b/drivers/net/ethernet/c

[net-next PATCH 02/11] libcxgb,iw_cxgb4,cxgbit: add cxgb_find_route()

2016-09-13 Thread Varun Prakash

Add cxgb_find_route() in libcxgb_cm.c to remove
its duplicate definitions from cxgb4/cm.c and
cxgbit/cxgbit_cm.c.

Signed-off-by: Varun Prakash 
---
 drivers/infiniband/hw/cxgb4/cm.c  | 53 +++
 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c | 44 +++
 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h |  4 ++
 drivers/target/iscsi/cxgbit/cxgbit_cm.c   | 36 +++
 4 files changed, 69 insertions(+), 68 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index e591f61..02f5e20 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -505,32 +505,6 @@ out:
return dst;
 }
 
-static struct dst_entry *find_route(struct c4iw_dev *dev, __be32 local_ip,
-__be32 peer_ip, __be16 local_port,
-__be16 peer_port, u8 tos)
-{
-   struct rtable *rt;
-   struct flowi4 fl4;
-   struct neighbour *n;
-
-   rt = ip_route_output_ports(&init_net, &fl4, NULL, peer_ip, local_ip,
-  peer_port, local_port, IPPROTO_TCP,
-  tos, 0);
-   if (IS_ERR(rt))
-   return NULL;
-   n = dst_neigh_lookup(&rt->dst, &peer_ip);
-   if (!n)
-   return NULL;
-   if (!our_interface(dev, n->dev) &&
-   !(n->dev->flags & IFF_LOOPBACK)) {
-   neigh_release(n);
-   dst_release(&rt->dst);
-   return NULL;
-   }
-   neigh_release(n);
-   return &rt->dst;
-}
-
 static void arp_failure_discard(void *handle, struct sk_buff *skb)
 {
pr_err(MOD "ARP failure\n");
@@ -2215,9 +2189,11 @@ static int c4iw_reconnect(struct c4iw_ep *ep)
 
/* find a route */
if (ep->com.cm_id->m_local_addr.ss_family == AF_INET) {
-   ep->dst = find_route(ep->com.dev, laddr->sin_addr.s_addr,
-raddr->sin_addr.s_addr, laddr->sin_port,
-raddr->sin_port, ep->com.cm_id->tos);
+   ep->dst = cxgb_find_route(&ep->com.dev->rdev.lldi, get_real_dev,
+ laddr->sin_addr.s_addr,
+ raddr->sin_addr.s_addr,
+ laddr->sin_port,
+ raddr->sin_port, ep->com.cm_id->tos);
iptype = 4;
ra = (__u8 *)&raddr->sin_addr;
} else {
@@ -2556,9 +2532,9 @@ static int pass_accept_req(struct c4iw_dev *dev, struct 
sk_buff *skb)
 , __func__, parent_ep, hwtid,
 local_ip, peer_ip, ntohs(local_port),
 ntohs(peer_port), peer_mss);
-   dst = find_route(dev, *(__be32 *)local_ip, *(__be32 *)peer_ip,
-local_port, peer_port,
-tos);
+   dst = cxgb_find_route(&dev->rdev.lldi, get_real_dev,
+ *(__be32 *)local_ip, *(__be32 *)peer_ip,
+ local_port, peer_port, tos);
} else {
PDBG("%s parent ep %p hwtid %u laddr %pI6 raddr %pI6 lport %d 
rport %d peer_mss %d\n"
 , __func__, parent_ep, hwtid,
@@ -3340,9 +3316,11 @@ int c4iw_connect(struct iw_cm_id *cm_id, struct 
iw_cm_conn_param *conn_param)
PDBG("%s saddr %pI4 sport 0x%x raddr %pI4 rport 0x%x\n",
 __func__, &laddr->sin_addr, ntohs(laddr->sin_port),
 ra, ntohs(raddr->sin_port));
-   ep->dst = find_route(dev, laddr->sin_addr.s_addr,
-raddr->sin_addr.s_addr, laddr->sin_port,
-raddr->sin_port, cm_id->tos);
+   ep->dst = cxgb_find_route(&dev->rdev.lldi, get_real_dev,
+ laddr->sin_addr.s_addr,
+ raddr->sin_addr.s_addr,
+ laddr->sin_port,
+ raddr->sin_port, cm_id->tos);
} else {
iptype = 6;
ra = (__u8 *)&raddr6->sin6_addr;
@@ -4006,8 +3984,9 @@ static int rx_pkt(struct c4iw_dev *dev, struct sk_buff 
*skb)
 ntohl(iph->daddr), ntohs(tcph->dest), ntohl(iph->saddr),
 ntohs(tcph->source), iph->tos);
 
-   dst = find_route(dev, iph->daddr, iph->saddr, tcph->dest, tcph->source,
-iph->tos);
+   dst = cxgb_find_route(&dev->rdev.lldi, get_real_dev,
+ iph->daddr, iph->saddr, tcph->dest,
+ tcph->source, iph->tos);
if (!dst) {
pr_err("%s - failed to find dst entry!\n",
   __func__);
diff --git a/drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.c 
b/drivers/n

[net-next PATCH 06/11] libcxgb,iw_cxgb4,cxgbit: add cxgb_compute_wscale()

2016-09-13 Thread Varun Prakash

Add cxgb_compute_wscale() in libcxgb_cm.h to remove
its duplicate definitions from cxgb4/cm.c and
cxgbit/cxgbit_cm.c.

Signed-off-by: Varun Prakash 
---
 drivers/infiniband/hw/cxgb4/cm.c  | 12 ++--
 drivers/infiniband/hw/cxgb4/iw_cxgb4.h|  9 -
 drivers/net/ethernet/chelsio/libcxgb/libcxgb_cm.h |  9 +
 drivers/target/iscsi/cxgbit/cxgbit_cm.c   | 11 +--
 4 files changed, 16 insertions(+), 25 deletions(-)

diff --git a/drivers/infiniband/hw/cxgb4/cm.c b/drivers/infiniband/hw/cxgb4/cm.c
index c3c678f..b9d77df 100644
--- a/drivers/infiniband/hw/cxgb4/cm.c
+++ b/drivers/infiniband/hw/cxgb4/cm.c
@@ -689,7 +689,7 @@ static int send_connect(struct c4iw_ep *ep)
u64 opt0;
u32 opt2;
unsigned int mtu_idx;
-   int wscale;
+   u32 wscale;
int win, sizev4, sizev6, wrlen;
struct sockaddr_in *la = (struct sockaddr_in *)
 &ep->com.local_addr;
@@ -739,7 +739,7 @@ static int send_connect(struct c4iw_ep *ep)
cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
  enable_tcp_timestamps,
  (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
-   wscale = compute_wscale(rcv_win);
+   wscale = cxgb_compute_wscale(rcv_win);
 
/*
 * Specify the largest window that will fit in opt0. The
@@ -1891,7 +1891,7 @@ static int send_fw_act_open_req(struct c4iw_ep *ep, 
unsigned int atid)
struct sk_buff *skb;
struct fw_ofld_connection_wr *req;
unsigned int mtu_idx;
-   int wscale;
+   u32 wscale;
struct sockaddr_in *sin;
int win;
 
@@ -1919,7 +1919,7 @@ static int send_fw_act_open_req(struct c4iw_ep *ep, 
unsigned int atid)
cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
  enable_tcp_timestamps,
  (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
-   wscale = compute_wscale(rcv_win);
+   wscale = cxgb_compute_wscale(rcv_win);
 
/*
 * Specify the largest window that will fit in opt0. The
@@ -2339,7 +2339,7 @@ static int accept_cr(struct c4iw_ep *ep, struct sk_buff 
*skb,
unsigned int mtu_idx;
u64 opt0;
u32 opt2;
-   int wscale;
+   u32 wscale;
struct cpl_t5_pass_accept_rpl *rpl5 = NULL;
int win;
enum chip_type adapter_type = ep->com.dev->rdev.lldi.adapter_type;
@@ -2363,7 +2363,7 @@ static int accept_cr(struct c4iw_ep *ep, struct sk_buff 
*skb,
cxgb_best_mtu(ep->com.dev->rdev.lldi.mtus, ep->mtu, &mtu_idx,
  enable_tcp_timestamps && req->tcpopt.tstamp,
  (ep->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
-   wscale = compute_wscale(rcv_win);
+   wscale = cxgb_compute_wscale(rcv_win);
 
/*
 * Specify the largest window that will fit in opt0. The
diff --git a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h 
b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
index aa47e0a..6a9bef1f 100644
--- a/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
+++ b/drivers/infiniband/hw/cxgb4/iw_cxgb4.h
@@ -881,15 +881,6 @@ static inline struct c4iw_listen_ep *to_listen_ep(struct 
iw_cm_id *cm_id)
return cm_id->provider_data;
 }
 
-static inline int compute_wscale(int win)
-{
-   int wscale = 0;
-
-   while (wscale < 14 && (65535lldi.mtus, csk->mtu, &mtu_idx,
  req->tcpopt.tstamp,
  (csk->com.remote_addr.ss_family == AF_INET) ? 0 : 1);
-   wscale = cxgbit_compute_wscale(csk->rcv_win);
+   wscale = cxgb_compute_wscale(csk->rcv_win);
/*
 * Specify the largest window that will fit in opt0. The
 * remainder will be specified in the rx_data_ack.
-- 
2.0.2

1 2 3 >

1 - 100 of 227 matches

Mail list logo