[PATCH v2] rsi: Fix failure to load firmware after memory leak fix and fix the leak

2015-07-27 Thread Mike Looijmans
Fixes commit eae79b4f3e82 (rsi: fix memory leak in rsi_load_ta_instructions())
which stopped the driver from functioning.

Firmware data has been allocated using vmalloc(), resulting in memory
that cannot be used for DMA. Hence the firmware was first copied to a
buffer allocated with kmalloc() in the original code. This patch reverts
the commit and only calls kfree() to release the buffer after sending
the data. This fixes the memory leak without breaking the driver.

Add a comment to the kmemdup() calls to explain why this is done, and abort
if memory allocation fails.

Tested on a Topic Miami-Florida board which contains the rsi SDIO chip.

Also added the same kfree() call to the USB glue driver. This was not
tested on actual hardware though, as I only have the SDIO version.

Fixes: eae79b4f3e82 (rsi: fix memory leak in rsi_load_ta_instructions())
Signed-off-by: Mike Looijmans mike.looijm...@topic.nl
Cc: sta...@vger.kernel.org
---
v2: Add Fixes: header and abbreviate git hashes.
Return -ENOMEM if kmemdup() fails.

 drivers/net/wireless/rsi/rsi_91x_sdio_ops.c | 8 +++-
 drivers/net/wireless/rsi/rsi_91x_usb_ops.c  | 4 
 2 files changed, 11 insertions(+), 1 deletion(-)

diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c 
b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
index b6cc9ff..1c6788a 100644
--- a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
+++ b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c
@@ -172,6 +172,7 @@ static int rsi_load_ta_instructions(struct rsi_common 
*common)
(struct rsi_91x_sdiodev *)adapter-rsi_dev;
u32 len;
u32 num_blocks;
+   const u8 *fw;
const struct firmware *fw_entry = NULL;
u32 block_size = dev-tx_blk_size;
int status = 0;
@@ -200,6 +201,10 @@ static int rsi_load_ta_instructions(struct rsi_common 
*common)
return status;
}
 
+   /* Copy firmware into DMA-accessible memory */
+   fw = kmemdup(fw_entry-data, fw_entry-size, GFP_KERNEL);
+   if (!fw)
+   return -ENOMEM;
len = fw_entry-size;
 
if (len % 4)
@@ -210,7 +215,8 @@ static int rsi_load_ta_instructions(struct rsi_common 
*common)
rsi_dbg(INIT_ZONE, %s: Instruction size:%d\n, __func__, len);
rsi_dbg(INIT_ZONE, %s: num blocks: %d\n, __func__, num_blocks);
 
-   status = rsi_copy_to_card(common, fw_entry-data, len, num_blocks);
+   status = rsi_copy_to_card(common, fw, len, num_blocks);
+   kfree(fw);
release_firmware(fw_entry);
return status;
 }
diff --git a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c 
b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
index 1106ce7..30c2cf7 100644
--- a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
+++ b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c
@@ -146,7 +146,10 @@ static int rsi_load_ta_instructions(struct rsi_common 
*common)
return status;
}
 
+   /* Copy firmware into DMA-accessible memory */
fw = kmemdup(fw_entry-data, fw_entry-size, GFP_KERNEL);
+   if (!fw)
+   return -ENOMEM;
len = fw_entry-size;
 
if (len % 4)
@@ -158,6 +161,7 @@ static int rsi_load_ta_instructions(struct rsi_common 
*common)
rsi_dbg(INIT_ZONE, %s: num blocks: %d\n, __func__, num_blocks);
 
status = rsi_copy_to_card(common, fw, len, num_blocks);
+   kfree(fw);
release_firmware(fw_entry);
return status;
 }
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net:master 41/49] drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel passed 3 arguments, but takes just 2

2015-07-27 Thread Andy Shevchenko
On Mon, 2015-07-27 at 17:03 +0800, kbuild test robot wrote:
 tree:   git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git 
 master
 head:   8fff755e9f8d0f70a595e79f248695ce6aef5cc3
 commit: f2ce8a9e48385f444389e75cfe293637c3eb5410 [41/49] net/macb: 
 improve big endian CPU support
 config: arm-at91_dt_defconfig (attached as .config)
 reproduce:
   wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp
 -tests.git/plain/sbin/make.cross -O ~/bin/make.cross
   chmod +x ~/bin/make.cross
   git checkout f2ce8a9e48385f444389e75cfe293637c3eb5410
   # save the attached .config to linux build tree
   make.cross ARCH=arm 

Oh, no.

I use the compiler from Debian for AVR32 and didn't check this on other
architectures.

Possibly something like the following will fix it:

--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -429,12 +429,12 @@
 | GEM_BF(name, value))

 /* Register access macros */
-#define macb_readl(port, reg)  (port)-readl((port),
MACB_##reg)
-#define macb_writel(port, reg, value)  (port)-writel((port),
MACB_##reg, (value))
-#define gem_readl(port, reg)   (port)-readl((port),
GEM_##reg)
-#define gem_writel(port, reg, value)   (port)-writel((port),
GEM_##reg, (value))
-#define queue_readl(queue, reg)(queue)-bp
-readl((queue)-bp, (queue)-reg)
-#define queue_writel(queue, reg, value)(queue)-bp
-writel((queue)-bp, (queue)-reg, (value))
+#define macb_readl(port, reg)  port-readl(port, MACB_##reg)
+#define macb_writel(port, reg, value)  port-writel(port, MACB_##reg,
(value))
+#define gem_readl(port, reg)   port-readl(port, GEM_##reg)
+#define gem_writel(port, reg, value)   port-writel(port, GEM_##reg,
(value))
+#define queue_readl(queue, reg)queue-bp-readl(queue
-bp, queue-reg)
+#define queue_writel(queue, reg, value)queue-bp-writel(queue
-bp, queue-reg, (value))

 /* Conditional GEM/MACB macros.  These perform the operation to the
correct
  * register dependent on whether the device is a GEM or a MACB.  For
registers



 
 All error/warnings (new ones prefixed by ):
 
drivers/net/ethernet/cadence/macb.c: In function 
 'macb_set_hwaddr':
   drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel 
   passed 3 arguments, but takes just 2
  macb_or_gem_writel(bp, SA1B, bottom);
 ^
In file included from drivers/net/ethernet/cadence/macb.c:34:0:
   drivers/net/ethernet/cadence/macb.h:435:38: warning: statement 
   with no effect [-Wunused-value]
 #define gem_writel(port, reg, value) (port)-writel((port), 
 GEM_##reg, (value))
  ^
   drivers/net/ethernet/cadence/macb.h:447:4: note: in expansion of 
   macro 'gem_writel'
gem_writel((__bp), __reg, __value); \
^
   drivers/net/ethernet/cadence/macb.c:164:2: note: in expansion of 
   macro 'macb_or_gem_writel'
  macb_or_gem_writel(bp, SA1B, bottom);
  ^
   drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel 
   passed 3 arguments, but takes just 2
  macb_or_gem_writel(bp, SA1B, bottom);
 ^
In file included from drivers/net/ethernet/cadence/macb.c:34:0:
drivers/net/ethernet/cadence/macb.h:433:39: warning: statement 
 with no effect [-Wunused-value]
 #define macb_writel(port, reg, value) (port)-writel((port), 
 MACB_##reg, (value))
   ^
   drivers/net/ethernet/cadence/macb.h:449:4: note: in expansion of 
   macro 'macb_writel'
macb_writel((__bp), __reg, __value); \
^
   drivers/net/ethernet/cadence/macb.c:164:2: note: in expansion of 
   macro 'macb_or_gem_writel'
  macb_or_gem_writel(bp, SA1B, bottom);
  ^
drivers/net/ethernet/cadence/macb.c:166:1: error: macro writel 
 passed 3 arguments, but takes just 2
  macb_or_gem_writel(bp, SA1T, top);
 ^
In file included from drivers/net/ethernet/cadence/macb.c:34:0:
   drivers/net/ethernet/cadence/macb.h:435:38: warning: statement 
   with no effect [-Wunused-value]
 #define gem_writel(port, reg, value) (port)-writel((port), 
 GEM_##reg, (value))
  ^
   drivers/net/ethernet/cadence/macb.h:447:4: note: in expansion of 
   macro 'gem_writel'
gem_writel((__bp), __reg, __value); \
^
drivers/net/ethernet/cadence/macb.c:166:2: note: in expansion of 
 macro 'macb_or_gem_writel'
  macb_or_gem_writel(bp, SA1T, top);
  ^
drivers/net/ethernet/cadence/macb.c:166:1: error: macro writel 
 passed 3 arguments, but takes just 2
  macb_or_gem_writel(bp, SA1T, top);
 ^
In file included from drivers/net/ethernet/cadence/macb.c:34:0:
drivers/net/ethernet/cadence/macb.h:433:39: warning: statement 
 with no effect [-Wunused-value]
 #define macb_writel(port, reg, value) (port)-writel((port), 
 MACB_##reg, (value))
   ^
   drivers/net/ethernet/cadence/macb.h:449:4: note: in expansion of 
   macro 

[PATCH V3 net-next 3/3] ARM: net: add support for BPF_ANC | SKF_AD_HATYPE in ARM JIT.

2015-07-27 Thread Nicolas Schichan
Signed-off-by: Nicolas Schichan nschic...@freebox.fr
---
 arch/arm/net/bpf_jit_32.c | 22 --
 arch/arm/net/bpf_jit_32.h |  3 +++
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 3c73caf..876060b 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -857,7 +857,9 @@ b_epilogue:
emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
break;
case BPF_ANC | SKF_AD_IFINDEX:
+   case BPF_ANC | SKF_AD_HATYPE:
/* A = skb-dev-ifindex */
+   /* A = skb-dev-type */
ctx-seen |= SEEN_SKB;
off = offsetof(struct sk_buff, dev);
emit(ARM_LDR_I(r_scratch, r_skb, off), ctx);
@@ -867,8 +869,24 @@ b_epilogue:
 
BUILD_BUG_ON(FIELD_SIZEOF(struct net_device,
  ifindex) != 4);
-   off = offsetof(struct net_device, ifindex);
-   emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+   BUILD_BUG_ON(FIELD_SIZEOF(struct net_device,
+ type) != 2);
+
+   if (code == (BPF_ANC | SKF_AD_IFINDEX)) {
+   off = offsetof(struct net_device, ifindex);
+   emit(ARM_LDR_I(r_A, r_scratch, off), ctx);
+   } else {
+   /*
+* offset of field type in struct
+* net_device is above what can be
+* used in the ldrh rd, [rn, #imm]
+* instruction, so load the offset in
+* a register and use ldrh rd, [rn, rm]
+*/
+   off = offsetof(struct net_device, type);
+   emit_mov_i(ARM_R3, off, ctx);
+   emit(ARM_LDRH_R(r_A, r_scratch, ARM_R3), ctx);
+   }
break;
case BPF_ANC | SKF_AD_MARK:
ctx-seen |= SEEN_SKB;
diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h
index b2d7d92..4b17d5ab 100644
--- a/arch/arm/net/bpf_jit_32.h
+++ b/arch/arm/net/bpf_jit_32.h
@@ -74,6 +74,7 @@
 #define ARM_INST_LDRB_I0x05d0
 #define ARM_INST_LDRB_R0x07d0
 #define ARM_INST_LDRH_I0x01d000b0
+#define ARM_INST_LDRH_R0x019000b0
 #define ARM_INST_LDR_I 0x0590
 
 #define ARM_INST_LDM   0x0890
@@ -160,6 +161,8 @@
 | (rm))
 #define ARM_LDRH_I(rt, rn, off)(ARM_INST_LDRH_I | (rt)  12 | (rn)  
16 \
 | (((off)  0xf0)  4) | ((off)  0xf))
+#define ARM_LDRH_R(rt, rn, rm) (ARM_INST_LDRH_R | (rt)  12 | (rn)  16 \
+| (rm))
 
 #define ARM_LDM(rn, regs)  (ARM_INST_LDM | (rn)  16 | (regs))
 
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V3 net-next 0/3] ARM BPF JIT features

2015-07-27 Thread Nicolas Schichan
Hello,

This series adds support for more instructions to the ARM BPF JIT
namely skb netdevice type retrieval, skb payload offset retrieval, and
skb packet type retrieval.

This allows 35 tests to use the JIT instead of 29 before.

This series depends on the BPF JIT fixes for ARM series sent earlier.

Regards,

Changes from V1 to V2:
* split fixes and features in separate patch series.

Changes from V2 to V3:
* respin against latest net-next.

Nicolas Schichan (3):
  ARM: net: add support for BPF_ANC | SKF_AD_PKTTYPE in ARM JIT.
  ARM: net: add support for BPF_ANC | SKF_AD_PAY_OFFSET in ARM JIT.
  ARM: net: add support for BPF_ANC | SKF_AD_HATYPE in ARM JIT.

 arch/arm/net/bpf_jit_32.c | 41 +++--
 arch/arm/net/bpf_jit_32.h |  3 +++
 2 files changed, 42 insertions(+), 2 deletions(-)

-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] MIPS: Remove most of the custom gpio.h

2015-07-27 Thread Linus Walleij
On Thu, Jul 23, 2015 at 8:25 PM, Lars-Peter Clausen l...@metafoo.de wrote:
 On 07/22/2015 07:33 PM, Alban Bedel wrote:

 diff --git a/arch/mips/jz4740/gpio.c b/arch/mips/jz4740/gpio.c
 index 54c80d4..3dc500c 100644
 --- a/arch/mips/jz4740/gpio.c
 +++ b/arch/mips/jz4740/gpio.c
 @@ -262,18 +262,6 @@ uint32_t jz_gpio_port_get_value(int port, uint32_t
 mask)
   }
   EXPORT_SYMBOL(jz_gpio_port_get_value);

 -int gpio_to_irq(unsigned gpio)
 -{
 -   return JZ4740_IRQ_GPIO(0) + gpio;
 -}
 -EXPORT_SYMBOL_GPL(gpio_to_irq);


 This needs to be hooked up to the gpio_to_irq() callback of the gpio_chip struct
 of this driver rather than completely removing it. Otherwise this
 functionality will be broken.

 Similar for other platforms which implement the function.

Even better is to see if we can convert the driver to
GPIOLIB_IRQCHIP which moves the handling of IRQ mapping
to the gpiolib core. This works for all simple cascading GPIO-with-IRQ
controllers with a local mask register. (Not when the system intcon
and GPIO is mashed up though.)

But no hurry with that.

Yours,
Linus Walleij
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH net] sctp: ASCONF-ACK with Unresolvable Address should be sent

2015-07-27 Thread Marcelo Ricardo Leitner
On Sat, Jul 25, 2015 at 01:08:08PM +0800, Xin Long wrote:
 RFC 5061:
 This is an opaque integer assigned by the sender to identify each
 request parameter.  The receiver of the ASCONF Chunk will copy this
 32-bit value into the ASCONF Response Correlation ID field of the
 ASCONF-ACK response parameter.  The sender of the ASCONF can use this
 same value in the ASCONF-ACK to find which request the response is
 for.  Note that the receiver MUST NOT change this 32-bit value.
 
 Address Parameter: TLV
 
 This field contains an IPv4 or IPv6 address parameter, as described
 in Section 3.3.2.1 of [RFC4960].
 
 ASCONF chunk with Error Cause Indication Parameter (Unresolvable Address)
 should be sent if the Delete IP Address is not part of the association.
 
   Endpoint A   Endpoint B
   (ESTABLISHED)(ESTABLISHED)
 
   ASCONF-
   (Delete IP Address)
 -  ASCONF-ACK
 (Unresolvable Address)
 
 Signed-off-by: Xin Long lucien@gmail.com
 ---
  net/sctp/sm_make_chunk.c | 15 +--
  1 file changed, 13 insertions(+), 2 deletions(-)
 
 diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c
 index 06320c8..6e399f6 100644
 --- a/net/sctp/sm_make_chunk.c
 +++ b/net/sctp/sm_make_chunk.c
 @@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct 
 sctp_association *asoc,
   sctp_assoc_set_primary(asoc, asconf-transport);
   sctp_assoc_del_nonprimary_peers(asoc,
   asconf-transport);
 - } else
 - sctp_assoc_del_peer(asoc, addr);
 + return SCTP_ERROR_NO_ERROR;
 + }
 +
 + /* If the address is not part of the association, the
 +  * ASCONF-ACK with Error Cause Indication Parameter
 +  * which including cause of Unresolvable Address should
 +  * be sent.
 +  */
 + peer = sctp_assoc_lookup_paddr(asoc, addr);
 + if (!peer)
 + return SCTP_ERROR_DNS_FAILED;
 +
 + sctp_assoc_rm_peer(asoc, peer);
   break;
   case SCTP_PARAM_SET_PRIMARY:
   /* ADDIP Section 4.2.4
 -- 
 2.1.0
 

Looks good to me.

  Marcelo

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Several races in usbnet module (kernel 4.1.x)

2015-07-27 Thread Oliver Neukum
On Fri, 2015-07-24 at 20:38 +0300, Eugene Shatokhin wrote:
 21.07.2015 15:04, Oliver Neukum пишет:

  your analysis is correct and it looks like in addition to your proposed
  fix locking needs to be simplified and a common lock to be taken.
  Suggestions?
 
 Just an idea, I haven't tested it.
 
 How about moving the operations with dev-done under list-lock in 
 defer_bh, while keeping dev-done.lock too and changing 

Why keep dev-done.lock?
Does it make sense at all?

 usbnet_terminate_urbs() as described below?
 
 Like this:
 @@ -428,12 +428,12 @@ static enum skb_state defer_bh(struct usbnet *dev, 
 struct sk_buff *skb,
   old_state = entry-state;
   entry-state = state;
   __skb_unlink(skb, list);
 - spin_unlock(list-lock);
   spin_lock(dev-done.lock);
   __skb_queue_tail(dev-done, skb);
   if (dev-done.qlen == 1)
   tasklet_schedule(dev-bh);
 - spin_unlock_irqrestore(dev-done.lock, flags);
 + spin_unlock(dev-done.lock);
 + spin_unlock_irqrestore(list-lock, flags);
   return old_state;
   }
 ---
 
 usbnet_terminate_urbs() can then be changed as follows:
 
 @@ -749,6 +749,20 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs);
 
  
 /*-*/
 
 +static void wait_skb_queue_empty(struct sk_buff_head *q)
 +{
 + unsigned long flags;
 +
 + spin_lock_irqsave(q-lock, flags);
 + while (!skb_queue_empty(q)) {
 + spin_unlock_irqrestore(q-lock, flags);
 + schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
 + set_current_state(TASK_UNINTERRUPTIBLE);

I suppose you want to invert those lines

 + spin_lock_irqsave(q-lock, flags);
 + }
 + spin_unlock_irqrestore(q-lock, flags);
 +}
 +

Your changes make sense, but it looks to me as if a lock would
become totally redundant.

Regards
Oliver


--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V3 net-next 1/3] ARM: net: add support for BPF_ANC | SKF_AD_PKTTYPE in ARM JIT.

2015-07-27 Thread Nicolas Schichan
Signed-off-by: Nicolas Schichan nschic...@freebox.fr
---
 arch/arm/net/bpf_jit_32.c | 11 +++
 1 file changed, 11 insertions(+)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index c011e22..6ff248c 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -895,6 +895,17 @@ b_epilogue:
OP_IMM3(ARM_AND, r_A, r_A, 0x1, ctx);
}
break;
+   case BPF_ANC | SKF_AD_PKTTYPE:
+   ctx-seen |= SEEN_SKB;
+   BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
+ __pkt_type_offset[0]) != 1);
+   off = PKT_TYPE_OFFSET();
+   emit(ARM_LDRB_I(r_A, r_skb, off), ctx);
+   emit(ARM_AND_I(r_A, r_A, PKT_TYPE_MAX), ctx);
+#ifdef __BIG_ENDIAN_BITFIELD
+   emit(ARM_LSR_I(r_A, r_A, 5), ctx);
+#endif
+   break;
case BPF_ANC | SKF_AD_QUEUE:
ctx-seen |= SEEN_SKB;
BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff,
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH V3 net-next 2/3] ARM: net: add support for BPF_ANC | SKF_AD_PAY_OFFSET in ARM JIT.

2015-07-27 Thread Nicolas Schichan
Signed-off-by: Nicolas Schichan nschic...@freebox.fr
---
 arch/arm/net/bpf_jit_32.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c
index 6ff248c..3c73caf 100644
--- a/arch/arm/net/bpf_jit_32.c
+++ b/arch/arm/net/bpf_jit_32.c
@@ -915,6 +915,14 @@ b_epilogue:
off = offsetof(struct sk_buff, queue_mapping);
emit(ARM_LDRH_I(r_A, r_skb, off), ctx);
break;
+   case BPF_ANC | SKF_AD_PAY_OFFSET:
+   ctx-seen |= SEEN_SKB | SEEN_CALL;
+
+   emit(ARM_MOV_R(ARM_R0, r_skb), ctx);
+   emit_mov_i(ARM_R3, (unsigned int)skb_get_poff, ctx);
+   emit_blx_r(ARM_R3, ctx);
+   emit(ARM_MOV_R(r_A, ARM_R0), ctx);
+   break;
case BPF_LDX | BPF_W | BPF_ABS:
/*
 * load a 32bit word from struct seccomp_data.
-- 
1.9.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Several races in usbnet module (kernel 4.1.x)

2015-07-27 Thread Eugene Shatokhin

27.07.2015 15:29, Oliver Neukum пишет:

On Fri, 2015-07-24 at 20:38 +0300, Eugene Shatokhin wrote:

21.07.2015 15:04, Oliver Neukum пишет:



your analysis is correct and it looks like in addition to your proposed
fix locking needs to be simplified and a common lock to be taken.
Suggestions?


Just an idea, I haven't tested it.

How about moving the operations with dev-done under list-lock in
defer_bh, while keeping dev-done.lock too and changing


Why keep dev-done.lock?
Does it make sense at all?


I think it does.

Both skb_queue_tail(dev->done, skb) called from rx_process() and 
skb_dequeue (dev->done) called from usbnet_bh() take dev->done.lock 
internally. So, to synchronize accesses to dev->done, one needs that 
lock in defer_bh() too.





usbnet_terminate_urbs() as described below?

Like this:
@@ -428,12 +428,12 @@ static enum skb_state defer_bh(struct usbnet *dev,
struct sk_buff *skb,
old_state = entry-state;
entry-state = state;
__skb_unlink(skb, list);
-   spin_unlock(list-lock);
spin_lock(dev-done.lock);
__skb_queue_tail(dev-done, skb);
if (dev-done.qlen == 1)
tasklet_schedule(dev-bh);
-   spin_unlock_irqrestore(dev-done.lock, flags);
+   spin_unlock(dev-done.lock);
+   spin_unlock_irqrestore(list-lock, flags);
return old_state;
   }
---

usbnet_terminate_urbs() can then be changed as follows:

@@ -749,6 +749,20 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs);


/*-*/

+static void wait_skb_queue_empty(struct sk_buff_head *q)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(q-lock, flags);
+   while (!skb_queue_empty(q)) {
+   spin_unlock_irqrestore(q-lock, flags);
+   schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
+   set_current_state(TASK_UNINTERRUPTIBLE);


I suppose you want to invert those lines


Do you mean
+set_current_state(TASK_UNINTERRUPTIBLE);
+schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS));
?




+   spin_lock_irqsave(q-lock, flags);
+   }
+   spin_unlock_irqrestore(q-lock, flags);
+}
+


Your changes make sense, but it looks to me as if a lock would
become totally redundant.



Regards,

Eugene

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v2] ravb: minimize TX data copying

2015-07-27 Thread Sergei Shtylyov

On 7/27/2015 11:47 AM, David Laight wrote:


Renesas Ethernet AVB controller requires that all data are aligned on 4-byte
boundary.  While it's  easily achievable for  the RX  data with  the help of
skb_reserve() (we even align on 128-byte boundary as recommended by the manual),
we  can't  do the same with the TX data, and it always comes  unaligned from
the networking core. Originally we solved it an easy way, copying all packet
to  a  preallocated  aligned buffer; however, it's enough to copy only up to
3 first bytes from each packet, doing the transfer using 2 TX descriptors
instead of just 1. Here's an implementation of the new  TX algorithm that
significantly reduces the driver's memory requirements.



...

-   buffer = PTR_ALIGN(priv-tx_buffers[q][entry], RAVB_ALIGN);
-   memcpy(buffer, skb-data, skb-len);
-   desc = priv-tx_ring[q][entry];
-   desc-ds_tagl = cpu_to_le16(skb-len);
-   dma_addr = dma_map_single(ndev-dev, buffer, skb-len, DMA_TO_DEVICE);
+   buffer = PTR_ALIGN(priv-tx_align[q], DPTR_ALIGN) +
+entry / NUM_TX_DESC * DPTR_ALIGN;



The above would be clearer if tx_align was char[DPTR_ALIGN][].


   tx_align is a pointer, not an array.


+   len = PTR_ALIGN(skb-data, DPTR_ALIGN) - skb-data;
+   memcpy(buffer, skb-data, len);



Does this imply there has been an skb_linearize() ???


   Sure, I don't support S/G (and it seems problematic given how the DMA 
descriptors are handled by the h/w).



The old version didn't really need it (it was doing a copy anyway).


   It did since it copied the whole packet.


+   dma_addr = dma_map_single(ndev-dev, buffer, len, DMA_TO_DEVICE);
if (dma_mapping_error(ndev-dev, dma_addr))
goto drop;
+
+   desc = priv-tx_ring[q][entry];
+   desc-ds_tagl = cpu_to_le16(len);
+   desc-dptr = cpu_to_le32(dma_addr);
+
+   buffer = skb-data + len;
+   len = skb-len - len;
+   dma_addr = dma_map_single(ndev-dev, buffer, len, DMA_TO_DEVICE);
+   if (dma_mapping_error(ndev-dev, dma_addr))
+   goto unmap;
+
+   desc++;
+   desc-ds_tagl = cpu_to_le16(len);



What happens if a fragment is less than DPTR_ALIGN bytes ???


   It's always the case. If you mean a packet shorter than DPTR_ALIGN, it can 
happen due to call to skb_put_padto(skb, ETH_ZLEN).



Actually it looks like you are relying on having a linear skb.


   Yes, and I was relying on it even before this patch.


David


WBR, Sergei

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next PATCH 2/2] drivers: net: cpsw: add separate napi for tx packet handling for performance improvment

2015-07-27 Thread Mugunthan V N
Instead of processing tx events in the ISR itself, move the tx
event processing to a separate napi; this improves tx performance by
180 Mbps with omap2plus_defconfig. Also clean up the rx napi by
renaming it to napi_rx to make the code easier to understand.

Signed-off-by: Mugunthan V N mugunthan...@ti.com
---
 drivers/net/ethernet/ti/cpsw.c | 61 --
 1 file changed, 47 insertions(+), 14 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index d68d759..4f98537 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -365,7 +365,8 @@ struct cpsw_priv {
spinlock_t  lock;
struct platform_device  *pdev;
struct net_device   *ndev;
-   struct napi_struct  napi;
+   struct napi_struct  napi_rx;
+   struct napi_struct  napi_tx;
struct device   *dev;
struct cpsw_platform_data   data;
struct cpsw_ss_regs __iomem *regs;
@@ -752,13 +753,22 @@ static irqreturn_t cpsw_tx_interrupt(int irq, void 
*dev_id)
struct cpsw_priv *priv = dev_id;
 
cpdma_ctlr_eoi(priv-dma, CPDMA_EOI_TX);
-   cpdma_chan_process(priv-txch, 128);
+   writel(0, priv-wr_regs-tx_en);
+
+   if (netif_running(priv-ndev)) {
+   napi_schedule(priv-napi_tx);
+   return IRQ_HANDLED;
+   }
 
priv = cpsw_get_slave_priv(priv, 1);
-   if (priv)
-   cpdma_chan_process(priv-txch, 128);
+   if (!priv)
+   return IRQ_NONE;
 
-   return IRQ_HANDLED;
+   if (netif_running(priv-ndev)) {
+   napi_schedule(priv-napi_tx);
+   return IRQ_HANDLED;
+   }
+   return IRQ_NONE;
 }
 
 static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id)
@@ -769,7 +779,7 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id)
writel(0, priv-wr_regs-rx_en);
 
if (netif_running(priv-ndev)) {
-   napi_schedule(priv-napi);
+   napi_schedule(priv-napi_rx);
return IRQ_HANDLED;
}
 
@@ -778,20 +788,37 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void 
*dev_id)
return IRQ_NONE;
 
if (netif_running(priv-ndev)) {
-   napi_schedule(priv-napi);
+   napi_schedule(priv-napi_rx);
return IRQ_HANDLED;
}
return IRQ_NONE;
 }
 
-static int cpsw_poll(struct napi_struct *napi, int budget)
+static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget)
+{
+   struct cpsw_priv*priv = napi_to_priv(napi_tx);
+   int num_tx;
+
+   num_tx = cpdma_chan_process(priv-txch, budget);
+   if (num_tx  budget) {
+   napi_complete(napi_tx);
+   writel(0xff, priv-wr_regs-tx_en);
+   }
+
+   if (num_tx)
+   cpsw_dbg(priv, intr, poll %d tx pkts\n, num_tx);
+
+   return num_tx;
+}
+
+static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget)
 {
-   struct cpsw_priv*priv = napi_to_priv(napi);
+   struct cpsw_priv*priv = napi_to_priv(napi_rx);
int num_rx;
 
num_rx = cpdma_chan_process(priv-rxch, budget);
if (num_rx  budget) {
-   napi_complete(napi);
+   napi_complete(napi_rx);
writel(0xff, priv-wr_regs-rx_en);
}
 
@@ -1297,7 +1324,8 @@ static int cpsw_ndo_open(struct net_device *ndev)
cpsw_set_coalesce(ndev, coal);
}
 
-   napi_enable(priv-napi);
+   napi_enable(priv-napi_rx);
+   napi_enable(priv-napi_tx);
cpdma_ctlr_start(priv-dma);
cpsw_intr_enable(priv);
 
@@ -1319,7 +1347,8 @@ static int cpsw_ndo_stop(struct net_device *ndev)
 
cpsw_info(priv, ifdown, shutting down cpsw device\n);
netif_stop_queue(priv-ndev);
-   napi_disable(priv-napi);
+   napi_disable(priv-napi_rx);
+   napi_disable(priv-napi_tx);
netif_carrier_off(priv-ndev);
 
if (cpsw_common_res_usage_state(priv) = 1) {
@@ -2105,7 +2134,10 @@ static int cpsw_probe_dual_emac(struct platform_device 
*pdev,
 
ndev-netdev_ops = cpsw_netdev_ops;
ndev-ethtool_ops = cpsw_ethtool_ops;
-   netif_napi_add(ndev, priv_sl2-napi, cpsw_poll, CPSW_POLL_WEIGHT);
+   netif_napi_add(ndev, priv_sl2-napi_rx, cpsw_rx_poll,
+  CPSW_POLL_WEIGHT);
+   netif_napi_add(ndev, priv_sl2-napi_tx, cpsw_tx_poll,
+  CPSW_POLL_WEIGHT);
 
/* register the network device */
SET_NETDEV_DEV(ndev, pdev-dev);
@@ -2357,7 +2389,8 @@ static int cpsw_probe(struct platform_device *pdev)
 
ndev-netdev_ops = cpsw_netdev_ops;
ndev-ethtool_ops = cpsw_ethtool_ops;
-   netif_napi_add(ndev, priv-napi, cpsw_poll, CPSW_POLL_WEIGHT);
+   netif_napi_add(ndev, priv-napi_rx, 

[net-next PATCH 1/2] drivers: net: cpsw: remove disable_irq/enable_irq as irq can be masked from cpsw itself

2015-07-27 Thread Mugunthan V N
CPSW interrupts can be disabled by masking CPSW interrupts and
clearing the interrupt by writing the appropriate EOI. So remove all
disable_irq/enable_irq calls, as discussed in [1].

[1] http://patchwork.ozlabs.org/patch/492741/

Signed-off-by: Mugunthan V N mugunthan...@ti.com
---
 drivers/net/ethernet/ti/cpsw.c | 27 ++-
 1 file changed, 2 insertions(+), 25 deletions(-)

diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
index d155bf2..d68d759 100644
--- a/drivers/net/ethernet/ti/cpsw.c
+++ b/drivers/net/ethernet/ti/cpsw.c
@@ -389,7 +389,6 @@ struct cpsw_priv {
/* snapshot of IRQ numbers */
u32 irqs_table[4];
u32 num_irqs;
-   bool irq_enabled;
struct cpts *cpts;
u32 emac_port;
 };
@@ -767,12 +766,7 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id)
struct cpsw_priv *priv = dev_id;
 
cpdma_ctlr_eoi(priv-dma, CPDMA_EOI_RX);
-
-   cpsw_intr_disable(priv);
-   if (priv-irq_enabled == true) {
-   disable_irq_nosync(priv-irqs_table[0]);
-   priv-irq_enabled = false;
-   }
+   writel(0, priv-wr_regs-rx_en);
 
if (netif_running(priv-ndev)) {
napi_schedule(priv-napi);
@@ -797,15 +791,8 @@ static int cpsw_poll(struct napi_struct *napi, int budget)
 
num_rx = cpdma_chan_process(priv-rxch, budget);
if (num_rx  budget) {
-   struct cpsw_priv *prim_cpsw;
-
napi_complete(napi);
-   cpsw_intr_enable(priv);
-   prim_cpsw = cpsw_get_slave_priv(priv, 0);
-   if (prim_cpsw-irq_enabled == false) {
-   prim_cpsw-irq_enabled = true;
-   enable_irq(priv-irqs_table[0]);
-   }
+   writel(0xff, priv-wr_regs-rx_en);
}
 
if (num_rx)
@@ -1230,7 +1217,6 @@ static void cpsw_slave_stop(struct cpsw_slave *slave, 
struct cpsw_priv *priv)
 static int cpsw_ndo_open(struct net_device *ndev)
 {
struct cpsw_priv *priv = netdev_priv(ndev);
-   struct cpsw_priv *prim_cpsw;
int i, ret;
u32 reg;
 
@@ -1315,14 +1301,6 @@ static int cpsw_ndo_open(struct net_device *ndev)
cpdma_ctlr_start(priv-dma);
cpsw_intr_enable(priv);
 
-   prim_cpsw = cpsw_get_slave_priv(priv, 0);
-   if (prim_cpsw-irq_enabled == false) {
-   if ((priv == prim_cpsw) || !netif_running(prim_cpsw-ndev)) {
-   prim_cpsw-irq_enabled = true;
-   enable_irq(prim_cpsw-irqs_table[0]);
-   }
-   }
-
if (priv-data.dual_emac)
priv-slaves[priv-emac_port].open_stat = true;
return 0;
@@ -2169,7 +2147,6 @@ static int cpsw_probe(struct platform_device *pdev)
priv-msg_enable = netif_msg_init(debug_level, CPSW_DEBUG);
priv-rx_packet_max = max(rx_packet_max, 128);
priv-cpts = devm_kzalloc(pdev-dev, sizeof(struct cpts), GFP_KERNEL);
-   priv-irq_enabled = true;
if (!priv-cpts) {
dev_err(pdev-dev, error allocating cpts\n);
ret = -ENOMEM;
-- 
2.5.0.rc3.2.g6f9504c

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next PATCH 0/2] CPSW interrupt handling cleanup and performance improvement

2015-07-27 Thread Mugunthan V N
This patch series removes the disable_irq()/enable_irq() calls on the
interrupt controller and adds a separate napi for tx event handling,
which improves the performance by 180Mbps on dra7-evm:

[  5] local 192.168.10.116 port 5001 connected with 192.168.10.125 port 44174
[  5]  0.0-60.0 sec  1.46 GBytes   209 Mbits/sec
[  4] local 192.168.10.116 port 5001 connected with 192.168.10.125 port 33954
[  4]  0.0-60.0 sec  2.72 GBytes   390 Mbits/sec

Mugunthan V N (2):
  drivers: net: cpsw: remove  disable_irq/enable_irq as irq can be
masked from cpsw itself
  drivers: net: cpsw: add separate napi for tx packet handling for
performance improvment

 drivers/net/ethernet/ti/cpsw.c | 88 +++---
 1 file changed, 49 insertions(+), 39 deletions(-)

-- 
2.5.0.rc3.2.g6f9504c

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 4/4] net/mlx4_en: Add support for hardware accelerated 802.1ad vlan

2015-07-27 Thread Amir Vadai
From: Hadar Hen Zion had...@mellanox.com

To enable device support for accelerated 802.1ad vlan, the port
capability "packet has vlan enable" (phv_en) should be set.
Firmware won't work properly if phv_en is not set.

The user can enable phv_en port capability with the new ethtool
private flag phv-bit. The phv-bit private flag default value is OFF,
users who are interested in 802.1ad hardware acceleration should turn ON
the phv-bit private flag:
$ ethtool --set-priv-flags eth1 phv-bit on

Once the private flag is set, the device is ready for 802.1ad vlan
acceleration.

The user should also change the interface device features and turn on
tx-vlan-stag-hw-insert which is off by default:
$ ethtool -K eth1  tx-vlan-stag-hw-insert on

phv-bit private flag setting is available only for Physical
Functions(PF), the Virtual Function (VF) will be able to use the feature
by setting tx-vlan-stag-hw-insert ethtool device feature only if the
feature was enabled by the Hypervisor.

Signed-off-by: Hadar Hen Zion had...@mellanox.com
Signed-off-by: Amir Vadai am...@mellanox.com
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 16 +
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  | 46 +
 drivers/net/ethernet/mellanox/mlx4/en_rx.c  | 16 -
 drivers/net/ethernet/mellanox/mlx4/en_tx.c  | 13 ---
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h|  1 +
 include/linux/mlx4/cq.h |  1 +
 include/linux/mlx4/qp.h |  1 +
 7 files changed, 89 insertions(+), 5 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 70f6553..f79d812 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -102,6 +102,7 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct 
ethtool_drvinfo *drvinfo)
 
 static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = {
blueflame,
+   phv-bit
 };
 
 static const char main_strings[][ETH_GSTRING_LEN] = {
@@ -1797,9 +1798,13 @@ static int mlx4_en_get_ts_info(struct net_device *dev,
 static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags)
 {
struct mlx4_en_priv *priv = netdev_priv(dev);
+   struct mlx4_en_dev *mdev = priv-mdev;
bool bf_enabled_new = !!(flags  MLX4_EN_PRIV_FLAGS_BLUEFLAME);
bool bf_enabled_old = !!(priv-pflags  MLX4_EN_PRIV_FLAGS_BLUEFLAME);
+   bool phv_enabled_new = !!(flags  MLX4_EN_PRIV_FLAGS_PHV);
+   bool phv_enabled_old = !!(priv-pflags  MLX4_EN_PRIV_FLAGS_PHV);
int i;
+   int ret = 0;
 
if (bf_enabled_new != bf_enabled_old) {
if (bf_enabled_new) {
@@ -1825,6 +1830,17 @@ static int mlx4_en_set_priv_flags(struct net_device 
*dev, u32 flags)
bf_enabled_new ?  Enabled : Disabled);
}
 
+   if (phv_enabled_new != phv_enabled_old) {
+   ret = set_phv_bit(mdev-dev, priv-port, (int)phv_enabled_new);
+   if (ret)
+   return ret;
+   else if (phv_enabled_new)
+   priv-pflags |= MLX4_EN_PRIV_FLAGS_PHV;
+   else
+   priv-pflags = ~MLX4_EN_PRIV_FLAGS_PHV;
+   en_info(priv, PHV bit %s\n,
+   phv_enabled_new ?  Enabled : Disabled);
+   }
return 0;
 }
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c 
b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
index e0de2fd..4726122 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c
@@ -2184,6 +2184,25 @@ static int mlx4_en_ioctl(struct net_device *dev, struct 
ifreq *ifr, int cmd)
}
 }
 
+static netdev_features_t mlx4_en_fix_features(struct net_device *netdev,
+ netdev_features_t features)
+{
+   struct mlx4_en_priv *en_priv = netdev_priv(netdev);
+   struct mlx4_en_dev *mdev = en_priv-mdev;
+
+   /* Since there is no support for separate RX C-TAG/S-TAG vlan accel
+* enable/disable make sure S-TAG flag is always in same state as
+* C-TAG.
+*/
+   if (features  NETIF_F_HW_VLAN_CTAG_RX 
+   !(mdev-dev-caps.flags2  MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN))
+   features |= NETIF_F_HW_VLAN_STAG_RX;
+   else
+   features = ~NETIF_F_HW_VLAN_STAG_RX;
+
+   return features;
+}
+
 static int mlx4_en_set_features(struct net_device *netdev,
netdev_features_t features)
 {
@@ -2218,6 +2237,10 @@ static int mlx4_en_set_features(struct net_device 
*netdev,
en_info(priv, Turn %s TX vlan strip offload\n,
(features  NETIF_F_HW_VLAN_CTAG_TX) ? ON : OFF);
 
+   if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_STAG_TX))
+   en_info(priv, Turn %s TX S-VLAN strip offload\n,
+   

[PATCH net-next 2/4] net/mlx4_en: Prepare ethtool private flags to support more flags

2015-07-27 Thread Amir Vadai
From: Hadar Hen Zion had...@mellanox.com

Currently we support only one ethtool private flag. Prepare
mlx4_en_set_priv_flags function to support more than one private flag.
Will be used in the next patch to support hardware accelerated 802.1ad
vlan.


Signed-off-by: Hadar Hen Zion had...@mellanox.com
Signed-off-by: Amir Vadai am...@mellanox.com
---
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 35 -
 1 file changed, 17 insertions(+), 18 deletions(-)

diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c 
b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
index 99ba1c5..70f6553 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c
@@ -1801,30 +1801,29 @@ static int mlx4_en_set_priv_flags(struct net_device 
*dev, u32 flags)
bool bf_enabled_old = !!(priv-pflags  MLX4_EN_PRIV_FLAGS_BLUEFLAME);
int i;
 
-   if (bf_enabled_new == bf_enabled_old)
-   return 0; /* Nothing to do */
+   if (bf_enabled_new != bf_enabled_old) {
+   if (bf_enabled_new) {
+   bool bf_supported = true;
 
-   if (bf_enabled_new) {
-   bool bf_supported = true;
+   for (i = 0; i  priv-tx_ring_num; i++)
+   bf_supported = priv-tx_ring[i]-bf_alloced;
 
-   for (i = 0; i  priv-tx_ring_num; i++)
-   bf_supported = priv-tx_ring[i]-bf_alloced;
+   if (!bf_supported) {
+   en_err(priv, BlueFlame is not supported\n);
+   return -EINVAL;
+   }
 
-   if (!bf_supported) {
-   en_err(priv, BlueFlame is not supported\n);
-   return -EINVAL;
+   priv-pflags |= MLX4_EN_PRIV_FLAGS_BLUEFLAME;
+   } else {
+   priv-pflags = ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
}
 
-   priv-pflags |= MLX4_EN_PRIV_FLAGS_BLUEFLAME;
-   } else {
-   priv-pflags = ~MLX4_EN_PRIV_FLAGS_BLUEFLAME;
-   }
-
-   for (i = 0; i  priv-tx_ring_num; i++)
-   priv-tx_ring[i]-bf_enabled = bf_enabled_new;
+   for (i = 0; i  priv-tx_ring_num; i++)
+   priv-tx_ring[i]-bf_enabled = bf_enabled_new;
 
-   en_info(priv, BlueFlame %s\n,
-   bf_enabled_new ?  Enabled : Disabled);
+   en_info(priv, BlueFlame %s\n,
+   bf_enabled_new ?  Enabled : Disabled);
+   }
 
return 0;
 }
-- 
2.4.3.413.ga5fe668

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 0/4] net/mlx4_en: Hardware accelerated 802.1ad

2015-07-27 Thread Amir Vadai
Hi,

This patchset by Hadar introduces support for hardware accelerated 802.1ad on
ConnectX-3pro NICs.  In order to support existing deployments, and due to some
hardware limitations, the feature is disabled by default and needs to be
enabled using a private flag in ethtool. Of course, the user can enable the
private flag only if the hardware supports it.
After being enabled, the standard ethtool -k/-K can be used.

Patchset was applied and tested over commit 71790a2 (hv_netvsc: Add structs
and handlers for VF messages)

Amir

Hadar Hen Zion (4):
  net/mlx4_core: Preparations for 802.1ad VLAN support
  net/mlx4_en: Prepare ethtool private flags to support more flags
  net/mlx4: Prepare VLAN macros for 802.1ad Hardware accelerated support
  net/mlx4_en: Add support for hardware accelerated 802.1ad vlan

 drivers/infiniband/hw/mlx4/cq.c |  2 +-
 drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 51 +--
 drivers/net/ethernet/mellanox/mlx4/en_netdev.c  | 46 ++
 drivers/net/ethernet/mellanox/mlx4/en_rx.c  | 22 +--
 drivers/net/ethernet/mellanox/mlx4/en_tx.c  | 13 ++--
 drivers/net/ethernet/mellanox/mlx4/fw.c | 82 +
 drivers/net/ethernet/mellanox/mlx4/fw.h |  1 +
 drivers/net/ethernet/mellanox/mlx4/main.c   | 15 +
 drivers/net/ethernet/mellanox/mlx4/mlx4.h   |  3 +
 drivers/net/ethernet/mellanox/mlx4/mlx4_en.h|  1 +
 include/linux/mlx4/cq.h |  3 +-
 include/linux/mlx4/device.h |  5 ++
 include/linux/mlx4/qp.h |  3 +-
 13 files changed, 218 insertions(+), 29 deletions(-)

-- 
2.4.3.413.ga5fe668

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 3/4] net/mlx4: Prepare VLAN macros for 802.1ad Hardware accelerated support

2015-07-27 Thread Amir Vadai
From: Hadar Hen Zion had...@mellanox.com

To add hardware accelerated support for 802.1ad vlan, rename the
current VLAN macros to CVLAN.
Replace:
MLX4_WQE_CTRL_INS_VLAN
MLX4_CQE_VLAN_PRESENT_MASK
With:
MLX4_WQE_CTRL_INS_CVLAN
MLX4_CQE_CVLAN_PRESENT_MASK

Signed-off-by: Hadar Hen Zion had...@mellanox.com
Signed-off-by: Amir Vadai am...@mellanox.com
---
 drivers/infiniband/hw/mlx4/cq.c| 2 +-
 drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +++---
 drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +-
 include/linux/mlx4/cq.h| 2 +-
 include/linux/mlx4/qp.h| 2 +-
 5 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c
index 36eb3d0..180a8f7 100644
--- a/drivers/infiniband/hw/mlx4/cq.c
+++ b/drivers/infiniband/hw/mlx4/cq.c
@@ -871,7 +871,7 @@ repoll:
if (is_eth) {
wc-sl  = be16_to_cpu(cqe-sl_vid)  13;
if (be32_to_cpu(cqe-vlan_my_qpn) 
-   MLX4_CQE_VLAN_PRESENT_MASK) {
+   MLX4_CQE_CVLAN_PRESENT_MASK) {
wc-vlan_id = be16_to_cpu(cqe-sl_vid) 
MLX4_CQE_VID_MASK;
} else {
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
index 12c65e1..10f6c2f 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c
@@ -726,7 +726,7 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff 
*skb, void *va,
 
hw_checksum = csum_unfold((__force __sum16)cqe-checksum);
 
-   if (cqe-vlan_my_qpn  cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK) 
+   if (cqe-vlan_my_qpn  cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK) 
!(dev_features  NETIF_F_HW_VLAN_CTAG_RX)) {
hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr);
hdr += sizeof(struct vlan_hdr);
@@ -907,7 +907,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
gro_skb-csum_level = 1;
 
if ((cqe-vlan_my_qpn 
-   cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) 
+   cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) 
(dev-features  NETIF_F_HW_VLAN_CTAG_RX)) {
u16 vid = be16_to_cpu(cqe-sl_vid);
 
@@ -970,7 +970,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct 
mlx4_en_cq *cq, int bud
PKT_HASH_TYPE_L3);
 
if ((be32_to_cpu(cqe-vlan_my_qpn) 
-   MLX4_CQE_VLAN_PRESENT_MASK) 
+   MLX4_CQE_CVLAN_PRESENT_MASK) 
(dev-features  NETIF_F_HW_VLAN_CTAG_RX))
__vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), 
be16_to_cpu(cqe-sl_vid));
 
diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c 
b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
index c10d98f..7c858f6 100644
--- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c
+++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c
@@ -958,7 +958,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct 
net_device *dev)
ring-bf.offset ^= ring-bf.buf_size;
} else {
tx_desc-ctrl.vlan_tag = cpu_to_be16(vlan_tag);
-   tx_desc-ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN *
+   tx_desc-ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN *
!!skb_vlan_tag_present(skb);
tx_desc-ctrl.fence_size = real_size;
 
diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h
index e7ecc12..899a97b 100644
--- a/include/linux/mlx4/cq.h
+++ b/include/linux/mlx4/cq.h
@@ -88,7 +88,7 @@ struct mlx4_ts_cqe {
 
 enum {
MLX4_CQE_L2_TUNNEL_IPOK = 1  31,
-   MLX4_CQE_VLAN_PRESENT_MASK  = 1  29,
+   MLX4_CQE_CVLAN_PRESENT_MASK = 1  29,
MLX4_CQE_L2_TUNNEL  = 1  27,
MLX4_CQE_L2_TUNNEL_CSUM = 1  26,
MLX4_CQE_L2_TUNNEL_IPV4 = 1  25,
diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h
index 6fed539..6c61900 100644
--- a/include/linux/mlx4/qp.h
+++ b/include/linux/mlx4/qp.h
@@ -272,7 +272,7 @@ enum {
MLX4_WQE_CTRL_SOLICITED = 1  1,
MLX4_WQE_CTRL_IP_CSUM   = 1  4,
MLX4_WQE_CTRL_TCP_UDP_CSUM  = 1  5,
-   MLX4_WQE_CTRL_INS_VLAN  = 1  6,
+   MLX4_WQE_CTRL_INS_CVLAN = 1  6,
MLX4_WQE_CTRL_STRONG_ORDER  = 1  7,
MLX4_WQE_CTRL_FORCE_LOOPBACK= 1  0,
 };
-- 
2.4.3.413.ga5fe668

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 1/4] net/mlx4_core: Preparations for 802.1ad VLAN support

2015-07-27 Thread Amir Vadai
From: Hadar Hen Zion had...@mellanox.com

mlx4_core preparation to support hardware accelerated 802.1ad VLAN
device.

To allow an 802.1ad accelerated device, the "packet has vlan" (phv)
firmware capability should be available. Firmware without the
phv capability won't behave properly and can't support 802.1ad device
acceleration.

The driver checks the Firmware capability and sets the phv bit
accordingly in SET_PORT command.

Signed-off-by: Hadar Hen Zion had...@mellanox.com
Signed-off-by: Amir Vadai am...@mellanox.com
---
 drivers/net/ethernet/mellanox/mlx4/fw.c   | 82 +++
 drivers/net/ethernet/mellanox/mlx4/fw.h   |  1 +
 drivers/net/ethernet/mellanox/mlx4/main.c | 15 ++
 drivers/net/ethernet/mellanox/mlx4/mlx4.h |  3 ++
 include/linux/mlx4/device.h   |  5 ++
 5 files changed, 106 insertions(+)

diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c 
b/drivers/net/ethernet/mellanox/mlx4/fw.c
index e30bf57..5a1c3d2 100644
--- a/drivers/net/ethernet/mellanox/mlx4/fw.c
+++ b/drivers/net/ethernet/mellanox/mlx4/fw.c
@@ -154,6 +154,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 
flags)
[26] = Port ETS Scheduler support,
[27] = Port beacon support,
[28] = RX-ALL support,
+   [29] = 802.1ad offload support,
};
int i;
 
@@ -307,6 +308,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int 
slave,
 
 #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80
 #define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1  31)
+#define QUERY_FUNC_CAP_PHV_BIT 0x40
 
if (vhcr-op_modifier == 1) {
struct mlx4_active_ports actv_ports =
@@ -351,6 +353,12 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int 
slave,
MLX4_PUT(outbox-buf, dev-caps.phys_port_id[vhcr-in_modifier],
 QUERY_FUNC_CAP_PHYS_PORT_ID);
 
+   if (dev-caps.phv_bit[port]) {
+   field = QUERY_FUNC_CAP_PHV_BIT;
+   MLX4_PUT(outbox-buf, field,
+QUERY_FUNC_CAP_FLAGS0_OFFSET);
+   }
+
} else if (vhcr-op_modifier == 0) {
struct mlx4_active_ports actv_ports =
mlx4_get_active_ports(dev, slave);
@@ -600,6 +608,9 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev *dev, u8 
gen_or_port,
MLX4_GET(func_cap-phys_port_id, outbox,
 QUERY_FUNC_CAP_PHYS_PORT_ID);
 
+   MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET);
+   func_cap-flags |= (field  QUERY_FUNC_CAP_PHV_BIT);
+
/* All other resources are allocated by the master, but we still report
 * 'num' and 'reserved' capabilities as follows:
 * - num remains the maximum resource index
@@ -700,6 +711,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct 
mlx4_dev_cap *dev_cap)
 #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET0x92
 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET0x94
 #define QUERY_DEV_CAP_CONFIG_DEV_OFFSET0x94
+#define QUERY_DEV_CAP_PHV_EN_OFFSET0x96
 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98
 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET0xa0
 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET0x9c
@@ -898,6 +910,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct 
mlx4_dev_cap *dev_cap)
dev_cap-flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV;
if (field  (1  2))
dev_cap-flags2 |= MLX4_DEV_CAP_FLAG2_IGNORE_FCS;
+   MLX4_GET(field, outbox, QUERY_DEV_CAP_PHV_EN_OFFSET);
+   if (field  0x80)
+   dev_cap-flags2 |= MLX4_DEV_CAP_FLAG2_PHV_EN;
+   if (field  0x40)
+   dev_cap-flags2 |= MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN;
+
MLX4_GET(dev_cap-reserved_lkey, outbox,
 QUERY_DEV_CAP_RSVD_LKEY_OFFSET);
MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET);
@@ -1992,6 +2010,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev,
MLX4_GET(param-uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET);
MLX4_GET(param-log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET);
 
+   /* phv_check enable */
+   MLX4_GET(byte_field, outbox, INIT_HCA_CACHELINE_SZ_OFFSET);
+   if (byte_field  0x2)
+   param-phv_check_en = 1;
 out:
mlx4_free_cmd_mailbox(dev, mailbox);
 
@@ -2758,3 +2780,63 @@ int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int 
slave,
0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C,
MLX4_CMD_NATIVE);
 }
+
+static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit)
+{
+#define SET_PORT_GEN_PHV_VALID 0x10
+#define SET_PORT_GEN_PHV_EN0x80
+
+   struct mlx4_cmd_mailbox *mailbox;
+   struct mlx4_set_port_general_context *context;
+   u32 in_mod;
+   int err;
+
+   mailbox = mlx4_alloc_cmd_mailbox(dev);
+   if 

[PATCH iproute2 net-next] bridge: mdb: add support for router add/del notifications monitoring

2015-07-27 Thread Nikolay Aleksandrov
From: Nikolay Aleksandrov niko...@cumulusnetworks.com

This patch adds support for ADDMDB/DELMDB notifications about router ports
which have been added or deleted/expired respectively.

Example output:
$ bridge -s monitor mdb
Deleted router port dev eth3 master br0
router port dev eth3 master br0

Signed-off-by: Nikolay Aleksandrov niko...@cumulusnetworks.com
---
 bridge/mdb.c | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/bridge/mdb.c b/bridge/mdb.c
index ea169b9c2e4d..dd1f942af53c 100644
--- a/bridge/mdb.c
+++ b/bridge/mdb.c
@@ -84,7 +84,7 @@ int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr 
*n, void *arg)
FILE *fp = arg;
struct br_port_msg *r = NLMSG_DATA(n);
int len = n-nlmsg_len;
-   struct rtattr * tb[MDBA_MAX+1];
+   struct rtattr *tb[MDBA_MAX+1], *i;
 
if (n-nlmsg_type != RTM_GETMDB  n-nlmsg_type != RTM_NEWMDB  
n-nlmsg_type != RTM_DELMDB) {
fprintf(stderr, Not RTM_GETMDB, RTM_NEWMDB or RTM_DELMDB: %08x 
%08x %08x\n,
@@ -105,7 +105,6 @@ int print_mdb(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
parse_rtattr(tb, MDBA_MAX, MDBA_RTA(r), n-nlmsg_len - 
NLMSG_LENGTH(sizeof(*r)));
 
if (tb[MDBA_MDB]) {
-   struct rtattr *i;
int rem = RTA_PAYLOAD(tb[MDBA_MDB]);
 
for (i = RTA_DATA(tb[MDBA_MDB]); RTA_OK(i, rem); i = 
RTA_NEXT(i, rem))
@@ -113,9 +112,22 @@ int print_mdb(const struct sockaddr_nl *who, struct 
nlmsghdr *n, void *arg)
}
 
if (tb[MDBA_ROUTER]) {
-   if (show_details) {
-   fprintf(fp, router ports on %s: , 
ll_index_to_name(r-ifindex));
-   br_print_router_ports(fp, tb[MDBA_ROUTER]);
+   if (n-nlmsg_type == RTM_GETMDB) {
+   if (show_details) {
+   fprintf(fp, router ports on %s: ,
+   ll_index_to_name(r-ifindex));
+   br_print_router_ports(fp, tb[MDBA_ROUTER]);
+   }
+   } else {
+   uint32_t *port_ifindex;
+
+   i = RTA_DATA(tb[MDBA_ROUTER]);
+   port_ifindex = RTA_DATA(i);
+   if (n-nlmsg_type == RTM_DELMDB)
+   fprintf(fp, Deleted );
+   fprintf(fp, router port dev %s master %s\n,
+   ll_index_to_name(*port_ifindex),
+   ll_index_to_name(r-ifindex));
}
}
 
-- 
2.4.3

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Fwd: Need help about AR8327 Ethernet Driver

2015-07-27 Thread s prasad
Hi  All,

I am trying to sniff Ethernet data using AR8327 chip-set from other
Ethernet device KSZ8895, which gives multiplexed TX and RX data of
monitoring data of another device.

If I use devices like the DUB-E100 (D-Link), I am able to capture data from
the KSZ8895 (multiplexed TX and RX Ethernet data).

But if I try to sniff Ethernet data using the AR8327 chipset (router
TP-Link AC 1750), then I am not able to capture data, even though the
(switch/port) port registers show correct values.

I am using tcpdump to check data on a specific interface. When I use the
DUB-E100 (D-Link) or a passive Ethernet sniffer (either RX or TX packets),
I am able to see the whole data, but not with the AR8327 chipset.

My environment is given below:
Router: TPLink AC 1750
Ethernet data coming from KSZ8895 (Multiplexed Tx and RX of another device)
OpenWRT Environment

Can somebody point out where I should look to solve my issue?

Thanks and Regards,
S Prasad


-- 
S Prasad Kandregula
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Several races in usbnet module (kernel 4.1.x)

2015-07-27 Thread Eugene Shatokhin

27.07.2015 13:00, Oliver Neukum пишет:

On Fri, 2015-07-24 at 17:41 +0300, Eugene Shatokhin wrote:

23.07.2015 12:15, Oliver Neukum пишет:



  From what I see now in Documentation/atomic_ops.txt, stores to the
properly aligned memory locations are in fact atomic.


They are, but again only with respect to each other.


You are right. Architectures like sparc, and maybe others, do indeed
use spinlocks to implement atomic operations, including bit manipulation.


Well then, I can only think about clearing each flag individually (with 
clear_bit()) instead of using dev-flags = 0.


Something like this:

-
diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c
index 3c86b10..826eefe 100644
--- a/drivers/net/usb/usbnet.c
+++ b/drivers/net/usb/usbnet.c
@@ -779,6 +790,7 @@ int usbnet_stop (struct net_device *net)
struct usbnet   *dev = netdev_priv(net);
struct driver_info  *info = dev-driver_info;
int retval, pm;
+   int e;

clear_bit(EVENT_DEV_OPEN, dev-flags);
netif_stop_queue (net);
@@ -813,7 +825,8 @@ int usbnet_stop (struct net_device *net)
 * can't flush_scheduled_work() until we drop rtnl (later),
 * else workers could deadlock; so make workers a NOP.
 */
-   dev-flags = 0;
+   for (e = 0; e  EVENT_NUM_EVENTS; ++e)
+   clear_bit(e, dev-flags)
del_timer_sync (dev-delay);
tasklet_kill (dev-bh);
if (!pm)

diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h
index 6e0ce8c..7ad62da 100644
--- a/include/linux/usb/usbnet.h
+++ b/include/linux/usb/usbnet.h
@@ -79,6 +79,7 @@ struct usbnet {
# define EVENT_RX_KILL 10
# define EVENT_LINK_CHANGE 11
# define EVENT_SET_RX_MODE 12
+# define EVENT_NUM_EVENTS 13 /* Or may be keep all these in an enum? */
};

static inline struct usb_driver *driver_of(struct usb_interface *intf)
---

clear_bit() is atomic w.r.t. itself and other bit ops.



So, I think, the situation you described above cannot happen for
dev-flags, which is good. No need to address that in the patch. The
race might be harmless after all.

If I understand the code correctly now, dev-flags is set to 0 in
usbnet_stop() so that the worker function (usbnet_deferred_kevent) would


Yes, particularly not reschedule itself.


do nothing, should it start later. If so, how about adding memory
barriers for all CPUs to see dev-flags is 0 before other things?


Taking a lock, as del_timer_sync() does, implies a memory barrier,
as does a work.


If so, then, yes, additional barriers are not needed.

Regards,
Eugene


--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Need help about AR8327 Ethernet Driver

2015-07-27 Thread s prasad
Hi  All,

I am trying to sniff Ethernet data using AR8327 chip-set from other
Ethernet device KSZ8895, which gives multiplexed TX and RX data of
monitoring data of another device.

If I use devices like the DUB-E100 (D-Link), I am able to capture data from
the KSZ8895 (multiplexed TX and RX Ethernet data).

But if I try to sniff Ethernet data using the AR8327 chipset (router
TP-Link AC 1750), then I am not able to capture data, even though the
(switch/port) port registers show correct values.

I am using tcpdump to check data on a specific interface. When I use the
DUB-E100 (D-Link) or a passive Ethernet sniffer (either RX or TX packets),
I am able to see the whole data, but not with the AR8327 chipset.

My environment is given below:
Router: TPLink AC 1750
Ethernet data coming from KSZ8895 (Multiplexed Tx and RX of another device)
OpenWRT Environment

Can somebody point out where I should look to solve my issue?

Thanks and Regards,
S Prasad
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Buggy cable detection on i.MX51, fec driver and LAN8700 PHY

2015-07-27 Thread Igor Plyatov

Dear all,

very often we observe issue with Ethernet cable detection during cable 
unplugging and plugging.


We use Voipac i.MX51 SOMs (System On Modules). They are based on 
Freescale i.MX51 CPU with LAN8700 PHY in MII mode. The schematic of PHY 
connection is very similar to the Freescale i.MX51 Babbage board.


The Ethernet interface eth0 is configured statically for simplicity, but 
same issue exists with DHCP configuration.


I did a lot of tests to determine stability of Ethernet cable detection 
by the fec Ethernet driver.


In normal operation, if I unplug the Ethernet cable, then the fec driver
prints "fec 83fec000.ethernet eth0: Link is Down" and the green LED
(Ethernet medium detected) is OFF.
If I plug the cable back in, then the fec driver prints "fec 83fec000.ethernet
eth0: Link is Up - 100Mbps/Full - flow control off" and the green LED is ON.


But sometimes, after cable plugging, fec driver does not print 
anything on the console and green LED does not show detection of 
Ethernet cable. Frequency of issue appearing is a random value. 
Sometimes issue appears after second cable unplugging/plugging, but 
sometimes - after 10-20 unplugging/plugging.


The issue was tested and exists on kernels from linux-3.8.5 till current 
linux-4.2-rc4-cbfe8fa6cd672011c755c3cd85c9ffd4e2d10a6f.


The same tests were made with different versions of the Barebox bootloader,
and there cable detection works flawlessly.


Please, help to resolve issue with Linux drivers.

Best wishes.
--
Igor Plyatov
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


netns refcnt leak for kernel accept sock

2015-07-27 Thread Sowmini Varadhan

I'm running into a netns refcnt issue, and I suspect that 
eeb1bd5c has something to do with it (perhaps we need an 
additional change in sk_clone_lock() after eeb1bd5c). 
Here's the problem:

When we create a syn_recv sock based on a kernel listen sock, we
take a get_net() ref  with a stack similar to the one shown below.
Note that the parent (kernel, listen) sock itself has not taken
a get_net() ref, because it explicitly calls sock_create_kern().

  get_net /* for the newsk */
  sk_clone_lock
  inet_csk_clone_lock
  tcp_create_openreq_child
  tcp_v4_syn_recv_sock
  tcp_check_req
  tcp_v4_do_rcv
  tcp_v4_rcv 
   :

But it's not clear to me where this refcnt will be released: 
in my case, I expect to create/cleanup kernel sockets as part 
of -init/-exit for my module, but because the accept socket 
has a netns refcnt, it blocks cleanup_net(), thus my -exit 
pernet_subsys op cannot run and clean this up, and we have a leak.

I think that sk_clone_lock() should only do a get_net() if the parent
is not a kernel socket (making this similar to sk_alloc()), i.e.,

diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db..371d1b7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gf
sock_copy(newsk, sk);

/* SANITY */
-   get_net(sock_net(newsk));
+   if (likely(newsk-sk_net_refcnt))
+   get_net(sock_net(newsk));
sk_node_init(newsk-sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);

Does this sound right?

--Sowmini

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: linux-next: Tree for Jul 27 (net/mpls/af_mpls.c)

2015-07-27 Thread Randy Dunlap
On 07/26/15 23:02, Stephen Rothwell wrote:
 Hi all,
 
 Changes since 20150724:
 

on i386 or x86_64:

when CONFIG_IPV6 is not enabled:


net/built-in.o: In function `find_outdev':
af_mpls.c:(.text+0x1e8ddd): undefined reference to `ip6_route_output'
af_mpls.c:(.text+0x1e8e90): undefined reference to `ip_route_output_flow'




-- 
~Randy
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [PATCH net-next 1/1] Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver

2015-07-27 Thread Woojung.Huh
Thanks for your review.
I'll repost updated patch again.

 -Original Message-
 From: David Miller [mailto:da...@davemloft.net]
 Sent: Sunday, July 26, 2015 7:42 PM
 To: Woojung Huh - C21699
 Cc: netdev@vger.kernel.org
 Subject: Re: [PATCH net-next 1/1] Microchip's LAN7800 family USB 2/3 to
 10/100/1000 Ethernet device driver
 
 From: woojung@microchip.com
 Date: Wed, 22 Jul 2015 19:01:44 +
 
  - remove module param which can be configurable by standard mechanism.
 
 You still left some unacceptable module parameters in here.
 
 The only one which is fine is the debug level setting, that's
 it.
 
 There is no way in I'm applying a patch that allows programming
 registers of the chip directly via module parameters, no way.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Sowmini Varadhan
On (07/27/15 12:40), ebied...@xmission.com wrote:
 sock_create_kern and friends are specialized interfaces for special
 purposes.  At a quick read through I don't think we have a single in
 tree user doing with them what you are trying to do.

That doesn't change the fact that the architecture is questionable.
and my description should make it quite clear why this is so.

 
 Without seeing code using the interfaces in the way you are trying to use
 them I do not have enough information to comment intelligently.

Ok, here you go.

I'm still testing it, but there's enough there for you to see the bug
quite clearly.

Enjoy. I think my other mail had better information to comment intelligently
but ymmv.

--Sowmini

diff --git a/net/core/sock.c b/net/core/sock.c
index 08f16db..371d1b7 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
gfp_t priority)
sock_copy(newsk, sk);
 
/* SANITY */
-   get_net(sock_net(newsk));
+   if (likely(newsk-sk_net_refcnt))
+   get_net(sock_net(newsk));
sk_node_init(newsk-sk_node);
sock_lock_init(newsk);
bh_lock_sock(newsk);
diff --git a/net/rds/bind.c b/net/rds/bind.c
index 4ebd29c..dd666fb 100644
--- a/net/rds/bind.c
+++ b/net/rds/bind.c
@@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
ret = 0;
goto out;
}
-   trans = rds_trans_get_preferred(sin-sin_addr.s_addr);
+   trans = rds_trans_get_preferred(sock_net(sock-sk),
+   sin-sin_addr.s_addr);
if (!trans) {
ret = -EADDRNOTAVAIL;
rds_remove_bound(rs);
diff --git a/net/rds/connection.c b/net/rds/connection.c
index da6da57..273fa6c 100644
--- a/net/rds/connection.c
+++ b/net/rds/connection.c
@@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn)
  * For now they are not garbage collected once they're created.  They
  * are torn down as the module is removed, if ever.
  */
-static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr,
+static struct rds_connection *__rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp,
   int is_outgoing)
 {
@@ -157,7 +158,7 @@ new_conn:
conn-c_faddr = faddr;
spin_lock_init(conn-c_lock);
conn-c_next_tx_seq = 1;
-
+   write_pnet(conn-c_net, net);
init_waitqueue_head(conn-c_waitq);
INIT_LIST_HEAD(conn-c_send_queue);
INIT_LIST_HEAD(conn-c_retrans);
@@ -174,7 +175,7 @@ new_conn:
 * can bind to the destination address then we'd rather the messages
 * flow through loopback rather than either transport.
 */
-   loop_trans = rds_trans_get_preferred(faddr);
+   loop_trans = rds_trans_get_preferred(net, faddr);
if (loop_trans) {
rds_trans_put(loop_trans);
conn-c_loopback = 1;
@@ -260,17 +261,19 @@ out:
return conn;
 }
 
-struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 0);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 0);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create);
 
-struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr,
+struct rds_connection *rds_conn_create_outgoing(struct net *net,
+  __be32 laddr, __be32 faddr,
   struct rds_transport *trans, gfp_t gfp)
 {
-   return __rds_conn_create(laddr, faddr, trans, gfp, 1);
+   return __rds_conn_create(net, laddr, faddr, trans, gfp, 1);
 }
 EXPORT_SYMBOL_GPL(rds_conn_create_outgoing);
 
diff --git a/net/rds/ib.c b/net/rds/ib.c
index ba2dffe..1381422 100644
--- a/net/rds/ib.c
+++ b/net/rds/ib.c
@@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned 
int len,
  * allowed to influence which paths have priority.  We could call userspace
  * asserting this policy routing.
  */
-static int rds_ib_laddr_check(__be32 addr)
+static int rds_ib_laddr_check(struct net *net, __be32 addr)
 {
int ret;
struct rdma_cm_id *cm_id;
diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c
index 0da2a45..c38d8a0 100644
--- a/net/rds/ib_cm.c
+++ b/net/rds/ib_cm.c
@@ -448,8 +448,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id,
 (unsigned long long)be64_to_cpu(lguid),
 (unsigned long long)be64_to_cpu(fguid));
 
-   conn = 

increase in time to delete an interface with 4.x kernels

2015-07-27 Thread David Ahern

Hi Alex:

I believe you did the recent overhaul to the fib implementation. I am 
seeing dramatically higher times to delete an interface with an ipv4 
address in 4.2-rc3. perf-top points to update_suffix:


   PerfTop:   15834 irqs/sec  kernel:97.3%  exact:  0.0% [4000Hz 
cpu-clock],  (all, 4 CPUs)

---

74.69%  [kernel]   [k] update_suffix
 2.38%  [kernel]   [k] fib_table_flush
 2.20%  [kernel]   [k] fib6_walk_continue
 2.03%  [kernel]   [k] fib6_ifdown
 1.31%  [kernel]   [k] fib6_age


I have a simple script to create and assign an ipv4 address to 10k dummy 
interfaces:


l=0
for (( j = 1; j = 40; j += 1))
do
for (( k = 1 ; k = 250  ; k += 1 ))
do
l=$((l + 1))
ip link add dev dummy${l} type dummy
ip addr add  72.$j.$k.1/24 dev dummy${l}
ifconfig dummy${l} up
done
done


and a counter script to delete them all:

k=$(ip link show | grep dummy | wc -l)
for (( j = 1; j = k; j += 1))
do
ip link del dev dummy${j}
done


Looking at v3.19:

# time ./tadd-dummy.sh

real3m8.896s
user0m7.104s
sys 0m22.020s


# time ./tdel-dummy.sh

real7m18.207s
user0m3.824s
sys 3m15.672s


And the time to delete 1 interface after all 10k have been created:
# time ip link del dev dummy

real0m0.064s
user0m0.000s
sys 0m0.020s


Contrast those times with 4.2.0-rc3+ running the exact same scripts

# time ./tadd-dummy.sh

real2m51.044s
user0m7.220s
sys 0m29.520s

#  time ip link del dev dummy

real0m0.441s
user0m0.000s
sys 0m0.416s

so here the time to delete 1 interface has gone up by more than 10x.


# time ./tdel-dummy.sh
^C

real14m10.000s
user0m0.528s
sys 13m14.728s

I killed the delete; after 14 minutes only ~2k+ interfaces had been deleted:

# ip link show | grep dummy | wc -l
7822

In 4.2.0-rc3 it seems to take about 60 seconds to delete 150 interfaces 
which is in line with the 1 interface time of 0.4 seconds.


David
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Eric W. Biederman

sock_create_kern and friends are specialized interfaces for special
purposes.  At a quick read through I don't think we have a single in
tree user doing with them what you are trying to do.

Without seeing code using the interfaces in the way you are trying to use
them I do not have enough information to comment intelligently.

Eric

Sowmini Varadhan sowmini.varad...@oracle.com writes:
 I'm running into a netns refcnt issue, and I suspect that 
 eeb1bd5c has something to do with it (perhaps we need an 
 additional change in sk_clone_lock() after eeb1bd5c). 
 Here's the problem:

 When we create an syn_recv sock based on a kernel listen sock, we
 take a get_net() ref  with a stack similar to the one shown below.
 Note that the parent (kernel, listen) sock itself has not taken
 a get_net() ref, because it explicitly calls sock_create_kern().

   get_net /* for the newsk */
   sk_clone_lock
   inet_csk_clone_lock
   tcp_create_openreq_child
   tcp_v4_syn_recv_sock
   tcp_check_req
   tcp_v4_do_rcv
   tcp_v4_rcv 
:

 But it's not clear to me where this refcnt will be released: 
 in my case, I expect to create/cleanup kernel sockets as part 
 of -init/-exit for my module, but because the accept socket 
 has a netns refcnt, it blocks cleanup_net(), thus my -exit 
 pernet_subsys op cannot run and clean this up, and we have a leak.

 I think that sk_clone_lock() should only do a get_net() if the parent
 is not a kernel socket (making this similar to sk_alloc()), i.e.,

 diff --git a/net/core/sock.c b/net/core/sock.c
 index 08f16db..371d1b7 100644
 --- a/net/core/sock.c
 +++ b/net/core/sock.c
 @@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const 
 gf
 sock_copy(newsk, sk);

 /* SANITY */
 -   get_net(sock_net(newsk));
 +   if (likely(newsk-sk_net_refcnt))
 +   get_net(sock_net(newsk));
 sk_node_init(newsk-sk_node);
 sock_lock_init(newsk);
 bh_lock_sock(newsk);

 Does this sound right?

 --Sowmini
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Sowmini Varadhan
On (07/27/15 11:13), Cong Wang wrote:
 
 That refcnt should be released in sock destructor too, when the tcp
 connection is terminated.

yes, but in my case, the listen socket is opened as part of
the -init indirection in pernet_operations (thus it is a kernel socket)
and the expectation is that this listen socket, and any accept sockets
derived from it, will be closed in -exit.

But if the accept socket is treated as a uspace socket (thus holds a get_net())
then it will block cleanup_net() and the associated -exit cleanup operations.

This is probably not a problem for other systems like vxlan/gue/geneve etc
because they all use udp sockets, thus don't have the accept equivalent.

But fundamentally, it's wrong for a kspace listen socket to result in a
uspace accept socket.

 Given the fact that sk_destruct() checks for sk_net_refcnt, your
 patch makes sense to me. But I am not sure how a TCP kernel
 socket is supposed to be used.

Thanks for the confirmation - I think RDS is a bit of a maverick here in
that it uses tcp sockets unlike vxlan etc.
For those curious about RDS-TCP, I've actually updated the documentation at
https://oss.oracle.com/projects/rds/dist/documentation/rds-3.1-spec.html
recently. I hope that helps.

--Sowmini
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: increase in time to delete an interface with 4.x kernels

2015-07-27 Thread Alexander Duyck

On 07/27/2015 09:49 AM, David Ahern wrote:

Hi Alex:

I believe you did the recent overhaul to the fib implementation. I am
seeing dramatically higher times to delete an interface with an ipv4
address in 4.2-rc3. perf-top points to update_suffix:

PerfTop:   15834 irqs/sec  kernel:97.3%  exact:  0.0% [4000Hz
cpu-clock],  (all, 4 CPUs)
---


 74.69%  [kernel]   [k] update_suffix
  2.38%  [kernel]   [k] fib_table_flush
  2.20%  [kernel]   [k] fib6_walk_continue
  2.03%  [kernel]   [k] fib6_ifdown
  1.31%  [kernel]   [k] fib6_age


I have a simple script to create and assign an ipv4 address to 10k dummy
interfaces:

l=0
for (( j = 1; j = 40; j += 1))
do
 for (( k = 1 ; k = 250  ; k += 1 ))
 do
 l=$((l + 1))
 ip link add dev dummy${l} type dummy
   ip addr add  72.$j.$k.1/24 dev dummy${l}
   ifconfig dummy${l} up
 done
done


and a counter script to delete them all:

k=$(ip link show | grep dummy | wc -l)
for (( j = 1; j = k; j += 1))
do
 ip link del dev dummy${j}
done



Okay so looking over what this script does it looks like it really 
exposes the worst case scenario for update_suffix.  You have a monstrous 
tnode that is 15 bits in size.  That is roughly 32K entries, and 
unfortunately the suffix is 8 bits long with a position of 7.


The result is that for every removal the code is scanning 16K entries in 
order to relevel things after an entry is removed.


Let me try a couple of quick things and I should have a patch for you in 
the next couple of hours.


Thanks.

- Alex
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Cong Wang
On Mon, Jul 27, 2015 at 7:21 AM, Sowmini Varadhan
sowmini.varad...@oracle.com wrote:

 I'm running into a netns refcnt issue, and I suspect that
 eeb1bd5c has something to do with it (perhaps we need an
 additional change in sk_clone_lock() after eeb1bd5c).
 Here's the problem:

 When we create an syn_recv sock based on a kernel listen sock, we
 take a get_net() ref  with a stack similar to the one shown below.
 Note that the parent (kernel, listen) sock itself has not taken
 a get_net() ref, because it explicitly calls sock_create_kern().

   get_net /* for the newsk */
   sk_clone_lock
   inet_csk_clone_lock
   tcp_create_openreq_child
   tcp_v4_syn_recv_sock
   tcp_check_req
   tcp_v4_do_rcv
   tcp_v4_rcv
:

 But it's not clear to me where this refcnt will be released:
 in my case, I expect to create/cleanup kernel sockets as part
 of -init/-exit for my module, but because the accept socket
 has a netns refcnt, it blocks cleanup_net(), thus my -exit
 pernet_subsys op cannot run and clean this up, and we have a leak.


That refcnt should be released in sock destructor too, when the tcp
connection is terminated.


 I think that sk_clone_lock() should only do a get_net() if the parent
 is not a kernel socket (making this similar to sk_alloc()), i.e.,


Given the fact that sk_destruct() checks for sk_net_refcnt, your
patch makes sense to me. But I am not sure how a TCP kernel
socket is supposed to be used.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 06/16] net: Tx via VRF device

2015-07-27 Thread David Ahern
If out device is enslaved to a VRF device we want packets to go through the
VRF master device first. This allows for example iptables rules and tc rules
to be configured on the VRF as a whole as well as the option for rules on
specific netdevices. This is accomplished by updating the dev in the dst to
point to the VRF device if it is enslaved.

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 net/ipv4/route.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 8119896e1159..050a3c1d89ba 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1903,6 +1903,23 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 
daddr, __be32 saddr,
 }
 EXPORT_SYMBOL(ip_route_input_noref);
 
+/* if out device is enslaved to a VRF device update dst to
+ * send through it
+ */
+static void rt_use_vrf_dev(struct rtable *rth, struct net_device *dev_out)
+{
+#if IS_ENABLED(CONFIG_NET_VRF)
+   int ifindex = vrf_master_dev_ifindex(dev_out);
+   struct net_device *mdev;
+
+   mdev = dev_get_by_index(dev_net(dev_out), ifindex);
+   if (mdev) {
+   dev_put(rth-dst.dev);
+   rth-dst.dev = mdev;
+   }
+#endif
+}
+
 /* called with rcu_read_lock() */
 static struct rtable *__mkroute_output(const struct fib_result *res,
   const struct flowi4 *fl4, int orig_oif,
@@ -2008,6 +2025,7 @@ static struct rtable *__mkroute_output(const struct 
fib_result *res,
}
 
rt_set_nexthop(rth, fl4-daddr, res, fnhe, fi, type, 0);
+   rt_use_vrf_dev(rth, dev_out);
 
return rth;
 }
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 14/16] net: Add sk_bind_dev_if to task_struct

2015-07-27 Thread David Ahern
Allow tasks to have a default device index for binding sockets. If set
the value is passed to all AF_INET/AF_INET6 sockets when they are created.

The task setting is passed parent to child on fork, but can be set or
changed after task creation using prctl (if task has CAP_NET_ADMIN
permissions). The setting for a socket can be retrieved using prctl().
This option allows an administrator to restrict a task to only send/receive
packets through the specified device. In the case of VRF devices this
option restricts tasks to a specific VRF.

Correlation of the device index to a specific VRF, ie.,
   ifindex -- VRF device -- VRF id
is left to userspace.

Example using VRF devices:
1. vrf1 is created and assigned to table 5
2. eth2 is enslaved to vrf1
3. eth2 is given the address 1.1.1.1/24

$ ip route ls table 5
prohibit default
1.1.1.0/24 dev eth2  scope link
local 1.1.1.1 dev eth2  proto kernel  scope host  src 1.1.1.1

Without setting a VRF context, ping, tcp and udp attempts fail, e.g.,
$ ping 1.1.1.254
connect: Network is unreachable

After binding the task to the vrf device ping succeeds:
$ ./chvrf -v 1 ping -c1 1.1.1.254
PING 1.1.1.254 (1.1.1.254) 56(84) bytes of data.
64 bytes from 1.1.1.254: icmp_seq=1 ttl=64 time=2.32 ms

Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 include/linux/sched.h  |  3 +++
 include/uapi/linux/prctl.h |  4 
 kernel/fork.c  |  2 ++
 kernel/sys.c   | 35 +++
 net/ipv4/af_inet.c |  1 +
 net/ipv4/route.c   |  4 +++-
 net/ipv6/af_inet6.c|  1 +
 net/ipv6/route.c   |  2 +-
 8 files changed, 50 insertions(+), 2 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 04b5ada460b4..29b336b8a466 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1528,6 +1528,9 @@ struct task_struct {
struct files_struct *files;
 /* namespaces */
struct nsproxy *nsproxy;
+/* network */
+   /* if set INET/INET6 sockets are bound to given dev index on create */
+   int sk_bind_dev_if;
 /* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;
diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
index 31891d9535e2..1ef45195d146 100644
--- a/include/uapi/linux/prctl.h
+++ b/include/uapi/linux/prctl.h
@@ -190,4 +190,8 @@ struct prctl_mm_map {
 # define PR_FP_MODE_FR (1  0)/* 64b FP registers */
 # define PR_FP_MODE_FRE(1  1)/* 32b compatibility */
 
+/* get/set network interface sockets are bound to by default */
+#define PR_SET_SK_BIND_DEV_IF   47
+#define PR_GET_SK_BIND_DEV_IF   48
+
 #endif /* _LINUX_PRCTL_H */
diff --git a/kernel/fork.c b/kernel/fork.c
index dbd9b8d7b7cc..8b396e77d2bf 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -380,6 +380,8 @@ static struct task_struct *dup_task_struct(struct 
task_struct *orig)
tsk-splice_pipe = NULL;
tsk-task_frag.page = NULL;
 
+   tsk-sk_bind_dev_if = orig-sk_bind_dev_if;
+
account_kernel_stack(ti, 1);
 
return tsk;
diff --git a/kernel/sys.c b/kernel/sys.c
index 259fda25eb6b..59119ac0a0bd 100644
--- a/kernel/sys.c
+++ b/kernel/sys.c
@@ -52,6 +52,7 @@
 #include linux/rcupdate.h
 #include linux/uidgid.h
 #include linux/cred.h
+#include linux/netdevice.h
 
 #include linux/kmsg_dump.h
 /* Move somewhere else to avoid recompiling? */
@@ -2267,6 +2268,40 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, 
unsigned long, arg3,
case PR_GET_FP_MODE:
error = GET_FP_MODE(me);
break;
+#ifdef CONFIG_NET
+   case PR_SET_SK_BIND_DEV_IF:
+   {
+   struct net_device *dev;
+   int idx = (int) arg2;
+
+   if (!capable(CAP_NET_ADMIN))
+   return -EPERM;
+
+   if (idx) {
+   dev = dev_get_by_index(me-nsproxy-net_ns, idx);
+   if (!dev)
+   return -EINVAL;
+   dev_put(dev);
+   }
+   me-sk_bind_dev_if = idx;
+   break;
+   }
+   case PR_GET_SK_BIND_DEV_IF:
+   {
+   struct task_struct *tsk;
+   int sk_bind_dev_if = -EINVAL;
+
+   rcu_read_lock();
+   tsk = find_task_by_vpid(arg2);
+   if (tsk)
+   sk_bind_dev_if = tsk-sk_bind_dev_if;
+   rcu_read_unlock();
+   if (tsk != me  !capable(CAP_NET_ADMIN))
+   return -EPERM;
+   error = sk_bind_dev_if;
+   break;
+   }
+#endif
default:
error = -EINVAL;
break;
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index 09c7c1ee307e..0651efa18d39 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -352,6 +352,7 @@ static int inet_create(struct net *net, struct socket 
*sock, int protocol,
  

[PATCH net-next 13/16] net: Introduce VRF device driver - v2

2015-07-27 Thread David Ahern
This driver borrows heavily from IPvlan and teaming drivers.

Routing domains (VRF-lite) are created by instantiating a VRF master
device with an associated table and enslaving all routed interfaces that
participate in the domain. As part of the enslavement, all connected
routes for the enslaved devices are moved to the table associated with
the VRF device. Outgoing sockets must bind to the VRF device to function.

Standard FIB rules bind the VRF device to tables and regular fib rule
processing is followed. Routed traffic through the box is forwarded by
using the VRF device as the IIF and following the IIF rule to a table
that is mated with the VRF.

Example:

   Create vrf 1:
 ip link add vrf1 type vrf table 5
 ip rule add iif vrf1 table 5
 ip rule add oif vrf1 table 5
 ip route add table 5 prohibit default
 ip link set vrf1 up

   Add interface to vrf 1:
 ip link set eth1 master vrf1

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com

v2:
- addressed comments from first RFC
- significant changes to improve simplicity of implementation
---
 drivers/net/Kconfig  |   7 +
 drivers/net/Makefile |   1 +
 drivers/net/vrf.c| 596 +++
 3 files changed, 604 insertions(+)
 create mode 100644 drivers/net/vrf.c

diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
index c18f9e62a9fa..e58468b02987 100644
--- a/drivers/net/Kconfig
+++ b/drivers/net/Kconfig
@@ -297,6 +297,13 @@ config NLMON
  diagnostics, etc. This is mostly intended for developers or support
  to debug netlink issues. If unsure, say N.
 
+config NET_VRF
+   tristate Virtual Routing and Forwarding (Lite)
+   depends on IP_MULTIPLE_TABLES  IPV6_MULTIPLE_TABLES
+   ---help---
+ This option enables the support for mapping interfaces into VRF's. The
+ support enables VRF devices.
+
 endif # NET_CORE
 
 config SUNGEM_PHY
diff --git a/drivers/net/Makefile b/drivers/net/Makefile
index c12cb22478a7..ca16dd689b36 100644
--- a/drivers/net/Makefile
+++ b/drivers/net/Makefile
@@ -25,6 +25,7 @@ obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
 obj-$(CONFIG_VXLAN) += vxlan.o
 obj-$(CONFIG_GENEVE) += geneve.o
 obj-$(CONFIG_NLMON) += nlmon.o
+obj-$(CONFIG_NET_VRF) += vrf.o
 
 #
 # Networking Drivers
diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
new file mode 100644
index ..8669b0f9d749
--- /dev/null
+++ b/drivers/net/vrf.c
@@ -0,0 +1,596 @@
+/*
+ * vrf.c: device driver to encapsulate a VRF space
+ *
+ * Copyright (c) 2015 Cumulus Networks
+ *
+ * Based on dummy, team and ipvlan drivers
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#include linux/module.h
+#include linux/kernel.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/ip.h
+#include linux/init.h
+#include linux/moduleparam.h
+#include linux/rtnetlink.h
+#include net/rtnetlink.h
+#include linux/u64_stats_sync.h
+#include linux/hashtable.h
+
+#include linux/inetdevice.h
+#include net/ip.h
+#include net/ip_fib.h
+#include net/ip6_route.h
+#include net/rtnetlink.h
+#include net/route.h
+#include net/addrconf.h
+#include net/vrf.h
+
+#define DRV_NAME   vrf
+#define DRV_VERSION1.0
+
+#define vrf_is_slave(dev)   ((dev)-flags  IFF_SLAVE)
+#define vrf_is_master(dev)  ((dev)-flags  IFF_MASTER)
+
+#define vrf_master_get_rcu(dev) \
+   ((struct net_device *)rcu_dereference(dev-rx_handler_data))
+
+struct pcpu_dstats {
+   u64 tx_pkts;
+   u64 tx_bytes;
+   u64 tx_drps;
+   u64 rx_pkts;
+   u64 rx_bytes;
+   struct u64_stats_sync   syncp;
+};
+
+struct slave {
+   struct list_headlist;
+   struct net_device   *dev;
+};
+
+struct slave_queue {
+   spinlock_t  lock; /* lock for slave insert/delete */
+   struct list_headall_slaves;
+   int num_slaves;
+};
+
+struct net_vrf {
+   struct slave_queue  queue;
+   struct fib_table*tb;
+   u32 tb_id;
+};
+
+static bool is_ip_rx_frame(struct sk_buff *skb)
+{
+   switch (skb-protocol) {
+   case htons(ETH_P_IP):
+   case htons(ETH_P_IPV6):
+   return true;
+   }
+   return false;
+}
+
+/* note: already called with rcu_read_lock */
+static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb)
+{
+   struct sk_buff *skb = *pskb;
+
+   if (is_ip_rx_frame(skb)) {
+   struct net_device *dev = vrf_master_get_rcu(skb-dev);
+   struct pcpu_dstats *dstats = this_cpu_ptr(dev-dstats);
+
+   u64_stats_update_begin(dstats-syncp);
+  

[PATCH] iproute2: Add support for VRF device

2015-07-27 Thread David Ahern
Allow user to create a vrf device and specify its table binding.
Based on the iplink_vlan implementation.

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 include/linux/if_link.h |  8 +
 ip/Makefile |  2 +-
 ip/iplink.c |  2 +-
 ip/iplink_vrf.c | 87 +
 4 files changed, 97 insertions(+), 2 deletions(-)
 create mode 100644 ip/iplink_vrf.c

diff --git a/include/linux/if_link.h b/include/linux/if_link.h
index 8df6a8466839..28872fbf6814 100644
--- a/include/linux/if_link.h
+++ b/include/linux/if_link.h
@@ -337,6 +337,14 @@ enum macvlan_macaddr_mode {
 
 #define MACVLAN_FLAG_NOPROMISC 1
 
+/* VRF section */
+enum {
+   IFLA_VRF_UNSPEC,
+   IFLA_VRF_TABLE,
+   __IFLA_VRF_MAX
+};
+
+#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1)
 /* IPVLAN section */
 enum {
IFLA_IPVLAN_UNSPEC,
diff --git a/ip/Makefile b/ip/Makefile
index 77653ecc5785..d8b38ac2e44b 100644
--- a/ip/Makefile
+++ b/ip/Makefile
@@ -7,7 +7,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o 
ipnetns.o \
 iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \
 link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \
 iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \
-iplink_geneve.o
+iplink_geneve.o iplink_vrf.o
 
 RTMONOBJ=rtmon.o
 
diff --git a/ip/iplink.c b/ip/iplink.c
index e296e6f611b8..892e8bc8808b 100644
--- a/ip/iplink.c
+++ b/ip/iplink.c
@@ -93,7 +93,7 @@ void iplink_usage(void)
fprintf(stderr, TYPE := { vlan | veth | vcan | dummy | ifb | 
macvlan | macvtap |\n);
fprintf(stderr,   bridge | bond | ipoib | ip6tnl | 
ipip | sit | vxlan |\n);
fprintf(stderr,   gre | gretap | ip6gre | ip6gretap | 
vti | nlmon |\n);
-   fprintf(stderr,   bond_slave | ipvlan | geneve }\n);
+   fprintf(stderr,   bond_slave | ipvlan | geneve | vrf 
}\n);
}
exit(-1);
 }
diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c
new file mode 100644
index ..bfcb3cdeaf35
--- /dev/null
+++ b/ip/iplink_vrf.c
@@ -0,0 +1,87 @@
+/* iplink_vrf.cVRF device support
+ *
+ *  This program is free software; you can redistribute it and/or
+ *  modify it under the terms of the GNU General Public License
+ *  as published by the Free Software Foundation; either version
+ *  2 of the License, or (at your option) any later version.
+ *
+ * Authors: Shrijeet Mukherjee s...@cumulusnetworks.com
+ */
+
+#include stdio.h
+#include stdlib.h
+#include string.h
+#include sys/socket.h
+#include linux/if_link.h
+
+#include rt_names.h
+#include utils.h
+#include ip_common.h
+
+static void vrf_explain(FILE *f)
+{
+   fprintf(f, Usage: ... vrf table TABLEID \n);
+}
+
+static void explain(void)
+{
+   vrf_explain(stderr);
+}
+
+static int table_arg(void)
+{
+   fprintf(stderr,Error: argument of \table\ must be 0-32767 and 
currently unused\n);
+   return -1;
+}
+
+static int vrf_parse_opt(struct link_util *lu, int argc, char **argv,
+   struct nlmsghdr *n)
+{
+   while (argc  0) {
+   if (matches(*argv, table) == 0) {
+   __u32 table = 0;
+   NEXT_ARG();
+
+   table = atoi(*argv);
+   if (table  0 || table  32767)
+   return table_arg();
+   /* XXX need a table in-use check here */
+   fprintf(stderr, adding table %d\n, table);
+   addattr32(n, 1024, IFLA_VRF_TABLE, table);
+   } else if (matches(*argv, help) == 0) {
+   explain();
+   return -1;
+   } else {
+   fprintf(stderr, vrf: unknown option \%s\?\n,
+   *argv);
+   explain();
+   return -1;
+   }
+   argc--, argv++;
+   }
+
+   return 0;
+}
+
+static void vrf_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[])
+{
+   if (!tb)
+   return;
+
+   if (tb[IFLA_VRF_TABLE])
+   fprintf(f, table %u , rta_getattr_u32(tb[IFLA_VRF_TABLE]));
+}
+
+static void vrf_print_help(struct link_util *lu, int argc, char **argv,
+ FILE *f)
+{
+   vrf_explain(f);
+}
+
+struct link_util vrf_link_util = {
+   .id = vrf,
+   .maxattr= IFLA_VRF_MAX,
+   .parse_opt  = vrf_parse_opt,
+   .print_opt  = vrf_print_opt,
+   .print_help = vrf_print_help,
+};
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  

[PATCH net-next 11/16] net: Use VRF device index for socket lookups

2015-07-27 Thread David Ahern
The intent of the VRF device is to leverage the existing SO_BINDTODEVICE
as a means of creating L3 domains. Since sockets are expected to be bound
to the VRF device the index of the master device needs to be used for
socket lookups.

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 net/ipv4/syncookies.c |  5 -
 net/ipv4/tcp_input.c  |  6 +-
 net/ipv4/tcp_ipv4.c   | 11 +--
 3 files changed, 18 insertions(+), 4 deletions(-)

diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c
index d70b1f603692..dab52fba5872 100644
--- a/net/ipv4/syncookies.c
+++ b/net/ipv4/syncookies.c
@@ -18,6 +18,7 @@
 #include linux/export.h
 #include net/tcp.h
 #include net/route.h
+#include net/vrf.h
 
 extern int sysctl_tcp_syncookies;
 
@@ -348,7 +349,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct 
sk_buff *skb)
treq-snt_synack= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0;
treq-tfo_listener  = false;
 
-   ireq-ir_iif = sk-sk_bound_dev_if;
+   ireq-ir_iif = vrf_get_master_dev_ifindex(sock_net(sk), skb-skb_iif);
+   if (!ireq-ir_iif)
+   ireq-ir_iif = sk-sk_bound_dev_if;
 
/* We throwed the options of the initial SYN away, so we hope
 * the ACK carries the same options again (see RFC1122 4.2.3.8)
diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c
index 4e4d6bcd0ca9..df82fb05c459 100644
--- a/net/ipv4/tcp_input.c
+++ b/net/ipv4/tcp_input.c
@@ -72,6 +72,7 @@
 #include net/dst.h
 #include net/tcp.h
 #include net/inet_common.h
+#include net/vrf.h
 #include linux/ipsec.h
 #include asm/unaligned.h
 #include linux/errqueue.h
@@ -6141,7 +6142,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_openreq_init(req, tmp_opt, skb, sk);
 
/* Note: tcp_v6_init_req() might override ir_iif for link locals */
-   inet_rsk(req)-ir_iif = sk-sk_bound_dev_if;
+   inet_rsk(req)-ir_iif = vrf_get_master_dev_ifindex(sock_net(sk),
+  skb-skb_iif);
+   if (!inet_rsk(req)-ir_iif)
+   inet_rsk(req)-ir_iif = sk-sk_bound_dev_if;
 
af_ops-init_req(req, sk, skb);
 
diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
index 486ba96ae91a..d0c40f4d9058 100644
--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -75,6 +75,7 @@
 #include net/secure_seq.h
 #include net/tcp_memcontrol.h
 #include net/busy_poll.h
+#include net/vrf.h
 
 #include linux/inet.h
 #include linux/ipv6.h
@@ -682,6 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct 
sk_buff *skb)
 */
if (sk)
arg.bound_dev_if = sk-sk_bound_dev_if;
+   if (!arg.bound_dev_if  skb-dev)
+   arg.bound_dev_if = vrf_master_dev_ifindex(skb-dev);
 
arg.tos = ip_hdr(skb)-tos;
ip_send_unicast_reply(*this_cpu_ptr(net-ipv4.tcp_sk),
@@ -766,8 +769,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, 
u32 ack,
  ip_hdr(skb)-saddr, /* XXX */
  arg.iov[0].iov_len, IPPROTO_TCP, 0);
arg.csumoffset = offsetof(struct tcphdr, check) / 2;
-   if (oif)
-   arg.bound_dev_if = oif;
+   arg.bound_dev_if = oif ? : vrf_master_dev_ifindex(skb_dst(skb)-dev);
+   if (!arg.bound_dev_if)
+   arg.bound_dev_if = vrf_master_dev_ifindex(skb-dev);
+
arg.tos = tos;
ip_send_unicast_reply(*this_cpu_ptr(net-ipv4.tcp_sk),
  skb, TCP_SKB_CB(skb)-header.h4.opt,
@@ -1269,6 +1274,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct 
sk_buff *skb,
ireq  = inet_rsk(req);
sk_daddr_set(newsk, ireq-ir_rmt_addr);
sk_rcv_saddr_set(newsk, ireq-ir_loc_addr);
+   if (netif_index_is_vrf(sock_net(newsk), ireq-ir_iif))
+   newsk-sk_bound_dev_if = ireq-ir_iif;
newinet-inet_saddr   = ireq-ir_loc_addr;
inet_opt  = ireq-opt;
rcu_assign_pointer(newinet-inet_opt, inet_opt);
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 07/16] net: Add inet_addr lookup by table

2015-07-27 Thread David Ahern
Currently inet_addr_type and inet_dev_addr_type expect local addresses
to be in the local table. With the VRF device local routes for devices
associated with a VRF will be in the table associated with the VRF.
Provide an alternate inet_addr lookup to use a specific table rather
than defaulting to the local table.

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 include/net/route.h |  1 +
 net/ipv4/fib_frontend.c | 22 +++---
 2 files changed, 16 insertions(+), 7 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 54f97eea0fb2..3b51c339c269 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -192,6 +192,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk);
 void ip_rt_send_redirect(struct sk_buff *skb);
 
 unsigned int inet_addr_type(struct net *net, __be32 addr);
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr);
 void ip_rt_multicast_event(struct in_device *);
diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6e68a003d0fd..cc413b0170ed 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -214,12 +214,12 @@ void fib_flush_external(struct net *net)
  */
 static inline unsigned int __inet_dev_addr_type(struct net *net,
const struct net_device *dev,
-   __be32 addr)
+   __be32 addr, int tb_id)
 {
struct flowi4   fl4 = { .daddr = addr };
struct fib_result   res;
unsigned int ret = RTN_BROADCAST;
-   struct fib_table *local_table;
+   struct fib_table *table;
 
if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr))
return RTN_BROADCAST;
@@ -228,10 +228,10 @@ static inline unsigned int __inet_dev_addr_type(struct 
net *net,
 
rcu_read_lock();
 
-   local_table = fib_get_table(net, RT_TABLE_LOCAL);
-   if (local_table) {
+   table = fib_get_table(net, tb_id);
+   if (table) {
ret = RTN_UNICAST;
-   if (!fib_table_lookup(local_table, fl4, res, 
FIB_LOOKUP_NOREF)) {
+   if (!fib_table_lookup(table, fl4, res, FIB_LOOKUP_NOREF)) {
if (!dev || dev == res.fi-fib_dev)
ret = res.type;
}
@@ -241,16 +241,24 @@ static inline unsigned int __inet_dev_addr_type(struct 
net *net,
return ret;
 }
 
+unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id)
+{
+   return __inet_dev_addr_type(net, NULL, addr, tb_id);
+}
+EXPORT_SYMBOL(inet_addr_type_table);
+
 unsigned int inet_addr_type(struct net *net, __be32 addr)
 {
-   return __inet_dev_addr_type(net, NULL, addr);
+   return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL);
 }
 EXPORT_SYMBOL(inet_addr_type);
 
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr)
 {
-   return __inet_dev_addr_type(net, dev, addr);
+   int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL;
+
+   return __inet_dev_addr_type(net, dev, addr, rt_table);
 }
 EXPORT_SYMBOL(inet_dev_addr_type);
 
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 12/16] net: Add ipv4 route helper to set next hop

2015-07-27 Thread David Ahern
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 include/net/route.h |  3 +++
 net/ipv4/route.c| 10 ++
 2 files changed, 13 insertions(+)

diff --git a/include/net/route.h b/include/net/route.h
index b14cbec93fbd..900d50fbcfc7 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -107,6 +107,7 @@ struct rt_cache_stat {
 extern struct ip_rt_acct __percpu *ip_rt_acct;
 
 struct in_device;
+struct fib_result;
 
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
@@ -114,6 +115,8 @@ void rt_flush_dev(struct net_device *dev);
 struct rtable *ip_route_new_rtable(struct net_device *dev,
   unsigned int flags, u16 type,
   bool nopolicy, bool noxfrm, bool do_cache);
+void ip_route_set_nexthop(struct rtable *rt, __be32 daddr,
+ const struct fib_result *res);
 struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
struct sock *sk);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 050a3c1d89ba..47dae001a000 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1537,6 +1537,16 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 
daddr, __be32 saddr,
return err;
 }
 
+void ip_route_set_nexthop(struct rtable *rt, __be32 daddr,
+ const struct fib_result *res)
+{
+   struct fib_nh_exception *fnhe;
+
+   fnhe = find_exception(FIB_RES_NH(*res), daddr);
+
+   rt_set_nexthop(rt, daddr, res, fnhe, res-fi, res-type, 0);
+}
+EXPORT_SYMBOL(ip_route_set_nexthop);
 
 static void ip_handle_martian_source(struct net_device *dev,
 struct in_device *in_dev,
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 09/16] net: Add routes to the table associated with the device

2015-07-27 Thread David Ahern
When a device associated with a VRF is brought up or down routes
should be added to/removed from the table associated with the VRF.
fib_magic defaults to using the main or local tables. Have it use
the table with the device if there is one.

A part of this is directing prefsrc validations to the correct
table as well.

Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 net/ipv4/fib_frontend.c  |  8 
 net/ipv4/fib_semantics.c | 25 +++--
 2 files changed, 23 insertions(+), 10 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 5ce0d11222ca..e35541a64449 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -805,6 +805,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct 
netlink_callback *cb)
 static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct 
in_ifaddr *ifa)
 {
struct net *net = dev_net(ifa-ifa_dev-dev);
+   int tb_id = vrf_dev_table(ifa-ifa_dev-dev);
struct fib_table *tb;
struct fib_config cfg = {
.fc_protocol = RTPROT_KERNEL,
@@ -819,11 +820,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int 
dst_len, struct in_ifad
},
};
 
-   if (type == RTN_UNICAST)
-   tb = fib_new_table(net, RT_TABLE_MAIN);
-   else
-   tb = fib_new_table(net, RT_TABLE_LOCAL);
+   if (!tb_id)
+   tb_id = (type == RTN_UNICAST) ? RT_TABLE_MAIN : RT_TABLE_LOCAL;
 
+   tb = fib_new_table(net, tb_id);
if (!tb)
return;
 
diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index a578eacf9fcd..37e1dee7692a 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -838,6 +838,23 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct 
fib_nh *nh)
return nh-nh_saddr;
 }
 
+static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc)
+{
+   if (cfg-fc_type != RTN_LOCAL || !cfg-fc_dst ||
+   fib_prefsrc != cfg-fc_dst) {
+   int tb_id = cfg-fc_table;
+
+   if (tb_id == RT_TABLE_MAIN)
+   tb_id = RT_TABLE_LOCAL;
+
+   if (inet_addr_type_table(cfg-fc_nlinfo.nl_net,
+fib_prefsrc, tb_id) != RTN_LOCAL) {
+   return false;
+   }
+   }
+   return true;
+}
+
 struct fib_info *fib_create_info(struct fib_config *cfg)
 {
int err;
@@ -1033,12 +1050,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg)
fi-fib_flags |= RTNH_F_LINKDOWN;
}
 
-   if (fi-fib_prefsrc) {
-   if (cfg-fc_type != RTN_LOCAL || !cfg-fc_dst ||
-   fi-fib_prefsrc != cfg-fc_dst)
-   if (inet_addr_type(net, fi-fib_prefsrc) != RTN_LOCAL)
-   goto err_inval;
-   }
+   if (fi-fib_prefsrc  !fib_valid_prefsrc(cfg, fi-fib_prefsrc))
+   goto err_inval;
 
change_nexthops(fi) {
fib_info_update_nh_saddr(net, nexthop_nh);
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 05/16] net: Use VRF device index for lookups on TX

2015-07-27 Thread David Ahern
As with ingress use the index of VRF master device for route lookups on
egress. However, the oif should only be used to direct the lookups to a
specific table. Routes in the table are not based on the VRF device but
rather interfaces that are part of the VRF so do not consider the oif for
lookups within the table. The FLOWI_FLAG_VRFSRC is used to control this
latter part.

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 include/net/flow.h  | 1 +
 include/net/route.h | 3 +++
 net/ipv4/fib_trie.c | 7 +--
 net/ipv4/icmp.c | 4 
 net/ipv4/route.c| 3 +++
 5 files changed, 16 insertions(+), 2 deletions(-)

diff --git a/include/net/flow.h b/include/net/flow.h
index 3098ae33a178..f305588fc162 100644
--- a/include/net/flow.h
+++ b/include/net/flow.h
@@ -33,6 +33,7 @@ struct flowi_common {
__u8flowic_flags;
 #define FLOWI_FLAG_ANYSRC  0x01
 #define FLOWI_FLAG_KNOWN_NH0x02
+#define FLOWI_FLAG_VRFSRC  0x04
__u32   flowic_secid;
struct flowi_tunnel flowic_tun_key;
 };
diff --git a/include/net/route.h b/include/net/route.h
index cec7a2a055c8..54f97eea0fb2 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -254,6 +254,9 @@ static inline void ip_route_connect_init(struct flowi4 
*fl4, __be32 dst, __be32
if (inet_sk(sk)-transparent)
flow_flags |= FLOWI_FLAG_ANYSRC;
 
+   if (netif_index_is_vrf(sock_net(sk), oif))
+   flow_flags |= FLOWI_FLAG_VRFSRC;
+
flowi4_init_output(fl4, oif, sk-sk_mark, tos, RT_SCOPE_UNIVERSE,
   protocol, flow_flags, dst, src, dport, sport);
 }
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ac2d828c6daa..7da901c56e35 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1421,8 +1421,11 @@ int fib_table_lookup(struct fib_table *tb, const struct 
flowi4 *flp,
nh-nh_flags  RTNH_F_LINKDOWN 
!(fib_flags  FIB_LOOKUP_IGNORE_LINKSTATE))
continue;
-   if (flp-flowi4_oif  flp-flowi4_oif != nh-nh_oif)
-   continue;
+   if (!(flp-flowi4_flags  FLOWI_FLAG_VRFSRC)) {
+   if (flp-flowi4_oif 
+   flp-flowi4_oif != nh-nh_oif)
+   continue;
+   }
 
if (!(fib_flags  FIB_LOOKUP_NOREF))
atomic_inc(fi-fib_clntref);
diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c
index c0556f1e4bf0..d2d142b775b8 100644
--- a/net/ipv4/icmp.c
+++ b/net/ipv4/icmp.c
@@ -96,6 +96,7 @@
 #include net/xfrm.h
 #include net/inet_common.h
 #include net/ip_fib.h
+#include net/vrf.h
 
 /*
  * Build xmit assembly blocks
@@ -425,6 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct 
sk_buff *skb)
fl4.flowi4_mark = mark;
fl4.flowi4_tos = RT_TOS(ip_hdr(skb)-tos);
fl4.flowi4_proto = IPPROTO_ICMP;
+   fl4.flowi4_oif = vrf_master_dev_ifindex(skb-dev) ? : skb-dev-ifindex;
security_skb_classify_flow(skb, flowi4_to_flowi(fl4));
rt = ip_route_output_key(net, fl4);
if (IS_ERR(rt))
@@ -458,6 +460,8 @@ static struct rtable *icmp_route_lookup(struct net *net,
fl4-flowi4_proto = IPPROTO_ICMP;
fl4-fl4_icmp_type = type;
fl4-fl4_icmp_code = code;
+   fl4-flowi4_oif = vrf_master_dev_ifindex(skb_in-dev) ? : 
skb_in-dev-ifindex;
+
security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4));
rt = __ip_route_output_key(net, fl4);
if (IS_ERR(rt))
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ba74c83c05be..8119896e1159 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -2093,6 +2093,9 @@ struct rtable *__ip_route_output_key(struct net *net, 
struct flowi4 *fl4)
if (!dev_out)
goto out;
 
+   if (netif_is_vrf(dev_out))
+   fl4-flowi4_flags |= FLOWI_FLAG_VRFSRC;
+
/* RACE: Check return value of inet_select_addr instead. */
if (!(dev_out-flags  IFF_UP) || !__in_dev_get_rcu(dev_out)) {
rth = ERR_PTR(-ENETUNREACH);
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2] route: allow to route in a peer netns via lwt framework

2015-07-27 Thread Nicolas Dichtel

Le 24/07/2015 17:39, Eric Dumazet a écrit :


On Fri, 2015-07-24 at 16:16 +0200, Nicolas Dichtel wrote:

This patch takes advantage of the newly added lwtunnel framework to
allow the user to set routes that point to a peer netns.

Packets are injected to the peer netns via the loopback device. It works
only when the output device is 'lo'.

Example:
ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo



Is this feature so badly wanted to add complexity on lo device ?


The goal is to be scalable when the number of netns is high (10k or more).
With this patch, we can save two interfaces (veth) per netns, which helps to
reduce memory consumption and the time needed to create a netns.


[snip]

+   if (nsid != NETNSA_NSID_NOT_ASSIGNED) {
+   peernet = get_net_ns_by_id(dev_net(dev), nsid);
+   if (!peernet) {
+   kfree_skb(skb);
+   goto end;
+   }
+
+   /* it's OK to use per_cpu_ptr() because BHs are off */
+   lb_stats = this_cpu_ptr(peernet-loopback_dev-lstats);
+   ret = dev_forward_skb(peernet-loopback_dev, skb);
+   } else {
+   skb_orphan(skb);

-   /* it's OK to use per_cpu_ptr() because BHs are off */
-   lb_stats = this_cpu_ptr(dev-lstats);
+   skb-protocol = eth_type_trans(skb, dev);
+
+   /* it's OK to use per_cpu_ptr() because BHs are off */
+   lb_stats = this_cpu_ptr(dev-lstats);
+   ret = netif_rx(skb);
+   }

len = skb-len;


Use-after-free error: at this point you can no longer access skb.

Right, will fix it.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework

2015-07-27 Thread Nicolas Dichtel

Le 24/07/2015 17:19, David Ahern a écrit :

In this case you are knowingly dropping packets. Would be nice to have a counter
showing that.

Ok.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Cong Wang
On Mon, Jul 27, 2015 at 11:19 AM, Sowmini Varadhan
sowmini.varad...@oracle.com wrote:
 On (07/27/15 11:13), Cong Wang wrote:

 That refcnt should be released in sock destructor too, when the tcp
 connection is terminated.

 yes, but in my case, the listen socket is opened as part of
 the ->init indirection in pernet_operations (thus it is a kernel socket)
 and the expectation is that this listen socket, and any accept sockets
 derived from it, will be closed in ->exit.

 But if the accept socket is treated as a uspace socket (thus holds a
 get_net())
 then it will block cleanup_net() and the associated ->exit cleanup operations.

 This is probably not a problem for other systems like vxlan/gue/geneve etc
 because they all use udp sockets, thus dont have the accept equivalent.


dlm uses a kernel TCP socket too, but it allocates a new socket and calls
->accept() by itself. ;)
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH v4 net-next 1/4] tcp: replace cnt rtt with struct in pkts_acked()

2015-07-27 Thread Stephen Hemminger
On Fri, 24 Jul 2015 19:47:03 -0700
Lawrence Brakmo bra...@fb.com wrote:

 Replace 2 arguments (cnt and rtt) in the congestion control modules'
 pkts_acked() function with a struct. This will allow adding more
 information without having to modify existing congestion control
 modules (tcp_nv in particular needs bytes in flight when packet
 was sent).
 
 As proposed by Neal Cardwell in his comments to the tcp_nv patch.

Adding a layer of indirection makes code changes easier, but makes
the code slower. Arguments are passed in registers, and putting an
additional level of indirection only matters if you can't change
all the CC modules. Since this is the kernel and API compatibility
doesn't matter, just pass more arguments.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[net-next 0/16] Proposal for VRF-lite - v3

2015-07-27 Thread David Ahern
In the context of internet scale routing a requirement that always comes
up is the need to partition the available routing tables into disjoint
routing planes. A specific use case is the multi-tenancy problem where
each tenant has their own unique routing tables and at the very least
need different default gateways.

This patch allows the ability to create virtual router domains (aka VRFs,
VRF-lite to be specific) in the linux packet forwarding stack. The main
observation is that through the use of rules and socket binding to interfaces,
all the facilities that we need are already present in the infrastructure. What
is missing is a handle that identifies a routing domain and can be used to
gather applicable rules/tables and uniqify neighbor selection. The scheme used
needs to preserve the notions of ECMP, and general routing principles.

This driver is a cross between functionality that the IPVLAN driver
and the Team drivers provide where a device is created and packets
into/out of the routing domain are shuttled through this device. The
device is then used as a handle to identify the applicable rules. The
VRF device is thus the layer3 equivalent of a vlan device.

The very important point to note is that this is only a Layer3 concept
so L2 tools (e.g., LLDP) do not need to be run in each VRF, processes can
run in unaware mode or select a VRF to be talking through. Also the
behavioral model is a generalized application of the familiar VRF-Lite
model with some performance paths that need optimization. (Specifically
the output route selector that Roopa, Robert, Thomas and EricB are
currently discussing on the MPLS thread)

High Level points
=================
1. Simple overlay driver (minimal changes to current stack)
   * uses the existing fib tables and fib rules infrastructure
2. Modelled closely after the ipvlan driver
3. Uses current API and infrastructure.
   * Applications can use SO_BINDTODEVICE or cmsg device identifiers
 to pick VRF (ping, traceroute just work)
   * Standard IP Rules work, and since they are aggregated against the
 device, scale is manageable
4. Completely orthogonal to Namespaces and only provides separation in
   the routing plane (and ARP)

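                                                 N2
       N1 (all configs here)          +---------------+
+--------------+                      |               |
|swp1 :10.0.1.1+----------------------+swp1 :10.0.1.2 |
|              |                      |               |
|swp2 :10.0.2.1+----------------------+swp2 :10.0.2.2 |
|              |                      +---------------+
| VRF 1        |
| table 5      |
|              |
+--------------+
|              |
| VRF 2        |                             N3
| table 6      |                      +---------------+
|              |                      |               |
|swp3 :10.0.2.1+----------------------+swp1 :10.0.2.2 |
|              |                      |               |
|swp4 :10.0.3.1+----------------------+swp2 :10.0.3.2 |
+--------------+                      +---------------+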


Given the topology above, the setup needed to get the basic VRF
functions working would be

Create the VRF devices and associate with a table
ip link add vrf1 type vrf table 5
ip link add vrf2 type vrf table 6

Install the lookup rules that map table to VRF domain
ip rule add pref 200 oif vrf1 lookup 5
ip rule add pref 200 iif vrf1 lookup 5
ip rule add pref 200 oif vrf2 lookup 6
ip rule add pref 200 iif vrf2 lookup 6

ip link set vrf1 up
ip link set vrf2 up

Enslave the routing member interfaces
ip link set swp1 master vrf1
ip link set swp2 master vrf1
ip link set swp3 master vrf2
ip link set swp4 master vrf2

Connected routes are automatically moved from main table to the VRF
table.

ping using vrf1 is simply
ping -I vrf1 10.0.1.2

Or using the task context and a command such as the example chvrf in
patch 15 unmodified applications are run in a VRF context using:
   chvrf -v 1 ping 10.0.1.2


Design Highlights
=================
If a device is enslaved to a VRF device (ie., associated with a VRF)
then:
1. Rx path
   The master device index is used as the iif for all lookups.

2. Tx path
   Similarly, for Tx the VRF device oif is used in the flow to direct
   lookups to the table associated with the VRF via its rule. From there
   the FLOWI_FLAG_VRFSRC flag is used to indicate that the oif should
   not be used for FIB table lookups.

3. Connected and local routes
   On link up for a device, connected and local routes are added to the
   table associated with the VRF device, rather than the local and main
   tables.

4. Socket lookups
   Socket lookups use the VRF device for comparison with sk_bound_dev_if.
   If a socket is not bound to a device a socket match can happen based
   on destination address, port and protocol in which case a VRF global
   or agnostic 

[PATCH net-next 08/16] net: Fix up inet_addr_type checks

2015-07-27 Thread David Ahern
Currently inet_addr_type and inet_dev_addr_type expect local addresses
to be in the local table. With the VRF device local routes for devices
associated with a VRF will be in the table associated with the VRF.
Provide an alternate inet_addr lookup to use a specific table rather
than defaulting to the local table.

inet_addr_type_dev_table keeps the same semantics as inet_addr_type but
if the passed in device is enslaved to a VRF then the table for that VRF
is used for the lookup.

Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 include/net/route.h  |  3 +++
 net/ipv4/af_inet.c   | 13 -
 net/ipv4/arp.c   | 15 +--
 net/ipv4/fib_frontend.c  | 28 +---
 net/ipv4/fib_semantics.c |  6 --
 net/ipv4/icmp.c  |  5 +++--
 6 files changed, 56 insertions(+), 14 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 3b51c339c269..b14cbec93fbd 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -195,6 +195,9 @@ unsigned int inet_addr_type(struct net *net, __be32 addr);
 unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id);
 unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev,
__be32 addr);
+unsigned int inet_addr_type_dev_table(struct net *net,
+ const struct net_device *dev,
+ __be32 addr);
 void ip_rt_multicast_event(struct in_device *);
 int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg);
 void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt);
diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
index cc4e498a0ccf..09c7c1ee307e 100644
--- a/net/ipv4/af_inet.c
+++ b/net/ipv4/af_inet.c
@@ -119,6 +119,7 @@
 #ifdef CONFIG_IP_MROUTE
 #include linux/mroute.h
 #endif
+#include net/vrf.h
 
 
 /* The inetsw table contains everything that inet_create needs to
@@ -427,6 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
struct net *net = sock_net(sk);
unsigned short snum;
int chk_addr_ret;
+   int tb_id = 0;
int err;
 
/* If the socket has its own bind function then use it. (RAW) */
@@ -448,7 +450,16 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, 
int addr_len)
goto out;
}
 
-   chk_addr_ret = inet_addr_type(net, addr-sin_addr.s_addr);
+   if (sk-sk_bound_dev_if) {
+   struct net_device *dev;
+
+   dev = dev_get_by_index(net, sk-sk_bound_dev_if);
+   if (dev) {
+   tb_id = vrf_dev_table(dev);
+   dev_put(dev);
+   }
+   }
+   chk_addr_ret = inet_addr_type_table(net, addr-sin_addr.s_addr, tb_id);
 
/* Not specified by any standard per-se, however it breaks too
 * many applications when removed.  It is unfortunate since
diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c
index 1d59e50ce8b7..53eee7cecce8 100644
--- a/net/ipv4/arp.c
+++ b/net/ipv4/arp.c
@@ -233,7 +233,7 @@ static int arp_constructor(struct neighbour *neigh)
return -EINVAL;
}
 
-   neigh-type = inet_addr_type(dev_net(dev), addr);
+   neigh-type = inet_addr_type_dev_table(dev_net(dev), dev, addr);
 
parms = in_dev-arp_parms;
__neigh_parms_put(neigh-parms);
@@ -343,7 +343,7 @@ static void arp_solicit(struct neighbour *neigh, struct 
sk_buff *skb)
switch (IN_DEV_ARP_ANNOUNCE(in_dev)) {
default:
case 0: /* By default announce any local IP */
-   if (skb  inet_addr_type(dev_net(dev),
+   if (skb  inet_addr_type_dev_table(dev_net(dev), dev,
  ip_hdr(skb)-saddr) == RTN_LOCAL)
saddr = ip_hdr(skb)-saddr;
break;
@@ -351,7 +351,8 @@ static void arp_solicit(struct neighbour *neigh, struct 
sk_buff *skb)
if (!skb)
break;
saddr = ip_hdr(skb)-saddr;
-   if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) {
+   if (inet_addr_type_dev_table(dev_net(dev), dev,
+saddr) == RTN_LOCAL) {
/* saddr should be known to target */
if (inet_addr_onlink(in_dev, target, saddr))
break;
@@ -751,7 +752,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb)
/* Special case: IPv4 duplicate address detection packet (RFC2131) */
if (sip == 0) {
if (arp-ar_op == htons(ARPOP_REQUEST) 
-   inet_addr_type(net, tip) == RTN_LOCAL 
+   inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL 
!arp_ignore(in_dev, sip, tip))
arp_send(ARPOP_REPLY, ETH_P_ARP, sip, dev, tip, sha,

[PATCH net-next 04/16] net: Use VRF device index for lookups on RX

2015-07-27 Thread David Ahern
On ingress use index of VRF master device for route lookups if real device
is enslaved. Rules are expected to be installed for the VRF device to
direct lookups to a specific table.

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 net/ipv4/fib_frontend.c | 8 +++-
 net/ipv4/route.c| 3 ++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index c565fc182240..6e68a003d0fd 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -45,6 +45,7 @@
 #include net/ip_fib.h
 #include net/rtnetlink.h
 #include net/xfrm.h
+#include net/vrf.h
 
 #ifndef CONFIG_IP_MULTIPLE_TABLES
 
@@ -311,7 +312,9 @@ static int __fib_validate_source(struct sk_buff *skb, 
__be32 src, __be32 dst,
bool dev_match;
 
fl4.flowi4_oif = 0;
-   fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
+   fl4.flowi4_iif = vrf_master_dev_ifindex(dev);
+   if (!fl4.flowi4_iif)
+   fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX;
fl4.daddr = src;
fl4.saddr = dst;
fl4.flowi4_tos = tos;
@@ -341,6 +344,9 @@ static int __fib_validate_source(struct sk_buff *skb, 
__be32 src, __be32 dst,
if (nh-nh_dev == dev) {
dev_match = true;
break;
+   } else if (vrf_master_dev_ifindex(nh-nh_dev) == dev-ifindex) {
+   dev_match = true;
+   break;
}
}
 #else
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index ef140919211f..ba74c83c05be 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -112,6 +112,7 @@
 #endif
 #include net/secure_seq.h
 #include net/ip_tunnels.h
+#include net/vrf.h
 
 #define RT_FL_TOS(oldflp4) \
((oldflp4)-flowi4_tos  (IPTOS_RT_MASK | RTO_ONLINK))
@@ -1735,7 +1736,7 @@ static int ip_route_input_slow(struct sk_buff *skb, 
__be32 daddr, __be32 saddr,
 *  Now we are ready to route packet.
 */
fl4.flowi4_oif = 0;
-   fl4.flowi4_iif = dev-ifindex;
+   fl4.flowi4_iif = vrf_master_dev_ifindex(dev) ? : dev-ifindex;
fl4.flowi4_mark = skb-mark;
fl4.flowi4_tos = tos;
fl4.flowi4_scope = RT_SCOPE_UNIVERSE;
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 10/16] net: Use passed in table for nexthop lookups

2015-07-27 Thread David Ahern
If a user passes in a table for new routes use that table for nexthop
lookups. Specifically, this solves the case where a connected route does
not exist in the main table, but only another table and then a subsequent
route is added with a next hop using the connected route. ie.,

$ ip route ls
default via 10.0.2.2 dev eth0
10.0.2.0/24 dev eth0  proto kernel  scope link  src 10.0.2.15
169.254.0.0/16 dev eth0  scope link  metric 1003
192.168.56.0/24 dev eth1  proto kernel  scope link  src 192.168.56.51

$ ip route ls table 10
1.1.1.0/24 dev eth2  scope link

Without this patch adding a nexthop route fails:

$ ip route add table 10 2.2.2.0/24 via 1.1.1.10
RTNETLINK answers: Network is unreachable

With this patch the route is added successfully.

Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 net/ipv4/fib_semantics.c | 13 +++--
 1 file changed, 11 insertions(+), 2 deletions(-)

diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c
index 37e1dee7692a..7d79dfbfa5d2 100644
--- a/net/ipv4/fib_semantics.c
+++ b/net/ipv4/fib_semantics.c
@@ -691,6 +691,7 @@ static int fib_check_nh(struct fib_config *cfg, struct 
fib_info *fi,
}
rcu_read_lock();
{
+   struct fib_table *tbl = NULL;
struct flowi4 fl4 = {
.daddr = nh-nh_gw,
.flowi4_scope = cfg-fc_scope + 1,
@@ -701,8 +702,16 @@ static int fib_check_nh(struct fib_config *cfg, struct 
fib_info *fi,
/* It is not necessary, but requires a bit of thinking 
*/
if (fl4.flowi4_scope  RT_SCOPE_LINK)
fl4.flowi4_scope = RT_SCOPE_LINK;
-   err = fib_lookup(net, fl4, res,
-FIB_LOOKUP_IGNORE_LINKSTATE);
+
+   if (cfg-fc_table)
+   tbl = fib_get_table(net, cfg-fc_table);
+
+   if (tbl)
+   err = fib_table_lookup(tbl, fl4, res,
+  FIB_LOOKUP_IGNORE_LINKSTATE);
+   else
+   err = fib_lookup(net, fl4, res,
+FIB_LOOKUP_IGNORE_LINKSTATE);
if (err) {
rcu_read_unlock();
return err;
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 01/16] net: Refactor rtable allocation and initialization

2015-07-27 Thread David Ahern
All callers to rt_dst_alloc have nearly the same initialization following
a successful allocation. Consolidate it into ip_route_new_rtable.

Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 include/net/route.h |   3 ++
 net/ipv4/route.c| 111 +++-
 2 files changed, 51 insertions(+), 63 deletions(-)

diff --git a/include/net/route.h b/include/net/route.h
index 2d45f419477f..cec7a2a055c8 100644
--- a/include/net/route.h
+++ b/include/net/route.h
@@ -111,6 +111,9 @@ struct in_device;
 int ip_rt_init(void);
 void rt_cache_flush(struct net *net);
 void rt_flush_dev(struct net_device *dev);
+struct rtable *ip_route_new_rtable(struct net_device *dev,
+  unsigned int flags, u16 type,
+  bool nopolicy, bool noxfrm, bool do_cache);
 struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp);
 struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp,
struct sock *sk);
diff --git a/net/ipv4/route.c b/net/ipv4/route.c
index 11096396ef4a..ef140919211f 100644
--- a/net/ipv4/route.c
+++ b/net/ipv4/route.c
@@ -1443,12 +1443,42 @@ static struct rtable *rt_dst_alloc(struct net_device 
*dev,
 (noxfrm ? DST_NOXFRM : 0));
 }
 
+struct rtable *ip_route_new_rtable(struct net_device *dev,
+  unsigned int flags, u16 type,
+  bool nopolicy, bool noxfrm, bool do_cache)
+{
+   struct rtable *rth;
+
+   rth = rt_dst_alloc(dev, nopolicy, noxfrm, do_cache);
+   if (rth) {
+   rth-rt_genid = rt_genid_ipv4(dev_net(dev));
+   rth-rt_flags = flags;
+   rth-rt_type = type;
+   rth-rt_is_input = 0;
+   rth-rt_iif = 0;
+   rth-rt_pmtu = 0;
+   rth-rt_gateway = 0;
+   rth-rt_uses_gateway = 0;
+   INIT_LIST_HEAD(rth-rt_uncached);
+   rth-rt_lwtstate = NULL;
+
+   rth-dst.output = ip_output;
+   if (flags  RTCF_LOCAL)
+   rth-dst.input = ip_local_deliver;
+   }
+
+   return rth;
+}
+EXPORT_SYMBOL(ip_route_new_rtable);
+
 /* called in rcu_read_lock() section */
 static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr,
u8 tos, struct net_device *dev, int our)
 {
struct rtable *rth;
struct in_device *in_dev = __in_dev_get_rcu(dev);
+   unsigned int flags = RTCF_MULTICAST;
+   u16 type = RTN_MULTICAST;
u32 itag = 0;
int err;
 
@@ -1474,8 +1504,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 
daddr, __be32 saddr,
if (err  0)
goto e_err;
}
-   rth = rt_dst_alloc(dev_net(dev)-loopback_dev,
-  IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false);
+   if (our)
+   flags |= RTCF_LOCAL;
+
+   rth = ip_route_new_rtable(dev_net(dev)-loopback_dev,
+ flags, type,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ false, false);
if (!rth)
goto e_nobufs;
 
@@ -1483,22 +1518,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 
daddr, __be32 saddr,
rth-dst.tclassid = itag;
 #endif
rth-dst.output = ip_rt_bug;
-
-   rth-rt_genid   = rt_genid_ipv4(dev_net(dev));
-   rth-rt_flags   = RTCF_MULTICAST;
-   rth-rt_type= RTN_MULTICAST;
rth-rt_is_input= 1;
-   rth-rt_iif = 0;
-   rth-rt_pmtu= 0;
-   rth-rt_gateway = 0;
-   rth-rt_uses_gateway = 0;
-   INIT_LIST_HEAD(rth-rt_uncached);
-   rth-rt_lwtstate = NULL;
-   if (our) {
-   rth-dst.input= ip_local_deliver;
-   rth-rt_flags |= RTCF_LOCAL;
-   }
-
 #ifdef CONFIG_IP_MROUTE
if (!ipv4_is_local_multicast(daddr)  IN_DEV_MFORWARD(in_dev))
rth-dst.input = ip_mr_input;
@@ -1606,28 +1626,17 @@ static int __mkroute_input(struct sk_buff *skb,
}
}
 
-   rth = rt_dst_alloc(out_dev-dev,
-  IN_DEV_CONF_GET(in_dev, NOPOLICY),
-  IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
+   rth = ip_route_new_rtable(out_dev-dev, 0, res-type,
+ IN_DEV_CONF_GET(in_dev, NOPOLICY),
+ IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache);
if (!rth) {
err = -ENOBUFS;
goto cleanup;
}
 
-   rth-rt_genid = rt_genid_ipv4(dev_net(rth-dst.dev));
-   rth-rt_flags = 0;
-   rth-rt_type = res-type;
rth-rt_is_input = 1;
-   rth-rt_iif = 0;
-   rth-rt_pmtu= 0;
-   rth-rt_gateway = 0;
-   rth-rt_uses_gateway = 0;
-   INIT_LIST_HEAD(rth-rt_uncached);
-   

Re: netns refcnt leak for kernel accept sock

2015-07-27 Thread Sowmini Varadhan
On (07/27/15 11:37), Cong Wang wrote:
 
 dlm uses a kernel TCP socket too, but it allocates a new socket and calls
 ->accept() by itself. ;)

sure, and rds does this in rds_tcp_accept_one() too.

But the newsk being created in sk_clone_lock  is the one on an 
incoming syn, i.e., the one that is saved up as part of listen backlog, 
to be returned later on the accept.

I don't know the details of dlm - can you have one dlm instance per
network namespace? That's where I'm running into this issue - when we
try to have one rds listen socket per netns, and want to be able to
do both
- dynamically build/tear down new network namespaces, without
  unloading rds_tcp globally
- unload rds_tcp globally without tearing down individual netns.

But perhaps we digress.

Fundamental issue remains: newsk is the syn_recv version of the
listen socket. If the listen socket is a kernel socket (kern == 1
for sk_alloc, and the listen socket thus has no sk_net_refcnt),
the syn_recv socket must also have that behavior, so that it is
cleaned up in the same way.

--Sowmini

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 3/3] openvswitch: 802.1AD: Flow handling, actions, vlan parsing and netlink attributes

2015-07-27 Thread Pravin Shelar
On Sun, Jul 26, 2015 at 7:52 AM, Thomas F Herbert
thomasfherb...@gmail.com wrote:
 Add support for 802.1ad including the ability to push and pop double
 tagged vlans. Add support for 802.1ad to netlink parsing and flow
 conversion. Uses double nested encap attributes to represent double
 tagged vlan. Inner TPID encoded along with ctci in nested attributes. Allows
 either 0x8100 or 0x88a8 on inner or outer tags.

 Signed-off-by: Thomas F Herbert thomasfherb...@gmail.com
 ---
  net/openvswitch/flow.c |  84 +++---
  net/openvswitch/flow.h |   5 ++
  net/openvswitch/flow_netlink.c | 196 
 ++---
  3 files changed, 243 insertions(+), 42 deletions(-)

 diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c
 index 8db22ef..0abab37 100644
 --- a/net/openvswitch/flow.c
 +++ b/net/openvswitch/flow.c
 @@ -298,21 +298,80 @@ static bool icmp6hdr_ok(struct sk_buff *skb)
  static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key)
  {
 struct qtag_prefix {
 -   __be16 eth_type; /* ETH_P_8021Q */
 +   __be16 eth_type; /* ETH_P_8021Q  or ETH_P_8021AD */
 __be16 tci;
 };
 -   struct qtag_prefix *qp;
 +   struct qtag_prefix *qp = (struct qtag_prefix *)skb-data;

 -   if (unlikely(skb-len  sizeof(struct qtag_prefix) + sizeof(__be16)))
 +   struct qinqtag_prefix {
 +   __be16 eth_type; /* ETH_P_8021Q  or ETH_P_8021AD */
 +   __be16 tci;
 +   __be16 inner_tpid; /* ETH_P_8021Q */
 +   __be16 ctci;
 +   };
 +
 +   if (likely(skb_vlan_tag_present(skb))) {
 +   key-eth.tci = htons(skb-vlan_tci);
 +
 +   /* Case where upstream
 +* processing has already stripped the outer vlan tag.
 +*/
 +   if (unlikely(skb-vlan_proto == htons(ETH_P_8021AD))) {
 +   if (unlikely(skb-len  sizeof(struct qtag_prefix) +
 +   sizeof(__be16))) {
 +   key-eth.tci = 0;
 +   return 0;
 +   }
 +
 +   if (unlikely(!pskb_may_pull(skb,
 +   sizeof(struct 
 qtag_prefix) +
 +   sizeof(__be16 {
 +   return -ENOMEM;
 +   }
 +
No need for curly brackets around a single statement.

 +   if (likely(qp-eth_type == htons(ETH_P_8021Q))) {
 +   key-eth.cvlan.ctci =
 +   qp-tci | htons(VLAN_TAG_PRESENT);
 +   key-eth.cvlan.c_tpid = qp-eth_type;
 +   __skb_pull(skb, sizeof(struct qtag_prefix));
 +   }
key->eth.cvlan.tci and tpid should be set irrespective of qp->eth_type,
as is done below for the non-offload case.

 +   }
 return 0;
 +   }

 -   if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
 -sizeof(__be16
 -   return -ENOMEM;

 -   qp = (struct qtag_prefix *) skb-data;
 -   key-eth.tci = qp-tci | htons(VLAN_TAG_PRESENT);
 -   __skb_pull(skb, sizeof(struct qtag_prefix));
 +   if (qp-eth_type == htons(ETH_P_8021AD)) {
 +   struct qinqtag_prefix *qinqp =
 +   (struct qinqtag_prefix *)skb-data;
 +
 +   if (unlikely(skb-len  sizeof(struct qinqtag_prefix) +
 +   sizeof(__be16)))
 +   return 0;
 +
 +   if (unlikely(!pskb_may_pull(skb, sizeof(struct 
 qinqtag_prefix) +
 +   sizeof(__be16 {
 +   return -ENOMEM;
 +   }
No need for curly brackets around a single statement.

 +   key-eth.tci = qinqp-tci | htons(VLAN_TAG_PRESENT);
 +   key-eth.cvlan.ctci = qinqp-ctci | htons(VLAN_TAG_PRESENT);
 +   key-eth.cvlan.c_tpid = qinqp-inner_tpid;
 +
 +   __skb_pull(skb, sizeof(struct qinqtag_prefix));
 +
 +   return 0;
 +   }
 +   if (qp-eth_type == htons(ETH_P_8021Q)) {
 +   if (unlikely(skb-len  sizeof(struct qtag_prefix) +
 +   sizeof(__be16)))
 +   return -ENOMEM;
 +
 +   if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) +
 +   sizeof(__be16
 +   return 0;
 +   key-eth.tci = qp-tci | htons(VLAN_TAG_PRESENT);
 +
 +   __skb_pull(skb, sizeof(struct qtag_prefix));
 +   }

 return 0;
  }
 @@ -474,9 +533,10 @@ static int key_extract(struct sk_buff *skb, struct 
 sw_flow_key *key)
  */

 key-eth.tci = 0;
 -   if 

Re: [RFC PATCH v4 net-next 1/4] tcp: replace cnt rtt with struct in pkts_acked()

2015-07-27 Thread Lawrence Brakmo


On 7/27/15, 11:46 AM, Stephen Hemminger step...@networkplumber.org
wrote:

On Fri, 24 Jul 2015 19:47:03 -0700
Lawrence Brakmo bra...@fb.com wrote:

 Replace 2 arguments (cnt and rtt) in the congestion control modules'
 pkts_acked() function with a struct. This will allow adding more
 information without having to modify existing congestion control
 modules (tcp_nv in particular needs bytes in flight when packet
 was sent).
 
 As proposed by Neal Cardwell in his comments to the tcp_nv patch.

Adding a layer of indirection makes code changes easier, but makes
the code slower. Arguments are passed in registers, and putting an
additional level of indirection only matters if you can't change
all the CC modules. Since this is the kernel and API compatibility
doesn't matter, just pass more arguments.

I prefer the cleanliness of passing a structure and don't think the
overhead will be significant enough to worry about it.
Will the compiler pass struct values in registers if the struct is
passed by value?

I will be happy to do it either way (I did it like Stephen proposes
originally). What does everyone else think?

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 13/16] net: Introduce VRF device driver - v2

2015-07-27 Thread Nikolay Aleksandrov
On 07/27/2015 08:31 PM, David Ahern wrote:
 This driver borrows heavily from IPvlan and teaming drivers.
 
 Routing domains (VRF-lite) are created by instantiating a VRF master
 device with an associated table and enslaving all routed interfaces that
 participate in the domain. As part of the enslavement, all connected
 routes for the enslaved devices are moved to the table associated with
 the VRF device. Outgoing sockets must bind to the VRF device to function.
 
 Standard FIB rules bind the VRF device to tables and regular fib rule
 processing is followed. Routed traffic through the box, is forwarded by
 using the VRF device as the IIF and following the IIF rule to a table
 that is mated with the VRF.
 
 Example:
 
Create vrf 1:
  ip link add vrf1 type vrf table 5
  ip rule add iif vrf1 table 5
  ip rule add oif vrf1 table 5
  ip route add table 5 prohibit default
  ip link set vrf1 up
 
Add interface to vrf 1:
  ip link set eth1 master vrf1
 
 Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
 Signed-off-by: David Ahern d...@cumulusnetworks.com
 
 v2:
 - addressed comments from first RFC
 - significant changes to improve simplicity of implementation
 ---
  drivers/net/Kconfig  |   7 +
  drivers/net/Makefile |   1 +
  drivers/net/vrf.c| 596 
 +++
  3 files changed, 604 insertions(+)
  create mode 100644 drivers/net/vrf.c
 
 diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig
 index c18f9e62a9fa..e58468b02987 100644
 --- a/drivers/net/Kconfig
 +++ b/drivers/net/Kconfig
 @@ -297,6 +297,13 @@ config NLMON
 diagnostics, etc. This is mostly intended for developers or support
 to debug netlink issues. If unsure, say N.
  
 +config NET_VRF
 + tristate Virtual Routing and Forwarding (Lite)
 + depends on IP_MULTIPLE_TABLES  IPV6_MULTIPLE_TABLES
 + ---help---
 +   This option enables the support for mapping interfaces into VRF's. The
 +   support enables VRF devices.
 +
  endif # NET_CORE
  
  config SUNGEM_PHY
 diff --git a/drivers/net/Makefile b/drivers/net/Makefile
 index c12cb22478a7..ca16dd689b36 100644
 --- a/drivers/net/Makefile
 +++ b/drivers/net/Makefile
 @@ -25,6 +25,7 @@ obj-$(CONFIG_VIRTIO_NET) += virtio_net.o
  obj-$(CONFIG_VXLAN) += vxlan.o
  obj-$(CONFIG_GENEVE) += geneve.o
  obj-$(CONFIG_NLMON) += nlmon.o
 +obj-$(CONFIG_NET_VRF) += vrf.o
  
  #
  # Networking Drivers
 diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c
 new file mode 100644
 index ..8669b0f9d749
 --- /dev/null
 +++ b/drivers/net/vrf.c
 @@ -0,0 +1,596 @@
 +/*
 + * vrf.c: device driver to encapsulate a VRF space
 + *
 + * Copyright (c) 2015 Cumulus Networks
 + *
 + * Based on dummy, team and ipvlan drivers
 + *
 + * This program is free software; you can redistribute it and/or modify
 + * it under the terms of the GNU General Public License as published by
 + * the Free Software Foundation; either version 2 of the License, or
 + * (at your option) any later version.
 + */
 +
 +#include linux/module.h
 +#include linux/kernel.h
 +#include linux/netdevice.h
 +#include linux/etherdevice.h
 +#include linux/ip.h
 +#include linux/init.h
 +#include linux/moduleparam.h
 +#include linux/rtnetlink.h
 +#include net/rtnetlink.h
 +#include linux/u64_stats_sync.h
 +#include linux/hashtable.h
 +
 +#include linux/inetdevice.h
 +#include net/ip.h
 +#include net/ip_fib.h
 +#include net/ip6_route.h
 +#include net/rtnetlink.h
 +#include net/route.h
 +#include net/addrconf.h
 +#include net/vrf.h
 +
 +#define DRV_NAME vrf
 +#define DRV_VERSION  1.0
 +
 +#define vrf_is_slave(dev)   ((dev)-flags  IFF_SLAVE)
 +#define vrf_is_master(dev)  ((dev)-flags  IFF_MASTER)
 +
 +#define vrf_master_get_rcu(dev) \
 + ((struct net_device *)rcu_dereference(dev-rx_handler_data))
 +
 +struct pcpu_dstats {
 + u64 tx_pkts;
 + u64 tx_bytes;
 + u64 tx_drps;
 + u64 rx_pkts;
 + u64 rx_bytes;
 + struct u64_stats_sync   syncp;
 +};
 +
 +struct slave {
 + struct list_headlist;
 + struct net_device   *dev;
 +};
 +
 +struct slave_queue {
 + spinlock_t  lock; /* lock for slave insert/delete */

I don't think you actually need this lock, since all VRF dev operations are done
under RTNL, so you already have protection against add/del running concurrently.
It would simplify the code if you can get rid of it.

 + struct list_headall_slaves;
 + int num_slaves;
 +};
 +
 +struct net_vrf {
 + struct slave_queue  queue;
 + struct fib_table*tb;
 + u32 tb_id;
 +};
 +
 +static bool is_ip_rx_frame(struct sk_buff *skb)
 +{
 + switch (skb-protocol) {
 + case htons(ETH_P_IP):
 + case htons(ETH_P_IPV6):
 + return true;
 + }
 + return false;
 +}
 +
 +/* note: already called with 

[net PATCH] fib_trie: Drop unnecessary calls to leaf_pull_suffix

2015-07-27 Thread Alexander Duyck
It was reported that update_suffix was taking a long time on systems where
a large number of leaves were attached to a single node.  As it turns out
fib_table_flush was calling update_suffix for each leaf that didn't have all
of the aliases stripped from it.  As a result, on this large node removing
one leaf would result in us calling update_suffix for every other leaf on
the node.

The fix is to just remove the calls to leaf_pull_suffix since they are
redundant as we already have a call in resize that will go through and
update the suffix length for the node before we exit out of
fib_table_flush or fib_table_flush_external.

Reported-by: David Ahern d...@cumulusnetworks.com
Signed-off-by: Alexander Duyck alexander.h.du...@redhat.com
---

This patch should apply to linux-4.1.y and newer kernels.

I've done a bit of testing on my system and I no longer see update_suffix
dominating the performance traces.  David if you can test with this patch
to see if you still see the issue I would appreciate it.

 net/ipv4/fib_trie.c |4 
 1 file changed, 4 deletions(-)

diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index ef90d73911de..70168ca4716b 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1791,8 +1791,6 @@ void fib_table_flush_external(struct fib_table *tb)
if (hlist_empty(n-leaf)) {
put_child_root(pn, n-key, NULL);
node_free(n);
-   } else {
-   leaf_pull_suffix(pn, n);
}
}
 }
@@ -1862,8 +1860,6 @@ int fib_table_flush(struct fib_table *tb)
if (hlist_empty(n-leaf)) {
put_child_root(pn, n-key, NULL);
node_free(n);
-   } else {
-   leaf_pull_suffix(pn, n);
}
}
 

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 8/8] can: replace timestamp as unique skb attribute

2015-07-27 Thread Oliver Hartkopp

Hello Greg,

On 12.07.2015 21:18, Marc Kleine-Budde wrote:

From: Oliver Hartkopp socket...@hartkopp.net

Commit 514ac99c64b "can: fix multiple delivery of a single CAN frame for
overlapping CAN filters" requires the skb->tstamp to be set to check for
identical CAN skbs.

Without timestamping being required by user space applications this timestamp
was not generated, which led to commit 36c01245eb8 "can: fix loss of CAN frames
in raw_rcv" - which forces the timestamp to be set in all CAN related skbuffs
by introducing several __net_timestamp() calls.

This forces e.g. out of tree drivers which are not using alloc_can{,fd}_skb()
to add __net_timestamp() after skbuff creation to prevent the frame loss fixed
in mainline Linux.

This patch removes the timestamp dependency and uses an atomic counter to
create a unique identifier together with the skbuff pointer.

Btw: the new skbcnt element introduced in struct can_skb_priv has to be
initialized with zero in out-of-tree drivers which are not using
alloc_can{,fd}_skb() too.

Signed-off-by: Oliver Hartkopp socket...@hartkopp.net
Cc: linux-stable sta...@vger.kernel.org


Can you please queue up this missing/lost patch for the long term 4.1.x ?

It fixes the mess with commits

514ac99c64b can: fix multiple delivery of a single CAN frame for overlapping 
CAN filters


which originally fixed

36c01245eb8 can: fix loss of CAN frames in raw_rcv

So finally this missing patch would bring 4.1.x into the proper state we now 
have in 4.2-rc4.


Upstream commit of this patch is:

http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d3b58c47d330de8c29898fe9746f7530408f8a59

Best regards,
Oliver



Signed-off-by: Marc Kleine-Budde m...@pengutronix.de
---
  drivers/net/can/dev.c   |  7 ++-
  drivers/net/can/slcan.c |  2 +-
  drivers/net/can/vcan.c  |  3 ---
  include/linux/can/skb.h |  2 ++
  net/can/af_can.c| 12 +++-
  net/can/bcm.c   |  2 ++
  net/can/raw.c   |  7 ---
  7 files changed, 18 insertions(+), 17 deletions(-)

diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c
index e9b1810d319f..aede704605c6 100644
--- a/drivers/net/can/dev.c
+++ b/drivers/net/can/dev.c
@@ -440,9 +440,6 @@ unsigned int can_get_echo_skb(struct net_device *dev, 
unsigned int idx)
struct can_frame *cf = (struct can_frame *)skb-data;
u8 dlc = cf-can_dlc;

-   if (!(skb-tstamp.tv64))
-   __net_timestamp(skb);
-
netif_rx(priv-echo_skb[idx]);
priv-echo_skb[idx] = NULL;

@@ -578,7 +575,6 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, 
struct can_frame **cf)
if (unlikely(!skb))
return NULL;

-   __net_timestamp(skb);
skb-protocol = htons(ETH_P_CAN);
skb-pkt_type = PACKET_BROADCAST;
skb-ip_summed = CHECKSUM_UNNECESSARY;
@@ -589,6 +585,7 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, 
struct can_frame **cf)

can_skb_reserve(skb);
can_skb_prv(skb)-ifindex = dev-ifindex;
+   can_skb_prv(skb)-skbcnt = 0;

*cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame));
memset(*cf, 0, sizeof(struct can_frame));
@@ -607,7 +604,6 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,
if (unlikely(!skb))
return NULL;

-   __net_timestamp(skb);
skb-protocol = htons(ETH_P_CANFD);
skb-pkt_type = PACKET_BROADCAST;
skb-ip_summed = CHECKSUM_UNNECESSARY;
@@ -618,6 +614,7 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev,

can_skb_reserve(skb);
can_skb_prv(skb)-ifindex = dev-ifindex;
+   can_skb_prv(skb)-skbcnt = 0;

*cfd = (struct canfd_frame *)skb_put(skb, sizeof(struct canfd_frame));
memset(*cfd, 0, sizeof(struct canfd_frame));
diff --git a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c
index f64f5290d6f8..a23a7af8eb9a 100644
--- a/drivers/net/can/slcan.c
+++ b/drivers/net/can/slcan.c
@@ -207,7 +207,6 @@ static void slc_bump(struct slcan *sl)
if (!skb)
return;

-   __net_timestamp(skb);
skb-dev = sl-dev;
skb-protocol = htons(ETH_P_CAN);
skb-pkt_type = PACKET_BROADCAST;
@@ -215,6 +214,7 @@ static void slc_bump(struct slcan *sl)

can_skb_reserve(skb);
can_skb_prv(skb)-ifindex = sl-dev-ifindex;
+   can_skb_prv(skb)-skbcnt = 0;

memcpy(skb_put(skb, sizeof(struct can_frame)),
   cf, sizeof(struct can_frame));
diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c
index 0ce868de855d..674f367087c5 100644
--- a/drivers/net/can/vcan.c
+++ b/drivers/net/can/vcan.c
@@ -78,9 +78,6 @@ static void vcan_rx(struct sk_buff *skb, struct net_device 
*dev)
skb-dev   = dev;
skb-ip_summed = CHECKSUM_UNNECESSARY;

-   if (!(skb-tstamp.tv64))
-   __net_timestamp(skb);
-
netif_rx_ni(skb);
  }

diff --git a/include/linux/can/skb.h 

Re: [PATCH 02/10] dpaa_eth: add support for DPAA Ethernet

2015-07-27 Thread Scott Wood
On Fri, 2015-07-24 at 10:45 -0500, Bucur Madalin-Cristian-B32716 wrote:
  -Original Message-
  From: Joe Perches [mailto:j...@perches.com]
  On Wed, 2015-07-22 at 19:16 +0300, Madalin Bucur wrote:
   +static int __init dpa_load(void)
   +{
  []
   + err = platform_driver_register(dpa_driver);
   + if (unlikely(err  0)) {
   + pr_err(KBUILD_MODNAME
   + : %s:%hu:%s(): platform_driver_register() = %d\n,
   + KBUILD_BASENAME .c, __LINE__, __func__, err);
   + }
   +
   + pr_debug(KBUILD_MODNAME : %s:%s() -\n,
   +  KBUILD_BASENAME .c, __func__);
  
  Perhaps these should use pr_fmt
 
 Agree.

How about dropping all that complexity, and just using pr_debug("%s\n",
__func__), or dev_dbg where possible?

 
   +static void __exit dpa_unload(void)
   +{
   + pr_debug(KBUILD_MODNAME : - %s:%s()\n,
   +  KBUILD_BASENAME .c, __func__);
  
  dynamic debug has __func__ available and perhaps
  the function tracer might be used instead.
  
   diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
  b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h
  []
   +#define __hot
  
  curious.
  
  Maybe it'd be good to add a real __hot to compiler.h
 
 They're mostly there to make readers aware the code is critical, any
 changes could mess performance.

Mostly or entirely?  Why not just use a comment, which could also point out 
specific things that were done for performance?

-Scott

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel warning in tcp_fragment

2015-07-27 Thread Jovi Zhangwei
ping...

On Wed, Jul 22, 2015 at 11:55 AM, Jovi Zhangwei j...@cloudflare.com wrote:
 Hi Neal and Martin,

 Sorry for disturbing you; our production systems (3.14 and 3.18 stable
 kernels) have many tcp_fragment warnings,
 the trace is the same as the one below, which you discussed before.

 http://comments.gmane.org/gmane.linux.network/365658

 But I didn't find the final solution in that mail thread, do you have
 any new ideas or patches on this warning?

 Great thanks.


 [5184217.672290] WARNING: CPU: 9 PID: 2801 at
 net/ipv4/tcp_output.c:1081 tcp_fragment+0x34/0x230()
 [5184217.680995] Modules linked in: sfc_char(O) sfc_resource(O)
 sfc_affinity(O) nf_conntrack_netlink xt_connlimit xt_length xt_bpf
 xt_hashlimit iptable_nat nf_nat_ipv4 nf_nat iptable_mangle xt_comment
 ip6table_security ip6table_mangle ip_set_hash_netport 8021q garp bridg
 e stp llc ipmi_devintf nf_conntrack_ipv6 nf_defrag_ipv6
 ip6table_filter ip6table_raw ip6_tables nf_conntrack_ipv4
 nf_defrag_ipv4 xt_NFLOG nfnetlink_log xt_conntrack iptable_filter
 xt_tcpudp xt_multiport xt_CT nf_conntrack xt_set iptable_raw ip_tables
 x_tables ip_set_hash
 _net ip_set_hash_ip ip_set nfnetlink rpcsec_gss_krb5 auth_rpcgss
 oid_registry nfsv4 fuse nfsv3 nfs_acl nfs fscache lockd sunrpc
 tcp_cubic sg sfc(O) mtd mdio igb dca i2c_algo_bit ptp pps_core sd_mod
 crct10dif_generic crc_t10dif crct10dif_common x86_pkg_temp_thermal
 acpi_c
 pufreq coretemp kvm_intel kvm crc32c_intel aesni_intel ablk_helper
 cryptd lrw gf128mul glue_helper aes_x86_64 ahci libahci ehci_pci
 libata ehci_hcd i2c_i801 i2c_core lpc_ich mfd_core usbcore scsi_mod
 usb_common wmi evdev ipmi_si ipmi_msghandler tpm_tis tpm acpi_pad
 proce
 ssor thermal_sys button
 [5184217.684098] CPU: 9 PID: 2801 Comm: rrdns Tainted: GW  O
 3.14.28-cloudflare #1
 [5184217.684099] Hardware name: Quanta Computer Inc QuantaPlex
 T41S-2U/S2S-MB, BIOS S2S_3A14  09/18/2014
 [5184217.684100]   81466263 
 8103bb34
 [5184217.684101]  813e07f2 8818abebcc00 004a
 0002
 [5184217.684102]  0060 813e07f2 00304120
 8818abebcc00
 [5184217.684104] Call Trace:
 [5184217.684105]  IRQ  [81466263] ? dump_stack+0x41/0x51
 [5184217.684111]  [8103bb34] ? warn_slowpath_common+0x74/0x89
 [5184217.684115]  [813e07f2] ? tcp_fragment+0x34/0x230
 [5184217.684118]  [813e07f2] ? tcp_fragment+0x34/0x230
 [5184217.684119]  [813d98b7] ? tcp_mark_head_lost+0x1bd/0x1d5
 [5184217.684123]  [813ddb71] ? tcp_fastretrans_alert+0x69f/0x71d
 [5184217.684125]  [813de567] ? tcp_ack+0x90f/0xb16
 [5184217.684126]  [813df618] ? tcp_rcv_state_process+0x5bd/0x9b8
 [5184217.684128]  [8106d9c0] ? __wake_up_sync_key+0x3a/0x4d
 [5184217.684130]  [813920ed] ? sk_wake_async+0x17/0x34
 [5184217.684133]  [81440d13] ? ipv6_skip_exthdr+0x28/0xc7
 [5184217.684139]  [81418db6] ? NF_HOOK_THRESH.constprop.11+0x4a/0x4a
 [5184217.684143]  [81435abe] ? tcp_v6_do_rcv+0x3ac/0x4f1
 [5184217.684146]  [81435eec] ? tcp_v6_rcv+0x2e9/0x554
 [5184217.684148]  [813c70d3] ? nf_hook_slow+0x66/0xf1
 [5184217.684150]  [81418db6] ? NF_HOOK_THRESH.constprop.11+0x4a/0x4a
 [5184217.684167]  [81418f70] ? ip6_input_finish+0x1ba/0x2a7
 [5184217.684169]  [813a1c12] ? __netif_receive_skb_core+0x422/0x494
 [5184217.684172]  [813a283a] ? netif_receive_skb_internal+0x37/0x6d
 [5184217.684188]  [a09a2e40] ? efx_ssr_try_merge+0x336/0x34e [sfc]
 [5184217.684215]  [a09a4075] ? __efx_ssr_end_of_burst+0x3e/0xd2 
 [sfc]
 [5184217.684225]  [a098e3bd] ? efx_process_channel+0x5d/0x71 [sfc]
 [5184217.684243]  [a098f557] ? efx_poll+0x6d/0x16b [sfc]
 [5184217.684248]  [813a2e27] ? net_rx_action+0xc6/0x191
 [5184217.684250]  [8103f7ee] ? __do_softirq+0x100/0x27c
 [5184217.684254]  [8103fae6] ? irq_exit+0x51/0xbc
 [5184217.684255]  [81003e35] ? do_IRQ+0x9d/0xb4
 [5184217.684258]  [8146992a] ? common_interrupt+0x6a/0x6a
 [5184217.684261]  EOI 4[5184217.684263] ---[ end trace 4f42d23abf1c890e 
 ]---
 [5184217.684460] [ cut here ]
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 1/1] Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver

2015-07-27 Thread Woojung.Huh
This patch adds a driver for the LAN7800 family of USB 2.0 & USB 3.0 to Gigabit
Ethernet.
- remove the module param that can be configured by the standard mechanism.
- remove other module params except msg_level per review comment.

Signed-off-by: Woojung Huh woojung@microchip.com
---
 drivers/net/usb/Kconfig   |   10 +
 drivers/net/usb/Makefile  |1 +
 drivers/net/usb/lan78xx.c | 3517 +
 drivers/net/usb/lan78xx.h | 1069 ++
 4 files changed, 4597 insertions(+)
 create mode 100644 drivers/net/usb/lan78xx.c
 create mode 100644 drivers/net/usb/lan78xx.h

diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig
index 7ba8d08..1610b79 100644
--- a/drivers/net/usb/Kconfig
+++ b/drivers/net/usb/Kconfig
@@ -106,6 +106,16 @@ config USB_RTL8152
  To compile this driver as a module, choose M here: the
  module will be called r8152.
 
+config USB_LAN78XX
+   tristate Microchip LAN78XX Based USB Ethernet Adapters
+   select MII
+   help
+ This option adds support for Microchip LAN78XX based USB 2
+  USB 3 10/100/1000 Ethernet adapters.
+
+ To compile this driver as a module, choose M here: the
+ module will be called lan78xx.
+
 config USB_USBNET
tristate Multi-purpose USB Networking Framework
select MII
diff --git a/drivers/net/usb/Makefile b/drivers/net/usb/Makefile
index e2797f1..cf6a0e6 100644
--- a/drivers/net/usb/Makefile
+++ b/drivers/net/usb/Makefile
@@ -8,6 +8,7 @@ obj-$(CONFIG_USB_PEGASUS)   += pegasus.o
 obj-$(CONFIG_USB_RTL8150)  += rtl8150.o
 obj-$(CONFIG_USB_RTL8152)  += r8152.o
 obj-$(CONFIG_USB_HSO)  += hso.o
+obj-$(CONFIG_USB_LAN78XX)  += lan78xx.o
 obj-$(CONFIG_USB_NET_AX8817X)  += asix.o
 asix-y := asix_devices.o asix_common.o ax88172a.o
 obj-$(CONFIG_USB_NET_AX88179_178A)  += ax88179_178a.o
diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c
new file mode 100644
index 000..516722f
--- /dev/null
+++ b/drivers/net/usb/lan78xx.c
@@ -0,0 +1,3517 @@
+/*
+ * Copyright (C) 2015 Microchip Technology
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, see http://www.gnu.org/licenses/.
+ */
+#include linux/version.h
+#include linux/module.h
+#include linux/netdevice.h
+#include linux/etherdevice.h
+#include linux/ethtool.h
+#include linux/mii.h
+#include linux/usb.h
+#include linux/crc32.h
+#include linux/signal.h
+#include linux/slab.h
+#include linux/if_vlan.h
+#include linux/uaccess.h
+#include linux/list.h
+#include linux/ip.h
+#include linux/ipv6.h
+#include linux/mdio.h
+#include net/ip6_checksum.h
+#include lan78xx.h
+
+#define DRIVER_AUTHOR  WOOJUNG HUH woojung@microchip.com
+#define DRIVER_DESCLAN78XX USB 3.0 Gigabit Ethernet Devices
+#define DRIVER_NAMElan78xx
+#define DRIVER_VERSION 1.0.0
+
+#define TX_TIMEOUT_JIFFIES (5 * HZ)
+#define THROTTLE_JIFFIES   (HZ / 8)
+#define UNLINK_TIMEOUT_MS  3
+
+#define RX_MAX_QUEUE_MEMORY(60 * 1518)
+
+#define SS_USB_PKT_SIZE(1024)
+#define HS_USB_PKT_SIZE(512)
+#define FS_USB_PKT_SIZE(64)
+
+#define MAX_RX_FIFO_SIZE   (12 * 1024)
+#define MAX_TX_FIFO_SIZE   (12 * 1024)
+#define DEFAULT_BURST_CAP_SIZE (MAX_TX_FIFO_SIZE)
+#define DEFAULT_BULK_IN_DELAY  (0x0800)
+#define MAX_SINGLE_PACKET_SIZE (9000)
+#define DEFAULT_TX_CSUM_ENABLE (true)
+#define DEFAULT_RX_CSUM_ENABLE (true)
+#define DEFAULT_TSO_CSUM_ENABLE(true)
+#define DEFAULT_VLAN_FILTER_ENABLE (true)
+#define INTERNAL_PHY_ID(2) /* 2: GMII */
+#define TX_OVERHEAD(8)
+#define RXW_PADDING2
+
+#define LAN78XX_USB_VENDOR_ID  (0x0424)
+#define LAN7800_USB_PRODUCT_ID (0x7800)
+#define LAN7850_USB_PRODUCT_ID (0x7850)
+#define LAN78XX_EEPROM_MAGIC   (0x78A5)
+#define LAN78XX_OTP_MAGIC  (0x78F3)
+
+#defineMII_READ1
+#defineMII_WRITE   0
+
+#define EEPROM_INDICATOR   (0xA5)
+#define EEPROM_MAC_OFFSET  (0x01)
+#define MAX_EEPROM_SIZE512
+#define OTP_INDICATOR_1(0xF3)
+#define OTP_INDICATOR_2(0xF7)
+

[PATCH net-next 02/16] net: export a few FIB functions

2015-07-27 Thread David Ahern
Required by the VRF driver.

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 net/ipv4/fib_frontend.c | 2 ++
 net/ipv4/fib_trie.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c
index 6b98de0d7949..c565fc182240 100644
--- a/net/ipv4/fib_frontend.c
+++ b/net/ipv4/fib_frontend.c
@@ -108,6 +108,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id)
hlist_add_head_rcu(tb-tb_hlist, net-ipv4.fib_table_hash[h]);
return tb;
 }
+EXPORT_SYMBOL_GPL(fib_new_table);
 
 /* caller must hold either rtnl or rcu read lock */
 struct fib_table *fib_get_table(struct net *net, u32 id)
@@ -127,6 +128,7 @@ struct fib_table *fib_get_table(struct net *net, u32 id)
}
return NULL;
 }
+EXPORT_SYMBOL_GPL(fib_get_table);
 #endif /* CONFIG_IP_MULTIPLE_TABLES */
 
 static void fib_replace_table(struct net *net, struct fib_table *old,
diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c
index 15d32612e3c6..ac2d828c6daa 100644
--- a/net/ipv4/fib_trie.c
+++ b/net/ipv4/fib_trie.c
@@ -1887,6 +1887,7 @@ void fib_free_table(struct fib_table *tb)
 {
call_rcu(tb-rcu, __trie_free_rcu);
 }
+EXPORT_SYMBOL_GPL(fib_free_table);
 
 static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb,
 struct sk_buff *skb, struct netlink_callback *cb)
-- 
2.3.2 (Apple Git-55)

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH net-next 03/16] net: Introduce VRF related flags and helpers

2015-07-27 Thread David Ahern
Add a VRF_MASTER flag for interfaces and helper functions for determining
if a device is a VRF_MASTER.

Add link attribute for passing VRF_TABLE id.

Add vrf_ptr to netdevice.

Add various macros for determining if a device is a VRF device, the index
of the master VRF device and table associated with VRF device.

Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com
Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 include/linux/netdevice.h| 21 +++
 include/net/vrf.h| 83 
 include/uapi/linux/if_link.h |  9 +
 3 files changed, 113 insertions(+)
 create mode 100644 include/net/vrf.h

diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h
index 607b5f41f46f..81cbaf78 100644
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -1289,6 +1289,7 @@ enum netdev_priv_flags {
IFF_XMIT_DST_RELEASE_PERM   = 122,
IFF_IPVLAN_MASTER   = 123,
IFF_IPVLAN_SLAVE= 124,
+   IFF_VRF_MASTER  = 125,
 };
 
 #define IFF_802_1Q_VLANIFF_802_1Q_VLAN
@@ -1316,6 +1317,7 @@ enum netdev_priv_flags {
 #define IFF_XMIT_DST_RELEASE_PERM  IFF_XMIT_DST_RELEASE_PERM
 #define IFF_IPVLAN_MASTER  IFF_IPVLAN_MASTER
 #define IFF_IPVLAN_SLAVE   IFF_IPVLAN_SLAVE
+#define IFF_VRF_MASTER IFF_VRF_MASTER
 
 /**
  * struct net_device - The DEVICE structure.
@@ -1432,6 +1434,7 @@ enum netdev_priv_flags {
  * @dn_ptr:DECnet specific data
  * @ip6_ptr:   IPv6 specific data
  * @ax25_ptr:  AX.25 specific data
+ * @vrf_ptr:   VRF specific data
  * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering
  *
  * @last_rx:   Time of last Rx
@@ -1650,6 +1653,7 @@ struct net_device {
struct dn_dev __rcu *dn_ptr;
struct inet6_dev __rcu  *ip6_ptr;
void*ax25_ptr;
+   struct net_vrf_dev  *vrf_ptr;
struct wireless_dev *ieee80211_ptr;
struct wpan_dev *ieee802154_ptr;
 #if IS_ENABLED(CONFIG_MPLS_ROUTING)
@@ -3808,6 +3812,23 @@ static inline bool netif_supports_nofcs(struct 
net_device *dev)
return dev->priv_flags & IFF_SUPP_NOFCS;
 }
 
+static inline bool netif_is_vrf(const struct net_device *dev)
+{
+   return dev->priv_flags & IFF_VRF_MASTER;
+}
+
+static inline bool netif_index_is_vrf(struct net *net, int ifindex)
+{
+   struct net_device *dev = dev_get_by_index(net, ifindex);
+   bool rc = false;
+
+   if (dev) {
+   rc = netif_is_vrf(dev);
+   dev_put(dev);
+   }
+   return rc;
+}
+
 /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */
 static inline void netif_keep_dst(struct net_device *dev)
 {
diff --git a/include/net/vrf.h b/include/net/vrf.h
new file mode 100644
index ..743a172ee849
--- /dev/null
+++ b/include/net/vrf.h
@@ -0,0 +1,83 @@
+/*
+ * include/net/net_vrf.h - adds vrf dev structure definitions
+ * Copyright (c) 2015 Cumulus Networks
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+
+#ifndef __LINUX_NET_VRF_H
+#define __LINUX_NET_VRF_H
+
+struct net_vrf_dev {
+   int ifindex; /* ifindex of master dev */
+   u32 tb_id;   /* table id for VRF */
+};
+
+#if IS_ENABLED(CONFIG_NET_VRF)
+static inline int vrf_master_dev_ifindex(const struct net_device *dev)
+{
+   struct net_vrf_dev *vrf_ptr;
+   int ifindex = 0;
+
+   if (!dev)
+   return 0;
+
+   if (netif_is_vrf(dev))
+   ifindex = dev->ifindex;
+   else {
+   vrf_ptr = rcu_dereference(dev->vrf_ptr);
+   if (vrf_ptr)
+   ifindex = vrf_ptr->ifindex;
+   }
+
+   return ifindex;
+}
+
+static inline int vrf_get_master_dev_ifindex(struct net *net, int ifindex)
+{
+   int rc = 0;
+
+   if (ifindex) {
+   struct net_device *dev = dev_get_by_index(net, ifindex);
+
+   if (dev) {
+   rc = vrf_master_dev_ifindex(dev);
+   dev_put(dev);
+   }
+   }
+   return rc;
+}
+
+static inline int vrf_dev_table(const struct net_device *dev)
+{
+   int tb_id = 0;
+
+   if (dev) {
+   struct net_vrf_dev *vrf_ptr = rcu_dereference(dev->vrf_ptr);
+
+   if (vrf_ptr)
+   tb_id = vrf_ptr->tb_id;
+   }
+   return tb_id;
+}
+#else
+static inline int vrf_master_dev_ifindex(const struct net_device *dev)
+{
+   return 0;
+}
+
+static inline int vrf_get_master_dev_ifindex(struct net *net, int ifindex)
+{
+   return 0;
+}
+
+static inline int 

[PATCH net-next 15/16] net: Add chvrf command

2015-07-27 Thread David Ahern
Example of how to use the default bind to interface option for tasks and
correlate with VRF devices.

Signed-off-by: David Ahern d...@cumulusnetworks.com
---
 tools/net/Makefile |   6 +-
 tools/net/chvrf.c  | 225 +
 2 files changed, 229 insertions(+), 2 deletions(-)
 create mode 100644 tools/net/chvrf.c

diff --git a/tools/net/Makefile b/tools/net/Makefile
index ee577ea03ba5..c13f11f5637a 100644
--- a/tools/net/Makefile
+++ b/tools/net/Makefile
@@ -10,7 +10,7 @@ YACC = bison
 %.lex.c: %.l
$(LEX) -o $@ $<
 
-all : bpf_jit_disasm bpf_dbg bpf_asm
+all : bpf_jit_disasm bpf_dbg bpf_asm chvrf
 
 bpf_jit_disasm : CFLAGS = -Wall -O2 -DPACKAGE='bpf_jit_disasm'
 bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl
@@ -25,8 +25,10 @@ bpf_asm : LDLIBS =
 bpf_asm : bpf_asm.o bpf_exp.yacc.o bpf_exp.lex.o
 bpf_exp.lex.o : bpf_exp.yacc.c
 
+chvrf : CFLAGS = -Wall -O2
+
 clean :
-   rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.*
+   rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.* 
chvrf
 
 install :
install bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm
diff --git a/tools/net/chvrf.c b/tools/net/chvrf.c
new file mode 100644
index ..71cc925fd101
--- /dev/null
+++ b/tools/net/chvrf.c
@@ -0,0 +1,225 @@
+/*
+ * chvrf.c - Example of how to use the default bind-to-device option for
+ *   tasks and correlate to VRFs via the VRF device.
+ *
+ * Copyright (c) 2015 Cumulus Networks
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License as published by
+ * the Free Software Foundation; either version 2 of the License, or
+ * (at your option) any later version.
+ */
+#include <sys/ioctl.h>
+#include <sys/prctl.h>
+#include <sys/socket.h>
+#include <signal.h>
+#include <string.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <unistd.h>
+#include <netinet/in.h>
+#include <net/if.h> /* for struct ifreq  */
+#include <libgen.h>
+#include <errno.h>
+
+#ifndef PR_SET_SK_BIND_DEV_IF
+#define PR_SET_SK_BIND_DEV_IF   47
+#endif
+#ifndef PR_GET_SK_BIND_DEV_IF
+#define PR_GET_SK_BIND_DEV_IF   48
+#endif
+
+static int vrf_to_device(int vrf)
+{
+   struct ifreq ifdata;
+   int sd, rc;
+
+   memset(&ifdata, 0, sizeof(ifdata));
+   snprintf(ifdata.ifr_name, sizeof(ifdata.ifr_name) - 1, "vrf%d", vrf);
+
+   sd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+   if (sd < 0) {
+   perror("socket failed");
+   return -1;
+   }
+
+   /* Get the index for the specified interface */
+   rc = ioctl(sd, SIOCGIFINDEX, (char *)&ifdata);
+   close(sd);
+   if (rc != 0) {
+   perror("ioctl(SIOCGIFINDEX) failed");
+   return -1;
+   }
+
+   return ifdata.ifr_ifindex;
+}
+
+static int device_to_vrf(int idx)
+{
+   struct ifreq ifdata;
+   int sd, vrf, rc;
+
+   memset(&ifdata, 0, sizeof(ifdata));
+   ifdata.ifr_ifindex = idx;
+
+   sd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP);
+   if (sd < 0) {
+   perror("socket failed");
+   return -1;
+   }
+
+   /* Get the index for the specified interface */
+   rc = ioctl(sd, SIOCGIFNAME, (char *)&ifdata);
+   close(sd);
+   if (rc != 0) {
+   perror("ioctl(SIOCGIFNAME) failed");
+   return -1;
+   }
+
+   if (sscanf(ifdata.ifr_name, "vrf%d", &vrf) != 1) {
+   fprintf(stderr, "Unexpected device name (%s)\n", 
ifdata.ifr_name);
+   vrf = -1;
+   }
+
+   return vrf;
+}
+
+static int set_vrf(int vrf)
+{
+   int idx;
+   long err;
+
+   /* convert vrf to device index */
+   idx = vrf_to_device(vrf);
+   if (idx < 0) {
+   fprintf(stderr, "Failed to get device index for vrf %d\n", vrf);
+   return -1;
+   }
+
+   /* set default device bind */
+   err = prctl(PR_SET_SK_BIND_DEV_IF, idx);
+   if (err < 0) {
+   fprintf(stderr, "prctl failed to device index: %d\n", errno);
+   return -1;
+   }
+
+   return 0;
+}
+
+/* get vrf context for given process id */
+static int get_vrf(pid_t pid)
+{
+   int vrf;
+   long err;
+
+   /* lookup device index pid is tied to */
+   err = prctl(PR_GET_SK_BIND_DEV_IF, pid);
+   if (err < 0) {
+   fprintf(stderr, "prctl failed: %d\n", errno);
+   return -1;
+   }
+
+   if (err == 0)
+   return 0;
+
+   /* convert device index to vrf id */
+   vrf = device_to_vrf((int)err);
+   if (vrf < 0) {
+   fprintf(stderr, "Failed to get device index for vrf %d\n", vrf);
+   return -1;
+   }
+
+   return vrf;
+}
+
+static int run_vrf(char **argv, int vrf)
+{
+   char *cmd;
+
+   if (set_vrf(vrf) != 0) {
+   fprintf(stderr, "Failed to set vrf context\n");
+   return 1;
+   }
+
+   cmd = 

Re: [RFC PATCH 0/4] Shared vhost design

2015-07-27 Thread Bandan Das
Eyal Moscovici eya...@il.ibm.com writes:

 Hi, 

 The test showed the same relative numbers as we got in our internal 
 testing. I was wondering about the configuration in regards to NUMA. From
Thanks for confirming.

 our testing we saw that if the VMs are spread across 2 NUMA nodes then 
 having a shared vhost thread per node performs better than having the two 
 threads in the same core.

IIUC, this is similar to my test setup and observations i.e 
 14*   1173.8  1216.9

In this case, there's a shared vhost thread on CPU 14 for numa node 0
and another on CPU 15 for numa node 1. Guests running on CPUs 0,2,4,6,8,10,12
are serviced by vhost-0 that runs on CPU 14 and guests running on CPUs 
1,3,5,7,9,11,13
get serviced by vhost-1 (Numa node 1). I tried some other configurations but
this configuration gave me the best results.


Eyal, I think it makes sense to add polling on top of these patches and
get numbers for them too. Thoughts ?

Bandan

 Eyal Moscovici
 HL-Cloud Infrastructure Solutions
 IBM Haifa Research Lab



 From:   Bandan Das b...@redhat.com
 To: k...@vger.kernel.org
 Cc: netdev@vger.kernel.org, linux-ker...@vger.kernel.org, 
 m...@redhat.com, Eyal Moscovici/Haifa/IBM@IBMIL, Razya 
 Ladelsky/Haifa/IBM@IBMIL, cgro...@vger.kernel.org, jasow...@redhat.com
 Date:   07/13/2015 07:08 AM
 Subject:[RFC PATCH 0/4] Shared vhost design



 Hello,

 There have been discussions on improving the current vhost design. The 
 first
 attempt, to my knowledge was Shirley Ma's patch to create a dedicated 
 vhost
 worker per cgroup.

 http://comments.gmane.org/gmane.linux.network/224730

 Later, I posted a cmwq based approach for performance comparisons
 http://comments.gmane.org/gmane.linux.network/286858

 More recently was the Elvis work that was presented in KVM Forum 2013
 http://www.linux-kvm.org/images/a/a3/Kvm-forum-2013-elvis.pdf

 The Elvis patches rely on common vhost thread design for scalability
 along with polling for performance. Since there are two major changes
 being proposed, we decided to split up the work. The first (this RFC),
 proposing a re-design of the vhost threading model and the second part
 (not posted yet) to focus more on improving performance. 

 I am posting this with the hope that we can have a meaningful discussion
 on the proposed new architecture. We have run some tests to show that the 
 new
 design is scalable and in terms of performance, is comparable to the 
 current
 stable design. 

 Test Setup:
 The testing is based on the setup described in the Elvis proposal.
 The initial tests are just an aggregate of Netperf STREAM and MAERTS but
 as we progress, I am happy to run more tests. The hosts are two identical
 16 core Haswell systems with point to point network links. For the first 
 10 runs,
 with n=1 up to n=10 guests running in parallel, I booted the target system 
 with nr_cpus=8
 and mem=12G. The purpose was to do a comparison of resource utilization
 and how it affects performance. Finally, with the number of guests set at 
 14,
 I didn't limit the number of CPUs booted on the host or limit memory seen 
 by
 the kernel but boot the kernel with isolcpus=14,15 that will be used to 
 run
 the vhost threads. The guests are pinned to cpus 0-13 and based on which
 cpu the guest is running on, the corresponding I/O thread is either pinned
 to cpu 14 or 15.

 Results
 # X axis is number of guests
 # Y axis is netperf number
 # nr_cpus=8 and mem=12G
 #Number of Guests#Baseline#ELVIS
 11119.3.0
 2 1135.6  1130.2
 3 1135.5  1131.6
 4 1136.0  1127.1
 5 1118.6  1129.3
 6 1123.4  1129.8
 7 1128.7  1135.4
 8 1129.9  1137.5
 9 1130.6  1135.1
 10 1129.3  1138.9
 14*   1173.8  1216.9

 #* Last run with the vCPU and I/O thread(s) pinned, no CPU/memory limit 
 imposed.
 #  I/O thread runs on CPU 14 or 15 depending on which guest it's serving

 There's a simple graph at
 http://people.redhat.com/~bdas/elvis/data/results.png
 that shows how task affinity results in a jump and even without it,
 as the number of guests increases, the shared vhost design performs
 slightly better.

 Observations:
 1. In terms of stock performance, the results are comparable.
 2. However, with a tuned setup, even without polling, we see an 
 improvement
 with the new design.
 3. Making the new design simulate old behavior would be a matter of 
 setting
 the number of guests per vhost threads to 1.
 4. 

Re: [PATCH iproute2] ip: replace white-spaces with tabs

2015-07-27 Thread Stephen Hemminger
On Sat, 25 Jul 2015 08:54:53 -0400
Zhang Shengju zhangshen...@cmss.chinamobile.com wrote:

 Replace white-spaces with tabs
 
 Signed-off-by: Zhang Shengju zhangshen...@cmss.chinamobile.com
 ---
  ip/ip.c | 6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)
 

There were more places that needed this, went ahead and made ip.c and bridge.c
checkpatch clean.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 1/2] iproute2: Add support for IPv6 VTI tunnels to ip6tunnel

2015-07-27 Thread Stephen Hemminger
On Thu, 2 Oct 2014 11:11:40 +0200
Jiri Pirko j...@resnulli.us wrote:

 Thu, Oct 02, 2014 at 10:48:20AM CEST, steffen.klass...@secunet.com wrote:
 On Thu, Oct 02, 2014 at 10:41:09AM +0200, Jiri Pirko wrote:
  Fri, Sep 26, 2014 at 09:10:56AM CEST, steffen.klass...@secunet.com wrote:
  
  @@ -459,11 +462,14 @@ static int do_add(int cmd, int argc, char **argv)
switch (p.proto) {
case IPPROTO_IPIP:
case IPPROTO_IPV6:
  + if (p.i_flags != VTI_ISVTI)
  + return tnl_add_ioctl(cmd, ip6_vti0, p.name, p);
^ Wouldn't it be more
   consistent to not to use
   the underscore? 
 
 The ipv4 version of vti uses ip_vti0, so I tried to be consistent
 with that. 
 
 Okay, fine with me.

Sure, applied.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch net-next 0/4] Introduce Mellanox Technologies Switch ASICs switchdev drivers

2015-07-27 Thread Scott Feldman
On Thu, Jul 23, 2015 at 8:43 AM, Jiri Pirko j...@resnulli.us wrote:
 This patchset introduces Mellanox Technologies Switch driver infrastructure
 and support for SwitchX-2 ASIC.

You guys did a great job on the driver; looking forward to seeing
L2/L3 hooked up.  Very nice, aesthetically pleasing code.

Is this a ground-up rewrite or a port of the SDK?  It's so clean and
tight, I'm guessing a ground-up rewrite.

Reviewed-by: Scott Feldman sfel...@gmail.com
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [patch net-next 0/4] Introduce Mellanox Technologies Switch ASICs switchdev drivers

2015-07-27 Thread Jiri Pirko
Mon, Jul 27, 2015 at 10:21:54PM CEST, sfel...@gmail.com wrote:
On Thu, Jul 23, 2015 at 8:43 AM, Jiri Pirko j...@resnulli.us wrote:
 This patchset introduces Mellanox Technologies Switch driver infrastructure
 and support for SwitchX-2 ASIC.

You guys did a great job on the driver; looking forward to seeing
L2/L3 hooked up.  Very nice, aesthetically pleasing code.

Thanks!


Is this a ground-up rewrite or a port of the SDK?  It's so clean and
tight, I'm guessing a ground-up rewrite.

It's rewritten from scratch.


Reviewed-by: Scott Feldman sfel...@gmail.com

Thanks for your review!
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net:master 41/49] drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel passed 3 arguments, but takes just 2

2015-07-27 Thread David Miller
From: Andy Shevchenko andriy.shevche...@linux.intel.com
Date: Mon, 27 Jul 2015 14:07:53 +0300

 I do use compiler from Debian for AVR32, didn't check this on other
 architectures.
 
 Possible something like following will fix it:

That isn't going to fix it.  You misunderstand the nature of the problem
I think, the issue looks like this:


#define readl(x, y) ((x) + (y))

struct foo {
int (*readl)(int x, int y, int z);
};

int test(struct foo *p)
{
p->readl(1, 2, 3);
}


Archs typically define readl as a macro, so when you do things like
p->readl() CPP tries to expand the macro when it sees the readl(
part, and that's the fundamental issue.

We have to rename the method names so that the macro expansion doesn't
interfere.

Here is the fix I am committing to fix this:


[PATCH] macb: Fix build with macro'ized readl/writel.

If an architecture defines readl/writel using CPP macros, we
get the following kinds of build failure:

   drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel
   passed 3 arguments, but takes just 2
  macb_or_gem_writel(bp, SA1B, bottom);
 ^

Rename the methods so that this doesn't happen.

Signed-off-by: David S. Miller da...@davemloft.net
---
 drivers/net/ethernet/cadence/macb.c | 14 +++---
 drivers/net/ethernet/cadence/macb.h | 16 
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/drivers/net/ethernet/cadence/macb.c 
b/drivers/net/ethernet/cadence/macb.c
index c638757..bf9eb2e 100644
--- a/drivers/net/ethernet/cadence/macb.c
+++ b/drivers/net/ethernet/cadence/macb.c
@@ -506,7 +506,7 @@ static void macb_update_stats(struct macb *bp)
WARN_ON((unsigned long)(end - p - 1) != (MACB_TPF - MACB_PFR) / 4);
 
for(; p < end; p++, offset += 4)
-   *p += bp->readl(bp, offset);
+   *p += bp->macb_reg_readl(bp, offset);
 }
 
 static int macb_halt_tx(struct macb *bp)
@@ -1934,14 +1934,14 @@ static void gem_update_stats(struct macb *bp)
 
for (i = 0; i < GEM_STATS_LEN; ++i, ++p) {
u32 offset = gem_statistics[i].offset;
-   u64 val = bp->readl(bp, offset);
+   u64 val = bp->macb_reg_readl(bp, offset);
 
bp->ethtool_stats[i] += val;
*p += val;
 
if (offset == GEM_OCTTXL || offset == GEM_OCTRXL) {
/* Add GEM_OCTTXH, GEM_OCTRXH */
-   val = bp->readl(bp, offset + 4);
+   val = bp->macb_reg_readl(bp, offset + 4);
bp->ethtool_stats[i] += ((u64)val) << 32;
*(++p) += val;
}
@@ -2867,11 +2867,11 @@ static int macb_probe(struct platform_device *pdev)
bp->regs = mem;
bp->native_io = native_io;
if (native_io) {
-   bp->readl = hw_readl_native;
-   bp->writel = hw_writel_native;
+   bp->macb_reg_readl = hw_readl_native;
+   bp->macb_reg_writel = hw_writel_native;
} else {
-   bp->readl = hw_readl;
-   bp->writel = hw_writel;
+   bp->macb_reg_readl = hw_readl;
+   bp->macb_reg_writel = hw_writel;
}
bp->num_queues = num_queues;
bp->queue_mask = queue_mask;
diff --git a/drivers/net/ethernet/cadence/macb.h 
b/drivers/net/ethernet/cadence/macb.h
index 2aa102e..1895b6b 100644
--- a/drivers/net/ethernet/cadence/macb.h
+++ b/drivers/net/ethernet/cadence/macb.h
@@ -429,12 +429,12 @@
 | GEM_BF(name, value))
 
 /* Register access macros */
-#define macb_readl(port, reg)  (port)->readl((port), MACB_##reg)
-#define macb_writel(port, reg, value)  (port)->writel((port), MACB_##reg, 
(value))
-#define gem_readl(port, reg)   (port)->readl((port), GEM_##reg)
-#define gem_writel(port, reg, value)   (port)->writel((port), GEM_##reg, 
(value))
-#define queue_readl(queue, reg)(queue)->bp->readl((queue)->bp, 
(queue)->reg)
-#define queue_writel(queue, reg, value)
(queue)->bp->writel((queue)->bp, (queue)->reg, (value))
+#define macb_readl(port, reg)  (port)->macb_reg_readl((port), 
MACB_##reg)
+#define macb_writel(port, reg, value)  (port)->macb_reg_writel((port), 
MACB_##reg, (value))
+#define gem_readl(port, reg)   (port)->macb_reg_readl((port), 
GEM_##reg)
+#define gem_writel(port, reg, value)   (port)->macb_reg_writel((port), 
GEM_##reg, (value))
+#define queue_readl(queue, reg)
(queue)->bp->macb_reg_readl((queue)->bp, (queue)->reg)
+#define queue_writel(queue, reg, value)
(queue)->bp->macb_reg_writel((queue)->bp, (queue)->reg, (value))
 
 /* Conditional GEM/MACB macros.  These perform the operation to the correct
  * register dependent on whether the device is a GEM or a MACB.  For registers
@@ -782,8 +782,8 @@ struct macb {
boolnative_io;
 
/* hardware IO accessors */
-   u32 (*readl)(struct macb 

Re: [patch net-next 0/4] Introduce Mellanox Technologies Switch ASICs switchdev drivers

2015-07-27 Thread Florian Fainelli
On 27/07/15 13:27, Jiri Pirko wrote:
 Mon, Jul 27, 2015 at 10:21:54PM CEST, sfel...@gmail.com wrote:
 On Thu, Jul 23, 2015 at 8:43 AM, Jiri Pirko j...@resnulli.us wrote:
 This patchset introduces Mellanox Technologies Switch driver infrastructure
 and support for SwitchX-2 ASIC.

 You guys did a great job on the driver; looking forward to seeing
 L2/L3 hooked up.  Very nice, aesthetically pleasing code.
 
 Thanks!
 

 Is this a ground-up rewrite or a port of the SDK?  It's so clean and
 tight, I'm guessing a ground-up rewrite.
 
 It's rewritten from scratch.

I only glanced through the driver, but it looks really, really nice
and clean; I am very pleased to see such a driver being submitted. That is
very encouraging and should inspire other companies to do the same, so
thanks for doing this!
-- 
Florian
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH iproute2] tc: fix bpf compilation with old glibc

2015-07-27 Thread Stephen Hemminger
On Wed, 22 Jul 2015 14:29:30 +0200
Nicolas Dichtel nicolas.dich...@6wind.com wrote:

 Error was:
 f_bpf.o: In function `bpf_parse_opt':
 f_bpf.c:(.text+0x88f): undefined reference to `secure_getenv'
 m_bpf.o: In function `parse_bpf':
 m_bpf.c:(.text+0x587): undefined reference to `secure_getenv'
 collect2: error: ld returned 1 exit status
 
 CC: Daniel Borkmann dan...@iogearbox.net
 Fixes: 88eea5395483 (tc: {f,m}_bpf: allow to retrieve uds path from env)
 Signed-off-by: Nicolas Dichtel nicolas.dich...@6wind.com

Applied thanks.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next 0/16] Proposal for VRF-lite - v3

2015-07-27 Thread Eric W. Biederman
David Ahern d...@cumulusnetworks.com writes:

 In the context of internet scale routing a requirement that always comes
 up is the need to partition the available routing tables into disjoint
 routing planes. A specific use case is the multi-tenancy problem where
 each tenant has their own unique routing tables and in the very least
 need different default gateways.

 This patch allows the ability to create virtual router domains (aka VRFs
 (VRF-lite to be specific) in the linux packet forwarding stack. The main
 observation is that through the use of rules and socket binding to interfaces,
 all the facilities that we need are already present in the infrastructure. 
 What
 is missing is a handle that identifies a routing domain and can be used to
 gather applicable rules/tables and uniqify neighbor selection. The scheme used
 needs to preserve the notions of ECMP and general routing
 principles.

This paragraph is false when it comes to sockets, as I have already
pointed out.

- VPN Routing and Forwarding (RFC4364 and its kin) implies isolation
  strong enough to allow using the same ip on different machines
  in different VPN instances and not have confusion.

- The routing table is not the only table in the kernel that uses
  an ip address as a key.

  The result is that you can combine packet fragments that come in
  on different interfaces (irrespective of your VPN), confuse tcp
  parameters between interfaces, scramble your ipsec connections and I
  don't know what else.

Binding a socket to a network device is not strong enough to do what you
want to do and it will lead to subtle bugs, that can be triggered by
accident or by hostile actors.

If these kinds of limitations are well documented and it is specified
that these kinds of problems can occur with your socket code there may
be a place for this code somewhere.

However, described as it is, your code is wrong and fundamentally broken.

 Version 3
 - addressed comments from first 2 RFCs with the exception of the name
   Nicolas: We will do the name conversion once we agree on what the
correct name should be (vrf, mrf or something else)

Not so.  I described the deep problems between your goals and your
implementation and they are not even mentioned let alone addressed.

 -  packets flow through the VRF device in both directions allowing the
following:
- tcpdump -i vrfn
- tc rules on vrf device
- netfilter rules on vrf device

 Ingo/Andy: I added you two as a start point for the proposed task related
changes. Not sure who should be the reviewer; please let me know
if someone else is more appropriate. Thanks.

It looks like you are trying to implement a namespace that isn't a
namespace.  Given that it is broken by design you have my nack.

Nacked-by: Eric W. Biederman ebied...@xmission.com

Eric
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 14/16] net: Add sk_bind_dev_if to task_struct

2015-07-27 Thread Eric W. Biederman
David Ahern d...@cumulusnetworks.com writes:

 Allow tasks to have a default device index for binding sockets. If set
 the value is passed to all AF_INET/AF_INET6 sockets when they are
 created.

 The task setting is passed parent to child on fork, but can be set or
 changed after task creation using prctl (if task has CAP_NET_ADMIN
 permissions). The setting for a socket can be retrieved using prctl().
 This option allows an administrator to restrict a task to only send/receive
 packets through the specified device. In the case of VRF devices this
 option restricts tasks to a specific VRF.

 Correlation of the device index to a specific VRF, ie.,
ifindex -- VRF device -- VRF id
 is left to userspace.

Nacked-by: Eric W. Biederman ebied...@xmission.com

Because it is broken by design.  Your routing device is only safe for
programs that know its limitations; it is not appropriate for general
applications.

Since you don't even seem to know its limitations I think this is a
bad path to walk down.

 Example using VRF devices:
 1. vrf1 is created and assigned to table 5
 2. eth2 is enslaved to vrf1
 3. eth2 is given the address 1.1.1.1/24

 $ ip route ls table 5
 prohibit default
 1.1.1.0/24 dev eth2  scope link
 local 1.1.1.1 dev eth2  proto kernel  scope host  src 1.1.1.1

 Without setting a VRF context, ping, tcp and udp attempts fail, e.g.,
 $ ping 1.1.1.254
 connect: Network is unreachable

 After binding the task to the vrf device ping succeeds:
 $ ./chvrf -v 1 ping -c1 1.1.1.254
 PING 1.1.1.254 (1.1.1.254) 56(84) bytes of data.
 64 bytes from 1.1.1.254: icmp_seq=1 ttl=64 time=2.32 ms

 Signed-off-by: David Ahern d...@cumulusnetworks.com
 ---
  include/linux/sched.h  |  3 +++
  include/uapi/linux/prctl.h |  4 
  kernel/fork.c  |  2 ++
  kernel/sys.c   | 35 +++
  net/ipv4/af_inet.c |  1 +
  net/ipv4/route.c   |  4 +++-
  net/ipv6/af_inet6.c|  1 +
  net/ipv6/route.c   |  2 +-
  8 files changed, 50 insertions(+), 2 deletions(-)

 diff --git a/include/linux/sched.h b/include/linux/sched.h
 index 04b5ada460b4..29b336b8a466 100644
 --- a/include/linux/sched.h
 +++ b/include/linux/sched.h
 @@ -1528,6 +1528,9 @@ struct task_struct {
   struct files_struct *files;
  /* namespaces */
   struct nsproxy *nsproxy;
 +/* network */
 + /* if set INET/INET6 sockets are bound to given dev index on create */
 + int sk_bind_dev_if;
  /* signal handlers */
   struct signal_struct *signal;
   struct sighand_struct *sighand;
 diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h
 index 31891d9535e2..1ef45195d146 100644
 --- a/include/uapi/linux/prctl.h
 +++ b/include/uapi/linux/prctl.h
 @@ -190,4 +190,8 @@ struct prctl_mm_map {
  # define PR_FP_MODE_FR   (1  0)/* 64b FP registers */
  # define PR_FP_MODE_FRE  (1  1)/* 32b compatibility */
  
 +/* get/set network interface sockets are bound to by default */
 +#define PR_SET_SK_BIND_DEV_IF   47
 +#define PR_GET_SK_BIND_DEV_IF   48
 +
  #endif /* _LINUX_PRCTL_H */
 diff --git a/kernel/fork.c b/kernel/fork.c
 index dbd9b8d7b7cc..8b396e77d2bf 100644
 --- a/kernel/fork.c
 +++ b/kernel/fork.c
 @@ -380,6 +380,8 @@ static struct task_struct *dup_task_struct(struct 
 task_struct *orig)
   tsk-splice_pipe = NULL;
   tsk-task_frag.page = NULL;
  
 + tsk-sk_bind_dev_if = orig-sk_bind_dev_if;
 +
   account_kernel_stack(ti, 1);
  
   return tsk;
 diff --git a/kernel/sys.c b/kernel/sys.c
 index 259fda25eb6b..59119ac0a0bd 100644
 --- a/kernel/sys.c
 +++ b/kernel/sys.c
 @@ -52,6 +52,7 @@
  #include linux/rcupdate.h
  #include linux/uidgid.h
  #include linux/cred.h
 +#include linux/netdevice.h
  
  #include linux/kmsg_dump.h
  /* Move somewhere else to avoid recompiling? */
 @@ -2267,6 +2268,40 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, 
 arg2, unsigned long, arg3,
   case PR_GET_FP_MODE:
   error = GET_FP_MODE(me);
   break;
 +#ifdef CONFIG_NET
 + case PR_SET_SK_BIND_DEV_IF:
 + {
 + struct net_device *dev;
 + int idx = (int) arg2;
 +
 + if (!capable(CAP_NET_ADMIN))
 + return -EPERM;
 +
 + if (idx) {
 + dev = dev_get_by_index(me-nsproxy-net_ns, idx);
 + if (!dev)
 + return -EINVAL;
 + dev_put(dev);
 + }
 + me-sk_bind_dev_if = idx;
 + break;
 + }
 + case PR_GET_SK_BIND_DEV_IF:
 + {
 + struct task_struct *tsk;
 + int sk_bind_dev_if = -EINVAL;
 +
 + rcu_read_lock();
 + tsk = find_task_by_vpid(arg2);
 + if (tsk)
 + sk_bind_dev_if = tsk-sk_bind_dev_if;
 + rcu_read_unlock();
 + if (tsk != me  !capable(CAP_NET_ADMIN))
 + return -EPERM;
 + 

Re: [PATCH iproute2 net-next] bridge: mdb: add support for router add/del notifications monitoring

2015-07-27 Thread Stephen Hemminger
On Mon, 27 Jul 2015 13:44:05 +0200
Nikolay Aleksandrov ra...@blackwall.org wrote:

 From: Nikolay Aleksandrov niko...@cumulusnetworks.com
 
 This patch adds support for ADDMDB/DELMDB notifications about router ports
 which have been added or deleted/expired respectively.
 
 Example output:
 $ bridge -s monitor mdb
 Deleted router port dev eth3 master br0
 router port dev eth3 master br0
 
 Signed-off-by: Nikolay Aleksandrov niko...@cumulusnetworks.com

Looks useful, applied.

Does usage or manual page need to be updated as well?


--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next]r8169: Correct values on dma_alloc_coherent

2015-07-27 Thread Francois Romieu
Corcodel Marian corcodel.mar...@gmail.com :
[...]
 diff --git a/drivers/net/ethernet/realtek/r8169.c 
 b/drivers/net/ethernet/realtek/r8169.c
 index 3df51fa..fd249a6 100644
 --- a/drivers/net/ethernet/realtek/r8169.c
 +++ b/drivers/net/ethernet/realtek/r8169.c
 @@ -6724,8 +6724,8 @@ static int rtl8169_init_ring(struct net_device *dev)
  
   rtl8169_init_ring_indexes(tp);
  
 - memset(tp->tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info));
 - memset(tp->Rx_databuff, 0x0, NUM_RX_DESC * sizeof(void *));
 + memset(tp->tx_skb, 0x0, NUM_RX_DESC);
 + memset(tp->Rx_databuff, 0x0, NUM_RX_DESC);

void *Rx_databuff[NUM_RX_DESC];

:o(

Please don't mess with the kernel code until you've figured how wrong
these changes are. Then give yourself a few months and read more code.

Really.

-- 
Ueimor
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net-next PATCH 2/2] drivers: net: cpsw: add separate napi for tx packet handling for performance improvement

2015-07-27 Thread Francois Romieu
Mugunthan V N mugunthan...@ti.com :
[...]
 diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c
 index d68d759..4f98537 100644
 --- a/drivers/net/ethernet/ti/cpsw.c
 +++ b/drivers/net/ethernet/ti/cpsw.c
 @@ -752,13 +753,22 @@ static irqreturn_t cpsw_tx_interrupt(int irq, void 
 *dev_id)
   struct cpsw_priv *priv = dev_id;
  
   cpdma_ctlr_eoi(priv-dma, CPDMA_EOI_TX);
 - cpdma_chan_process(priv-txch, 128);
 + writel(0, priv-wr_regs-tx_en);
 +
 + if (netif_running(priv-ndev)) {
 + napi_schedule(priv-napi_tx);
 + return IRQ_HANDLED;
 + }


cpsw_ndo_stop calls napi_disable: you can remove netif_running.

-- 
Ueimor
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: Fw: [Bug 102051] New: Unexpected TCP behavior

2015-07-27 Thread Eric Dumazet
On Mon, 2015-07-27 at 14:02 -0700, Stephen Hemminger wrote:
 
 Begin forwarded message:
 
 Date: Mon, 27 Jul 2015 20:06:07 +
 From: bugzilla-dae...@bugzilla.kernel.org 
 bugzilla-dae...@bugzilla.kernel.org
 To: shemmin...@linux-foundation.org shemmin...@linux-foundation.org
 Subject: [Bug 102051] New: Unexpected TCP behavior
 
 
 https://bugzilla.kernel.org/show_bug.cgi?id=102051
 
 Bug ID: 102051
Summary: Unexpected TCP behavior
Product: Networking
Version: 2.5
 Kernel Version: 3.19
   Hardware: All
 OS: Linux
   Tree: Mainline
 Status: NEW
   Severity: low
   Priority: P1
  Component: IPV4
   Assignee: shemmin...@linux-foundation.org
   Reporter: vreme...@gmail.com
 Regression: No
 
 While running nmap against localhost I started to see open ports in the 
 dynamic
 range (>1024). Kind of odd, knowing netstat and ss did not show any listeners
 on the port.
 
 With tcpdump, I confirmed system was returning S/ACK for ports that did not
 have a listener enabled. The issue or feature only occurs if source port
 matches the destination port.
 
 # netstat -nap | grep 5000
 #
 
 $ nc localhost -p 5000 5000
 a
 a
 
 # tcpdump -i lo port 5000
 14:28:18.059708 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [S], seq 2005295207,
 win 43690, options [mss 65495,sackOK,TS val 4481790 ecr 0,nop,wscale 7], 
 length
 0
 14:28:18.059721 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [S.], seq 
 2005295207,
 ack 2005295208, win 43690, options [mss 65495,sackOK,TS val 4481790 ecr
 4481790,nop,wscale 7], length 0

Nothing wrong here. This is well known TCP behavior ( cross syn ).

 14:28:18.059729 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [.], ack 2005295208,
 win 342, options [nop,nop,TS val 4481790 ecr 4481790], length 0
 14:28:19.121392 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [P.], seq
 2005295208:2005295210, ack 2005295208, win 342, options [nop,nop,TS val 
 4482056
 ecr 4481790], length 2
 14:28:19.121407 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [.], ack 2005295210,
 win 342, options [nop,nop,TS val 4482056 ecr 4482056], length 0
 
 # hping3 -S 127.0.0.1 -p 5000 -s 5000
 HPING 127.0.0.1 (lo 127.0.0.1): S set, 40 headers + 0 data bytes 
 len=40 ip=127.0.0.1 ttl=64 id=2036 sport=5000 flags=S seq=0 win=512 rtt=3.8 
 ms 
 SYN
 DUP! len=52 ip=127.0.0.1 ttl=64 DF id=670 sport=5000 flags=A seq=0 win=342
 rtt=3.8 ms   SYN/ACK
 len=40 ip=127.0.0.1 ttl=64 DF id=43435 sport=5000 flags=RA seq=1 win=0 rtt=3.7
 ms
 
 I confirmed it with nmap, nc, and hping3; granted they build on same c
 libraries, so I am not even sure if this should be filed as a kernel bug (or
 even a bug);
 just did not expect to see this behavior // 
 
 Expected to see a RST instead.
 

Sigh.

Wont fix.


--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH iproute2 net-next] bridge: mdb: add support for router add/del notifications monitoring

2015-07-27 Thread Nikolay Aleksandrov

 On 27 Jul 2015, at 23:40, Stephen Hemminger step...@networkplumber.org 
 wrote:
 
 On Mon, 27 Jul 2015 13:44:05 +0200
 Nikolay Aleksandrov ra...@blackwall.org wrote:
 
 From: Nikolay Aleksandrov niko...@cumulusnetworks.com
 
 This patch adds support for ADDMDB/DELMDB notifications about router ports
 which have been added or deleted/expired respectively.
 
 Example output:
 $ bridge -s monitor mdb
 Deleted router port dev eth3 master br0
 router port dev eth3 master br0
 
 Signed-off-by: Nikolay Aleksandrov niko...@cumulusnetworks.com
 
 Looks useful, applied.
 
 Does usage or manual page need to be updated as well?
 
 

Good question :-) I'll look into it.
Thanks!

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net PATCH] fib_trie: Drop unnecessary calls to leaf_pull_suffix

2015-07-27 Thread David Miller
From: Alexander Duyck alexander.h.du...@redhat.com
Date: Mon, 27 Jul 2015 13:08:06 -0700

 It was reported that update_suffix was taking a long time on systems where
 a large number of leaves were attached to a single node.  As it turns out
 fib_table_flush was calling update_suffix for each leaf that didn't have all
 of the aliases stripped from it.  As a result, on this large node removing
 one leaf would result in us calling update_suffix for every other leaf on
 the node.
 
 The fix is to just remove the calls to leaf_pull_suffix since they are
 redundant as we already have a call in resize that will go through and
 update the suffix length for the node before we exit out of
 fib_table_flush or fib_table_flush_external.
 
 Reported-by: David Ahern d...@cumulusnetworks.com
 Signed-off-by: Alexander Duyck alexander.h.du...@redhat.com

Applied and queued up for -stable, thanks.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [net PATCH] fib_trie: Drop unnecessary calls to leaf_pull_suffix

2015-07-27 Thread David Ahern

On 7/27/15 2:08 PM, Alexander Duyck wrote:

It was reported that update_suffix was taking a long time on systems where
a large number of leaves were attached to a single node.  As it turns out
fib_table_flush was calling update_suffix for each leaf that didn't have all
of the aliases stripped from it.  As a result, on this large node removing
one leaf would result in us calling update_suffix for every other leaf on
the node.

The fix is to just remove the calls to leaf_pull_suffix since they are
redundant as we already have a call in resize that will go through and
update the suffix length for the node before we exit out of
fib_table_flush or fib_table_flush_external.

Reported-by: David Ahernd...@cumulusnetworks.com
Signed-off-by: Alexander Duyckalexander.h.du...@redhat.com
---

This patch should apply to linux-4.1.y and newer kernels.

I've done a bit of testing on my system and I no longer see update_suffix
dominating the performance traces.  David if you can test with this patch
to see if you still see the issue I would appreciate it.



Works for me. Thanks.

Tested-by: David Ahern d...@cumulusnetworks.com
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/4] Shared vhost design

2015-07-27 Thread Michael S. Tsirkin
On Mon, Jul 13, 2015 at 12:07:31AM -0400, Bandan Das wrote:
 Hello,
 
 There have been discussions on improving the current vhost design. The first
 attempt, to my knowledge was Shirley Ma's patch to create a dedicated vhost
 worker per cgroup.
 
 http://comments.gmane.org/gmane.linux.network/224730
 
 Later, I posted a cmwq based approach for performance comparisons
 http://comments.gmane.org/gmane.linux.network/286858
 
 More recently was the Elvis work that was presented in KVM Forum 2013
 http://www.linux-kvm.org/images/a/a3/Kvm-forum-2013-elvis.pdf
 
 The Elvis patches rely on common vhost thread design for scalability
 along with polling for performance. Since there are two major changes
 being proposed, we decided to split up the work. The first (this RFC),
 proposing a re-design of the vhost threading model and the second part
 (not posted yet) to focus more on improving performance. 
 
 I am posting this with the hope that we can have a meaningful discussion
 on the proposed new architecture. We have run some tests to show that the new
 design is scalable and in terms of performance, is comparable to the current
 stable design. 
 
 Test Setup:
 The testing is based on the setup described in the Elvis proposal.
 The initial tests are just an aggregate of Netperf STREAM and MAERTS but
 as we progress, I am happy to run more tests. The hosts are two identical
 16 core Haswell systems with point to point network links. For the first 10 
 runs,
 with n=1 up to n=10 guests running in parallel, I booted the target system 
 with nr_cpus=8
 and mem=12G. The purpose was to do a comparison of resource utilization
 and how it affects performance. Finally, with the number of guests set at 14,
 I didn't limit the number of CPUs booted on the host or limit memory seen by
 the kernel but boot the kernel with isolcpus=14,15 that will be used to run
 the vhost threads. The guests are pinned to cpus 0-13 and based on which
 cpu the guest is running on, the corresponding I/O thread is either pinned
 to cpu 14 or 15.
 Results
 # X axis is number of guests
 # Y axis is netperf number
 # nr_cpus=8 and mem=12G
 #Number of Guests#Baseline#ELVIS
 11119.3 .0
 2  1135.6   1130.2
 3  1135.5   1131.6
 4  1136.0   1127.1
 5  1118.6   1129.3
 6  1123.4   1129.8
 7  1128.7   1135.4
 8  1129.9   1137.5
 9  1130.6   1135.1
 10 1129.3   1138.9
 14*1173.8   1216.9

I'm a bit too busy now, with 2.4 and related stuff, will review once we
finish 2.4.  But I'd like to ask two things:
- did you actually test a config where cgroups were used?
- does the design address the issue of VM 1 being blocked
  (e.g. because it hits swap) and blocking VM 2?

 
 #* Last run with the vCPU and I/O thread(s) pinned, no CPU/memory limit 
 imposed.
 #  I/O thread runs on CPU 14 or 15 depending on which guest it's serving
 
 There's a simple graph at
 http://people.redhat.com/~bdas/elvis/data/results.png
 that shows how task affinity results in a jump and even without it,
 as the number of guests increase, the shared vhost design performs
 slightly better.
 
 Observations:
 1. In terms of stock performance, the results are comparable.
 2. However, with a tuned setup, even without polling, we see an improvement
 with the new design.
 3. Making the new design simulate old behavior would be a matter of setting
 the number of guests per vhost threads to 1.
 4. Maybe, setting a per guest limit on the work being done by a specific vhost
 thread is needed for it to be fair.
 5. cgroup associations needs to be figured out. I just slightly hacked the
 current cgroup association mechanism to work with the new model. Ccing cgroups
 for input/comments.
 
 Many thanks to Razya Ladelsky and Eyal Moscovici, IBM for the initial
 patches, the helpful testing suggestions and discussions.
 
 Bandan Das (4):
   vhost: Introduce a universal thread to serve all users
   vhost: Limit the number of devices served by a single worker thread
   cgroup: Introduce a function to compare cgroups
   vhost: Add cgroup-aware creation of worker threads
 
  drivers/vhost/net.c|   6 +-
  drivers/vhost/scsi.c   |  18 ++--
  drivers/vhost/vhost.c  | 272 
 +++--
  drivers/vhost/vhost.h  |  32 +-
  include/linux/cgroup.h |   1 +
  kernel/cgroup.c|  40 
  6 files changed, 275 insertions(+), 94 deletions(-)
 
 -- 
 2.4.3
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Fw: [Bug 102051] New: Unexpected TCP behavior

2015-07-27 Thread Stephen Hemminger


Begin forwarded message:

Date: Mon, 27 Jul 2015 20:06:07 +
From: bugzilla-dae...@bugzilla.kernel.org 
bugzilla-dae...@bugzilla.kernel.org
To: shemmin...@linux-foundation.org shemmin...@linux-foundation.org
Subject: [Bug 102051] New: Unexpected TCP behavior


https://bugzilla.kernel.org/show_bug.cgi?id=102051

Bug ID: 102051
   Summary: Unexpected TCP behavior
   Product: Networking
   Version: 2.5
Kernel Version: 3.19
  Hardware: All
OS: Linux
  Tree: Mainline
Status: NEW
  Severity: low
  Priority: P1
 Component: IPV4
  Assignee: shemmin...@linux-foundation.org
  Reporter: vreme...@gmail.com
Regression: No

While running nmap against localhost I started to see open ports in the dynamic
range (>1024). Kind of odd, knowing netstat and ss did not show any listeners
on the port.

With tcpdump, I confirmed system was returning S/ACK for ports that did not
have a listener enabled. The issue or feature only occurs if source port
matches the destination port.

# netstat -nap | grep 5000
#

$ nc localhost -p 5000 5000
a
a

# tcpdump -i lo port 5000
14:28:18.059708 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [S], seq 2005295207,
win 43690, options [mss 65495,sackOK,TS val 4481790 ecr 0,nop,wscale 7], length
0
14:28:18.059721 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [S.], seq 2005295207,
ack 2005295208, win 43690, options [mss 65495,sackOK,TS val 4481790 ecr
4481790,nop,wscale 7], length 0
14:28:18.059729 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [.], ack 2005295208,
win 342, options [nop,nop,TS val 4481790 ecr 4481790], length 0
14:28:19.121392 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [P.], seq
2005295208:2005295210, ack 2005295208, win 342, options [nop,nop,TS val 4482056
ecr 4481790], length 2
14:28:19.121407 IP 127.0.0.1.5000  127.0.0.1.5000: Flags [.], ack 2005295210,
win 342, options [nop,nop,TS val 4482056 ecr 4482056], length 0

# hping3 -S 127.0.0.1 -p 5000 -s 5000
HPING 127.0.0.1 (lo 127.0.0.1): S set, 40 headers + 0 data bytes 
len=40 ip=127.0.0.1 ttl=64 id=2036 sport=5000 flags=S seq=0 win=512 rtt=3.8 ms 
SYN
DUP! len=52 ip=127.0.0.1 ttl=64 DF id=670 sport=5000 flags=A seq=0 win=342
rtt=3.8 ms   SYN/ACK
len=40 ip=127.0.0.1 ttl=64 DF id=43435 sport=5000 flags=RA seq=1 win=0 rtt=3.7
ms

I confirmed it with nmap, nc, and hping3; granted they build on same c
libraries, so I am not even sure if this should be filed as a kernel bug (or
even a bug);
just did not expect to see this behavior // 

Expected to see a RST instead.

-- 
You are receiving this mail because:
You are the assignee for the bug.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 0/4] Shared vhost design

2015-07-27 Thread Michael S. Tsirkin
On Mon, Jul 27, 2015 at 03:48:19PM -0400, Bandan Das wrote:
 Eyal Moscovici eya...@il.ibm.com writes:
 
  Hi, 
 
  The test showed the same relative numbers as we got in our internal 
  testing. I was wondering about the configuration in regards to NUMA. From
 Thanks for confirming.
 
  our testing we saw that if the VMs are spread across 2 NUMA nodes then 
  having a shared vhost thread per node performs better than having the two 
  threads in the same core.
 
 IIUC, this is similar to my test setup and observations i.e 
  14*   1173.8  1216.9
 
 In this case, there's a shared vhost thread on CPU 14 for numa node 0
 and another on CPU 15 for numa node 1. Guests running on CPUs 0,2,4,6,8,10,12
 are serviced by vhost-0 that runs on CPU 14 and guests running on CPUs 
 1,3,5,7,9,11,13
 get serviced by vhost-1 (Numa node 1). I tried some other configurations but
 this configuration gave me the best results.
 
 
 Eyal, I think it makes sense to add polling on top of these patches and
 get numbers for them too. Thoughts ?
 
 Bandan

So simple polling by vhost is kind of ok for some guests, but I think to
really make it work for a reasonably wide selection of guests/workloads
you need to combine it with 1. polling the NIC - it makes no sense to me
to only poll one side of the equation; and probably 2. - polling in
guest.

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [RFC PATCH 4/4] vhost: Add cgroup-aware creation of worker threads

2015-07-27 Thread Michael S. Tsirkin
On Mon, Jul 13, 2015 at 12:07:35AM -0400, Bandan Das wrote:
 With the help of the cgroup function to compare groups introduced
 in the previous patch, this changes worker creation policy.
 If the new device belongs to different cgroups than any of the
 devices we are currently serving, we end up creating a new worker
 thread even if we haven't reached the devs_per_worker threshold
 
 Signed-off-by: Bandan Das b...@redhat.com

Would it make sense to integrate this in the work-queue mechanism somehow?
Just a thought - correctly accounting kernel's work
on behalf of specific userspace groups might have value generally.
Or is the usecase too special?
Cc Tejun for comments.

 ---
  drivers/vhost/vhost.c | 47 +++
  1 file changed, 39 insertions(+), 8 deletions(-)
 
 diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c
 index 6a5d4c0..dc0fa37 100644
 --- a/drivers/vhost/vhost.c
 +++ b/drivers/vhost/vhost.c
 @@ -261,12 +261,6 @@ static int vhost_worker(void *data)
   use_mm(dev-mm);
   }
  
 - /* TODO: Consider a more elegant solution */
 - if (worker-owner != dev-owner) {
 - /* Should check for return value */
 - cgroup_attach_task_all(dev-owner, current);
 - worker-owner = dev-owner;
 - }
   work-fn(work);
   if (need_resched())
   schedule();
 @@ -278,6 +272,36 @@ static int vhost_worker(void *data)
   return 0;
  }
  
 +struct vhost_attach_cgroups_struct {
 + struct vhost_work work;
 + struct task_struct *owner;
 + int ret;
 +};
 +
 +static void vhost_attach_cgroups_work(struct vhost_work *work)
 +{
 + struct vhost_attach_cgroups_struct *s;
 +
 + s = container_of(work, struct vhost_attach_cgroups_struct, work);
 + s-ret = cgroup_attach_task_all(s-owner, current);
 +}
 +
 +static void vhost_attach_cgroups(struct vhost_dev *dev,
 + struct vhost_worker *worker)
 +{
 + struct vhost_attach_cgroups_struct attach;
 +
 + attach.owner = dev-owner;
 + vhost_work_init(dev, attach.work, vhost_attach_cgroups_work);
 + vhost_work_queue(worker, attach.work);
 + vhost_work_flush(worker, attach.work);
 +
 + if (!attach.ret)
 + worker-owner = dev-owner;
 +
 + dev-err = attach.ret;
 +}
 +
  static void vhost_create_worker(struct vhost_dev *dev)
  {
   struct vhost_worker *worker;
 @@ -300,8 +324,14 @@ static void vhost_create_worker(struct vhost_dev *dev)
  
   spin_lock_init(worker-work_lock);
   INIT_LIST_HEAD(worker-work_list);
 +
 + /* attach to the cgroups of the process that created us */
 + vhost_attach_cgroups(dev, worker);
 + if (dev-err)
 + goto therror;
 + worker-owner = dev-owner;
 +
   list_add(worker-node, pool-workers);
 - worker-owner = NULL;
   worker-num_devices++;
   total_vhost_workers++;
   dev-worker = worker;
 @@ -320,7 +350,8 @@ static int vhost_dev_assign_worker(struct vhost_dev *dev)
  
   mutex_lock(vhost_pool-pool_lock);
   list_for_each_entry(worker, vhost_pool-workers, node) {
 - if (worker-num_devices  devs_per_worker) {
 + if (worker-num_devices  devs_per_worker 
 + (!cgroup_match_groups(dev-owner, worker-owner))) {
   dev-worker = worker;
   dev-worker_assigned = true;
   worker-num_devices++;
 -- 
 2.4.3
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH iproute2] xfrm: remove duplicated include

2015-07-27 Thread Stephen Hemminger
On Sat, 25 Jul 2015 04:44:24 -0400
Zhang Shengju zhangshen...@cmss.chinamobile.com wrote:

 Remove duplicated include for linux/xfrm.h, since it's already
 included by 'xfrm.h'.
 
 Signed-off-by: Zhang Shengju zhangshen...@cmss.chinamobile.com

Applied, thanks

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] phylib: add driver for aquantia phy

2015-07-27 Thread Florian Fainelli
On 27/07/15 01:30, Shaohui Xie wrote:
 -Original Message-
 From: Florian Fainelli [mailto:f.faine...@gmail.com]
 Sent: Friday, July 24, 2015 12:39 PM
 To: shh@gmail.com; netdev@vger.kernel.org; da...@davemloft.net
 Cc: Xie Shaohui-B21989
 Subject: Re: [PATCH] phylib: add driver for aquantia phy

 Le 07/23/15 20:46, shh@gmail.com a écrit :
 From: Shaohui Xie shaohui@freescale.com

 This patch added driver to support Aquantia PHYs AQ1202, AQ2104,
 AQR105, AQR405, which accessed through clause 45.

 Could you prefix your patches with net: phy:  in the future to be
 consistent with what is typically used?
 [S.H] OK.
 

 See comments below


 Signed-off-by: Shaohui Xie shaohui@freescale.com
 ---

 [snip]

 +static int aquantia_read_status(struct phy_device *phydev) {
 +   int reg;
 +
 +   phydev-speed = SPEED_1;
 +   phydev-duplex = DUPLEX_FULL;
 +
 +   reg = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_STAT1);
 +   reg = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_STAT1);
 +   if (reg  MDIO_STAT1_LSTATUS)
 +   phydev-link = 1;
 +   else
 +   phydev-link = 0;
 +
 +   reg = phy_read_mmd(phydev, MDIO_MMD_AN, 0xc800);
 +   mdelay(10);
 +   reg = phy_read_mmd(phydev, MDIO_MMD_AN, 0xc800);
 +   if (reg == 0x9)
 +   phydev-speed = SPEED_2500;
 +   else if (reg == 0x5)
 +   phydev-speed = SPEED_1000;
 +   else if (reg == 0x3)
 +   phydev-speed = SPEED_100;

 Could we use a switch/case here? 
 [S.H] OK.
 
 How about 10Mbits/sec and duplex are we
 guaranteed to be full-duplex at e.g: 100 or 10Mbits/sec?
 [S.H] The PHY does not support 10M bits/sec. 
 When link to 100M. the phy is full-duplex.

Ok, that means you need to restrict the supported flags accordingly not
to advertise these modes as being supported in the first place, see below:

 

 +
 +   return 0;
 +}
 +
 +static struct phy_driver aquantia_driver[] = { {
 +   .phy_id = PHY_ID_AQ1202,
 +   .phy_id_mask= 0xfff0,
 +   .name   = Aquantia AQ1202,
 +   .features   = PHY_GBIT_FEATURES,

 If these are 10GbE PHYs, should not we start defining a new features
 bitmask here to reflect that accordingly? That way MAC
 [S.H] there are several defines for 10G PHYs, should be used by a given 10G 
 PHY. 
 
 for this Aquantia PHY, SUPPORTED_1baseT_Full is a valid define, should I 
 set it as below:
 .features = PHY_GBIT_FEATURES | SUPPORTED_1baseT_Full,

PHY_GBIT_FEATURES means 10/100/1000 half and full-duplex are supported,
which are not supported as you indicated above, I would go with adding
only the supported modes here, this is really important since this is
the contract between the PHY driver and the Ethernet MAC using it
through the PHY library.

Thanks!
-- 
Florian
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


ip6t_SYNPROXY crashes kernel

2015-07-27 Thread Phil Sutter
Hi,

When synproxy_send_server_ack() calls synproxy_send_tcp(), it passes
NULL as third parameter (struct nf_conntrack *nfct). And the first thing
synproxy_send_tcp() does, is dereference it:

| struct net *net = nf_ct_net((struct nf_conn *)nfct);

I could not find a commit leading to this breakage in the commit log,
which makes me doubt ip6t_SYNPROXY has ever worked at all.

If you need one, I have a reproducer at hand. (Though I would want to
strip it down a bit first.) Just let me know.

Cheers, Phil
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH V3 net-next 0/3] ARM BPF JIT features

2015-07-27 Thread David Miller
From: Nicolas Schichan nschic...@freebox.fr
Date: Mon, 27 Jul 2015 15:06:48 +0200

 This series adds support for more instructions to the ARM BPF JIT
 namely skb netdevice type retrieval, skb payload offset retrieval, and
 skb packet type retrieval.
 
 This allows 35 tests to use the JIT instead of 29 before.
 
 This series depends on the BPF JIT fixes for ARM series sent earlier.

Series applied, thanks.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next 0/4] net/mlx4_en: Hardware accelerated 802.1ad

2015-07-27 Thread David Miller
From: Amir Vadai am...@mellanox.com
Date: Mon, 27 Jul 2015 14:46:30 +0300

 This patchset by Hadar introduces support in Hardware accelerated 802.1ad, for
 ConnectX-3pro NIC's.  In order to support existing deployment, and due to some
 hardware limitations, the feature is disabled by default, and needed to be
 enabled using a private flag in ethtool. Of course user can enable the private
 flag only if hardware has support.
 After being enabled, the standard ethtool -k/-K can be used.
 
 Patchset was applied and tested over commit 71790a2 (hv_netvsc: Add structs
 and handlers for VF messages)

Series applied, thanks.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 2/2] ixgbe: Teardown SR-IOV before unregister_netdev()

2015-07-27 Thread Alex Williamson
When the .remove() callback for a PF is called, SR-IOV support for the
device is disabled, which requires unbinding and removing the VFs.
The VFs may be in-use either by the host kernel or userspace, such as
assigned to a VM through vfio-pci.  In this latter case, the VFs may
be removed either by shutting down the VM or hot-unplugging the
devices from the VM.  Unfortunately in the case of a Windows 2012 R2
guest, hot-unplug is broken due to the ordering of the PF driver
teardown.  Disabling SR-IOV prior to unregister_netdev() avoids this
issue.

Signed-off-by: Alex Williamson alex.william...@redhat.com
---
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c 
b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
index f775123..e27813c 100644
--- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
+++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
@@ -9035,12 +9035,12 @@ static void ixgbe_remove(struct pci_dev *pdev)
/* remove the added san mac */
ixgbe_del_sanmac_netdev(netdev);
 
-   if (netdev-reg_state == NETREG_REGISTERED)
-   unregister_netdev(netdev);
-
 #ifdef CONFIG_PCI_IOV
ixgbe_disable_sriov(adapter);
 #endif
+   if (netdev-reg_state == NETREG_REGISTERED)
+   unregister_netdev(netdev);
+
ixgbe_clear_interrupt_scheme(adapter);
 
ixgbe_release_hw_control(adapter);

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [Intel-wired-lan] [PATCH 2/2] ixgbe: Teardown SR-IOV before unregister_netdev()

2015-07-27 Thread Williams, Mitch A
ACK

 -Original Message-
 From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On
 Behalf Of Alex Williamson
 Sent: Monday, July 27, 2015 4:19 PM
 To: intel-wired-...@lists.osuosl.org; Kirsher, Jeffrey T
 Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org
 Subject: [Intel-wired-lan] [PATCH 2/2] ixgbe: Teardown SR-IOV before
 unregister_netdev()
 
 When the .remove() callback for a PF is called, SR-IOV support for the
 device is disabled, which requires unbinding and removing the VFs.
 The VFs may be in-use either by the host kernel or userspace, such as
 assigned to a VM through vfio-pci.  In this latter case, the VFs may
 be removed either by shutting down the VM or hot-unplugging the
 devices from the VM.  Unfortunately in the case of a Windows 2012 R2
 guest, hot-unplug is broken due to the ordering of the PF driver
 teardown.  Disabling SR-IOV prior to unregister_netdev() avoids this
 issue.
 
 Signed-off-by: Alex Williamson alex.william...@redhat.com
 ---
  drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |6 +++---
  1 file changed, 3 insertions(+), 3 deletions(-)
 
 diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
 b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
 index f775123..e27813c 100644
 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
 +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c
 @@ -9035,12 +9035,12 @@ static void ixgbe_remove(struct pci_dev *pdev)
   /* remove the added san mac */
   ixgbe_del_sanmac_netdev(netdev);
 
 - if (netdev-reg_state == NETREG_REGISTERED)
 - unregister_netdev(netdev);
 -
  #ifdef CONFIG_PCI_IOV
   ixgbe_disable_sriov(adapter);
  #endif
 + if (netdev-reg_state == NETREG_REGISTERED)
 + unregister_netdev(netdev);
 +
   ixgbe_clear_interrupt_scheme(adapter);
 
   ixgbe_release_hw_control(adapter);
 
 ___
 Intel-wired-lan mailing list
 intel-wired-...@lists.osuosl.org
 http://lists.osuosl.org/mailman/listinfo/intel-wired-lan
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


RE: [Intel-wired-lan] [PATCH 1/2] igb: Teardown SR-IOV before unregister_netdev()

2015-07-27 Thread Williams, Mitch A
ACK

 -Original Message-
 From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On
 Behalf Of Alex Williamson
 Sent: Monday, July 27, 2015 4:19 PM
 To: intel-wired-...@lists.osuosl.org; Kirsher, Jeffrey T
 Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org
 Subject: [Intel-wired-lan] [PATCH 1/2] igb: Teardown SR-IOV before
 unregister_netdev()
 
 When the .remove() callback for a PF is called, SR-IOV support for the
 device is disabled, which requires unbinding and removing the VFs.
 The VFs may be in-use either by the host kernel or userspace, such as
 assigned to a VM through vfio-pci.  In this latter case, the VFs may
 be removed either by shutting down the VM or hot-unplugging the
 devices from the VM.  Unfortunately in the case of a Windows 2012 R2
 guest, hot-unplug is broken due to the ordering of the PF driver
 teardown.  Disabling SR-IOV prior to unregister_netdev() avoids this
 issue.
 
 Signed-off-by: Alex Williamson alex.william...@redhat.com
 ---
  drivers/net/ethernet/intel/igb/igb_main.c |8 
  1 file changed, 4 insertions(+), 4 deletions(-)
 
 diff --git a/drivers/net/ethernet/intel/igb/igb_main.c
 b/drivers/net/ethernet/intel/igb/igb_main.c
 index 517746f..606a7ae 100644
 --- a/drivers/net/ethernet/intel/igb/igb_main.c
 +++ b/drivers/net/ethernet/intel/igb/igb_main.c
 @@ -2805,14 +2805,14 @@ static void igb_remove(struct pci_dev *pdev)
*/
   igb_release_hw_control(adapter);
 
 - unregister_netdev(netdev);
 -
 - igb_clear_interrupt_scheme(adapter);
 -
  #ifdef CONFIG_PCI_IOV
   igb_disable_sriov(pdev);
  #endif
 
 + unregister_netdev(netdev);
 +
 + igb_clear_interrupt_scheme(adapter);
 +
   pci_iounmap(pdev, hw-hw_addr);
   if (hw-flash_address)
   iounmap(hw-flash_address);
 
 ___
 Intel-wired-lan mailing list
 intel-wired-...@lists.osuosl.org
 http://lists.osuosl.org/mailman/listinfo/intel-wired-lan
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] packet: Allow packets with only a header (but no payload)

2015-07-27 Thread Martin Blumenstingl
Hi Johann,

On Tue, Jul 21, 2015 at 6:51 PM, Willem de Bruijn will...@google.com wrote:
 I don't see a simple way of verifying the safety of allowing packets
 without data short of a code audit, which would be huge, especially
 when taking device driver logic into account. Perhaps someone
 remembers why that statement was added and what edge case(s)
 it refers to. I'm afraid that I don't. It was added in 69e3c75f4d54. I
 added the author to this thread.
I know it's summer (and thus vacation-time), but did you already have
a chance to look into this?


Regards,
Martin
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch -master] netfilter: xt_CT: checking for IS_ERR() instead of NULL

2015-07-27 Thread Dan Carpenter
We recently changed this from nf_conntrack_alloc() to nf_ct_tmpl_alloc()
so the error handling needs to be changed to check for NULL instead of
IS_ERR().

Fixes: 0838aa7fcfcd ('netfilter: fix netns dependencies with conntrack 
templates')
Signed-off-by: Dan Carpenter dan.carpen...@oracle.com

diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c
index c663003..43ddeee 100644
--- a/net/netfilter/xt_CT.c
+++ b/net/netfilter/xt_CT.c
@@ -202,9 +202,10 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par,
goto err1;
 
ct = nf_ct_tmpl_alloc(par-net, info-zone, GFP_KERNEL);
-   ret = PTR_ERR(ct);
-   if (IS_ERR(ct))
+   if (!ct) {
+   ret = -ENOMEM;
goto err2;
+   }
 
ret = 0;
if ((info-ct_events || info-exp_events) 
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: kernel warning in tcp_fragment

2015-07-27 Thread Martin KaFai Lau
On Wed, Jul 22, 2015 at 11:55:35AM -0700, Jovi Zhangwei wrote:
 Sorry for disturbing; our production systems (3.14 and 3.18 stable
 kernels) have many tcp_fragment warnings;
 the trace is the same as the one below, which you discussed before.
 
 https://urldefense.proofpoint.com/v1/url?u=http://comments.gmane.org/gmane.linux.network/365658k=ZVNjlDMF0FElm4dQtryO4A%3D%3D%0Ar=%2Faj1ZOQObwbmtLwlDw3XzQ%3D%3D%0Am=fQUME5h%2FYY3oZjXbnLC3z6TaEEcTBSCAji4PkNqFjq8%3D%0As=1527f3221a6f31cba9544e5ddaa20986aafe8be8c898b42c7e9ce5e68d3803d8
 
 But I didn't find the final solution in that mail thread; do you have
 any new ideas or patches on this warning?

I think the following points to the last discussion.  We are currently using a
similar patch:
http://comments.gmane.org/gmane.linux.network/366549

Eric, any update on your findings? Or have you already pushed a fix?

Thanks,
--Martin
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[patch] packet: missing dev_put() in packet_do_bind()

2015-07-27 Thread Dan Carpenter
From: Lars Westerhoff lars.westerh...@newtec.eu

When binding a PF_PACKET socket, the use count of the bound interface is
always increased with dev_hold in dev_get_by_{index,name}.  However,
when rebound with the same protocol and device as in the previous bind
the use count of the interface was not decreased.  Ultimately, this
caused the deletion of the interface to fail with the following message:

unregister_netdevice: waiting for dummy0 to become free. Usage count = 1

This patch moves the dev_put out of the conditional part that was only
executed when either the protocol or device changed on a bind.

Fixes: 902fefb82ef7 ('packet: improve socket create/bind latency in some cases')
Signed-off-by: Lars Westerhoff lars.westerh...@newtec.eu
Signed-off-by: Dan Carpenter dan.carpen...@oracle.com
Reviewed-by: Daniel Borkmann dbork...@redhat.com

diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c
index c9e8741..c7c42eb 100644
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -2784,7 +2784,7 @@ static int packet_release(struct socket *sock)
 static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 
proto)
 {
struct packet_sock *po = pkt_sk(sk);
-   const struct net_device *dev_curr;
+   struct net_device *dev_curr;
__be16 proto_curr;
bool need_rehook;
 
@@ -2808,15 +2808,13 @@ static int packet_do_bind(struct sock *sk, struct 
net_device *dev, __be16 proto)
 
po-num = proto;
po-prot_hook.type = proto;
-
-   if (po-prot_hook.dev)
-   dev_put(po-prot_hook.dev);
-
po-prot_hook.dev = dev;
 
po-ifindex = dev ? dev-ifindex : 0;
packet_cached_dev_assign(po, dev);
}
+   if (dev_curr)
+   dev_put(dev_curr);
 
if (proto == 0 || !need_rehook)
goto out_unlock;
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH] netfilter: ipt_SYNPROXY: fix sending window update to client

2015-07-27 Thread Phil Sutter
Upon receipt of SYNACK from the server, ipt_SYNPROXY first sends back an ACK to
finish the server handshake, then calls nf_ct_seqadj_init() to initiate
sequence number adjustment of forwarded packets to the client and finally sends
a window update to the client to unblock its TX queue.

Since synproxy_send_client_ack() does not set synproxy_send_tcp()'s nfct
parameter, no sequence number adjustment happens and the client receives the
window update with incorrect sequence number. Depending on client TCP
implementation, this leads to a significant delay (until a window probe is
being sent).

Signed-off-by: Phil Sutter p...@nwl.cc
---
 net/ipv4/netfilter/ipt_SYNPROXY.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c 
b/net/ipv4/netfilter/ipt_SYNPROXY.c
index fe8cc18..95ea633e 100644
--- a/net/ipv4/netfilter/ipt_SYNPROXY.c
+++ b/net/ipv4/netfilter/ipt_SYNPROXY.c
@@ -226,7 +226,8 @@ synproxy_send_client_ack(const struct synproxy_net *snet,
 
synproxy_build_options(nth, opts);
 
-   synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size);
+   synproxy_send_tcp(skb, nskb, skb-nfct, IP_CT_ESTABLISHED_REPLY,
+ niph, nth, tcp_hdr_size);
 }
 
 static bool
-- 
1.8.3.1

--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net v3] macvtap: fix network header pointer for VLAN tagged pkts

2015-07-27 Thread David Miller
From: Ivan Vecera ivec...@redhat.com
Date: Thu, 23 Jul 2015 16:37:43 +0200

 Network header is set with offset ETH_HLEN but it is not true for VLAN
 (multiple-)tagged and results in checksum issues in lower devices.
 
 v2: leave skb->protocol untouched (thx Vlad), comment added
 v3: moved after skb_probe_transport_header() call (thx Toshiaki)
 
 Signed-off-by: Ivan Vecera ivec...@redhat.com

Applied, thanks.
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH 0/2] igb/ixgbe: Fix ordering of SR-IOV teardown

2015-07-27 Thread Alex Williamson
When running a Windows 2012 R2 guest with a pair of VFs assigned
through vfio-pci, we run into a problem trying to hot-unplug those VFs
after the PF has unregistered the netdev.  This is a common scenario
if the PF is unbound from the driver while VFs are active.  In the
case of igb, the resulting guest behavior differs slightly between the
Microsoft provided and Intel add-on guest drivers.  With the Microsoft
driver, the guest seems to stumble through ejecting both VFs, but
takes longer than normal to do so.  With the Intel drivers, only one
VF is unplugged, but Device Manager still shows it as present.  The
second VF is non-functional but also still shown in Device Manager.
At this point, the guest is in such a state that it will not cleanly
shut down.  With ixgbe VFs, both the Microsoft and Intel drivers take
on this latter behavior.

For both, I've found that disabling SR-IOV before unregistering the PF
netdev device allows the hot-unplug to proceed without interruption or
further ill behavior in the guest.  This is true regardless of which
driver is used.  I don't fully understand what dependency is broken
by unregistering the netdev prior to disabling SR-IOV, but I also
don't see the benefit in delaying SR-IOV teardown in this call path.
It could potentially be moved even earlier, but I'll let those more
familiar with the hardware and code make that determination.  In any
case, the VM behavior is substantially improved by this slight
re-ordering.

I don't have an i40e for testing, but it already appears to disable
SR-IOV much earlier in the unbind path, so I wouldn't expect to find
similar issues.  Thanks,

Alex

---

Alex Williamson (2):
  igb: Teardown SR-IOV before unregister_netdev()
  ixgbe: Teardown SR-IOV before unregister_netdev()


 drivers/net/ethernet/intel/igb/igb_main.c |8 
 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |6 +++---
 2 files changed, 7 insertions(+), 7 deletions(-)
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH net-next v2 1/2] ipv6: Re-arrange code in rt6_probe()

2015-07-27 Thread YOSHIFUJI Hideaki/吉藤英明
Martin KaFai Lau wrote:
 It is prep work for the next patch to remove write_lock
 from rt6_probe().
 
 1. Reduce the number of if(neigh) checks.  From 4 to 1.
 2. Bring the write_(un)lock() closer to the operations that the
lock is protecting.
 
 Hopefully, the above makes rt6_probe() more readable.
 
 Signed-off-by: Martin KaFai Lau ka...@fb.com
 Cc: Hannes Frederic Sowa han...@stressinduktion.org
 Cc: Julian Anastasov j...@ssi.bg
 Cc: YOSHIFUJI Hideaki hideaki.yoshif...@miraclelinux.com

Acked-by: YOSHIFUJI Hideaki hideaki.yoshif...@miraclelinux.com

--yoshfuji

 ---
  net/ipv6/route.c | 44 
  1 file changed, 20 insertions(+), 24 deletions(-)
 
 diff --git a/net/ipv6/route.c b/net/ipv6/route.c
 index 7f2214f..6d503db 100644
 --- a/net/ipv6/route.c
 +++ b/net/ipv6/route.c
 @@ -545,6 +545,7 @@ static void rt6_probe_deferred(struct work_struct *w)
  
  static void rt6_probe(struct rt6_info *rt)
  {
 + struct __rt6_probe_work *work;
   struct neighbour *neigh;
   /*
* Okay, this does not seem to be appropriate
 @@ -559,34 +560,29 @@ static void rt6_probe(struct rt6_info *rt)
   rcu_read_lock_bh();
   neigh = __ipv6_neigh_lookup_noref(rt-dst.dev, rt-rt6i_gateway);
   if (neigh) {
 + work = NULL;
   write_lock(neigh-lock);
 - if (neigh-nud_state  NUD_VALID)
 - goto out;
 - }
 -
 - if (!neigh ||
 - time_after(jiffies, neigh-updated + 
 rt-rt6i_idev-cnf.rtr_probe_interval)) {
 - struct __rt6_probe_work *work;
 -
 - work = kmalloc(sizeof(*work), GFP_ATOMIC);
 -
 - if (neigh  work)
 - __neigh_set_probe_once(neigh);
 -
 - if (neigh)
 - write_unlock(neigh-lock);
 -
 - if (work) {
 - INIT_WORK(work-work, rt6_probe_deferred);
 - work-target = rt-rt6i_gateway;
 - dev_hold(rt-dst.dev);
 - work-dev = rt-dst.dev;
 - schedule_work(work-work);
 + if (!(neigh-nud_state  NUD_VALID) 
 + time_after(jiffies,
 +neigh-updated +
 +rt-rt6i_idev-cnf.rtr_probe_interval)) {
 + work = kmalloc(sizeof(*work), GFP_ATOMIC);
 + if (work)
 + __neigh_set_probe_once(neigh);
   }
 - } else {
 -out:
   write_unlock(neigh-lock);
 + } else {
 + work = kmalloc(sizeof(*work), GFP_ATOMIC);
 + }
 +
 + if (work) {
 + INIT_WORK(work-work, rt6_probe_deferred);
 + work-target = rt-rt6i_gateway;
 + dev_hold(rt-dst.dev);
 + work-dev = rt-dst.dev;
 + schedule_work(work-work);
   }
 +
   rcu_read_unlock_bh();
  }
  #else
 

-- 
吉藤英明 hideaki.yoshif...@miraclelinux.com
ミラクル・リナックス株式会社 技術本部 サポート部
--
To unsubscribe from this list: send the line unsubscribe netdev in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


  1   2   >