[PATCH v2] rsi: Fix failure to load firmware after memory leak fix and fix the leak
Fixes commit eae79b4f3e82 (rsi: fix memory leak in rsi_load_ta_instructions()) which stopped the driver from functioning. Firmware data has been allocated using vmalloc(), resulting in memory that cannot be used for DMA. Hence the firmware was first copied to a buffer allocated with kmalloc() in the original code. This patch reverts the commit and only calls kfree() to release the buffer after sending the data. This fixes the memory leak without breaking the driver. Add a comment to the kmemdup() calls to explain why this is done, and abort if memory allocation fails. Tested on a Topic Miami-Florida board which contains the rsi SDIO chip. Also added the same kfree() call to the USB glue driver. This was not tested on actual hardware though, as I only have the SDIO version. Fixes: eae79b4f3e82 (rsi: fix memory leak in rsi_load_ta_instructions()) Signed-off-by: Mike Looijmans mike.looijm...@topic.nl Cc: sta...@vger.kernel.org --- v2: Add Fixes: header and abbreviate git hashes. Return -ENOMEM if kmemdup() fails. 
drivers/net/wireless/rsi/rsi_91x_sdio_ops.c | 8 +++- drivers/net/wireless/rsi/rsi_91x_usb_ops.c | 4 2 files changed, 11 insertions(+), 1 deletion(-) diff --git a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c index b6cc9ff..1c6788a 100644 --- a/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c +++ b/drivers/net/wireless/rsi/rsi_91x_sdio_ops.c @@ -172,6 +172,7 @@ static int rsi_load_ta_instructions(struct rsi_common *common) (struct rsi_91x_sdiodev *)adapter-rsi_dev; u32 len; u32 num_blocks; + const u8 *fw; const struct firmware *fw_entry = NULL; u32 block_size = dev-tx_blk_size; int status = 0; @@ -200,6 +201,10 @@ static int rsi_load_ta_instructions(struct rsi_common *common) return status; } + /* Copy firmware into DMA-accessible memory */ + fw = kmemdup(fw_entry-data, fw_entry-size, GFP_KERNEL); + if (!fw) + return -ENOMEM; len = fw_entry-size; if (len % 4) @@ -210,7 +215,8 @@ static int rsi_load_ta_instructions(struct rsi_common *common) rsi_dbg(INIT_ZONE, %s: Instruction size:%d\n, __func__, len); rsi_dbg(INIT_ZONE, %s: num blocks: %d\n, __func__, num_blocks); - status = rsi_copy_to_card(common, fw_entry-data, len, num_blocks); + status = rsi_copy_to_card(common, fw, len, num_blocks); + kfree(fw); release_firmware(fw_entry); return status; } diff --git a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c index 1106ce7..30c2cf7 100644 --- a/drivers/net/wireless/rsi/rsi_91x_usb_ops.c +++ b/drivers/net/wireless/rsi/rsi_91x_usb_ops.c @@ -146,7 +146,10 @@ static int rsi_load_ta_instructions(struct rsi_common *common) return status; } + /* Copy firmware into DMA-accessible memory */ fw = kmemdup(fw_entry-data, fw_entry-size, GFP_KERNEL); + if (!fw) + return -ENOMEM; len = fw_entry-size; if (len % 4) @@ -158,6 +161,7 @@ static int rsi_load_ta_instructions(struct rsi_common *common) rsi_dbg(INIT_ZONE, %s: num blocks: %d\n, __func__, num_blocks); status = rsi_copy_to_card(common, fw, len, 
num_blocks); + kfree(fw); release_firmware(fw_entry); return status; } -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [net:master 41/49] drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel passed 3 arguments, but takes just 2
On Mon, 2015-07-27 at 17:03 +0800, kbuild test robot wrote: tree: git://git.kernel.org/pub/scm/linux/kernel/git/davem/net.git master head: 8fff755e9f8d0f70a595e79f248695ce6aef5cc3 commit: f2ce8a9e48385f444389e75cfe293637c3eb5410 [41/49] net/macb: improve big endian CPU support config: arm-at91_dt_defconfig (attached as .config) reproduce: wget https://git.kernel.org/cgit/linux/kernel/git/wfg/lkp -tests.git/plain/sbin/make.cross -O ~/bin/make.cross chmod +x ~/bin/make.cross git checkout f2ce8a9e48385f444389e75cfe293637c3eb5410 # save the attached .config to linux build tree make.cross ARCH=arm Oh, no. I do use compiler from Debian for AVR32, didn't check this on other architectures. Possible something like following will fix it: --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -429,12 +429,12 @@ | GEM_BF(name, value)) /* Register access macros */ -#define macb_readl(port, reg) (port)-readl((port), MACB_##reg) -#define macb_writel(port, reg, value) (port)-writel((port), MACB_##reg, (value)) -#define gem_readl(port, reg) (port)-readl((port), GEM_##reg) -#define gem_writel(port, reg, value) (port)-writel((port), GEM_##reg, (value)) -#define queue_readl(queue, reg)(queue)-bp -readl((queue)-bp, (queue)-reg) -#define queue_writel(queue, reg, value)(queue)-bp -writel((queue)-bp, (queue)-reg, (value)) +#define macb_readl(port, reg) port-readl(port, MACB_##reg) +#define macb_writel(port, reg, value) port-writel(port, MACB_##reg, (value)) +#define gem_readl(port, reg) port-readl(port, GEM_##reg) +#define gem_writel(port, reg, value) port-writel(port, GEM_##reg, (value)) +#define queue_readl(queue, reg)queue-bp-readl(queue -bp, queue-reg) +#define queue_writel(queue, reg, value)queue-bp-writel(queue -bp, queue-reg, (value)) /* Conditional GEM/MACB macros. These perform the operation to the correct * register dependent on whether the device is a GEM or a MACB. 
For registers All error/warnings (new ones prefixed by ): drivers/net/ethernet/cadence/macb.c: In function 'macb_set_hwaddr': drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel passed 3 arguments, but takes just 2 macb_or_gem_writel(bp, SA1B, bottom); ^ In file included from drivers/net/ethernet/cadence/macb.c:34:0: drivers/net/ethernet/cadence/macb.h:435:38: warning: statement with no effect [-Wunused-value] #define gem_writel(port, reg, value) (port)-writel((port), GEM_##reg, (value)) ^ drivers/net/ethernet/cadence/macb.h:447:4: note: in expansion of macro 'gem_writel' gem_writel((__bp), __reg, __value); \ ^ drivers/net/ethernet/cadence/macb.c:164:2: note: in expansion of macro 'macb_or_gem_writel' macb_or_gem_writel(bp, SA1B, bottom); ^ drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel passed 3 arguments, but takes just 2 macb_or_gem_writel(bp, SA1B, bottom); ^ In file included from drivers/net/ethernet/cadence/macb.c:34:0: drivers/net/ethernet/cadence/macb.h:433:39: warning: statement with no effect [-Wunused-value] #define macb_writel(port, reg, value) (port)-writel((port), MACB_##reg, (value)) ^ drivers/net/ethernet/cadence/macb.h:449:4: note: in expansion of macro 'macb_writel' macb_writel((__bp), __reg, __value); \ ^ drivers/net/ethernet/cadence/macb.c:164:2: note: in expansion of macro 'macb_or_gem_writel' macb_or_gem_writel(bp, SA1B, bottom); ^ drivers/net/ethernet/cadence/macb.c:166:1: error: macro writel passed 3 arguments, but takes just 2 macb_or_gem_writel(bp, SA1T, top); ^ In file included from drivers/net/ethernet/cadence/macb.c:34:0: drivers/net/ethernet/cadence/macb.h:435:38: warning: statement with no effect [-Wunused-value] #define gem_writel(port, reg, value) (port)-writel((port), GEM_##reg, (value)) ^ drivers/net/ethernet/cadence/macb.h:447:4: note: in expansion of macro 'gem_writel' gem_writel((__bp), __reg, __value); \ ^ drivers/net/ethernet/cadence/macb.c:166:2: note: in expansion of macro 'macb_or_gem_writel' 
macb_or_gem_writel(bp, SA1T, top); ^ drivers/net/ethernet/cadence/macb.c:166:1: error: macro writel passed 3 arguments, but takes just 2 macb_or_gem_writel(bp, SA1T, top); ^ In file included from drivers/net/ethernet/cadence/macb.c:34:0: drivers/net/ethernet/cadence/macb.h:433:39: warning: statement with no effect [-Wunused-value] #define macb_writel(port, reg, value) (port)-writel((port), MACB_##reg, (value)) ^ drivers/net/ethernet/cadence/macb.h:449:4: note: in expansion of macro
[PATCH V3 net-next 3/3] ARM: net: add support for BPF_ANC | SKF_AD_HATYPE in ARM JIT.
Signed-off-by: Nicolas Schichan nschic...@freebox.fr --- arch/arm/net/bpf_jit_32.c | 22 -- arch/arm/net/bpf_jit_32.h | 3 +++ 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 3c73caf..876060b 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -857,7 +857,9 @@ b_epilogue: emit(ARM_LDR_I(r_A, r_scratch, off), ctx); break; case BPF_ANC | SKF_AD_IFINDEX: + case BPF_ANC | SKF_AD_HATYPE: /* A = skb-dev-ifindex */ + /* A = skb-dev-type */ ctx-seen |= SEEN_SKB; off = offsetof(struct sk_buff, dev); emit(ARM_LDR_I(r_scratch, r_skb, off), ctx); @@ -867,8 +869,24 @@ b_epilogue: BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, ifindex) != 4); - off = offsetof(struct net_device, ifindex); - emit(ARM_LDR_I(r_A, r_scratch, off), ctx); + BUILD_BUG_ON(FIELD_SIZEOF(struct net_device, + type) != 2); + + if (code == (BPF_ANC | SKF_AD_IFINDEX)) { + off = offsetof(struct net_device, ifindex); + emit(ARM_LDR_I(r_A, r_scratch, off), ctx); + } else { + /* +* offset of field type in struct +* net_device is above what can be +* used in the ldrh rd, [rn, #imm] +* instruction, so load the offset in +* a register and use ldrh rd, [rn, rm] +*/ + off = offsetof(struct net_device, type); + emit_mov_i(ARM_R3, off, ctx); + emit(ARM_LDRH_R(r_A, r_scratch, ARM_R3), ctx); + } break; case BPF_ANC | SKF_AD_MARK: ctx-seen |= SEEN_SKB; diff --git a/arch/arm/net/bpf_jit_32.h b/arch/arm/net/bpf_jit_32.h index b2d7d92..4b17d5ab 100644 --- a/arch/arm/net/bpf_jit_32.h +++ b/arch/arm/net/bpf_jit_32.h @@ -74,6 +74,7 @@ #define ARM_INST_LDRB_I0x05d0 #define ARM_INST_LDRB_R0x07d0 #define ARM_INST_LDRH_I0x01d000b0 +#define ARM_INST_LDRH_R0x019000b0 #define ARM_INST_LDR_I 0x0590 #define ARM_INST_LDM 0x0890 @@ -160,6 +161,8 @@ | (rm)) #define ARM_LDRH_I(rt, rn, off)(ARM_INST_LDRH_I | (rt) 12 | (rn) 16 \ | (((off) 0xf0) 4) | ((off) 0xf)) +#define ARM_LDRH_R(rt, rn, rm) (ARM_INST_LDRH_R | (rt) 12 | (rn) 16 \ +| (rm)) #define 
ARM_LDM(rn, regs) (ARM_INST_LDM | (rn) 16 | (regs)) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3 net-next 0/3] ARM BPF JIT features
Hello, This series adds support for more instructions to the ARM BPF JIT, namely skb netdevice type retrieval, skb payload offset retrieval, and skb packet type retrieval. This allows 35 tests to use the JIT instead of 29 before. This series depends on the BPF JIT fixes for ARM series sent earlier. Regards, Changes from V1 to V2: * split fixes and features in separate patch series. Changes from V2 to V3: * respin against latest net-next. Nicolas Schichan (3): ARM: net: add support for BPF_ANC | SKF_AD_PKTTYPE in ARM JIT. ARM: net: add support for BPF_ANC | SKF_AD_PAY_OFFSET in ARM JIT. ARM: net: add support for BPF_ANC | SKF_AD_HATYPE in ARM JIT. arch/arm/net/bpf_jit_32.c | 41 +++-- arch/arm/net/bpf_jit_32.h | 3 +++ 2 files changed, 42 insertions(+), 2 deletions(-) -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] MIPS: Remove most of the custom gpio.h
On Thu, Jul 23, 2015 at 8:25 PM, Lars-Peter Clausen l...@metafoo.de wrote: On 07/22/2015 07:33 PM, Alban Bedel wrote: diff --git a/arch/mips/jz4740/gpio.c b/arch/mips/jz4740/gpio.c index 54c80d4..3dc500c 100644 --- a/arch/mips/jz4740/gpio.c +++ b/arch/mips/jz4740/gpio.c @@ -262,18 +262,6 @@ uint32_t jz_gpio_port_get_value(int port, uint32_t mask) } EXPORT_SYMBOL(jz_gpio_port_get_value); -int gpio_to_irq(unsigned gpio) -{ - return JZ4740_IRQ_GPIO(0) + gpio; -} -EXPORT_SYMBOL_GPL(gpio_to_irq); This need to be hooked up the gpio_to_irq() callback of the gpio_chip struct of this driver rather than completely removing it. Otherwise this functionality will be broken. Similar for other platforms which implement the function. Even better is to see if we can convert the driver to GPIOLIB_IRQCHIP which moves the handling of IRQ mapping to the gpiolib core. This works for all simple cascading GPIO-with-IRQ controllers with a local mask register. (Not when the system intcon and GPIO is mashed up though.) But no hurry with that. Yours, Linus Walleij -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH net] sctp: ASCONF-ACK with Unresolvable Address should be sent
On Sat, Jul 25, 2015 at 01:08:08PM +0800, Xin Long wrote: RFC 5061: This is an opaque integer assigned by the sender to identify each request parameter. The receiver of the ASCONF Chunk will copy this 32-bit value into the ASCONF Response Correlation ID field of the ASCONF-ACK response parameter. The sender of the ASCONF can use this same value in the ASCONF-ACK to find which request the response is for. Note that the receiver MUST NOT change this 32-bit value. Address Parameter: TLV This field contains an IPv4 or IPv6 address parameter, as described in Section 3.3.2.1 of [RFC4960]. ASCONF chunk with Error Cause Indication Parameter (Unresolvable Address) should be sent if the Delete IP Address is not part of the association. Endpoint A Endpoint B (ESTABLISHED)(ESTABLISHED) ASCONF- (Delete IP Address) - ASCONF-ACK (Unresolvable Address) Signed-off-by: Xin Long lucien@gmail.com --- net/sctp/sm_make_chunk.c | 15 +-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/net/sctp/sm_make_chunk.c b/net/sctp/sm_make_chunk.c index 06320c8..6e399f6 100644 --- a/net/sctp/sm_make_chunk.c +++ b/net/sctp/sm_make_chunk.c @@ -3090,8 +3090,19 @@ static __be16 sctp_process_asconf_param(struct sctp_association *asoc, sctp_assoc_set_primary(asoc, asconf-transport); sctp_assoc_del_nonprimary_peers(asoc, asconf-transport); - } else - sctp_assoc_del_peer(asoc, addr); + return SCTP_ERROR_NO_ERROR; + } + + /* If the address is not part of the association, the + * ASCONF-ACK with Error Cause Indication Parameter + * which including cause of Unresolvable Address should + * be sent. + */ + peer = sctp_assoc_lookup_paddr(asoc, addr); + if (!peer) + return SCTP_ERROR_DNS_FAILED; + + sctp_assoc_rm_peer(asoc, peer); break; case SCTP_PARAM_SET_PRIMARY: /* ADDIP Section 4.2.4 -- 2.1.0 Looks good to me. 
Marcelo -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Several races in usbnet module (kernel 4.1.x)
On Fri, 2015-07-24 at 20:38 +0300, Eugene Shatokhin wrote: 21.07.2015 15:04, Oliver Neukum пишет: your analysis is correct and it looks like in addition to your proposed fix locking needs to be simplified and a common lock to be taken. Suggestions? Just an idea, I haven't tested it. How about moving the operations with dev-done under list-lock in defer_bh, while keeping dev-done.lock too and changing Why keep dev-done.lock? Does it make sense at all? usbnet_terminate_urbs() as described below? Like this: @@ -428,12 +428,12 @@ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb, old_state = entry-state; entry-state = state; __skb_unlink(skb, list); - spin_unlock(list-lock); spin_lock(dev-done.lock); __skb_queue_tail(dev-done, skb); if (dev-done.qlen == 1) tasklet_schedule(dev-bh); - spin_unlock_irqrestore(dev-done.lock, flags); + spin_unlock(dev-done.lock); + spin_unlock_irqrestore(list-lock, flags); return old_state; } --- usbnet_terminate_urbs() can then be changed as follows: @@ -749,6 +749,20 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); /*-*/ +static void wait_skb_queue_empty(struct sk_buff_head *q) +{ + unsigned long flags; + + spin_lock_irqsave(q-lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(q-lock, flags); + schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); I suppose you want to invert those lines + spin_lock_irqsave(q-lock, flags); + } + spin_unlock_irqrestore(q-lock, flags); +} + Your changes make sense, but it locks to me as if a lock would become totally redundant. Regards Oliver -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3 net-next 1/3] ARM: net: add support for BPF_ANC | SKF_AD_PKTTYPE in ARM JIT.
Signed-off-by: Nicolas Schichan nschic...@freebox.fr --- arch/arm/net/bpf_jit_32.c | 11 +++ 1 file changed, 11 insertions(+) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index c011e22..6ff248c 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -895,6 +895,17 @@ b_epilogue: OP_IMM3(ARM_AND, r_A, r_A, 0x1, ctx); } break; + case BPF_ANC | SKF_AD_PKTTYPE: + ctx-seen |= SEEN_SKB; + BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, + __pkt_type_offset[0]) != 1); + off = PKT_TYPE_OFFSET(); + emit(ARM_LDRB_I(r_A, r_skb, off), ctx); + emit(ARM_AND_I(r_A, r_A, PKT_TYPE_MAX), ctx); +#ifdef __BIG_ENDIAN_BITFIELD + emit(ARM_LSR_I(r_A, r_A, 5), ctx); +#endif + break; case BPF_ANC | SKF_AD_QUEUE: ctx-seen |= SEEN_SKB; BUILD_BUG_ON(FIELD_SIZEOF(struct sk_buff, -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH V3 net-next 2/3] ARM: net: add support for BPF_ANC | SKF_AD_PAY_OFFSET in ARM JIT.
Signed-off-by: Nicolas Schichan nschic...@freebox.fr --- arch/arm/net/bpf_jit_32.c | 8 1 file changed, 8 insertions(+) diff --git a/arch/arm/net/bpf_jit_32.c b/arch/arm/net/bpf_jit_32.c index 6ff248c..3c73caf 100644 --- a/arch/arm/net/bpf_jit_32.c +++ b/arch/arm/net/bpf_jit_32.c @@ -915,6 +915,14 @@ b_epilogue: off = offsetof(struct sk_buff, queue_mapping); emit(ARM_LDRH_I(r_A, r_skb, off), ctx); break; + case BPF_ANC | SKF_AD_PAY_OFFSET: + ctx-seen |= SEEN_SKB | SEEN_CALL; + + emit(ARM_MOV_R(ARM_R0, r_skb), ctx); + emit_mov_i(ARM_R3, (unsigned int)skb_get_poff, ctx); + emit_blx_r(ARM_R3, ctx); + emit(ARM_MOV_R(r_A, ARM_R0), ctx); + break; case BPF_LDX | BPF_W | BPF_ABS: /* * load a 32bit word from struct seccomp_data. -- 1.9.1 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Several races in usbnet module (kernel 4.1.x)
27.07.2015 15:29, Oliver Neukum пишет: On Fri, 2015-07-24 at 20:38 +0300, Eugene Shatokhin wrote: 21.07.2015 15:04, Oliver Neukum пишет: your analysis is correct and it looks like in addition to your proposed fix locking needs to be simplified and a common lock to be taken. Suggestions? Just an idea, I haven't tested it. How about moving the operations with dev-done under list-lock in defer_bh, while keeping dev-done.lock too and changing Why keep dev-done.lock? Does it make sense at all? I think it does. Both skb_queue_tail(dev-done, skb) called from rx_process() and skb_dequeue (dev-done) called from usbnet_bh() take dev-done.lock internally. So, to synchronize accesses to dev-done, one needs that lock in defer_bh() too. usbnet_terminate_urbs() as described below? Like this: @@ -428,12 +428,12 @@ static enum skb_state defer_bh(struct usbnet *dev, struct sk_buff *skb, old_state = entry-state; entry-state = state; __skb_unlink(skb, list); - spin_unlock(list-lock); spin_lock(dev-done.lock); __skb_queue_tail(dev-done, skb); if (dev-done.qlen == 1) tasklet_schedule(dev-bh); - spin_unlock_irqrestore(dev-done.lock, flags); + spin_unlock(dev-done.lock); + spin_unlock_irqrestore(list-lock, flags); return old_state; } --- usbnet_terminate_urbs() can then be changed as follows: @@ -749,6 +749,20 @@ EXPORT_SYMBOL_GPL(usbnet_unlink_rx_urbs); /*-*/ +static void wait_skb_queue_empty(struct sk_buff_head *q) +{ + unsigned long flags; + + spin_lock_irqsave(q-lock, flags); + while (!skb_queue_empty(q)) { + spin_unlock_irqrestore(q-lock, flags); + schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); + set_current_state(TASK_UNINTERRUPTIBLE); I suppose you want to invert those lines Do you mean +set_current_state(TASK_UNINTERRUPTIBLE); +schedule_timeout(msecs_to_jiffies(UNLINK_TIMEOUT_MS)); ? + spin_lock_irqsave(q-lock, flags); + } + spin_unlock_irqrestore(q-lock, flags); +} + Your changes make sense, but it locks to me as if a lock would become totally redundant. 
Regards, Eugene -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH v2] ravb: minimize TX data copying
On 7/27/2015 11:47 AM, David Laight wrote: Renesas Ethernet AVB controller requires that all data are aligned on 4-byte boundary. While it's easily achievable for the RX data with the help of skb_reserve() (we even align on 128-byte boundary as recommended by the manual), we can't do the same with the TX data, and it always comes unaligned from the networking core. Originally we solved it an easy way, copying all packet to a preallocated aligned buffer; however, it's enough to copy only up to 3 first bytes from each packet, doing the transfer using 2 TX descriptors instead of just 1. Here's an implementation of the new TX algorithm that significantly reduces the driver's memory requirements. ... - buffer = PTR_ALIGN(priv-tx_buffers[q][entry], RAVB_ALIGN); - memcpy(buffer, skb-data, skb-len); - desc = priv-tx_ring[q][entry]; - desc-ds_tagl = cpu_to_le16(skb-len); - dma_addr = dma_map_single(ndev-dev, buffer, skb-len, DMA_TO_DEVICE); + buffer = PTR_ALIGN(priv-tx_align[q], DPTR_ALIGN) + +entry / NUM_TX_DESC * DPTR_ALIGN; The above would be clearer if tx_align was char[DPTR_ALIGN][]. tx_align is a pointer, not an array. + len = PTR_ALIGN(skb-data, DPTR_ALIGN) - skb-data; + memcpy(buffer, skb-data, len); Does this imply there has been an skb_linearize() ??? Sure, I don't support S/G (and it seems problematic given how the DMA descriptors are handled by the h/w). The old version didn't really need it (it was doing a copy anyway). It did since it copied the whole packet. 
+ dma_addr = dma_map_single(ndev-dev, buffer, len, DMA_TO_DEVICE); if (dma_mapping_error(ndev-dev, dma_addr)) goto drop; + + desc = priv-tx_ring[q][entry]; + desc-ds_tagl = cpu_to_le16(len); + desc-dptr = cpu_to_le32(dma_addr); + + buffer = skb-data + len; + len = skb-len - len; + dma_addr = dma_map_single(ndev-dev, buffer, len, DMA_TO_DEVICE); + if (dma_mapping_error(ndev-dev, dma_addr)) + goto unmap; + + desc++; + desc-ds_tagl = cpu_to_le16(len); What happens if a fragment is less than DPTR_ALIGN bytes ??? It's always the case. If you mean a packet shorter than DPTR_ALIGN, it can happen due to call to skb_put_padto(skb, ETH_ZLEN). Actually is looks like you relying on having a linear skb. Yes, and I was relying on it even before this patch. David WBR, Sergei -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[net-next PATCH 2/2] drivers: net: cpsw: add separate napi for tx packet handling for performance improvement
Instead of processing tx events in ISR itself, moving the tx event processing to a separate napi improves tx performance by 180 Mbps with omap2plus_defconfig. Also cleaning up rx napis by renaming to napi_rx for better understanding the code. Signed-off-by: Mugunthan V N mugunthan...@ti.com --- drivers/net/ethernet/ti/cpsw.c | 61 -- 1 file changed, 47 insertions(+), 14 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index d68d759..4f98537 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -365,7 +365,8 @@ struct cpsw_priv { spinlock_t lock; struct platform_device *pdev; struct net_device *ndev; - struct napi_struct napi; + struct napi_struct napi_rx; + struct napi_struct napi_tx; struct device *dev; struct cpsw_platform_data data; struct cpsw_ss_regs __iomem *regs; @@ -752,13 +753,22 @@ static irqreturn_t cpsw_tx_interrupt(int irq, void *dev_id) struct cpsw_priv *priv = dev_id; cpdma_ctlr_eoi(priv-dma, CPDMA_EOI_TX); - cpdma_chan_process(priv-txch, 128); + writel(0, priv-wr_regs-tx_en); + + if (netif_running(priv-ndev)) { + napi_schedule(priv-napi_tx); + return IRQ_HANDLED; + } priv = cpsw_get_slave_priv(priv, 1); - if (priv) - cpdma_chan_process(priv-txch, 128); + if (!priv) + return IRQ_NONE; - return IRQ_HANDLED; + if (netif_running(priv-ndev)) { + napi_schedule(priv-napi_tx); + return IRQ_HANDLED; + } + return IRQ_NONE; } static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id) @@ -769,7 +779,7 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id) writel(0, priv-wr_regs-rx_en); if (netif_running(priv-ndev)) { - napi_schedule(priv-napi); + napi_schedule(priv-napi_rx); return IRQ_HANDLED; } @@ -778,20 +788,37 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id) return IRQ_NONE; if (netif_running(priv-ndev)) { - napi_schedule(priv-napi); + napi_schedule(priv-napi_rx); return IRQ_HANDLED; } return IRQ_NONE; } -static int cpsw_poll(struct napi_struct *napi, int budget) 
+static int cpsw_tx_poll(struct napi_struct *napi_tx, int budget) +{ + struct cpsw_priv*priv = napi_to_priv(napi_tx); + int num_tx; + + num_tx = cpdma_chan_process(priv-txch, budget); + if (num_tx budget) { + napi_complete(napi_tx); + writel(0xff, priv-wr_regs-tx_en); + } + + if (num_tx) + cpsw_dbg(priv, intr, poll %d tx pkts\n, num_tx); + + return num_tx; +} + +static int cpsw_rx_poll(struct napi_struct *napi_rx, int budget) { - struct cpsw_priv*priv = napi_to_priv(napi); + struct cpsw_priv*priv = napi_to_priv(napi_rx); int num_rx; num_rx = cpdma_chan_process(priv-rxch, budget); if (num_rx budget) { - napi_complete(napi); + napi_complete(napi_rx); writel(0xff, priv-wr_regs-rx_en); } @@ -1297,7 +1324,8 @@ static int cpsw_ndo_open(struct net_device *ndev) cpsw_set_coalesce(ndev, coal); } - napi_enable(priv-napi); + napi_enable(priv-napi_rx); + napi_enable(priv-napi_tx); cpdma_ctlr_start(priv-dma); cpsw_intr_enable(priv); @@ -1319,7 +1347,8 @@ static int cpsw_ndo_stop(struct net_device *ndev) cpsw_info(priv, ifdown, shutting down cpsw device\n); netif_stop_queue(priv-ndev); - napi_disable(priv-napi); + napi_disable(priv-napi_rx); + napi_disable(priv-napi_tx); netif_carrier_off(priv-ndev); if (cpsw_common_res_usage_state(priv) = 1) { @@ -2105,7 +2134,10 @@ static int cpsw_probe_dual_emac(struct platform_device *pdev, ndev-netdev_ops = cpsw_netdev_ops; ndev-ethtool_ops = cpsw_ethtool_ops; - netif_napi_add(ndev, priv_sl2-napi, cpsw_poll, CPSW_POLL_WEIGHT); + netif_napi_add(ndev, priv_sl2-napi_rx, cpsw_rx_poll, + CPSW_POLL_WEIGHT); + netif_napi_add(ndev, priv_sl2-napi_tx, cpsw_tx_poll, + CPSW_POLL_WEIGHT); /* register the network device */ SET_NETDEV_DEV(ndev, pdev-dev); @@ -2357,7 +2389,8 @@ static int cpsw_probe(struct platform_device *pdev) ndev-netdev_ops = cpsw_netdev_ops; ndev-ethtool_ops = cpsw_ethtool_ops; - netif_napi_add(ndev, priv-napi, cpsw_poll, CPSW_POLL_WEIGHT); + netif_napi_add(ndev, priv-napi_rx,
[net-next PATCH 1/2] drivers: net: cpsw: remove disable_irq/enable_irq as irq can be masked from cpsw itself
CPSW interrupts can be disabled by masking CPSW interrupts and clearing interrupt by writing appropriate EOI. So removing all disable_irq/enable_irq as discussed in [1] [1] http://patchwork.ozlabs.org/patch/492741/ Signed-off-by: Mugunthan V N mugunthan...@ti.com --- drivers/net/ethernet/ti/cpsw.c | 27 ++- 1 file changed, 2 insertions(+), 25 deletions(-) diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index d155bf2..d68d759 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -389,7 +389,6 @@ struct cpsw_priv { /* snapshot of IRQ numbers */ u32 irqs_table[4]; u32 num_irqs; - bool irq_enabled; struct cpts *cpts; u32 emac_port; }; @@ -767,12 +766,7 @@ static irqreturn_t cpsw_rx_interrupt(int irq, void *dev_id) struct cpsw_priv *priv = dev_id; cpdma_ctlr_eoi(priv-dma, CPDMA_EOI_RX); - - cpsw_intr_disable(priv); - if (priv-irq_enabled == true) { - disable_irq_nosync(priv-irqs_table[0]); - priv-irq_enabled = false; - } + writel(0, priv-wr_regs-rx_en); if (netif_running(priv-ndev)) { napi_schedule(priv-napi); @@ -797,15 +791,8 @@ static int cpsw_poll(struct napi_struct *napi, int budget) num_rx = cpdma_chan_process(priv-rxch, budget); if (num_rx budget) { - struct cpsw_priv *prim_cpsw; - napi_complete(napi); - cpsw_intr_enable(priv); - prim_cpsw = cpsw_get_slave_priv(priv, 0); - if (prim_cpsw-irq_enabled == false) { - prim_cpsw-irq_enabled = true; - enable_irq(priv-irqs_table[0]); - } + writel(0xff, priv-wr_regs-rx_en); } if (num_rx) @@ -1230,7 +1217,6 @@ static void cpsw_slave_stop(struct cpsw_slave *slave, struct cpsw_priv *priv) static int cpsw_ndo_open(struct net_device *ndev) { struct cpsw_priv *priv = netdev_priv(ndev); - struct cpsw_priv *prim_cpsw; int i, ret; u32 reg; @@ -1315,14 +1301,6 @@ static int cpsw_ndo_open(struct net_device *ndev) cpdma_ctlr_start(priv-dma); cpsw_intr_enable(priv); - prim_cpsw = cpsw_get_slave_priv(priv, 0); - if (prim_cpsw-irq_enabled == false) { - if ((priv == prim_cpsw) || 
!netif_running(prim_cpsw-ndev)) { - prim_cpsw-irq_enabled = true; - enable_irq(prim_cpsw-irqs_table[0]); - } - } - if (priv-data.dual_emac) priv-slaves[priv-emac_port].open_stat = true; return 0; @@ -2169,7 +2147,6 @@ static int cpsw_probe(struct platform_device *pdev) priv-msg_enable = netif_msg_init(debug_level, CPSW_DEBUG); priv-rx_packet_max = max(rx_packet_max, 128); priv-cpts = devm_kzalloc(pdev-dev, sizeof(struct cpts), GFP_KERNEL); - priv-irq_enabled = true; if (!priv-cpts) { dev_err(pdev-dev, error allocating cpts\n); ret = -ENOMEM; -- 2.5.0.rc3.2.g6f9504c -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[net-next PATCH 0/2] CPSW interrupt handling cleanup and performance improvement
This patch series removes the disable_irq/enable_irq calls on the interrupt controller and adds a separate napi for tx event handling, which improves the performance by 180Mbps on dra7-evm [ 5] local 192.168.10.116 port 5001 connected with 192.168.10.125 port 44174 [ 5] 0.0-60.0 sec 1.46 GBytes 209 Mbits/sec [ 4] local 192.168.10.116 port 5001 connected with 192.168.10.125 port 33954 [ 4] 0.0-60.0 sec 2.72 GBytes 390 Mbits/sec Mugunthan V N (2): drivers: net: cpsw: remove disable_irq/enable_irq as irq can be masked from cpsw itself drivers: net: cpsw: add separate napi for tx packet handling for performance improvement drivers/net/ethernet/ti/cpsw.c | 88 +++--- 1 file changed, 49 insertions(+), 39 deletions(-) -- 2.5.0.rc3.2.g6f9504c -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 4/4] net/mlx4_en: Add support for hardware accelerated 802.1ad vlan
From: Hadar Hen Zion had...@mellanox.com To enable device support in accelerated 802.1ad vlan, the port capability packet has vlan enable (phv_en) should be set. Firmware won't work properly, in case phv_en is not set. The user can enable phv_en port capability with the new ethtool private flag phv-bit. The phv-bit private flag default value is OFF, users who are interested in 802.1ad hardware acceleration should turn ON the phv-bit private flag: $ ethtool --set-priv-flags eth1 phv-bit on Once the private flag is set, the device is ready for 802.1ad vlan acceleration. The user should also change the interface device features and turn on tx-vlan-stag-hw-insert which is off by default: $ ethtool -K eth1 tx-vlan-stag-hw-insert on phv-bit private flag setting is available only for Physical Functions(PF), the Virtual Function (VF) will be able to use the feature by setting tx-vlan-stag-hw-insert ethtool device feature only if the feature was enabled by the Hypervisor. Signed-off-by: Hadar Hen Zion had...@mellanox.com Signed-off-by: Amir Vadai am...@mellanox.com --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 16 + drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 46 + drivers/net/ethernet/mellanox/mlx4/en_rx.c | 16 - drivers/net/ethernet/mellanox/mlx4/en_tx.c | 13 --- drivers/net/ethernet/mellanox/mlx4/mlx4_en.h| 1 + include/linux/mlx4/cq.h | 1 + include/linux/mlx4/qp.h | 1 + 7 files changed, 89 insertions(+), 5 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 70f6553..f79d812 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -102,6 +102,7 @@ mlx4_en_get_drvinfo(struct net_device *dev, struct ethtool_drvinfo *drvinfo) static const char mlx4_en_priv_flags[][ETH_GSTRING_LEN] = { blueflame, + phv-bit }; static const char main_strings[][ETH_GSTRING_LEN] = { @@ -1797,9 +1798,13 @@ static int mlx4_en_get_ts_info(struct 
net_device *dev, static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags) { struct mlx4_en_priv *priv = netdev_priv(dev); + struct mlx4_en_dev *mdev = priv-mdev; bool bf_enabled_new = !!(flags MLX4_EN_PRIV_FLAGS_BLUEFLAME); bool bf_enabled_old = !!(priv-pflags MLX4_EN_PRIV_FLAGS_BLUEFLAME); + bool phv_enabled_new = !!(flags MLX4_EN_PRIV_FLAGS_PHV); + bool phv_enabled_old = !!(priv-pflags MLX4_EN_PRIV_FLAGS_PHV); int i; + int ret = 0; if (bf_enabled_new != bf_enabled_old) { if (bf_enabled_new) { @@ -1825,6 +1830,17 @@ static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags) bf_enabled_new ? Enabled : Disabled); } + if (phv_enabled_new != phv_enabled_old) { + ret = set_phv_bit(mdev-dev, priv-port, (int)phv_enabled_new); + if (ret) + return ret; + else if (phv_enabled_new) + priv-pflags |= MLX4_EN_PRIV_FLAGS_PHV; + else + priv-pflags = ~MLX4_EN_PRIV_FLAGS_PHV; + en_info(priv, PHV bit %s\n, + phv_enabled_new ? Enabled : Disabled); + } return 0; } diff --git a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c index e0de2fd..4726122 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_netdev.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_netdev.c @@ -2184,6 +2184,25 @@ static int mlx4_en_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd) } } +static netdev_features_t mlx4_en_fix_features(struct net_device *netdev, + netdev_features_t features) +{ + struct mlx4_en_priv *en_priv = netdev_priv(netdev); + struct mlx4_en_dev *mdev = en_priv-mdev; + + /* Since there is no support for separate RX C-TAG/S-TAG vlan accel +* enable/disable make sure S-TAG flag is always in same state as +* C-TAG. 
+*/ + if (features NETIF_F_HW_VLAN_CTAG_RX + !(mdev-dev-caps.flags2 MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN)) + features |= NETIF_F_HW_VLAN_STAG_RX; + else + features = ~NETIF_F_HW_VLAN_STAG_RX; + + return features; +} + static int mlx4_en_set_features(struct net_device *netdev, netdev_features_t features) { @@ -2218,6 +2237,10 @@ static int mlx4_en_set_features(struct net_device *netdev, en_info(priv, Turn %s TX vlan strip offload\n, (features NETIF_F_HW_VLAN_CTAG_TX) ? ON : OFF); + if (DEV_FEATURE_CHANGED(netdev, features, NETIF_F_HW_VLAN_STAG_TX)) + en_info(priv, Turn %s TX S-VLAN strip offload\n, +
[PATCH net-next 2/4] net/mlx4_en: Prepare ethtool private flags to support more flags
From: Hadar Hen Zion had...@mellanox.com Currently we support only one ethtool private flag. Prepare mlx4_en_set_priv_flags function to support more than one private flag. Will be used in the next patch to support hardware accelerated 802.1ad vlan. Signed-off-by: Hadar Hen Zion had...@mellanox.com Signed-off-by: Amir Vadai am...@mellanox.com --- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 35 - 1 file changed, 17 insertions(+), 18 deletions(-) diff --git a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c index 99ba1c5..70f6553 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_ethtool.c @@ -1801,30 +1801,29 @@ static int mlx4_en_set_priv_flags(struct net_device *dev, u32 flags) bool bf_enabled_old = !!(priv-pflags MLX4_EN_PRIV_FLAGS_BLUEFLAME); int i; - if (bf_enabled_new == bf_enabled_old) - return 0; /* Nothing to do */ + if (bf_enabled_new != bf_enabled_old) { + if (bf_enabled_new) { + bool bf_supported = true; - if (bf_enabled_new) { - bool bf_supported = true; + for (i = 0; i priv-tx_ring_num; i++) + bf_supported = priv-tx_ring[i]-bf_alloced; - for (i = 0; i priv-tx_ring_num; i++) - bf_supported = priv-tx_ring[i]-bf_alloced; + if (!bf_supported) { + en_err(priv, BlueFlame is not supported\n); + return -EINVAL; + } - if (!bf_supported) { - en_err(priv, BlueFlame is not supported\n); - return -EINVAL; + priv-pflags |= MLX4_EN_PRIV_FLAGS_BLUEFLAME; + } else { + priv-pflags = ~MLX4_EN_PRIV_FLAGS_BLUEFLAME; } - priv-pflags |= MLX4_EN_PRIV_FLAGS_BLUEFLAME; - } else { - priv-pflags = ~MLX4_EN_PRIV_FLAGS_BLUEFLAME; - } - - for (i = 0; i priv-tx_ring_num; i++) - priv-tx_ring[i]-bf_enabled = bf_enabled_new; + for (i = 0; i priv-tx_ring_num; i++) + priv-tx_ring[i]-bf_enabled = bf_enabled_new; - en_info(priv, BlueFlame %s\n, - bf_enabled_new ? Enabled : Disabled); + en_info(priv, BlueFlame %s\n, + bf_enabled_new ? 
Enabled : Disabled); + } return 0; } -- 2.4.3.413.ga5fe668 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 0/4] net/mlx4_en: Hardware accelerated 802.1ad
Hi, This patchset by Hadar introduces support for hardware-accelerated 802.1ad on ConnectX-3 Pro NICs. In order to support existing deployments, and due to some hardware limitations, the feature is disabled by default and needs to be enabled using a private flag in ethtool. Of course, the user can enable the private flag only if the hardware supports it. After the flag is enabled, the standard ethtool -k/-K can be used. The patchset was applied and tested over commit 71790a2 (hv_netvsc: Add structs and handlers for VF messages) Amir Hadar Hen Zion (4): net/mlx4_core: Preparations for 802.1ad VLAN support net/mlx4_en: Prepare ethtool private flags to support more flags net/mlx4: Prepare VLAN macros for 802.1ad Hardware accelerated support net/mlx4_en: Add support for hardware accelerated 802.1ad vlan drivers/infiniband/hw/mlx4/cq.c | 2 +- drivers/net/ethernet/mellanox/mlx4/en_ethtool.c | 51 +-- drivers/net/ethernet/mellanox/mlx4/en_netdev.c | 46 ++ drivers/net/ethernet/mellanox/mlx4/en_rx.c | 22 +-- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 13 ++-- drivers/net/ethernet/mellanox/mlx4/fw.c | 82 + drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 15 + drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 + drivers/net/ethernet/mellanox/mlx4/mlx4_en.h| 1 + include/linux/mlx4/cq.h | 3 +- include/linux/mlx4/device.h | 5 ++ include/linux/mlx4/qp.h | 3 +- 13 files changed, 218 insertions(+), 29 deletions(-) -- 2.4.3.413.ga5fe668 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 3/4] net/mlx4: Prepare VLAN macros for 802.1ad Hardware accelerated support
From: Hadar Hen Zion had...@mellanox.com To add Hardware accelerated support in 802.1ad vlan, replace Current VLAN macros to CVLAN. Replace: MLX4_WQE_CTRL_INS_VLAN MLX4_CQE_VLAN_PRESENT_MASK With: MLX4_WQE_CTRL_INS_CVLAN MLX4_CQE_CVLAN_PRESENT_MASK Signed-off-by: Hadar Hen Zion had...@mellanox.com Signed-off-by: Amir Vadai am...@mellanox.com --- drivers/infiniband/hw/mlx4/cq.c| 2 +- drivers/net/ethernet/mellanox/mlx4/en_rx.c | 6 +++--- drivers/net/ethernet/mellanox/mlx4/en_tx.c | 2 +- include/linux/mlx4/cq.h| 2 +- include/linux/mlx4/qp.h| 2 +- 5 files changed, 7 insertions(+), 7 deletions(-) diff --git a/drivers/infiniband/hw/mlx4/cq.c b/drivers/infiniband/hw/mlx4/cq.c index 36eb3d0..180a8f7 100644 --- a/drivers/infiniband/hw/mlx4/cq.c +++ b/drivers/infiniband/hw/mlx4/cq.c @@ -871,7 +871,7 @@ repoll: if (is_eth) { wc-sl = be16_to_cpu(cqe-sl_vid) 13; if (be32_to_cpu(cqe-vlan_my_qpn) - MLX4_CQE_VLAN_PRESENT_MASK) { + MLX4_CQE_CVLAN_PRESENT_MASK) { wc-vlan_id = be16_to_cpu(cqe-sl_vid) MLX4_CQE_VID_MASK; } else { diff --git a/drivers/net/ethernet/mellanox/mlx4/en_rx.c b/drivers/net/ethernet/mellanox/mlx4/en_rx.c index 12c65e1..10f6c2f 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_rx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_rx.c @@ -726,7 +726,7 @@ static int check_csum(struct mlx4_cqe *cqe, struct sk_buff *skb, void *va, hw_checksum = csum_unfold((__force __sum16)cqe-checksum); - if (cqe-vlan_my_qpn cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK) + if (cqe-vlan_my_qpn cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK) !(dev_features NETIF_F_HW_VLAN_CTAG_RX)) { hw_checksum = get_fixed_vlan_csum(hw_checksum, hdr); hdr += sizeof(struct vlan_hdr); @@ -907,7 +907,7 @@ int mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud gro_skb-csum_level = 1; if ((cqe-vlan_my_qpn - cpu_to_be32(MLX4_CQE_VLAN_PRESENT_MASK)) + cpu_to_be32(MLX4_CQE_CVLAN_PRESENT_MASK)) (dev-features NETIF_F_HW_VLAN_CTAG_RX)) { u16 vid = be16_to_cpu(cqe-sl_vid); @@ -970,7 +970,7 @@ int 
mlx4_en_process_rx_cq(struct net_device *dev, struct mlx4_en_cq *cq, int bud PKT_HASH_TYPE_L3); if ((be32_to_cpu(cqe-vlan_my_qpn) - MLX4_CQE_VLAN_PRESENT_MASK) + MLX4_CQE_CVLAN_PRESENT_MASK) (dev-features NETIF_F_HW_VLAN_CTAG_RX)) __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), be16_to_cpu(cqe-sl_vid)); diff --git a/drivers/net/ethernet/mellanox/mlx4/en_tx.c b/drivers/net/ethernet/mellanox/mlx4/en_tx.c index c10d98f..7c858f6 100644 --- a/drivers/net/ethernet/mellanox/mlx4/en_tx.c +++ b/drivers/net/ethernet/mellanox/mlx4/en_tx.c @@ -958,7 +958,7 @@ netdev_tx_t mlx4_en_xmit(struct sk_buff *skb, struct net_device *dev) ring-bf.offset ^= ring-bf.buf_size; } else { tx_desc-ctrl.vlan_tag = cpu_to_be16(vlan_tag); - tx_desc-ctrl.ins_vlan = MLX4_WQE_CTRL_INS_VLAN * + tx_desc-ctrl.ins_vlan = MLX4_WQE_CTRL_INS_CVLAN * !!skb_vlan_tag_present(skb); tx_desc-ctrl.fence_size = real_size; diff --git a/include/linux/mlx4/cq.h b/include/linux/mlx4/cq.h index e7ecc12..899a97b 100644 --- a/include/linux/mlx4/cq.h +++ b/include/linux/mlx4/cq.h @@ -88,7 +88,7 @@ struct mlx4_ts_cqe { enum { MLX4_CQE_L2_TUNNEL_IPOK = 1 31, - MLX4_CQE_VLAN_PRESENT_MASK = 1 29, + MLX4_CQE_CVLAN_PRESENT_MASK = 1 29, MLX4_CQE_L2_TUNNEL = 1 27, MLX4_CQE_L2_TUNNEL_CSUM = 1 26, MLX4_CQE_L2_TUNNEL_IPV4 = 1 25, diff --git a/include/linux/mlx4/qp.h b/include/linux/mlx4/qp.h index 6fed539..6c61900 100644 --- a/include/linux/mlx4/qp.h +++ b/include/linux/mlx4/qp.h @@ -272,7 +272,7 @@ enum { MLX4_WQE_CTRL_SOLICITED = 1 1, MLX4_WQE_CTRL_IP_CSUM = 1 4, MLX4_WQE_CTRL_TCP_UDP_CSUM = 1 5, - MLX4_WQE_CTRL_INS_VLAN = 1 6, + MLX4_WQE_CTRL_INS_CVLAN = 1 6, MLX4_WQE_CTRL_STRONG_ORDER = 1 7, MLX4_WQE_CTRL_FORCE_LOOPBACK= 1 0, }; -- 2.4.3.413.ga5fe668 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 1/4] net/mlx4_core: Preparations for 802.1ad VLAN support
From: Hadar Hen Zion had...@mellanox.com mlx4_core preparation to support hardware accelerated 802.1ad VLAN device. To allow 802.1ad accelerated device, packet has vlan (phv) Firmware capability should be available. Firmware without the phv capability won't behave properly and can't support 802.1ad device acceleration. The driver checks the Firmware capability and sets the phv bit accordingly in SET_PORT command. Signed-off-by: Hadar Hen Zion had...@mellanox.com Signed-off-by: Amir Vadai am...@mellanox.com --- drivers/net/ethernet/mellanox/mlx4/fw.c | 82 +++ drivers/net/ethernet/mellanox/mlx4/fw.h | 1 + drivers/net/ethernet/mellanox/mlx4/main.c | 15 ++ drivers/net/ethernet/mellanox/mlx4/mlx4.h | 3 ++ include/linux/mlx4/device.h | 5 ++ 5 files changed, 106 insertions(+) diff --git a/drivers/net/ethernet/mellanox/mlx4/fw.c b/drivers/net/ethernet/mellanox/mlx4/fw.c index e30bf57..5a1c3d2 100644 --- a/drivers/net/ethernet/mellanox/mlx4/fw.c +++ b/drivers/net/ethernet/mellanox/mlx4/fw.c @@ -154,6 +154,7 @@ static void dump_dev_cap_flags2(struct mlx4_dev *dev, u64 flags) [26] = Port ETS Scheduler support, [27] = Port beacon support, [28] = RX-ALL support, + [29] = 802.1ad offload support, }; int i; @@ -307,6 +308,7 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, #define QUERY_FUNC_CAP_FLAGS0_FORCE_PHY_WQE_GID 0x80 #define QUERY_FUNC_CAP_SUPPORTS_NON_POWER_OF_2_NUM_EQS (1 31) +#define QUERY_FUNC_CAP_PHV_BIT 0x40 if (vhcr-op_modifier == 1) { struct mlx4_active_ports actv_ports = @@ -351,6 +353,12 @@ int mlx4_QUERY_FUNC_CAP_wrapper(struct mlx4_dev *dev, int slave, MLX4_PUT(outbox-buf, dev-caps.phys_port_id[vhcr-in_modifier], QUERY_FUNC_CAP_PHYS_PORT_ID); + if (dev-caps.phv_bit[port]) { + field = QUERY_FUNC_CAP_PHV_BIT; + MLX4_PUT(outbox-buf, field, +QUERY_FUNC_CAP_FLAGS0_OFFSET); + } + } else if (vhcr-op_modifier == 0) { struct mlx4_active_ports actv_ports = mlx4_get_active_ports(dev, slave); @@ -600,6 +608,9 @@ int mlx4_QUERY_FUNC_CAP(struct mlx4_dev 
*dev, u8 gen_or_port, MLX4_GET(func_cap-phys_port_id, outbox, QUERY_FUNC_CAP_PHYS_PORT_ID); + MLX4_GET(field, outbox, QUERY_FUNC_CAP_FLAGS0_OFFSET); + func_cap-flags |= (field QUERY_FUNC_CAP_PHV_BIT); + /* All other resources are allocated by the master, but we still report * 'num' and 'reserved' capabilities as follows: * - num remains the maximum resource index @@ -700,6 +711,7 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) #define QUERY_DEV_CAP_D_MPT_ENTRY_SZ_OFFSET0x92 #define QUERY_DEV_CAP_BMME_FLAGS_OFFSET0x94 #define QUERY_DEV_CAP_CONFIG_DEV_OFFSET0x94 +#define QUERY_DEV_CAP_PHV_EN_OFFSET0x96 #define QUERY_DEV_CAP_RSVD_LKEY_OFFSET 0x98 #define QUERY_DEV_CAP_MAX_ICM_SZ_OFFSET0xa0 #define QUERY_DEV_CAP_ETH_BACKPL_OFFSET0x9c @@ -898,6 +910,12 @@ int mlx4_QUERY_DEV_CAP(struct mlx4_dev *dev, struct mlx4_dev_cap *dev_cap) dev_cap-flags2 |= MLX4_DEV_CAP_FLAG2_CONFIG_DEV; if (field (1 2)) dev_cap-flags2 |= MLX4_DEV_CAP_FLAG2_IGNORE_FCS; + MLX4_GET(field, outbox, QUERY_DEV_CAP_PHV_EN_OFFSET); + if (field 0x80) + dev_cap-flags2 |= MLX4_DEV_CAP_FLAG2_PHV_EN; + if (field 0x40) + dev_cap-flags2 |= MLX4_DEV_CAP_FLAG2_SKIP_OUTER_VLAN; + MLX4_GET(dev_cap-reserved_lkey, outbox, QUERY_DEV_CAP_RSVD_LKEY_OFFSET); MLX4_GET(field32, outbox, QUERY_DEV_CAP_ETH_BACKPL_OFFSET); @@ -1992,6 +2010,10 @@ int mlx4_QUERY_HCA(struct mlx4_dev *dev, MLX4_GET(param-uar_page_sz, outbox, INIT_HCA_UAR_PAGE_SZ_OFFSET); MLX4_GET(param-log_uar_sz, outbox, INIT_HCA_LOG_UAR_SZ_OFFSET); + /* phv_check enable */ + MLX4_GET(byte_field, outbox, INIT_HCA_CACHELINE_SZ_OFFSET); + if (byte_field 0x2) + param-phv_check_en = 1; out: mlx4_free_cmd_mailbox(dev, mailbox); @@ -2758,3 +2780,63 @@ int mlx4_ACCESS_REG_wrapper(struct mlx4_dev *dev, int slave, 0, MLX4_CMD_ACCESS_REG, MLX4_CMD_TIME_CLASS_C, MLX4_CMD_NATIVE); } + +static int mlx4_SET_PORT_phv_bit(struct mlx4_dev *dev, u8 port, u8 phv_bit) +{ +#define SET_PORT_GEN_PHV_VALID 0x10 +#define SET_PORT_GEN_PHV_EN0x80 + + struct 
mlx4_cmd_mailbox *mailbox; + struct mlx4_set_port_general_context *context; + u32 in_mod; + int err; + + mailbox = mlx4_alloc_cmd_mailbox(dev); + if
[PATCH iproute2 net-next] bridge: mdb: add support for router add/del notifications monitoring
From: Nikolay Aleksandrov niko...@cumulusnetworks.com This patch adds support for ADDMDB/DELMDB notifications about router ports which have been added or deleted/expired respectively. Example output: $ bridge -s monitor mdb Deleted router port dev eth3 master br0 router port dev eth3 master br0 Signed-off-by: Nikolay Aleksandrov niko...@cumulusnetworks.com --- bridge/mdb.c | 22 +- 1 file changed, 17 insertions(+), 5 deletions(-) diff --git a/bridge/mdb.c b/bridge/mdb.c index ea169b9c2e4d..dd1f942af53c 100644 --- a/bridge/mdb.c +++ b/bridge/mdb.c @@ -84,7 +84,7 @@ int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) FILE *fp = arg; struct br_port_msg *r = NLMSG_DATA(n); int len = n-nlmsg_len; - struct rtattr * tb[MDBA_MAX+1]; + struct rtattr *tb[MDBA_MAX+1], *i; if (n-nlmsg_type != RTM_GETMDB n-nlmsg_type != RTM_NEWMDB n-nlmsg_type != RTM_DELMDB) { fprintf(stderr, Not RTM_GETMDB, RTM_NEWMDB or RTM_DELMDB: %08x %08x %08x\n, @@ -105,7 +105,6 @@ int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) parse_rtattr(tb, MDBA_MAX, MDBA_RTA(r), n-nlmsg_len - NLMSG_LENGTH(sizeof(*r))); if (tb[MDBA_MDB]) { - struct rtattr *i; int rem = RTA_PAYLOAD(tb[MDBA_MDB]); for (i = RTA_DATA(tb[MDBA_MDB]); RTA_OK(i, rem); i = RTA_NEXT(i, rem)) @@ -113,9 +112,22 @@ int print_mdb(const struct sockaddr_nl *who, struct nlmsghdr *n, void *arg) } if (tb[MDBA_ROUTER]) { - if (show_details) { - fprintf(fp, router ports on %s: , ll_index_to_name(r-ifindex)); - br_print_router_ports(fp, tb[MDBA_ROUTER]); + if (n-nlmsg_type == RTM_GETMDB) { + if (show_details) { + fprintf(fp, router ports on %s: , + ll_index_to_name(r-ifindex)); + br_print_router_ports(fp, tb[MDBA_ROUTER]); + } + } else { + uint32_t *port_ifindex; + + i = RTA_DATA(tb[MDBA_ROUTER]); + port_ifindex = RTA_DATA(i); + if (n-nlmsg_type == RTM_DELMDB) + fprintf(fp, Deleted ); + fprintf(fp, router port dev %s master %s\n, + ll_index_to_name(*port_ifindex), + ll_index_to_name(r-ifindex)); } } -- 
2.4.3 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Fwd: Need help about AR8327 Ethernet Driver
Hi All, I am trying to sniff Ethernet data using an AR8327 chipset from another Ethernet device, a KSZ8895, which gives the multiplexed TX and RX monitoring data of another device. If I use a device like the DUB-E100 (D-Link), I am able to capture data from the KSZ8895 (multiplexed TX and RX Ethernet data). But if I try to sniff Ethernet data using the AR8327 chipset (router: TP-Link AC 1750), I am not able to capture data, although the (switch/port) port registers show correct values. I am using tcpdump to check data on a specific interface. When I use the DUB-E100 (D-Link) or a passive Ethernet sniffer (either RX or TX packets), I am able to see the whole data, but not with the AR8327 chipset. My environment is given below: Router: TP-Link AC 1750 Ethernet data coming from KSZ8895 (multiplexed TX and RX of another device) OpenWRT environment Can somebody point out where I should look to solve my issue? Thanks and Regards, S Prasad -- S Prasad Kandregula -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Several races in usbnet module (kernel 4.1.x)
27.07.2015 13:00, Oliver Neukum пишет: On Fri, 2015-07-24 at 17:41 +0300, Eugene Shatokhin wrote: 23.07.2015 12:15, Oliver Neukum пишет: From what I see now in Documentation/atomic_ops.txt, stores to the properly aligned memory locations are in fact atomic. They are, but again only with respect to each other. You are right. Architectures like sparc, and maybe others, indeed use spinlocks to implement atomic operations, including bit manipulation. Well then, I can only think about clearing each flag individually (with clear_bit()) instead of using dev-flags = 0. Something like this: - diff --git a/drivers/net/usb/usbnet.c b/drivers/net/usb/usbnet.c index 3c86b10..826eefe 100644 --- a/drivers/net/usb/usbnet.c +++ b/drivers/net/usb/usbnet.c @@ -779,6 +790,7 @@ int usbnet_stop (struct net_device *net) struct usbnet *dev = netdev_priv(net); struct driver_info *info = dev-driver_info; int retval, pm; + int e; clear_bit(EVENT_DEV_OPEN, dev-flags); netif_stop_queue (net); @@ -813,7 +825,8 @@ int usbnet_stop (struct net_device *net) * can't flush_scheduled_work() until we drop rtnl (later), * else workers could deadlock; so make workers a NOP. */ - dev-flags = 0; + for (e = 0; e EVENT_NUM_EVENTS; ++e) + clear_bit(e, dev-flags) del_timer_sync (dev-delay); tasklet_kill (dev-bh); if (!pm) diff --git a/include/linux/usb/usbnet.h b/include/linux/usb/usbnet.h index 6e0ce8c..7ad62da 100644 --- a/include/linux/usb/usbnet.h +++ b/include/linux/usb/usbnet.h @@ -79,6 +79,7 @@ struct usbnet { # define EVENT_RX_KILL 10 # define EVENT_LINK_CHANGE 11 # define EVENT_SET_RX_MODE 12 +# define EVENT_NUM_EVENTS 13 /* Or may be keep all these in an enum? */ }; static inline struct usb_driver *driver_of(struct usb_interface *intf) --- clear_bit() is atomic w.r.t. itself and other bit ops. So, I think, the situation you described above cannot happen for dev-flags, which is good. No need to address that in the patch. The race might be harmless after all. 
If I understand the code correctly now, dev-flags is set to 0 in usbnet_stop() so that the worker function (usbnet_deferred_kevent) would Yes, particularly not reschedule itself. do nothing, should it start later. If so, how about adding memory barriers for all CPUs to see dev-flags is 0 before other things? Taking a lock, as del_timer_sync() does, implies a memory barrier, as does a work. If so, then, yes, additional barriers are not needed. Regards, Eugene -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Need help about AR8327 Ethernet Driver
Hi All, I am trying to sniff Ethernet data using an AR8327 chipset from another Ethernet device, a KSZ8895, which gives the multiplexed TX and RX monitoring data of another device. If I use a device like the DUB-E100 (D-Link), I am able to capture data from the KSZ8895 (multiplexed TX and RX Ethernet data). But if I try to sniff Ethernet data using the AR8327 chipset (router: TP-Link AC 1750), I am not able to capture data, although the (switch/port) port registers show correct values. I am using tcpdump to check data on a specific interface. When I use the DUB-E100 (D-Link) or a passive Ethernet sniffer (either RX or TX packets), I am able to see the whole data, but not with the AR8327 chipset. My environment is given below: Router: TP-Link AC 1750 Ethernet data coming from KSZ8895 (multiplexed TX and RX of another device) OpenWRT environment Can somebody point out where I should look to solve my issue? Thanks and Regards, S Prasad -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Buggy cable detection on i.MX51, fec driver and LAN8700 PHY
Dear all, very often we observe an issue with Ethernet cable detection during cable unplugging and plugging. We use Voipac i.MX51 SOMs (System On Modules). They are based on the Freescale i.MX51 CPU with a LAN8700 PHY in MII mode. The schematic of the PHY connection is very similar to the Freescale i.MX51 Babbage board. The Ethernet interface eth0 is configured statically for simplicity, but the same issue exists with DHCP configuration. I did a lot of tests to determine the stability of Ethernet cable detection by the fec Ethernet driver. In normal operation, if I unplug the Ethernet cable, the fec driver prints "fec 83fec000.ethernet eth0: Link is Down" and the green LED (Ethernet medium detected) is OFF. If I plug the cable back in, the fec driver prints "fec 83fec000.ethernet eth0: Link is Up - 100Mbps/Full - flow control off" and the green LED is ON. But sometimes, after plugging the cable in, the fec driver does not print anything on the console and the green LED does not show detection of the Ethernet cable. The issue appears at random intervals: sometimes after the second unplug/plug cycle, sometimes only after 10-20 cycles. The issue was tested and exists on kernels from linux-3.8.5 up to the current linux-4.2-rc4-cbfe8fa6cd672011c755c3cd85c9ffd4e2d10a6f. The same tests were made with different versions of the Barebox bootloader, and there cable detection works flawlessly. Please help to resolve this issue with the Linux drivers. Best wishes. -- Igor Plyatov -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
netns refcnt leak for kernel accept sock
I'm running into a netns refcnt issue, and I suspect that eeb1bd5c has something to do with it (perhaps we need an additional change in sk_clone_lock() after eeb1bd5c). Here's the problem: When we create a syn_recv sock based on a kernel listen sock, we take a get_net() ref with a stack similar to the one shown below. Note that the parent (kernel, listen) sock itself has not taken a get_net() ref, because it explicitly calls sock_create_kern(). get_net /* for the newsk */ sk_clone_lock inet_csk_clone_lock tcp_create_openreq_child tcp_v4_syn_recv_sock tcp_check_req tcp_v4_do_rcv tcp_v4_rcv : But it's not clear to me where this refcnt will be released: in my case, I expect to create/cleanup kernel sockets as part of ->init/->exit for my module, but because the accept socket has a netns refcnt, it blocks cleanup_net(), thus my ->exit pernet_subsys op cannot run and clean this up, and we have a leak. I think that sk_clone_lock() should only do a get_net() if the parent is not a kernel socket (making this similar to sk_alloc()), i.e., diff --git a/net/core/sock.c b/net/core/sock.c index 08f16db..371d1b7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gf sock_copy(newsk, sk); /* SANITY */ - get_net(sock_net(newsk)); + if (likely(newsk-sk_net_refcnt)) + get_net(sock_net(newsk)); sk_node_init(newsk-sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); Does this sound right? --Sowmini -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: linux-next: Tree for Jul 27 (net/mpls/af_mpls.c)
On 07/26/15 23:02, Stephen Rothwell wrote: Hi all, Changes since 20150724: on i386 or x86_64: when CONFIG_IPV6 is not enabled: net/built-in.o: In function `find_outdev': af_mpls.c:(.text+0x1e8ddd): undefined reference to `ip6_route_output' af_mpls.c:(.text+0x1e8e90): undefined reference to `ip_route_output_flow' -- ~Randy -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [PATCH net-next 1/1] Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver
Thanks for your review. I'll repost the updated patch. -Original Message- From: David Miller [mailto:da...@davemloft.net] Sent: Sunday, July 26, 2015 7:42 PM To: Woojung Huh - C21699 Cc: netdev@vger.kernel.org Subject: Re: [PATCH net-next 1/1] Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver From: woojung@microchip.com Date: Wed, 22 Jul 2015 19:01:44 + - remove module param which can be configurable by standard mechanism. You still left some unacceptable module parameters in here. The only one which is fine is the debug level setting, that's it. There is no way I'm applying a patch that allows programming registers of the chip directly via module parameters, no way. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: netns refcnt leak for kernel accept sock
On (07/27/15 12:40), ebied...@xmission.com wrote: sock_create_kern and friends are specialied interfaces for special purposes. At a quick read through I don't think we have a single in tree user doing with them what you are trying to do. That doesnt change the fact that the architecture is questionable. and my description should make it quite clear why this is so. Without seeing code using the interfaces in the way are trying to use them I do not have enough information to comment intelligently. Ok, here you go. I'm still testing it, but there's enough there for you to see the bug quite clearly. Enjoy. I think my other mail had better information to comment intelligently but ymmv. --Sowmini diff --git a/net/core/sock.c b/net/core/sock.c index 08f16db..371d1b7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gfp_t priority) sock_copy(newsk, sk); /* SANITY */ - get_net(sock_net(newsk)); + if (likely(newsk-sk_net_refcnt)) + get_net(sock_net(newsk)); sk_node_init(newsk-sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); diff --git a/net/rds/bind.c b/net/rds/bind.c index 4ebd29c..dd666fb 100644 --- a/net/rds/bind.c +++ b/net/rds/bind.c @@ -185,7 +185,8 @@ int rds_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) ret = 0; goto out; } - trans = rds_trans_get_preferred(sin-sin_addr.s_addr); + trans = rds_trans_get_preferred(sock_net(sock-sk), + sin-sin_addr.s_addr); if (!trans) { ret = -EADDRNOTAVAIL; rds_remove_bound(rs); diff --git a/net/rds/connection.c b/net/rds/connection.c index da6da57..273fa6c 100644 --- a/net/rds/connection.c +++ b/net/rds/connection.c @@ -117,7 +117,8 @@ static void rds_conn_reset(struct rds_connection *conn) * For now they are not garbage collected once they're created. They * are torn down as the module is removed, if ever. 
*/ -static struct rds_connection *__rds_conn_create(__be32 laddr, __be32 faddr, +static struct rds_connection *__rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp, int is_outgoing) { @@ -157,7 +158,7 @@ new_conn: conn-c_faddr = faddr; spin_lock_init(conn-c_lock); conn-c_next_tx_seq = 1; - + write_pnet(conn-c_net, net); init_waitqueue_head(conn-c_waitq); INIT_LIST_HEAD(conn-c_send_queue); INIT_LIST_HEAD(conn-c_retrans); @@ -174,7 +175,7 @@ new_conn: * can bind to the destination address then we'd rather the messages * flow through loopback rather than either transport. */ - loop_trans = rds_trans_get_preferred(faddr); + loop_trans = rds_trans_get_preferred(net, faddr); if (loop_trans) { rds_trans_put(loop_trans); conn-c_loopback = 1; @@ -260,17 +261,19 @@ out: return conn; } -struct rds_connection *rds_conn_create(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, 0); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 0); } EXPORT_SYMBOL_GPL(rds_conn_create); -struct rds_connection *rds_conn_create_outgoing(__be32 laddr, __be32 faddr, +struct rds_connection *rds_conn_create_outgoing(struct net *net, + __be32 laddr, __be32 faddr, struct rds_transport *trans, gfp_t gfp) { - return __rds_conn_create(laddr, faddr, trans, gfp, 1); + return __rds_conn_create(net, laddr, faddr, trans, gfp, 1); } EXPORT_SYMBOL_GPL(rds_conn_create_outgoing); diff --git a/net/rds/ib.c b/net/rds/ib.c index ba2dffe..1381422 100644 --- a/net/rds/ib.c +++ b/net/rds/ib.c @@ -317,7 +317,7 @@ static void rds_ib_ic_info(struct socket *sock, unsigned int len, * allowed to influence which paths have priority. We could call userspace * asserting this policy routing. 
*/ -static int rds_ib_laddr_check(__be32 addr) +static int rds_ib_laddr_check(struct net *net, __be32 addr) { int ret; struct rdma_cm_id *cm_id; diff --git a/net/rds/ib_cm.c b/net/rds/ib_cm.c index 0da2a45..c38d8a0 100644 --- a/net/rds/ib_cm.c +++ b/net/rds/ib_cm.c @@ -448,8 +448,8 @@ int rds_ib_cm_handle_connect(struct rdma_cm_id *cm_id, (unsigned long long)be64_to_cpu(lguid), (unsigned long long)be64_to_cpu(fguid)); - conn =
increase in time to delete an interface with 4.x kernels
Hi Alex: I believe you did the recent overhaul to the fib implementation. I am seeing dramatically higher times to delete an interface with an ipv4 address in 4.2-rc3. perf-top points to update_suffix: PerfTop: 15834 irqs/sec kernel:97.3% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs) --- 74.69% [kernel] [k] update_suffix 2.38% [kernel] [k] fib_table_flush 2.20% [kernel] [k] fib6_walk_continue 2.03% [kernel] [k] fib6_ifdown 1.31% [kernel] [k] fib6_age I have a simple script to create and assign an ipv4 address to 10k dummy interfaces: l=0 for (( j = 1; j = 40; j += 1)) do for (( k = 1 ; k = 250 ; k += 1 )) do l=$((l + 1)) ip link add dev dummy${l} type dummy ip addr add 72.$j.$k.1/24 dev dummy${l} ifconfig dummy${l} up done done and a counter script to delete them all: k=$(ip link show | grep dummy | wc -l) for (( j = 1; j = k; j += 1)) do ip link del dev dummy${j} done Looking at v3.19: # time ./tadd-dummy.sh real3m8.896s user0m7.104s sys 0m22.020s # time ./tdel-dummy.sh real7m18.207s user0m3.824s sys 3m15.672s And the time to delete 1 interface after all 10k have been created: # time ip link del dev dummy real0m0.064s user0m0.000s sys 0m0.020s Contrast those times with 4.2.0-rc3+ running the exact same scripts # time ./tadd-dummy.sh real2m51.044s user0m7.220s sys 0m29.520s # time ip link del dev dummy real0m0.441s user0m0.000s sys 0m0.416s so here the time to delete 1 interface has gone up by more than 10x. # time ./tdel-dummy.sh ^C real14m10.000s user0m0.528s sys 13m14.728s I killed the delete; after 14 minutes only ~2k+ interfaces had been deleted: # ip link show | grep dummy | wc -l 7822 In 4.2.0-rc3 it seems to take about 60 seconds to delete 150 interfaces which is inline with the 1 interface time of 0.4 seconds. David -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: netns refcnt leak for kernel accept sock
sock_create_kern and friends are specialized interfaces for special purposes. At a quick read through I don't think we have a single in tree user doing with them what you are trying to do. Without seeing code using the interfaces in the way you are trying to use them I do not have enough information to comment intelligently. Eric Sowmini Varadhan sowmini.varad...@oracle.com writes: I'm running into a netns refcnt issue, and I suspect that eeb1bd5c has something to do with it (perhaps we need an additional change in sk_clone_lock() after eeb1bd5c). Here's the problem: When we create an syn_recv sock based on a kernel listen sock, we take a get_net() ref with a stack similar to the one shown below. Note that the parent (kernel, listen) sock itself has not taken a get_net() ref, because it explicitly calls sock_create_kern(). get_net /* for the newsk */ sk_clone_lock inet_csk_clone_lock tcp_create_openreq_child tcp_v4_syn_recv_sock tcp_check_req tcp_v4_do_rcv tcp_v4_rcv : But it's not clear to me where this refcnt will be released: in my case, I expect to create/cleanup kernel sockets as part of ->init/->exit for my module, but because the accept socket has a netns refcnt, it blocks cleanup_net(), thus my ->exit pernet_subsys op cannot run and clean this up, and we have a leak. I think that sk_clone_lock() should only do a get_net() if the parent is not a kernel socket (making this similar to sk_alloc()), i.e., diff --git a/net/core/sock.c b/net/core/sock.c index 08f16db..371d1b7 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -1497,7 +1497,8 @@ struct sock *sk_clone_lock(const struct sock *sk, const gf sock_copy(newsk, sk); /* SANITY */ - get_net(sock_net(newsk)); + if (likely(newsk->sk_net_refcnt)) + get_net(sock_net(newsk)); sk_node_init(&newsk->sk_node); sock_lock_init(newsk); bh_lock_sock(newsk); Does this sound right?
--Sowmini -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: netns refcnt leak for kernel accept sock
On (07/27/15 11:13), Cong Wang wrote: That refcnt should be released in sock destructor too, when the tcp connection is terminated. yes, but in my case, the listen socket is opened as part of the ->init indirection in pernet_operations (thus it is a kernel socket) and the expectation is that this listen socket, and any accept sockets derived from it, will be closed in ->exit. But if the accept socket is treated as a uspace socket (thus holds a get_net()) then it will block cleanup_net() and the associated ->exit cleanup operations. This is probably not a problem for other systems like vxlan/gue/geneve etc because they all use udp sockets, thus don't have the accept equivalent. But fundamentally, it's wrong for a kspace listen socket to result in a uspace accept socket. Given the fact that sk_destruct() checks for sk_net_refcnt, your patch makes sense to me. But I am not sure how a TCP kernel socket is supposed to use. Thanks for the confirmation - I think RDS is a bit of a maverick here in that it uses tcp sockets unlike vxlan etc. For those curious about RDS-TCP, I've actually updated the documentation at https://oss.oracle.com/projects/rds/dist/documentation/rds-3.1-spec.html recently. I hope that helps. --Sowmini -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: increase in time to delete an interface with 4.x kernels
On 07/27/2015 09:49 AM, David Ahern wrote: Hi Alex: I believe you did the recent overhaul to the fib implementation. I am seeing dramatically higher times to delete an interface with an ipv4 address in 4.2-rc3. perf-top points to update_suffix: PerfTop: 15834 irqs/sec kernel:97.3% exact: 0.0% [4000Hz cpu-clock], (all, 4 CPUs) --- 74.69% [kernel] [k] update_suffix 2.38% [kernel] [k] fib_table_flush 2.20% [kernel] [k] fib6_walk_continue 2.03% [kernel] [k] fib6_ifdown 1.31% [kernel] [k] fib6_age I have a simple script to create and assign an ipv4 address to 10k dummy interfaces: l=0 for (( j = 1; j <= 40; j += 1)) do for (( k = 1 ; k <= 250 ; k += 1 )) do l=$((l + 1)) ip link add dev dummy${l} type dummy ip addr add 72.$j.$k.1/24 dev dummy${l} ifconfig dummy${l} up done done and a counter script to delete them all: k=$(ip link show | grep dummy | wc -l) for (( j = 1; j <= k; j += 1)) do ip link del dev dummy${j} done Okay so looking over what this script does it looks like it really exposes the worst case scenario for update_suffix. You have a monstrous tnode that is 15 bits in size. That is roughly 32K entries, and unfortunately the suffix is 8 bits long with a position of 7. The result is that for every removal the code is scanning 16K entries in order to relevel things after an entry is removed. Let me try a couple of quick things and I should have a patch for you in the next couple of hours. Thanks. - Alex -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: netns refcnt leak for kernel accept sock
On Mon, Jul 27, 2015 at 7:21 AM, Sowmini Varadhan sowmini.varad...@oracle.com wrote: I'm running into a netns refcnt issue, and I suspect that eeb1bd5c has something to do with it (perhaps we need an additional change in sk_clone_lock() after eeb1bd5c). Here's the problem: When we create an syn_recv sock based on a kernel listen sock, we take a get_net() ref with a stack similar to the one shown below. Note that the parent (kernel, listen) sock itself has not taken a get_net() ref, because it explicitly calls sock_create_kern(). get_net /* for the newsk */ sk_clone_lock inet_csk_clone_lock tcp_create_openreq_child tcp_v4_syn_recv_sock tcp_check_req tcp_v4_do_rcv tcp_v4_rcv : But it's not clear to me where this refcnt will be released: in my case, I expect to create/cleanup kernel sockets as part of -init/-exit for my module, but because the accept socket has a netns refcnt, it blocks cleanup_net(), thus my -exit pernet_subsys op cannot run and clean this up, and we have a leak. That refcnt should be released in sock destructor too, when the tcp connection is terminated. I think that sk_clone_lock() should only do a get_net() if the parent is not a kernel socket (making this similar to sk_alloc()), i.e., Given the fact that sk_destruct() checks for sk_net_refcnt, your patch makes sense to me. But I am not sure how a TCP kernel socket is supposed to use. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 06/16] net: Tx via VRF device
If out device is enslaved to a VRF device we want packets to go through the VRF master device first. This allows for example iptables rules and tc rules to be configured on the VRF as a whole as well as the option for rules on specific netdevices. This is accomplished by updating the dev in the dst to point to the VRF device if it is enslaved. Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com --- net/ipv4/route.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 8119896e1159..050a3c1d89ba 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1903,6 +1903,23 @@ int ip_route_input_noref(struct sk_buff *skb, __be32 daddr, __be32 saddr, } EXPORT_SYMBOL(ip_route_input_noref); +/* if out device is enslaved to a VRF device update dst to + * send through it + */ +static void rt_use_vrf_dev(struct rtable *rth, struct net_device *dev_out) +{ +#if IS_ENABLED(CONFIG_NET_VRF) + int ifindex = vrf_master_dev_ifindex(dev_out); + struct net_device *mdev; + + mdev = dev_get_by_index(dev_net(dev_out), ifindex); + if (mdev) { + dev_put(rth-dst.dev); + rth-dst.dev = mdev; + } +#endif +} + /* called with rcu_read_lock() */ static struct rtable *__mkroute_output(const struct fib_result *res, const struct flowi4 *fl4, int orig_oif, @@ -2008,6 +2025,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, } rt_set_nexthop(rth, fl4-daddr, res, fnhe, fi, type, 0); + rt_use_vrf_dev(rth, dev_out); return rth; } -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 14/16] net: Add sk_bind_dev_if to task_struct
Allow tasks to have a default device index for binding sockets. If set the value is passed to all AF_INET/AF_INET6 sockets when they are created. The task setting is passed parent to child on fork, but can be set or changed after task creation using prctl (if task has CAP_NET_ADMIN permissions). The setting for a socket can be retrieved using prctl(). This option allows an administrator to restrict a task to only send/receive packets through the specified device. In the case of VRF devices this option restricts tasks to a specific VRF. Correlation of the device index to a specific VRF, ie., ifindex -- VRF device -- VRF id is left to userspace. Example using VRF devices: 1. vrf1 is created and assigned to table 5 2. eth2 is enslaved to vrf1 3. eth2 is given the address 1.1.1.1/24 $ ip route ls table 5 prohibit default 1.1.1.0/24 dev eth2 scope link local 1.1.1.1 dev eth2 proto kernel scope host src 1.1.1.1 With out setting a VRF context ping, tcp and udp attempts fail. e.g, $ ping 1.1.1.254 connect: Network is unreachable After binding the task to the vrf device ping succeeds: $ ./chvrf -v 1 ping -c1 1.1.1.254 PING 1.1.1.254 (1.1.1.254) 56(84) bytes of data. 
64 bytes from 1.1.1.254: icmp_seq=1 ttl=64 time=2.32 ms Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/linux/sched.h | 3 +++ include/uapi/linux/prctl.h | 4 kernel/fork.c | 2 ++ kernel/sys.c | 35 +++ net/ipv4/af_inet.c | 1 + net/ipv4/route.c | 4 +++- net/ipv6/af_inet6.c| 1 + net/ipv6/route.c | 2 +- 8 files changed, 50 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 04b5ada460b4..29b336b8a466 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1528,6 +1528,9 @@ struct task_struct { struct files_struct *files; /* namespaces */ struct nsproxy *nsproxy; +/* network */ + /* if set INET/INET6 sockets are bound to given dev index on create */ + int sk_bind_dev_if; /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 31891d9535e2..1ef45195d146 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -190,4 +190,8 @@ struct prctl_mm_map { # define PR_FP_MODE_FR (1 0)/* 64b FP registers */ # define PR_FP_MODE_FRE(1 1)/* 32b compatibility */ +/* get/set network interface sockets are bound to by default */ +#define PR_SET_SK_BIND_DEV_IF 47 +#define PR_GET_SK_BIND_DEV_IF 48 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index dbd9b8d7b7cc..8b396e77d2bf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -380,6 +380,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk-splice_pipe = NULL; tsk-task_frag.page = NULL; + tsk-sk_bind_dev_if = orig-sk_bind_dev_if; + account_kernel_stack(ti, 1); return tsk; diff --git a/kernel/sys.c b/kernel/sys.c index 259fda25eb6b..59119ac0a0bd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -52,6 +52,7 @@ #include linux/rcupdate.h #include linux/uidgid.h #include linux/cred.h +#include linux/netdevice.h #include linux/kmsg_dump.h /* Move somewhere else to avoid recompiling? 
*/ @@ -2267,6 +2268,40 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; +#ifdef CONFIG_NET + case PR_SET_SK_BIND_DEV_IF: + { + struct net_device *dev; + int idx = (int) arg2; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (idx) { + dev = dev_get_by_index(me-nsproxy-net_ns, idx); + if (!dev) + return -EINVAL; + dev_put(dev); + } + me-sk_bind_dev_if = idx; + break; + } + case PR_GET_SK_BIND_DEV_IF: + { + struct task_struct *tsk; + int sk_bind_dev_if = -EINVAL; + + rcu_read_lock(); + tsk = find_task_by_vpid(arg2); + if (tsk) + sk_bind_dev_if = tsk-sk_bind_dev_if; + rcu_read_unlock(); + if (tsk != me !capable(CAP_NET_ADMIN)) + return -EPERM; + error = sk_bind_dev_if; + break; + } +#endif default: error = -EINVAL; break; diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index 09c7c1ee307e..0651efa18d39 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -352,6 +352,7 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
[PATCH net-next 13/16] net: Introduce VRF device driver - v2
This driver borrows heavily from IPvlan and teaming drivers. Routing domains (VRF-lite) are created by instantiating a VRF master device with an associated table and enslaving all routed interfaces that participate in the domain. As part of the enslavement, all connected routes for the enslaved devices are moved to the table associated with the VRF device. Outgoing sockets must bind to the VRF device to function. Standard FIB rules bind the VRF device to tables and regular fib rule processing is followed. Routed traffic through the box, is forwarded by using the VRF device as the IIF and following the IIF rule to a table that is mated with the VRF. Example: Create vrf 1: ip link add vrf1 type vrf table 5 ip rule add iif vrf1 table 5 ip rule add oif vrf1 table 5 ip route add table 5 prohibit default ip link set vrf1 up Add interface to vrf 1: ip link set eth1 master vrf1 Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com v2: - addressed comments from first RFC - significant changes to improve simplicity of implementation --- drivers/net/Kconfig | 7 + drivers/net/Makefile | 1 + drivers/net/vrf.c| 596 +++ 3 files changed, 604 insertions(+) create mode 100644 drivers/net/vrf.c diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index c18f9e62a9fa..e58468b02987 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -297,6 +297,13 @@ config NLMON diagnostics, etc. This is mostly intended for developers or support to debug netlink issues. If unsure, say N. +config NET_VRF + tristate Virtual Routing and Forwarding (Lite) + depends on IP_MULTIPLE_TABLES IPV6_MULTIPLE_TABLES + ---help--- + This option enables the support for mapping interfaces into VRF's. The + support enables VRF devices. 
+ endif # NET_CORE config SUNGEM_PHY diff --git a/drivers/net/Makefile b/drivers/net/Makefile index c12cb22478a7..ca16dd689b36 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_VIRTIO_NET) += virtio_net.o obj-$(CONFIG_VXLAN) += vxlan.o obj-$(CONFIG_GENEVE) += geneve.o obj-$(CONFIG_NLMON) += nlmon.o +obj-$(CONFIG_NET_VRF) += vrf.o # # Networking Drivers diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c new file mode 100644 index ..8669b0f9d749 --- /dev/null +++ b/drivers/net/vrf.c @@ -0,0 +1,596 @@ +/* + * vrf.c: device driver to encapsulate a VRF space + * + * Copyright (c) 2015 Cumulus Networks + * + * Based on dummy, team and ipvlan drivers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include linux/module.h +#include linux/kernel.h +#include linux/netdevice.h +#include linux/etherdevice.h +#include linux/ip.h +#include linux/init.h +#include linux/moduleparam.h +#include linux/rtnetlink.h +#include net/rtnetlink.h +#include linux/u64_stats_sync.h +#include linux/hashtable.h + +#include linux/inetdevice.h +#include net/ip.h +#include net/ip_fib.h +#include net/ip6_route.h +#include net/rtnetlink.h +#include net/route.h +#include net/addrconf.h +#include net/vrf.h + +#define DRV_NAME vrf +#define DRV_VERSION1.0 + +#define vrf_is_slave(dev) ((dev)-flags IFF_SLAVE) +#define vrf_is_master(dev) ((dev)-flags IFF_MASTER) + +#define vrf_master_get_rcu(dev) \ + ((struct net_device *)rcu_dereference(dev-rx_handler_data)) + +struct pcpu_dstats { + u64 tx_pkts; + u64 tx_bytes; + u64 tx_drps; + u64 rx_pkts; + u64 rx_bytes; + struct u64_stats_sync syncp; +}; + +struct slave { + struct list_headlist; + struct net_device *dev; +}; + +struct slave_queue { + spinlock_t lock; /* lock for slave insert/delete */ + struct 
list_headall_slaves; + int num_slaves; +}; + +struct net_vrf { + struct slave_queue queue; + struct fib_table*tb; + u32 tb_id; +}; + +static bool is_ip_rx_frame(struct sk_buff *skb) +{ + switch (skb-protocol) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + return true; + } + return false; +} + +/* note: already called with rcu_read_lock */ +static rx_handler_result_t vrf_handle_frame(struct sk_buff **pskb) +{ + struct sk_buff *skb = *pskb; + + if (is_ip_rx_frame(skb)) { + struct net_device *dev = vrf_master_get_rcu(skb-dev); + struct pcpu_dstats *dstats = this_cpu_ptr(dev-dstats); + + u64_stats_update_begin(dstats-syncp); +
[PATCH] iproute2: Add support for VRF device
Allow user to create a vrf device and specify its table binding. Based on the iplink_vlan implementation. Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/linux/if_link.h | 8 + ip/Makefile | 2 +- ip/iplink.c | 2 +- ip/iplink_vrf.c | 87 + 4 files changed, 97 insertions(+), 2 deletions(-) create mode 100644 ip/iplink_vrf.c diff --git a/include/linux/if_link.h b/include/linux/if_link.h index 8df6a8466839..28872fbf6814 100644 --- a/include/linux/if_link.h +++ b/include/linux/if_link.h @@ -337,6 +337,14 @@ enum macvlan_macaddr_mode { #define MACVLAN_FLAG_NOPROMISC 1 +/* VRF section */ +enum { + IFLA_VRF_UNSPEC, + IFLA_VRF_TABLE, + __IFLA_VRF_MAX +}; + +#define IFLA_VRF_MAX (__IFLA_VRF_MAX - 1) /* IPVLAN section */ enum { IFLA_IPVLAN_UNSPEC, diff --git a/ip/Makefile b/ip/Makefile index 77653ecc5785..d8b38ac2e44b 100644 --- a/ip/Makefile +++ b/ip/Makefile @@ -7,7 +7,7 @@ IPOBJ=ip.o ipaddress.o ipaddrlabel.o iproute.o iprule.o ipnetns.o \ iplink_vxlan.o tcp_metrics.o iplink_ipoib.o ipnetconf.o link_ip6tnl.o \ link_iptnl.o link_gre6.o iplink_bond.o iplink_bond_slave.o iplink_hsr.o \ iplink_bridge.o iplink_bridge_slave.o ipfou.o iplink_ipvlan.o \ -iplink_geneve.o +iplink_geneve.o iplink_vrf.o RTMONOBJ=rtmon.o diff --git a/ip/iplink.c b/ip/iplink.c index e296e6f611b8..892e8bc8808b 100644 --- a/ip/iplink.c +++ b/ip/iplink.c @@ -93,7 +93,7 @@ void iplink_usage(void) fprintf(stderr, TYPE := { vlan | veth | vcan | dummy | ifb | macvlan | macvtap |\n); fprintf(stderr, bridge | bond | ipoib | ip6tnl | ipip | sit | vxlan |\n); fprintf(stderr, gre | gretap | ip6gre | ip6gretap | vti | nlmon |\n); - fprintf(stderr, bond_slave | ipvlan | geneve }\n); + fprintf(stderr, bond_slave | ipvlan | geneve | vrf }\n); } exit(-1); } diff --git a/ip/iplink_vrf.c b/ip/iplink_vrf.c new file mode 100644 index ..bfcb3cdeaf35 --- /dev/null +++ b/ip/iplink_vrf.c @@ -0,0 +1,87 @@ +/* iplink_vrf.cVRF device support + * + * This 
program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + * + * Authors: Shrijeet Mukherjee s...@cumulusnetworks.com + */ + +#include stdio.h +#include stdlib.h +#include string.h +#include sys/socket.h +#include linux/if_link.h + +#include rt_names.h +#include utils.h +#include ip_common.h + +static void vrf_explain(FILE *f) +{ + fprintf(f, Usage: ... vrf table TABLEID \n); +} + +static void explain(void) +{ + vrf_explain(stderr); +} + +static int table_arg(void) +{ + fprintf(stderr,Error: argument of \table\ must be 0-32767 and currently unused\n); + return -1; +} + +static int vrf_parse_opt(struct link_util *lu, int argc, char **argv, + struct nlmsghdr *n) +{ + while (argc 0) { + if (matches(*argv, table) == 0) { + __u32 table = 0; + NEXT_ARG(); + + table = atoi(*argv); + if (table 0 || table 32767) + return table_arg(); + /* XXX need a table in-use check here */ + fprintf(stderr, adding table %d\n, table); + addattr32(n, 1024, IFLA_VRF_TABLE, table); + } else if (matches(*argv, help) == 0) { + explain(); + return -1; + } else { + fprintf(stderr, vrf: unknown option \%s\?\n, + *argv); + explain(); + return -1; + } + argc--, argv++; + } + + return 0; +} + +static void vrf_print_opt(struct link_util *lu, FILE *f, struct rtattr *tb[]) +{ + if (!tb) + return; + + if (tb[IFLA_VRF_TABLE]) + fprintf(f, table %u , rta_getattr_u32(tb[IFLA_VRF_TABLE])); +} + +static void vrf_print_help(struct link_util *lu, int argc, char **argv, + FILE *f) +{ + vrf_explain(f); +} + +struct link_util vrf_link_util = { + .id = vrf, + .maxattr= IFLA_VRF_MAX, + .parse_opt = vrf_parse_opt, + .print_opt = vrf_print_opt, + .print_help = vrf_print_help, +}; -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More 
majordomo info at
[PATCH net-next 11/16] net: Use VRF device index for socket lookups
The intent of the VRF device is to leverage the existing SO_BINDTODEVICE as a means of creating L3 domains. Since sockets are expected to be bound to the VRF device the index of the master device needs to be used for socket lookups. Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com --- net/ipv4/syncookies.c | 5 - net/ipv4/tcp_input.c | 6 +- net/ipv4/tcp_ipv4.c | 11 +-- 3 files changed, 18 insertions(+), 4 deletions(-) diff --git a/net/ipv4/syncookies.c b/net/ipv4/syncookies.c index d70b1f603692..dab52fba5872 100644 --- a/net/ipv4/syncookies.c +++ b/net/ipv4/syncookies.c @@ -18,6 +18,7 @@ #include linux/export.h #include net/tcp.h #include net/route.h +#include net/vrf.h extern int sysctl_tcp_syncookies; @@ -348,7 +349,9 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) treq-snt_synack= tcp_opt.saw_tstamp ? tcp_opt.rcv_tsecr : 0; treq-tfo_listener = false; - ireq-ir_iif = sk-sk_bound_dev_if; + ireq-ir_iif = vrf_get_master_dev_ifindex(sock_net(sk), skb-skb_iif); + if (!ireq-ir_iif) + ireq-ir_iif = sk-sk_bound_dev_if; /* We throwed the options of the initial SYN away, so we hope * the ACK carries the same options again (see RFC1122 4.2.3.8) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c index 4e4d6bcd0ca9..df82fb05c459 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c @@ -72,6 +72,7 @@ #include net/dst.h #include net/tcp.h #include net/inet_common.h +#include net/vrf.h #include linux/ipsec.h #include asm/unaligned.h #include linux/errqueue.h @@ -6141,7 +6142,10 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, tcp_openreq_init(req, tmp_opt, skb, sk); /* Note: tcp_v6_init_req() might override ir_iif for link locals */ - inet_rsk(req)-ir_iif = sk-sk_bound_dev_if; + inet_rsk(req)-ir_iif = vrf_get_master_dev_ifindex(sock_net(sk), + skb-skb_iif); + if (!inet_rsk(req)-ir_iif) + inet_rsk(req)-ir_iif = sk-sk_bound_dev_if; af_ops-init_req(req, sk, skb); diff --git 
a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c index 486ba96ae91a..d0c40f4d9058 100644 --- a/net/ipv4/tcp_ipv4.c +++ b/net/ipv4/tcp_ipv4.c @@ -75,6 +75,7 @@ #include net/secure_seq.h #include net/tcp_memcontrol.h #include net/busy_poll.h +#include net/vrf.h #include linux/inet.h #include linux/ipv6.h @@ -682,6 +683,8 @@ static void tcp_v4_send_reset(struct sock *sk, struct sk_buff *skb) */ if (sk) arg.bound_dev_if = sk-sk_bound_dev_if; + if (!arg.bound_dev_if skb-dev) + arg.bound_dev_if = vrf_master_dev_ifindex(skb-dev); arg.tos = ip_hdr(skb)-tos; ip_send_unicast_reply(*this_cpu_ptr(net-ipv4.tcp_sk), @@ -766,8 +769,10 @@ static void tcp_v4_send_ack(struct sk_buff *skb, u32 seq, u32 ack, ip_hdr(skb)-saddr, /* XXX */ arg.iov[0].iov_len, IPPROTO_TCP, 0); arg.csumoffset = offsetof(struct tcphdr, check) / 2; - if (oif) - arg.bound_dev_if = oif; + arg.bound_dev_if = oif ? : vrf_master_dev_ifindex(skb_dst(skb)-dev); + if (!arg.bound_dev_if) + arg.bound_dev_if = vrf_master_dev_ifindex(skb-dev); + arg.tos = tos; ip_send_unicast_reply(*this_cpu_ptr(net-ipv4.tcp_sk), skb, TCP_SKB_CB(skb)-header.h4.opt, @@ -1269,6 +1274,8 @@ struct sock *tcp_v4_syn_recv_sock(struct sock *sk, struct sk_buff *skb, ireq = inet_rsk(req); sk_daddr_set(newsk, ireq-ir_rmt_addr); sk_rcv_saddr_set(newsk, ireq-ir_loc_addr); + if (netif_index_is_vrf(sock_net(newsk), ireq-ir_iif)) + newsk-sk_bound_dev_if = ireq-ir_iif; newinet-inet_saddr = ireq-ir_loc_addr; inet_opt = ireq-opt; rcu_assign_pointer(newinet-inet_opt, inet_opt); -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 07/16] net: Add inet_addr lookup by table
Currently inet_addr_type and inet_dev_addr_type expect local addresses to be in the local table. With the VRF device local routes for devices associated with a VRF will be in the table associated with the VRF. Provide an alternate inet_addr lookup to use a specific table rather than defaulting to the local table. Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/net/route.h | 1 + net/ipv4/fib_frontend.c | 22 +++--- 2 files changed, 16 insertions(+), 7 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 54f97eea0fb2..3b51c339c269 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -192,6 +192,7 @@ void ipv4_sk_redirect(struct sk_buff *skb, struct sock *sk); void ip_rt_send_redirect(struct sk_buff *skb); unsigned int inet_addr_type(struct net *net, __be32 addr); +unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); void ip_rt_multicast_event(struct in_device *); diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6e68a003d0fd..cc413b0170ed 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -214,12 +214,12 @@ void fib_flush_external(struct net *net) */ static inline unsigned int __inet_dev_addr_type(struct net *net, const struct net_device *dev, - __be32 addr) + __be32 addr, int tb_id) { struct flowi4 fl4 = { .daddr = addr }; struct fib_result res; unsigned int ret = RTN_BROADCAST; - struct fib_table *local_table; + struct fib_table *table; if (ipv4_is_zeronet(addr) || ipv4_is_lbcast(addr)) return RTN_BROADCAST; @@ -228,10 +228,10 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, rcu_read_lock(); - local_table = fib_get_table(net, RT_TABLE_LOCAL); - if (local_table) { + table = fib_get_table(net, tb_id); + if (table) { ret = RTN_UNICAST; - if (!fib_table_lookup(local_table, fl4, res, 
FIB_LOOKUP_NOREF)) { + if (!fib_table_lookup(table, fl4, res, FIB_LOOKUP_NOREF)) { if (!dev || dev == res.fi-fib_dev) ret = res.type; } @@ -241,16 +241,24 @@ static inline unsigned int __inet_dev_addr_type(struct net *net, return ret; } +unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id) +{ + return __inet_dev_addr_type(net, NULL, addr, tb_id); +} +EXPORT_SYMBOL(inet_addr_type_table); + unsigned int inet_addr_type(struct net *net, __be32 addr) { - return __inet_dev_addr_type(net, NULL, addr); + return __inet_dev_addr_type(net, NULL, addr, RT_TABLE_LOCAL); } EXPORT_SYMBOL(inet_addr_type); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr) { - return __inet_dev_addr_type(net, dev, addr); + int rt_table = vrf_dev_table(dev) ? : RT_TABLE_LOCAL; + + return __inet_dev_addr_type(net, dev, addr, rt_table); } EXPORT_SYMBOL(inet_dev_addr_type); -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 12/16] net: Add ipv4 route helper to set next hop
Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/net/route.h | 3 +++ net/ipv4/route.c| 10 ++ 2 files changed, 13 insertions(+) diff --git a/include/net/route.h b/include/net/route.h index b14cbec93fbd..900d50fbcfc7 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -107,6 +107,7 @@ struct rt_cache_stat { extern struct ip_rt_acct __percpu *ip_rt_acct; struct in_device; +struct fib_result; int ip_rt_init(void); void rt_cache_flush(struct net *net); @@ -114,6 +115,8 @@ void rt_flush_dev(struct net_device *dev); struct rtable *ip_route_new_rtable(struct net_device *dev, unsigned int flags, u16 type, bool nopolicy, bool noxfrm, bool do_cache); +void ip_route_set_nexthop(struct rtable *rt, __be32 daddr, + const struct fib_result *res); struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 050a3c1d89ba..47dae001a000 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1537,6 +1537,16 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, return err; } +void ip_route_set_nexthop(struct rtable *rt, __be32 daddr, + const struct fib_result *res) +{ + struct fib_nh_exception *fnhe; + + fnhe = find_exception(FIB_RES_NH(*res), daddr); + + rt_set_nexthop(rt, daddr, res, fnhe, res-fi, res-type, 0); +} +EXPORT_SYMBOL(ip_route_set_nexthop); static void ip_handle_martian_source(struct net_device *dev, struct in_device *in_dev, -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 09/16] net: Add routes to the table associated with the device
When a device associated with a VRF is brought up or down routes should be added to/removed from the table associated with the VRF. fib_magic defaults to using the main or local tables. Have it use the table with the device if there is one. A part of this is directing prefsrc validations to the correct table as well. Signed-off-by: David Ahern d...@cumulusnetworks.com --- net/ipv4/fib_frontend.c | 8 net/ipv4/fib_semantics.c | 25 +++-- 2 files changed, 23 insertions(+), 10 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 5ce0d11222ca..e35541a64449 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -805,6 +805,7 @@ static int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb) static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifaddr *ifa) { struct net *net = dev_net(ifa-ifa_dev-dev); + int tb_id = vrf_dev_table(ifa-ifa_dev-dev); struct fib_table *tb; struct fib_config cfg = { .fc_protocol = RTPROT_KERNEL, @@ -819,11 +820,10 @@ static void fib_magic(int cmd, int type, __be32 dst, int dst_len, struct in_ifad }, }; - if (type == RTN_UNICAST) - tb = fib_new_table(net, RT_TABLE_MAIN); - else - tb = fib_new_table(net, RT_TABLE_LOCAL); + if (!tb_id) + tb_id = (type == RTN_UNICAST) ? 
RT_TABLE_MAIN : RT_TABLE_LOCAL; + tb = fib_new_table(net, tb_id); if (!tb) return; diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index a578eacf9fcd..37e1dee7692a 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -838,6 +838,23 @@ __be32 fib_info_update_nh_saddr(struct net *net, struct fib_nh *nh) return nh-nh_saddr; } +static bool fib_valid_prefsrc(struct fib_config *cfg, __be32 fib_prefsrc) +{ + if (cfg-fc_type != RTN_LOCAL || !cfg-fc_dst || + fib_prefsrc != cfg-fc_dst) { + int tb_id = cfg-fc_table; + + if (tb_id == RT_TABLE_MAIN) + tb_id = RT_TABLE_LOCAL; + + if (inet_addr_type_table(cfg-fc_nlinfo.nl_net, +fib_prefsrc, tb_id) != RTN_LOCAL) { + return false; + } + } + return true; +} + struct fib_info *fib_create_info(struct fib_config *cfg) { int err; @@ -1033,12 +1050,8 @@ struct fib_info *fib_create_info(struct fib_config *cfg) fi-fib_flags |= RTNH_F_LINKDOWN; } - if (fi-fib_prefsrc) { - if (cfg-fc_type != RTN_LOCAL || !cfg-fc_dst || - fi-fib_prefsrc != cfg-fc_dst) - if (inet_addr_type(net, fi-fib_prefsrc) != RTN_LOCAL) - goto err_inval; - } + if (fi-fib_prefsrc !fib_valid_prefsrc(cfg, fi-fib_prefsrc)) + goto err_inval; change_nexthops(fi) { fib_info_update_nh_saddr(net, nexthop_nh); -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 05/16] net: Use VRF device index for lookups on TX
As with ingress use the index of VRF master device for route lookups on egress. However, the oif should only be used to direct the lookups to a specific table. Routes in the table are not based on the VRF device but rather interfaces that are part of the VRF so do not consider the oif for lookups within the table. The FLOWI_FLAG_VRFSRC is used to control this latter part. Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/net/flow.h | 1 + include/net/route.h | 3 +++ net/ipv4/fib_trie.c | 7 +-- net/ipv4/icmp.c | 4 net/ipv4/route.c| 3 +++ 5 files changed, 16 insertions(+), 2 deletions(-) diff --git a/include/net/flow.h b/include/net/flow.h index 3098ae33a178..f305588fc162 100644 --- a/include/net/flow.h +++ b/include/net/flow.h @@ -33,6 +33,7 @@ struct flowi_common { __u8flowic_flags; #define FLOWI_FLAG_ANYSRC 0x01 #define FLOWI_FLAG_KNOWN_NH0x02 +#define FLOWI_FLAG_VRFSRC 0x04 __u32 flowic_secid; struct flowi_tunnel flowic_tun_key; }; diff --git a/include/net/route.h b/include/net/route.h index cec7a2a055c8..54f97eea0fb2 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -254,6 +254,9 @@ static inline void ip_route_connect_init(struct flowi4 *fl4, __be32 dst, __be32 if (inet_sk(sk)-transparent) flow_flags |= FLOWI_FLAG_ANYSRC; + if (netif_index_is_vrf(sock_net(sk), oif)) + flow_flags |= FLOWI_FLAG_VRFSRC; + flowi4_init_output(fl4, oif, sk-sk_mark, tos, RT_SCOPE_UNIVERSE, protocol, flow_flags, dst, src, dport, sport); } diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ac2d828c6daa..7da901c56e35 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1421,8 +1421,11 @@ int fib_table_lookup(struct fib_table *tb, const struct flowi4 *flp, nh-nh_flags RTNH_F_LINKDOWN !(fib_flags FIB_LOOKUP_IGNORE_LINKSTATE)) continue; - if (flp-flowi4_oif flp-flowi4_oif != nh-nh_oif) - continue; + if (!(flp-flowi4_flags FLOWI_FLAG_VRFSRC)) { + if (flp-flowi4_oif + flp-flowi4_oif != 
nh-nh_oif) + continue; + } if (!(fib_flags FIB_LOOKUP_NOREF)) atomic_inc(fi-fib_clntref); diff --git a/net/ipv4/icmp.c b/net/ipv4/icmp.c index c0556f1e4bf0..d2d142b775b8 100644 --- a/net/ipv4/icmp.c +++ b/net/ipv4/icmp.c @@ -96,6 +96,7 @@ #include net/xfrm.h #include net/inet_common.h #include net/ip_fib.h +#include net/vrf.h /* * Build xmit assembly blocks @@ -425,6 +426,7 @@ static void icmp_reply(struct icmp_bxm *icmp_param, struct sk_buff *skb) fl4.flowi4_mark = mark; fl4.flowi4_tos = RT_TOS(ip_hdr(skb)-tos); fl4.flowi4_proto = IPPROTO_ICMP; + fl4.flowi4_oif = vrf_master_dev_ifindex(skb-dev) ? : skb-dev-ifindex; security_skb_classify_flow(skb, flowi4_to_flowi(fl4)); rt = ip_route_output_key(net, fl4); if (IS_ERR(rt)) @@ -458,6 +460,8 @@ static struct rtable *icmp_route_lookup(struct net *net, fl4-flowi4_proto = IPPROTO_ICMP; fl4-fl4_icmp_type = type; fl4-fl4_icmp_code = code; + fl4-flowi4_oif = vrf_master_dev_ifindex(skb_in-dev) ? : skb_in-dev-ifindex; + security_skb_classify_flow(skb_in, flowi4_to_flowi(fl4)); rt = __ip_route_output_key(net, fl4); if (IS_ERR(rt)) diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ba74c83c05be..8119896e1159 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -2093,6 +2093,9 @@ struct rtable *__ip_route_output_key(struct net *net, struct flowi4 *fl4) if (!dev_out) goto out; + if (netif_is_vrf(dev_out)) + fl4-flowi4_flags |= FLOWI_FLAG_VRFSRC; + /* RACE: Check return value of inet_select_addr instead. */ if (!(dev_out-flags IFF_UP) || !__in_dev_get_rcu(dev_out)) { rth = ERR_PTR(-ENETUNREACH); -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next v2] route: allow to route in a peer netns via lwt framework
Le 24/07/2015 17:39, Eric Dumazet a écrit : On Fri, 2015-07-24 at 16:16 +0200, Nicolas Dichtel wrote: This patch takes advantage of the newly added lwtunnel framework to allow the user to set routes that point to a peer netns. Packets are injected to the peer netns via the loopback device. It works only when the output device is 'lo'. Example: ip route add 40.1.1.1/32 encap netns nsid 5 via dev lo Is this feature so badly wanted to add complexity on lo device ? The goal is to be scalable when the number of netns is high (10k or more). With this patch, we can save two interfaces (veth) per netns, which helps to reduce memory consumption and the time needed to create a netns. [snip] + if (nsid != NETNSA_NSID_NOT_ASSIGNED) { + peernet = get_net_ns_by_id(dev_net(dev), nsid); + if (!peernet) { + kfree_skb(skb); + goto end; + } + + /* it's OK to use per_cpu_ptr() because BHs are off */ + lb_stats = this_cpu_ptr(peernet-loopback_dev-lstats); + ret = dev_forward_skb(peernet-loopback_dev, skb); + } else { + skb_orphan(skb); - /* it's OK to use per_cpu_ptr() because BHs are off */ - lb_stats = this_cpu_ptr(dev-lstats); + skb-protocol = eth_type_trans(skb, dev); + + /* it's OK to use per_cpu_ptr() because BHs are off */ + lb_stats = this_cpu_ptr(dev-lstats); + ret = netif_rx(skb); + } len = skb-len; use after free error At this point you no longer can access skb Right, will fix it. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next] route: allow to route in a peer netns via lwt framework
Le 24/07/2015 17:19, David Ahern a écrit : In this case you are knowingly dropping packets. Would be nice to have a counter showing that. Ok. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: netns refcnt leak for kernel accept sock
On Mon, Jul 27, 2015 at 11:19 AM, Sowmini Varadhan sowmini.varad...@oracle.com wrote: On (07/27/15 11:13), Cong Wang wrote: That refcnt should be released in sock destructor too, when the tcp connection is terminated. yes, but in my case, the listen socket is opened as part of the ->init indirection in pernet_operations (thus it is a kernel socket) and the expectation is that this listen socket, and any accept sockets derived from it, will be closed in ->exit. But if the accept socket is treated as a uspace socket (thus holds a get_net()) then it will block cleanup_net() and the associated ->exit cleanup operations. This is probably not a problem for other systems like vxlan/gue/geneve etc because they all use udp sockets, thus don't have the accept equivalent. dlm uses a kernel TCP socket too, but it allocates a new socket and calls ->accept() by itself. ;) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH v4 net-next 1/4] tcp: replace cnt rtt with struct in pkts_acked()
On Fri, 24 Jul 2015 19:47:03 -0700 Lawrence Brakmo bra...@fb.com wrote: Replace 2 arguments (cnt and rtt) in the congestion control modules' pkts_acked() function with a struct. This will allow adding more information without having to modify existing congestion control modules (tcp_nv in particular needs bytes in flight when packet was sent). As proposed by Neal Cardwell in his comments to the tcp_nv patch. Adding a layer of indirection makes code changes easier, but makes the code slower. Arguments are passed in registers, and putting an additional level of indirection only matters if you can't change all the CC modules. Since this is the kernel and API compatibility doesn't matter, just pass more arguments. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[net-next 0/16] Proposal for VRF-lite - v3
In the context of internet scale routing a requirement that always comes up is the need to partition the available routing tables into disjoint routing planes. A specific use case is the multi-tenancy problem where each tenant has their own unique routing tables and at the very least need different default gateways. This patch allows the ability to create virtual router domains (aka VRFs, VRF-lite to be specific) in the linux packet forwarding stack. The main observation is that through the use of rules and socket binding to interfaces, all the facilities that we need are already present in the infrastructure. What is missing is a handle that identifies a routing domain and can be used to gather applicable rules/tables and uniqify neighbor selection. The scheme used needs to preserve the notions of ECMP, and general routing principles. This driver is a cross between functionality that the IPVLAN driver and the Team drivers provide where a device is created and packets into/out of the routing domain are shuttled through this device. The device is then used as a handle to identify the applicable rules. The VRF device is thus the layer3 equivalent of a vlan device. The very important point to note is that this is only a Layer3 concept so L2 tools (e.g., LLDP) do not need to be run in each VRF, processes can run in unaware mode or select a VRF to be talking through. Also the behavioral model is a generalized application of the familiar VRF-Lite model with some performance paths that need optimization. (Specifically the output route selector that Roopa, Robert, Thomas and EricB are currently discussing on the MPLS thread) High Level points = 1. Simple overlay driver (minimal changes to current stack) * uses the existing fib tables and fib rules infrastructure 2. Modelled closely after the ipvlan driver 3. Uses current API and infrastructure. 
* Applications can use SO_BINDTODEVICE or cmsg device identifiers to pick VRF (ping, traceroute just work) * Standard IP Rules work, and since they are aggregated against the device, scale is manageable 4. Completely orthogonal to Namespaces and only provides separation in the routing plane (and ARP) N2 N1 (all configs here) +---+ +--+ | | |swp1 :10.0.1.1+--+swp1 :10.0.1.2 | | | | | |swp2 :10.0.2.1+--+swp2 :10.0.2.2 | | | +---+ | VRF 1| | table 5 | | | +---+ | | | VRF 2| N3 | table 6 | +---+ | | | | |swp3 :10.0.2.1+--+swp1 :10.0.2.2 | | | | | |swp4 :10.0.3.1+--+swp2 :10.0.3.2 | +--+ +---+ Given the topology above, the setup needed to get the basic VRF functions working would be Create the VRF devices and associate with a table ip link add vrf1 type vrf table 5 ip link add vrf2 type vrf table 6 Install the lookup rules that map table to VRF domain ip rule add pref 200 oif vrf1 lookup 5 ip rule add pref 200 iif vrf1 lookup 5 ip rule add pref 200 oif vrf2 lookup 6 ip rule add pref 200 iif vrf2 lookup 6 ip link set vrf1 up ip link set vrf2 up Enslave the routing member interfaces ip link set swp1 master vrf1 ip link set swp2 master vrf1 ip link set swp3 master vrf2 ip link set swp4 master vrf2 Connected routes are automatically moved from main table to the VRF table. ping using VRF0 is simply ping -I vrf0 10.0.1.2 Or using the task context and a command such as the example chvrf in patch 15 unmodified applications are run in a VRF context using: chvrf -v 1 ping 10.0.1.2 Design Highlights = If a device is enslaved to a VRF device (ie., associated with a VRF) then: 1. Rx path The master device index is used as the iif for all lookups. 2. Tx path Similarly, for Tx the VRF device oif is used in the flow to direct lookups to the table associated with the VRF via its rule. From there the FLOWI_FLAG_VRFSRC flag is used to indicate that the oif should not be used for FIB table lookups. 3. 
Connected and local routes On link up for a device, connected and local routes are added to the table associated with the VRF device, rather than the local and main tables. 4. Socket lookups Socket lookups use the VRF device for comparison with sk_bound_dev_if. If a socket is not bound to a device a socket match can happen based on destination address, port and protocol in which case a VRF global or agnostic
[PATCH net-next 08/16] net: Fix up inet_addr_type checks
Currently inet_addr_type and inet_dev_addr_type expect local addresses to be in the local table. With the VRF device local routes for devices associated with a VRF will be in the table associated with the VRF. Provide an alternate inet_addr lookup to use a specific table rather than defaulting to the local table. inet_addr_type_dev_table keeps the same semantics as inet_addr_type but if the passed in device is enslaved to a VRF then the table for that VRF is used for the lookup. Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/net/route.h | 3 +++ net/ipv4/af_inet.c | 13 - net/ipv4/arp.c | 15 +-- net/ipv4/fib_frontend.c | 28 +--- net/ipv4/fib_semantics.c | 6 -- net/ipv4/icmp.c | 5 +++-- 6 files changed, 56 insertions(+), 14 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 3b51c339c269..b14cbec93fbd 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -195,6 +195,9 @@ unsigned int inet_addr_type(struct net *net, __be32 addr); unsigned int inet_addr_type_table(struct net *net, __be32 addr, int tb_id); unsigned int inet_dev_addr_type(struct net *net, const struct net_device *dev, __be32 addr); +unsigned int inet_addr_type_dev_table(struct net *net, + const struct net_device *dev, + __be32 addr); void ip_rt_multicast_event(struct in_device *); int ip_rt_ioctl(struct net *, unsigned int cmd, void __user *arg); void ip_rt_get_source(u8 *src, struct sk_buff *skb, struct rtable *rt); diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c index cc4e498a0ccf..09c7c1ee307e 100644 --- a/net/ipv4/af_inet.c +++ b/net/ipv4/af_inet.c @@ -119,6 +119,7 @@ #ifdef CONFIG_IP_MROUTE #include linux/mroute.h #endif +#include net/vrf.h /* The inetsw table contains everything that inet_create needs to @@ -427,6 +428,7 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) struct net *net = sock_net(sk); unsigned short snum; int chk_addr_ret; + int tb_id = 0; int err; /* If the socket has its own bind function then use it. 
(RAW) */ @@ -448,7 +450,16 @@ int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len) goto out; } - chk_addr_ret = inet_addr_type(net, addr-sin_addr.s_addr); + if (sk-sk_bound_dev_if) { + struct net_device *dev; + + dev = dev_get_by_index(net, sk-sk_bound_dev_if); + if (dev) { + tb_id = vrf_dev_table(dev); + dev_put(dev); + } + } + chk_addr_ret = inet_addr_type_table(net, addr-sin_addr.s_addr, tb_id); /* Not specified by any standard per-se, however it breaks too * many applications when removed. It is unfortunate since diff --git a/net/ipv4/arp.c b/net/ipv4/arp.c index 1d59e50ce8b7..53eee7cecce8 100644 --- a/net/ipv4/arp.c +++ b/net/ipv4/arp.c @@ -233,7 +233,7 @@ static int arp_constructor(struct neighbour *neigh) return -EINVAL; } - neigh-type = inet_addr_type(dev_net(dev), addr); + neigh-type = inet_addr_type_dev_table(dev_net(dev), dev, addr); parms = in_dev-arp_parms; __neigh_parms_put(neigh-parms); @@ -343,7 +343,7 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb inet_addr_type(dev_net(dev), + if (skb inet_addr_type_dev_table(dev_net(dev), dev, ip_hdr(skb)-saddr) == RTN_LOCAL) saddr = ip_hdr(skb)-saddr; break; @@ -351,7 +351,8 @@ static void arp_solicit(struct neighbour *neigh, struct sk_buff *skb) if (!skb) break; saddr = ip_hdr(skb)-saddr; - if (inet_addr_type(dev_net(dev), saddr) == RTN_LOCAL) { + if (inet_addr_type_dev_table(dev_net(dev), dev, +saddr) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -751,7 +752,7 @@ static int arp_process(struct sock *sk, struct sk_buff *skb) /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp-ar_op == htons(ARPOP_REQUEST) - inet_addr_type(net, tip) == RTN_LOCAL + inet_addr_type_dev_table(net, dev, tip) == RTN_LOCAL !arp_ignore(in_dev, sip, tip)) arp_send(ARPOP_REPLY, ETH_P_ARP, 
sip, dev, tip, sha,
[PATCH net-next 04/16] net: Use VRF device index for lookups on RX
On ingress use index of VRF master device for route lookups if real device is enslaved. Rules are expected to be installed for the VRF device to direct lookups to a specific table. Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com --- net/ipv4/fib_frontend.c | 8 +++- net/ipv4/route.c| 3 ++- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index c565fc182240..6e68a003d0fd 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -45,6 +45,7 @@ #include net/ip_fib.h #include net/rtnetlink.h #include net/xfrm.h +#include net/vrf.h #ifndef CONFIG_IP_MULTIPLE_TABLES @@ -311,7 +312,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, bool dev_match; fl4.flowi4_oif = 0; - fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; + fl4.flowi4_iif = vrf_master_dev_ifindex(dev); + if (!fl4.flowi4_iif) + fl4.flowi4_iif = oif ? : LOOPBACK_IFINDEX; fl4.daddr = src; fl4.saddr = dst; fl4.flowi4_tos = tos; @@ -341,6 +344,9 @@ static int __fib_validate_source(struct sk_buff *skb, __be32 src, __be32 dst, if (nh-nh_dev == dev) { dev_match = true; break; + } else if (vrf_master_dev_ifindex(nh-nh_dev) == dev-ifindex) { + dev_match = true; + break; } } #else diff --git a/net/ipv4/route.c b/net/ipv4/route.c index ef140919211f..ba74c83c05be 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -112,6 +112,7 @@ #endif #include net/secure_seq.h #include net/ip_tunnels.h +#include net/vrf.h #define RT_FL_TOS(oldflp4) \ ((oldflp4)-flowi4_tos (IPTOS_RT_MASK | RTO_ONLINK)) @@ -1735,7 +1736,7 @@ static int ip_route_input_slow(struct sk_buff *skb, __be32 daddr, __be32 saddr, * Now we are ready to route packet. */ fl4.flowi4_oif = 0; - fl4.flowi4_iif = dev-ifindex; + fl4.flowi4_iif = vrf_master_dev_ifindex(dev) ? 
: dev-ifindex; fl4.flowi4_mark = skb-mark; fl4.flowi4_tos = tos; fl4.flowi4_scope = RT_SCOPE_UNIVERSE; -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 10/16] net: Use passed in table for nexthop lookups
If a user passes in a table for new routes use that table for nexthop lookups. Specifically, this solves the case where a connected route does not exist in the main table, but only another table and then a subsequent route is added with a next hop using the connected route. ie., $ ip route ls default via 10.0.2.2 dev eth0 10.0.2.0/24 dev eth0 proto kernel scope link src 10.0.2.15 169.254.0.0/16 dev eth0 scope link metric 1003 192.168.56.0/24 dev eth1 proto kernel scope link src 192.168.56.51 $ ip route ls table 10 1.1.1.0/24 dev eth2 scope link Without this patch adding a nexthop route fails: $ ip route add table 10 2.2.2.0/24 via 1.1.1.10 RTNETLINK answers: Network is unreachable With this patch the route is added successfully. Signed-off-by: David Ahern d...@cumulusnetworks.com --- net/ipv4/fib_semantics.c | 13 +++-- 1 file changed, 11 insertions(+), 2 deletions(-) diff --git a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c index 37e1dee7692a..7d79dfbfa5d2 100644 --- a/net/ipv4/fib_semantics.c +++ b/net/ipv4/fib_semantics.c @@ -691,6 +691,7 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, } rcu_read_lock(); { + struct fib_table *tbl = NULL; struct flowi4 fl4 = { .daddr = nh-nh_gw, .flowi4_scope = cfg-fc_scope + 1, @@ -701,8 +702,16 @@ static int fib_check_nh(struct fib_config *cfg, struct fib_info *fi, /* It is not necessary, but requires a bit of thinking */ if (fl4.flowi4_scope RT_SCOPE_LINK) fl4.flowi4_scope = RT_SCOPE_LINK; - err = fib_lookup(net, fl4, res, -FIB_LOOKUP_IGNORE_LINKSTATE); + + if (cfg-fc_table) + tbl = fib_get_table(net, cfg-fc_table); + + if (tbl) + err = fib_table_lookup(tbl, fl4, res, + FIB_LOOKUP_IGNORE_LINKSTATE); + else + err = fib_lookup(net, fl4, res, +FIB_LOOKUP_IGNORE_LINKSTATE); if (err) { rcu_read_unlock(); return err; -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at 
http://vger.kernel.org/majordomo-info.html
[PATCH net-next 01/16] net: Refactor rtable allocation and initialization
All callers to rt_dst_alloc have nearly the same initialization following a successful allocation. Consolidate it into ip_route_new_rtable. Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/net/route.h | 3 ++ net/ipv4/route.c| 111 +++- 2 files changed, 51 insertions(+), 63 deletions(-) diff --git a/include/net/route.h b/include/net/route.h index 2d45f419477f..cec7a2a055c8 100644 --- a/include/net/route.h +++ b/include/net/route.h @@ -111,6 +111,9 @@ struct in_device; int ip_rt_init(void); void rt_cache_flush(struct net *net); void rt_flush_dev(struct net_device *dev); +struct rtable *ip_route_new_rtable(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool do_cache); struct rtable *__ip_route_output_key(struct net *, struct flowi4 *flp); struct rtable *ip_route_output_flow(struct net *, struct flowi4 *flp, struct sock *sk); diff --git a/net/ipv4/route.c b/net/ipv4/route.c index 11096396ef4a..ef140919211f 100644 --- a/net/ipv4/route.c +++ b/net/ipv4/route.c @@ -1443,12 +1443,42 @@ static struct rtable *rt_dst_alloc(struct net_device *dev, (noxfrm ? 
DST_NOXFRM : 0)); } +struct rtable *ip_route_new_rtable(struct net_device *dev, + unsigned int flags, u16 type, + bool nopolicy, bool noxfrm, bool do_cache) +{ + struct rtable *rth; + + rth = rt_dst_alloc(dev, nopolicy, noxfrm, do_cache); + if (rth) { + rth-rt_genid = rt_genid_ipv4(dev_net(dev)); + rth-rt_flags = flags; + rth-rt_type = type; + rth-rt_is_input = 0; + rth-rt_iif = 0; + rth-rt_pmtu = 0; + rth-rt_gateway = 0; + rth-rt_uses_gateway = 0; + INIT_LIST_HEAD(rth-rt_uncached); + rth-rt_lwtstate = NULL; + + rth-dst.output = ip_output; + if (flags RTCF_LOCAL) + rth-dst.input = ip_local_deliver; + } + + return rth; +} +EXPORT_SYMBOL(ip_route_new_rtable); + /* called in rcu_read_lock() section */ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, u8 tos, struct net_device *dev, int our) { struct rtable *rth; struct in_device *in_dev = __in_dev_get_rcu(dev); + unsigned int flags = RTCF_MULTICAST; + u16 type = RTN_MULTICAST; u32 itag = 0; int err; @@ -1474,8 +1504,13 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, if (err 0) goto e_err; } - rth = rt_dst_alloc(dev_net(dev)-loopback_dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), false, false); + if (our) + flags |= RTCF_LOCAL; + + rth = ip_route_new_rtable(dev_net(dev)-loopback_dev, + flags, type, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + false, false); if (!rth) goto e_nobufs; @@ -1483,22 +1518,7 @@ static int ip_route_input_mc(struct sk_buff *skb, __be32 daddr, __be32 saddr, rth-dst.tclassid = itag; #endif rth-dst.output = ip_rt_bug; - - rth-rt_genid = rt_genid_ipv4(dev_net(dev)); - rth-rt_flags = RTCF_MULTICAST; - rth-rt_type= RTN_MULTICAST; rth-rt_is_input= 1; - rth-rt_iif = 0; - rth-rt_pmtu= 0; - rth-rt_gateway = 0; - rth-rt_uses_gateway = 0; - INIT_LIST_HEAD(rth-rt_uncached); - rth-rt_lwtstate = NULL; - if (our) { - rth-dst.input= ip_local_deliver; - rth-rt_flags |= RTCF_LOCAL; - } - #ifdef CONFIG_IP_MROUTE if (!ipv4_is_local_multicast(daddr) 
IN_DEV_MFORWARD(in_dev)) rth-dst.input = ip_mr_input; @@ -1606,28 +1626,17 @@ static int __mkroute_input(struct sk_buff *skb, } } - rth = rt_dst_alloc(out_dev-dev, - IN_DEV_CONF_GET(in_dev, NOPOLICY), - IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); + rth = ip_route_new_rtable(out_dev-dev, 0, res-type, + IN_DEV_CONF_GET(in_dev, NOPOLICY), + IN_DEV_CONF_GET(out_dev, NOXFRM), do_cache); if (!rth) { err = -ENOBUFS; goto cleanup; } - rth-rt_genid = rt_genid_ipv4(dev_net(rth-dst.dev)); - rth-rt_flags = 0; - rth-rt_type = res-type; rth-rt_is_input = 1; - rth-rt_iif = 0; - rth-rt_pmtu= 0; - rth-rt_gateway = 0; - rth-rt_uses_gateway = 0; - INIT_LIST_HEAD(rth-rt_uncached); -
Re: netns refcnt leak for kernel accept sock
On (07/27/15 11:37), Cong Wang wrote: dlm uses a kernel TCP socket too, but it allocates a new socket and calls ->accept() by itself. ;) sure, and rds does this in rds_tcp_accept_one() too. But the newsk being created in sk_clone_lock is the one on an incoming syn, i.e., the one that is saved up as part of listen backlog, to be returned later on the accept. I don't know the details of dlm - can you have one dlm instance per network namespace? That's where I'm running into this issue - when we try to have one rds listen socket per netns, and want to be able to do both - dynamically build/tear down new network namespaces, without unloading rds_tcp globally - unload rds_tcp globally without tearing down individual netns. But perhaps we digress. Fundamental issue remains: newsk is the syn_recv version of the listen socket. If the listen socket is a kernel socket (kern == 1 for sk_alloc, and the listen socket thus has no sk_net_refcnt), the syn_recv socket must also have that behavior, so that it is cleaned up in the same way. --Sowmini -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next 3/3] openvswitch: 802.1AD: Flow handling, actions, vlan parsing and netlink attributes
On Sun, Jul 26, 2015 at 7:52 AM, Thomas F Herbert thomasfherb...@gmail.com wrote: Add support for 802.1ad including the ability to push and pop double tagged vlans. Add support for 802.1ad to netlink parsing and flow conversion. Uses double nested encap attributes to represent double tagged vlan. Inner TPID encoded along with ctci in nested attributes. Allows either 0x8100 or 0x88a8 on inner or outer tags. Signed-off-by: Thomas F Herbert thomasfherb...@gmail.com --- net/openvswitch/flow.c | 84 +++--- net/openvswitch/flow.h | 5 ++ net/openvswitch/flow_netlink.c | 196 ++--- 3 files changed, 243 insertions(+), 42 deletions(-) diff --git a/net/openvswitch/flow.c b/net/openvswitch/flow.c index 8db22ef..0abab37 100644 --- a/net/openvswitch/flow.c +++ b/net/openvswitch/flow.c @@ -298,21 +298,80 @@ static bool icmp6hdr_ok(struct sk_buff *skb) static int parse_vlan(struct sk_buff *skb, struct sw_flow_key *key) { struct qtag_prefix { - __be16 eth_type; /* ETH_P_8021Q */ + __be16 eth_type; /* ETH_P_8021Q or ETH_P_8021AD */ __be16 tci; }; - struct qtag_prefix *qp; + struct qtag_prefix *qp = (struct qtag_prefix *)skb-data; - if (unlikely(skb-len sizeof(struct qtag_prefix) + sizeof(__be16))) + struct qinqtag_prefix { + __be16 eth_type; /* ETH_P_8021Q or ETH_P_8021AD */ + __be16 tci; + __be16 inner_tpid; /* ETH_P_8021Q */ + __be16 ctci; + }; + + if (likely(skb_vlan_tag_present(skb))) { + key-eth.tci = htons(skb-vlan_tci); + + /* Case where upstream +* processing has already stripped the outer vlan tag. +*/ + if (unlikely(skb-vlan_proto == htons(ETH_P_8021AD))) { + if (unlikely(skb-len sizeof(struct qtag_prefix) + + sizeof(__be16))) { + key-eth.tci = 0; + return 0; + } + + if (unlikely(!pskb_may_pull(skb, + sizeof(struct qtag_prefix) + + sizeof(__be16 { + return -ENOMEM; + } + No need for curly brackets around a single statement. 
+ if (likely(qp-eth_type == htons(ETH_P_8021Q))) { + key-eth.cvlan.ctci = + qp-tci | htons(VLAN_TAG_PRESENT); + key-eth.cvlan.c_tpid = qp-eth_type; + __skb_pull(skb, sizeof(struct qtag_prefix)); + } key-eth.cvlan.tci and tpid should be set irrespective of qp-eth_type, as is done below for the non-offload case. + } return 0; + } - if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + -sizeof(__be16 - return -ENOMEM; - qp = (struct qtag_prefix *) skb-data; - key-eth.tci = qp-tci | htons(VLAN_TAG_PRESENT); - __skb_pull(skb, sizeof(struct qtag_prefix)); + if (qp-eth_type == htons(ETH_P_8021AD)) { + struct qinqtag_prefix *qinqp = + (struct qinqtag_prefix *)skb-data; + + if (unlikely(skb-len sizeof(struct qinqtag_prefix) + + sizeof(__be16))) + return 0; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct qinqtag_prefix) + + sizeof(__be16 { + return -ENOMEM; + } No need for curly brackets around a single statement. + key-eth.tci = qinqp-tci | htons(VLAN_TAG_PRESENT); + key-eth.cvlan.ctci = qinqp-ctci | htons(VLAN_TAG_PRESENT); + key-eth.cvlan.c_tpid = qinqp-inner_tpid; + + __skb_pull(skb, sizeof(struct qinqtag_prefix)); + + return 0; + } + if (qp-eth_type == htons(ETH_P_8021Q)) { + if (unlikely(skb-len sizeof(struct qtag_prefix) + + sizeof(__be16))) + return -ENOMEM; + + if (unlikely(!pskb_may_pull(skb, sizeof(struct qtag_prefix) + + sizeof(__be16 + return 0; + key-eth.tci = qp-tci | htons(VLAN_TAG_PRESENT); + + __skb_pull(skb, sizeof(struct qtag_prefix)); + } return 0; } @@ -474,9 +533,10 @@ static int key_extract(struct sk_buff *skb, struct sw_flow_key *key) */ key-eth.tci = 0; - if
Re: [RFC PATCH v4 net-next 1/4] tcp: replace cnt rtt with struct in pkts_acked()
On 7/27/15, 11:46 AM, Stephen Hemminger step...@networkplumber.org wrote: On Fri, 24 Jul 2015 19:47:03 -0700 Lawrence Brakmo bra...@fb.com wrote: Replace 2 arguments (cnt and rtt) in the congestion control modules' pkts_acked() function with a struct. This will allow adding more information without having to modify existing congestion control modules (tcp_nv in particular needs bytes in flight when packet was sent). As proposed by Neal Cardwell in his comments to the tcp_nv patch. Adding a layer of indirection makes code changes easier, but makes the code slower. Arguments are passed in registers, and putting an additional level of indirection only matters if you can't change all the CC modules. Since this is the kernel and API compatibility doesn't matter, just pass more arguments. I prefer the cleanliness of passing a structure and don't think the overhead will be significant enough to worry about it. Will the compiler pass struct values in registers if the struct is passed by value? I will be happy to do it either way (I did it like Stephen proposes originally). What does everyone else think? -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next 13/16] net: Introduce VRF device driver - v2
On 07/27/2015 08:31 PM, David Ahern wrote: This driver borrows heavily from IPvlan and teaming drivers. Routing domains (VRF-lite) are created by instantiating a VRF master device with an associated table and enslaving all routed interfaces that participate in the domain. As part of the enslavement, all connected routes for the enslaved devices are moved to the table associated with the VRF device. Outgoing sockets must bind to the VRF device to function. Standard FIB rules bind the VRF device to tables and regular fib rule processing is followed. Routed traffic through the box, is forwarded by using the VRF device as the IIF and following the IIF rule to a table that is mated with the VRF. Example: Create vrf 1: ip link add vrf1 type vrf table 5 ip rule add iif vrf1 table 5 ip rule add oif vrf1 table 5 ip route add table 5 prohibit default ip link set vrf1 up Add interface to vrf 1: ip link set eth1 master vrf1 Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com v2: - addressed comments from first RFC - significant changes to improve simplicity of implementation --- drivers/net/Kconfig | 7 + drivers/net/Makefile | 1 + drivers/net/vrf.c| 596 +++ 3 files changed, 604 insertions(+) create mode 100644 drivers/net/vrf.c diff --git a/drivers/net/Kconfig b/drivers/net/Kconfig index c18f9e62a9fa..e58468b02987 100644 --- a/drivers/net/Kconfig +++ b/drivers/net/Kconfig @@ -297,6 +297,13 @@ config NLMON diagnostics, etc. This is mostly intended for developers or support to debug netlink issues. If unsure, say N. +config NET_VRF + tristate Virtual Routing and Forwarding (Lite) + depends on IP_MULTIPLE_TABLES IPV6_MULTIPLE_TABLES + ---help--- + This option enables the support for mapping interfaces into VRF's. The + support enables VRF devices. 
+ endif # NET_CORE config SUNGEM_PHY diff --git a/drivers/net/Makefile b/drivers/net/Makefile index c12cb22478a7..ca16dd689b36 100644 --- a/drivers/net/Makefile +++ b/drivers/net/Makefile @@ -25,6 +25,7 @@ obj-$(CONFIG_VIRTIO_NET) += virtio_net.o obj-$(CONFIG_VXLAN) += vxlan.o obj-$(CONFIG_GENEVE) += geneve.o obj-$(CONFIG_NLMON) += nlmon.o +obj-$(CONFIG_NET_VRF) += vrf.o # # Networking Drivers diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c new file mode 100644 index ..8669b0f9d749 --- /dev/null +++ b/drivers/net/vrf.c @@ -0,0 +1,596 @@ +/* + * vrf.c: device driver to encapsulate a VRF space + * + * Copyright (c) 2015 Cumulus Networks + * + * Based on dummy, team and ipvlan drivers + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. + */ + +#include linux/module.h +#include linux/kernel.h +#include linux/netdevice.h +#include linux/etherdevice.h +#include linux/ip.h +#include linux/init.h +#include linux/moduleparam.h +#include linux/rtnetlink.h +#include net/rtnetlink.h +#include linux/u64_stats_sync.h +#include linux/hashtable.h + +#include linux/inetdevice.h +#include net/ip.h +#include net/ip_fib.h +#include net/ip6_route.h +#include net/rtnetlink.h +#include net/route.h +#include net/addrconf.h +#include net/vrf.h + +#define DRV_NAME vrf +#define DRV_VERSION 1.0 + +#define vrf_is_slave(dev) ((dev)-flags IFF_SLAVE) +#define vrf_is_master(dev) ((dev)-flags IFF_MASTER) + +#define vrf_master_get_rcu(dev) \ + ((struct net_device *)rcu_dereference(dev-rx_handler_data)) + +struct pcpu_dstats { + u64 tx_pkts; + u64 tx_bytes; + u64 tx_drps; + u64 rx_pkts; + u64 rx_bytes; + struct u64_stats_sync syncp; +}; + +struct slave { + struct list_headlist; + struct net_device *dev; +}; + +struct slave_queue { + spinlock_t lock; /* lock for slave insert/delete */ I don't 
think you actually need this lock since all VRF dev operations are done under RTNL so you already got protection against add/del running concurrently. It would simplify the code if you can get rid of it. + struct list_headall_slaves; + int num_slaves; +}; + +struct net_vrf { + struct slave_queue queue; + struct fib_table*tb; + u32 tb_id; +}; + +static bool is_ip_rx_frame(struct sk_buff *skb) +{ + switch (skb-protocol) { + case htons(ETH_P_IP): + case htons(ETH_P_IPV6): + return true; + } + return false; +} + +/* note: already called with
[net PATCH] fib_trie: Drop unnecessary calls to leaf_pull_suffix
It was reported that update_suffix was taking a long time on systems where a large number of leaves were attached to a single node. As it turns out fib_table_flush was calling update_suffix for each leaf that didn't have all of the aliases stripped from it. As a result, on this large node removing one leaf would result in us calling update_suffix for every other leaf on the node. The fix is to just remove the calls to leaf_pull_suffix since they are redundant as we already have a call in resize that will go through and update the suffix length for the node before we exit out of fib_table_flush or fib_table_flush_external. Reported-by: David Ahern d...@cumulusnetworks.com Signed-off-by: Alexander Duyck alexander.h.du...@redhat.com --- This patch should apply to linux-4.1.y and newer kernels. I've done a bit of testing on my system and I no longer see update_suffix dominating the performance traces. David if you can test with this patch to see if you still see the issue I would appreciate it. net/ipv4/fib_trie.c |4 1 file changed, 4 deletions(-) diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index ef90d73911de..70168ca4716b 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1791,8 +1791,6 @@ void fib_table_flush_external(struct fib_table *tb) if (hlist_empty(n-leaf)) { put_child_root(pn, n-key, NULL); node_free(n); - } else { - leaf_pull_suffix(pn, n); } } } @@ -1862,8 +1860,6 @@ int fib_table_flush(struct fib_table *tb) if (hlist_empty(n-leaf)) { put_child_root(pn, n-key, NULL); node_free(n); - } else { - leaf_pull_suffix(pn, n); } } -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 8/8] can: replace timestamp as unique skb attribute
Hello Greg, On 12.07.2015 21:18, Marc Kleine-Budde wrote: From: Oliver Hartkopp socket...@hartkopp.net Commit 514ac99c64b can: fix multiple delivery of a single CAN frame for overlapping CAN filters requires the skb->tstamp to be set to check for identical CAN skbs. Without timestamping to be required by user space applications this timestamp was not generated which led to commit 36c01245eb8 can: fix loss of CAN frames in raw_rcv - which forces the timestamp to be set in all CAN related skbuffs by introducing several __net_timestamp() calls. This forces e.g. out of tree drivers which are not using alloc_can{,fd}_skb() to add __net_timestamp() after skbuff creation to prevent the frame loss fixed in mainline Linux. This patch removes the timestamp dependency and uses an atomic counter to create a unique identifier together with the skbuff pointer. Btw: the new skbcnt element introduced in struct can_skb_priv has to be initialized with zero in out-of-tree drivers which are not using alloc_can{,fd}_skb() too. Signed-off-by: Oliver Hartkopp socket...@hartkopp.net Cc: linux-stable sta...@vger.kernel.org Can you please queue up this missing/lost patch for the long term 4.1.x ? It fixes the mess with commits 514ac99c64b can: fix multiple delivery of a single CAN frame for overlapping CAN filters which originally fixed 36c01245eb8 can: fix loss of CAN frames in raw_rcv So finally this missing patch would bring 4.1.x into the proper state we now have in 4.2-rc4. 
Upstream commit of this patch is: http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=d3b58c47d330de8c29898fe9746f7530408f8a59 Best regards, Oliver Signed-off-by: Marc Kleine-Budde m...@pengutronix.de --- drivers/net/can/dev.c | 7 ++- drivers/net/can/slcan.c | 2 +- drivers/net/can/vcan.c | 3 --- include/linux/can/skb.h | 2 ++ net/can/af_can.c| 12 +++- net/can/bcm.c | 2 ++ net/can/raw.c | 7 --- 7 files changed, 18 insertions(+), 17 deletions(-) diff --git a/drivers/net/can/dev.c b/drivers/net/can/dev.c index e9b1810d319f..aede704605c6 100644 --- a/drivers/net/can/dev.c +++ b/drivers/net/can/dev.c @@ -440,9 +440,6 @@ unsigned int can_get_echo_skb(struct net_device *dev, unsigned int idx) struct can_frame *cf = (struct can_frame *)skb-data; u8 dlc = cf-can_dlc; - if (!(skb-tstamp.tv64)) - __net_timestamp(skb); - netif_rx(priv-echo_skb[idx]); priv-echo_skb[idx] = NULL; @@ -578,7 +575,6 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) if (unlikely(!skb)) return NULL; - __net_timestamp(skb); skb-protocol = htons(ETH_P_CAN); skb-pkt_type = PACKET_BROADCAST; skb-ip_summed = CHECKSUM_UNNECESSARY; @@ -589,6 +585,7 @@ struct sk_buff *alloc_can_skb(struct net_device *dev, struct can_frame **cf) can_skb_reserve(skb); can_skb_prv(skb)-ifindex = dev-ifindex; + can_skb_prv(skb)-skbcnt = 0; *cf = (struct can_frame *)skb_put(skb, sizeof(struct can_frame)); memset(*cf, 0, sizeof(struct can_frame)); @@ -607,7 +604,6 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev, if (unlikely(!skb)) return NULL; - __net_timestamp(skb); skb-protocol = htons(ETH_P_CANFD); skb-pkt_type = PACKET_BROADCAST; skb-ip_summed = CHECKSUM_UNNECESSARY; @@ -618,6 +614,7 @@ struct sk_buff *alloc_canfd_skb(struct net_device *dev, can_skb_reserve(skb); can_skb_prv(skb)-ifindex = dev-ifindex; + can_skb_prv(skb)-skbcnt = 0; *cfd = (struct canfd_frame *)skb_put(skb, sizeof(struct canfd_frame)); memset(*cfd, 0, sizeof(struct canfd_frame)); diff --git 
a/drivers/net/can/slcan.c b/drivers/net/can/slcan.c index f64f5290d6f8..a23a7af8eb9a 100644 --- a/drivers/net/can/slcan.c +++ b/drivers/net/can/slcan.c @@ -207,7 +207,6 @@ static void slc_bump(struct slcan *sl) if (!skb) return; - __net_timestamp(skb); skb-dev = sl-dev; skb-protocol = htons(ETH_P_CAN); skb-pkt_type = PACKET_BROADCAST; @@ -215,6 +214,7 @@ static void slc_bump(struct slcan *sl) can_skb_reserve(skb); can_skb_prv(skb)-ifindex = sl-dev-ifindex; + can_skb_prv(skb)-skbcnt = 0; memcpy(skb_put(skb, sizeof(struct can_frame)), cf, sizeof(struct can_frame)); diff --git a/drivers/net/can/vcan.c b/drivers/net/can/vcan.c index 0ce868de855d..674f367087c5 100644 --- a/drivers/net/can/vcan.c +++ b/drivers/net/can/vcan.c @@ -78,9 +78,6 @@ static void vcan_rx(struct sk_buff *skb, struct net_device *dev) skb-dev = dev; skb-ip_summed = CHECKSUM_UNNECESSARY; - if (!(skb-tstamp.tv64)) - __net_timestamp(skb); - netif_rx_ni(skb); } diff --git a/include/linux/can/skb.h
Re: [PATCH 02/10] dpaa_eth: add support for DPAA Ethernet
On Fri, 2015-07-24 at 10:45 -0500, Bucur Madalin-Cristian-B32716 wrote: -Original Message- From: Joe Perches [mailto:j...@perches.com] On Wed, 2015-07-22 at 19:16 +0300, Madalin Bucur wrote: +static int __init dpa_load(void) +{ [] + err = platform_driver_register(dpa_driver); + if (unlikely(err 0)) { + pr_err(KBUILD_MODNAME + : %s:%hu:%s(): platform_driver_register() = %d\n, + KBUILD_BASENAME .c, __LINE__, __func__, err); + } + + pr_debug(KBUILD_MODNAME : %s:%s() -\n, + KBUILD_BASENAME .c, __func__); Perhaps these should use pr_fmt Agree. How about dropping all that complexity, and just using pr_debug(%s\n, __func__), or dev_dbg where possible? +static void __exit dpa_unload(void) +{ + pr_debug(KBUILD_MODNAME : - %s:%s()\n, + KBUILD_BASENAME .c, __func__); dynamic debug has __func__ available and perhaps the function tracer might be used instead. diff --git a/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h b/drivers/net/ethernet/freescale/dpaa/dpaa_eth.h [] +#define __hot curious. Maybe it'd be good to add a real __hot to compiler.h They're mostly there to make readers aware the code is critical, any changes could mess performance. Mostly or entirely? Why not just use a comment, which could also point out specific things that were done for performance? -Scott -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kernel warning in tcp_fragment
ping... On Wed, Jul 22, 2015 at 11:55 AM, Jovi Zhangwei j...@cloudflare.com wrote: Hi Neal and Martin, Sorry for disturbing, our production system(3.14 and 3.18 stable kernel) have many tcp_fragment warnings, the trace is same as below one which you discussed before. http://comments.gmane.org/gmane.linux.network/365658 But I didn't found the final solution in that mail thread, do you have any new ideas or patches on this warning? Great thanks. [5184217.672290] WARNING: CPU: 9 PID: 2801 at net/ipv4/tcp_output.c:1081 tcp_fragment+0x34/0x230() [5184217.680995] Modules linked in: sfc_char(O) sfc_resource(O) sfc_affinity(O) nf_conntrack_netlink xt_connlimit xt_length xt_bpf xt_hashlimit iptable_nat nf_nat_ipv4 nf_nat iptable_mangle xt_comment ip6table_security ip6table_mangle ip_set_hash_netport 8021q garp bridg e stp llc ipmi_devintf nf_conntrack_ipv6 nf_defrag_ipv6 ip6table_filter ip6table_raw ip6_tables nf_conntrack_ipv4 nf_defrag_ipv4 xt_NFLOG nfnetlink_log xt_conntrack iptable_filter xt_tcpudp xt_multiport xt_CT nf_conntrack xt_set iptable_raw ip_tables x_tables ip_set_hash _net ip_set_hash_ip ip_set nfnetlink rpcsec_gss_krb5 auth_rpcgss oid_registry nfsv4 fuse nfsv3 nfs_acl nfs fscache lockd sunrpc tcp_cubic sg sfc(O) mtd mdio igb dca i2c_algo_bit ptp pps_core sd_mod crct10dif_generic crc_t10dif crct10dif_common x86_pkg_temp_thermal acpi_c pufreq coretemp kvm_intel kvm crc32c_intel aesni_intel ablk_helper cryptd lrw gf128mul glue_helper aes_x86_64 ahci libahci ehci_pci libata ehci_hcd i2c_i801 i2c_core lpc_ich mfd_core usbcore scsi_mod usb_common wmi evdev ipmi_si ipmi_msghandler tpm_tis tpm acpi_pad proce ssor thermal_sys button [5184217.684098] CPU: 9 PID: 2801 Comm: rrdns Tainted: GW O 3.14.28-cloudflare #1 [5184217.684099] Hardware name: Quanta Computer Inc QuantaPlex T41S-2U/S2S-MB, BIOS S2S_3A14 09/18/2014 [5184217.684100] 81466263 8103bb34 [5184217.684101] 813e07f2 8818abebcc00 004a 0002 [5184217.684102] 0060 813e07f2 00304120 8818abebcc00 [5184217.684104] 
Call Trace: [5184217.684105] IRQ [81466263] ? dump_stack+0x41/0x51 [5184217.684111] [8103bb34] ? warn_slowpath_common+0x74/0x89 [5184217.684115] [813e07f2] ? tcp_fragment+0x34/0x230 [5184217.684118] [813e07f2] ? tcp_fragment+0x34/0x230 [5184217.684119] [813d98b7] ? tcp_mark_head_lost+0x1bd/0x1d5 [5184217.684123] [813ddb71] ? tcp_fastretrans_alert+0x69f/0x71d [5184217.684125] [813de567] ? tcp_ack+0x90f/0xb16 [5184217.684126] [813df618] ? tcp_rcv_state_process+0x5bd/0x9b8 [5184217.684128] [8106d9c0] ? __wake_up_sync_key+0x3a/0x4d [5184217.684130] [813920ed] ? sk_wake_async+0x17/0x34 [5184217.684133] [81440d13] ? ipv6_skip_exthdr+0x28/0xc7 [5184217.684139] [81418db6] ? NF_HOOK_THRESH.constprop.11+0x4a/0x4a [5184217.684143] [81435abe] ? tcp_v6_do_rcv+0x3ac/0x4f1 [5184217.684146] [81435eec] ? tcp_v6_rcv+0x2e9/0x554 [5184217.684148] [813c70d3] ? nf_hook_slow+0x66/0xf1 [5184217.684150] [81418db6] ? NF_HOOK_THRESH.constprop.11+0x4a/0x4a [5184217.684167] [81418f70] ? ip6_input_finish+0x1ba/0x2a7 [5184217.684169] [813a1c12] ? __netif_receive_skb_core+0x422/0x494 [5184217.684172] [813a283a] ? netif_receive_skb_internal+0x37/0x6d [5184217.684188] [a09a2e40] ? efx_ssr_try_merge+0x336/0x34e [sfc] [5184217.684215] [a09a4075] ? __efx_ssr_end_of_burst+0x3e/0xd2 [sfc] [5184217.684225] [a098e3bd] ? efx_process_channel+0x5d/0x71 [sfc] [5184217.684243] [a098f557] ? efx_poll+0x6d/0x16b [sfc] [5184217.684248] [813a2e27] ? net_rx_action+0xc6/0x191 [5184217.684250] [8103f7ee] ? __do_softirq+0x100/0x27c [5184217.684254] [8103fae6] ? irq_exit+0x51/0xbc [5184217.684255] [81003e35] ? do_IRQ+0x9d/0xb4 [5184217.684258] [8146992a] ? common_interrupt+0x6a/0x6a [5184217.684261] EOI 4[5184217.684263] ---[ end trace 4f42d23abf1c890e ]--- [5184217.684460] [ cut here ] -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 1/1] Microchip's LAN7800 family USB 2/3 to 10/100/1000 Ethernet device driver
This patch adds a driver for LAN7800 family of USB 2.0 USB 3.0 to Gigabit Ethernet. - remove module param which can be configurable by standard mechanism. - remove other module parms except msg_level per review comment. Signed-off-by: Woojung Huh woojung@microchip.com --- drivers/net/usb/Kconfig | 10 + drivers/net/usb/Makefile |1 + drivers/net/usb/lan78xx.c | 3517 + drivers/net/usb/lan78xx.h | 1069 ++ 4 files changed, 4597 insertions(+) create mode 100644 drivers/net/usb/lan78xx.c create mode 100644 drivers/net/usb/lan78xx.h diff --git a/drivers/net/usb/Kconfig b/drivers/net/usb/Kconfig index 7ba8d08..1610b79 100644 --- a/drivers/net/usb/Kconfig +++ b/drivers/net/usb/Kconfig @@ -106,6 +106,16 @@ config USB_RTL8152 To compile this driver as a module, choose M here: the module will be called r8152. +config USB_LAN78XX + tristate Microchip LAN78XX Based USB Ethernet Adapters + select MII + help + This option adds support for Microchip LAN78XX based USB 2 + USB 3 10/100/1000 Ethernet adapters. + + To compile this driver as a module, choose M here: the + module will be called lan78xx. 
+ config USB_USBNET tristate Multi-purpose USB Networking Framework select MII diff --git a/drivers/net/usb/Makefile b/drivers/net/usb/Makefile index e2797f1..cf6a0e6 100644 --- a/drivers/net/usb/Makefile +++ b/drivers/net/usb/Makefile @@ -8,6 +8,7 @@ obj-$(CONFIG_USB_PEGASUS) += pegasus.o obj-$(CONFIG_USB_RTL8150) += rtl8150.o obj-$(CONFIG_USB_RTL8152) += r8152.o obj-$(CONFIG_USB_HSO) += hso.o +obj-$(CONFIG_USB_LAN78XX) += lan78xx.o obj-$(CONFIG_USB_NET_AX8817X) += asix.o asix-y := asix_devices.o asix_common.o ax88172a.o obj-$(CONFIG_USB_NET_AX88179_178A) += ax88179_178a.o diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c new file mode 100644 index 000..516722f --- /dev/null +++ b/drivers/net/usb/lan78xx.c @@ -0,0 +1,3517 @@ +/* + * Copyright (C) 2015 Microchip Technology + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, see http://www.gnu.org/licenses/. 
+ */ +#include linux/version.h +#include linux/module.h +#include linux/netdevice.h +#include linux/etherdevice.h +#include linux/ethtool.h +#include linux/mii.h +#include linux/usb.h +#include linux/crc32.h +#include linux/signal.h +#include linux/slab.h +#include linux/if_vlan.h +#include linux/uaccess.h +#include linux/list.h +#include linux/ip.h +#include linux/ipv6.h +#include linux/mdio.h +#include net/ip6_checksum.h +#include lan78xx.h + +#define DRIVER_AUTHOR WOOJUNG HUH woojung@microchip.com +#define DRIVER_DESCLAN78XX USB 3.0 Gigabit Ethernet Devices +#define DRIVER_NAMElan78xx +#define DRIVER_VERSION 1.0.0 + +#define TX_TIMEOUT_JIFFIES (5 * HZ) +#define THROTTLE_JIFFIES (HZ / 8) +#define UNLINK_TIMEOUT_MS 3 + +#define RX_MAX_QUEUE_MEMORY(60 * 1518) + +#define SS_USB_PKT_SIZE(1024) +#define HS_USB_PKT_SIZE(512) +#define FS_USB_PKT_SIZE(64) + +#define MAX_RX_FIFO_SIZE (12 * 1024) +#define MAX_TX_FIFO_SIZE (12 * 1024) +#define DEFAULT_BURST_CAP_SIZE (MAX_TX_FIFO_SIZE) +#define DEFAULT_BULK_IN_DELAY (0x0800) +#define MAX_SINGLE_PACKET_SIZE (9000) +#define DEFAULT_TX_CSUM_ENABLE (true) +#define DEFAULT_RX_CSUM_ENABLE (true) +#define DEFAULT_TSO_CSUM_ENABLE(true) +#define DEFAULT_VLAN_FILTER_ENABLE (true) +#define INTERNAL_PHY_ID(2) /* 2: GMII */ +#define TX_OVERHEAD(8) +#define RXW_PADDING2 + +#define LAN78XX_USB_VENDOR_ID (0x0424) +#define LAN7800_USB_PRODUCT_ID (0x7800) +#define LAN7850_USB_PRODUCT_ID (0x7850) +#define LAN78XX_EEPROM_MAGIC (0x78A5) +#define LAN78XX_OTP_MAGIC (0x78F3) + +#defineMII_READ1 +#defineMII_WRITE 0 + +#define EEPROM_INDICATOR (0xA5) +#define EEPROM_MAC_OFFSET (0x01) +#define MAX_EEPROM_SIZE512 +#define OTP_INDICATOR_1(0xF3) +#define OTP_INDICATOR_2(0xF7) +
[PATCH net-next 02/16] net: export a few FIB functions
Required by the VRF driver. Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com --- net/ipv4/fib_frontend.c | 2 ++ net/ipv4/fib_trie.c | 1 + 2 files changed, 3 insertions(+) diff --git a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c index 6b98de0d7949..c565fc182240 100644 --- a/net/ipv4/fib_frontend.c +++ b/net/ipv4/fib_frontend.c @@ -108,6 +108,7 @@ struct fib_table *fib_new_table(struct net *net, u32 id) hlist_add_head_rcu(tb-tb_hlist, net-ipv4.fib_table_hash[h]); return tb; } +EXPORT_SYMBOL_GPL(fib_new_table); /* caller must hold either rtnl or rcu read lock */ struct fib_table *fib_get_table(struct net *net, u32 id) @@ -127,6 +128,7 @@ struct fib_table *fib_get_table(struct net *net, u32 id) } return NULL; } +EXPORT_SYMBOL_GPL(fib_get_table); #endif /* CONFIG_IP_MULTIPLE_TABLES */ static void fib_replace_table(struct net *net, struct fib_table *old, diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 15d32612e3c6..ac2d828c6daa 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -1887,6 +1887,7 @@ void fib_free_table(struct fib_table *tb) { call_rcu(tb-rcu, __trie_free_rcu); } +EXPORT_SYMBOL_GPL(fib_free_table); static int fn_trie_dump_leaf(struct key_vector *l, struct fib_table *tb, struct sk_buff *skb, struct netlink_callback *cb) -- 2.3.2 (Apple Git-55) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH net-next 03/16] net: Introduce VRF related flags and helpers
Add a VRF_MASTER flag for interfaces and helper functions for determining if a device is a VRF_MASTER. Add link attribute for passing VRF_TABLE id. Add vrf_ptr to netdevice. Add various macros for determining if a device is a VRF device, the index of the master VRF device and table associated with VRF device. Signed-off-by: Shrijeet Mukherjee s...@cumulusnetworks.com Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/linux/netdevice.h| 21 +++ include/net/vrf.h| 83 include/uapi/linux/if_link.h | 9 + 3 files changed, 113 insertions(+) create mode 100644 include/net/vrf.h diff --git a/include/linux/netdevice.h b/include/linux/netdevice.h index 607b5f41f46f..81cbaf78 100644 --- a/include/linux/netdevice.h +++ b/include/linux/netdevice.h @@ -1289,6 +1289,7 @@ enum netdev_priv_flags { IFF_XMIT_DST_RELEASE_PERM = 122, IFF_IPVLAN_MASTER = 123, IFF_IPVLAN_SLAVE= 124, + IFF_VRF_MASTER = 125, }; #define IFF_802_1Q_VLANIFF_802_1Q_VLAN @@ -1316,6 +1317,7 @@ enum netdev_priv_flags { #define IFF_XMIT_DST_RELEASE_PERM IFF_XMIT_DST_RELEASE_PERM #define IFF_IPVLAN_MASTER IFF_IPVLAN_MASTER #define IFF_IPVLAN_SLAVE IFF_IPVLAN_SLAVE +#define IFF_VRF_MASTER IFF_VRF_MASTER /** * struct net_device - The DEVICE structure. 
@@ -1432,6 +1434,7 @@ enum netdev_priv_flags { * @dn_ptr:DECnet specific data * @ip6_ptr: IPv6 specific data * @ax25_ptr: AX.25 specific data + * @vrf_ptr: VRF specific data * @ieee80211_ptr: IEEE 802.11 specific data, assign before registering * * @last_rx: Time of last Rx @@ -1650,6 +1653,7 @@ struct net_device { struct dn_dev __rcu *dn_ptr; struct inet6_dev __rcu *ip6_ptr; void*ax25_ptr; + struct net_vrf_dev *vrf_ptr; struct wireless_dev *ieee80211_ptr; struct wpan_dev *ieee802154_ptr; #if IS_ENABLED(CONFIG_MPLS_ROUTING) @@ -3808,6 +3812,23 @@ static inline bool netif_supports_nofcs(struct net_device *dev) return dev-priv_flags IFF_SUPP_NOFCS; } +static inline bool netif_is_vrf(const struct net_device *dev) +{ + return dev-priv_flags IFF_VRF_MASTER; +} + +static inline bool netif_index_is_vrf(struct net *net, int ifindex) +{ + struct net_device *dev = dev_get_by_index(net, ifindex); + bool rc = false; + + if (dev) { + rc = netif_is_vrf(dev); + dev_put(dev); + } + return rc; +} + /* This device needs to keep skb dst for qdisc enqueue or ndo_start_xmit() */ static inline void netif_keep_dst(struct net_device *dev) { diff --git a/include/net/vrf.h b/include/net/vrf.h new file mode 100644 index ..743a172ee849 --- /dev/null +++ b/include/net/vrf.h @@ -0,0 +1,83 @@ +/* + * include/net/net_vrf.h - adds vrf dev structure definitions + * Copyright (c) 2015 Cumulus Networks + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ + +#ifndef __LINUX_NET_VRF_H +#define __LINUX_NET_VRF_H + +struct net_vrf_dev { + int ifindex; /* ifindex of master dev */ + u32 tb_id; /* table id for VRF */ +}; + +#if IS_ENABLED(CONFIG_NET_VRF) +static inline int vrf_master_dev_ifindex(const struct net_device *dev) +{ + struct net_vrf_dev *vrf_ptr; + int ifindex = 0; + + if (!dev) + return 0; + + if (netif_is_vrf(dev)) + ifindex = dev-ifindex; + else { + vrf_ptr = rcu_dereference(dev-vrf_ptr); + if (vrf_ptr) + ifindex = vrf_ptr-ifindex; + } + + return ifindex; +} + +static inline int vrf_get_master_dev_ifindex(struct net *net, int ifindex) +{ + int rc = 0; + + if (ifindex) { + struct net_device *dev = dev_get_by_index(net, ifindex); + + if (dev) { + rc = vrf_master_dev_ifindex(dev); + dev_put(dev); + } + } + return rc; +} + +static inline int vrf_dev_table(const struct net_device *dev) +{ + int tb_id = 0; + + if (dev) { + struct net_vrf_dev *vrf_ptr = rcu_dereference(dev-vrf_ptr); + + if (vrf_ptr) + tb_id = vrf_ptr-tb_id; + } + return tb_id; +} +#else +static inline int vrf_master_dev_ifindex(const struct net_device *dev) +{ + return 0; +} + +static inline int vrf_get_master_dev_ifindex(struct net *net, int ifindex) +{ + return 0; +} + +static inline int
[PATCH net-next 15/16] net: Add chvrf command
Example of how to use the default bind to interface option for tasks and correlate with VRF devices. Signed-off-by: David Ahern d...@cumulusnetworks.com --- tools/net/Makefile | 6 +- tools/net/chvrf.c | 225 + 2 files changed, 229 insertions(+), 2 deletions(-) create mode 100644 tools/net/chvrf.c diff --git a/tools/net/Makefile b/tools/net/Makefile index ee577ea03ba5..c13f11f5637a 100644 --- a/tools/net/Makefile +++ b/tools/net/Makefile @@ -10,7 +10,7 @@ YACC = bison %.lex.c: %.l $(LEX) -o $@ $ -all : bpf_jit_disasm bpf_dbg bpf_asm +all : bpf_jit_disasm bpf_dbg bpf_asm chvrf bpf_jit_disasm : CFLAGS = -Wall -O2 -DPACKAGE='bpf_jit_disasm' bpf_jit_disasm : LDLIBS = -lopcodes -lbfd -ldl @@ -25,8 +25,10 @@ bpf_asm : LDLIBS = bpf_asm : bpf_asm.o bpf_exp.yacc.o bpf_exp.lex.o bpf_exp.lex.o : bpf_exp.yacc.c +chvrf : CFLAGS = -Wall -O2 + clean : - rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.* + rm -rf *.o bpf_jit_disasm bpf_dbg bpf_asm bpf_exp.yacc.* bpf_exp.lex.* chvrf install : install bpf_jit_disasm $(prefix)/bin/bpf_jit_disasm diff --git a/tools/net/chvrf.c b/tools/net/chvrf.c new file mode 100644 index ..71cc925fd101 --- /dev/null +++ b/tools/net/chvrf.c @@ -0,0 +1,225 @@ +/* + * chvrf.c - Example of how to use the default bind-to-device option for + * tasks and correlate to VRFs via the VRF device. + * + * Copyright (c) 2015 Cumulus Networks + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License as published by + * the Free Software Foundation; either version 2 of the License, or + * (at your option) any later version. 
+ */ +#include sys/ioctl.h +#include sys/prctl.h +#include sys/socket.h +#include signal.h +#include string.h +#include stdio.h +#include stdlib.h +#include unistd.h +#include netinet/in.h +#include net/if.h /* for struct ifreq */ +#include libgen.h +#include errno.h + +#ifndef PR_SET_SK_BIND_DEV_IF +#define PR_SET_SK_BIND_DEV_IF 47 +#endif +#ifndef PR_GET_SK_BIND_DEV_IF +#define PR_GET_SK_BIND_DEV_IF 48 +#endif + +static int vrf_to_device(int vrf) +{ + struct ifreq ifdata; + int sd, rc; + + memset(ifdata, 0, sizeof(ifdata)); + snprintf(ifdata.ifr_name, sizeof(ifdata.ifr_name) - 1, vrf%d, vrf); + + sd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sd 0) { + perror(socket failed); + return -1; + } + + /* Get the index for the specified interface */ + rc = ioctl(sd, SIOCGIFINDEX, (char *)ifdata); + close(sd); + if (rc != 0) { + perror(ioctl(SIOCGIFINDEX) failed); + return -1; + } + + return ifdata.ifr_ifindex; +} + +static int device_to_vrf(int idx) +{ + struct ifreq ifdata; + int sd, vrf, rc; + + memset(ifdata, 0, sizeof(ifdata)); + ifdata.ifr_ifindex = idx; + + sd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (sd 0) { + perror(socket failed); + return -1; + } + + /* Get the index for the specified interface */ + rc = ioctl(sd, SIOCGIFNAME, (char *)ifdata); + close(sd); + if (rc != 0) { + perror(ioctl(SIOCGIFNAME) failed); + return -1; + } + + if (sscanf(ifdata.ifr_name, vrf%d, vrf) != 1) { + fprintf(stderr, Unexpected device name (%s)\n, ifdata.ifr_name); + vrf = -1; + } + + return vrf; +} + +static int set_vrf(int vrf) +{ + int idx; + long err; + + /* convert vrf to device index */ + idx = vrf_to_device(vrf); + if (idx 0) { + fprintf(stderr, Failed to get device index for vrf %d\n, vrf); + return -1; + } + + /* set default device bind */ + err = prctl(PR_SET_SK_BIND_DEV_IF, idx); + if (err 0) { + fprintf(stderr, prctl failed to device index: %d\n, errno); + return -1; + } + + return 0; +} + +/* get vrf context for given process id */ +static int get_vrf(pid_t 
pid) +{ + int vrf; + long err; + + /* lookup device index pid is tied to */ + err = prctl(PR_GET_SK_BIND_DEV_IF, pid); + if (err 0) { + fprintf(stderr, prctl failed: %d\n, errno); + return -1; + } + + if (err == 0) + return 0; + + /* convert device index to vrf id */ + vrf = device_to_vrf((int)err); + if (vrf 0) { + fprintf(stderr, Failed to get device index for vrf %d\n, vrf); + return -1; + } + + return vrf; +} + +static int run_vrf(char **argv, int vrf) +{ + char *cmd; + + if (set_vrf(vrf) != 0) { + fprintf(stderr, Failed to set vrf context\n); + return 1; + } + + cmd =
Re: [RFC PATCH 0/4] Shared vhost design
Eyal Moscovici eya...@il.ibm.com writes: Hi, The test showed the same relative numbers as we got in our internal testing. I was wondering about the configuration in regards to NUMA. From Thanks for confirming. our testing we saw that if the VMs are spread across 2 NUMA nodes then having a shared vhost thread per node performs better than having the two threads in the same core. IIUC, this is similar to my test setup and observations i.e 14* 1173.8 1216.9 In this case, there's a shared vhost thread on CPU 14 for numa node 0 and another on CPU 15 for numa node 1. Guests running on CPUs 0,2,4,6,8,10,12 are serviced by vhost-0 that runs on CPU 14 and guests running on CPUs 1,3,5,7,9,11,13 get serviced by vhost-1 (Numa node 1). I tried some other configurations but this configuration gave me the best results. Eyal, I think it makes sense to add polling on top of these patches and get numbers for them too. Thoughts ? Bandan Eyal Moscovici HL-Cloud Infrastructure Solutions IBM Haifa Research Lab From: Bandan Das b...@redhat.com To: k...@vger.kernel.org Cc: netdev@vger.kernel.org, linux-ker...@vger.kernel.org, m...@redhat.com, Eyal Moscovici/Haifa/IBM@IBMIL, Razya Ladelsky/Haifa/IBM@IBMIL, cgro...@vger.kernel.org, jasow...@redhat.com Date: 07/13/2015 07:08 AM Subject: [RFC PATCH 0/4] Shared vhost design Hello, There have been discussions on improving the current vhost design. The first attempt, to my knowledge was Shirley Ma's patch to create a dedicated vhost worker per cgroup. http://comments.gmane.org/gmane.linux.network/224730 Later, I posted a cmwq based approach for performance comparisons http://comments.gmane.org/gmane.linux.network/286858 More recently was the Elvis work that was presented in KVM Forum 2013 http://www.linux-kvm.org/images/a/a3/Kvm-forum-2013-elvis.pdf The Elvis patches rely on common vhost thread design for scalability along with polling for performance. Since there are two major changes being proposed, we decided to split up the work. 
The first (this RFC), proposing a re-design of the vhost threading model and the second part (not posted yet) to focus more on improving performance. I am posting this with the hope that we can have a meaningful discussion on the proposed new architecture. We have run some tests to show that the new design is scalable and in terms of performance, is comparable to the current stable design. Test Setup: The testing is based on the setup described in the Elvis proposal. The initial tests are just an aggregate of Netperf STREAM and MAERTS but as we progress, I am happy to run more tests. The hosts are two identical 16 core Haswell systems with point to point network links. For the first 10 runs, with n=1 up to n=10 guests running in parallel, I booted the target system with nr_cpus=8 and mem=12G. The purpose was to do a comparison of resource utilization and how it affects performance. Finally, with the number of guests set at 14, I didn't limit the number of CPUs booted on the host or limit memory seen by the kernel but boot the kernel with isolcpus=14,15 that will be used to run the vhost threads. The guests are pinned to cpus 0-13 and based on which cpu the guest is running on, the corresponding I/O thread is either pinned to cpu 14 or 15. Results # X axis is number of guests # Y axis is netperf number # nr_cpus=8 and mem=12G #Number of Guests#Baseline#ELVIS 11119.3.0 2 1135.6 1130.2 3 1135.5 1131.6 4 1136.0 1127.1 5 1118.6 1129.3 6 1123.4 1129.8 7 1128.7 1135.4 8 1129.9 1137.5 9 1130.6 1135.1 10 1129.3 1138.9 14* 1173.8 1216.9 #* Last run with the vCPU and I/O thread(s) pinned, no CPU/memory limit imposed. # I/O thread runs on CPU 14 or 15 depending on which guest it's serving There's a simple graph at http://people.redhat.com/~bdas/elvis/data/results.png that shows how task affinity results in a jump and even without it, as the number of guests increase, the shared vhost design performs slightly better. Observations: 1. 
In terms of stock performance, the results are comparable. 2. However, with a tuned setup, even without polling, we see an improvement with the new design. 3. Making the new design simulate old behavior would be a matter of setting the number of guests per vhost threads to 1. 4.
Re: [PATCH iproute2] ip: replace white-spaces with tabs
On Sat, 25 Jul 2015 08:54:53 -0400 Zhang Shengju zhangshen...@cmss.chinamobile.com wrote: Replace white-spaces with tabs Signed-off-by: Zhang Shengju zhangshen...@cmss.chinamobile.com --- ip/ip.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) There were more places that needed this, went ahead and made ip.c and bridge.c checkpatch clean. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH 1/2] iproute2: Add support for IPv6 VTI tunnels to ip6tunnel
On Thu, 2 Oct 2014 11:11:40 +0200 Jiri Pirko j...@resnulli.us wrote: Thu, Oct 02, 2014 at 10:48:20AM CEST, steffen.klass...@secunet.com wrote: On Thu, Oct 02, 2014 at 10:41:09AM +0200, Jiri Pirko wrote: Fri, Sep 26, 2014 at 09:10:56AM CEST, steffen.klass...@secunet.com wrote: @@ -459,11 +462,14 @@ static int do_add(int cmd, int argc, char **argv) switch (p.proto) { case IPPROTO_IPIP: case IPPROTO_IPV6: + if (p.i_flags != VTI_ISVTI) + return tnl_add_ioctl(cmd, ip6_vti0, p.name, p); ^ Wouldn't it be more consistent to not to use the underscore? The ipv4 version of vti uses ip_vti0, so I tried to be consistent with that. Okay, fine with me. Sure, applied. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch net-next 0/4] Introduce Mellanox Technologies Switch ASICs switchdev drivers
On Thu, Jul 23, 2015 at 8:43 AM, Jiri Pirko j...@resnulli.us wrote: This patchset introduces Mellanox Technologies Switch driver infrastructure and support for SwitchX-2 ASIC. You guys did a great job on the driver; looking forward to seeing L2/L3 hooked up. Very nice, aesthetically pleasing code. Is this a ground-up rewrite or a port of the SDK? It's so clean and tight, I'm guessing a ground-up rewrite. Reviewed-by: Scott Feldman sfel...@gmail.com -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [patch net-next 0/4] Introduce Mellanox Technologies Switch ASICs switchdev drivers
Mon, Jul 27, 2015 at 10:21:54PM CEST, sfel...@gmail.com wrote: On Thu, Jul 23, 2015 at 8:43 AM, Jiri Pirko j...@resnulli.us wrote: This patchset introduces Mellanox Technologies Switch driver infrastructure and support for SwitchX-2 ASIC. You guys did a great job on the driver; looking forward to seeing L2/L3 hooked up. Very nice, aesthetically pleasing code. Thanks! Is this a ground-up rewrite or a port of the SDK? It's so clean and tight, I'm guessing a ground-up rewrite. It's rewritten from scratch. Reviewed-by: Scott Feldman sfel...@gmail.com Thanks for your review! -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [net:master 41/49] drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel passed 3 arguments, but takes just 2
From: Andy Shevchenko andriy.shevche...@linux.intel.com Date: Mon, 27 Jul 2015 14:07:53 +0300 I do use compiler from Debian for AVR32, didn't check this on other architectures. Possible something like following will fix it: That isn't going to fix it. You misunderstand the nature of the problem I think, the issue looks like this: #define readl(x, y) ((x) + (y)) struct foo { int (*readl)(int x, int y, int z); }; int test(struct foo *p) { p-readl(1, 2, 3); } Archs typically define readl as a macro, so when you do things like p-readl() CPP tries to expand the macro when it sees the readl( part, and that's the fundamental issue. We have to rename the method names so that the macro expansion does't interfere. Here is the fix I am committing to fix this: [PATCH] macb: Fix build with macro'ized readl/writel. If an architecture defines readl/writel using CPP macros, we get the following kinds of build failure: drivers/net/ethernet/cadence/macb.c:164:1: error: macro writel passed 3 arguments, but takes just 2 macb_or_gem_writel(bp, SA1B, bottom); ^ Rename the methods so that this doesn't happen. Signed-off-by: David S. 
Miller da...@davemloft.net --- drivers/net/ethernet/cadence/macb.c | 14 +++--- drivers/net/ethernet/cadence/macb.h | 16 2 files changed, 15 insertions(+), 15 deletions(-) diff --git a/drivers/net/ethernet/cadence/macb.c b/drivers/net/ethernet/cadence/macb.c index c638757..bf9eb2e 100644 --- a/drivers/net/ethernet/cadence/macb.c +++ b/drivers/net/ethernet/cadence/macb.c @@ -506,7 +506,7 @@ static void macb_update_stats(struct macb *bp) WARN_ON((unsigned long)(end - p - 1) != (MACB_TPF - MACB_PFR) / 4); for(; p end; p++, offset += 4) - *p += bp-readl(bp, offset); + *p += bp-macb_reg_readl(bp, offset); } static int macb_halt_tx(struct macb *bp) @@ -1934,14 +1934,14 @@ static void gem_update_stats(struct macb *bp) for (i = 0; i GEM_STATS_LEN; ++i, ++p) { u32 offset = gem_statistics[i].offset; - u64 val = bp-readl(bp, offset); + u64 val = bp-macb_reg_readl(bp, offset); bp-ethtool_stats[i] += val; *p += val; if (offset == GEM_OCTTXL || offset == GEM_OCTRXL) { /* Add GEM_OCTTXH, GEM_OCTRXH */ - val = bp-readl(bp, offset + 4); + val = bp-macb_reg_readl(bp, offset + 4); bp-ethtool_stats[i] += ((u64)val) 32; *(++p) += val; } @@ -2867,11 +2867,11 @@ static int macb_probe(struct platform_device *pdev) bp-regs = mem; bp-native_io = native_io; if (native_io) { - bp-readl = hw_readl_native; - bp-writel = hw_writel_native; + bp-macb_reg_readl = hw_readl_native; + bp-macb_reg_writel = hw_writel_native; } else { - bp-readl = hw_readl; - bp-writel = hw_writel; + bp-macb_reg_readl = hw_readl; + bp-macb_reg_writel = hw_writel; } bp-num_queues = num_queues; bp-queue_mask = queue_mask; diff --git a/drivers/net/ethernet/cadence/macb.h b/drivers/net/ethernet/cadence/macb.h index 2aa102e..1895b6b 100644 --- a/drivers/net/ethernet/cadence/macb.h +++ b/drivers/net/ethernet/cadence/macb.h @@ -429,12 +429,12 @@ | GEM_BF(name, value)) /* Register access macros */ -#define macb_readl(port, reg) (port)-readl((port), MACB_##reg) -#define macb_writel(port, reg, value) (port)-writel((port), 
MACB_##reg, (value)) -#define gem_readl(port, reg) (port)-readl((port), GEM_##reg) -#define gem_writel(port, reg, value) (port)-writel((port), GEM_##reg, (value)) -#define queue_readl(queue, reg)(queue)-bp-readl((queue)-bp, (queue)-reg) -#define queue_writel(queue, reg, value) (queue)-bp-writel((queue)-bp, (queue)-reg, (value)) +#define macb_readl(port, reg) (port)-macb_reg_readl((port), MACB_##reg) +#define macb_writel(port, reg, value) (port)-macb_reg_writel((port), MACB_##reg, (value)) +#define gem_readl(port, reg) (port)-macb_reg_readl((port), GEM_##reg) +#define gem_writel(port, reg, value) (port)-macb_reg_writel((port), GEM_##reg, (value)) +#define queue_readl(queue, reg) (queue)-bp-macb_reg_readl((queue)-bp, (queue)-reg) +#define queue_writel(queue, reg, value) (queue)-bp-macb_reg_writel((queue)-bp, (queue)-reg, (value)) /* Conditional GEM/MACB macros. These perform the operation to the correct * register dependent on whether the device is a GEM or a MACB. For registers @@ -782,8 +782,8 @@ struct macb { boolnative_io; /* hardware IO accessors */ - u32 (*readl)(struct macb
Re: [patch net-next 0/4] Introduce Mellanox Technologies Switch ASICs switchdev drivers
On 27/07/15 13:27, Jiri Pirko wrote: Mon, Jul 27, 2015 at 10:21:54PM CEST, sfel...@gmail.com wrote: On Thu, Jul 23, 2015 at 8:43 AM, Jiri Pirko j...@resnulli.us wrote: This patchset introduces Mellanox Technologies Switch driver infrastructure and support for SwitchX-2 ASIC. You guys did a great job on the driver; looking forward to seeing L2/L3 hooked up. Very nice, aesthetically pleasing code. Thanks! Is this a ground-up rewrite or a port of the SDK? It's so clean and tight, I'm guessing a ground-up rewrite. It's rewritten from scratch. Only glanced through the driver, but this looks like really really nice and clean, very pleased to see such a driver being submitted. That is very encouraging and should inspire other companies in doing so, so thanks for doing this! -- Florian -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH iproute2] tc: fix bpf compilation with old glibc
On Wed, 22 Jul 2015 14:29:30 +0200 Nicolas Dichtel nicolas.dich...@6wind.com wrote: Error was: f_bpf.o: In function `bpf_parse_opt': f_bpf.c:(.text+0x88f): undefined reference to `secure_getenv' m_bpf.o: In function `parse_bpf': m_bpf.c:(.text+0x587): undefined reference to `secure_getenv' collect2: error: ld returned 1 exit status CC: Daniel Borkmann dan...@iogearbox.net Fixes: 88eea5395483 (tc: {f,m}_bpf: allow to retrieve uds path from env) Signed-off-by: Nicolas Dichtel nicolas.dich...@6wind.com Applied thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [net-next 0/16] Proposal for VRF-lite - v3
David Ahern d...@cumulusnetworks.com writes: In the context of internet scale routing a requirement that always comes up is the need to partition the available routing tables into disjoint routing planes. A specific use case is the multi-tenancy problem where each tenant has their own unique routing tables and at the very least need different default gateways. This patch allows the ability to create virtual router domains (aka VRFs, VRF-lite to be specific) in the linux packet forwarding stack. The main observation is that through the use of rules and socket binding to interfaces, all the facilities that we need are already present in the infrastructure. What is missing is a handle that identifies a routing domain and can be used to gather applicable rules/tables and uniqify neighbor selection. The scheme used needs to preserve the notions of ECMP, and general routing principles. This paragraph is false when it comes to sockets, as I have already pointed out. - VPN Routing and Forwarding (RFC4364 and its kin) implies isolation strong enough to allow using the same ip on different machines in different VPN instances and not have confusion. - The routing table is not the only table in the kernel that uses an ip address as a key. The result is that you can combine packet fragments that come in on different interfaces (irrespective of your VPN), confuse tcp parameters between interfaces, scramble your ipsec connections and I don't know what else. Binding a socket to a network device is not strong enough to do what you want to do and it will lead to subtle bugs, that can be triggered by accident or by hostile actors. If these kinds of limitations are well documented and it is specified that these kinds of problems can occur with your socket code there may be a place for this code somewhere. However described like it is your code is wrong and fundamentally broken.
Version 3 - addressed comments from first 2 RFCs with the exception of the name Nicolas: We will do the name conversion once we agree on what the correct name should be (vrf, mrf or something else) Not so. I described the deep problems between your goals and your implementation and they are not even mentioned let alone addressed. - packets flow through the VRF device in both directions allowing the following: - tcpdump -i vrfn - tc rules on vrf device - netfilter rules on vrf device Ingo/Andy: I added you two as a start point for the proposed task related changes. Not sure who should be the reviewer; please let me know if someone else is more appropriate. Thanks. It looks like you are trying to implement a namespace that isn't a namespace. Given that it is broken by design you have my nack. Nacked-by: Eric W. Biederman ebied...@xmission.com Eric -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next 14/16] net: Add sk_bind_dev_if to task_struct
David Ahern d...@cumulusnetworks.com writes: Allow tasks to have a default device index for binding sockets. If set the value is passed to all AF_INET/AF_INET6 sockets when they are created. The task setting is passed parent to child on fork, but can be set or changed after task creation using prctl (if task has CAP_NET_ADMIN permissions). The setting for a socket can be retrieved using prctl(). This option allows an administrator to restrict a task to only send/receive packets through the specified device. In the case of VRF devices this option restricts tasks to a specific VRF. Correlation of the device index to a specific VRF, i.e., ifindex -- VRF device -- VRF id is left to userspace. Nacked-by: Eric W. Biederman ebied...@xmission.com Because it is broken by design. Your routing device is only safe for programs that know its limitations; it is not appropriate for general applications. Since you don't even seem to know its limitations I think this is a bad path to walk down. Example using VRF devices: 1. vrf1 is created and assigned to table 5 2. eth2 is enslaved to vrf1 3. eth2 is given the address 1.1.1.1/24 $ ip route ls table 5 prohibit default 1.1.1.0/24 dev eth2 scope link local 1.1.1.1 dev eth2 proto kernel scope host src 1.1.1.1 Without setting a VRF context ping, tcp and udp attempts fail. e.g., $ ping 1.1.1.254 connect: Network is unreachable After binding the task to the vrf device ping succeeds: $ ./chvrf -v 1 ping -c1 1.1.1.254 PING 1.1.1.254 (1.1.1.254) 56(84) bytes of data.
64 bytes from 1.1.1.254: icmp_seq=1 ttl=64 time=2.32 ms Signed-off-by: David Ahern d...@cumulusnetworks.com --- include/linux/sched.h | 3 +++ include/uapi/linux/prctl.h | 4 kernel/fork.c | 2 ++ kernel/sys.c | 35 +++ net/ipv4/af_inet.c | 1 + net/ipv4/route.c | 4 +++- net/ipv6/af_inet6.c| 1 + net/ipv6/route.c | 2 +- 8 files changed, 50 insertions(+), 2 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 04b5ada460b4..29b336b8a466 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -1528,6 +1528,9 @@ struct task_struct { struct files_struct *files; /* namespaces */ struct nsproxy *nsproxy; +/* network */ + /* if set INET/INET6 sockets are bound to given dev index on create */ + int sk_bind_dev_if; /* signal handlers */ struct signal_struct *signal; struct sighand_struct *sighand; diff --git a/include/uapi/linux/prctl.h b/include/uapi/linux/prctl.h index 31891d9535e2..1ef45195d146 100644 --- a/include/uapi/linux/prctl.h +++ b/include/uapi/linux/prctl.h @@ -190,4 +190,8 @@ struct prctl_mm_map { # define PR_FP_MODE_FR (1 0)/* 64b FP registers */ # define PR_FP_MODE_FRE (1 1)/* 32b compatibility */ +/* get/set network interface sockets are bound to by default */ +#define PR_SET_SK_BIND_DEV_IF 47 +#define PR_GET_SK_BIND_DEV_IF 48 + #endif /* _LINUX_PRCTL_H */ diff --git a/kernel/fork.c b/kernel/fork.c index dbd9b8d7b7cc..8b396e77d2bf 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -380,6 +380,8 @@ static struct task_struct *dup_task_struct(struct task_struct *orig) tsk-splice_pipe = NULL; tsk-task_frag.page = NULL; + tsk-sk_bind_dev_if = orig-sk_bind_dev_if; + account_kernel_stack(ti, 1); return tsk; diff --git a/kernel/sys.c b/kernel/sys.c index 259fda25eb6b..59119ac0a0bd 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -52,6 +52,7 @@ #include linux/rcupdate.h #include linux/uidgid.h #include linux/cred.h +#include linux/netdevice.h #include linux/kmsg_dump.h /* Move somewhere else to avoid recompiling? 
*/ @@ -2267,6 +2268,40 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, case PR_GET_FP_MODE: error = GET_FP_MODE(me); break; +#ifdef CONFIG_NET + case PR_SET_SK_BIND_DEV_IF: + { + struct net_device *dev; + int idx = (int) arg2; + + if (!capable(CAP_NET_ADMIN)) + return -EPERM; + + if (idx) { + dev = dev_get_by_index(me-nsproxy-net_ns, idx); + if (!dev) + return -EINVAL; + dev_put(dev); + } + me-sk_bind_dev_if = idx; + break; + } + case PR_GET_SK_BIND_DEV_IF: + { + struct task_struct *tsk; + int sk_bind_dev_if = -EINVAL; + + rcu_read_lock(); + tsk = find_task_by_vpid(arg2); + if (tsk) + sk_bind_dev_if = tsk-sk_bind_dev_if; + rcu_read_unlock(); + if (tsk != me !capable(CAP_NET_ADMIN)) + return -EPERM; +
Re: [PATCH iproute2 net-next] bridge: mdb: add support for router add/del notifications monitoring
On Mon, 27 Jul 2015 13:44:05 +0200 Nikolay Aleksandrov ra...@blackwall.org wrote: From: Nikolay Aleksandrov niko...@cumulusnetworks.com This patch adds support for ADDMDB/DELMDB notifications about router ports which have been added or deleted/expired respectively. Example output: $ bridge -s monitor mdb Deleted router port dev eth3 master br0 router port dev eth3 master br0 Signed-off-by: Nikolay Aleksandrov niko...@cumulusnetworks.com Looks useful, applied. Does usage or manual page need to be updated as well? -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next]r8169: Correct values on dma_alloc_coherent
Corcodel Marian corcodel.mar...@gmail.com : [...] diff --git a/drivers/net/ethernet/realtek/r8169.c b/drivers/net/ethernet/realtek/r8169.c index 3df51fa..fd249a6 100644 --- a/drivers/net/ethernet/realtek/r8169.c +++ b/drivers/net/ethernet/realtek/r8169.c @@ -6724,8 +6724,8 @@ static int rtl8169_init_ring(struct net_device *dev) rtl8169_init_ring_indexes(tp); - memset(tp-tx_skb, 0x0, NUM_TX_DESC * sizeof(struct ring_info)); - memset(tp-Rx_databuff, 0x0, NUM_RX_DESC * sizeof(void *)); + memset(tp-tx_skb, 0x0, NUM_RX_DESC); + memset(tp-Rx_databuff, 0x0, NUM_RX_DESC); void *Rx_databuff[NUM_RX_DESC]; :o( Please don't mess with the kernel code until you've figured how wrong these changes are. Then give yourself a few months and read more code. Really. -- Ueimor -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [net-next PATCH 2/2] drivers: net: cpsw: add separate napi for tx packet handling for performance improvement
Mugunthan V N mugunthan...@ti.com : [...] diff --git a/drivers/net/ethernet/ti/cpsw.c b/drivers/net/ethernet/ti/cpsw.c index d68d759..4f98537 100644 --- a/drivers/net/ethernet/ti/cpsw.c +++ b/drivers/net/ethernet/ti/cpsw.c @@ -752,13 +753,22 @@ static irqreturn_t cpsw_tx_interrupt(int irq, void *dev_id) struct cpsw_priv *priv = dev_id; cpdma_ctlr_eoi(priv-dma, CPDMA_EOI_TX); - cpdma_chan_process(priv-txch, 128); + writel(0, priv-wr_regs-tx_en); + + if (netif_running(priv-ndev)) { + napi_schedule(priv-napi_tx); + return IRQ_HANDLED; + } cpsw_ndo_stop calls napi_disable: you can remove netif_running. -- Ueimor -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: Fw: [Bug 102051] New: Unexpected TCP behavior
On Mon, 2015-07-27 at 14:02 -0700, Stephen Hemminger wrote: Begin forwarded message: Date: Mon, 27 Jul 2015 20:06:07 + From: bugzilla-dae...@bugzilla.kernel.org bugzilla-dae...@bugzilla.kernel.org To: shemmin...@linux-foundation.org shemmin...@linux-foundation.org Subject: [Bug 102051] New: Unexpected TCP behavior https://bugzilla.kernel.org/show_bug.cgi?id=102051 Bug ID: 102051 Summary: Unexpected TCP behavior Product: Networking Version: 2.5 Kernel Version: 3.19 Hardware: All OS: Linux Tree: Mainline Status: NEW Severity: low Priority: P1 Component: IPV4 Assignee: shemmin...@linux-foundation.org Reporter: vreme...@gmail.com Regression: No While running nmap against localhost I started to see open ports in the dynamic range (1024). Kind of odd, knowing netstat and ss did not show any listeners on the port. With tcpdump, I confirmed system was returning S/ACK for ports that did not have a listener enabled. The issue or feature only occurs if source port matches the destination port. # netstat -nap | grep 5000 # $ nc localhost -p 5000 5000 a a # tcpdump -i lo port 5000 14:28:18.059708 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [S], seq 2005295207, win 43690, options [mss 65495,sackOK,TS val 4481790 ecr 0,nop,wscale 7], length 0 14:28:18.059721 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [S.], seq 2005295207, ack 2005295208, win 43690, options [mss 65495,sackOK,TS val 4481790 ecr 4481790,nop,wscale 7], length 0 Nothing wrong here. This is well known TCP behavior ( cross syn ). 
14:28:18.059729 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [.], ack 2005295208, win 342, options [nop,nop,TS val 4481790 ecr 4481790], length 0 14:28:19.121392 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [P.], seq 2005295208:2005295210, ack 2005295208, win 342, options [nop,nop,TS val 4482056 ecr 4481790], length 2 14:28:19.121407 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [.], ack 2005295210, win 342, options [nop,nop,TS val 4482056 ecr 4482056], length 0 # hping3 -S 127.0.0.1 -p 5000 -s 5000 HPING 127.0.0.1 (lo 127.0.0.1): S set, 40 headers + 0 data bytes len=40 ip=127.0.0.1 ttl=64 id=2036 sport=5000 flags=S seq=0 win=512 rtt=3.8 ms SYN DUP! len=52 ip=127.0.0.1 ttl=64 DF id=670 sport=5000 flags=A seq=0 win=342 rtt=3.8 ms SYN/ACK len=40 ip=127.0.0.1 ttl=64 DF id=43435 sport=5000 flags=RA seq=1 win=0 rtt=3.7 ms I confirmed it with nmap, nc, and hping3; granted they build on same c libraries, so I am not even sure if this should be filed as a kernel bug (or even a bug); just did not expect to see this behavior // Expected to see a RST instead. Sigh. Wont fix. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH iproute2 net-next] bridge: mdb: add support for router add/del notifications monitoring
On 27 Jul 2015, at 23:40, Stephen Hemminger step...@networkplumber.org wrote: On Mon, 27 Jul 2015 13:44:05 +0200 Nikolay Aleksandrov ra...@blackwall.org wrote: From: Nikolay Aleksandrov niko...@cumulusnetworks.com This patch adds support for ADDMDB/DELMDB notifications about router ports which have been added or deleted/expired respectively. Example output: $ bridge -s monitor mdb Deleted router port dev eth3 master br0 router port dev eth3 master br0 Signed-off-by: Nikolay Aleksandrov niko...@cumulusnetworks.com Looks useful, applied. Does usage or manual page need to be updated as well? Good question :-) I'll look into it. Thanks! -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [net PATCH] fib_trie: Drop unnecessary calls to leaf_pull_suffix
From: Alexander Duyck alexander.h.du...@redhat.com Date: Mon, 27 Jul 2015 13:08:06 -0700 It was reported that update_suffix was taking a long time on systems where a large number of leaves were attached to a single node. As it turns out fib_table_flush was calling update_suffix for each leaf that didn't have all of the aliases stripped from it. As a result, on this large node removing one leaf would result in us calling update_suffix for every other leaf on the node. The fix is to just remove the calls to leaf_pull_suffix since they are redundant as we already have a call in resize that will go through and update the suffix length for the node before we exit out of fib_table_flush or fib_table_flush_external. Reported-by: David Ahern d...@cumulusnetworks.com Signed-off-by: Alexander Duyck alexander.h.du...@redhat.com Applied and queued up for -stable, thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [net PATCH] fib_trie: Drop unnecessary calls to leaf_pull_suffix
On 7/27/15 2:08 PM, Alexander Duyck wrote: It was reported that update_suffix was taking a long time on systems where a large number of leaves were attached to a single node. As it turns out fib_table_flush was calling update_suffix for each leaf that didn't have all of the aliases stripped from it. As a result, on this large node removing one leaf would result in us calling update_suffix for every other leaf on the node. The fix is to just remove the calls to leaf_pull_suffix since they are redundant as we already have a call in resize that will go through and update the suffix length for the node before we exit out of fib_table_flush or fib_table_flush_external. Reported-by: David Ahernd...@cumulusnetworks.com Signed-off-by: Alexander Duyckalexander.h.du...@redhat.com --- This patch should apply to linux-4.1.y and newer kernels. I've done a bit of testing on my system and I no longer see update_suffix dominating the performance traces. David if you can test with this patch to see if you still see the issue I would appreciate it. Works for me. Thanks. Tested-by: David Ahern d...@cumulusnetworks.com -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/4] Shared vhost design
On Mon, Jul 13, 2015 at 12:07:31AM -0400, Bandan Das wrote: Hello, There have been discussions on improving the current vhost design. The first attempt, to my knowledge was Shirley Ma's patch to create a dedicated vhost worker per cgroup. http://comments.gmane.org/gmane.linux.network/224730 Later, I posted a cmwq based approach for performance comparisions http://comments.gmane.org/gmane.linux.network/286858 More recently was the Elvis work that was presented in KVM Forum 2013 http://www.linux-kvm.org/images/a/a3/Kvm-forum-2013-elvis.pdf The Elvis patches rely on common vhost thread design for scalability along with polling for performance. Since there are two major changes being proposed, we decided to split up the work. The first (this RFC), proposing a re-design of the vhost threading model and the second part (not posted yet) to focus more on improving performance. I am posting this with the hope that we can have a meaningful discussion on the proposed new architecture. We have run some tests to show that the new design is scalable and in terms of performance, is comparable to the current stable design. Test Setup: The testing is based on the setup described in the Elvis proposal. The initial tests are just an aggregate of Netperf STREAM and MAERTS but as we progress, I am happy to run more tests. The hosts are two identical 16 core Haswell systems with point to point network links. For the first 10 runs, with n=1 upto n=10 guests running in parallel, I booted the target system with nr_cpus=8 and mem=12G. The purpose was to do a comparision of resource utilization and how it affects performance. Finally, with the number of guests set at 14, I didn't limit the number of CPUs booted on the host or limit memory seen by the kernel but boot the kernel with isolcpus=14,15 that will be used to run the vhost threads. The guests are pinned to cpus 0-13 and based on which cpu the guest is running on, the corresponding I/O thread is either pinned to cpu 14 or 15. 
Results # X axis is number of guests # Y axis is netperf number # nr_cpus=8 and mem=12G #Number of Guests#Baseline#ELVIS 11119.3 .0 2 1135.6 1130.2 3 1135.5 1131.6 4 1136.0 1127.1 5 1118.6 1129.3 6 1123.4 1129.8 7 1128.7 1135.4 8 1129.9 1137.5 9 1130.6 1135.1 10 1129.3 1138.9 14*1173.8 1216.9 I'm a bit too busy now, with 2.4 and related stuff, will review once we finish 2.4. But I'd like to ask two things: - did you actually test a config where cgroups were used? - does the design address the issue of VM 1 being blocked (e.g. because it hits swap) and blocking VM 2? #* Last run with the vCPU and I/O thread(s) pinned, no CPU/memory limit imposed. # I/O thread runs on CPU 14 or 15 depending on which guest it's serving There's a simple graph at http://people.redhat.com/~bdas/elvis/data/results.png that shows how task affinity results in a jump and even without it, as the number of guests increase, the shared vhost design performs slightly better. Observations: 1. In terms of stock performance, the results are comparable. 2. However, with a tuned setup, even without polling, we see an improvement with the new design. 3. Making the new design simulate old behavior would be a matter of setting the number of guests per vhost threads to 1. 4. Maybe, setting a per guest limit on the work being done by a specific vhost thread is needed for it to be fair. 5. cgroup associations needs to be figured out. I just slightly hacked the current cgroup association mechanism to work with the new model. Ccing cgroups for input/comments. Many thanks to Razya Ladelsky and Eyal Moscovici, IBM for the initial patches, the helpful testing suggestions and discussions. 
Bandan Das (4): vhost: Introduce a universal thread to serve all users vhost: Limit the number of devices served by a single worker thread cgroup: Introduce a function to compare cgroups vhost: Add cgroup-aware creation of worker threads drivers/vhost/net.c| 6 +- drivers/vhost/scsi.c | 18 ++-- drivers/vhost/vhost.c | 272 +++-- drivers/vhost/vhost.h | 32 +- include/linux/cgroup.h | 1 + kernel/cgroup.c| 40 6 files changed, 275 insertions(+), 94 deletions(-) -- 2.4.3 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Fw: [Bug 102051] New: Unexpected TCP behavior
Begin forwarded message: Date: Mon, 27 Jul 2015 20:06:07 + From: bugzilla-dae...@bugzilla.kernel.org bugzilla-dae...@bugzilla.kernel.org To: shemmin...@linux-foundation.org shemmin...@linux-foundation.org Subject: [Bug 102051] New: Unexpected TCP behavior https://bugzilla.kernel.org/show_bug.cgi?id=102051 Bug ID: 102051 Summary: Unexpected TCP behavior Product: Networking Version: 2.5 Kernel Version: 3.19 Hardware: All OS: Linux Tree: Mainline Status: NEW Severity: low Priority: P1 Component: IPV4 Assignee: shemmin...@linux-foundation.org Reporter: vreme...@gmail.com Regression: No While running nmap against localhost I started to see open ports in the dynamic range (1024). Kind of odd, knowing netstat and ss did not show any listeners on the port. With tcpdump, I confirmed system was returning S/ACK for ports that did not have a listener enabled. The issue or feature only occurs if source port matches the destination port. # netstat -nap | grep 5000 # $ nc localhost -p 5000 5000 a a # tcpdump -i lo port 5000 14:28:18.059708 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [S], seq 2005295207, win 43690, options [mss 65495,sackOK,TS val 4481790 ecr 0,nop,wscale 7], length 0 14:28:18.059721 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [S.], seq 2005295207, ack 2005295208, win 43690, options [mss 65495,sackOK,TS val 4481790 ecr 4481790,nop,wscale 7], length 0 14:28:18.059729 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [.], ack 2005295208, win 342, options [nop,nop,TS val 4481790 ecr 4481790], length 0 14:28:19.121392 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [P.], seq 2005295208:2005295210, ack 2005295208, win 342, options [nop,nop,TS val 4482056 ecr 4481790], length 2 14:28:19.121407 IP 127.0.0.1.5000 127.0.0.1.5000: Flags [.], ack 2005295210, win 342, options [nop,nop,TS val 4482056 ecr 4482056], length 0 # hping3 -S 127.0.0.1 -p 5000 -s 5000 HPING 127.0.0.1 (lo 127.0.0.1): S set, 40 headers + 0 data bytes len=40 ip=127.0.0.1 ttl=64 id=2036 sport=5000 flags=S seq=0 win=512 rtt=3.8 ms 
SYN DUP! len=52 ip=127.0.0.1 ttl=64 DF id=670 sport=5000 flags=A seq=0 win=342 rtt=3.8 ms SYN/ACK len=40 ip=127.0.0.1 ttl=64 DF id=43435 sport=5000 flags=RA seq=1 win=0 rtt=3.7 ms I confirmed it with nmap, nc, and hping3; granted they build on same c libraries, so I am not even sure if this should be filed as a kernel bug (or even a bug); just did not expect to see this behavior // Expected to see a RST instead. -- You are receiving this mail because: You are the assignee for the bug. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 0/4] Shared vhost design
On Mon, Jul 27, 2015 at 03:48:19PM -0400, Bandan Das wrote: Eyal Moscovici eya...@il.ibm.com writes: Hi, The test showed the same relative numbers as we got in our internal testing. I was wondering about the configuration in regards to NUMA. From Thanks for confirming. our testing we saw that if the VMs are spread across 2 NUMA nodes then having a shared vhost thread per node performs better than having the two threads in the same core. IIUC, this is similar to my test setup and observations i.e 14* 1173.8 1216.9 In this case, there's a shared vhost thread on CPU 14 for numa node 0 and another on CPU 15 for numa node 1. Guests running on CPUs 0,2,4,6,8,10,12 are serviced by vhost-0 that runs on CPU 14 and guests running on CPUs 1,3,5,7,9,11,13 get serviced by vhost-1 (Numa node 1). I tried some other configurations but this configuration gave me the best results. Eyal, I think it makes sense to add polling on top of these patches and get numbers for them too. Thoughts ? Bandan So simple polling by vhost is kind of ok for some guests, but I think to really make it work for a reasonably wide selection of guests/workloads you need to combine it with 1. polling the NIC - it makes no sense to me to only poll one side of the equation; and probably 2. - polling in guest. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [RFC PATCH 4/4] vhost: Add cgroup-aware creation of worker threads
On Mon, Jul 13, 2015 at 12:07:35AM -0400, Bandan Das wrote: With the help of the cgroup function to compare groups introduced in the previous patch, this changes worker creation policy. If the new device belongs to different cgroups than any of the devices we are currently serving, we end up creating a new worker thread even if we haven't reached the devs_per_worker threshold Signed-off-by: Bandan Das b...@redhat.com Would it make sense to integrate this in the work-queue mechanism somehow? Just a thought - correctly accounting kernel's work on behalf of specific userspace groups might have value generally. Or is the usecase too special? Cc Tejun for comments. --- drivers/vhost/vhost.c | 47 +++ 1 file changed, 39 insertions(+), 8 deletions(-) diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 6a5d4c0..dc0fa37 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -261,12 +261,6 @@ static int vhost_worker(void *data) use_mm(dev-mm); } - /* TODO: Consider a more elegant solution */ - if (worker-owner != dev-owner) { - /* Should check for return value */ - cgroup_attach_task_all(dev-owner, current); - worker-owner = dev-owner; - } work-fn(work); if (need_resched()) schedule(); @@ -278,6 +272,36 @@ static int vhost_worker(void *data) return 0; } +struct vhost_attach_cgroups_struct { + struct vhost_work work; + struct task_struct *owner; + int ret; +}; + +static void vhost_attach_cgroups_work(struct vhost_work *work) +{ + struct vhost_attach_cgroups_struct *s; + + s = container_of(work, struct vhost_attach_cgroups_struct, work); + s-ret = cgroup_attach_task_all(s-owner, current); +} + +static void vhost_attach_cgroups(struct vhost_dev *dev, + struct vhost_worker *worker) +{ + struct vhost_attach_cgroups_struct attach; + + attach.owner = dev-owner; + vhost_work_init(dev, attach.work, vhost_attach_cgroups_work); + vhost_work_queue(worker, attach.work); + vhost_work_flush(worker, attach.work); + + if (!attach.ret) + worker-owner = dev-owner; + + 
dev-err = attach.ret; +} + static void vhost_create_worker(struct vhost_dev *dev) { struct vhost_worker *worker; @@ -300,8 +324,14 @@ static void vhost_create_worker(struct vhost_dev *dev) spin_lock_init(worker-work_lock); INIT_LIST_HEAD(worker-work_list); + + /* attach to the cgroups of the process that created us */ + vhost_attach_cgroups(dev, worker); + if (dev-err) + goto therror; + worker-owner = dev-owner; + list_add(worker-node, pool-workers); - worker-owner = NULL; worker-num_devices++; total_vhost_workers++; dev-worker = worker; @@ -320,7 +350,8 @@ static int vhost_dev_assign_worker(struct vhost_dev *dev) mutex_lock(vhost_pool-pool_lock); list_for_each_entry(worker, vhost_pool-workers, node) { - if (worker-num_devices devs_per_worker) { + if (worker-num_devices devs_per_worker + (!cgroup_match_groups(dev-owner, worker-owner))) { dev-worker = worker; dev-worker_assigned = true; worker-num_devices++; -- 2.4.3 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH iproute2] xfrm: remove duplicated include
On Sat, 25 Jul 2015 04:44:24 -0400 Zhang Shengju zhangshen...@cmss.chinamobile.com wrote: Remove duplicated include for linux/xfrm.h, since it's already included by 'xfrm.h'. Signed-off-by: Zhang Shengju zhangshen...@cmss.chinamobile.com Applied, thanks -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] phylib: add driver for aquantia phy
On 27/07/15 01:30, Shaohui Xie wrote: -Original Message- From: Florian Fainelli [mailto:f.faine...@gmail.com] Sent: Friday, July 24, 2015 12:39 PM To: shh@gmail.com; netdev@vger.kernel.org; da...@davemloft.net Cc: Xie Shaohui-B21989 Subject: Re: [PATCH] phylib: add driver for aquantia phy Le 07/23/15 20:46, shh@gmail.com a écrit : From: Shaohui Xie shaohui@freescale.com This patch added driver to support Aquantia PHYs AQ1202, AQ2104, AQR105, AQR405, which accessed through clause 45. Could you prefix your patches with net: phy: in the future to be consistent with what is typically used? [S.H] OK. See comments below Signed-off-by: Shaohui Xie shaohui@freescale.com --- [snip] +static int aquantia_read_status(struct phy_device *phydev) { + int reg; + + phydev-speed = SPEED_1; + phydev-duplex = DUPLEX_FULL; + + reg = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_STAT1); + reg = phy_read_mmd(phydev, MDIO_MMD_AN, MDIO_STAT1); + if (reg MDIO_STAT1_LSTATUS) + phydev-link = 1; + else + phydev-link = 0; + + reg = phy_read_mmd(phydev, MDIO_MMD_AN, 0xc800); + mdelay(10); + reg = phy_read_mmd(phydev, MDIO_MMD_AN, 0xc800); + if (reg == 0x9) + phydev-speed = SPEED_2500; + else if (reg == 0x5) + phydev-speed = SPEED_1000; + else if (reg == 0x3) + phydev-speed = SPEED_100; Could we use a switch/case here? [S.H] OK. How about 10Mbits/sec and duplex are we guaranteed to be full-duplex at e.g: 100 or 10Mbits/sec? [S.H] The PHY does not support 10M bits/sec. When link to 100M. the phy is full-duplex. Ok, that means you need to restrict the supported flags accordingly not to advertise these modes as being supported in the first place, see below: + + return 0; +} + +static struct phy_driver aquantia_driver[] = { { + .phy_id = PHY_ID_AQ1202, + .phy_id_mask= 0xfff0, + .name = Aquantia AQ1202, + .features = PHY_GBIT_FEATURES, If these are 10GbE PHYs, should not we start defining a new features bitmask here to reflect that accordingly? 
That way MAC [S.H] there are several defines for 10G PHYs, should be used by a given 10G PHY. for this Aquantia PHY, SUPPORTED_1baseT_Full is a valid define, should I set it as below: .features = PHY_GBIT_FEATURES | SUPPORTED_1baseT_Full, PHY_GBIT_FEATURES means 10/100/1000 half and full-duplex are supported, which are not supported as you indicated above, I would go with adding only the supported modes here, this is really important since this is the contract between the PHY driver and the Ethernet MAC using it through the PHY library. Thanks! -- Florian -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
ip6t_SYNPROXY crashes kernel
Hi, When synproxy_send_server_ack() calls synproxy_send_tcp(), it passes NULL as third parameter (struct nf_conntrack *nfct). And the first thing synproxy_send_tcp() does, is dereference it: | struct net *net = nf_ct_net((struct nf_conn *)nfct); I could not find a commit leading to this breakage in the commit log, which makes me doubt ip6t_SYNPROXY has ever worked at all. If you need one, I have a reproducer at hand. (Though I would want to strip it down a bit first.) Just let me know. Cheers, Phil -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH V3 net-next 0/3] ARM BPF JIT features
From: Nicolas Schichan nschic...@freebox.fr Date: Mon, 27 Jul 2015 15:06:48 +0200 This series adds support for more instructions to the ARM BPF JIT, namely skb netdevice type retrieval, skb payload offset retrieval, and skb packet type retrieval. This allows 35 tests to use the JIT instead of 29 before. This series depends on the BPF JIT fixes for ARM series sent earlier. Series applied, thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next 0/4] net/mlx4_en: Hardware accelerated 802.1ad
From: Amir Vadai am...@mellanox.com Date: Mon, 27 Jul 2015 14:46:30 +0300 This patchset by Hadar introduces support for hardware-accelerated 802.1ad, for ConnectX-3pro NICs. In order to support existing deployment, and due to some hardware limitations, the feature is disabled by default, and needs to be enabled using a private flag in ethtool. Of course, the user can enable the private flag only if hardware has support. After being enabled, the standard ethtool -k/-K can be used. Patchset was applied and tested over commit 71790a2 (hv_netvsc: Add structs and handlers for VF messages) Series applied, thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 2/2] ixgbe: Teardown SR-IOV before unregister_netdev()
When the .remove() callback for a PF is called, SR-IOV support for the device is disabled, which requires unbinding and removing the VFs. The VFs may be in-use either by the host kernel or userspace, such as assigned to a VM through vfio-pci. In this latter case, the VFs may be removed either by shutting down the VM or hot-unplugging the devices from the VM. Unfortunately in the case of a Windows 2012 R2 guest, hot-unplug is broken due to the ordering of the PF driver teardown. Disabling SR-IOV prior to unregister_netdev() avoids this issue. Signed-off-by: Alex Williamson alex.william...@redhat.com --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f775123..e27813c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9035,12 +9035,12 @@ static void ixgbe_remove(struct pci_dev *pdev) /* remove the added san mac */ ixgbe_del_sanmac_netdev(netdev); - if (netdev-reg_state == NETREG_REGISTERED) - unregister_netdev(netdev); - #ifdef CONFIG_PCI_IOV ixgbe_disable_sriov(adapter); #endif + if (netdev-reg_state == NETREG_REGISTERED) + unregister_netdev(netdev); + ixgbe_clear_interrupt_scheme(adapter); ixgbe_release_hw_control(adapter); -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
RE: [Intel-wired-lan] [PATCH 2/2] ixgbe: Teardown SR-IOV before unregister_netdev()
ACK -Original Message- From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On Behalf Of Alex Williamson Sent: Monday, July 27, 2015 4:19 PM To: intel-wired-...@lists.osuosl.org; Kirsher, Jeffrey T Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org Subject: [Intel-wired-lan] [PATCH 2/2] ixgbe: Teardown SR-IOV before unregister_netdev() When the .remove() callback for a PF is called, SR-IOV support for the device is disabled, which requires unbinding and removing the VFs. The VFs may be in-use either by the host kernel or userspace, such as assigned to a VM through vfio-pci. In this latter case, the VFs may be removed either by shutting down the VM or hot-unplugging the devices from the VM. Unfortunately in the case of a Windows 2012 R2 guest, hot-unplug is broken due to the ordering of the PF driver teardown. Disabling SR-IOV prior to unregister_netdev() avoids this issue. Signed-off-by: Alex Williamson alex.william...@redhat.com --- drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c index f775123..e27813c 100644 --- a/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c +++ b/drivers/net/ethernet/intel/ixgbe/ixgbe_main.c @@ -9035,12 +9035,12 @@ static void ixgbe_remove(struct pci_dev *pdev) /* remove the added san mac */ ixgbe_del_sanmac_netdev(netdev); - if (netdev-reg_state == NETREG_REGISTERED) - unregister_netdev(netdev); - #ifdef CONFIG_PCI_IOV ixgbe_disable_sriov(adapter); #endif + if (netdev-reg_state == NETREG_REGISTERED) + unregister_netdev(netdev); + ixgbe_clear_interrupt_scheme(adapter); ixgbe_release_hw_control(adapter); ___ Intel-wired-lan mailing list intel-wired-...@lists.osuosl.org http://lists.osuosl.org/mailman/listinfo/intel-wired-lan -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo 
info at http://vger.kernel.org/majordomo-info.html
RE: [Intel-wired-lan] [PATCH 1/2] igb: Teardown SR-IOV before unregister_netdev()
ACK -Original Message- From: Intel-wired-lan [mailto:intel-wired-lan-boun...@lists.osuosl.org] On Behalf Of Alex Williamson Sent: Monday, July 27, 2015 4:19 PM To: intel-wired-...@lists.osuosl.org; Kirsher, Jeffrey T Cc: netdev@vger.kernel.org; linux-ker...@vger.kernel.org Subject: [Intel-wired-lan] [PATCH 1/2] igb: Teardown SR-IOV before unregister_netdev() When the .remove() callback for a PF is called, SR-IOV support for the device is disabled, which requires unbinding and removing the VFs. The VFs may be in-use either by the host kernel or userspace, such as assigned to a VM through vfio-pci. In this latter case, the VFs may be removed either by shutting down the VM or hot-unplugging the devices from the VM. Unfortunately in the case of a Windows 2012 R2 guest, hot-unplug is broken due to the ordering of the PF driver teardown. Disabling SR-IOV prior to unregister_netdev() avoids this issue. Signed-off-by: Alex Williamson alex.william...@redhat.com --- drivers/net/ethernet/intel/igb/igb_main.c |8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/intel/igb/igb_main.c b/drivers/net/ethernet/intel/igb/igb_main.c index 517746f..606a7ae 100644 --- a/drivers/net/ethernet/intel/igb/igb_main.c +++ b/drivers/net/ethernet/intel/igb/igb_main.c @@ -2805,14 +2805,14 @@ static void igb_remove(struct pci_dev *pdev) */ igb_release_hw_control(adapter); - unregister_netdev(netdev); - - igb_clear_interrupt_scheme(adapter); - #ifdef CONFIG_PCI_IOV igb_disable_sriov(pdev); #endif + unregister_netdev(netdev); + + igb_clear_interrupt_scheme(adapter); + pci_iounmap(pdev, hw-hw_addr); if (hw-flash_address) iounmap(hw-flash_address); ___ Intel-wired-lan mailing list intel-wired-...@lists.osuosl.org http://lists.osuosl.org/mailman/listinfo/intel-wired-lan -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH] packet: Allow packets with only a header (but no payload)
Hi Johann, On Tue, Jul 21, 2015 at 6:51 PM, Willem de Bruijn will...@google.com wrote: I don't see a simple way of verifying the safety of allowing packets without data short of a code audit, which would be huge, especially when taking device driver logic into account. Perhaps someone remembers why that statement was added and what edge case(s) it refers to. I'm afraid that I don't. It was added in 69e3c75f4d54. I added the author to this thread. I know it's summer (and thus vacation-time), but did you already have a chance to look into this? Regards, Martin -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch -master] netfilter: xt_CT: checking for IS_ERR() instead of NULL
We recently changed this from nf_conntrack_alloc() to nf_ct_tmpl_alloc() so the error handling needs to be changed to check for NULL instead of IS_ERR(). Fixes: 0838aa7fcfcd ('netfilter: fix netns dependencies with conntrack templates') Signed-off-by: Dan Carpenter dan.carpen...@oracle.com diff --git a/net/netfilter/xt_CT.c b/net/netfilter/xt_CT.c index c663003..43ddeee 100644 --- a/net/netfilter/xt_CT.c +++ b/net/netfilter/xt_CT.c @@ -202,9 +202,10 @@ static int xt_ct_tg_check(const struct xt_tgchk_param *par, goto err1; ct = nf_ct_tmpl_alloc(par-net, info-zone, GFP_KERNEL); - ret = PTR_ERR(ct); - if (IS_ERR(ct)) + if (!ct) { + ret = -ENOMEM; goto err2; + } ret = 0; if ((info-ct_events || info-exp_events) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: kernel warning in tcp_fragment
On Wed, Jul 22, 2015 at 11:55:35AM -0700, Jovi Zhangwei wrote: Sorry for disturbing, our production system(3.14 and 3.18 stable kernel) have many tcp_fragment warnings, the trace is same as below one which you discussed before. https://urldefense.proofpoint.com/v1/url?u=http://comments.gmane.org/gmane.linux.network/365658k=ZVNjlDMF0FElm4dQtryO4A%3D%3D%0Ar=%2Faj1ZOQObwbmtLwlDw3XzQ%3D%3D%0Am=fQUME5h%2FYY3oZjXbnLC3z6TaEEcTBSCAji4PkNqFjq8%3D%0As=1527f3221a6f31cba9544e5ddaa20986aafe8be8c898b42c7e9ce5e68d3803d8 But I didn't find the final solution in that mail thread, do you have any new ideas or patches on this warning? I think the following points to the last discussion. We are currently using a similar patch: http://comments.gmane.org/gmane.linux.network/366549 Eric, any update on your findings? or you have already pushed a fix? Thanks, --Martin -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[patch] packet: missing dev_put() in packet_do_bind()
From: Lars Westerhoff lars.westerh...@newtec.eu When binding a PF_PACKET socket, the use count of the bound interface is always increased with dev_hold in dev_get_by_{index,name}. However, when rebound with the same protocol and device as in the previous bind the use count of the interface was not decreased. Ultimately, this caused the deletion of the interface to fail with the following message: unregister_netdevice: waiting for dummy0 to become free. Usage count = 1 This patch moves the dev_put out of the conditional part that was only executed when either the protocol or device changed on a bind. Fixes: 902fefb82ef7 ('packet: improve socket create/bind latency in some cases') Signed-off-by: Lars Westerhoff lars.westerh...@newtec.eu Signed-off-by: Dan Carpenter dan.carpen...@oracle.com Reviewed-by: Daniel Borkmann dbork...@redhat.com diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c index c9e8741..c7c42eb 100644 --- a/net/packet/af_packet.c +++ b/net/packet/af_packet.c @@ -2784,7 +2784,7 @@ static int packet_release(struct socket *sock) static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) { struct packet_sock *po = pkt_sk(sk); - const struct net_device *dev_curr; + struct net_device *dev_curr; __be16 proto_curr; bool need_rehook; @@ -2808,15 +2808,13 @@ static int packet_do_bind(struct sock *sk, struct net_device *dev, __be16 proto) po-num = proto; po-prot_hook.type = proto; - - if (po-prot_hook.dev) - dev_put(po-prot_hook.dev); - po-prot_hook.dev = dev; po-ifindex = dev ? dev-ifindex : 0; packet_cached_dev_assign(po, dev); } + if (dev_curr) + dev_put(dev_curr); if (proto == 0 || !need_rehook) goto out_unlock; -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH] netfilter: ipt_SYNPROXY: fix sending window update to client
Upon receipt of SYNACK from the server, ipt_SYNPROXY first sends back an ACK to finish the server handshake, then calls nf_ct_seqadj_init() to initiate sequence number adjustment of forwarded packets to the client and finally sends a window update to the client to unblock its TX queue. Since synproxy_send_client_ack() does not set synproxy_send_tcp()'s nfct parameter, no sequence number adjustment happens and the client receives the window update with incorrect sequence number. Depending on client TCP implementation, this leads to a significant delay (until a window probe is being sent). Signed-off-by: Phil Sutter p...@nwl.cc --- net/ipv4/netfilter/ipt_SYNPROXY.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/net/ipv4/netfilter/ipt_SYNPROXY.c b/net/ipv4/netfilter/ipt_SYNPROXY.c index fe8cc18..95ea633e 100644 --- a/net/ipv4/netfilter/ipt_SYNPROXY.c +++ b/net/ipv4/netfilter/ipt_SYNPROXY.c @@ -226,7 +226,8 @@ synproxy_send_client_ack(const struct synproxy_net *snet, synproxy_build_options(nth, opts); - synproxy_send_tcp(skb, nskb, NULL, 0, niph, nth, tcp_hdr_size); + synproxy_send_tcp(skb, nskb, skb-nfct, IP_CT_ESTABLISHED_REPLY, + niph, nth, tcp_hdr_size); } static bool -- 1.8.3.1 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net v3] macvtap: fix network header pointer for VLAN tagged pkts
From: Ivan Vecera ivec...@redhat.com Date: Thu, 23 Jul 2015 16:37:43 +0200 Network header is set with offset ETH_HLEN but it is not true for VLAN (multiple-)tagged and results in checksum issues in lower devices. v2: leave skb-protocol untouched (thx Vlad), comment added v3: moved after skb_probe_transport_header() call (thx Toshiaki) Signed-off-by: Ivan Vecera ivec...@redhat.com Applied, thanks. -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
[PATCH 0/2] igb/ixgbe: Fix ordering of SR-IOV teardown
When running a Windows 2012 R2 guest with a pair of VFs assigned through vfio-pci, we run into a problem trying to hot-unplug those VFs after the PF has unregistered the netdev. This is a common scenario if the PF is unbound from the driver while VFs are active. In the case of igb, the resulting guest behavior differs slightly between the Microsoft provided and Intel add-on guest drivers. With the Microsoft driver, the guest seems to stumble through ejecting both VFs, but takes longer than normal to do so. With the Intel drivers, only one VF is unplugged, but Device Manager still shows it as present. The second VF is non-functional but also still shown in Device Manager. At this point, the guest is in such a state that it will not cleanly shutdown. With ixgbe VFs, both the Microsoft and Intel drivers take on this latter behavior. For both, I've found that disabling SR-IOV before unregistering the PF netdev device allows the hot-unplug to proceed without interruption or further ill behavior in the guest. This is true regardless of which driver is used. I don't fully understand what dependency is broken by unregistering the netdev prior to disabling SR-IOV, but I also don't see the benefit in delaying SR-IOV teardown in this call path. It could potentially be moved even earlier, but I'll let those more familiar with the hardware and code make that determination. In any case, the VM behavior is substantially improved by this slight re-ordering. I don't have an i40e for testing, but it already appears to disable SR-IOV much earlier in the unbind path, so I wouldn't expect to find similar issues. 
Thanks, Alex --- Alex Williamson (2): igb: Teardown SR-IOV before unregister_netdev() ixgbe: Teardown SR-IOV before unregister_netdev() drivers/net/ethernet/intel/igb/igb_main.c |8 drivers/net/ethernet/intel/ixgbe/ixgbe_main.c |6 +++--- 2 files changed, 7 insertions(+), 7 deletions(-) -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html
Re: [PATCH net-next v2 1/2] ipv6: Re-arrange code in rt6_probe()
Martin KaFai Lau wrote: It is a prep work for the next patch to remove write_lock from rt6_probe(). 1. Reduce the number of if(neigh) check. From 4 to 1. 2. Bring the write_(un)lock() closer to the operations that the lock is protecting. Hopefully, the above make rt6_probe() more readable. Signed-off-by: Martin KaFai Lau ka...@fb.com Cc: Hannes Frederic Sowa han...@stressinduktion.org Cc: Julian Anastasov j...@ssi.bg Cc: YOSHIFUJI Hideaki hideaki.yoshif...@miraclelinux.com Acked-by: YOSHIFUJI Hideaki hideaki.yoshif...@miraclelinux.com --yoshfuji --- net/ipv6/route.c | 44 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/net/ipv6/route.c b/net/ipv6/route.c index 7f2214f..6d503db 100644 --- a/net/ipv6/route.c +++ b/net/ipv6/route.c @@ -545,6 +545,7 @@ static void rt6_probe_deferred(struct work_struct *w) static void rt6_probe(struct rt6_info *rt) { + struct __rt6_probe_work *work; struct neighbour *neigh; /* * Okay, this does not seem to be appropriate @@ -559,34 +560,29 @@ static void rt6_probe(struct rt6_info *rt) rcu_read_lock_bh(); neigh = __ipv6_neigh_lookup_noref(rt-dst.dev, rt-rt6i_gateway); if (neigh) { + work = NULL; write_lock(neigh-lock); - if (neigh-nud_state NUD_VALID) - goto out; - } - - if (!neigh || - time_after(jiffies, neigh-updated + rt-rt6i_idev-cnf.rtr_probe_interval)) { - struct __rt6_probe_work *work; - - work = kmalloc(sizeof(*work), GFP_ATOMIC); - - if (neigh work) - __neigh_set_probe_once(neigh); - - if (neigh) - write_unlock(neigh-lock); - - if (work) { - INIT_WORK(work-work, rt6_probe_deferred); - work-target = rt-rt6i_gateway; - dev_hold(rt-dst.dev); - work-dev = rt-dst.dev; - schedule_work(work-work); + if (!(neigh-nud_state NUD_VALID) + time_after(jiffies, +neigh-updated + +rt-rt6i_idev-cnf.rtr_probe_interval)) { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + if (work) + __neigh_set_probe_once(neigh); } - } else { -out: write_unlock(neigh-lock); + } else { + work = kmalloc(sizeof(*work), GFP_ATOMIC); + } + + if (work) { + 
INIT_WORK(work-work, rt6_probe_deferred); + work-target = rt-rt6i_gateway; + dev_hold(rt-dst.dev); + work-dev = rt-dst.dev; + schedule_work(work-work); } + rcu_read_unlock_bh(); } #else -- 吉藤英明 hideaki.yoshif...@miraclelinux.com ミラクル・リナックス株式会社 技術本部 サポート部 -- To unsubscribe from this list: send the line unsubscribe netdev in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html