[RFC PATCH V2] ARM: compressed: Move ramdisk forward to reserve more memory for kernel image
From: Wu Zhangjin During embedded linux system booting, before decompressing the kernel image, the bootloader(E.g. Uboot) loads the compressed kernel image and ramdisk into two contiguous memory space, these two memory space are fixed after the Uboot is released, as a result, the maximum size of the decompressed kernel image is limited by the size of the reserved memory space (the difference of the two contiguous memory addresses). If want more functions or some debug options, the decompressed kernel image may be bigger and may overwrite the followed ramdisk and result in kernel boot failure for missing a valid ramdisk. To fix up this issue, before decompressing the kernel image, this option moves the loaded ramdisk image forward with a specified offset and reserve more memory for the decompressed kernel image. -- v1-->v2: o Fix up ATAG_RAMDISK support o Add phys base addr for ORION5X and KIRKWOOD o Add an independent move_ramdisk.h, new board support can be added here. TODO: o Add dtb support Signed-off-by: Wu Zhangjin --- arch/arm/boot/compressed/Makefile |2 +- arch/arm/boot/compressed/misc.c |7 +++- arch/arm/boot/compressed/move_ramdisk.c | 61 +++ arch/arm/boot/compressed/move_ramdisk.h | 35 ++ usr/Kconfig | 29 +++ 5 files changed, 131 insertions(+), 3 deletions(-) create mode 100644 arch/arm/boot/compressed/move_ramdisk.c create mode 100644 arch/arm/boot/compressed/move_ramdisk.h diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index bb26756..e4ef227 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -23,7 +23,7 @@ endif AFLAGS_head.o += -DTEXT_OFFSET=$(TEXT_OFFSET) HEAD = head.o -OBJS += misc.o decompress.o +OBJS += misc.o decompress.o move_ramdisk.o FONTC = $(srctree)/drivers/video/console/font_acorn_8x8.c # string library code (-Os is enforced to keep it much smaller) diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c index 8e2a8fc..df1df60 100644 --- a/arch/arm/boot/compressed/misc.c +++ b/arch/arm/boot/compressed/misc.c @@ -22,7 +22,8 @@ unsigned int __machine_arch_type; #include #include -static void putstr(const char *ptr); +extern void move_ramdisk(void); +extern void putstr(const char *ptr); extern void error(char *x); #include @@ -83,7 +84,7 @@ static void icedcc_putc(int ch) #define putc(ch) icedcc_putc(ch) #endif -static void putstr(const char *ptr) +void putstr(const char *ptr) { char c; @@ -144,6 +145,8 @@ decompress_kernel(unsigned long output_start, unsigned long free_mem_ptr_p, arch_decomp_setup(); + move_ramdisk(); + putstr("Uncompressing Linux..."); ret = do_decompress(input_data, input_data_end - input_data, output_data, error); diff --git a/arch/arm/boot/compressed/move_ramdisk.c b/arch/arm/boot/compressed/move_ramdisk.c new file mode 100644 index 000..b984433 --- /dev/null +++ b/arch/arm/boot/compressed/move_ramdisk.c @@ -0,0 +1,61 @@ +/* + * move_ramdisk.c + * + * Author: Wu Zhangjin, wuzhang...@gmail.com, December 2013 + * Copyright: (C) 2013 Meizu Telecom Equipment Co., Ltd + * + * Please get help of MOVE_RAMDISK from usr/Kconfig. + */ + + +#include "move_ramdisk.h" + +extern void putstr(const char *ptr); + +void move_ramdisk(void) +{ +#if defined(CONFIG_MOVE_RAMDISK) && defined(TAG_BASE_ADDR) + struct tag *tags = (struct tag *)(TAG_BASE_ADDR); + __u32 start, target, size; + int found = 0, type = 0; + + putstr("Searching ramdisk ...\n"); + if (tags->hdr.tag == ATAG_CORE) { + putstr("Found tags ...\n"); + for (tags = tag_next(tags); tags->hdr.size; tags = tag_next(tags)) { + if (tags->hdr.tag == ATAG_INITRD2) { + putstr("Found initrd2 tag ...\n"); + found = 1; + break; + } else if (tags->hdr.tag == ATAG_INITRD) { + putstr("Found initrd tag ...\n"); + found = 1; + break; + } else if (tags->hdr.tag == ATAG_RAMDISK) { + putstr("Found ramdisk tag ...\n"); + found = 1; + type = 1; + break; + } + } + if (found) { + if (type == 0) { + start = tags->u.initrd.start; + size = tags->u.initrd.size; + target = start + MOVE_RAMDISK_OFFSET; + tags->u.initrd.start = target; + } else { + s
[RFC PATCH] ARM: compressed: Move ramdisk forward to reserve more memory for kernel image
From: Wu Zhangjin During the booting of an embedded Linux system, before decompressing the kernel image, the bootloader(E.g. Uboot) loads the compressed kernel image and ramdisk image into two contiguous memory spaces, these two memory spaces are fixed after the Uboot is released, as a result, the maximum size of the (decompressed) kernel image is limited by the size of the reserved memory space (the difference of the two contiguous memory addresses). If want more functions or some debug options, the decompressed kernel image may be bigger and may overwrite the followed ramdisk mage and result in kernel boot failure for missing a valid ramdisk. To fix up this issue, before decompressing the kernel image, this option moves the loaded ramdisk image forward with a configurable offset and reserve more memory for the decompressed kernel image. Signed-off-by: Wu Zhangjin --- arch/arm/boot/compressed/Makefile |2 +- arch/arm/boot/compressed/misc.c |7 ++-- arch/arm/boot/compressed/move_ramdisk.c | 58 +++ usr/Kconfig | 29 4 files changed, 93 insertions(+), 3 deletions(-) create mode 100644 arch/arm/boot/compressed/move_ramdisk.c diff --git a/arch/arm/boot/compressed/Makefile b/arch/arm/boot/compressed/Makefile index e7190bb..c215918 100644 --- a/arch/arm/boot/compressed/Makefile +++ b/arch/arm/boot/compressed/Makefile @@ -23,7 +23,7 @@ endif AFLAGS_head.o += -DTEXT_OFFSET=$(TEXT_OFFSET) HEAD = head.o -OBJS += misc.o decompress.o +OBJS += misc.o decompress.o move_ramdisk.o ifeq ($(CONFIG_DEBUG_UNCOMPRESS),y) OBJS += debug.o endif diff --git a/arch/arm/boot/compressed/misc.c b/arch/arm/boot/compressed/misc.c index 31bd43b..27bac3c 100644 --- a/arch/arm/boot/compressed/misc.c +++ b/arch/arm/boot/compressed/misc.c @@ -22,7 +22,8 @@ unsigned int __machine_arch_type; #include #include -static void putstr(const char *ptr); +extern void move_ramdisk(void); +extern void putstr(const char *ptr); extern void error(char *x); #include CONFIG_UNCOMPRESS_INCLUDE @@ -83,7 +84,7 @@ static void icedcc_putc(int ch) #define putc(ch) icedcc_putc(ch) #endif -static void putstr(const char *ptr) +void putstr(const char *ptr) { char c; @@ -144,6 +145,8 @@ decompress_kernel(unsigned long output_start, unsigned long free_mem_ptr_p, arch_decomp_setup(); + move_ramdisk(); + putstr("Uncompressing Linux..."); ret = do_decompress(input_data, input_data_end - input_data, output_data, error); diff --git a/arch/arm/boot/compressed/move_ramdisk.c b/arch/arm/boot/compressed/move_ramdisk.c new file mode 100644 index 000..4b7ecfc --- /dev/null +++ b/arch/arm/boot/compressed/move_ramdisk.c @@ -0,0 +1,58 @@ +/* + * move_ramdisk.c + * + * Author: Wu Zhangjin, wuzhang...@gmail.com, December 2013 + * Copyright: (C) 2013 TinyLab.org + * + * Please get help of MOVE_RAMDISK from usr/Kconfig. + */ + +#include +#include +#include + +/* The tag address, may need to be changed for your board, see PHYS_OFFSET, atag_offset */ +#define TAG_BASE_ADDR (PLAT_PHYS_OFFSET + 0x100) + +#define MOVE_RAMDISK_OFFSET(CONFIG_MOVE_RAMDISK_OFFSET_M * 1024 * 1024) +#define tag_next(t)((struct tag *)((__u32 *)(t) + (t)->hdr.size)) + +extern void putstr(const char *ptr); + +void move_ramdisk(void) +{ +#ifdef CONFIG_MOVE_RAMDISK + struct tag *tags = (struct tag *)(TAG_BASE_ADDR); + __u32 start, size; + int found = 0; + + putstr("Searching ramdisk ...\n"); + if (tags->hdr.tag == ATAG_CORE) { + putstr("Found tags ...\n"); + for (tags = tag_next(tags); tags->hdr.size; tags = tag_next(tags)) { + if (tags->hdr.tag == ATAG_INITRD2) { + putstr("Found initrd2 tag ...\n"); + found = 1; + break; + } else if (tags->hdr.tag == ATAG_INITRD) { + putstr("Found initrd tag ...\n"); + found = 1; + break; + } else if (tags->hdr.tag == ATAG_RAMDISK) { + putstr("Found ramdisk tag ...\n"); + found = 1; + break; + } + } + if (found) { + start = tags->u.initrd.start; + size = tags->u.initrd.size; + tags->u.initrd.start = start + MOVE_RAMDISK_OFFSET; + putstr("Moving ramdisk forward ...\n"); + memcpy((char *)tags->u.initrd.start, (char *)start, size); + } + } else { + putstr("No tag found ...\n"); + } +#endif +} diff --git a/usr/Kconfig b/usr/Kconfig index 642f503..bd39418 100644 ---
[PATCH 2/2] async: Allow to group the asynced device probings
From: Wu Zhangjin [*Note*: NOT applicable, only for comments.] This allows to schedule a group of probings in order. Usage: If the probing of driver2 depends on the probing of driver1, we can put them into a group, here put them into a group named domain 1, they will be probed in the linking order. ... static struct platform_driver first_driver = { .probe = first_driver_probe, .driver = { .name = "first driver", + .async_probe = 1, + .async_domain = 1, }, }; ... static struct platform_driver second_driver = { .probe = second_driver_probe, .driver = { .name = "second_driver", + .async_probe = 1, + .async_domain = 1, }, }; ... With this feature, it is possible to async different class of drivers, for example, put all sound drivers into domain 2, put all display/video drivers into domain 3, and sensors domain 4, network drivers domain 5 and so forth. *TODO*: o To share the existing wait_for_device_probe(), register all async domains with registered=1. But it is too early than our wait_for_async_probe_domain(). o It may be possible to async the whole kernel initcalls(except the one before scheduler available) with more complicated group features, currently, this implementation only allows to group the probings linearly, but the real dependencies of the probings are more complicated, to solve this issue, group the probings in a *tree* architecture may work. Signed-off-by: Wu Zhangjin --- drivers/base/dd.c | 101 ++-- include/linux/device.h |1 + init/main.c|4 ++ 3 files changed, 103 insertions(+), 3 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 357f36e..025d8a9 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -358,11 +358,52 @@ void wait_for_device_probe(void) } EXPORT_SYMBOL_GPL(wait_for_device_probe); +struct async_domain_structure { + /* Data Elements */ + struct async_domain domain; + unsigned int domain_id; + unsigned int domain_members; + struct completion domain_order; + char *last_drv; + /* List Link */ + struct list_head list; +}; + struct stupid_thread_structure { struct device_driver *drv; struct device *dev; + struct async_domain_structure *domain; + unsigned int wait:1; }; +static LIST_HEAD(async_domain_list); + +/* Create the domain list based on the ids, the same ids share the same domain. */ +struct async_domain_structure *get_async_domain(unsigned int domain_id) +{ + struct async_domain_structure *domain; + + /* Check if our list exist, If exist, return it */ + if (!list_empty(&async_domain_list)) { + list_for_each_entry(domain, &async_domain_list, list) + if (domain->domain_id == domain_id) { + domain->domain_members++; + return domain; + } + } + + /* If not exist, add a new one */ + domain = kzalloc(sizeof(struct async_domain_structure), GFP_KERNEL); + domain->domain_id = domain_id; + domain->domain_members = 1; + init_completion(&domain->domain_order); + INIT_LIST_HEAD(&domain->domain.pending); + domain->domain.registered = 0; + list_add(&domain->list, &async_domain_list); + + return domain; +} + /** * driver_probe_device - attempt to bind device & driver together * @drv: driver to bind a device to @@ -379,12 +420,57 @@ static void __driver_probe_device(void *void_data, async_cookie_t cookie) struct stupid_thread_structure *data = void_data; struct device_driver *drv = data->drv; struct device *dev = data->dev; + struct async_domain_structure *domain = data->domain; + unsigned int wait = data->wait; + + /* Wait for the previous one */ + if (wait) { + pr_info("%s: %s: wait for %s\n", __func__, drv->name, (char *)domain->last_drv); + wait_for_completion_interruptible(&domain->domain_order); + } + + if (domain) + domain->last_drv = (char *)drv->name; pm_runtime_barrier(dev); really_probe(dev, drv); pm_request_idle(dev); kfree(data); + + /* Finish */ + if (domain) { + pr_info("%s: %s: complete device probing\n", __func__, drv->name); + complete(&domain->domain_order); + } +} + +void wait_for_async_probe_in_domain(void) +{ + struct async_domain_structure *domain; + + pr_info("%s: Wait for all asynced probe devices\n", __func__); + list_for_each_entry(domain, &async_domain_list, list) { + pr_debug("%s: Wait for domain %d\n", __func__, domain->domain_id); + async_synchronize_full_domain(&domain->domain);
[PATCH 1/2] async: async device driver probing
From: Wu Zhangjin [*Note*: NOT applicable, only for comments.] To async the slow driver probing function of some devices, the device probing support is modified to support async scheduling. In order to async your driver probing function, please mask the async_probe flag to 1, and to make sure one asynced probing is executed before an specified point, please call async_synchronize_full() in that point.. Usage: static struct i2c_driver test_driver = { .driver = { .name = TEST_DEV_NAME, .owner = THIS_MODULE, + .async_probe = 1, }, Signed-off-by: Wu Zhangjin --- drivers/base/dd.c | 36 +--- include/linux/device.h |2 ++ 2 files changed, 35 insertions(+), 3 deletions(-) diff --git a/drivers/base/dd.c b/drivers/base/dd.c index 0605176..357f36e 100644 --- a/drivers/base/dd.c +++ b/drivers/base/dd.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -357,6 +358,11 @@ void wait_for_device_probe(void) } EXPORT_SYMBOL_GPL(wait_for_device_probe); +struct stupid_thread_structure { + struct device_driver *drv; + struct device *dev; +}; + /** * driver_probe_device - attempt to bind device & driver together * @drv: driver to bind a device to @@ -368,8 +374,23 @@ EXPORT_SYMBOL_GPL(wait_for_device_probe); * This function must be called with @dev lock held. When called for a * USB interface, @dev->parent lock must be held as well. */ +static void __driver_probe_device(void *void_data, async_cookie_t cookie) +{ + struct stupid_thread_structure *data = void_data; + struct device_driver *drv = data->drv; + struct device *dev = data->dev; + + pm_runtime_barrier(dev); + really_probe(dev, drv); + pm_request_idle(dev); + + kfree(data); +} + int driver_probe_device(struct device_driver *drv, struct device *dev) { + struct stupid_thread_structure *data; + async_cookie_t cookie; int ret = 0; if (!device_is_registered(dev)) @@ -378,9 +399,18 @@ int driver_probe_device(struct device_driver *drv, struct device *dev) pr_debug("bus: '%s': %s: matched device %s with driver %s\n", drv->bus->name, __func__, dev_name(dev), drv->name); - pm_runtime_barrier(dev); - ret = really_probe(dev, drv); - pm_request_idle(dev); + if (drv->async_probe) { + data = kmalloc(sizeof(*data), GFP_KERNEL); + data->drv = drv; + data->dev = dev; + + cookie = async_schedule(__driver_probe_device, data); + pr_info("%s: async call %s driver, cookie is %llu\n", __func__, drv->name, cookie); + } else { + pm_runtime_barrier(dev); + ret = really_probe(dev, drv); + pm_request_idle(dev); + } return ret; } diff --git a/include/linux/device.h b/include/linux/device.h index 952b010..f39ee48 100644 --- a/include/linux/device.h +++ b/include/linux/device.h @@ -247,6 +247,8 @@ struct device_driver { const struct dev_pm_ops *pm; struct driver_private *p; + + unsigned int async_probe:1; }; -- 1.7.10.4 -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCHv2] ibmveth: Fix off-by-one error in ibmveth_change_mtu()
On 04/20/2015 08:07 PM, David Gibson wrote: > AFAIK the PAPR document which defines the virtual device interface used by > the ibmveth driver doesn't specify a specific maximum MTU. So, in the > ibmveth driver, the maximum allowed MTU is determined by the maximum > allocated buffer size of 64k (corresponding to one page in the common case) > minus the per-buffer overhead IBMVETH_BUFF_OH (which has value 22 for 14 > bytes of ethernet header, plus 8 bytes for an opaque handle). > > This suggests a maximum allowable MTU of 65514 bytes, but in fact the > driver only permits a maximum MTU of 65513. This is because there is a < > instead of an <= in ibmveth_change_mtu(), which only permits an MTU which > is strictly smaller than the buffer size, rather than allowing the buffer > to be completely filled. > > This patch fixes the buglet. Thanks! Acked-by: Thomas Falcon > > Signed-off-by: David Gibson 1 > --- > drivers/net/ethernet/ibm/ibmveth.c | 4 ++-- > 1 file changed, 2 insertions(+), 2 deletions(-) > > Changes since v1: > * Fixed a second instance of the same off-by-one error. Thanks to >Thomas Falcon for spotting this. > > diff --git a/drivers/net/ethernet/ibm/ibmveth.c > b/drivers/net/ethernet/ibm/ibmveth.c > index cd7675a..1813476 100644 > --- a/drivers/net/ethernet/ibm/ibmveth.c > +++ b/drivers/net/ethernet/ibm/ibmveth.c > @@ -1238,7 +1238,7 @@ static int ibmveth_change_mtu(struct net_device *dev, > int new_mtu) > return -EINVAL; > > for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) > - if (new_mtu_oh < adapter->rx_buff_pool[i].buff_size) > + if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size) > break; > > if (i == IBMVETH_NUM_BUFF_POOLS) > @@ -1257,7 +1257,7 @@ static int ibmveth_change_mtu(struct net_device *dev, > int new_mtu) > for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) { > adapter->rx_buff_pool[i].active = 1; > > - if (new_mtu_oh < adapter->rx_buff_pool[i].buff_size) { > + if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size) { > dev->mtu = new_mtu; > vio_cmo_set_dev_desired(viodev, > ibmveth_get_desired_dma -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [PATCH] ibmveth: Fix off-by-one error in ibmveth_change_mtu()
On 04/13/2015 12:39 AM, David Gibson wrote: > AFAIK the PAPR document which defines the virtual device interface used by > the ibmveth driver doesn't specify a specific maximum MTU. So, in the > ibmveth driver, the maximum allowed MTU is determined by the maximum > allocated buffer size of 64k (corresponding to one page in the common case) > minus the per-buffer overhead IBMVETH_BUFF_OH (which has value 22 for 14 > bytes of ethernet header, plus 8 bytes for an opaque handle). > > This suggests a maximum allowable MTU of 65514 bytes, but in fact the > driver only permits a maximum MTU of 65513. This is because there is a < > instead of an <= in ibmveth_change_mtu(), which only permits an MTU which > is strictly smaller than the buffer size, rather than allowing the buffer > to be completely filled. > > This patch fixes the buglet. The same expression is made using < just a few lines above. Shouldn't this be changed to <= too? @@ -1238,7 +1238,7 @@ static int ibmveth_change_mtu(struct net_device *dev, int new_mtu) return -EINVAL; for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) - if (new_mtu_oh < adapter->rx_buff_pool[i].buff_size) + if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size) break; if (i == IBMVETH_NUM_BUFF_POOLS) Thanks, Tom > Signed-off-by: David Gibson > --- > drivers/net/ethernet/ibm/ibmveth.c | 2 +- > 1 file changed, 1 insertion(+), 1 deletion(-) > > diff --git a/drivers/net/ethernet/ibm/ibmveth.c > b/drivers/net/ethernet/ibm/ibmveth.c > index cd7675a..3aa280a 100644 > --- a/drivers/net/ethernet/ibm/ibmveth.c > +++ b/drivers/net/ethernet/ibm/ibmveth.c > @@ -1257,7 +1257,7 @@ static int ibmveth_change_mtu(struct net_device *dev, > int new_mtu) > for (i = 0; i < IBMVETH_NUM_BUFF_POOLS; i++) { > adapter->rx_buff_pool[i].active = 1; > > - if (new_mtu_oh < adapter->rx_buff_pool[i].buff_size) { > + if (new_mtu_oh <= adapter->rx_buff_pool[i].buff_size) { > dev->mtu = new_mtu; > vio_cmo_set_dev_desired(viodev, > ibmveth_get_desired_dma -- Tom Falcon Software Engineer IBM LTC Austin -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: linux-next: manual merge of the net-next tree with the net tree
On 12/1/20 7:20 PM, Stephen Rothwell wrote: Hi all, Today's linux-next merge of the net-next tree got a conflict in: drivers/net/ethernet/ibm/ibmvnic.c between commit: b71ec9522346 ("ibmvnic: Ensure that SCRQ entry reads are correctly ordered") from the net tree and commit: ec20f36bb41a ("ibmvnic: Correctly re-enable interrupts in NAPI polling routine") from the net-next tree. I fixed it up (see below) and can carry the fix as necessary. This is now fixed as far as linux-next is concerned, but any non trivial conflicts should be mentioned to your upstream maintainer when your tree is submitted for merging. You may also want to consider cooperating with the maintainer of the conflicting tree to minimise any particularly complex conflicts. Hi, Stephen, thank you for fixing that conflict. Sorry for the inconvenience.
Re: [PATCH net-next] ibmveth: v1 calculate correct gso_size and set gso_type
On 10/25/2016 07:09 PM, Jon Maxwell wrote: > We recently encountered a bug where a few customers using ibmveth on the > same LPAR hit an issue where a TCP session hung when large receive was > enabled. Closer analysis revealed that the session was stuck because the > one side was advertising a zero window repeatedly. > > We narrowed this down to the fact the ibmveth driver did not set gso_size > which is translated by TCP into the MSS later up the stack. The MSS is > used to calculate the TCP window size and as that was abnormally large, > it was calculating a zero window, even although the sockets receive buffer > was completely empty. > > We were able to reproduce this and worked with IBM to fix this. Thanks Tom > and Marcelo for all your help and review on this. > > The patch fixes both our internal reproduction tests and our customers tests. > > Signed-off-by: Jon Maxwell Thanks, Jon. Acked-by: Thomas Falcon > --- > drivers/net/ethernet/ibm/ibmveth.c | 20 > 1 file changed, 20 insertions(+) > > diff --git a/drivers/net/ethernet/ibm/ibmveth.c > b/drivers/net/ethernet/ibm/ibmveth.c > index 29c05d0..c51717e 100644 > --- a/drivers/net/ethernet/ibm/ibmveth.c > +++ b/drivers/net/ethernet/ibm/ibmveth.c > @@ -1182,6 +1182,8 @@ static int ibmveth_poll(struct napi_struct *napi, int > budget) > int frames_processed = 0; > unsigned long lpar_rc; > struct iphdr *iph; > + bool large_packet = 0; > + u16 hdr_len = ETH_HLEN + sizeof(struct tcphdr); > > restart_poll: > while (frames_processed < budget) { > @@ -1236,10 +1238,28 @@ static int ibmveth_poll(struct napi_struct *napi, int > budget) > iph->check = 0; > iph->check = > ip_fast_csum((unsigned char *)iph, iph->ihl); > adapter->rx_large_packets++; > + large_packet = 1; > } > } > } > > + if (skb->len > netdev->mtu) { > + iph = (struct iphdr *)skb->data; > + if (be16_to_cpu(skb->protocol) == ETH_P_IP && > + iph->protocol == IPPROTO_TCP) { > + hdr_len += sizeof(struct iphdr); > + skb_shinfo(skb)->gso_type = > SKB_GSO_TCPV4; > + skb_shinfo(skb)->gso_size = netdev->mtu > - hdr_len; > + } else if (be16_to_cpu(skb->protocol) == > ETH_P_IPV6 && > +iph->protocol == IPPROTO_TCP) { > + hdr_len += sizeof(struct ipv6hdr); > + skb_shinfo(skb)->gso_type = > SKB_GSO_TCPV6; > + skb_shinfo(skb)->gso_size = netdev->mtu > - hdr_len; > + } > + if (!large_packet) > + adapter->rx_large_packets++; > + } > + > napi_gro_receive(napi, skb);/* send it up */ > > netdev->stats.rx_packets++;
Re: [PATCH net-next] ibmveth: v1 calculate correct gso_size and set gso_type
On 10/27/2016 10:26 AM, Eric Dumazet wrote: > On Wed, 2016-10-26 at 11:09 +1100, Jon Maxwell wrote: >> We recently encountered a bug where a few customers using ibmveth on the >> same LPAR hit an issue where a TCP session hung when large receive was >> enabled. Closer analysis revealed that the session was stuck because the >> one side was advertising a zero window repeatedly. >> >> We narrowed this down to the fact the ibmveth driver did not set gso_size >> which is translated by TCP into the MSS later up the stack. The MSS is >> used to calculate the TCP window size and as that was abnormally large, >> it was calculating a zero window, even although the sockets receive buffer >> was completely empty. >> >> We were able to reproduce this and worked with IBM to fix this. Thanks Tom >> and Marcelo for all your help and review on this. >> >> The patch fixes both our internal reproduction tests and our customers tests. >> >> Signed-off-by: Jon Maxwell >> --- >> drivers/net/ethernet/ibm/ibmveth.c | 20 >> 1 file changed, 20 insertions(+) >> >> diff --git a/drivers/net/ethernet/ibm/ibmveth.c >> b/drivers/net/ethernet/ibm/ibmveth.c >> index 29c05d0..c51717e 100644 >> --- a/drivers/net/ethernet/ibm/ibmveth.c >> +++ b/drivers/net/ethernet/ibm/ibmveth.c >> @@ -1182,6 +1182,8 @@ static int ibmveth_poll(struct napi_struct *napi, int >> budget) >> int frames_processed = 0; >> unsigned long lpar_rc; >> struct iphdr *iph; >> +bool large_packet = 0; >> +u16 hdr_len = ETH_HLEN + sizeof(struct tcphdr); >> >> restart_poll: >> while (frames_processed < budget) { >> @@ -1236,10 +1238,28 @@ static int ibmveth_poll(struct napi_struct *napi, >> int budget) >> iph->check = 0; >> iph->check = >> ip_fast_csum((unsigned char *)iph, iph->ihl); >> adapter->rx_large_packets++; >> +large_packet = 1; >> } >> } >> } >> >> +if (skb->len > netdev->mtu) { >> +iph = (struct iphdr *)skb->data; >> +if (be16_to_cpu(skb->protocol) == ETH_P_IP && >> +iph->protocol == IPPROTO_TCP) { >> +hdr_len += sizeof(struct iphdr); >> +skb_shinfo(skb)->gso_type = >> SKB_GSO_TCPV4; >> +skb_shinfo(skb)->gso_size = netdev->mtu >> - hdr_len; >> +} else if (be16_to_cpu(skb->protocol) == >> ETH_P_IPV6 && >> + iph->protocol == IPPROTO_TCP) { >> +hdr_len += sizeof(struct ipv6hdr); >> +skb_shinfo(skb)->gso_type = >> SKB_GSO_TCPV6; >> +skb_shinfo(skb)->gso_size = netdev->mtu >> - hdr_len; >> +} >> +if (!large_packet) >> +adapter->rx_large_packets++; >> +} >> + >> > This might break forwarding and PMTU discovery. > > You force gso_size to device mtu, regardless of real MSS used by the TCP > sender. > > Don't you have the MSS provided in RX descriptor, instead of guessing > the value ? > > > The MSS is not always available unfortunately, so this is the best solution there is at the moment.