[Devel] [PATCH RHEL8 COMMIT] vdso: fix VM_BUG_ON_PAGE(PageSlab(page)) on unmap
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh8-4.18.0-240.1.1.vz8.5.1 --> commit eac26b43815a2f0ebc2128f9161097bed17bdc71 Author: Andrey Ryabinin Date: Tue Dec 15 20:12:18 2020 +0300 vdso: fix VM_BUG_ON_PAGE(PageSlab(page)) on unmap vdso_data is mapped to userspace which means that we can't use kmalloc() to allocate it. Kmalloc() doesn't even guarantee that we will get page aligned memory. kernel BUG at include/linux/mm.h:693! RIP: 0010:unmap_page_range+0x15f2/0x2630 Call Trace: unmap_vmas+0x11e/0x1d0 exit_mmap+0x215/0x420 mmput+0x10a/0x400 do_exit+0x98f/0x2d00 do_group_exit+0xec/0x2b0 __x64_sys_exit_group+0x3a/0x50 do_syscall_64+0xa5/0x4d0 entry_SYSCALL_64_after_hwframe+0x6a/0xdf Use alloc_pages_exact() to allocate it. We can't use alloc_pages(), or __get_free_pages() here since vdso_fault() need to perform get_page() on individual sub-pages and alloc_pages() doesn't initalize sub-pages. https://jira.sw.ru/browse/PSBM-123551 Signed-off-by: Andrey Ryabinin --- kernel/ve/ve.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index b114e2918bb7..0c6630c6616a 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -568,7 +568,7 @@ static int copy_vdso(struct vdso_image **vdso_dst, const struct vdso_image *vdso if (!vdso) return -ENOMEM; - vdso_data = kmalloc(vdso_src->size, GFP_KERNEL); + vdso_data = alloc_pages_exact(vdso_src->size, GFP_KERNEL); if (!vdso_data) { kfree(vdso); return -ENOMEM; @@ -585,11 +585,11 @@ static int copy_vdso(struct vdso_image **vdso_dst, const struct vdso_image *vdso static void ve_free_vdso(struct ve_struct *ve) { if (ve->vdso_64 && ve->vdso_64 != _image_64) { - kfree(ve->vdso_64->data); + free_pages_exact(ve->vdso_64->data, ve->vdso_64->size); kfree(ve->vdso_64); } if (ve->vdso_32 && ve->vdso_32 != _image_32) { - kfree(ve->vdso_32->data); + free_pages_exact(ve->vdso_32->data, ve->vdso_32->size); 
kfree(ve->vdso_32); } } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL8 COMMIT] ploop: Zero tail of tail page
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh8-4.18.0-240.1.1.vz8.5.1 --> commit 0497d745e201c4eb6f894c87afb55044f075708d Author: Kirill Tkhai Date: Tue Dec 15 20:12:18 2020 +0300 ploop: Zero tail of tail page In case of BAT ends in middle of page, zero its tail. Otherwise, garbage is there. https://jira.sw.ru/browse/PSBM-123639 Signed-off-by: Kirill Tkhai --- drivers/md/dm-ploop-bat.c | 5 + drivers/md/dm-ploop.h | 1 + 2 files changed, 6 insertions(+) diff --git a/drivers/md/dm-ploop-bat.c b/drivers/md/dm-ploop-bat.c index d6b687806118..da18dd2e4638 100644 --- a/drivers/md/dm-ploop-bat.c +++ b/drivers/md/dm-ploop-bat.c @@ -168,6 +168,11 @@ static int ploop_read_bat(struct ploop *ploop, struct bio *bio) from = kmap(bio->bi_io_vec[page].bv_page); memcpy(to, from, nr_copy * sizeof(map_index_t)); kunmap(bio->bi_io_vec[page].bv_page); + if (unlikely(nr_copy < BAT_ENTRIES_PER_PAGE)) { + memset(from + nr_copy, 0, sizeof(map_index_t) * + (BAT_ENTRIES_PER_PAGE - nr_copy)); + } + ret = parse_bat_entries(ploop, to, md->bat_levels, nr_copy, id); kunmap(md->page); diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h index dc5362f01e21..a025df4bf328 100644 --- a/drivers/md/dm-ploop.h +++ b/drivers/md/dm-ploop.h @@ -6,6 +6,7 @@ #define PLOOP_MAP_OFFSET 16 typedef u32 map_index_t; +#define BAT_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(map_index_t)) #define SIGNATURE_DISK_IN_USE 0x746F6E59 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH8] dm-ploop: Skip zero writes to unallocated clusters
Sometimes this may save some space... https://jira.sw.ru/browse/PSBM-123748 Signed-off-by: Kirill Tkhai --- drivers/md/dm-ploop-map.c | 24 1 file changed, 24 insertions(+) diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c index 4b12b5fc082a..f193b25cbd28 100644 --- a/drivers/md/dm-ploop-map.c +++ b/drivers/md/dm-ploop-map.c @@ -397,6 +397,27 @@ static void maybe_unlink_completed_bio(struct ploop *ploop, struct bio *bio) queue_work(ploop->wq, &ploop->worker); } +static bool bio_endio_if_all_zeros(struct bio *bio) +{ + struct bvec_iter bi = { + .bi_size = bio->bi_iter.bi_size, + }; + struct bio_vec bv; + void *data, *ret; + + for_each_bvec(bv, bio->bi_io_vec, bi, bi) { + data = kmap(bv.bv_page); + ret = memchr_inv(data + bv.bv_offset, 0, bv.bv_len); + kunmap(bv.bv_page); + if (ret) + return false; + } + + bio->bi_status = BLK_STS_OK; + bio_endio(bio); + return true; +} + static void handle_discard_bio(struct ploop *ploop, struct bio *bio, unsigned int cluster, unsigned int dst_cluster) { @@ -1326,6 +1347,9 @@ static int process_one_deferred_bio(struct ploop *ploop, struct bio *bio, goto out; } + if (unlikely(bio_endio_if_all_zeros(bio))) + goto out; + /* Cluster exists nowhere. Allocate it and setup bio as outrunning */ ret = locate_new_cluster_and_attach_bio(ploop, piwb, cluster, &dst_cluster, bio); ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH vz8] vdso: fix VM_BUG_ON_PAGE(PageSlab(page)) on unmap
vdso_data is mapped to userspace which means that we can't use kmalloc() to allocate it. Kmalloc() doesn't even guarantee that we will get page aligned memory. kernel BUG at include/linux/mm.h:693! RIP: 0010:unmap_page_range+0x15f2/0x2630 Call Trace: unmap_vmas+0x11e/0x1d0 exit_mmap+0x215/0x420 mmput+0x10a/0x400 do_exit+0x98f/0x2d00 do_group_exit+0xec/0x2b0 __x64_sys_exit_group+0x3a/0x50 do_syscall_64+0xa5/0x4d0 entry_SYSCALL_64_after_hwframe+0x6a/0xdf Use alloc_pages_exact() to allocate it. We can't use alloc_pages(), or __get_free_pages() here since vdso_fault() needs to perform get_page() on individual sub-pages and alloc_pages() doesn't initialize sub-pages. https://jira.sw.ru/browse/PSBM-123551 Signed-off-by: Andrey Ryabinin --- kernel/ve/ve.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c index b114e2918bb7..0c6630c6616a 100644 --- a/kernel/ve/ve.c +++ b/kernel/ve/ve.c @@ -568,7 +568,7 @@ static int copy_vdso(struct vdso_image **vdso_dst, const struct vdso_image *vdso if (!vdso) return -ENOMEM; - vdso_data = kmalloc(vdso_src->size, GFP_KERNEL); + vdso_data = alloc_pages_exact(vdso_src->size, GFP_KERNEL); if (!vdso_data) { kfree(vdso); return -ENOMEM; @@ -585,11 +585,11 @@ static void ve_free_vdso(struct ve_struct *ve) { if (ve->vdso_64 && ve->vdso_64 != &vdso_image_64) { - kfree(ve->vdso_64->data); + free_pages_exact(ve->vdso_64->data, ve->vdso_64->size); kfree(ve->vdso_64); } if (ve->vdso_32 && ve->vdso_32 != &vdso_image_32) { - kfree(ve->vdso_32->data); + free_pages_exact(ve->vdso_32->data, ve->vdso_32->size); kfree(ve->vdso_32); } } -- 2.26.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/KVM: x86: reinstate vendor-agnostic check on SPEC_CTRL cpuid bits #PSBM-120787 #PSBM-123538 #PSBM-121767
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit b1ecfc303d3e53654c7d440c4214844ee6e69d78 Author: Paolo Bonzini Date: Tue Dec 15 12:31:37 2020 +0300 ms/KVM: x86: reinstate vendor-agnostic check on SPEC_CTRL cpuid bits #PSBM-120787 #PSBM-123538 #PSBM-121767 Until commit e7c587da1252 ("x86/speculation: Use synthetic bits for IBRS/IBPB/STIBP", 2018-05-17), KVM was testing both Intel and AMD CPUID bits before allowing the guest to write MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD. Testing only Intel bits on VMX processors, or only AMD bits on SVM processors, fails if the guests are created with the "opposite" vendor as the host. While at it, also tweak the host CPU check to use the vendor-agnostic feature bit X86_FEATURE_IBPB, since we only care about the availability of the MSR on the host here and not about specific CPUID bits. mFixes: e7c587da1252 ("x86/speculation: Use synthetic bits for IBRS/IBPB/STIBP") Cc: sta...@vger.kernel.org Reported-by: Denis V. Lunev Signed-off-by: Paolo Bonzini https://jira.sw.ru/browse/PSBM-123538 The patch is a replacement for below vz patch, which has been reverted due to https://jira.sw.ru/browse/PSBM-121767 commit 39d637ddbcf876f897e01c737bbb351461921df0 Author: Denis V. Lunev Date: Wed Oct 28 19:25:57 2020 +0300 kvm: fix AMD IBRS/IBPB/STIBP/SSBD reporting #PSBM-120787 We should report these bits in 8008 EBX on AMD only, i.e. when AMD specific feature bits are enabled. https://jira.sw.ru/browse/PSBM-120787 Signed-off-by: Denis V. 
Lunev CC: Vasily Averin CC: Konstantin Khorenko Port to vz7 note: in vz7 both functions svm_set_msr() and vmx_set_msr() did not have checks for !boot_cpu_has(X86_FEATURE_AMD_IBPB) and !boot_cpu_has(X86_FEATURE_SPEC_CTRL) Signed-off-by: Konstantin Khorenko --- arch/x86/kvm/svm.c | 3 +++ arch/x86/kvm/vmx.c | 10 -- 2 files changed, 11 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 5715da2..3a04bd0 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -3881,11 +3881,14 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr) break; case MSR_IA32_PRED_CMD: if (!msr->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) return 1; if (data & ~PRED_CMD_IBPB) return 1; + if (!boot_cpu_has(X86_FEATURE_IBPB)) + return 1; if (!data) break; diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c index f43b2db..a120208 100644 --- a/arch/x86/kvm/vmx.c +++ b/arch/x86/kvm/vmx.c @@ -3350,7 +3350,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) break; case MSR_IA32_SPEC_CTRL: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD)) return 1; /* The STIBP bit doesn't fault even if it's not advertised */ @@ -3387,11 +3390,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) goto find_shared_msr; case MSR_IA32_PRED_CMD: if (!msr_info->host_initiated && - !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL)) + !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) && + !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB)) return 1; if (data & ~PRED_CMD_IBPB) return 1; + if (!boot_cpu_has(X86_FEATURE_IBPB)) + return 1; if (!data) break; ___ Devel mailing list Devel@openvz.org 
https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ve/device_cgroup: show all devices allowed in ct to fool docker
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit 510bede663a9018c86c2b45ede7c5e26f6f8 Author: Pavel Tikhomirov Date: Tue Dec 15 12:30:55 2020 +0300 ve/device_cgroup: show all devices allowed in ct to fool docker We've seen that docker 20+ not only writes "a *:* rwm" to privileged docker container device-cgroup (as pre-19 version did) but also checks the content after write, and docker expects that all devices are allowed for privileged docker container. In our VZCT we obviously can't afford to actually allow all devices because root device cgroup of VZCT should restrict which devices are allowed to be read/modified/mknod in VZCT and which are not, and all nested cgroup inherit this. Before the patch reading devices list in VZCT one would see a whitelist there each allowed device is present: CT-101 /# cat /sys/fs/cgroup/devices/test/devices.list ... c 1:11 rwm c 10:200 rwm c 10:235 rwm c 10:229 rwm b 182:177568 rm b 182:177569 rm Docker expects to see "a *:* rwm" as if docker is on bare host and nobody touched device cgroup before that. As a solution we can just show docker what he wants. The idea is to detect if the content of the whitelist of the device cgroup to be shown is equal to the content of the whitelist of the root device cgroup of the VZCT, then always show "a *:* rwm". CT-101 /# cat /sys/fs/cgroup/devices/test/devices.list a *:* rwm If one changes the whitelist (even reorder) this cgroup would show a full list of all allowed devices as before. This change of the output looks consistent enough: when you see "a *:* rwm" in your cgroup it means that all devices of your VZCT are available for you. Only difference to mainstream behaviour is when you prohibit some device via devices.deny you get not a blacklist but an inverse whitelist. 
FIXME: we have a problem here because this approach does not survive container migration as devices cgroup c/r looks broken: https://jira.sw.ru/browse/PSBM-123668 https://jira.sw.ru/browse/PSBM-123630 Signed-off-by: Pavel Tikhomirov --- include/linux/cgroup.h | 1 + security/device_cgroup.c | 48 2 files changed, 49 insertions(+) diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h index 645c9fd..ac255e4 100644 --- a/include/linux/cgroup.h +++ b/include/linux/cgroup.h @@ -673,6 +673,7 @@ void cgroup_release_agent(struct work_struct *work); #ifdef CONFIG_VE int cgroup_mark_ve_roots(struct ve_struct *ve); void cgroup_unmark_ve_roots(struct ve_struct *ve); +struct cgroup *cgroup_get_local_root(struct cgroup *cgrp); struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp); #endif diff --git a/security/device_cgroup.c b/security/device_cgroup.c index d980020..f9d205f 100644 --- a/security/device_cgroup.c +++ b/security/device_cgroup.c @@ -304,12 +304,41 @@ static void set_majmin(char *str, unsigned m) sprintf(str, "%u", m); } +struct dev_exception_item *dev_exeption_next(struct list_head *head) +{ + return list_entry_rcu(head->next, struct dev_exception_item, list); +} + +static bool dev_exceptions_equal(struct list_head *first, struct list_head *second) +{ + struct dev_exception_item *exf, *exs; + + for (exf = dev_exeption_next(first->next), +exs = dev_exeption_next(second->next); +>list != first && >list != second; +exf = dev_exeption_next(exf->list.next), +exs = dev_exeption_next(exs->list.next)) { + /* Check that exceptions are equal */ + if (exf->type != exs->type || + exf->major != exs->major || + exf->minor != exs->minor || + exf->access != exs->access) + return false; + } + + if (>list != first || >list != second) + return false; + + return true; +} + static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, struct seq_file *m) { struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup); struct dev_exception_item *ex; char maj[MAJMINLEN], 
min[MAJMINLEN], acc[ACCLEN]; + struct dev_cgroup *root_cgrp; rcu_read_lock(); /* @@ -325,6 +354,25 @@ static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft, seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL), maj, min, acc); } else { + /* +* Fooling docker in CT again: if exceptions in ve are the same +* as in ve root cgroup - show as if we allow everything +
[Devel] [PATCH RHEL7 COMMIT] ms/net: rtnetlink: validate IFLA_MTU attribute in rtnl_create_link()
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit 1efe3f91436dea98a9e9c97bc4b0d96d20f34e4c Author: Eric Dumazet Date: Tue Dec 15 12:29:56 2020 +0300 ms/net: rtnetlink: validate IFLA_MTU attribute in rtnl_create_link() rtnl_create_link() needs to apply dev->min_mtu and dev->max_mtu checks that we apply in do_setlink() Otherwise malicious users can crash the kernel, for example after an integer overflow : BUG: KASAN: use-after-free in memset include/linux/string.h:365 [inline] BUG: KASAN: use-after-free in __alloc_skb+0x37b/0x5e0 net/core/skbuff.c:238 Write of size 32 at addr 88819f20b9c0 by task swapper/0/0 CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.5.0-rc1-syzkaller #0 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 Call Trace: __dump_stack lib/dump_stack.c:77 [inline] dump_stack+0x197/0x210 lib/dump_stack.c:118 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 __kasan_report.cold+0x1b/0x41 mm/kasan/report.c:506 kasan_report+0x12/0x20 mm/kasan/common.c:639 check_memory_region_inline mm/kasan/generic.c:185 [inline] check_memory_region+0x134/0x1a0 mm/kasan/generic.c:192 memset+0x24/0x40 mm/kasan/common.c:108 memset include/linux/string.h:365 [inline] __alloc_skb+0x37b/0x5e0 net/core/skbuff.c:238 alloc_skb include/linux/skbuff.h:1049 [inline] alloc_skb_with_frags+0x93/0x590 net/core/skbuff.c:5664 sock_alloc_send_pskb+0x7ad/0x920 net/core/sock.c:2242 sock_alloc_send_skb+0x32/0x40 net/core/sock.c:2259 mld_newpack+0x1d7/0x7f0 net/ipv6/mcast.c:1609 add_grhead.isra.0+0x299/0x370 net/ipv6/mcast.c:1713 add_grec+0x7db/0x10b0 net/ipv6/mcast.c:1844 mld_send_cr net/ipv6/mcast.c:1970 [inline] mld_ifc_timer_expire+0x3d3/0x950 net/ipv6/mcast.c:2477 call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1404 expire_timers kernel/time/timer.c:1449 [inline] __run_timers kernel/time/timer.c:1773 
[inline] __run_timers kernel/time/timer.c:1740 [inline] run_timer_softirq+0x6c3/0x1790 kernel/time/timer.c:1786 __do_softirq+0x262/0x98c kernel/softirq.c:292 invoke_softirq kernel/softirq.c:373 [inline] irq_exit+0x19b/0x1e0 kernel/softirq.c:413 exiting_irq arch/x86/include/asm/apic.h:536 [inline] smp_apic_timer_interrupt+0x1a3/0x610 arch/x86/kernel/apic/apic.c:1137 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:829 RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61 Code: 98 6b ea f9 eb 8a cc cc cc cc cc cc e9 07 00 00 00 0f 00 2d 44 1c 60 00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 34 1c 60 00 fb f4 cc 55 48 89 e5 41 57 41 56 41 55 41 54 53 e8 4e 5d 9a f9 e8 79 RSP: 0018:89807ce8 EFLAGS: 0286 ORIG_RAX: ff13 RAX: 113266ae RBX: 8987a1c0 RCX: RDX: dc00 RSI: 0006 RDI: 8987aa54 RBP: 89807d18 R08: 8987a1c0 R09: R10: R11: R12: dc00 R13: 8a799980 R14: R15: arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:690 default_idle_call+0x84/0xb0 kernel/sched/idle.c:94 cpuidle_idle_call kernel/sched/idle.c:154 [inline] do_idle+0x3c8/0x6e0 kernel/sched/idle.c:269 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:361 rest_init+0x23b/0x371 init/main.c:451 arch_call_rest_init+0xe/0x1b start_kernel+0x904/0x943 init/main.c:784 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490 x86_64_start_kernel+0x77/0x7b arch/x86/kernel/head64.c:471 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:242 The buggy address belongs to the page: page:ea00067c82c0 refcount:0 mapcount:0 mapping: index:0x0 raw: 057ffe00 ea00067c82c8 ea00067c82c8 raw: page dumped because: kasan: bad access detected Memory state around the buggy address: 88819f20b880: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 88819f20b900: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff >88819f20b980: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ^ 88819f20ba00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff 88819f20ba80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff Fixes: 61e84623ace3 ("net: 
centralize net_device min/max MTU checking") Signed-off-by: Eric Dumazet Reported-by: syzbot Signed-off-by: David S. Miller (cherry-picked from commit
[Devel] [PATCH RHEL7 COMMIT] ve/net/core: allow to call setsockopt(SO_RCVBUFFORCE) from Containers
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit bd3e846de350fcff5cfdcd6133adb7c610b3a4af Author: Konstantin Khorenko Date: Tue Dec 15 12:30:04 2020 +0300 ve/net/core: allow to call setsockopt(SO_RCVBUFFORCE) from Containers "nft" util (in CentOS 8 environment) does use setsockopt(SO_RCVBUFFORCE) unconditionally, so we have to allow it from inside a Container. At the same time we don't want to allow a Container to set too much memory for a socket, so just threat SO_RCVBUFFORCE like SO_RCVBUF if called inside a Container. Simple rule to test: # NFT=/usr/sbin/nft ./run-tests.sh -v -g testcases/nft-f/0011manydefines_0 which fails inside a Container because of not enough rcb buffer because of failed setsockopt(3, SOL_SOCKET, SO_RCVBUFFORCE, [10561584], 4) = -1 EPERM (Operation not permitted) Signed-off-by: Konstantin Khorenko --- net/core/sock.c | 7 ++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/net/core/sock.c b/net/core/sock.c index 07ea42f..44e91c8 100644 --- a/net/core/sock.c +++ b/net/core/sock.c @@ -793,6 +793,7 @@ set_sndbuf: goto set_sndbuf; case SO_RCVBUF: +unpriv_rcvbuf: /* Don't error on this BSD doesn't and if you think * about it this is right. Otherwise apps have to * play 'guess the biggest size' games. RCVBUF/SNDBUF @@ -824,11 +825,15 @@ set_rcvbuf: break; case SO_RCVBUFFORCE: - if (!capable(CAP_NET_ADMIN)) { + if (!ve_capable(CAP_NET_ADMIN)) { ret = -EPERM; break; } + /* nft utility uses this sockopt in CentOS 8 env */ + if (!ve_is_super(get_exec_env())) + goto unpriv_rcvbuf; + /* No negative values (to prevent underflow, as val will be * multiplied by 2). */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/net: drop bogus skb with CHECKSUM_PARTIAL and offset beyond end of trimmed packet
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit dd590cd4c2f3c21197754f40d9f67165a7470dc5 Author: Vasily Averin Date: Tue Dec 15 12:24:50 2020 +0300 ms/net: drop bogus skb with CHECKSUM_PARTIAL and offset beyond end of trimmed packet syzbot reproduces BUG_ON in skb_checksum_help(): tun creates (bogus) skb with huge partial-checksummed area and small ip packet inside. Then ip_rcv trims the skb based on size of internal ip packet, after that csum offset points beyond of trimmed skb. Then checksum_tg() called via netfilter hook triggers BUG_ON: offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); To work around the problem this patch forces pskb_trim_rcsum_slow() to return -EINVAL in described scenario. It allows its callers to drop such kind of packets. Link: https://syzkaller.appspot.com/bug?id=b419a5ca95062664fe1a60b764621eb4526e2cd0 Reported-by: syzbot+7010af67ced6105e5...@syzkaller.appspotmail.com Signed-off-by: Vasily Averin Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/1b2494af-2c56-8ee2-7bc0-923fcad1c...@virtuozzo.com Signed-off-by: Jakub Kicinski Now it is in net-next: https://git.kernel.org/netdev/net-next/c/54970a2fbb67 https://jira.sw.ru/browse/PSBM-123062 Signed-off-by: Vasily Averin --- include/linux/skbuff.h | 7 +++ 1 file changed, 7 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 296e734..f2c66b1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3108,6 +3108,13 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) return 0; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; + else if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdlen = (len > skb_headlen(skb)) ? 
skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; + } return __pskb_trim(skb, len); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/net: Fix usage of pskb_trim_rcsum
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit 36a0097fafb6b00c33a9b5497eda51cafa6972aa Author: Ross Lagerwall Date: Tue Dec 15 12:24:33 2020 +0300 ms/net: Fix usage of pskb_trim_rcsum In certain cases, pskb_trim_rcsum() may change skb pointers. Reinitialize header pointers afterwards to avoid potential use-after-frees. Add a note in the documentation of pskb_trim_rcsum(). Found by KASAN. Signed-off-by: Ross Lagerwall Signed-off-by: David S. Miller (cherry-picked from commit 6c57f0458022298e4da1729c67bd33ce41c14e7a) https://jira.sw.ru/browse/PSBM-123062 Signed-off-by: Vasily Averin --- drivers/net/ppp/pppoe.c | 1 + include/linux/skbuff.h | 1 + net/bridge/br_netfilter_ipv6.c | 1 + net/bridge/netfilter/nft_reject_bridge.c | 1 + net/ipv4/ip_input.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 15d3f44..3101720 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -442,6 +442,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, if (pskb_trim_rcsum(skb, len)) goto drop; + ph = pppoe_hdr(skb); pn = pppoe_pernet(dev_net(dev)); if (!pn) /* no VE_FEATURE_PPP */ goto drop; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c39936f..296e734 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3099,6 +3099,7 @@ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) * * This is exactly the same as pskb_trim except that it ensures the * checksum of received packets are still valid after the operation. + * It can change skb pointers. 
*/ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 2d8de1d..ca93162 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -132,6 +132,7 @@ int br_validate_ipv6(struct sk_buff *skb) IPSTATS_MIB_INDISCARDS); goto drop; } + hdr = ipv6_hdr(skb); } if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb)) goto drop; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 634068b..0315584 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -189,6 +189,7 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) return false; + ip6h = ipv6_hdr(skb); thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), , ); if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2b97550..f537e32 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -450,6 +450,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; } + iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/net: make skb_partial_csum_set() more robust against overflows
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit b18c22895955bd9f3153784fa4018c1ff5a5996f Author: Eric Dumazet Date: Tue Dec 15 12:24:41 2020 +0300 ms/net: make skb_partial_csum_set() more robust against overflows syzbot managed to crash in skb_checksum_help() [1] : BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); Root cause is the following check in skb_partial_csum_set() if (unlikely(start > skb_headlen(skb)) || unlikely((int)start + off > skb_headlen(skb) - 2)) return false; If skb_headlen(skb) is 1, then (skb_headlen(skb) - 2) becomes 0x and the check fails to detect that ((int)start + off) is off the limit, since the compare is unsigned. When we fix that, then the first condition (start > skb_headlen(skb)) becomes obsolete. Then we should also check that (skb_headroom(skb) + start) wont overflow 16bit field. [1] kernel BUG at net/core/dev.c:2880! invalid opcode: [#1] PREEMPT SMP KASAN CPU: 1 PID: 7330 Comm: syz-executor4 Not tainted 4.19.0-rc6+ #253 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_checksum_help+0x9e3/0xbb0 net/core/dev.c:2880 Code: 85 00 ff ff ff 48 c1 e8 03 42 80 3c 28 00 0f 84 09 fb ff ff 48 8b bd 00 ff ff ff e8 97 a8 b9 fb e9 f8 fa ff ff e8 2d 09 76 fb <0f> 0b 48 8b bd 28 ff ff ff e8 1f a8 b9 fb e9 b1 f6 ff ff 48 89 cf RSP: 0018:8801d83a6f60 EFLAGS: 00010293 RAX: 8801b9834380 RBX: 8801b9f8d8c0 RCX: 8608c6d7 RDX: RSI: 8608cc63 RDI: 0006 RBP: 8801d83a7068 R08: 8801b9834380 R09: R10: 8801d83a76d8 R11: R12: 0001 R13: 00010001 R14: R15: 00a8 FS: 7f1a66db5700() GS:8801daf0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7f7d77f091b0 CR3: 0001ba252000 CR4: 001406e0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Call Trace: skb_csum_hwoffload_help+0x8f/0xe0 net/core/dev.c:3269 validate_xmit_skb+0xa2a/0xf30 net/core/dev.c:3312 __dev_queue_xmit+0xc2f/0x3950 
net/core/dev.c:3797 dev_queue_xmit+0x17/0x20 net/core/dev.c:3838 packet_snd net/packet/af_packet.c:2928 [inline] packet_sendmsg+0x422d/0x64c0 net/packet/af_packet.c:2953 Fixes: 5ff8dda3035d ("net: Ensure partial checksum offset is inside the skb head") Signed-off-by: Eric Dumazet Cc: Herbert Xu Reported-by: syzbot Signed-off-by: David S. Miller (cherry-picked from commit 52b5d6f5dcf0e5201392f7d417148ccb537dbf6f) https://jira.sw.ru/browse/PSBM-123062 Signed-off-by: Vasily Averin --- net/core/skbuff.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fa5ba0d..eef4100 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3961,14 +3961,16 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { - if (unlikely(start > skb_headlen(skb)) || - unlikely((int)start + off > skb_headlen(skb) - 2)) { - net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", -start, off, skb_headlen(skb)); + u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); + u32 csum_start = skb_headroom(skb) + (u32)start; + + if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { + net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", +start, off, skb_headroom(skb), skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + start; + skb->csum_start = csum_start; skb->csum_offset = off; skb_set_transport_header(skb, start); return true; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: xt_checksum: ignore gso skbs
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit 2bcf37acf916a4e7f6fed7400dce34a875d67058 Author: Florian Westphal Date: Tue Dec 15 12:24:26 2020 +0300 ms/netfilter: xt_checksum: ignore gso skbs Satish Patel reports a skb_warn_bad_offload() splat caused by -j CHECKSUM rules: -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM The CHECKSUM target has never worked with GSO skbs, and the above rule makes no sense as kernel will handle checksum updates on transmit. Unfortunately, there are 3rd party tools that install such rules, so we cannot reject this from the config plane without potential breakage. Amend Kconfig text to clarify that the CHECKSUM target is only useful in virtualized environments, where old dhcp clients that use AF_PACKET used to discard UDP packets with a 'bad' header checksum and add a one-time warning in case such rule isn't restricted to UDP. v2: check IP6T_F_PROTO flag before cmp (Michal Kubecek) Reported-by: Satish Patel Reported-by: Markos Chandras Reported-by: Michal Kubecek Signed-off-by: Florian Westphal Reviewed-by: Michal Kubecek Signed-off-by: Pablo Neira Ayuso (cherry-picked from commit 10568f6c5761db24249c610c94d6e44d5505a0ba) VvS: backported with minor context changes https://jira.sw.ru/browse/PSBM-123062 Signed-off-by: Vasily Averin --- net/netfilter/Kconfig | 12 ++-- net/netfilter/xt_CHECKSUM.c | 23 ++- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 561b065..0b02434 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -622,13 +622,13 @@ config NETFILTER_XT_TARGET_CHECKSUM depends on NETFILTER_ADVANCED ---help--- This option adds a `CHECKSUM' target, which can be used in the iptables mangle - table. + table to work around buggy DHCP clients in virtualized environments. 
- You can use this target to compute and fill in the checksum in - a packet that lacks a checksum. This is particularly useful, - if you need to work around old applications such as dhcp clients, - that do not work well with checksum offloads, but don't want to disable - checksum offload in your device. + Some old DHCP clients drop packets because they are not aware + that the checksum would normally be offloaded to hardware and + thus should be considered valid. + This target can be used to fill in the checksum using iptables + when such packets are sent via a virtual network device. To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index 0f642ef..db286fc 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -16,6 +16,9 @@ #include #include +#include +#include + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michael S. Tsirkin "); MODULE_DESCRIPTION("Xtables: checksum modification"); @@ -25,7 +28,7 @@ MODULE_ALIAS("ip6t_CHECKSUM"); static unsigned int checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) { - if (skb->ip_summed == CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL && !skb_is_gso(skb)) skb_checksum_help(skb); return XT_CONTINUE; @@ -34,6 +37,8 @@ checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) static int checksum_tg_check(const struct xt_tgchk_param *par) { const struct xt_CHECKSUM_info *einfo = par->targinfo; + const struct ip6t_ip6 *i6 = par->entryinfo; + const struct ipt_ip *i4 = par->entryinfo; if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { pr_info("unsupported CHECKSUM operation %x\n", einfo->operation); @@ -43,6 +48,22 @@ static int checksum_tg_check(const struct xt_tgchk_param *par) pr_info("no CHECKSUM operation enabled\n"); return -EINVAL; } + + switch (par->family) { + case NFPROTO_IPV4: + if (i4->proto == IPPROTO_UDP && + (i4->invflags & XT_INV_PROTO) == 0) + return 0; + break; + case NFPROTO_IPV6: + if 
((i6->flags & IP6T_F_PROTO) && + i6->proto == IPPROTO_UDP && + (i6->invflags & XT_INV_PROTO) == 0) + return 0; + break; + } + + pr_warn_once("CHECKSUM should be avoided. If really needed, restrict with \"-p udp\" and only use in OUTPUT\n"); return 0; } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: Replace spin_is_locked() with lockdep
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit feca123b54148d68b49ddf71efe31e189706e07c Author: Lance Roy Date: Tue Dec 15 12:22:24 2020 +0300 ms/netfilter: Replace spin_is_locked() with lockdep lockdep_assert_held() is better suited to checking locking requirements, since it won't get confused when someone else holds the lock. This is also a step towards possibly removing spin_is_locked(). Signed-off-by: Lance Roy Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: "David S. Miller" Cc: Cc: Cc: Acked-by: Jozsef Kadlecsik Signed-off-by: Pablo Neira Ayuso (cherry-picked from commit 4b87dd54be21ad611a1c740f9df0c4376d496e09) https://jira.sw.ru/browse/PSBM-123086 Signed-off-by: Vasily Averin --- net/netfilter/ipset/ip_set_hash_gen.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/net/netfilter/ipset/ip_set_hash_gen.h b/net/netfilter/ipset/ip_set_hash_gen.h index 225348a..a437adf 100644 --- a/net/netfilter/ipset/ip_set_hash_gen.h +++ b/net/netfilter/ipset/ip_set_hash_gen.h @@ -15,7 +15,7 @@ #define __ipset_dereference_protected(p, c) rcu_dereference_protected(p, c) #define ipset_dereference_protected(p, set) \ - __ipset_dereference_protected(p, spin_is_locked(&(set)->lock)) + __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock)) #define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1) ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: ipset: Convert timers to use timer_setup()
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit 121cca10e432208a2de0f782fcdf15190a4937c8 Author: Kees Cook Date: Tue Dec 15 12:22:17 2020 +0300 ms/netfilter: ipset: Convert timers to use timer_setup() In preparation for unconditionally passing the struct timer_list pointer to all timer callbacks, switch to using the new timer_setup() and from_timer() to pass the timer pointer explicitly. This introduces a pointer back to the struct ip_set, which is used instead of the struct timer_list .data field. Cc: Pablo Neira Ayuso Cc: Jozsef Kadlecsik Cc: Florian Westphal Cc: "David S. Miller" Cc: Stephen Hemminger Cc: simran singhal Cc: Muhammad Falak R Wani Cc: netfilter-de...@vger.kernel.org Cc: coret...@netfilter.org Cc: net...@vger.kernel.org Signed-off-by: Kees Cook Signed-off-by: David S. Miller (cherry-picked from commit a92c5751b97cca55d8140ec0bf26a53c7e00bfa5) https://jira.sw.ru/browse/PSBM-123086 Signed-off-by: Vasily Averin --- net/netfilter/ipset/ip_set_bitmap_gen.h | 10 +- net/netfilter/ipset/ip_set_bitmap_ip.c| 2 ++ net/netfilter/ipset/ip_set_bitmap_ipmac.c | 2 ++ net/netfilter/ipset/ip_set_bitmap_port.c | 2 ++ net/netfilter/ipset/ip_set_hash_gen.h | 12 +++- net/netfilter/ipset/ip_set_list_set.c | 12 +++- 6 files changed, 25 insertions(+), 15 deletions(-) diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h b/net/netfilter/ipset/ip_set_bitmap_gen.h index a77e3f3..257ca39 100644 --- a/net/netfilter/ipset/ip_set_bitmap_gen.h +++ b/net/netfilter/ipset/ip_set_bitmap_gen.h @@ -37,11 +37,11 @@ #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) static void -mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set)) +mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t)) { struct mtype *map = set->data; - setup_timer(&map->gc, gc, (unsigned long)set); + timer_setup(&map->gc, gc, 0); mod_timer(&map->gc, jiffies + 
IPSET_GC_PERIOD(set->timeout) * HZ); } @@ -266,10 +266,10 @@ out: } static void -mtype_gc(unsigned long ul_set) +mtype_gc(struct timer_list *t) { - struct ip_set *set = (struct ip_set *)ul_set; - struct mtype *map = set->data; + struct mtype *map = from_timer(map, t, gc); + struct ip_set *set = map->set; void *x; u32 id; diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c b/net/netfilter/ipset/ip_set_bitmap_ip.c index 9f4f28d..488d6d0 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ip.c +++ b/net/netfilter/ipset/ip_set_bitmap_ip.c @@ -48,6 +48,7 @@ struct bitmap_ip { size_t memsize; /* members size */ u8 netmask; /* subnet netmask */ struct timer_list gc; /* garbage collection */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* data extensions */ __aligned(__alignof__(u64)); }; @@ -232,6 +233,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map, map->netmask = netmask; set->timeout = IPSET_NO_TIMEOUT; + map->set = set; set->data = map; set->family = NFPROTO_IPV4; diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c b/net/netfilter/ipset/ip_set_bitmap_ipmac.c index ae927b3..794e033 100644 --- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c +++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c @@ -52,6 +52,7 @@ struct bitmap_ipmac { u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collector */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* MAC + data extensions */ __aligned(__alignof__(u64)); }; @@ -310,6 +311,7 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map, map->elements = elements; set->timeout = IPSET_NO_TIMEOUT; + map->set = set; set->data = map; set->family = NFPROTO_IPV4; diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c b/net/netfilter/ipset/ip_set_bitmap_port.c index c6e1ebf..b561ca8 100644 --- a/net/netfilter/ipset/ip_set_bitmap_port.c +++ b/net/netfilter/ipset/ip_set_bitmap_port.c @@ -40,6 +40,7 @@ 
struct bitmap_port { u32 elements; /* number of max elements in the set */ size_t memsize; /* members size */ struct timer_list gc; /* garbage collection */ + struct ip_set *set; /* attached to this ip_set */ unsigned char extensions[0] /* data extensions */ __aligned(__alignof__(u64)); }; @@ -214,6 +215,7 @@ init_map_port(struct ip_set *set, struct bitmap_port *map, map->last_port = last_port;
[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-1160.6.1.vz7.171.4 --> commit f382244983c50d1f353741e0ebaf71e162cfa8c6 Author: Jozsef Kadlecsik Date: Tue Dec 15 12:22:33 2020 +0300 ms/netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports In the case of huge hash:* types of sets, due to the single spinlock of a set the processing of the whole set under spinlock protection could take too long. There were four places where the whole hash table of the set was processed from bucket to bucket under holding the spinlock: - During resizing a set, the original set was locked to exclude kernel side add/del element operations (userspace add/del is excluded by the nfnetlink mutex). The original set is actually just read during the resize, so the spinlocking is replaced with rcu locking of regions. However, thus there can be parallel kernel side add/del of entries. In order not to loose those operations a backlog is added and replayed after the successful resize. - Garbage collection of timed out entries was also protected by the spinlock. In order not to lock too long, region locking is introduced and a single region is processed in one gc go. Also, the simple timer based gc running is replaced with a workqueue based solution. The internal book-keeping (number of elements, size of extensions) is moved to region level due to the region locking. - Adding elements: when the max number of the elements is reached, the gc was called to evict the timed out entries. The new approach is that the gc is called just for the matching region, assuming that if the region (proportionally) seems to be full, then the whole set does. We could scan the other regions to check every entry under rcu locking, but for huge sets it'd mean a slowdown at adding elements. 
- Listing the set header data: when the set was defined with timeout support, the garbage collector was called to clean up timed out entries to get the correct element numbers and set size values. Now the set is scanned to check non-timed out entries, without actually calling the gc for the whole set. Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe -> SOFTIRQ-unsafe lock order issues during working on the patch. Reported-by: syzbot+4b0e9d4ff3cf11783...@syzkaller.appspotmail.com Reported-by: syzbot+c27b8d5010f45c666...@syzkaller.appspotmail.com Reported-by: syzbot+68a806795ac89df3a...@syzkaller.appspotmail.com Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and protocol version 7") Signed-off-by: Jozsef Kadlecsik (cherry-picked from f66ee0410b1c3481ee75e5db9b34547b4d582465) VvS changes: minor context changes because of skipped backport of bd96b4c7 "netfilter: inline four headers files into another one." https://jira.sw.ru/browse/PSBM-123086 Signed-off-by: Vasily Averin --- include/linux/netfilter/ipset/ip_set.h | 11 +- net/netfilter/ipset/ip_set_core.c | 34 +- net/netfilter/ipset/ip_set_hash_gen.h | 633 +++-- 3 files changed, 472 insertions(+), 206 deletions(-) diff --git a/include/linux/netfilter/ipset/ip_set.h b/include/linux/netfilter/ipset/ip_set.h index f2e1e6b..471363b 100644 --- a/include/linux/netfilter/ipset/ip_set.h +++ b/include/linux/netfilter/ipset/ip_set.h @@ -124,6 +124,7 @@ struct ip_set_ext { u32 timeout; u8 packets_op; u8 bytes_op; + bool target; }; struct ip_set; @@ -190,6 +191,14 @@ struct ip_set_type_variant { /* Return true if "b" set is the same as "a" * according to the create set parameters */ bool (*same_set)(const struct ip_set *a, const struct ip_set *b); + /* Region-locking is used */ + bool region_lock; +}; + +struct ip_set_region { + spinlock_t lock;/* Region lock */ + size_t ext_size;/* Size of the dynamic extensions */ + u32 elements; /* Number of elements vs timeout */ }; /* The core set type 
structure */ @@ -464,7 +473,7 @@ bitmap_bytes(u32 a, u32 b) #include #define IP_SET_INIT_KEXT(skb, opt, set)\ - { .bytes = (skb)->len, .packets = 1,\ + { .bytes = (skb)->len, .packets = 1, .target = true,\ .timeout = ip_set_adt_opt_timeout(opt, set) } #define IP_SET_INIT_UEXT(set) \ diff --git a/net/netfilter/ipset/ip_set_core.c b/net/netfilter/ipset/ip_set_core.c index b067879..d47d978 100644 --- a/net/netfilter/ipset/ip_set_core.c +++ b/net/netfilter/ipset/ip_set_core.c @@ -559,6 +559,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index) return set; } +static inline void +ip_set_lock(struct
[Devel] [PATCH RH7 4/4] ms/net: drop bogus skb with CHECKSUM_PARTIAL and offset beyond end of trimmed packet
syzbot reproduces BUG_ON in skb_checksum_help(): tun creates (bogus) skb with huge partial-checksummed area and small ip packet inside. Then ip_rcv trims the skb based on size of internal ip packet, after that csum offset points beyond of trimmed skb. Then checksum_tg() called via netfilter hook triggers BUG_ON: offset = skb_checksum_start_offset(skb); BUG_ON(offset >= skb_headlen(skb)); To work around the problem this patch forces pskb_trim_rcsum_slow() to return -EINVAL in described scenario. It allows its callers to drop such kind of packets. Link: https://syzkaller.appspot.com/bug?id=b419a5ca95062664fe1a60b764621eb4526e2cd0 Reported-by: syzbot+7010af67ced6105e5...@syzkaller.appspotmail.com Signed-off-by: Vasily Averin Acked-by: Willem de Bruijn Link: https://lore.kernel.org/r/1b2494af-2c56-8ee2-7bc0-923fcad1c...@virtuozzo.com Signed-off-by: Jakub Kicinski Now it is in net-next: https://git.kernel.org/netdev/net-next/c/54970a2fbb67 https://jira.sw.ru/browse/PSBM-123062 Signed-off-by: Vasily Averin --- include/linux/skbuff.h | 7 +++ 1 file changed, 7 insertions(+) diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index 296e734..f2c66b1 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3108,6 +3108,13 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) return 0; if (skb->ip_summed == CHECKSUM_COMPLETE) skb->ip_summed = CHECKSUM_NONE; + else if (skb->ip_summed == CHECKSUM_PARTIAL) { + int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len; + int offset = skb_checksum_start_offset(skb) + skb->csum_offset; + + if (offset + sizeof(__sum16) > hdlen) + return -EINVAL; + } return __pskb_trim(skb, len); } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7 3/4] ms/net: make skb_partial_csum_set() more robust against overflows
From: Eric Dumazet syzbot managed to crash in skb_checksum_help() [1] : BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb)); Root cause is the following check in skb_partial_csum_set() if (unlikely(start > skb_headlen(skb)) || unlikely((int)start + off > skb_headlen(skb) - 2)) return false; If skb_headlen(skb) is 1, then (skb_headlen(skb) - 2) becomes 0x and the check fails to detect that ((int)start + off) is off the limit, since the compare is unsigned. When we fix that, then the first condition (start > skb_headlen(skb)) becomes obsolete. Then we should also check that (skb_headroom(skb) + start) wont overflow 16bit field. [1] kernel BUG at net/core/dev.c:2880! invalid opcode: [#1] PREEMPT SMP KASAN CPU: 1 PID: 7330 Comm: syz-executor4 Not tainted 4.19.0-rc6+ #253 Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/01/2011 RIP: 0010:skb_checksum_help+0x9e3/0xbb0 net/core/dev.c:2880 Code: 85 00 ff ff ff 48 c1 e8 03 42 80 3c 28 00 0f 84 09 fb ff ff 48 8b bd 00 ff ff ff e8 97 a8 b9 fb e9 f8 fa ff ff e8 2d 09 76 fb <0f> 0b 48 8b bd 28 ff ff ff e8 1f a8 b9 fb e9 b1 f6 ff ff 48 89 cf RSP: 0018:8801d83a6f60 EFLAGS: 00010293 RAX: 8801b9834380 RBX: 8801b9f8d8c0 RCX: 8608c6d7 RDX: RSI: 8608cc63 RDI: 0006 RBP: 8801d83a7068 R08: 8801b9834380 R09: R10: 8801d83a76d8 R11: R12: 0001 R13: 00010001 R14: R15: 00a8 FS: 7f1a66db5700() GS:8801daf0() knlGS: CS: 0010 DS: ES: CR0: 80050033 CR2: 7f7d77f091b0 CR3: 0001ba252000 CR4: 001406e0 DR0: DR1: DR2: DR3: DR6: fffe0ff0 DR7: 0400 Call Trace: skb_csum_hwoffload_help+0x8f/0xe0 net/core/dev.c:3269 validate_xmit_skb+0xa2a/0xf30 net/core/dev.c:3312 __dev_queue_xmit+0xc2f/0x3950 net/core/dev.c:3797 dev_queue_xmit+0x17/0x20 net/core/dev.c:3838 packet_snd net/packet/af_packet.c:2928 [inline] packet_sendmsg+0x422d/0x64c0 net/packet/af_packet.c:2953 Fixes: 5ff8dda3035d ("net: Ensure partial checksum offset is inside the skb head") Signed-off-by: Eric Dumazet Cc: Herbert Xu Reported-by: syzbot Signed-off-by: David 
S. Miller (cherry-picked from commit 52b5d6f5dcf0e5201392f7d417148ccb537dbf6f) https://jira.sw.ru/browse/PSBM-123062 Signed-off-by: Vasily Averin --- net/core/skbuff.c | 12 +++- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/net/core/skbuff.c b/net/core/skbuff.c index fa5ba0d..eef4100 100644 --- a/net/core/skbuff.c +++ b/net/core/skbuff.c @@ -3961,14 +3961,16 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack); */ bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off) { - if (unlikely(start > skb_headlen(skb)) || - unlikely((int)start + off > skb_headlen(skb) - 2)) { - net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n", -start, off, skb_headlen(skb)); + u32 csum_end = (u32)start + (u32)off + sizeof(__sum16); + u32 csum_start = skb_headroom(skb) + (u32)start; + + if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) { + net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u headlen=%u\n", +start, off, skb_headroom(skb), skb_headlen(skb)); return false; } skb->ip_summed = CHECKSUM_PARTIAL; - skb->csum_start = skb_headroom(skb) + start; + skb->csum_start = csum_start; skb->csum_offset = off; skb_set_transport_header(skb, start); return true; -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7 2/4] ms/net: Fix usage of pskb_trim_rcsum
From: Ross Lagerwall In certain cases, pskb_trim_rcsum() may change skb pointers. Reinitialize header pointers afterwards to avoid potential use-after-frees. Add a note in the documentation of pskb_trim_rcsum(). Found by KASAN. Signed-off-by: Ross Lagerwall Signed-off-by: David S. Miller (cherry-picked from commit 6c57f0458022298e4da1729c67bd33ce41c14e7a) https://jira.sw.ru/browse/PSBM-123062 Signed-off-by: Vasily Averin --- drivers/net/ppp/pppoe.c | 1 + include/linux/skbuff.h | 1 + net/bridge/br_netfilter_ipv6.c | 1 + net/bridge/netfilter/nft_reject_bridge.c | 1 + net/ipv4/ip_input.c | 1 + 5 files changed, 5 insertions(+) diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c index 15d3f44..3101720 100644 --- a/drivers/net/ppp/pppoe.c +++ b/drivers/net/ppp/pppoe.c @@ -442,6 +442,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device *dev, if (pskb_trim_rcsum(skb, len)) goto drop; + ph = pppoe_hdr(skb); pn = pppoe_pernet(dev_net(dev)); if (!pn) /* no VE_FEATURE_PPP */ goto drop; diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h index c39936f..296e734 100644 --- a/include/linux/skbuff.h +++ b/include/linux/skbuff.h @@ -3099,6 +3099,7 @@ static inline void *skb_push_rcsum(struct sk_buff *skb, unsigned int len) * * This is exactly the same as pskb_trim except that it ensures the * checksum of received packets are still valid after the operation. + * It can change skb pointers. 
*/ static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len) diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c index 2d8de1d..ca93162 100644 --- a/net/bridge/br_netfilter_ipv6.c +++ b/net/bridge/br_netfilter_ipv6.c @@ -132,6 +132,7 @@ int br_validate_ipv6(struct sk_buff *skb) IPSTATS_MIB_INDISCARDS); goto drop; } + hdr = ipv6_hdr(skb); } if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb)) goto drop; diff --git a/net/bridge/netfilter/nft_reject_bridge.c b/net/bridge/netfilter/nft_reject_bridge.c index 634068b..0315584 100644 --- a/net/bridge/netfilter/nft_reject_bridge.c +++ b/net/bridge/netfilter/nft_reject_bridge.c @@ -189,6 +189,7 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int hook) pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h))) return false; + ip6h = ipv6_hdr(skb); thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo); if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0) return false; diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c index 2b97550..f537e32 100644 --- a/net/ipv4/ip_input.c +++ b/net/ipv4/ip_input.c @@ -450,6 +450,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, struct packet_type *pt, goto drop; } + iph = ip_hdr(skb); skb->transport_header = skb->network_header + iph->ihl*4; /* Remove any debris in the socket control block */ -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RH7 1/4] ms/netfilter: xt_checksum: ignore gso skbs
From: Florian Westphal Satish Patel reports a skb_warn_bad_offload() splat caused by -j CHECKSUM rules: -A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM The CHECKSUM target has never worked with GSO skbs, and the above rule makes no sense as kernel will handle checksum updates on transmit. Unfortunately, there are 3rd party tools that install such rules, so we cannot reject this from the config plane without potential breakage. Amend Kconfig text to clarify that the CHECKSUM target is only useful in virtualized environments, where old dhcp clients that use AF_PACKET used to discard UDP packets with a 'bad' header checksum and add a one-time warning in case such rule isn't restricted to UDP. v2: check IP6T_F_PROTO flag before cmp (Michal Kubecek) Reported-by: Satish Patel Reported-by: Markos Chandras Reported-by: Michal Kubecek Signed-off-by: Florian Westphal Reviewed-by: Michal Kubecek Signed-off-by: Pablo Neira Ayuso (cherry-picked from commit 10568f6c5761db24249c610c94d6e44d5505a0ba) VvS: backported with minor context changes https://jira.sw.ru/browse/PSBM-123062 Signed-off-by: Vasily Averin --- net/netfilter/Kconfig | 12 ++-- net/netfilter/xt_CHECKSUM.c | 23 ++- 2 files changed, 28 insertions(+), 7 deletions(-) diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig index 561b065..0b02434 100644 --- a/net/netfilter/Kconfig +++ b/net/netfilter/Kconfig @@ -622,13 +622,13 @@ config NETFILTER_XT_TARGET_CHECKSUM depends on NETFILTER_ADVANCED ---help--- This option adds a `CHECKSUM' target, which can be used in the iptables mangle - table. + table to work around buggy DHCP clients in virtualized environments. - You can use this target to compute and fill in the checksum in - a packet that lacks a checksum. This is particularly useful, - if you need to work around old applications such as dhcp clients, - that do not work well with checksum offloads, but don't want to disable - checksum offload in your device. 
+ Some old DHCP clients drop packets because they are not aware + that the checksum would normally be offloaded to hardware and + thus should be considered valid. + This target can be used to fill in the checksum using iptables + when such packets are sent via a virtual network device. To compile it as a module, choose M here. If unsure, say N. diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c index 0f642ef..db286fc 100644 --- a/net/netfilter/xt_CHECKSUM.c +++ b/net/netfilter/xt_CHECKSUM.c @@ -16,6 +16,9 @@ #include #include +#include +#include + MODULE_LICENSE("GPL"); MODULE_AUTHOR("Michael S. Tsirkin "); MODULE_DESCRIPTION("Xtables: checksum modification"); @@ -25,7 +28,7 @@ MODULE_ALIAS("ip6t_CHECKSUM"); static unsigned int checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) { - if (skb->ip_summed == CHECKSUM_PARTIAL) + if (skb->ip_summed == CHECKSUM_PARTIAL && !skb_is_gso(skb)) skb_checksum_help(skb); return XT_CONTINUE; @@ -34,6 +37,8 @@ checksum_tg(struct sk_buff *skb, const struct xt_action_param *par) static int checksum_tg_check(const struct xt_tgchk_param *par) { const struct xt_CHECKSUM_info *einfo = par->targinfo; + const struct ip6t_ip6 *i6 = par->entryinfo; + const struct ipt_ip *i4 = par->entryinfo; if (einfo->operation & ~XT_CHECKSUM_OP_FILL) { pr_info("unsupported CHECKSUM operation %x\n", einfo->operation); @@ -43,6 +48,22 @@ static int checksum_tg_check(const struct xt_tgchk_param *par) pr_info("no CHECKSUM operation enabled\n"); return -EINVAL; } + + switch (par->family) { + case NFPROTO_IPV4: + if (i4->proto == IPPROTO_UDP && + (i4->invflags & XT_INV_PROTO) == 0) + return 0; + break; + case NFPROTO_IPV6: + if ((i6->flags & IP6T_F_PROTO) && + i6->proto == IPPROTO_UDP && + (i6->invflags & XT_INV_PROTO) == 0) + return 0; + break; + } + + pr_warn_once("CHECKSUM should be avoided. 
If really needed, restrict with \"-p udp\" and only use in OUTPUT\n"); return 0; } -- 1.8.3.1 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel