[Devel] [PATCH RHEL8 COMMIT] vdso: fix VM_BUG_ON_PAGE(PageSlab(page)) on unmap

2020-12-15 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.1
-->
commit eac26b43815a2f0ebc2128f9161097bed17bdc71
Author: Andrey Ryabinin 
Date:   Tue Dec 15 20:12:18 2020 +0300

vdso: fix VM_BUG_ON_PAGE(PageSlab(page)) on unmap

vdso_data is mapped to userspace which means that we can't
use kmalloc() to allocate it. Kmalloc() doesn't even guarantee
that we will get page aligned memory.

 kernel BUG at include/linux/mm.h:693!
 RIP: 0010:unmap_page_range+0x15f2/0x2630
 Call Trace:
  unmap_vmas+0x11e/0x1d0
  exit_mmap+0x215/0x420
  mmput+0x10a/0x400
  do_exit+0x98f/0x2d00
  do_group_exit+0xec/0x2b0
  __x64_sys_exit_group+0x3a/0x50
  do_syscall_64+0xa5/0x4d0
  entry_SYSCALL_64_after_hwframe+0x6a/0xdf

Use alloc_pages_exact() to allocate it. We can't use
alloc_pages(), or __get_free_pages() here since vdso_fault()
needs to perform get_page() on individual sub-pages and alloc_pages()
doesn't initialize sub-pages.

https://jira.sw.ru/browse/PSBM-123551
Signed-off-by: Andrey Ryabinin 
---
 kernel/ve/ve.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index b114e2918bb7..0c6630c6616a 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -568,7 +568,7 @@ static int copy_vdso(struct vdso_image **vdso_dst, const 
struct vdso_image *vdso
if (!vdso)
return -ENOMEM;
 
-   vdso_data = kmalloc(vdso_src->size, GFP_KERNEL);
+   vdso_data = alloc_pages_exact(vdso_src->size, GFP_KERNEL);
if (!vdso_data) {
kfree(vdso);
return -ENOMEM;
@@ -585,11 +585,11 @@ static int copy_vdso(struct vdso_image **vdso_dst, const 
struct vdso_image *vdso
 static void ve_free_vdso(struct ve_struct *ve)
 {
if (ve->vdso_64 && ve->vdso_64 != _image_64) {
-   kfree(ve->vdso_64->data);
+   free_pages_exact(ve->vdso_64->data, ve->vdso_64->size);
kfree(ve->vdso_64);
}
if (ve->vdso_32 && ve->vdso_32 != _image_32) {
-   kfree(ve->vdso_32->data);
+   free_pages_exact(ve->vdso_32->data, ve->vdso_32->size);
kfree(ve->vdso_32);
}
 }
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL8 COMMIT] ploop: Zero tail of tail page

2020-12-15 Thread Konstantin Khorenko
The commit is pushed to "branch-rh8-4.18.0-240.1.1.vz8.5.x-ovz" and will appear 
at https://src.openvz.org/scm/ovz/vzkernel.git
after rh8-4.18.0-240.1.1.vz8.5.1
-->
commit 0497d745e201c4eb6f894c87afb55044f075708d
Author: Kirill Tkhai 
Date:   Tue Dec 15 20:12:18 2020 +0300

ploop: Zero tail of tail page

If the BAT ends in the middle of a page, zero the page's tail.
Otherwise, garbage is left there.

https://jira.sw.ru/browse/PSBM-123639
Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-bat.c | 5 +
 drivers/md/dm-ploop.h | 1 +
 2 files changed, 6 insertions(+)

diff --git a/drivers/md/dm-ploop-bat.c b/drivers/md/dm-ploop-bat.c
index d6b687806118..da18dd2e4638 100644
--- a/drivers/md/dm-ploop-bat.c
+++ b/drivers/md/dm-ploop-bat.c
@@ -168,6 +168,11 @@ static int ploop_read_bat(struct ploop *ploop, struct bio 
*bio)
from = kmap(bio->bi_io_vec[page].bv_page);
memcpy(to, from, nr_copy * sizeof(map_index_t));
kunmap(bio->bi_io_vec[page].bv_page);
+   if (unlikely(nr_copy < BAT_ENTRIES_PER_PAGE)) {
+   memset(from + nr_copy, 0, sizeof(map_index_t) *
+  (BAT_ENTRIES_PER_PAGE - nr_copy));
+   }
+
ret = parse_bat_entries(ploop, to, md->bat_levels,
nr_copy, id);
kunmap(md->page);
diff --git a/drivers/md/dm-ploop.h b/drivers/md/dm-ploop.h
index dc5362f01e21..a025df4bf328 100644
--- a/drivers/md/dm-ploop.h
+++ b/drivers/md/dm-ploop.h
@@ -6,6 +6,7 @@
 
 #define PLOOP_MAP_OFFSET 16
 typedef u32 map_index_t;
+#define BAT_ENTRIES_PER_PAGE (PAGE_SIZE / sizeof(map_index_t))
 
 #define SIGNATURE_DISK_IN_USE   0x746F6E59
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RH8] dm-ploop: Skip zero writes to unallocated clusters

2020-12-15 Thread Kirill Tkhai
Sometimes this may save some space...

https://jira.sw.ru/browse/PSBM-123748

Signed-off-by: Kirill Tkhai 
---
 drivers/md/dm-ploop-map.c |   24 
 1 file changed, 24 insertions(+)

diff --git a/drivers/md/dm-ploop-map.c b/drivers/md/dm-ploop-map.c
index 4b12b5fc082a..f193b25cbd28 100644
--- a/drivers/md/dm-ploop-map.c
+++ b/drivers/md/dm-ploop-map.c
@@ -397,6 +397,27 @@ static void maybe_unlink_completed_bio(struct ploop 
*ploop, struct bio *bio)
queue_work(ploop->wq, >worker);
 }
 
+static bool bio_endio_if_all_zeros(struct bio *bio)
+{
+   struct bvec_iter bi = {
+   .bi_size = bio->bi_iter.bi_size,
+   };
+   struct bio_vec bv;
+   void *data, *ret;
+
+   for_each_bvec(bv, bio->bi_io_vec, bi, bi) {
+   data = kmap(bv.bv_page);
+   ret = memchr_inv(data + bv.bv_offset, 0, bv.bv_len);
+   kunmap(bv.bv_page);
+   if (ret)
+   return false;
+   }
+
+   bio->bi_status = BLK_STS_OK;
+   bio_endio(bio);
+   return true;
+}
+
 static void handle_discard_bio(struct ploop *ploop, struct bio *bio,
 unsigned int cluster, unsigned int dst_cluster)
 {
@@ -1326,6 +1347,9 @@ static int process_one_deferred_bio(struct ploop *ploop, 
struct bio *bio,
goto out;
}
 
+   if (unlikely(bio_endio_if_all_zeros(bio)))
+   goto out;
+
/* Cluster exists nowhere. Allocate it and setup bio as outrunning */
ret = locate_new_cluster_and_attach_bio(ploop, piwb, cluster,
_cluster, bio);


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH vz8] vdso: fix VM_BUG_ON_PAGE(PageSlab(page)) on unmap

2020-12-15 Thread Andrey Ryabinin
vdso_data is mapped to userspace which means that we can't
use kmalloc() to allocate it. Kmalloc() doesn't even guarantee
that we will get page aligned memory.

 kernel BUG at include/linux/mm.h:693!
 RIP: 0010:unmap_page_range+0x15f2/0x2630
 Call Trace:
  unmap_vmas+0x11e/0x1d0
  exit_mmap+0x215/0x420
  mmput+0x10a/0x400
  do_exit+0x98f/0x2d00
  do_group_exit+0xec/0x2b0
  __x64_sys_exit_group+0x3a/0x50
  do_syscall_64+0xa5/0x4d0
  entry_SYSCALL_64_after_hwframe+0x6a/0xdf

Use alloc_pages_exact() to allocate it. We can't use
alloc_pages(), or __get_free_pages() here since vdso_fault()
needs to perform get_page() on individual sub-pages and alloc_pages()
doesn't initialize sub-pages.

https://jira.sw.ru/browse/PSBM-123551
Signed-off-by: Andrey Ryabinin 
---
 kernel/ve/ve.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/ve/ve.c b/kernel/ve/ve.c
index b114e2918bb7..0c6630c6616a 100644
--- a/kernel/ve/ve.c
+++ b/kernel/ve/ve.c
@@ -568,7 +568,7 @@ static int copy_vdso(struct vdso_image **vdso_dst, const 
struct vdso_image *vdso
if (!vdso)
return -ENOMEM;
 
-   vdso_data = kmalloc(vdso_src->size, GFP_KERNEL);
+   vdso_data = alloc_pages_exact(vdso_src->size, GFP_KERNEL);
if (!vdso_data) {
kfree(vdso);
return -ENOMEM;
@@ -585,11 +585,11 @@ static int copy_vdso(struct vdso_image **vdso_dst, const 
struct vdso_image *vdso
 static void ve_free_vdso(struct ve_struct *ve)
 {
if (ve->vdso_64 && ve->vdso_64 != _image_64) {
-   kfree(ve->vdso_64->data);
+   free_pages_exact(ve->vdso_64->data, ve->vdso_64->size);
kfree(ve->vdso_64);
}
if (ve->vdso_32 && ve->vdso_32 != _image_32) {
-   kfree(ve->vdso_32->data);
+   free_pages_exact(ve->vdso_32->data, ve->vdso_32->size);
kfree(ve->vdso_32);
}
 }
-- 
2.26.2

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/KVM: x86: reinstate vendor-agnostic check on SPEC_CTRL cpuid bits #PSBM-120787 #PSBM-123538 #PSBM-121767

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit b1ecfc303d3e53654c7d440c4214844ee6e69d78
Author: Paolo Bonzini 
Date:   Tue Dec 15 12:31:37 2020 +0300

ms/KVM: x86: reinstate vendor-agnostic check on SPEC_CTRL cpuid bits 
#PSBM-120787 #PSBM-123538 #PSBM-121767

Until commit e7c587da1252 ("x86/speculation: Use synthetic bits for 
IBRS/IBPB/STIBP",
2018-05-17), KVM was testing both Intel and AMD CPUID bits before allowing 
the
guest to write MSR_IA32_SPEC_CTRL and MSR_IA32_PRED_CMD.  Testing only 
Intel bits
on VMX processors, or only AMD bits on SVM processors, fails if the guests 
are
created with the "opposite" vendor as the host.

While at it, also tweak the host CPU check to use the vendor-agnostic 
feature bit
X86_FEATURE_IBPB, since we only care about the availability of the MSR on 
the host
here and not about specific CPUID bits.

mFixes: e7c587da1252 ("x86/speculation: Use synthetic bits for 
IBRS/IBPB/STIBP")
Cc: sta...@vger.kernel.org
Reported-by: Denis V. Lunev 
Signed-off-by: Paolo Bonzini 

https://jira.sw.ru/browse/PSBM-123538


The patch is a replacement for below vz patch,
which has been reverted due to
https://jira.sw.ru/browse/PSBM-121767

 commit 39d637ddbcf876f897e01c737bbb351461921df0
 Author: Denis V. Lunev 
 Date:   Wed Oct 28 19:25:57 2020 +0300

kvm: fix AMD IBRS/IBPB/STIBP/SSBD reporting #PSBM-120787

We should report these bits in 0x80000008 EBX on AMD only, i.e. when AMD
specific feature bits are enabled.

https://jira.sw.ru/browse/PSBM-120787

Signed-off-by: Denis V. Lunev 
CC: Vasily Averin 
CC: Konstantin Khorenko 


Port to vz7 note:

in vz7 both functions svm_set_msr() and vmx_set_msr() did not have
checks for
  !boot_cpu_has(X86_FEATURE_AMD_IBPB)
and
  !boot_cpu_has(X86_FEATURE_SPEC_CTRL)

Signed-off-by: Konstantin Khorenko 
---
 arch/x86/kvm/svm.c |  3 +++
 arch/x86/kvm/vmx.c | 10 --
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 5715da2..3a04bd0 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -3881,11 +3881,14 @@ static int svm_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr)
break;
case MSR_IA32_PRED_CMD:
if (!msr->host_initiated &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
!guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
return 1;
 
if (data & ~PRED_CMD_IBPB)
return 1;
+   if (!boot_cpu_has(X86_FEATURE_IBPB))
+   return 1;
 
if (!data)
break;
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index f43b2db..a120208 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -3350,7 +3350,10 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
break;
case MSR_IA32_SPEC_CTRL:
if (!msr_info->host_initiated &&
-   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_STIBP) &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBRS) &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_SSBD))
return 1;
 
/* The STIBP bit doesn't fault even if it's not advertised */
@@ -3387,11 +3390,14 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
goto find_shared_msr;
case MSR_IA32_PRED_CMD:
if (!msr_info->host_initiated &&
-   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL))
+   !guest_cpuid_has(vcpu, X86_FEATURE_SPEC_CTRL) &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_AMD_IBPB))
return 1;
 
if (data & ~PRED_CMD_IBPB)
return 1;
+   if (!boot_cpu_has(X86_FEATURE_IBPB))
+   return 1;
 
if (!data)
break;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ve/device_cgroup: show all devices allowed in ct to fool docker

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit 510bede663a9018c86c2b45ede7c5e26f6f8
Author: Pavel Tikhomirov 
Date:   Tue Dec 15 12:30:55 2020 +0300

ve/device_cgroup: show all devices allowed in ct to fool docker

We've seen that docker 20+ not only writes "a *:* rwm" to privileged
docker container device-cgroup (as pre-19 version did) but also checks
the content after write, and docker expects that all devices are allowed
for privileged docker container.

In our VZCT we obviously can't afford to actually allow all devices
because root device cgroup of VZCT should restrict which devices are
allowed to be read/modified/mknod in VZCT and which are not, and all
nested cgroups inherit this. Before the patch, reading the devices list in
a VZCT would show a whitelist where each allowed device is present:

  CT-101 /# cat /sys/fs/cgroup/devices/test/devices.list
  ...
  c 1:11 rwm
  c 10:200 rwm
  c 10:235 rwm
  c 10:229 rwm
  b 182:177568 rm
  b 182:177569 rm

Docker expects to see "a *:* rwm" as if docker is on bare host and
nobody touched device cgroup before that.

As a solution we can just show docker what it wants. The idea is to
detect if the content of the whitelist of the device cgroup to be
shown is equal to the content of the whitelist of the root device cgroup
of the VZCT, then always show "a *:* rwm".

  CT-101 /# cat /sys/fs/cgroup/devices/test/devices.list
  a *:* rwm

If one changes the whitelist (even reorder) this cgroup would show a
full list of all allowed devices as before.

This change of the output looks consistent enough: when you see
"a *:* rwm" in your cgroup it means that all devices of your VZCT are
available for you.

Only difference to mainstream behaviour is when you prohibit some device
via devices.deny you get not a blacklist but an inverse whitelist.

FIXME: we have a problem here because this approach does not survive
container migration as devices cgroup c/r looks broken:
https://jira.sw.ru/browse/PSBM-123668

https://jira.sw.ru/browse/PSBM-123630

Signed-off-by: Pavel Tikhomirov 
---
 include/linux/cgroup.h   |  1 +
 security/device_cgroup.c | 48 
 2 files changed, 49 insertions(+)

diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h
index 645c9fd..ac255e4 100644
--- a/include/linux/cgroup.h
+++ b/include/linux/cgroup.h
@@ -673,6 +673,7 @@ void cgroup_release_agent(struct work_struct *work);
 #ifdef CONFIG_VE
 int cgroup_mark_ve_roots(struct ve_struct *ve);
 void cgroup_unmark_ve_roots(struct ve_struct *ve);
+struct cgroup *cgroup_get_local_root(struct cgroup *cgrp);
 struct ve_struct *cgroup_get_ve_owner(struct cgroup *cgrp);
 #endif
 
diff --git a/security/device_cgroup.c b/security/device_cgroup.c
index d980020..f9d205f 100644
--- a/security/device_cgroup.c
+++ b/security/device_cgroup.c
@@ -304,12 +304,41 @@ static void set_majmin(char *str, unsigned m)
sprintf(str, "%u", m);
 }
 
+struct dev_exception_item *dev_exeption_next(struct list_head *head)
+{
+   return list_entry_rcu(head->next, struct dev_exception_item, list);
+}
+
+static bool dev_exceptions_equal(struct list_head *first, struct list_head 
*second)
+{
+   struct dev_exception_item *exf, *exs;
+
+   for (exf = dev_exeption_next(first->next),
+exs = dev_exeption_next(second->next);
+>list != first && >list != second;
+exf = dev_exeption_next(exf->list.next),
+exs = dev_exeption_next(exs->list.next)) {
+   /* Check that exceptions are equal */
+   if (exf->type != exs->type ||
+   exf->major != exs->major ||
+   exf->minor != exs->minor ||
+   exf->access != exs->access)
+   return false;
+   }
+
+   if (>list != first || >list != second)
+   return false;
+
+   return true;
+}
+
 static int devcgroup_seq_read(struct cgroup *cgroup, struct cftype *cft,
struct seq_file *m)
 {
struct dev_cgroup *devcgroup = cgroup_to_devcgroup(cgroup);
struct dev_exception_item *ex;
char maj[MAJMINLEN], min[MAJMINLEN], acc[ACCLEN];
+   struct dev_cgroup *root_cgrp;
 
rcu_read_lock();
/*
@@ -325,6 +354,25 @@ static int devcgroup_seq_read(struct cgroup *cgroup, 
struct cftype *cft,
seq_printf(m, "%c %s:%s %s\n", type_to_char(DEV_ALL),
   maj, min, acc);
} else {
+   /*
+* Fooling docker in CT again: if exceptions in ve are the same
+* as in ve root cgroup - show as if we allow everyting
+   

[Devel] [PATCH RHEL7 COMMIT] ms/net: rtnetlink: validate IFLA_MTU attribute in rtnl_create_link()

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit 1efe3f91436dea98a9e9c97bc4b0d96d20f34e4c
Author: Eric Dumazet 
Date:   Tue Dec 15 12:29:56 2020 +0300

ms/net: rtnetlink: validate IFLA_MTU attribute in rtnl_create_link()

rtnl_create_link() needs to apply dev->min_mtu and dev->max_mtu
checks that we apply in do_setlink()

Otherwise malicious users can crash the kernel, for example after
an integer overflow :

BUG: KASAN: use-after-free in memset include/linux/string.h:365 [inline]
BUG: KASAN: use-after-free in __alloc_skb+0x37b/0x5e0 net/core/skbuff.c:238
Write of size 32 at addr 88819f20b9c0 by task swapper/0/0

CPU: 0 PID: 0 Comm: swapper/0 Not tainted 5.5.0-rc1-syzkaller #0
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
Google 01/01/2011
Call Trace:
 
 __dump_stack lib/dump_stack.c:77 [inline]
 dump_stack+0x197/0x210 lib/dump_stack.c:118
 print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374
 __kasan_report.cold+0x1b/0x41 mm/kasan/report.c:506
 kasan_report+0x12/0x20 mm/kasan/common.c:639
 check_memory_region_inline mm/kasan/generic.c:185 [inline]
 check_memory_region+0x134/0x1a0 mm/kasan/generic.c:192
 memset+0x24/0x40 mm/kasan/common.c:108
 memset include/linux/string.h:365 [inline]
 __alloc_skb+0x37b/0x5e0 net/core/skbuff.c:238
 alloc_skb include/linux/skbuff.h:1049 [inline]
 alloc_skb_with_frags+0x93/0x590 net/core/skbuff.c:5664
 sock_alloc_send_pskb+0x7ad/0x920 net/core/sock.c:2242
 sock_alloc_send_skb+0x32/0x40 net/core/sock.c:2259
 mld_newpack+0x1d7/0x7f0 net/ipv6/mcast.c:1609
 add_grhead.isra.0+0x299/0x370 net/ipv6/mcast.c:1713
 add_grec+0x7db/0x10b0 net/ipv6/mcast.c:1844
 mld_send_cr net/ipv6/mcast.c:1970 [inline]
 mld_ifc_timer_expire+0x3d3/0x950 net/ipv6/mcast.c:2477
 call_timer_fn+0x1ac/0x780 kernel/time/timer.c:1404
 expire_timers kernel/time/timer.c:1449 [inline]
 __run_timers kernel/time/timer.c:1773 [inline]
 __run_timers kernel/time/timer.c:1740 [inline]
 run_timer_softirq+0x6c3/0x1790 kernel/time/timer.c:1786
 __do_softirq+0x262/0x98c kernel/softirq.c:292
 invoke_softirq kernel/softirq.c:373 [inline]
 irq_exit+0x19b/0x1e0 kernel/softirq.c:413
 exiting_irq arch/x86/include/asm/apic.h:536 [inline]
 smp_apic_timer_interrupt+0x1a3/0x610 arch/x86/kernel/apic/apic.c:1137
 apic_timer_interrupt+0xf/0x20 arch/x86/entry/entry_64.S:829
 
RIP: 0010:native_safe_halt+0xe/0x10 arch/x86/include/asm/irqflags.h:61
Code: 98 6b ea f9 eb 8a cc cc cc cc cc cc e9 07 00 00 00 0f 00 2d 44 1c 60 
00 f4 c3 66 90 e9 07 00 00 00 0f 00 2d 34 1c 60 00 fb f4  cc 55 48 89 e5 41 
57 41 56 41 55 41 54 53 e8 4e 5d 9a f9 e8 79
RSP: 0018:89807ce8 EFLAGS: 0286 ORIG_RAX: ff13
RAX: 113266ae RBX: 8987a1c0 RCX: 
RDX: dc00 RSI: 0006 RDI: 8987aa54
RBP: 89807d18 R08: 8987a1c0 R09: 
R10:  R11:  R12: dc00
R13: 8a799980 R14:  R15: 
 arch_cpu_idle+0xa/0x10 arch/x86/kernel/process.c:690
 default_idle_call+0x84/0xb0 kernel/sched/idle.c:94
 cpuidle_idle_call kernel/sched/idle.c:154 [inline]
 do_idle+0x3c8/0x6e0 kernel/sched/idle.c:269
 cpu_startup_entry+0x1b/0x20 kernel/sched/idle.c:361
 rest_init+0x23b/0x371 init/main.c:451
 arch_call_rest_init+0xe/0x1b
 start_kernel+0x904/0x943 init/main.c:784
 x86_64_start_reservations+0x29/0x2b arch/x86/kernel/head64.c:490
 x86_64_start_kernel+0x77/0x7b arch/x86/kernel/head64.c:471
 secondary_startup_64+0xa4/0xb0 arch/x86/kernel/head_64.S:242

The buggy address belongs to the page:
page:ea00067c82c0 refcount:0 mapcount:0 mapping: 
index:0x0
raw: 057ffe00 ea00067c82c8 ea00067c82c8 
raw:    
page dumped because: kasan: bad access detected

Memory state around the buggy address:
 88819f20b880: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 88819f20b900: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
>88819f20b980: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
   ^
 88819f20ba00: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff
 88819f20ba80: ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff ff

Fixes: 61e84623ace3 ("net: centralize net_device min/max MTU checking")
Signed-off-by: Eric Dumazet 
Reported-by: syzbot 
Signed-off-by: David S. Miller 

(cherry-picked from commit 

[Devel] [PATCH RHEL7 COMMIT] ve/net/core: allow to call setsockopt(SO_RCVBUFFORCE) from Containers

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit bd3e846de350fcff5cfdcd6133adb7c610b3a4af
Author: Konstantin Khorenko 
Date:   Tue Dec 15 12:30:04 2020 +0300

ve/net/core: allow to call setsockopt(SO_RCVBUFFORCE) from Containers

"nft" util (in CentOS 8 environment) does use setsockopt(SO_RCVBUFFORCE)
unconditionally, so we have to allow it from inside a Container.

At the same time we don't want to allow a Container to set too much
memory for a socket, so just treat SO_RCVBUFFORCE like SO_RCVBUF if
called inside a Container.

Simple rule to test:

  # NFT=/usr/sbin/nft ./run-tests.sh -v -g testcases/nft-f/0011manydefines_0

which fails inside a Container due to an insufficient rcv buffer, caused
by the failed
  setsockopt(3, SOL_SOCKET, SO_RCVBUFFORCE, [10561584], 4) = -1
  EPERM (Operation not permitted)

Signed-off-by: Konstantin Khorenko 
---
 net/core/sock.c | 7 ++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/net/core/sock.c b/net/core/sock.c
index 07ea42f..44e91c8 100644
--- a/net/core/sock.c
+++ b/net/core/sock.c
@@ -793,6 +793,7 @@ set_sndbuf:
goto set_sndbuf;
 
case SO_RCVBUF:
+unpriv_rcvbuf:
/* Don't error on this BSD doesn't and if you think
 * about it this is right. Otherwise apps have to
 * play 'guess the biggest size' games. RCVBUF/SNDBUF
@@ -824,11 +825,15 @@ set_rcvbuf:
break;
 
case SO_RCVBUFFORCE:
-   if (!capable(CAP_NET_ADMIN)) {
+   if (!ve_capable(CAP_NET_ADMIN)) {
ret = -EPERM;
break;
}
 
+   /* nft utility uses this sockopt in CentOS 8 env */
+   if (!ve_is_super(get_exec_env()))
+   goto unpriv_rcvbuf;
+
/* No negative values (to prevent underflow, as val will be
 * multiplied by 2).
 */
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/net: drop bogus skb with CHECKSUM_PARTIAL and offset beyond end of trimmed packet

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit dd590cd4c2f3c21197754f40d9f67165a7470dc5
Author: Vasily Averin 
Date:   Tue Dec 15 12:24:50 2020 +0300

ms/net: drop bogus skb with CHECKSUM_PARTIAL and offset beyond end of 
trimmed packet

syzbot reproduces BUG_ON in skb_checksum_help():
tun creates (bogus) skb with huge partial-checksummed area and
small ip packet inside. Then ip_rcv trims the skb based on size
of internal ip packet, after that csum offset points beyond the end of
the trimmed skb. Then checksum_tg() called via netfilter hook
triggers BUG_ON:

offset = skb_checksum_start_offset(skb);
BUG_ON(offset >= skb_headlen(skb));

To work around the problem this patch forces pskb_trim_rcsum_slow()
to return -EINVAL in described scenario. It allows its callers to
drop such kind of packets.

Link: 
https://syzkaller.appspot.com/bug?id=b419a5ca95062664fe1a60b764621eb4526e2cd0
Reported-by: syzbot+7010af67ced6105e5...@syzkaller.appspotmail.com
Signed-off-by: Vasily Averin 

Acked-by: Willem de Bruijn 
Link: 
https://lore.kernel.org/r/1b2494af-2c56-8ee2-7bc0-923fcad1c...@virtuozzo.com
Signed-off-by: Jakub Kicinski 

Now it is in net-next:
https://git.kernel.org/netdev/net-next/c/54970a2fbb67
https://jira.sw.ru/browse/PSBM-123062
Signed-off-by: Vasily Averin 
---
 include/linux/skbuff.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 296e734..f2c66b1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3108,6 +3108,13 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, 
unsigned int len)
return 0;
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
+   else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+   int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
+   int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
+
+   if (offset + sizeof(__sum16) > hdlen)
+   return -EINVAL;
+   }
return __pskb_trim(skb, len);
 }
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/net: Fix usage of pskb_trim_rcsum

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit 36a0097fafb6b00c33a9b5497eda51cafa6972aa
Author: Ross Lagerwall 
Date:   Tue Dec 15 12:24:33 2020 +0300

ms/net: Fix usage of pskb_trim_rcsum

In certain cases, pskb_trim_rcsum() may change skb pointers.
Reinitialize header pointers afterwards to avoid potential
use-after-frees. Add a note in the documentation of
pskb_trim_rcsum(). Found by KASAN.

Signed-off-by: Ross Lagerwall 
Signed-off-by: David S. Miller 

(cherry-picked from commit 6c57f0458022298e4da1729c67bd33ce41c14e7a)
https://jira.sw.ru/browse/PSBM-123062
Signed-off-by: Vasily Averin 
---
 drivers/net/ppp/pppoe.c  | 1 +
 include/linux/skbuff.h   | 1 +
 net/bridge/br_netfilter_ipv6.c   | 1 +
 net/bridge/netfilter/nft_reject_bridge.c | 1 +
 net/ipv4/ip_input.c  | 1 +
 5 files changed, 5 insertions(+)

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 15d3f44..3101720 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -442,6 +442,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device 
*dev,
if (pskb_trim_rcsum(skb, len))
goto drop;
 
+   ph = pppoe_hdr(skb);
pn = pppoe_pernet(dev_net(dev));
if (!pn) /* no VE_FEATURE_PPP */
goto drop;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c39936f..296e734 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3099,6 +3099,7 @@ static inline void *skb_push_rcsum(struct sk_buff *skb, 
unsigned int len)
  *
  * This is exactly the same as pskb_trim except that it ensures the
  * checksum of received packets are still valid after the operation.
+ * It can change skb pointers.
  */
 
 static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 2d8de1d..ca93162 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -132,6 +132,7 @@ int br_validate_ipv6(struct sk_buff *skb)
 IPSTATS_MIB_INDISCARDS);
goto drop;
}
+   hdr = ipv6_hdr(skb);
}
if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb))
goto drop;
diff --git a/net/bridge/netfilter/nft_reject_bridge.c 
b/net/bridge/netfilter/nft_reject_bridge.c
index 634068b..0315584 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -189,6 +189,7 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int 
hook)
pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
return false;
 
+   ip6h = ipv6_hdr(skb);
thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), , );
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2b97550..f537e32 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -450,6 +450,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, 
struct packet_type *pt,
goto drop;
}
 
+   iph = ip_hdr(skb);
skb->transport_header = skb->network_header + iph->ihl*4;
 
/* Remove any debris in the socket control block */
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/net: make skb_partial_csum_set() more robust against overflows

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit b18c22895955bd9f3153784fa4018c1ff5a5996f
Author: Eric Dumazet 
Date:   Tue Dec 15 12:24:41 2020 +0300

ms/net: make skb_partial_csum_set() more robust against overflows

syzbot managed to crash in skb_checksum_help() [1] :

BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

Root cause is the following check in skb_partial_csum_set()

if (unlikely(start > skb_headlen(skb)) ||
unlikely((int)start + off > skb_headlen(skb) - 2))
return false;

If skb_headlen(skb) is 1, then (skb_headlen(skb) - 2) becomes 0xffffffff
and the check fails to detect that ((int)start + off) is off the limit,
since the compare is unsigned.

When we fix that, then the first condition (start > skb_headlen(skb))
becomes obsolete.

Then we should also check that (skb_headroom(skb) + start) won't
overflow 16bit field.

[1]
kernel BUG at net/core/dev.c:2880!
invalid opcode:  [#1] PREEMPT SMP KASAN
CPU: 1 PID: 7330 Comm: syz-executor4 Not tainted 4.19.0-rc6+ #253
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS 
Google 01/01/2011
RIP: 0010:skb_checksum_help+0x9e3/0xbb0 net/core/dev.c:2880
Code: 85 00 ff ff ff 48 c1 e8 03 42 80 3c 28 00 0f 84 09 fb ff ff 48 8b bd 
00 ff ff ff e8 97 a8 b9 fb e9 f8 fa ff ff e8 2d 09 76 fb <0f> 0b 48 8b bd 28 ff 
ff ff e8 1f a8 b9 fb e9 b1 f6 ff ff 48 89 cf
RSP: 0018:8801d83a6f60 EFLAGS: 00010293
RAX: 8801b9834380 RBX: 8801b9f8d8c0 RCX: 8608c6d7
RDX:  RSI: 8608cc63 RDI: 0006
RBP: 8801d83a7068 R08: 8801b9834380 R09: 
R10: 8801d83a76d8 R11:  R12: 0001
R13: 00010001 R14:  R15: 00a8
FS:  7f1a66db5700() GS:8801daf0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 7f7d77f091b0 CR3: 0001ba252000 CR4: 001406e0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 skb_csum_hwoffload_help+0x8f/0xe0 net/core/dev.c:3269
 validate_xmit_skb+0xa2a/0xf30 net/core/dev.c:3312
 __dev_queue_xmit+0xc2f/0x3950 net/core/dev.c:3797
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3838
 packet_snd net/packet/af_packet.c:2928 [inline]
 packet_sendmsg+0x422d/0x64c0 net/packet/af_packet.c:2953

Fixes: 5ff8dda3035d ("net: Ensure partial checksum offset is inside the skb 
head")
Signed-off-by: Eric Dumazet 
Cc: Herbert Xu 
Reported-by: syzbot 
Signed-off-by: David S. Miller 

(cherry-picked from commit 52b5d6f5dcf0e5201392f7d417148ccb537dbf6f)
https://jira.sw.ru/browse/PSBM-123062
Signed-off-by: Vasily Averin 
---
 net/core/skbuff.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fa5ba0d..eef4100 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3961,14 +3961,16 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
  */
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
 {
-   if (unlikely(start > skb_headlen(skb)) ||
-   unlikely((int)start + off > skb_headlen(skb) - 2)) {
-   net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
-start, off, skb_headlen(skb));
+   u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
+   u32 csum_start = skb_headroom(skb) + (u32)start;
+
+   if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
+   net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u 
headlen=%u\n",
+start, off, skb_headroom(skb), 
skb_headlen(skb));
return false;
}
skb->ip_summed = CHECKSUM_PARTIAL;
-   skb->csum_start = skb_headroom(skb) + start;
+   skb->csum_start = csum_start;
skb->csum_offset = off;
skb_set_transport_header(skb, start);
return true;
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: xt_checksum: ignore gso skbs

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit 2bcf37acf916a4e7f6fed7400dce34a875d67058
Author: Florian Westphal 
Date:   Tue Dec 15 12:24:26 2020 +0300

ms/netfilter: xt_checksum: ignore gso skbs

Satish Patel reports a skb_warn_bad_offload() splat caused
by -j CHECKSUM rules:

-A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM

The CHECKSUM target has never worked with GSO skbs, and the above rule
makes no sense as kernel will handle checksum updates on transmit.

Unfortunately, there are 3rd party tools that install such rules, so we
cannot reject this from the config plane without potential breakage.

Amend Kconfig text to clarify that the CHECKSUM target is only useful
in virtualized environments, where old dhcp clients that use AF_PACKET
used to discard UDP packets with a 'bad' header checksum and add a
one-time warning in case such rule isn't restricted to UDP.

v2: check IP6T_F_PROTO flag before cmp (Michal Kubecek)

Reported-by: Satish Patel 
Reported-by: Markos Chandras 
Reported-by: Michal Kubecek 
Signed-off-by: Florian Westphal 
Reviewed-by: Michal Kubecek 
Signed-off-by: Pablo Neira Ayuso 

(cherry-picked from commit 10568f6c5761db24249c610c94d6e44d5505a0ba)
VvS: backported with minor context changes
https://jira.sw.ru/browse/PSBM-123062
Signed-off-by: Vasily Averin 
---
 net/netfilter/Kconfig   | 12 ++--
 net/netfilter/xt_CHECKSUM.c | 23 ++-
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 561b065..0b02434 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -622,13 +622,13 @@ config NETFILTER_XT_TARGET_CHECKSUM
depends on NETFILTER_ADVANCED
---help---
  This option adds a `CHECKSUM' target, which can be used in the 
iptables mangle
- table.
+ table to work around buggy DHCP clients in virtualized environments.
 
- You can use this target to compute and fill in the checksum in
- a packet that lacks a checksum.  This is particularly useful,
- if you need to work around old applications such as dhcp clients,
- that do not work well with checksum offloads, but don't want to 
disable
- checksum offload in your device.
+ Some old DHCP clients drop packets because they are not aware
+ that the checksum would normally be offloaded to hardware and
+ thus should be considered valid.
+ This target can be used to fill in the checksum using iptables
+ when such packets are sent via a virtual network device.
 
  To compile it as a module, choose M here.  If unsure, say N.
 
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
index 0f642ef..db286fc 100644
--- a/net/netfilter/xt_CHECKSUM.c
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -16,6 +16,9 @@
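 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_CHECKSUM.h>
 
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>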
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Michael S. Tsirkin ");
 MODULE_DESCRIPTION("Xtables: checksum modification");
@@ -25,7 +28,7 @@ MODULE_ALIAS("ip6t_CHECKSUM");
 static unsigned int
 checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
-   if (skb->ip_summed == CHECKSUM_PARTIAL)
+   if (skb->ip_summed == CHECKSUM_PARTIAL && !skb_is_gso(skb))
skb_checksum_help(skb);
 
return XT_CONTINUE;
@@ -34,6 +37,8 @@ checksum_tg(struct sk_buff *skb, const struct xt_action_param 
*par)
 static int checksum_tg_check(const struct xt_tgchk_param *par)
 {
const struct xt_CHECKSUM_info *einfo = par->targinfo;
+   const struct ip6t_ip6 *i6 = par->entryinfo;
+   const struct ipt_ip *i4 = par->entryinfo;
 
if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
pr_info("unsupported CHECKSUM operation %x\n", 
einfo->operation);
@@ -43,6 +48,22 @@ static int checksum_tg_check(const struct xt_tgchk_param 
*par)
pr_info("no CHECKSUM operation enabled\n");
return -EINVAL;
}
+
+   switch (par->family) {
+   case NFPROTO_IPV4:
+   if (i4->proto == IPPROTO_UDP &&
+   (i4->invflags & XT_INV_PROTO) == 0)
+   return 0;
+   break;
+   case NFPROTO_IPV6:
+   if ((i6->flags & IP6T_F_PROTO) &&
+   i6->proto == IPPROTO_UDP &&
+   (i6->invflags & XT_INV_PROTO) == 0)
+   return 0;
+   break;
+   }
+
+   pr_warn_once("CHECKSUM should be avoided.  If really needed, restrict 
with \"-p udp\" and only use in OUTPUT\n");
return 0;
 }
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: Replace spin_is_locked() with lockdep

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit feca123b54148d68b49ddf71efe31e189706e07c
Author: Lance Roy 
Date:   Tue Dec 15 12:22:24 2020 +0300

ms/netfilter: Replace spin_is_locked() with lockdep

lockdep_assert_held() is better suited to checking locking requirements,
since it won't get confused when someone else holds the lock. This is
also a step towards possibly removing spin_is_locked().

Signed-off-by: Lance Roy 
Cc: Pablo Neira Ayuso 
Cc: Jozsef Kadlecsik 
Cc: Florian Westphal 
Cc: "David S. Miller" 
Cc: 
Cc: 
Cc: 
Acked-by: Jozsef Kadlecsik 
Signed-off-by: Pablo Neira Ayuso 

(cherry-picked from commit 4b87dd54be21ad611a1c740f9df0c4376d496e09)
https://jira.sw.ru/browse/PSBM-123086
Signed-off-by: Vasily Averin 
---
 net/netfilter/ipset/ip_set_hash_gen.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/net/netfilter/ipset/ip_set_hash_gen.h 
b/net/netfilter/ipset/ip_set_hash_gen.h
index 225348a..a437adf 100644
--- a/net/netfilter/ipset/ip_set_hash_gen.h
+++ b/net/netfilter/ipset/ip_set_hash_gen.h
@@ -15,7 +15,7 @@
 
 #define __ipset_dereference_protected(p, c)rcu_dereference_protected(p, c)
 #define ipset_dereference_protected(p, set) \
-   __ipset_dereference_protected(p, spin_is_locked(&(set)->lock))
+   __ipset_dereference_protected(p, lockdep_is_held(&(set)->lock))
 
 #define rcu_dereference_bh_nfnl(p) rcu_dereference_bh_check(p, 1)
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: ipset: Convert timers to use timer_setup()

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit 121cca10e432208a2de0f782fcdf15190a4937c8
Author: Kees Cook 
Date:   Tue Dec 15 12:22:17 2020 +0300

ms/netfilter: ipset: Convert timers to use timer_setup()

In preparation for unconditionally passing the struct timer_list pointer to
all timer callbacks, switch to using the new timer_setup() and from_timer()
to pass the timer pointer explicitly. This introduces a pointer back to the
struct ip_set, which is used instead of the struct timer_list .data field.

Cc: Pablo Neira Ayuso 
Cc: Jozsef Kadlecsik 
Cc: Florian Westphal 
Cc: "David S. Miller" 
Cc: Stephen Hemminger 
Cc: simran singhal 
Cc: Muhammad Falak R Wani 
Cc: netfilter-de...@vger.kernel.org
Cc: coret...@netfilter.org
Cc: net...@vger.kernel.org
Signed-off-by: Kees Cook 
Signed-off-by: David S. Miller 

(cherry-picked from commit a92c5751b97cca55d8140ec0bf26a53c7e00bfa5)
https://jira.sw.ru/browse/PSBM-123086
Signed-off-by: Vasily Averin 
---
 net/netfilter/ipset/ip_set_bitmap_gen.h   | 10 +-
 net/netfilter/ipset/ip_set_bitmap_ip.c|  2 ++
 net/netfilter/ipset/ip_set_bitmap_ipmac.c |  2 ++
 net/netfilter/ipset/ip_set_bitmap_port.c  |  2 ++
 net/netfilter/ipset/ip_set_hash_gen.h | 12 +++-
 net/netfilter/ipset/ip_set_list_set.c | 12 +++-
 6 files changed, 25 insertions(+), 15 deletions(-)

diff --git a/net/netfilter/ipset/ip_set_bitmap_gen.h 
b/net/netfilter/ipset/ip_set_bitmap_gen.h
index a77e3f3..257ca39 100644
--- a/net/netfilter/ipset/ip_set_bitmap_gen.h
+++ b/net/netfilter/ipset/ip_set_bitmap_gen.h
@@ -37,11 +37,11 @@
 #define get_ext(set, map, id)  ((map)->extensions + ((set)->dsize * (id)))
 
 static void
-mtype_gc_init(struct ip_set *set, void (*gc)(unsigned long ul_set))
+mtype_gc_init(struct ip_set *set, void (*gc)(struct timer_list *t))
 {
struct mtype *map = set->data;
 
-   setup_timer(&map->gc, gc, (unsigned long)set);
+   timer_setup(&map->gc, gc, 0);
mod_timer(&map->gc, jiffies + IPSET_GC_PERIOD(set->timeout) * HZ);
 }
 
@@ -266,10 +266,10 @@ out:
 }
 
 static void
-mtype_gc(unsigned long ul_set)
+mtype_gc(struct timer_list *t)
 {
-   struct ip_set *set = (struct ip_set *)ul_set;
-   struct mtype *map = set->data;
+   struct mtype *map = from_timer(map, t, gc);
+   struct ip_set *set = map->set;
void *x;
u32 id;
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_ip.c 
b/net/netfilter/ipset/ip_set_bitmap_ip.c
index 9f4f28d..488d6d0 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ip.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ip.c
@@ -48,6 +48,7 @@ struct bitmap_ip {
size_t memsize; /* members size */
u8 netmask; /* subnet netmask */
struct timer_list gc;   /* garbage collection */
+   struct ip_set *set; /* attached to this ip_set */
unsigned char extensions[0] /* data extensions */
__aligned(__alignof__(u64));
 };
@@ -232,6 +233,7 @@ init_map_ip(struct ip_set *set, struct bitmap_ip *map,
map->netmask = netmask;
set->timeout = IPSET_NO_TIMEOUT;
 
+   map->set = set;
set->data = map;
set->family = NFPROTO_IPV4;
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_ipmac.c 
b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
index ae927b3..794e033 100644
--- a/net/netfilter/ipset/ip_set_bitmap_ipmac.c
+++ b/net/netfilter/ipset/ip_set_bitmap_ipmac.c
@@ -52,6 +52,7 @@ struct bitmap_ipmac {
u32 elements;   /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc;   /* garbage collector */
+   struct ip_set *set; /* attached to this ip_set */
unsigned char extensions[0] /* MAC + data extensions */
__aligned(__alignof__(u64));
 };
@@ -310,6 +311,7 @@ init_map_ipmac(struct ip_set *set, struct bitmap_ipmac *map,
map->elements = elements;
set->timeout = IPSET_NO_TIMEOUT;
 
+   map->set = set;
set->data = map;
set->family = NFPROTO_IPV4;
 
diff --git a/net/netfilter/ipset/ip_set_bitmap_port.c 
b/net/netfilter/ipset/ip_set_bitmap_port.c
index c6e1ebf..b561ca8 100644
--- a/net/netfilter/ipset/ip_set_bitmap_port.c
+++ b/net/netfilter/ipset/ip_set_bitmap_port.c
@@ -40,6 +40,7 @@ struct bitmap_port {
u32 elements;   /* number of max elements in the set */
size_t memsize; /* members size */
struct timer_list gc;   /* garbage collection */
+   struct ip_set *set; /* attached to this ip_set */
unsigned char extensions[0] /* data extensions */
__aligned(__alignof__(u64));
 };
@@ -214,6 +215,7 @@ init_map_port(struct ip_set *set, struct bitmap_port *map,
map->last_port = last_port;
   

[Devel] [PATCH RHEL7 COMMIT] ms/netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports

2020-12-15 Thread Vasily Averin
The commit is pushed to "branch-rh7-3.10.0-1160.6.1.vz7.171.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-1160.6.1.vz7.171.4
-->
commit f382244983c50d1f353741e0ebaf71e162cfa8c6
Author: Jozsef Kadlecsik 
Date:   Tue Dec 15 12:22:33 2020 +0300

ms/netfilter: ipset: Fix "INFO: rcu detected stall in hash_xxx" reports

In the case of huge hash:* types of sets, due to the single spinlock of
a set the processing of the whole set under spinlock protection could take
too long.

There were four places where the whole hash table of the set was processed
from bucket to bucket under holding the spinlock:

- During resizing a set, the original set was locked to exclude kernel side
  add/del element operations (userspace add/del is excluded by the
  nfnetlink mutex). The original set is actually just read during the
  resize, so the spinlocking is replaced with rcu locking of regions.
  However, thus there can be parallel kernel side add/del of entries.
  In order not to lose those operations a backlog is added and replayed
  after the successful resize.
- Garbage collection of timed out entries was also protected by the 
spinlock.
  In order not to lock too long, region locking is introduced and a single
  region is processed in one gc go. Also, the simple timer based gc running
  is replaced with a workqueue based solution. The internal book-keeping
  (number of elements, size of extensions) is moved to region level due to
  the region locking.
- Adding elements: when the max number of the elements is reached, the gc
  was called to evict the timed out entries. The new approach is that the gc
  is called just for the matching region, assuming that if the region
  (proportionally) seems to be full, then the whole set does. We could scan
  the other regions to check every entry under rcu locking, but for huge
  sets it'd mean a slowdown at adding elements.
- Listing the set header data: when the set was defined with timeout
  support, the garbage collector was called to clean up timed out entries
  to get the correct element numbers and set size values. Now the set is
  scanned to check non-timed out entries, without actually calling the gc
  for the whole set.

Thanks to Florian Westphal for helping me to solve the SOFTIRQ-safe ->
SOFTIRQ-unsafe lock order issues during working on the patch.

Reported-by: syzbot+4b0e9d4ff3cf11783...@syzkaller.appspotmail.com
Reported-by: syzbot+c27b8d5010f45c666...@syzkaller.appspotmail.com
Reported-by: syzbot+68a806795ac89df3a...@syzkaller.appspotmail.com
Fixes: 23c42a403a9c ("netfilter: ipset: Introduction of new commands and 
protocol version 7")
Signed-off-by: Jozsef Kadlecsik 

(cherry-picked from f66ee0410b1c3481ee75e5db9b34547b4d582465)
VvS changes: minor context changes because of skipped backport of
 bd96b4c7 "netfilter: inline four headers files into another one."
https://jira.sw.ru/browse/PSBM-123086
Signed-off-by: Vasily Averin 
---
 include/linux/netfilter/ipset/ip_set.h |  11 +-
 net/netfilter/ipset/ip_set_core.c  |  34 +-
 net/netfilter/ipset/ip_set_hash_gen.h  | 633 +++--
 3 files changed, 472 insertions(+), 206 deletions(-)

diff --git a/include/linux/netfilter/ipset/ip_set.h 
b/include/linux/netfilter/ipset/ip_set.h
index f2e1e6b..471363b 100644
--- a/include/linux/netfilter/ipset/ip_set.h
+++ b/include/linux/netfilter/ipset/ip_set.h
@@ -124,6 +124,7 @@ struct ip_set_ext {
u32 timeout;
u8 packets_op;
u8 bytes_op;
+   bool target;
 };
 
 struct ip_set;
@@ -190,6 +191,14 @@ struct ip_set_type_variant {
/* Return true if "b" set is the same as "a"
 * according to the create set parameters */
bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
+   /* Region-locking is used */
+   bool region_lock;
+};
+
+struct ip_set_region {
+   spinlock_t lock;/* Region lock */
+   size_t ext_size;/* Size of the dynamic extensions */
+   u32 elements;   /* Number of elements vs timeout */
 };
 
 /* The core set type structure */
@@ -464,7 +473,7 @@ bitmap_bytes(u32 a, u32 b)
 #include 
 
 #define IP_SET_INIT_KEXT(skb, opt, set)\
-   { .bytes = (skb)->len, .packets = 1,\
+   { .bytes = (skb)->len, .packets = 1, .target = true,\
  .timeout = ip_set_adt_opt_timeout(opt, set) }
 
 #define IP_SET_INIT_UEXT(set)  \
diff --git a/net/netfilter/ipset/ip_set_core.c 
b/net/netfilter/ipset/ip_set_core.c
index b067879..d47d978 100644
--- a/net/netfilter/ipset/ip_set_core.c
+++ b/net/netfilter/ipset/ip_set_core.c
@@ -559,6 +559,20 @@ ip_set_rcu_get(struct net *net, ip_set_id_t index)
return set;
 }
 
+static inline void
+ip_set_lock(struct 

[Devel] [PATCH RH7 4/4] ms/net: drop bogus skb with CHECKSUM_PARTIAL and offset beyond end of trimmed packet

2020-12-15 Thread Vasily Averin
syzbot reproduces BUG_ON in skb_checksum_help():
tun creates (bogus) skb with huge partial-checksummed area and
small ip packet inside. Then ip_rcv trims the skb based on size
of internal ip packet, after that the csum offset points beyond the end
of the trimmed skb. Then checksum_tg() called via netfilter hook
triggers BUG_ON:

offset = skb_checksum_start_offset(skb);
BUG_ON(offset >= skb_headlen(skb));

To work around the problem this patch forces pskb_trim_rcsum_slow()
to return -EINVAL in described scenario. It allows its callers to
drop such kind of packets.

Link: 
https://syzkaller.appspot.com/bug?id=b419a5ca95062664fe1a60b764621eb4526e2cd0
Reported-by: syzbot+7010af67ced6105e5...@syzkaller.appspotmail.com
Signed-off-by: Vasily Averin 
Acked-by: Willem de Bruijn 
Link: 
https://lore.kernel.org/r/1b2494af-2c56-8ee2-7bc0-923fcad1c...@virtuozzo.com
Signed-off-by: Jakub Kicinski 

Now it is in net-next:
https://git.kernel.org/netdev/net-next/c/54970a2fbb67
https://jira.sw.ru/browse/PSBM-123062
Signed-off-by: Vasily Averin 
---
 include/linux/skbuff.h | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index 296e734..f2c66b1 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3108,6 +3108,13 @@ static inline int pskb_trim_rcsum(struct sk_buff *skb, 
unsigned int len)
return 0;
if (skb->ip_summed == CHECKSUM_COMPLETE)
skb->ip_summed = CHECKSUM_NONE;
+   else if (skb->ip_summed == CHECKSUM_PARTIAL) {
+   int hdlen = (len > skb_headlen(skb)) ? skb_headlen(skb) : len;
+   int offset = skb_checksum_start_offset(skb) + skb->csum_offset;
+
+   if (offset + sizeof(__sum16) > hdlen)
+   return -EINVAL;
+   }
return __pskb_trim(skb, len);
 }
 
-- 
1.8.3.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RH7 3/4] ms/net: make skb_partial_csum_set() more robust against overflows

2020-12-15 Thread Vasily Averin
From: Eric Dumazet 

syzbot managed to crash in skb_checksum_help() [1] :

BUG_ON(offset + sizeof(__sum16) > skb_headlen(skb));

Root cause is the following check in skb_partial_csum_set()

if (unlikely(start > skb_headlen(skb)) ||
unlikely((int)start + off > skb_headlen(skb) - 2))
return false;

If skb_headlen(skb) is 1, then (skb_headlen(skb) - 2) becomes 0xffffffff
and the check fails to detect that ((int)start + off) is off the limit,
since the compare is unsigned.

When we fix that, then the first condition (start > skb_headlen(skb))
becomes obsolete.

Then we should also check that (skb_headroom(skb) + start) won't
overflow the 16-bit field.

[1]
kernel BUG at net/core/dev.c:2880!
invalid opcode:  [#1] PREEMPT SMP KASAN
CPU: 1 PID: 7330 Comm: syz-executor4 Not tainted 4.19.0-rc6+ #253
Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 
01/01/2011
RIP: 0010:skb_checksum_help+0x9e3/0xbb0 net/core/dev.c:2880
Code: 85 00 ff ff ff 48 c1 e8 03 42 80 3c 28 00 0f 84 09 fb ff ff 48 8b bd 00 
ff ff ff e8 97 a8 b9 fb e9 f8 fa ff ff e8 2d 09 76 fb <0f> 0b 48 8b bd 28 ff ff 
ff e8 1f a8 b9 fb e9 b1 f6 ff ff 48 89 cf
RSP: 0018:8801d83a6f60 EFLAGS: 00010293
RAX: 8801b9834380 RBX: 8801b9f8d8c0 RCX: 8608c6d7
RDX:  RSI: 8608cc63 RDI: 0006
RBP: 8801d83a7068 R08: 8801b9834380 R09: 
R10: 8801d83a76d8 R11:  R12: 0001
R13: 00010001 R14:  R15: 00a8
FS:  7f1a66db5700() GS:8801daf0() knlGS:
CS:  0010 DS:  ES:  CR0: 80050033
CR2: 7f7d77f091b0 CR3: 0001ba252000 CR4: 001406e0
DR0:  DR1:  DR2: 
DR3:  DR6: fffe0ff0 DR7: 0400
Call Trace:
 skb_csum_hwoffload_help+0x8f/0xe0 net/core/dev.c:3269
 validate_xmit_skb+0xa2a/0xf30 net/core/dev.c:3312
 __dev_queue_xmit+0xc2f/0x3950 net/core/dev.c:3797
 dev_queue_xmit+0x17/0x20 net/core/dev.c:3838
 packet_snd net/packet/af_packet.c:2928 [inline]
 packet_sendmsg+0x422d/0x64c0 net/packet/af_packet.c:2953

Fixes: 5ff8dda3035d ("net: Ensure partial checksum offset is inside the skb 
head")
Signed-off-by: Eric Dumazet 
Cc: Herbert Xu 
Reported-by: syzbot 
Signed-off-by: David S. Miller 

(cherry-picked from commit 52b5d6f5dcf0e5201392f7d417148ccb537dbf6f)
https://jira.sw.ru/browse/PSBM-123062
Signed-off-by: Vasily Averin 
---
 net/core/skbuff.c | 12 +++-
 1 file changed, 7 insertions(+), 5 deletions(-)

diff --git a/net/core/skbuff.c b/net/core/skbuff.c
index fa5ba0d..eef4100 100644
--- a/net/core/skbuff.c
+++ b/net/core/skbuff.c
@@ -3961,14 +3961,16 @@ EXPORT_SYMBOL_GPL(skb_complete_wifi_ack);
  */
 bool skb_partial_csum_set(struct sk_buff *skb, u16 start, u16 off)
 {
-   if (unlikely(start > skb_headlen(skb)) ||
-   unlikely((int)start + off > skb_headlen(skb) - 2)) {
-   net_warn_ratelimited("bad partial csum: csum=%u/%u len=%u\n",
-start, off, skb_headlen(skb));
+   u32 csum_end = (u32)start + (u32)off + sizeof(__sum16);
+   u32 csum_start = skb_headroom(skb) + (u32)start;
+
+   if (unlikely(csum_start > U16_MAX || csum_end > skb_headlen(skb))) {
+   net_warn_ratelimited("bad partial csum: csum=%u/%u headroom=%u 
headlen=%u\n",
+start, off, skb_headroom(skb), 
skb_headlen(skb));
return false;
}
skb->ip_summed = CHECKSUM_PARTIAL;
-   skb->csum_start = skb_headroom(skb) + start;
+   skb->csum_start = csum_start;
skb->csum_offset = off;
skb_set_transport_header(skb, start);
return true;
-- 
1.8.3.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RH7 2/4] ms/net: Fix usage of pskb_trim_rcsum

2020-12-15 Thread Vasily Averin
From: Ross Lagerwall 

In certain cases, pskb_trim_rcsum() may change skb pointers.
Reinitialize header pointers afterwards to avoid potential
use-after-frees. Add a note in the documentation of
pskb_trim_rcsum(). Found by KASAN.

Signed-off-by: Ross Lagerwall 
Signed-off-by: David S. Miller 

(cherry-picked from commit 6c57f0458022298e4da1729c67bd33ce41c14e7a)
https://jira.sw.ru/browse/PSBM-123062
Signed-off-by: Vasily Averin 
---
 drivers/net/ppp/pppoe.c  | 1 +
 include/linux/skbuff.h   | 1 +
 net/bridge/br_netfilter_ipv6.c   | 1 +
 net/bridge/netfilter/nft_reject_bridge.c | 1 +
 net/ipv4/ip_input.c  | 1 +
 5 files changed, 5 insertions(+)

diff --git a/drivers/net/ppp/pppoe.c b/drivers/net/ppp/pppoe.c
index 15d3f44..3101720 100644
--- a/drivers/net/ppp/pppoe.c
+++ b/drivers/net/ppp/pppoe.c
@@ -442,6 +442,7 @@ static int pppoe_rcv(struct sk_buff *skb, struct net_device 
*dev,
if (pskb_trim_rcsum(skb, len))
goto drop;
 
+   ph = pppoe_hdr(skb);
pn = pppoe_pernet(dev_net(dev));
if (!pn) /* no VE_FEATURE_PPP */
goto drop;
diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h
index c39936f..296e734 100644
--- a/include/linux/skbuff.h
+++ b/include/linux/skbuff.h
@@ -3099,6 +3099,7 @@ static inline void *skb_push_rcsum(struct sk_buff *skb, 
unsigned int len)
  *
  * This is exactly the same as pskb_trim except that it ensures the
  * checksum of received packets are still valid after the operation.
+ * It can change skb pointers.
  */
 
 static inline int pskb_trim_rcsum(struct sk_buff *skb, unsigned int len)
diff --git a/net/bridge/br_netfilter_ipv6.c b/net/bridge/br_netfilter_ipv6.c
index 2d8de1d..ca93162 100644
--- a/net/bridge/br_netfilter_ipv6.c
+++ b/net/bridge/br_netfilter_ipv6.c
@@ -132,6 +132,7 @@ int br_validate_ipv6(struct sk_buff *skb)
 IPSTATS_MIB_INDISCARDS);
goto drop;
}
+   hdr = ipv6_hdr(skb);
}
if (hdr->nexthdr == NEXTHDR_HOP && br_nf_check_hbh_len(skb))
goto drop;
diff --git a/net/bridge/netfilter/nft_reject_bridge.c 
b/net/bridge/netfilter/nft_reject_bridge.c
index 634068b..0315584 100644
--- a/net/bridge/netfilter/nft_reject_bridge.c
+++ b/net/bridge/netfilter/nft_reject_bridge.c
@@ -189,6 +189,7 @@ static bool reject6_br_csum_ok(struct sk_buff *skb, int 
hook)
pskb_trim_rcsum(skb, ntohs(ip6h->payload_len) + sizeof(*ip6h)))
return false;
 
+   ip6h = ipv6_hdr(skb);
thoff = ipv6_skip_exthdr(skb, ((u8*)(ip6h+1) - skb->data), &proto, &fo);
if (thoff < 0 || thoff >= skb->len || (fo & htons(~0x7)) != 0)
return false;
diff --git a/net/ipv4/ip_input.c b/net/ipv4/ip_input.c
index 2b97550..f537e32 100644
--- a/net/ipv4/ip_input.c
+++ b/net/ipv4/ip_input.c
@@ -450,6 +450,7 @@ int ip_rcv(struct sk_buff *skb, struct net_device *dev, 
struct packet_type *pt,
goto drop;
}
 
+   iph = ip_hdr(skb);
skb->transport_header = skb->network_header + iph->ihl*4;
 
/* Remove any debris in the socket control block */
-- 
1.8.3.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RH7 1/4] ms/netfilter: xt_checksum: ignore gso skbs

2020-12-15 Thread Vasily Averin
From: Florian Westphal 

Satish Patel reports a skb_warn_bad_offload() splat caused
by -j CHECKSUM rules:

-A POSTROUTING -p tcp -m tcp --sport 80 -j CHECKSUM

The CHECKSUM target has never worked with GSO skbs, and the above rule
makes no sense as kernel will handle checksum updates on transmit.

Unfortunately, there are 3rd party tools that install such rules, so we
cannot reject this from the config plane without potential breakage.

Amend Kconfig text to clarify that the CHECKSUM target is only useful
in virtualized environments, where old dhcp clients that use AF_PACKET
used to discard UDP packets with a 'bad' header checksum and add a
one-time warning in case such rule isn't restricted to UDP.

v2: check IP6T_F_PROTO flag before cmp (Michal Kubecek)

Reported-by: Satish Patel 
Reported-by: Markos Chandras 
Reported-by: Michal Kubecek 
Signed-off-by: Florian Westphal 
Reviewed-by: Michal Kubecek 
Signed-off-by: Pablo Neira Ayuso 

(cherry-picked from commit 10568f6c5761db24249c610c94d6e44d5505a0ba)
VvS: backported with minor context changes
https://jira.sw.ru/browse/PSBM-123062
Signed-off-by: Vasily Averin 
---
 net/netfilter/Kconfig   | 12 ++--
 net/netfilter/xt_CHECKSUM.c | 23 ++-
 2 files changed, 28 insertions(+), 7 deletions(-)

diff --git a/net/netfilter/Kconfig b/net/netfilter/Kconfig
index 561b065..0b02434 100644
--- a/net/netfilter/Kconfig
+++ b/net/netfilter/Kconfig
@@ -622,13 +622,13 @@ config NETFILTER_XT_TARGET_CHECKSUM
depends on NETFILTER_ADVANCED
---help---
  This option adds a `CHECKSUM' target, which can be used in the 
iptables mangle
- table.
+ table to work around buggy DHCP clients in virtualized environments.
 
- You can use this target to compute and fill in the checksum in
- a packet that lacks a checksum.  This is particularly useful,
- if you need to work around old applications such as dhcp clients,
- that do not work well with checksum offloads, but don't want to 
disable
- checksum offload in your device.
+ Some old DHCP clients drop packets because they are not aware
+ that the checksum would normally be offloaded to hardware and
+ thus should be considered valid.
+ This target can be used to fill in the checksum using iptables
+ when such packets are sent via a virtual network device.
 
  To compile it as a module, choose M here.  If unsure, say N.
 
diff --git a/net/netfilter/xt_CHECKSUM.c b/net/netfilter/xt_CHECKSUM.c
index 0f642ef..db286fc 100644
--- a/net/netfilter/xt_CHECKSUM.c
+++ b/net/netfilter/xt_CHECKSUM.c
@@ -16,6 +16,9 @@
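 #include <linux/netfilter/x_tables.h>
 #include <linux/netfilter/xt_CHECKSUM.h>
 
+#include <linux/netfilter_ipv4/ip_tables.h>
+#include <linux/netfilter_ipv6/ip6_tables.h>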
+
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Michael S. Tsirkin ");
 MODULE_DESCRIPTION("Xtables: checksum modification");
@@ -25,7 +28,7 @@ MODULE_ALIAS("ip6t_CHECKSUM");
 static unsigned int
 checksum_tg(struct sk_buff *skb, const struct xt_action_param *par)
 {
-   if (skb->ip_summed == CHECKSUM_PARTIAL)
+   if (skb->ip_summed == CHECKSUM_PARTIAL && !skb_is_gso(skb))
skb_checksum_help(skb);
 
return XT_CONTINUE;
@@ -34,6 +37,8 @@ checksum_tg(struct sk_buff *skb, const struct xt_action_param 
*par)
 static int checksum_tg_check(const struct xt_tgchk_param *par)
 {
const struct xt_CHECKSUM_info *einfo = par->targinfo;
+   const struct ip6t_ip6 *i6 = par->entryinfo;
+   const struct ipt_ip *i4 = par->entryinfo;
 
if (einfo->operation & ~XT_CHECKSUM_OP_FILL) {
pr_info("unsupported CHECKSUM operation %x\n", 
einfo->operation);
@@ -43,6 +48,22 @@ static int checksum_tg_check(const struct xt_tgchk_param 
*par)
pr_info("no CHECKSUM operation enabled\n");
return -EINVAL;
}
+
+   switch (par->family) {
+   case NFPROTO_IPV4:
+   if (i4->proto == IPPROTO_UDP &&
+   (i4->invflags & XT_INV_PROTO) == 0)
+   return 0;
+   break;
+   case NFPROTO_IPV6:
+   if ((i6->flags & IP6T_F_PROTO) &&
+   i6->proto == IPPROTO_UDP &&
+   (i6->invflags & XT_INV_PROTO) == 0)
+   return 0;
+   break;
+   }
+
+   pr_warn_once("CHECKSUM should be avoided.  If really needed, restrict 
with \"-p udp\" and only use in OUTPUT\n");
return 0;
 }
 
-- 
1.8.3.1

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel