[Devel] [PATCH rh7] netfilter: Allow xt_owner in any user namespace

2017-10-13 Thread Andrei Vagin
From: "Eric W. Biederman" 

ML: 9847371a84b0be330f4bc4aaa98904101ee8573d
https://jira.sw.ru/browse/PSBM-69409

Making this work is a little tricky as it really isn't kosher to
change the xt_owner_match_info in a check function.

Without changing xt_owner_match_info we need to know the user
namespace the uids and gids are specified in.  In the common case
net->user_ns == current_user_ns().  Verify net->user_ns ==
current_user_ns() in owner_check so we can later assume it in
owner_mt.

In owner_check also verify that all of the uids and gids specified are
in net->user_ns and that the expected min/max relationship exists
between the uids and gids in xt_owner_match_info.

In owner_mt get the network namespace from the outgoing socket, as this
must be the same network namespace as the netfilter rules, and use that
network namespace to find the user namespace the uids and gids in
xt_owner_match_info are encoded in.  Then convert from their encoded
form into the kernel's internal format for uids and gids and perform the
owner match.

Similar to ping_group_range, this code does not try to detect
noncontiguous UID/GID ranges.
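
To make that last caveat concrete, here is a hedged sketch (hypothetical
uid_map values, not taken from this patch) of a range that the new
owner_check accepts even though it spans kernel uids the container does
not own:

	/* Suppose the container's uid_map is noncontiguous:
	 *   0    100000  500   (ct uids   0..499 -> kuids 100000..100499)
	 *   500  300000  500   (ct uids 500..999 -> kuids 300000..300499)
	 * and a rule asks for --uid-owner 0-999.
	 */
	kuid_t uid_min = make_kuid(net->user_ns, 0);	/* kuid 100000 */
	kuid_t uid_max = make_kuid(net->user_ns, 999);	/* kuid 300499 */

	/* Both ends are valid and ordered, so owner_check() accepts the
	 * rule, but owner_mt() will then also match kuids 100500..299999,
	 * which are not mapped into the container at all - the same
	 * limitation ping_group_range has.
	 */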

Signed-off-by: "Eric W. Biederman" 
Signed-off-by: Kevin Cernekee 
Signed-off-by: Pablo Neira Ayuso 
Signed-off-by: Andrei Vagin 
---
 net/netfilter/xt_owner.c | 41 +++--
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c
index 31dec4a..1744f78 100644
--- a/net/netfilter/xt_owner.c
+++ b/net/netfilter/xt_owner.c
@@ -80,11 +80,39 @@ owner_mt6_v0(const struct sk_buff *skb, struct xt_action_param *par)
 static int owner_check(const struct xt_mtchk_param *par)
 {
struct xt_owner_match_info *info = par->matchinfo;
+   struct net *net = par->net;
 
-   /* For now only allow adding matches from the initial user namespace */
+   /* Only allow the common case where the userns of the writer
+* matches the userns of the network namespace.
+*/
if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) &&
-   !current_user_ns_initial())
+   (current_user_ns() != net->user_ns))
return -EINVAL;
+
+   /* Ensure the uids are valid */
+   if (info->match & XT_OWNER_UID) {
+   kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
+   kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
+
+   if (!uid_valid(uid_min) || !uid_valid(uid_max) ||
+   (info->uid_max < info->uid_min) ||
+   uid_lt(uid_max, uid_min)) {
+   return -EINVAL;
+   }
+   }
+
+   /* Ensure the gids are valid */
+   if (info->match & XT_OWNER_GID) {
+   kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
+   kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
+
+   if (!gid_valid(gid_min) || !gid_valid(gid_max) ||
+   (info->gid_max < info->gid_min) ||
+   gid_lt(gid_max, gid_min)) {
+   return -EINVAL;
+   }
+   }
+
return 0;
 }
 
@@ -93,6 +121,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
 {
const struct xt_owner_match_info *info = par->matchinfo;
const struct file *filp;
+   struct net *net = dev_net(par->in ? par->in : par->out);
 
if (skb->sk == NULL || skb->sk->sk_socket == NULL)
return (info->match ^ info->invert) == 0;
@@ -109,8 +138,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
   (XT_OWNER_UID | XT_OWNER_GID)) == 0;
 
if (info->match & XT_OWNER_UID) {
-   kuid_t uid_min = make_kuid(ve_init_user_ns(), info->uid_min);
-   kuid_t uid_max = make_kuid(ve_init_user_ns(), info->uid_max);
+   kuid_t uid_min = make_kuid(net->user_ns, info->uid_min);
+   kuid_t uid_max = make_kuid(net->user_ns, info->uid_max);
if ((uid_gte(filp->f_cred->fsuid, uid_min) &&
 uid_lte(filp->f_cred->fsuid, uid_max)) ^
!(info->invert & XT_OWNER_UID))
@@ -118,8 +147,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par)
}
 
if (info->match & XT_OWNER_GID) {
-   kgid_t gid_min = make_kgid(ve_init_user_ns(), info->gid_min);
-   kgid_t gid_max = make_kgid(ve_init_user_ns(), info->gid_max);
+   kgid_t gid_min = make_kgid(net->user_ns, info->gid_min);
+   kgid_t gid_max = make_kgid(net->user_ns, info->gid_max);
if ((gid_gte(filp->f_cred->fsgid, gid_min) &&
 gid_lte(filp->f_cred->fsgid, gid_max)) ^
!(info->invert & XT_OWNER_GID))
-- 
1.8.3.1


[Devel] [PATCH] venet: destroy VE IP on venet destruction if NFS is enabled

2017-10-13 Thread Stanislav Kinsburskiy
We skip VE IP destruction in the shutdown hook if NFS is enabled in the CT
(to allow NFS mounts to disappear).
Thus we have to destroy it together with the venet device.

https://jira.sw.ru/browse/PSBM-75120

Signed-off-by: Stanislav Kinsburskiy 
---
 drivers/net/venetdev.c |7 +--
 1 file changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c
index 7a546cc..fad232b 100644
--- a/drivers/net/venetdev.c
+++ b/drivers/net/venetdev.c
@@ -759,9 +759,12 @@ static void venet_dellink(struct net_device *dev, struct list_head *head)
struct ve_struct *env = dev->nd_net->owner_ve;
 
/* We check ve_netns to avoid races with veip SHUTDOWN hook, called from
-* ve_exit_ns()
+* ve_exit_ns().
+* Also, the veip SHUTDOWN hook skips veip destruction if the container
+* has VE_FEATURE_NFS enabled. Thus here we have to destroy veip in
+* this case.
 */
-   if (env->ve_netns)
+   if (env->ve_netns || (env->features & VE_FEATURE_NFS))
veip_shutdown(env);
 
env->_venet_dev = NULL;



[Devel] [PATCH 2/2] KVM: MMU: always terminate page walks at level 1

2017-10-13 Thread Denis Plotnikov
From: Ladi Prosek 

is_last_gpte() is not equivalent to the pseudo-code given in commit
6bb69c9b69c31 ("KVM: MMU: simplify last_pte_bitmap") because an incorrect
value of last_nonleaf_level may override the result even if level == 1.

It is critical for is_last_gpte() to return true on level == 1 to
terminate page walks. Otherwise memory corruption may occur as level
is used as an index to various data structures throughout the page
walking code.  Even though the actual bug would be wherever the MMU is
initialized (as in the previous patch), be defensive and ensure here
that is_last_gpte() returns the correct value.

This patch is also enough to fix CVE-2017-12188.
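
To see why the ordering matters, here is a sketch of the arithmetic
(assuming PT_PAGE_TABLE_LEVEL == 1 and PT_PAGE_SIZE_MASK == bit 7, as in
these kernels), with level == 1 and the uninitialized
last_nonleaf_level == 0 that the previous patch fixes:

	unsigned level = 1, last_nonleaf_level = 0, gpte = 0;

	/* old order */
	gpte |= level - 1 /* PT_PAGE_TABLE_LEVEL */ - 1; /* 0xffffffff, bit 7 set    */
	gpte &= level - last_nonleaf_level;              /* & 1, bit 7 cleared again */
	/* gpte & PT_PAGE_SIZE_MASK == 0, so the walk does not stop at level 1 */

	/* new order: the AND runs first, so the trailing OR with 0xffffffff
	 * sets bit 7 whenever level == 1 and the walk always terminates.
	 */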

Fixes: 6bb69c9b69c315200ddc2bc79aee14c0184cf5b2
Cc: sta...@vger.kernel.org
Cc: Andy Honig 
Signed-off-by: Ladi Prosek 
[Panic if walk_addr_generic gets an incorrect level; this is a serious
 bug and it's not worth a WARN_ON where the recovery path might hide
 further exploitable issues; suggested by Andrew Honig. - Paolo]
Signed-off-by: Paolo Bonzini 

(cherry picked from commit 829ee279aed43faa5cb1e4d65c0cad52f2426c53)
fix #PSBM-74910
Signed-off-by: Denis Plotnikov 
---
 arch/x86/kvm/mmu.c | 14 +++---
 arch/x86/kvm/paging_tmpl.h |  3 ++-
 2 files changed, 9 insertions(+), 8 deletions(-)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 5b64390..d1f5589 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -3676,19 +3676,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte)
 {
/*
-* PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
-* iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
-* level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
-*/
-   gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
-
-   /*
 * The RHS has bit 7 set iff level < mmu->last_nonleaf_level.
 * If it is clear, there are no large pages at this level, so clear
 * PT_PAGE_SIZE_MASK in gpte if that is the case.
 */
gpte &= level - mmu->last_nonleaf_level;
 
+   /*
+* PT_PAGE_TABLE_LEVEL always terminates.  The RHS has bit 7 set
+* iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means
+* level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then.
+*/
+   gpte |= level - PT_PAGE_TABLE_LEVEL - 1;
+
return gpte & PT_PAGE_SIZE_MASK;
 }
 
diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h
index 1545467..bc6a43a 100644
--- a/arch/x86/kvm/paging_tmpl.h
+++ b/arch/x86/kvm/paging_tmpl.h
@@ -307,10 +307,11 @@ retry_walk:
--walker->level;
 
index = PT_INDEX(addr, walker->level);
-
table_gfn = gpte_to_gfn(pte);
offset= index * sizeof(pt_element_t);
pte_gpa   = gfn_to_gpa(table_gfn) + offset;
+
+   BUG_ON(walker->level < 1);
walker->table_gfn[walker->level - 1] = table_gfn;
walker->pte_gpa[walker->level - 1] = pte_gpa;
 
-- 
2.7.4



[Devel] [PATCH 1/2] KVM: nVMX: update last_nonleaf_level when initializing nested EPT

2017-10-13 Thread Denis Plotnikov
From: Ladi Prosek 

The function updates context->root_level but didn't call
update_last_nonleaf_level so the previous and potentially wrong value
was used for page walks.  For example, a zero value of last_nonleaf_level
would allow a potential out-of-bounds access in arch/x86/mmu/paging_tmpl.h's
walk_addr_generic function (CVE-2017-12188).
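
For context, the helper being added here (introduced by 6bb69c9b69c31)
derives the value from root_level, roughly:

	static void update_last_nonleaf_level(struct kvm_vcpu *vcpu,
					      struct kvm_mmu *mmu)
	{
		unsigned root_level = mmu->root_level;

		mmu->last_nonleaf_level = root_level;
		if (root_level == PT32_ROOT_LEVEL && is_pse(vcpu))
			mmu->last_nonleaf_level++;
	}

kvm_init_shadow_ept_mmu() changes context->root_level for nested EPT, so
skipping this call leaves whatever value the previous MMU mode computed
(possibly zero) in place.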

Fixes: 155a97a3d7c78b46cef6f1a973c831bc5a4f82bb
Signed-off-by: Ladi Prosek 
Signed-off-by: Paolo Bonzini 

(cherry picked from commit fd19d3b45164466a4adce7cbff448ba9189e1427)
fix #PSBM-74910
Signed-off-by: Denis Plotnikov 
---
 arch/x86/kvm/mmu.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index bb15151..5b64390 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -4118,6 +4118,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly)
context->direct_map = false;
 
update_permission_bitmask(vcpu, context, true);
+   update_last_nonleaf_level(vcpu, context);
reset_rsvds_bits_mask_ept(vcpu, context, execonly);
reset_ept_shadow_zero_bits_mask(vcpu, context, execonly);
 }
-- 
2.7.4



[Devel] [PATCH 0/2] fix CVE-2017-12188 KVM: MMU potential stack buffer overrun during page walks

2017-10-13 Thread Denis Plotnikov
https://jira.sw.ru/browse/PSBM-74910

Backport of two patches from mainstream.

Ladi Prosek (2):
  KVM: nVMX: update last_nonleaf_level when initializing nested EPT
  KVM: MMU: always terminate page walks at level 1

 arch/x86/kvm/mmu.c | 15 ---
 arch/x86/kvm/paging_tmpl.h |  3 ++-
 2 files changed, 10 insertions(+), 8 deletions(-)

-- 
2.7.4



[Devel] [PATCH RHEL7 COMMIT] ms/trylock_super(): replacement for grab_super_passive()

2017-10-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.13
-->
commit 22c1ffe395aa3d560ba33139aeb38a8840783766
Author: Konstantin Khlebnikov 
Date:   Fri Oct 13 15:40:02 2017 +0300

ms/trylock_super(): replacement for grab_super_passive()

We have a softlockup in the wb_do_writeback -> wb_writeback ->
__writeback_inodes_wb stack: we iterate over the works in
wb->bdi->work_list, and for each work we move wb->b_dirty inodes to
wb->b_io (in queue_io); later, in __writeback_inodes_wb(), we try to
write them, fail to trylock the inode->i_sb->s_umount rw_semaphore,
put each inode back to wb->b_dirty, and take sb_lock for each inode,
which can take a long time. We do approximately 100,000,000
iterations, which takes more than the softlockup threshold (20 sec).
The s_umount lock above is taken by sync() in a container, in
sync_filesystems_ve(), which sleeps for 50 sec.

Removing sb_lock from the equation should remove most of the contention
and all iterations should pass much faster.

https://jira.sw.ru/browse/PSBM-73370
ms commit: eb6ef3d ("trylock_super(): replacement for grab_super_passive()")

original commit message:

I've noticed significant locking contention in memory reclaimer around
sb_lock inside grab_super_passive(). Grab_super_passive() is called from
two places: in icache/dcache shrinkers (function super_cache_scan) and
from writeback (function __writeback_inodes_wb). Both are required for
progress in memory allocator.

Grab_super_passive() acquires sb_lock to increment sb->s_count and check
sb->s_instances. It seems sb->s_umount locked for read is enough here:
super-block deactivation always runs under sb->s_umount locked for write.
Protecting super-block itself isn't a problem: in super_cache_scan() sb
is protected by shrinker_rwsem: it cannot be freed if its slab shrinkers
are still active. Inside writeback super-block comes from inode from bdi
writeback list under wb->list_lock.

This patch removes locking sb_lock and checks s_instances under s_umount:
generic_shutdown_super() unlinks it under sb->s_umount locked for write.
New variant is called trylock_super() and since it only locks semaphore,
callers must call up_read(>s_umount) instead of drop_super(sb) when
they're done.

Signed-off-by: Konstantin Khlebnikov 
Signed-off-by: Al Viro 

Suggested-by: Andrey Ryabinin 
Signed-off-by: Pavel Tikhomirov 
---
 fs/fs-writeback.c |  6 +++---
 fs/internal.h |  2 +-
 fs/super.c| 42 +++---
 3 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 91f1d0a..7cea021 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -797,9 +797,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
if (time_is_before_jiffies(start_time + 15* HZ))
trace = 1;
 
-   if (!grab_super_passive(sb)) {
+   if (!trylock_super(sb)) {
/*
-* grab_super_passive() may fail consistently due to
+* trylock_super() may fail consistently due to
 * s_umount being grabbed by someone else. Don't use
 * requeue_io() to avoid busy retrying the inode/sb.
 */
@@ -810,7 +810,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
continue;
}
wrote += writeback_sb_inodes(sb, wb, work);
-   drop_super(sb);
+   up_read(&sb->s_umount);
 
/* refer to the same tests at the end of writeback_sb_inodes */
if (wrote) {
diff --git a/fs/internal.h b/fs/internal.h
index 289437c..ab2f706 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -86,7 +86,7 @@ extern struct file *get_empty_filp(void);
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
-extern bool grab_super_passive(struct super_block *sb);
+extern bool trylock_super(struct super_block *sb);
 extern struct dentry *mount_fs(struct file_system_type *,
   int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);
diff --git a/fs/super.c b/fs/super.c
index af7ae96..3b59975 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -94,8 +94,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
 
-   if (!grab_super_passive(sb))
-   return SHRINK_STOP;
+   if (!trylock_super(sb))
+   return SHRINK_STOP;

[Devel] [PATCH] ms: trylock_super(): replacement for grab_super_passive()

2017-10-13 Thread Pavel Tikhomirov
From: Konstantin Khlebnikov 

We have a softlockup in the wb_do_writeback -> wb_writeback ->
__writeback_inodes_wb stack: we iterate over the works in
wb->bdi->work_list, and for each work we move wb->b_dirty inodes to
wb->b_io (in queue_io); later, in __writeback_inodes_wb(), we try to
write them, fail to trylock the inode->i_sb->s_umount rw_semaphore,
put each inode back to wb->b_dirty, and take sb_lock for each inode,
which can take a long time. We do approximately 100,000,000
iterations, which takes more than the softlockup threshold (20 sec).
The s_umount lock above is taken by sync() in a container, in
sync_filesystems_ve(), which sleeps for 50 sec.

Removing sb_lock from the equation should remove most of the contention
and all iterations should pass much faster.

https://jira.sw.ru/browse/PSBM-73370

original commit message:

I've noticed significant locking contention in memory reclaimer around
sb_lock inside grab_super_passive(). Grab_super_passive() is called from
two places: in icache/dcache shrinkers (function super_cache_scan) and
from writeback (function __writeback_inodes_wb). Both are required for
progress in memory allocator.

Grab_super_passive() acquires sb_lock to increment sb->s_count and check
sb->s_instances. It seems sb->s_umount locked for read is enough here:
super-block deactivation always runs under sb->s_umount locked for write.
Protecting super-block itself isn't a problem: in super_cache_scan() sb
is protected by shrinker_rwsem: it cannot be freed if its slab shrinkers
are still active. Inside writeback super-block comes from inode from bdi
writeback list under wb->list_lock.

This patch removes locking sb_lock and checks s_instances under s_umount:
generic_shutdown_super() unlinks it under sb->s_umount locked for write.
New variant is called trylock_super() and since it only locks semaphore,
callers must call up_read(&sb->s_umount) instead of drop_super(sb) when
they're done.

Signed-off-by: Konstantin Khlebnikov 
Signed-off-by: Al Viro 

Suggested-by: Andrey Ryabinin 
Signed-off-by: Pavel Tikhomirov 
---
 fs/fs-writeback.c |  6 +++---
 fs/internal.h |  2 +-
 fs/super.c| 42 +++---
 3 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index 9da7e47f83f3..486259bcdafd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -797,9 +797,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
if (time_is_before_jiffies(start_time + 15* HZ))
trace = 1;
 
-   if (!grab_super_passive(sb)) {
+   if (!trylock_super(sb)) {
/*
-* grab_super_passive() may fail consistently due to
+* trylock_super() may fail consistently due to
 * s_umount being grabbed by someone else. Don't use
 * requeue_io() to avoid busy retrying the inode/sb.
 */
@@ -810,7 +810,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb,
continue;
}
wrote += writeback_sb_inodes(sb, wb, work);
-   drop_super(sb);
+   up_read(&sb->s_umount);
 
/* refer to the same tests at the end of writeback_sb_inodes */
if (wrote) {
diff --git a/fs/internal.h b/fs/internal.h
index 289437c7cf59..ab2f706031f6 100644
--- a/fs/internal.h
+++ b/fs/internal.h
@@ -86,7 +86,7 @@ extern struct file *get_empty_filp(void);
  * super.c
  */
 extern int do_remount_sb(struct super_block *, int, void *, int);
-extern bool grab_super_passive(struct super_block *sb);
+extern bool trylock_super(struct super_block *sb);
 extern struct dentry *mount_fs(struct file_system_type *,
   int, const char *, void *);
 extern struct super_block *user_get_super(dev_t);
diff --git a/fs/super.c b/fs/super.c
index af7ae96a38e8..3b59975c5265 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -94,8 +94,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
if (!(sc->gfp_mask & __GFP_FS))
return SHRINK_STOP;
 
-   if (!grab_super_passive(sb))
-   return SHRINK_STOP;
+   if (!trylock_super(sb))
+   return SHRINK_STOP;
 
if (sb->s_op && sb->s_op->nr_cached_objects)
fs_objects = sb->s_op->nr_cached_objects(sb, sc);
@@ -123,7 +123,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink,
freed += sb->s_op->free_cached_objects(sb, sc);
}
 
-   drop_super(sb);
+   up_read(&sb->s_umount);
return freed;
 }
 
@@ -139,7 +139,7 @@ static unsigned long super_cache_count(struct shrinker *shrink,
sb = container_of(shrink, struct super_block, s_shrink);
 
/*
-* Don't call