[Devel] [PATCH rh7] netfilter: Allow xt_owner in any user namespace
From: "Eric W. Biederman"ML: 9847371a84b0be330f4bc4aaa98904101ee8573d https://jira.sw.ru/browse/PSBM-69409? Making this work is a little tricky as it really isn't kosher to change the xt_owner_match_info in a check function. Without changing xt_owner_match_info we need to know the user namespace the uids and gids are specified in. In the common case net->user_ns == current_user_ns(). Verify net->user_ns == current_user_ns() in owner_check so we can later assume it in owner_mt. In owner_check also verify that all of the uids and gids specified are in net->user_ns and that the expected min/max relationship exists between the uids and gids in xt_owner_match_info. In owner_mt get the network namespace from the outgoing socket, as this must be the same network namespace as the netfilter rules, and use that network namespace to find the user namespace the uids and gids in xt_match_owner_info are encoded in. Then convert from their encoded from into the kernel internal format for uids and gids and perform the owner match. Similar to ping_group_range, this code does not try to detect noncontiguous UID/GID ranges. Signed-off-by: "Eric W. Biederman" Signed-off-by: Kevin Cernekee Signed-off-by: Pablo Neira Ayuso Signed-off-by: Andrei Vagin --- net/netfilter/xt_owner.c | 41 +++-- 1 file changed, 35 insertions(+), 6 deletions(-) diff --git a/net/netfilter/xt_owner.c b/net/netfilter/xt_owner.c index 31dec4a..1744f78 100644 --- a/net/netfilter/xt_owner.c +++ b/net/netfilter/xt_owner.c @@ -80,11 +80,39 @@ owner_mt6_v0(const struct sk_buff *skb, struct xt_action_param *par) static int owner_check(const struct xt_mtchk_param *par) { struct xt_owner_match_info *info = par->matchinfo; + struct net *net = par->net; - /* For now only allow adding matches from the initial user namespace */ + /* Only allow the common case where the userns of the writer +* matches the userns of the network namespace. 
+*/ if ((info->match & (XT_OWNER_UID|XT_OWNER_GID)) && - !current_user_ns_initial()) + (current_user_ns() != net->user_ns)) return -EINVAL; + + /* Ensure the uids are valid */ + if (info->match & XT_OWNER_UID) { + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); + + if (!uid_valid(uid_min) || !uid_valid(uid_max) || + (info->uid_max < info->uid_min) || + uid_lt(uid_max, uid_min)) { + return -EINVAL; + } + } + + /* Ensure the gids are valid */ + if (info->match & XT_OWNER_GID) { + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); + + if (!gid_valid(gid_min) || !gid_valid(gid_max) || + (info->gid_max < info->gid_min) || + gid_lt(gid_max, gid_min)) { + return -EINVAL; + } + } + return 0; } @@ -93,6 +121,7 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) { const struct xt_owner_match_info *info = par->matchinfo; const struct file *filp; + struct net *net = dev_net(par->in ? 
par->in : par->out); if (skb->sk == NULL || skb->sk->sk_socket == NULL) return (info->match ^ info->invert) == 0; @@ -109,8 +138,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) (XT_OWNER_UID | XT_OWNER_GID)) == 0; if (info->match & XT_OWNER_UID) { - kuid_t uid_min = make_kuid(ve_init_user_ns(), info->uid_min); - kuid_t uid_max = make_kuid(ve_init_user_ns(), info->uid_max); + kuid_t uid_min = make_kuid(net->user_ns, info->uid_min); + kuid_t uid_max = make_kuid(net->user_ns, info->uid_max); if ((uid_gte(filp->f_cred->fsuid, uid_min) && uid_lte(filp->f_cred->fsuid, uid_max)) ^ !(info->invert & XT_OWNER_UID)) @@ -118,8 +147,8 @@ owner_mt(const struct sk_buff *skb, struct xt_action_param *par) } if (info->match & XT_OWNER_GID) { - kgid_t gid_min = make_kgid(ve_init_user_ns(), info->gid_min); - kgid_t gid_max = make_kgid(ve_init_user_ns(), info->gid_max); + kgid_t gid_min = make_kgid(net->user_ns, info->gid_min); + kgid_t gid_max = make_kgid(net->user_ns, info->gid_max); if ((gid_gte(filp->f_cred->fsgid, gid_min) && gid_lte(filp->f_cred->fsgid, gid_max)) ^ !(info->invert & XT_OWNER_GID)) -- 1.8.3.1 ___ Devel
[Devel] [PATCH] venet: destroy VE IP on venet destruction if NFS is enabled
We skip VE IP destruction in shutdown hook, if NFS is enabled in CT (to allow NFS mounts to disappear). Thus we have to destroy it with venet device. https://jira.sw.ru/browse/PSBM-75120 Signed-off-by: Stanislav Kinsburskiy --- drivers/net/venetdev.c |7 +-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/venetdev.c b/drivers/net/venetdev.c index 7a546cc..fad232b 100644 --- a/drivers/net/venetdev.c +++ b/drivers/net/venetdev.c @@ -759,9 +759,12 @@ static void venet_dellink(struct net_device *dev, struct list_head *head) struct ve_struct *env = dev->nd_net->owner_ve; /* We check ve_netns to avoid races with veip SHUTDOWN hook, called from -* ve_exit_ns() +* ve_exit_ns(). +* Also, in veip SHUTDOWN hook we skip veip destruction, if container +* has VE_FEATURE_NFS enabled. Thus here we have to destroy veip in +* this case. */ - if (env->ve_netns) + if (env->ve_netns || (env->features & VE_FEATURE_NFS)) veip_shutdown(env); env->_venet_dev = NULL; ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 2/2] KVM: MMU: always terminate page walks at level 1
From: Ladi Prosekis_last_gpte() is not equivalent to the pseudo-code given in commit 6bb69c9b69c31 ("KVM: MMU: simplify last_pte_bitmap") because an incorrect value of last_nonleaf_level may override the result even if level == 1. It is critical for is_last_gpte() to return true on level == 1 to terminate page walks. Otherwise memory corruption may occur as level is used as an index to various data structures throughout the page walking code. Even though the actual bug would be wherever the MMU is initialized (as in the previous patch), be defensive and ensure here that is_last_gpte() returns the correct value. This patch is also enough to fix CVE-2017-12188. Fixes: 6bb69c9b69c315200ddc2bc79aee14c0184cf5b2 Cc: sta...@vger.kernel.org Cc: Andy Honig Signed-off-by: Ladi Prosek [Panic if walk_addr_generic gets an incorrect level; this is a serious bug and it's not worth a WARN_ON where the recovery path might hide further exploitable issues; suggested by Andrew Honig. - Paolo] Signed-off-by: Paolo Bonzini (cherry picked from commit 829ee279aed43faa5cb1e4d65c0cad52f2426c53) fix #PSBM-74910 Signed-off-by: Denis Plotnikov --- arch/x86/kvm/mmu.c | 14 +++--- arch/x86/kvm/paging_tmpl.h | 3 ++- 2 files changed, 9 insertions(+), 8 deletions(-) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 5b64390..d1f5589 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -3676,19 +3676,19 @@ static inline bool is_last_gpte(struct kvm_mmu *mmu, unsigned level, unsigned gpte) { /* -* PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set -* iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means -* level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. -*/ - gpte |= level - PT_PAGE_TABLE_LEVEL - 1; - - /* * The RHS has bit 7 set iff level < mmu->last_nonleaf_level. * If it is clear, there are no large pages at this level, so clear * PT_PAGE_SIZE_MASK in gpte if that is the case. 
*/ gpte &= level - mmu->last_nonleaf_level; + /* +* PT_PAGE_TABLE_LEVEL always terminates. The RHS has bit 7 set +* iff level <= PT_PAGE_TABLE_LEVEL, which for our purpose means +* level == PT_PAGE_TABLE_LEVEL; set PT_PAGE_SIZE_MASK in gpte then. +*/ + gpte |= level - PT_PAGE_TABLE_LEVEL - 1; + return gpte & PT_PAGE_SIZE_MASK; } diff --git a/arch/x86/kvm/paging_tmpl.h b/arch/x86/kvm/paging_tmpl.h index 1545467..bc6a43a 100644 --- a/arch/x86/kvm/paging_tmpl.h +++ b/arch/x86/kvm/paging_tmpl.h @@ -307,10 +307,11 @@ retry_walk: --walker->level; index = PT_INDEX(addr, walker->level); - table_gfn = gpte_to_gfn(pte); offset= index * sizeof(pt_element_t); pte_gpa = gfn_to_gpa(table_gfn) + offset; + + BUG_ON(walker->level < 1); walker->table_gfn[walker->level - 1] = table_gfn; walker->pte_gpa[walker->level - 1] = pte_gpa; -- 2.7.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 1/2] KVM: nVMX: update last_nonleaf_level when initializing nested EPT
From: Ladi Prosek The function updates context->root_level but didn't call update_last_nonleaf_level so the previous and potentially wrong value was used for page walks. For example, a zero value of last_nonleaf_level would allow a potential out-of-bounds access in arch/x86/mmu/paging_tmpl.h's walk_addr_generic function (CVE-2017-12188). Fixes: 155a97a3d7c78b46cef6f1a973c831bc5a4f82bb Signed-off-by: Ladi Prosek Signed-off-by: Paolo Bonzini (cherry picked from commit fd19d3b45164466a4adce7cbff448ba9189e1427) fix #PSBM-74910 Signed-off-by: Denis Plotnikov --- arch/x86/kvm/mmu.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index bb15151..5b64390 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -4118,6 +4118,7 @@ void kvm_init_shadow_ept_mmu(struct kvm_vcpu *vcpu, bool execonly) context->direct_map = false; update_permission_bitmask(vcpu, context, true); + update_last_nonleaf_level(vcpu, context); reset_rsvds_bits_mask_ept(vcpu, context, execonly); reset_ept_shadow_zero_bits_mask(vcpu, context, execonly); } -- 2.7.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH 0/2] fix CVE-2017-12188 KVM: MMU potential stack buffer overrun during page walks
https://jira.sw.ru/browse/PSBM-74910 backport of two patches from the mainstream Ladi Prosek (2): KVM: nVMX: update last_nonleaf_level when initializing nested EPT KVM: MMU: always terminate page walks at level 1 arch/x86/kvm/mmu.c | 15 --- arch/x86/kvm/paging_tmpl.h | 3 ++- 2 files changed, 10 insertions(+), 8 deletions(-) -- 2.7.4 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH RHEL7 COMMIT] ms/trylock_super(): replacement for grab_super_passive()
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will appear at https://src.openvz.org/scm/ovz/vzkernel.git after rh7-3.10.0-693.1.1.vz7.37.13 --> commit 22c1ffe395aa3d560ba33139aeb38a8840783766 Author: Konstantin Khlebnikov Date: Fri Oct 13 15:40:02 2017 +0300 ms/trylock_super(): replacement for grab_super_passive() We have a softlockup in wb_do_writeback -> wb_writeback -> __writeback_inodes_wb stack, as we iterate over works in wb->bdi->work_list and for each work we move wb->b_dirty inodes to wb->b_io(in queue_io), and later in __writeback_inodes_wb() we try to write them, we trylock inode->i_sb->s_umount rw_semaphore and fail, we put each inode back to wb->b_dirty, and on each inode we take sb_lock - which can be long. We do approximately ~100.000.000 iterations, which takes more than softlockup threshold(20sec). The above s_umount lock is taken by sync() in container in sync_filesystems_ve() which sleeps for 50 sec. Removing sb_lock from the equation should remove most of contention and all iterations should pass much faster. https://jira.sw.ru/browse/PSBM-73370 ms commit: eb6ef3d ("trylock_super(): replacement for grab_super_passive()") original commit message: I've noticed significant locking contention in memory reclaimer around sb_lock inside grab_super_passive(). Grab_super_passive() is called from two places: in icache/dcache shrinkers (function super_cache_scan) and from writeback (function __writeback_inodes_wb). Both are required for progress in memory allocator. Grab_super_passive() acquires sb_lock to increment sb->s_count and check sb->s_instances. It seems sb->s_umount locked for read is enough here: super-block deactivation always runs under sb->s_umount locked for write. Protecting super-block itself isn't a problem: in super_cache_scan() sb is protected by shrinker_rwsem: it cannot be freed if its slab shrinkers are still active. Inside writeback super-block comes from inode from bdi writeback list under wb->list_lock. 
This patch removes locking sb_lock and checks s_instances under s_umount: generic_shutdown_super() unlinks it under sb->s_umount locked for write. New variant is called trylock_super() and since it only locks semaphore, callers must call up_read(>s_umount) instead of drop_super(sb) when they're done. Signed-off-by: Konstantin Khlebnikov Signed-off-by: Al Viro Suggested-by: Andrey Ryabinin Signed-off-by: Pavel Tikhomirov --- fs/fs-writeback.c | 6 +++--- fs/internal.h | 2 +- fs/super.c| 42 +++--- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 91f1d0a..7cea021 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -797,9 +797,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, if (time_is_before_jiffies(start_time + 15* HZ)) trace = 1; - if (!grab_super_passive(sb)) { + if (!trylock_super(sb)) { /* -* grab_super_passive() may fail consistently due to +* trylock_super() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. 
*/ @@ -810,7 +810,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, continue; } wrote += writeback_sb_inodes(sb, wb, work); - drop_super(sb); + up_read(>s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { diff --git a/fs/internal.h b/fs/internal.h index 289437c..ab2f706 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -86,7 +86,7 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); -extern bool grab_super_passive(struct super_block *sb); +extern bool trylock_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, int, const char *, void *); extern struct super_block *user_get_super(dev_t); diff --git a/fs/super.c b/fs/super.c index af7ae96..3b59975 100644 --- a/fs/super.c +++ b/fs/super.c @@ -94,8 +94,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - if (!grab_super_passive(sb)) -return SHRINK_STOP; + if (!trylock_super(sb)) + return
[Devel] [PATCH] ms: trylock_super(): replacement for grab_super_passive()
From: Konstantin Khlebnikov We have a softlockup in wb_do_writeback -> wb_writeback -> __writeback_inodes_wb stack, as we iterate over works in wb->bdi->work_list and for each work we move wb->b_dirty inodes to wb->b_io(in queue_io), and later in __writeback_inodes_wb() we try to write them, we trylock inode->i_sb->s_umount rw_semaphore and fail, we put each inode back to wb->b_dirty, and on each inode we take sb_lock - which can be long. We do approximately ~100.000.000 iterations, which takes more than softlockup threshold(20sec). The above s_umount lock is taken by sync() in container in sync_filesystems_ve() which sleeps for 50 sec. Removing sb_lock from the equation should remove most of contention and all iterations should pass much faster. https://jira.sw.ru/browse/PSBM-73370 original commit message: I've noticed significant locking contention in memory reclaimer around sb_lock inside grab_super_passive(). Grab_super_passive() is called from two places: in icache/dcache shrinkers (function super_cache_scan) and from writeback (function __writeback_inodes_wb). Both are required for progress in memory allocator. Grab_super_passive() acquires sb_lock to increment sb->s_count and check sb->s_instances. It seems sb->s_umount locked for read is enough here: super-block deactivation always runs under sb->s_umount locked for write. Protecting super-block itself isn't a problem: in super_cache_scan() sb is protected by shrinker_rwsem: it cannot be freed if its slab shrinkers are still active. Inside writeback super-block comes from inode from bdi writeback list under wb->list_lock. This patch removes locking sb_lock and checks s_instances under s_umount: generic_shutdown_super() unlinks it under sb->s_umount locked for write. New variant is called trylock_super() and since it only locks semaphore, callers must call up_read(&sb->s_umount) instead of drop_super(sb) when they're done. 
Signed-off-by: Konstantin Khlebnikov Signed-off-by: Al Viro Suggested-by: Andrey Ryabinin Signed-off-by: Pavel Tikhomirov --- fs/fs-writeback.c | 6 +++--- fs/internal.h | 2 +- fs/super.c| 42 +++--- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 9da7e47f83f3..486259bcdafd 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -797,9 +797,9 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, if (time_is_before_jiffies(start_time + 15* HZ)) trace = 1; - if (!grab_super_passive(sb)) { + if (!trylock_super(sb)) { /* -* grab_super_passive() may fail consistently due to +* trylock_super() may fail consistently due to * s_umount being grabbed by someone else. Don't use * requeue_io() to avoid busy retrying the inode/sb. */ @@ -810,7 +810,7 @@ static long __writeback_inodes_wb(struct bdi_writeback *wb, continue; } wrote += writeback_sb_inodes(sb, wb, work); - drop_super(sb); + up_read(>s_umount); /* refer to the same tests at the end of writeback_sb_inodes */ if (wrote) { diff --git a/fs/internal.h b/fs/internal.h index 289437c7cf59..ab2f706031f6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -86,7 +86,7 @@ extern struct file *get_empty_filp(void); * super.c */ extern int do_remount_sb(struct super_block *, int, void *, int); -extern bool grab_super_passive(struct super_block *sb); +extern bool trylock_super(struct super_block *sb); extern struct dentry *mount_fs(struct file_system_type *, int, const char *, void *); extern struct super_block *user_get_super(dev_t); diff --git a/fs/super.c b/fs/super.c index af7ae96a38e8..3b59975c5265 100644 --- a/fs/super.c +++ b/fs/super.c @@ -94,8 +94,8 @@ static unsigned long super_cache_scan(struct shrinker *shrink, if (!(sc->gfp_mask & __GFP_FS)) return SHRINK_STOP; - if (!grab_super_passive(sb)) -return SHRINK_STOP; + if (!trylock_super(sb)) + return SHRINK_STOP; if (sb->s_op && sb->s_op->nr_cached_objects) fs_objects = sb->s_op->nr_cached_objects(sb, sc); 
@@ -123,7 +123,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, freed += sb->s_op->free_cached_objects(sb, sc); } - drop_super(sb); + up_read(>s_umount); return freed; } @@ -139,7 +139,7 @@ static unsigned long super_cache_count(struct shrinker *shrink, sb = container_of(shrink, struct super_block, s_shrink); /* -* Don't call