[Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown
From: Stanislav Kinsburskiy The problem is that per-net SUNRPC transports shutdown is done regardless of the current callback execution. This is a race leading to transport use-after-free in the callback handler. This patch fixes it in a straightforward way. I.e. it protects callback execution with the same mutex used for per-net data creation and destruction. Hopefully, it won't slow down the NFS client significantly. https://jira.sw.ru/browse/PSBM-75751 Signed-off-by: Stanislav Kinsburskiy --- fs/nfs/callback.c |3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c index 0beb275..82e8ed1 100644 --- a/fs/nfs/callback.c +++ b/fs/nfs/callback.c @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp) continue; prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE); + mutex_lock(&nfs_callback_mutex); spin_lock_bh(&serv->sv_cb_lock); if (!list_empty(&serv->sv_cb_list)) { req = list_first_entry(&serv->sv_cb_list, @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp) error = bc_svc_process(serv, req, rqstp); dprintk("bc_svc_process() returned w/ error code= %d\n", error); + mutex_unlock(&nfs_callback_mutex); } else { spin_unlock_bh(&serv->sv_cb_lock); + mutex_unlock(&nfs_callback_mutex); schedule(); finish_wait(&serv->sv_cb_waitq, &wq); } ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 2/2] ms/mm: memcontrol: use vmalloc fallback for large kmem memcg arrays
From: Johannes Weiner commit f80c7dab95a1f0f968acbafe4426ee9525b6f6ab upstream. For quick per-memcg indexing, slab caches and list_lru structures maintain linear arrays of descriptors. As the number of concurrent memory cgroups in the system goes up, this requires large contiguous allocations (8k cgroups = order-5, 16k cgroups = order-6 etc.) for every existing slab cache and list_lru, which can easily fail on loaded systems. E.g.: mkdir: page allocation failure: order:5, mode:0x14040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null) CPU: 1 PID: 6399 Comm: mkdir Not tainted 4.13.0-mm1-00065-g720bbe532b7c-dirty #481 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014 Call Trace: dump_stack+0x70/0x9d warn_alloc+0xd6/0x170 ? __alloc_pages_direct_compact+0x4c/0x110 __alloc_pages_nodemask+0xf50/0x1430 ? __lock_acquire+0xd19/0x1360 ? memcg_update_all_list_lrus+0x2e/0x2e0 ? __mutex_lock+0x7c/0x950 ? memcg_update_all_list_lrus+0x2e/0x2e0 alloc_pages_current+0x60/0xc0 kmalloc_order_trace+0x29/0x1b0 __kmalloc+0x1f4/0x320 memcg_update_all_list_lrus+0xca/0x2e0 mem_cgroup_css_alloc+0x612/0x670 cgroup_apply_control_enable+0x19e/0x360 cgroup_mkdir+0x322/0x490 kernfs_iop_mkdir+0x55/0x80 vfs_mkdir+0xd0/0x120 SyS_mkdirat+0x6c/0xe0 SyS_mkdir+0x14/0x20 entry_SYSCALL_64_fastpath+0x18/0xad RIP: 0033:0x7f9ff36cee87 RSP: 002b:7ffc7612d758 EFLAGS: 0202 ORIG_RAX: 0053 RAX: ffda RBX: 7ffc7612da48 RCX: 7f9ff36cee87 RDX: 01ff RSI: 01ff RDI: 7ffc7612de86 RBP: 0002 R08: 01ff R09: 00401db0 R10: 01e2 R11: 0202 R12: R13: 7ffc7612da40 R14: R15: Mem-Info: active_anon:2965 inactive_anon:19 isolated_anon:0 active_file:100270 inactive_file:98846 isolated_file:0 unevictable:0 dirty:0 writeback:0 unstable:0 slab_reclaimable:7328 slab_unreclaimable:16402 mapped:771 shmem:52 pagetables:278 bounce:0 free:13718 free_pcp:0 free_cma:0 This output is from an artificial reproducer, but we have repeatedly observed order-7 failures in production in the Facebook fleet. 
These systems become useless as they cannot run more jobs, even though there is plenty of memory to allocate 128 individual pages. Use kvmalloc and kvzalloc to fall back to vmalloc space if these arrays prove too large for allocating them physically contiguous. Link: http://lkml.kernel.org/r/20170918184919.20644-1-han...@cmpxchg.org Signed-off-by: Johannes Weiner Reviewed-by: Josef Bacik Acked-by: Michal Hocko Acked-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-76752 Signed-off-by: Andrey Ryabinin --- mm/list_lru.c| 17 +++-- mm/slab_common.c | 20 ++-- 2 files changed, 25 insertions(+), 12 deletions(-) diff --git a/mm/list_lru.c b/mm/list_lru.c index 5adc6621b338..91dccc1e30bf 100644 --- a/mm/list_lru.c +++ b/mm/list_lru.c @@ -322,13 +322,13 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru) struct list_lru_memcg *memcg_lrus; int size = memcg_nr_cache_ids; - memcg_lrus = kmalloc(sizeof(*memcg_lrus) + + memcg_lrus = kvmalloc(sizeof(*memcg_lrus) + size * sizeof(void *), GFP_KERNEL); if (!memcg_lrus) return -ENOMEM; if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) { - kfree(memcg_lrus); + kvfree(memcg_lrus); return -ENOMEM; } rcu_assign_pointer(nlru->memcg_lrus, memcg_lrus); @@ -346,7 +346,12 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru) */ memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, true); __memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids); - kfree(memcg_lrus); + kvfree(memcg_lrus); +} + +static void free_list_lru_memcg(struct rcu_head *head) +{ + kvfree(container_of(head, struct list_lru_memcg, rcu)); } static int memcg_update_list_lru_node(struct list_lru_node *nlru, @@ -359,12 +364,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, /* list_lrus_mutex is held, nobody can change memcg_lrus. 
Silence RCU */ old = rcu_dereference_check(nlru->memcg_lrus, true); - new = kmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); + new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL); if (!new) return -ENOMEM; if (__memcg_init_list_lru_node(new, old_size, new_size)) { - kfree(new); + kvfree(new); return -ENOMEM; } @@ -381,7 +386,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru, rcu_assign_pointer(nlru->memcg_lrus, new); spin_unlock_irq(&nlru->lock); - kfree_rcu(old, rcu); +
[Devel] [PATCH rh7 1/2] ms/mm: introduce kv[mz]alloc helpers
This is only a small part of the upstream commit a7c3e901a46ff54c016d040847eda598a9e3e653. I backported only the part that introduces the kv[mz]alloc helpers. Description of the original patch: commit a7c3e901a46ff54c016d040847eda598a9e3e653 Author: Michal Hocko Date: Mon May 8 15:57:09 2017 -0700 mm: introduce kv[mz]alloc helpers Patch series "kvmalloc", v5. There are many open coded kmalloc with vmalloc fallback instances in the tree. Most of them are not careful enough or simply do not care about the underlying semantic of the kmalloc/page allocator which means that a) some vmalloc fallbacks are basically unreachable because the kmalloc part will keep retrying until it succeeds b) the page allocator can invoke a really disruptive steps like the OOM killer to move forward which doesn't sound appropriate when we consider that the vmalloc fallback is available. As it can be seen implementing kvmalloc requires quite an intimate knowledge if the page allocator and the memory reclaim internals which strongly suggests that a helper should be implemented in the memory subsystem proper. Most callers, I could find, have been converted to use the helper instead. This is patch 6. There are some more relying on __GFP_REPEAT in the networking stack which I have converted as well and Eric Dumazet was not opposed [2] to convert them as well. [1] http://lkml.kernel.org/r/20170130094940.13546-1-mho...@kernel.org [2] http://lkml.kernel.org/r/1485273626.16328.301.ca...@edumazet-glaptop3.roam.corp.google.com This patch (of 9): Using kmalloc with the vmalloc fallback for larger allocations is a common pattern in the kernel code. Yet we do not have any common helper for that and so users have invented their own helpers. Some of them are really creative when doing so. Let's just add kv[mz]alloc and make sure it is implemented properly. This implementation makes sure to not make a large memory pressure for > PAGE_SZE requests (__GFP_NORETRY) and also to not warn about allocation failures. 
This also rules out the OOM killer as the vmalloc is a more approapriate fallback than a disruptive user visible action. This patch also changes some existing users and removes helpers which are specific for them. In some cases this is not possible (e.g. ext4_kvmalloc, libcfs_kvzalloc) because those seems to be broken and require GFP_NO{FS,IO} context which is not vmalloc compatible in general (note that the page table allocation is GFP_KERNEL). Those need to be fixed separately. While we are at it, document that __vmalloc{_node} about unsupported gfp mask because there seems to be a lot of confusion out there. kvmalloc_node will warn about GFP_KERNEL incompatible (which are not superset) flags to catch new abusers. Existing ones would have to die slowly. https://jira.sw.ru/browse/PSBM-76752 Signed-off-by: Andrey Ryabinin --- include/linux/mm.h | 14 + include/linux/vmalloc.h | 1 + mm/nommu.c | 5 +++ mm/util.c| 45 ++ mm/vmalloc.c | 2 +- security/apparmor/apparmorfs.c | 2 +- security/apparmor/include/apparmor.h | 2 -- security/apparmor/lib.c | 61 security/apparmor/match.c| 2 +- 9 files changed, 68 insertions(+), 66 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index c806f43b5b59..897d7cfd2269 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -401,6 +401,20 @@ static inline int is_vmalloc_or_module_addr(const void *x) } #endif +extern void *kvmalloc_node(size_t size, gfp_t flags, int node); +static inline void *kvmalloc(size_t size, gfp_t flags) +{ + return kvmalloc_node(size, flags, NUMA_NO_NODE); +} +static inline void *kvzalloc_node(size_t size, gfp_t flags, int node) +{ + return kvmalloc_node(size, flags | __GFP_ZERO, node); +} +static inline void *kvzalloc(size_t size, gfp_t flags) +{ + return kvmalloc(size, flags | __GFP_ZERO); +} + extern void kvfree(const void *addr); static inline void compound_lock(struct page *page) diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h index 6ea82cf30dc1..59c80dd655a3 100644 --- 
a/include/linux/vmalloc.h +++ b/include/linux/vmalloc.h @@ -81,6 +81,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align, unsigned long start, unsigned long end, gfp_t gfp_mask, pgprot_t prot, unsigned long vm_flags, int node, const void *caller); +extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags); extern void vfree(const void *addr); diff --git a/mm/nommu.c b/mm/nommu.c index beecd953c29c..a16aee9188a8 100644 --- a/mm/nommu.c +++ b/mm/n