[Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-03 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

The problem is that per-net SUNRPC transport shutdown proceeds regardless of
any callback execution currently in flight. This is a race that leads to a
transport use-after-free in the callback handler.
This patch fixes it in a straightforward way: it protects callback
execution with the same mutex that is used for per-net data creation and
destruction.
Hopefully, it won't slow down the NFS client significantly.
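For reference, the shutdown side this serializes against looks roughly
like the sketch below (names follow fs/nfs/callback.c, but treat the
exact body as an illustration rather than the tree's verbatim contents):

/* Shutdown side (simplified): per-net transports are destroyed under
 * nfs_callback_mutex, so taking the same mutex around bc_svc_process()
 * in nfs41_callback_svc() makes destruction wait for a callback that
 * is already executing instead of freeing the transport under it. */
static void nfs_callback_down(int minorversion, struct net *net)
{
	struct nfs_callback_data *cb_info = &nfs_callback_info[minorversion];

	mutex_lock(&nfs_callback_mutex);
	nfs_callback_down_net(minorversion, cb_info->serv, net);
	cb_info->users--;
	if (cb_info->users == 0 && cb_info->task != NULL) {
		kthread_stop(cb_info->task);
		svc_exit_thread(cb_info->rqst);
		cb_info->serv = NULL;
		cb_info->rqst = NULL;
		cb_info->task = NULL;
	}
	mutex_unlock(&nfs_callback_mutex);
}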

https://jira.sw.ru/browse/PSBM-75751

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..82e8ed1 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
continue;
 
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+   mutex_lock(&nfs_callback_mutex);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}


[Devel] [PATCH rh7 2/2] ms/mm: memcontrol: use vmalloc fallback for large kmem memcg arrays

2017-11-03 Thread Andrey Ryabinin
From: Johannes Weiner 

commit f80c7dab95a1f0f968acbafe4426ee9525b6f6ab upstream.

For quick per-memcg indexing, slab caches and list_lru structures
maintain linear arrays of descriptors. As the number of concurrent
memory cgroups in the system goes up, this requires large contiguous
allocations (8k cgroups = order-5, 16k cgroups = order-6 etc.) for
every existing slab cache and list_lru, which can easily fail on
loaded systems. E.g.:

mkdir: page allocation failure: order:5, mode:0x14040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null)
CPU: 1 PID: 6399 Comm: mkdir Not tainted 4.13.0-mm1-00065-g720bbe532b7c-dirty #481
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-20170228_101828-anatol 04/01/2014
Call Trace:
 dump_stack+0x70/0x9d
 warn_alloc+0xd6/0x170
 ? __alloc_pages_direct_compact+0x4c/0x110
 __alloc_pages_nodemask+0xf50/0x1430
 ? __lock_acquire+0xd19/0x1360
 ? memcg_update_all_list_lrus+0x2e/0x2e0
 ? __mutex_lock+0x7c/0x950
 ? memcg_update_all_list_lrus+0x2e/0x2e0
 alloc_pages_current+0x60/0xc0
 kmalloc_order_trace+0x29/0x1b0
 __kmalloc+0x1f4/0x320
 memcg_update_all_list_lrus+0xca/0x2e0
 mem_cgroup_css_alloc+0x612/0x670
 cgroup_apply_control_enable+0x19e/0x360
 cgroup_mkdir+0x322/0x490
 kernfs_iop_mkdir+0x55/0x80
 vfs_mkdir+0xd0/0x120
 SyS_mkdirat+0x6c/0xe0
 SyS_mkdir+0x14/0x20
 entry_SYSCALL_64_fastpath+0x18/0xad
RIP: 0033:0x7f9ff36cee87
RSP: 002b:00007ffc7612d758 EFLAGS: 00000202 ORIG_RAX: 0000000000000053
RAX: ffffffffffffffda RBX: 00007ffc7612da48 RCX: 00007f9ff36cee87
RDX: 00000000000001ff RSI: 00000000000001ff RDI: 00007ffc7612de86
RBP: 0000000000000002 R08: 00000000000001ff R09: 0000000000401db0
R10: 00000000000001e2 R11: 0000000000000202 R12: 0000000000000000
R13: 00007ffc7612da40 R14: 0000000000000000 R15: 0000000000000000
Mem-Info:
active_anon:2965 inactive_anon:19 isolated_anon:0
 active_file:100270 inactive_file:98846 isolated_file:0
 unevictable:0 dirty:0 writeback:0 unstable:0
 slab_reclaimable:7328 slab_unreclaimable:16402
 mapped:771 shmem:52 pagetables:278 bounce:0
 free:13718 free_pcp:0 free_cma:0

This output is from an artificial reproducer, but we have repeatedly
observed order-7 failures in production in the Facebook fleet. These
systems become useless as they cannot run more jobs, even though there
is plenty of memory to allocate 128 individual pages.
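As a back-of-envelope check of the orders quoted above (a standalone
userspace sketch, assuming 4 KiB pages, 8-byte pointers and a small
constant standing in for the struct header):

#include <stdio.h>

int main(void)
{
	const unsigned long page_size = 4096;
	unsigned long cgroups;

	for (cgroups = 8192; cgroups <= 16384; cgroups *= 2) {
		/* one pointer per cgroup plus a header; kmalloc rounds
		 * the request up to the next power-of-two size */
		unsigned long bytes = cgroups * sizeof(void *) + 64;
		unsigned long rounded = page_size;
		unsigned int order = 0;

		while (rounded < bytes)
			rounded <<= 1;
		while ((page_size << order) < rounded)
			order++;
		/* prints order-5 for 8k cgroups, order-6 for 16k */
		printf("%lu cgroups -> %lu KiB -> order-%u\n",
		       cgroups, rounded / 1024, order);
	}
	return 0;
}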

Use kvmalloc and kvzalloc to fall back to vmalloc space if these
arrays prove too large to be allocated physically contiguously.

Link: http://lkml.kernel.org/r/20170918184919.20644-1-han...@cmpxchg.org
Signed-off-by: Johannes Weiner 
Reviewed-by: Josef Bacik 
Acked-by: Michal Hocko 
Acked-by: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-76752
Signed-off-by: Andrey Ryabinin 
---
 mm/list_lru.c    | 17 +++--
 mm/slab_common.c | 20 ++--
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5adc6621b338..91dccc1e30bf 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -322,13 +322,13 @@ static int memcg_init_list_lru_node(struct list_lru_node *nlru)
struct list_lru_memcg *memcg_lrus;
int size = memcg_nr_cache_ids;
 
-   memcg_lrus = kmalloc(sizeof(*memcg_lrus) +
+   memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
 size * sizeof(void *), GFP_KERNEL);
if (!memcg_lrus)
return -ENOMEM;
 
if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
-   kfree(memcg_lrus);
+   kvfree(memcg_lrus);
return -ENOMEM;
}
rcu_assign_pointer(nlru->memcg_lrus, memcg_lrus);
@@ -346,7 +346,12 @@ static void memcg_destroy_list_lru_node(struct list_lru_node *nlru)
 */
memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, true);
__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
-   kfree(memcg_lrus);
+   kvfree(memcg_lrus);
+}
+
+static void free_list_lru_memcg(struct rcu_head *head)
+{
+   kvfree(container_of(head, struct list_lru_memcg, rcu));
 }
 
 static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -359,12 +364,12 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
 
/* list_lrus_mutex is held, nobody can change memcg_lrus. Silence RCU */
old = rcu_dereference_check(nlru->memcg_lrus, true);
-   new = kmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
+   new = kvmalloc(sizeof(*new) + new_size * sizeof(void *), GFP_KERNEL);
if (!new)
return -ENOMEM;
 
if (__memcg_init_list_lru_node(new, old_size, new_size)) {
-   kfree(new);
+   kvfree(new);
return -ENOMEM;
}
 
@@ -381,7 +386,7 @@ static int memcg_update_list_lru_node(struct list_lru_node *nlru,
rcu_assign_pointer(nlru->memcg_lrus, new);
spin_unlock_irq(&nlru->lock);
 
-   kfree_rcu(old, rcu);
+   call_rcu(&old->rcu, free_list_lru_memcg);

[Devel] [PATCH rh7 1/2] ms/mm: introduce kv[mz]alloc helpers

2017-11-03 Thread Andrey Ryabinin
This is only a small part of the upstream commit
a7c3e901a46ff54c016d040847eda598a9e3e653. I backported only the
part that introduces the kv[mz]alloc helpers.

Description of the original patch:

commit a7c3e901a46ff54c016d040847eda598a9e3e653
Author: Michal Hocko 
Date:   Mon May 8 15:57:09 2017 -0700

mm: introduce kv[mz]alloc helpers

Patch series "kvmalloc", v5.

There are many open coded kmalloc with vmalloc fallback instances in the
tree.  Most of them are not careful enough or simply do not care about
the underlying semantics of the kmalloc/page allocator, which means that
a) some vmalloc fallbacks are basically unreachable because the kmalloc
part will keep retrying until it succeeds, and b) the page allocator can
invoke really disruptive steps like the OOM killer to move forward,
which doesn't sound appropriate when we consider that the vmalloc
fallback is available.

As can be seen, implementing kvmalloc requires quite intimate
knowledge of the page allocator and the memory reclaim internals, which
strongly suggests that a helper should be implemented in the memory
subsystem proper.

Most callers I could find have been converted to use the helper
instead.  This is patch 6.  There are some more relying on __GFP_REPEAT
in the networking stack which I have converted as well, and Eric Dumazet
was not opposed [2] to converting them.

[1] http://lkml.kernel.org/r/20170130094940.13546-1-mho...@kernel.org
[2] 
http://lkml.kernel.org/r/1485273626.16328.301.ca...@edumazet-glaptop3.roam.corp.google.com

This patch (of 9):

Using kmalloc with a vmalloc fallback for larger allocations is a
common pattern in the kernel code.  Yet we do not have any common helper
for that, and so users have invented their own helpers.  Some of them are
really creative when doing so.  Let's just add kv[mz]alloc and make sure
it is implemented properly.  This implementation makes sure not to cause
large memory pressure for > PAGE_SIZE requests (__GFP_NORETRY) and also
not to warn about allocation failures.  This also rules out the OOM
killer, as vmalloc is a more appropriate fallback than a disruptive
user visible action.
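The helper implements that policy roughly as follows (a sketch close to
the upstream mm/util.c code; see the original commit for the exact
version and its commentary):

void *kvmalloc_node(size_t size, gfp_t flags, int node)
{
	gfp_t kmalloc_flags = flags;
	void *ret;

	/* vmalloc internals allocate with GFP_KERNEL; warn about flags
	 * that are not a superset of it, to catch new abusers */
	WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL);

	/* try a physically contiguous block first, but with no failure
	 * warnings and no disruptive retries/OOM killer involvement,
	 * since the vmalloc fallback is available */
	if (size > PAGE_SIZE) {
		kmalloc_flags |= __GFP_NOWARN;
		if (!(kmalloc_flags & __GFP_REPEAT))
			kmalloc_flags |= __GFP_NORETRY;
	}

	ret = kmalloc_node(size, kmalloc_flags, node);
	if (ret || size <= PAGE_SIZE)
		return ret;

	return __vmalloc_node_flags(size, node, flags);
}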

This patch also changes some existing users and removes helpers which
are specific to them.  In some cases this is not possible (e.g.
ext4_kvmalloc, libcfs_kvzalloc) because those seem to be broken and
require GFP_NO{FS,IO} context, which is not vmalloc compatible in general
(note that the page table allocation is GFP_KERNEL).  Those need to be
fixed separately.

While we are at it, document in __vmalloc{_node} the unsupported gfp
mask, because there seems to be a lot of confusion out there.
kvmalloc_node will warn about flags incompatible with GFP_KERNEL (i.e.
not a superset of it) to catch new abusers.  Existing ones would have to
die slowly.
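A typical caller then reduces to the pattern below (a hypothetical
usage sketch; kvfree() transparently handles both kmalloc- and
vmalloc-backed pointers):

	/* table may be large enough that order-N kmalloc would fail */
	struct entry **table;

	table = kvzalloc(nr_entries * sizeof(*table), GFP_KERNEL);
	if (!table)
		return -ENOMEM;
	/* ... use table; note GFP_NOFS/GFP_NOIO callers must not use
	 * kvmalloc, as the vmalloc fallback is incompatible with them */
	kvfree(table);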

https://jira.sw.ru/browse/PSBM-76752
Signed-off-by: Andrey Ryabinin 
---
 include/linux/mm.h   | 14 +
 include/linux/vmalloc.h  |  1 +
 mm/nommu.c   |  5 +++
 mm/util.c| 45 ++
 mm/vmalloc.c |  2 +-
 security/apparmor/apparmorfs.c   |  2 +-
 security/apparmor/include/apparmor.h |  2 --
 security/apparmor/lib.c  | 61 
 security/apparmor/match.c|  2 +-
 9 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c806f43b5b59..897d7cfd2269 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -401,6 +401,20 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
+extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+   return kvmalloc_node(size, flags, NUMA_NO_NODE);
+}
+static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
+{
+   return kvmalloc_node(size, flags | __GFP_ZERO, node);
+}
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+   return kvmalloc(size, flags | __GFP_ZERO);
+}
+
 extern void kvfree(const void *addr);
 
 static inline void compound_lock(struct page *page)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6ea82cf30dc1..59c80dd655a3 100644
--- a/include/linux/vmalloc.h
+++ b/include/linux/vmalloc.h
@@ -81,6 +81,7 @@ extern void *__vmalloc_node_range(unsigned long size, unsigned long align,
unsigned long start, unsigned long end, gfp_t gfp_mask,
pgprot_t prot, unsigned long vm_flags, int node,
const void *caller);
+extern void *__vmalloc_node_flags(unsigned long size, int node, gfp_t flags);
 
 extern void vfree(const void *addr);
 
diff --git a/mm/nommu.c b/mm/nommu.c
index beecd953c29c..a16aee9188a8 100644
--- a/mm/nommu.c
+++ b/mm/nommu.c