[Devel] [PATCH RHEL7 COMMIT] ve/cgroup: make cgroup_get_ve_root visible in kernel/sched/core.c

2017-11-07 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.21
-->
commit 5fe025036ced8c46edc1316856df5617f3184a59
Author: Pavel Tikhomirov 
Date:   Tue Nov 7 12:06:20 2017 +0300

ve/cgroup: make cgroup_get_ve_root visible in kernel/sched/core.c

Need it to find out root ve cgroup for nr_cpus and cpu_rate.

Signed-off-by: Pavel Tikhomirov 
Reviewed-by: Andrey Ryabinin 
---
 include/linux/ve.h | 7 +++
 kernel/cgroup.c| 7 +--
 2 files changed, 8 insertions(+), 6 deletions(-)

diff --git a/include/linux/ve.h b/include/linux/ve.h
index 7999746..486fa24 100644
--- a/include/linux/ve.h
+++ b/include/linux/ve.h
@@ -226,6 +226,8 @@ extern void vtty_release(struct tty_struct *tty, struct 
tty_struct *o_tty,
 extern bool vtty_is_master(struct tty_struct *tty);
 #endif /* CONFIG_TTY */
 
+extern struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp);
+
 #else  /* CONFIG_VE */
 
 #define ve_uevent_seqnum uevent_seqnum
@@ -259,6 +261,11 @@ static inline void monotonic_abs_to_ve(clockid_t 
which_clock,
struct timespec *tp) { }
 static inline void monotonic_ve_to_abs(clockid_t which_clock,
struct timepsec *tp) { }
+
+static inline struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
+{
+   return NULL;
+}
 #endif /* CONFIG_VE */
 
 struct seq_file;
diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index 06863de..51ce4ab 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4280,7 +4280,7 @@ void cgroup_mark_ve_root(struct ve_struct *ve)
mutex_unlock(&cgroup_mutex);
 }
 
-static struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
+struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
 {
struct cgroup *ve_root = NULL;
 
@@ -4294,11 +4294,6 @@ static struct cgroup *cgroup_get_ve_root(struct cgroup 
*cgrp)
 
return ve_root;
 }
-#else
-static inline struct cgroup *cgroup_get_ve_root(struct cgroup *cgrp)
-{
-   return NULL;
-}
 #endif
 
 /*
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ve/sched: take nr_cpus and cpu_rate from ve root task group

2017-11-07 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.21
-->
commit e661261a0f8af475ae0dd7980bd73555ff7724a1
Author: Pavel Tikhomirov 
Date:   Tue Nov 7 12:06:21 2017 +0300

ve/sched: take nr_cpus and cpu_rate from ve root task group

Patchset description:

ve: properly handle nr_cpus and cpu_rate for nested cgroups

https://jira.sw.ru/browse/PSBM-69678

Pavel Tikhomirov (3):
  cgroup: remove rcu_read_lock from cgroup_get_ve_root
  cgroup: make cgroup_get_ve_root visible in kernel/sched/core.c
  sched: take nr_cpus and cpu_rate from ve root task group

=
This patch description:

Cpu view in container should depend only on the root cpu cgroup
nr_cpus/rate configuration. So replace tg->xxx references by
tg_xxx(tg) helpers to get xxx from the root ve cgroup. We still
allow setting/reading rate and nr_cpus directly in nested cgroups,
but they are just converted to the corresponding cfs_period and
cfs_quota setup, and do _not_ influence the in-container view
of cpus and their stats.

Also remove excessive rcu_read_lock/unlock as we have no rcu
dereference in between, looks like some leftover for task_group()
which differs in VZ6 and VZ7.

https://jira.sw.ru/browse/PSBM-69678

Signed-off-by: Pavel Tikhomirov 
Reviewed-by: Andrey Ryabinin 
---
 include/linux/sched.h |  2 ++
 kernel/sched/core.c   | 56 +--
 kernel/sched/fair.c   |  9 +
 3 files changed, 52 insertions(+), 15 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 84fe6cd..03c06ff6 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -3182,6 +3182,8 @@ static inline void set_task_cpu(struct task_struct *p, 
unsigned int cpu)
 
 #endif /* CONFIG_SMP */
 
+extern unsigned int tg_cpu_rate(struct task_group *tg);
+extern unsigned int tg_nr_cpus(struct task_group *tg);
 #ifdef CONFIG_CFS_CPULIMIT
 extern unsigned int task_nr_cpus(struct task_struct *p);
 extern unsigned int task_vcpu_id(struct task_struct *p);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7a40fa8..5b3daa1 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -340,15 +340,40 @@ __read_mostly int scheduler_running;
  */
 int sysctl_sched_rt_runtime = 95;
 
+static inline struct task_group *cgroup_tg(struct cgroup *cgrp);
+
+static struct task_group *ve_root_tg(struct task_group *tg) {
+   struct cgroup *cg;
+
+   if (!tg)
+   return NULL;
+
+   cg = cgroup_get_ve_root(tg->css.cgroup);
+   WARN_ONCE(!cg, "Failed to find ve root cgroup, possible container 
configuration problem.\n");
+   return cg ? cgroup_tg(cg) : NULL;
+}
+
+unsigned int tg_cpu_rate(struct task_group *tg)
+{
+   unsigned int cpu_rate = 0;
 #ifdef CONFIG_CFS_CPULIMIT
-unsigned int task_nr_cpus(struct task_struct *p)
+   tg = ve_root_tg(tg);
+   if (tg)
+   cpu_rate = tg->cpu_rate;
+#endif
+   return cpu_rate;
+}
+
+unsigned int tg_nr_cpus(struct task_group *tg)
 {
unsigned int nr_cpus = 0;
unsigned int max_nr_cpus = num_online_cpus();
 
-   rcu_read_lock();
-   nr_cpus = task_group(p)->nr_cpus;
-   rcu_read_unlock();
+#ifdef CONFIG_CFS_CPULIMIT
+   tg = ve_root_tg(tg);
+   if (tg)
+   nr_cpus = tg->nr_cpus;
+#endif
 
if (!nr_cpus || nr_cpus > max_nr_cpus)
nr_cpus = max_nr_cpus;
@@ -356,6 +381,17 @@ unsigned int task_nr_cpus(struct task_struct *p)
return nr_cpus;
 }
 
+#ifdef CONFIG_CFS_CPULIMIT
+unsigned int task_nr_cpus(struct task_struct *p)
+{
+   return tg_nr_cpus(task_group(p));
+}
+
+static unsigned int task_cpu_rate(struct task_struct *p)
+{
+   return tg_cpu_rate(task_group(p));
+}
+
 unsigned int task_vcpu_id(struct task_struct *p)
 {
return task_cpu(p) % task_nr_cpus(p);
@@ -370,9 +406,7 @@ unsigned int sched_cpulimit_scale_cpufreq(unsigned int freq)
if (!sysctl_sched_cpulimit_scale_cpufreq)
return freq;
 
-   rcu_read_lock();
-   rate = task_group(current)->cpu_rate;
-   rcu_read_unlock();
+   rate = task_cpu_rate(current);
 
max_rate = num_online_vcpus() * MAX_CPU_RATE;
if (!rate || rate >= max_rate)
@@ -9919,8 +9953,8 @@ static void cpu_cgroup_update_vcpustat(struct cgroup 
*cgrp)
spin_lock(&tg->vcpustat_lock);
 
now = ktime_get();
-   nr_vcpus = tg->nr_cpus ?: num_online_cpus();
-   vcpu_rate = DIV_ROUND_UP(tg->cpu_rate, nr_vcpus);
+   nr_vcpus = tg_nr_cpus(tg);
+   vcpu_rate = DIV_ROUND_UP(tg_cpu_rate(tg), nr_vcpus);
if (!vcpu_rate || vcpu_rate > MAX_CPU_RATE)
vcpu_rate = MAX_CPU_RATE;
 
@@ -10005,7 +10039,7 @@ int cpu_cgroup_proc_stat(struct cgroup *cgrp

[Devel] [PATCH RHEL7 COMMIT] ve/cgroup: remove rcu_read_lock from cgroup_get_ve_root

2017-11-07 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.21
-->
commit f8168436b4eb925096fe7f14bfc18b0789aa6b3c
Author: Pavel Tikhomirov 
Date:   Tue Nov 7 12:06:19 2017 +0300

ve/cgroup: remove rcu_read_lock from cgroup_get_ve_root

It is likely a leftover from __cgroup_path, where it protects cgrp->name.
In cgroup_get_ve_root there is nothing rcu_dereferenced; also in
cgroup_is_descendant there is no rcu for cgrp->parent.

https://jira.sw.ru/browse/PSBM-69678

Signed-off-by: Pavel Tikhomirov 
Reviewed-by: Andrey Ryabinin 
---
 kernel/cgroup.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/cgroup.c b/kernel/cgroup.c
index f960c34..06863de 100644
--- a/kernel/cgroup.c
+++ b/kernel/cgroup.c
@@ -4284,7 +4284,6 @@ static struct cgroup *cgroup_get_ve_root(struct cgroup 
*cgrp)
 {
struct cgroup *ve_root = NULL;
 
-   rcu_read_lock();
do {
if (test_bit(CGRP_VE_ROOT, &cgrp->flags)) {
ve_root = cgrp;
@@ -4292,7 +4291,6 @@ static struct cgroup *cgroup_get_ve_root(struct cgroup 
*cgrp)
}
cgrp = cgrp->parent;
} while (cgrp);
-   rcu_read_unlock();
 
return ve_root;
 }
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Well... Maybe.
Let's check, how it works with our kernel.

07.11.2017 10:16, Konstantin Khorenko пишет:
> Going to send it to mainstream as well?
> 
> -- 
> Best regards,
> 
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
> 
> On 11/03/2017 07:47 PM, Stanislav Kinsburskiy wrote:
>> From: Stanislav Kinsburskiy 
>>
>> The problem is that per-net SUNRPC transports shutdown is done regardless
>> current callback execution. This is a race leading to transport 
>> use-after-free
>> in callback handler.
>> This patch fixes it in stright-forward way. I.e. it protects callback
>> execution with the same mutex used for per-net data creation and destruction.
>> Hopefully, it won't slow down NFS client significantly.
>>
>> https://jira.sw.ru/browse/PSBM-75751
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/nfs/callback.c |    3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
>> index 0beb275..82e8ed1 100644
>> --- a/fs/nfs/callback.c
>> +++ b/fs/nfs/callback.c
>> @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
>>  continue;
>>
>>  prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
>> +    mutex_lock(&nfs_callback_mutex);
>>  spin_lock_bh(&serv->sv_cb_lock);
>>  if (!list_empty(&serv->sv_cb_list)) {
>>  req = list_first_entry(&serv->sv_cb_list,
>> @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
>>  error = bc_svc_process(serv, req, rqstp);
>>  dprintk("bc_svc_process() returned w/ error code= %d\n",
>>  error);
>> +    mutex_unlock(&nfs_callback_mutex);
>>  } else {
>>  spin_unlock_bh(&serv->sv_cb_lock);
>> +    mutex_unlock(&nfs_callback_mutex);
>>  schedule();
>>  finish_wait(&serv->sv_cb_waitq, &wq);
>>  }
>>
>> ___
>> Devel mailing list
>> Devel@openvz.org
>> https://lists.openvz.org/mailman/listinfo/devel
>> .
>>
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.21
-->
commit 163d706e88397ff953321004ef6cc34da93b509d
Author: Stanislav Kinsburskiy 
Date:   Tue Nov 7 12:16:46 2017 +0300

nfs: protect callback execution against per-net callback thread shutdown

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport 
use-after-free
in callback handler.
This patch fixes it in a straightforward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and 
destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..82e8ed1 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
continue;
 
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+   mutex_lock(&nfs_callback_mutex);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Konstantin Khorenko

Going to send it to mainstream as well?

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 11/03/2017 07:47 PM, Stanislav Kinsburskiy wrote:

From: Stanislav Kinsburskiy 

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in a straightforward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |3 +++
 1 file changed, 3 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..82e8ed1 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
continue;

prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+   mutex_lock(&nfs_callback_mutex);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel
.


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Kirill Tkhai
On 03.11.2017 19:47, Stanislav Kinsburskiy wrote:
> From: Stanislav Kinsburskiy 
> 
> The problem is that per-net SUNRPC transports shutdown is done regardless
> current callback execution. This is a race leading to transport use-after-free
> in callback handler.

Could you please draw the race to show the interaction between functions?

> This patch fixes it in stright-forward way. I.e. it protects callback
> execution with the same mutex used for per-net data creation and destruction.
> Hopefully, it won't slow down NFS client significantly.
> 
> https://jira.sw.ru/browse/PSBM-75751
> 
> Signed-off-by: Stanislav Kinsburskiy 
> ---
>  fs/nfs/callback.c |3 +++
>  1 file changed, 3 insertions(+)
> 
> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
> index 0beb275..82e8ed1 100644
> --- a/fs/nfs/callback.c
> +++ b/fs/nfs/callback.c
> @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
>   continue;
>  
>   prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
> + mutex_lock(&nfs_callback_mutex);
>   spin_lock_bh(&serv->sv_cb_lock);
>   if (!list_empty(&serv->sv_cb_list)) {
>   req = list_first_entry(&serv->sv_cb_list,
> @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
>   error = bc_svc_process(serv, req, rqstp);
>   dprintk("bc_svc_process() returned w/ error code= %d\n",
>   error);
> + mutex_unlock(&nfs_callback_mutex);
>   } else {
>   spin_unlock_bh(&serv->sv_cb_lock);
> + mutex_unlock(&nfs_callback_mutex);
>   schedule();
>   finish_wait(&serv->sv_cb_waitq, &wq);
>   }

Couldn't this change introduce a deadlock like below?
  [thread]
nfs_callback_down()   nfs41_callback_svc()
   mutex_lock(&nfs_callback_mutex);   
   kthread_stop(cb_info->task);   
  wake_up_process();  
  wait_for_completion(&kthread->exited);  

  
mutex_lock(&nfs_callback_mutex); 


___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ms/mm: introduce kv[mz]alloc helpers

2017-11-07 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.22
-->
commit e297497b1e36380d5022ed98411c4f3ab3741bee
Author: Andrey Ryabinin 
Date:   Tue Nov 7 12:59:17 2017 +0300

ms/mm: introduce kv[mz]alloc helpers

This only small part of the upstream commit
a7c3e901a46ff54c016d040847eda598a9e3e653. I backported only
part that introduces kv[mz]alloc helpers.

Description of the original patch:

commit a7c3e901a46ff54c016d040847eda598a9e3e653
Author: Michal Hocko 
Date:   Mon May 8 15:57:09 2017 -0700

mm: introduce kv[mz]alloc helpers

Patch series "kvmalloc", v5.

There are many open coded kmalloc with vmalloc fallback instances in the
tree.  Most of them are not careful enough or simply do not care about
the underlying semantic of the kmalloc/page allocator which means that
a) some vmalloc fallbacks are basically unreachable because the kmalloc
part will keep retrying until it succeeds b) the page allocator can
invoke a really disruptive steps like the OOM killer to move forward
which doesn't sound appropriate when we consider that the vmalloc
fallback is available.

As it can be seen implementing kvmalloc requires quite an intimate
knowledge if the page allocator and the memory reclaim internals which
strongly suggests that a helper should be implemented in the memory
subsystem proper.

Most callers, I could find, have been converted to use the helper
instead.  This is patch 6.  There are some more relying on __GFP_REPEAT
in the networking stack which I have converted as well and Eric Dumazet
was not opposed [2] to convert them as well.

[1] http://lkml.kernel.org/r/20170130094940.13546-1-mho...@kernel.org
[2] 
http://lkml.kernel.org/r/1485273626.16328.301.ca...@edumazet-glaptop3.roam.corp.google.com

This patch (of 9):

Using kmalloc with the vmalloc fallback for larger allocations is a
common pattern in the kernel code.  Yet we do not have any common helper
for that and so users have invented their own helpers.  Some of them are
really creative when doing so.  Let's just add kv[mz]alloc and make sure
it is implemented properly.  This implementation makes sure to not make
a large memory pressure for > PAGE_SZE requests (__GFP_NORETRY) and also
to not warn about allocation failures.  This also rules out the OOM
killer as the vmalloc is a more approapriate fallback than a disruptive
user visible action.

This patch also changes some existing users and removes helpers which
are specific for them.  In some cases this is not possible (e.g.
ext4_kvmalloc, libcfs_kvzalloc) because those seems to be broken and
require GFP_NO{FS,IO} context which is not vmalloc compatible in general
(note that the page table allocation is GFP_KERNEL).  Those need to be
fixed separately.

While we are at it, document that __vmalloc{_node} about unsupported gfp
mask because there seems to be a lot of confusion out there.
kvmalloc_node will warn about GFP_KERNEL incompatible (which are not
superset) flags to catch new abusers.  Existing ones would have to die
slowly.

https://jira.sw.ru/browse/PSBM-76752
Signed-off-by: Andrey Ryabinin 
---
 include/linux/mm.h   | 14 +
 include/linux/vmalloc.h  |  1 +
 mm/nommu.c   |  5 +++
 mm/util.c| 45 ++
 mm/vmalloc.c |  2 +-
 security/apparmor/apparmorfs.c   |  2 +-
 security/apparmor/include/apparmor.h |  2 --
 security/apparmor/lib.c  | 61 
 security/apparmor/match.c|  2 +-
 9 files changed, 68 insertions(+), 66 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index c806f43..897d7cf 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -401,6 +401,20 @@ static inline int is_vmalloc_or_module_addr(const void *x)
 }
 #endif
 
+extern void *kvmalloc_node(size_t size, gfp_t flags, int node);
+static inline void *kvmalloc(size_t size, gfp_t flags)
+{
+   return kvmalloc_node(size, flags, NUMA_NO_NODE);
+}
+static inline void *kvzalloc_node(size_t size, gfp_t flags, int node)
+{
+   return kvmalloc_node(size, flags | __GFP_ZERO, node);
+}
+static inline void *kvzalloc(size_t size, gfp_t flags)
+{
+   return kvmalloc(size, flags | __GFP_ZERO);
+}
+
 extern void kvfree(const void *addr);
 
 static inline void compound_lock(struct page *page)
diff --git a/include/linux/vmalloc.h b/include/linux/vmalloc.h
index 6ea82cf..59c80dd 100644
--- a/include/

[Devel] [PATCH RHEL7 COMMIT] ms/mm: memcontrol: use vmalloc fallback for large kmem memcg arrays

2017-11-07 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.22
-->
commit 82946e31776cc710b9b79c9dfacc0c74405c9e0c
Author: Johannes Weiner 
Date:   Tue Nov 7 12:59:17 2017 +0300

ms/mm: memcontrol: use vmalloc fallback for large kmem memcg arrays

commit f80c7dab95a1f0f968acbafe4426ee9525b6f6ab upstream.

For quick per-memcg indexing, slab caches and list_lru structures
maintain linear arrays of descriptors. As the number of concurrent
memory cgroups in the system goes up, this requires large contiguous
allocations (8k cgroups = order-5, 16k cgroups = order-6 etc.) for
every existing slab cache and list_lru, which can easily fail on
loaded systems. E.g.:

mkdir: page allocation failure: order:5, 
mode:0x14040c0(GFP_KERNEL|__GFP_COMP), nodemask=(null)
CPU: 1 PID: 6399 Comm: mkdir Not tainted 
4.13.0-mm1-00065-g720bbe532b7c-dirty #481
Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 
1.10.2-20170228_101828-anatol 04/01/2014
Call Trace:
 dump_stack+0x70/0x9d
 warn_alloc+0xd6/0x170
 ? __alloc_pages_direct_compact+0x4c/0x110
 __alloc_pages_nodemask+0xf50/0x1430
 ? __lock_acquire+0xd19/0x1360
 ? memcg_update_all_list_lrus+0x2e/0x2e0
 ? __mutex_lock+0x7c/0x950
 ? memcg_update_all_list_lrus+0x2e/0x2e0
 alloc_pages_current+0x60/0xc0
 kmalloc_order_trace+0x29/0x1b0
 __kmalloc+0x1f4/0x320
 memcg_update_all_list_lrus+0xca/0x2e0
 mem_cgroup_css_alloc+0x612/0x670
 cgroup_apply_control_enable+0x19e/0x360
 cgroup_mkdir+0x322/0x490
 kernfs_iop_mkdir+0x55/0x80
 vfs_mkdir+0xd0/0x120
 SyS_mkdirat+0x6c/0xe0
 SyS_mkdir+0x14/0x20
 entry_SYSCALL_64_fastpath+0x18/0xad
RIP: 0033:0x7f9ff36cee87
RSP: 002b:7ffc7612d758 EFLAGS: 0202 ORIG_RAX: 0053
RAX: ffda RBX: 7ffc7612da48 RCX: 7f9ff36cee87
RDX: 01ff RSI: 01ff RDI: 7ffc7612de86
RBP: 0002 R08: 01ff R09: 00401db0
R10: 01e2 R11: 0202 R12: 
R13: 7ffc7612da40 R14:  R15: 
Mem-Info:
active_anon:2965 inactive_anon:19 isolated_anon:0
 active_file:100270 inactive_file:98846 isolated_file:0
 unevictable:0 dirty:0 writeback:0 unstable:0
 slab_reclaimable:7328 slab_unreclaimable:16402
 mapped:771 shmem:52 pagetables:278 bounce:0
 free:13718 free_pcp:0 free_cma:0

This output is from an artificial reproducer, but we have repeatedly
observed order-7 failures in production in the Facebook fleet. These
systems become useless as they cannot run more jobs, even though there
is plenty of memory to allocate 128 individual pages.

Use kvmalloc and kvzalloc to fall back to vmalloc space if these
arrays prove too large for allocating them physically contiguous.

Link: http://lkml.kernel.org/r/20170918184919.20644-1-han...@cmpxchg.org
Signed-off-by: Johannes Weiner 
Reviewed-by: Josef Bacik 
Acked-by: Michal Hocko 
Acked-by: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-76752
Signed-off-by: Andrey Ryabinin 
---
 mm/list_lru.c| 17 +++--
 mm/slab_common.c | 20 ++--
 2 files changed, 25 insertions(+), 12 deletions(-)

diff --git a/mm/list_lru.c b/mm/list_lru.c
index 5adc6621..91dccc1 100644
--- a/mm/list_lru.c
+++ b/mm/list_lru.c
@@ -322,13 +322,13 @@ static int memcg_init_list_lru_node(struct list_lru_node 
*nlru)
struct list_lru_memcg *memcg_lrus;
int size = memcg_nr_cache_ids;
 
-   memcg_lrus = kmalloc(sizeof(*memcg_lrus) +
+   memcg_lrus = kvmalloc(sizeof(*memcg_lrus) +
 size * sizeof(void *), GFP_KERNEL);
if (!memcg_lrus)
return -ENOMEM;
 
if (__memcg_init_list_lru_node(memcg_lrus, 0, size)) {
-   kfree(memcg_lrus);
+   kvfree(memcg_lrus);
return -ENOMEM;
}
rcu_assign_pointer(nlru->memcg_lrus, memcg_lrus);
@@ -346,7 +346,12 @@ static void memcg_destroy_list_lru_node(struct 
list_lru_node *nlru)
 */
memcg_lrus = rcu_dereference_check(nlru->memcg_lrus, true);
__memcg_destroy_list_lru_node(memcg_lrus, 0, memcg_nr_cache_ids);
-   kfree(memcg_lrus);
+   kvfree(memcg_lrus);
+}
+
+static void free_list_lru_memcg(struct rcu_head *head)
+{
+   kvfree(container_of(head, struct list_lru_memcg, rcu));
 }
 
 static int memcg_update_list_lru_node(struct list_lru_node *nlru,
@@ -359,12 +364,12 @@ static int memcg_update_list_lru_node(struct 
list_lru_node *nlru,
 
/* list_lrus_mutex is held, nobody can change memcg_lrus. Silence RCU */
old = rcu_dereference_check(nlru->memcg_lrus, tr

Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Sure, here it is:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc
... (get transport from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use 
after free)



07.11.2017 10:49, Kirill Tkhai пишет:
> On 03.11.2017 19:47, Stanislav Kinsburskiy wrote:
>> From: Stanislav Kinsburskiy 
>>
>> The problem is that per-net SUNRPC transports shutdown is done regardless
>> current callback execution. This is a race leading to transport 
>> use-after-free
>> in callback handler.
> 
> Could you please draw the race to show the interaction between functions?
> 
>> This patch fixes it in stright-forward way. I.e. it protects callback
>> execution with the same mutex used for per-net data creation and destruction.
>> Hopefully, it won't slow down NFS client significantly.
>>
>> https://jira.sw.ru/browse/PSBM-75751
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/nfs/callback.c |3 +++
>>  1 file changed, 3 insertions(+)
>>
>> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
>> index 0beb275..82e8ed1 100644
>> --- a/fs/nfs/callback.c
>> +++ b/fs/nfs/callback.c
>> @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
>>  continue;
>>  
>>  prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
>> +mutex_lock(&nfs_callback_mutex);
>>  spin_lock_bh(&serv->sv_cb_lock);
>>  if (!list_empty(&serv->sv_cb_list)) {
>>  req = list_first_entry(&serv->sv_cb_list,
>> @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
>>  error = bc_svc_process(serv, req, rqstp);
>>  dprintk("bc_svc_process() returned w/ error code= %d\n",
>>  error);
>> +mutex_unlock(&nfs_callback_mutex);
>>  } else {
>>  spin_unlock_bh(&serv->sv_cb_lock);
>> +mutex_unlock(&nfs_callback_mutex);
>>  schedule();
>>  finish_wait(&serv->sv_cb_waitq, &wq);
>>  }
> 
> Couldn't this change introduce a deadlock like below?
>   [thread]
> nfs_callback_down()   nfs41_callback_svc()
>mutex_lock(&nfs_callback_mutex);   
>kthread_stop(cb_info->task);   
>   wake_up_process();  
>   wait_for_completion(&kthread->exited);  
> 
>   
> mutex_lock(&nfs_callback_mutex); 
> 
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Kirill Tkhai
On 07.11.2017 13:01, Stanislav Kinsburskiy wrote:
> Sure, here it is:
> 
> CPU #0CPU#1
> 
> cleanup_mnt   nfs41_callback_svc
> ...   (get transport from the list)
> nfs_callback_down ...
> ...   ...
> svc_close_net ...
> ...   ...
> svc_xprt_free ...
> svc_bc_sock_free  bc_svc_process
> kfree(xprt)   svc_process_common
>   rqstp->rq_xprt->xpt_ops (use 
> after free)
>   
> 
> 
> 07.11.2017 10:49, Kirill Tkhai пишет:
>> On 03.11.2017 19:47, Stanislav Kinsburskiy wrote:
>>> From: Stanislav Kinsburskiy 
>>>
>>> The problem is that per-net SUNRPC transports shutdown is done regardless
>>> current callback execution. This is a race leading to transport 
>>> use-after-free
>>> in callback handler.
>>
>> Could you please draw the race to show the interaction between functions?
>>
>>> This patch fixes it in stright-forward way. I.e. it protects callback
>>> execution with the same mutex used for per-net data creation and 
>>> destruction.
>>> Hopefully, it won't slow down NFS client significantly.
>>>
>>> https://jira.sw.ru/browse/PSBM-75751
>>>
>>> Signed-off-by: Stanislav Kinsburskiy 
>>> ---
>>>  fs/nfs/callback.c |3 +++
>>>  1 file changed, 3 insertions(+)
>>>
>>> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
>>> index 0beb275..82e8ed1 100644
>>> --- a/fs/nfs/callback.c
>>> +++ b/fs/nfs/callback.c
>>> @@ -118,6 +118,7 @@ nfs41_callback_svc(void *vrqstp)
>>> continue;
>>>  
>>> prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
>>> +   mutex_lock(&nfs_callback_mutex);
>>> spin_lock_bh(&serv->sv_cb_lock);
>>> if (!list_empty(&serv->sv_cb_list)) {
>>> req = list_first_entry(&serv->sv_cb_list,
>>> @@ -129,8 +130,10 @@ nfs41_callback_svc(void *vrqstp)
>>> error = bc_svc_process(serv, req, rqstp);
>>> dprintk("bc_svc_process() returned w/ error code= %d\n",
>>> error);
>>> +   mutex_unlock(&nfs_callback_mutex);
>>> } else {
>>> spin_unlock_bh(&serv->sv_cb_lock);
>>> +   mutex_unlock(&nfs_callback_mutex);
>>> schedule();
>>> finish_wait(&serv->sv_cb_waitq, &wq);
>>> }
>>
>> Couldn't this change introduce a deadlock like below?
>>   [thread]
>> nfs_callback_down()   
>> nfs41_callback_svc()
>>mutex_lock(&nfs_callback_mutex);   
>>kthread_stop(cb_info->task);   
>>   wake_up_process();  
>>   wait_for_completion(&kthread->exited);  

And what about above one?
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
07.11.2017 11:03, Kirill Tkhai пишет:
>>> Couldn't this change introduce a deadlock like below?
>>>   [thread]
>>> nfs_callback_down()   
>>> nfs41_callback_svc()
>>>mutex_lock(&nfs_callback_mutex);   
>>>kthread_stop(cb_info->task);   
>>>   wake_up_process();  
>>>   wait_for_completion(&kthread->exited);  
> 
> And what about above one?
> 

Good catch. Need to think more about it then.
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] Revert "nfs: protect callback execution against per-net callback thread shutdown"

2017-11-07 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.22
-->
commit 118f85ebd4b9e495b9afd77ddc4b44ef2659570e
Author: Konstantin Khorenko 
Date:   Tue Nov 7 13:25:10 2017 +0300

Revert "nfs: protect callback execution against per-net callback thread 
shutdown"

This reverts commit 1164badab1522ad0c93076203fcf231cc846139f.

The patch introduces another deadlock => roll it back for now.

https://jira.sw.ru/browse/PSBM-75751

Signed-off-by: Konstantin Khorenko 
---
 fs/nfs/callback.c | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 82e8ed1..0beb275 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,7 +118,6 @@ nfs41_callback_svc(void *vrqstp)
continue;
 
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
-   mutex_lock(&nfs_callback_mutex);
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -130,10 +129,8 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
-   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
-   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v2] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Here is the race:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc (get xprt from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use after free)

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in a straightforward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

v2: Fix mutex deadlock, where the shutdown callback waits for the thread to exit
(with the mutex taken), while the thread waits to take the mutex.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |7 +++
 1 file changed, 7 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..eed861a 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -118,6 +118,11 @@ nfs41_callback_svc(void *vrqstp)
continue;
 
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+
+   if (!mutex_trylock(&nfs_callback_mutex) &&
+   kthread_should_stop())
+   return 0;
+
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -129,8 +134,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH v2] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Sorry, this one is bad as well.

07.11.2017 11:28, Stanislav Kinsburskiy пишет:
> Here is the race:
> 
> CPU #0CPU#1
> 
> cleanup_mnt   nfs41_callback_svc (get xprt from the list)
> nfs_callback_down ...
> ...   ...
> svc_close_net ...
> ...   ...
> svc_xprt_free ...
> svc_bc_sock_free  bc_svc_process
> kfree(xprt)   svc_process_common
>   rqstp->rq_xprt->xpt_ops (use after free)
> 
> The problem is that per-net SUNRPC transports shutdown is done regardless
> current callback execution. This is a race leading to transport use-after-free
> in callback handler.
> This patch fixes it in stright-forward way. I.e. it protects callback
> execution with the same mutex used for per-net data creation and destruction.
> Hopefully, it won't slow down NFS client significantly.
> 
> https://jira.sw.ru/browse/PSBM-75751
> 
> v2: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
> mutex taken), while thread wait for the mutex to take.
> 
> Signed-off-by: Stanislav Kinsburskiy 
> ---
>  fs/nfs/callback.c |7 +++
>  1 file changed, 7 insertions(+)
> 
> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
> index 0beb275..eed861a 100644
> --- a/fs/nfs/callback.c
> +++ b/fs/nfs/callback.c
> @@ -118,6 +118,11 @@ nfs41_callback_svc(void *vrqstp)
>   continue;
>  
>   prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
> +
> + if (!mutex_trylock(&nfs_callback_mutex) &&
> + kthread_should_stop())
> + return 0;
> +
>   spin_lock_bh(&serv->sv_cb_lock);
>   if (!list_empty(&serv->sv_cb_list)) {
>   req = list_first_entry(&serv->sv_cb_list,
> @@ -129,8 +134,10 @@ nfs41_callback_svc(void *vrqstp)
>   error = bc_svc_process(serv, req, rqstp);
>   dprintk("bc_svc_process() returned w/ error code= %d\n",
>   error);
> + mutex_unlock(&nfs_callback_mutex);
>   } else {
>   spin_unlock_bh(&serv->sv_cb_lock);
> + mutex_unlock(&nfs_callback_mutex);
>   schedule();
>   finish_wait(&serv->sv_cb_waitq, &wq);
>   }
> 
> ___
> Devel mailing list
> Devel@openvz.org
> https://lists.openvz.org/mailman/listinfo/devel
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH v3] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
From: Stanislav Kinsburskiy 

Here is the race:

CPU #0  CPU#1

cleanup_mnt nfs41_callback_svc (get xprt from the list)
nfs_callback_down   ...
... ...
svc_close_net   ...
... ...
svc_xprt_free   ...
svc_bc_sock_freebc_svc_process
kfree(xprt) svc_process_common
rqstp->rq_xprt->xpt_ops (use after free)

The problem is that per-net SUNRPC transports shutdown is done regardless
current callback execution. This is a race leading to transport use-after-free
in callback handler.
This patch fixes it in a straightforward way. I.e. it protects callback
execution with the same mutex used for per-net data creation and destruction.
Hopefully, it won't slow down NFS client significantly.

https://jira.sw.ru/browse/PSBM-75751

v3: Fix mutex deadlock, where the shutdown callback waits for the thread to exit
(with the mutex taken), while the thread waits to take the mutex.
The idea is to simply check whether the thread has to exit if the mutex lock has failed.
This is a busy loop, but it shouldn't happen often or for long.

Signed-off-by: Stanislav Kinsburskiy 
---
 fs/nfs/callback.c |6 ++
 1 file changed, 6 insertions(+)

diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
index 0beb275..bbb07e4 100644
--- a/fs/nfs/callback.c
+++ b/fs/nfs/callback.c
@@ -117,7 +117,11 @@ nfs41_callback_svc(void *vrqstp)
if (try_to_freeze())
continue;
 
+   if (!mutex_trylock(&nfs_callback_mutex))
+  continue;
+
prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
+
spin_lock_bh(&serv->sv_cb_lock);
if (!list_empty(&serv->sv_cb_list)) {
req = list_first_entry(&serv->sv_cb_list,
@@ -129,8 +133,10 @@ nfs41_callback_svc(void *vrqstp)
error = bc_svc_process(serv, req, rqstp);
dprintk("bc_svc_process() returned w/ error code= %d\n",
error);
+   mutex_unlock(&nfs_callback_mutex);
} else {
spin_unlock_bh(&serv->sv_cb_lock);
+   mutex_unlock(&nfs_callback_mutex);
schedule();
finish_wait(&serv->sv_cb_waitq, &wq);
}

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH v3] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Kirill Tkhai
On 07.11.2017 13:39, Stanislav Kinsburskiy wrote:
> From: Stanislav Kinsburskiy 
> 
> Here is the race:
> 
> CPU #0CPU#1
> 
> cleanup_mnt   nfs41_callback_svc (get xprt from the list)
> nfs_callback_down ...
> ...   ...
> svc_close_net ...
> ...   ...
> svc_xprt_free ...
> svc_bc_sock_free  bc_svc_process
> kfree(xprt)   svc_process_common
>   rqstp->rq_xprt->xpt_ops (use after free)
> 
> The problem is that per-net SUNRPC transports shutdown is done regardless
> current callback execution. This is a race leading to transport use-after-free
> in callback handler.
> This patch fixes it in stright-forward way. I.e. it protects callback
> execution with the same mutex used for per-net data creation and destruction.
> Hopefully, it won't slow down NFS client significantly.
> 
> https://jira.sw.ru/browse/PSBM-75751
> 
> v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
> mutex taken), while thread wait for the mutex to take.
> The idea is to simply check if thread has to exit, if mutex lock has failed.
> This is a busy loop, but it shouldn't happend often and for long.
> 
> Signed-off-by: Stanislav Kinsburskiy 
> ---
>  fs/nfs/callback.c |6 ++
>  1 file changed, 6 insertions(+)
> 
> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
> index 0beb275..bbb07e4 100644
> --- a/fs/nfs/callback.c
> +++ b/fs/nfs/callback.c
> @@ -117,7 +117,11 @@ nfs41_callback_svc(void *vrqstp)
>   if (try_to_freeze())
>   continue;
>  
> + if (!mutex_trylock(&nfs_callback_mutex))
> +continue;

This looks like a busy loop (that especially bad, because mutex-owner may sleep
at the moment we are looping).

It seems the solution may be to change nfs_callback_down() function.
Can't we flush pending request in nfs_callback_down()? Or just delete
it from sv_cb_list without handling (does nfs proto allow that)?

Also, one more additional argument for flush is a suspicion there may be more 
than
one pending request in the list. Can it be?

> +
>   prepare_to_wait(&serv->sv_cb_waitq, &wq, TASK_INTERRUPTIBLE);
> +
>   spin_lock_bh(&serv->sv_cb_lock);
>   if (!list_empty(&serv->sv_cb_list)) {
>   req = list_first_entry(&serv->sv_cb_list,
> @@ -129,8 +133,10 @@ nfs41_callback_svc(void *vrqstp)
>   error = bc_svc_process(serv, req, rqstp);
>   dprintk("bc_svc_process() returned w/ error code= %d\n",
>   error);
> + mutex_unlock(&nfs_callback_mutex);
>   } else {
>   spin_unlock_bh(&serv->sv_cb_lock);
> + mutex_unlock(&nfs_callback_mutex);
>   schedule();
>   finish_wait(&serv->sv_cb_waitq, &wq);
>   }
> 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RH7] ve/sched: remove no ve root cgroup warning

2017-11-07 Thread Pavel Tikhomirov
We can get to ve_root_tg() from host's cgroup so it is expected
to have no ve root cgroup for it. Call stack on task wakeup:

wake_up_process -> try_to_wake_up -> select_task_rq_fair
-> select_runnable_cpu -> check_cpulimit_spread -> tg_cpu_rate
-> ve_root_tg

Signed-off-by: Pavel Tikhomirov 
---
 kernel/sched/core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b3daa139638..2d258b92529d 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -349,7 +349,6 @@ static struct task_group *ve_root_tg(struct task_group *tg) 
{
return NULL;
 
cg = cgroup_get_ve_root(tg->css.cgroup);
-   WARN_ONCE(!cg, "Failed to find ve root cgroup, possible container 
configuration problem.\n");
return cg ? cgroup_tg(cg) : NULL;
 }
 
-- 
2.13.5

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


[Devel] [PATCH RHEL7 COMMIT] ve/sched: remove no ve root cgroup warning

2017-11-07 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-693.1.1.vz7.37.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-693.1.1.vz7.37.23
-->
commit 31659699d99c1af8cc90a8e1b0212168aee1c981
Author: Pavel Tikhomirov 
Date:   Tue Nov 7 15:34:15 2017 +0300

ve/sched: remove no ve root cgroup warning

We can get to ve_root_tg() from host's cgroup so it is expected
to have no ve root cgroup for it. Call stack on task wakeup:

wake_up_process -> try_to_wake_up -> select_task_rq_fair
-> select_runnable_cpu -> check_cpulimit_spread -> tg_cpu_rate
-> ve_root_tg

Fixes: e661261 ("ve/sched: take nr_cpus and cpu_rate from ve root task 
group")

Signed-off-by: Pavel Tikhomirov 
---
 kernel/sched/core.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 5b3daa1..2d258b9 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -349,7 +349,6 @@ static struct task_group *ve_root_tg(struct task_group *tg) 
{
return NULL;
 
cg = cgroup_get_ve_root(tg->css.cgroup);
-   WARN_ONCE(!cg, "Failed to find ve root cgroup, possible container 
configuration problem.\n");
return cg ? cgroup_tg(cg) : NULL;
 }
 
___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel


Re: [Devel] [PATCH v3] nfs: protect callback execution against per-net callback thread shutdown

2017-11-07 Thread Stanislav Kinsburskiy
Please, see my comments below.

07.11.2017 12:30, Kirill Tkhai пишет:
> On 07.11.2017 13:39, Stanislav Kinsburskiy wrote:
>> From: Stanislav Kinsburskiy 
>>
>> Here is the race:
>>
>> CPU #0   CPU#1
>>
>> cleanup_mnt  nfs41_callback_svc (get xprt from the list)
>> nfs_callback_down...
>> ...  ...
>> svc_close_net...
>> ...  ...
>> svc_xprt_free...
>> svc_bc_sock_free bc_svc_process
>> kfree(xprt)  svc_process_common
>>  rqstp->rq_xprt->xpt_ops (use after free)
>>
>> The problem is that per-net SUNRPC transports shutdown is done regardless
>> current callback execution. This is a race leading to transport 
>> use-after-free
>> in callback handler.
>> This patch fixes it in stright-forward way. I.e. it protects callback
>> execution with the same mutex used for per-net data creation and destruction.
>> Hopefully, it won't slow down NFS client significantly.
>>
>> https://jira.sw.ru/browse/PSBM-75751
>>
>> v3: Fix mutex deadlock, when shutdown callback waits for thread to exit (with
>> mutex taken), while thread wait for the mutex to take.
>> The idea is to simply check if thread has to exit, if mutex lock has failed.
>> This is a busy loop, but it shouldn't happend often and for long.
>>
>> Signed-off-by: Stanislav Kinsburskiy 
>> ---
>>  fs/nfs/callback.c |6 ++
>>  1 file changed, 6 insertions(+)
>>
>> diff --git a/fs/nfs/callback.c b/fs/nfs/callback.c
>> index 0beb275..bbb07e4 100644
>> --- a/fs/nfs/callback.c
>> +++ b/fs/nfs/callback.c
>> @@ -117,7 +117,11 @@ nfs41_callback_svc(void *vrqstp)
>>  if (try_to_freeze())
>>  continue;
>>  
>> +if (!mutex_trylock(&nfs_callback_mutex))
>> +   continue;
> 
> This looks like a busy loop (that especially bad, because mutex-owner may 
> sleep
> at the moment we are looping).
> 

Well, yes. It's a busy loop. And that's not good. But I have to mention that
this busy loop can happen only on an umount request.

> It seems the solution may be to change nfs_callback_down() function.
> Can't we flush pending request in nfs_callback_down()?

Well, this already happens. The problem is in race between transport usage 
(unconditional) and its destruction (also unconditional).
IOW, there should be either reference counting or some critical section.
Looks like the former would need way more code and thus be more error-prone; the
latter is uglier, but the code should be simpler at first sight.


> Or just delete it from sv_cb_list without handling (does nfs proto allow 
> that)?
> 

Well, this looks like a promising idea. But yet again, there should be some 
protection against transport usage in shutdown helper.
And it's not yet clear, how to implement it without significant code change...

___
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel