[Devel] [PATCH] sched/fair: Fix ENQUEUE_WAKEUP false appearance in enqueue_task_fair()

2017-04-13 Thread Kirill Tkhai
When enqueue_task_fair() is called without ENQUEUE_WAKEUP,
e.g. on priority change or on migration, we forcibly add that
flag anyway, which is wrong: it leads to incorrect vruntime
accounting for the parent cfs_rq's. The original intent (and a
sane reading of the code confirms this) was apparently to clear
the ENQUEUE_BOOST bit before the next iteration, but it was
implemented incorrectly.

Fix that by clearing only ENQUEUE_BOOST explicitly, instead of
corrupting the parents' vruntime.
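
For illustration, here is a rough sketch of the first enqueue loop with the
fix applied. The surrounding loop structure is paraphrased from the mainline
enqueue_task_fair() and may differ slightly in this tree; only the flag
handling is the point:

    for_each_sched_entity(se) {
            if (se->on_rq)
                    break;
            cfs_rq = cfs_rq_of(se);
            /* ENQUEUE_WAKEUP is passed through only if the caller set it */
            enqueue_entity(cfs_rq, se, flags);

            if (boost)
                    boost = enqueue_boosted_entity(cfs_rq, se);

            /* clear only the one-shot boost bit for the parent levels */
            flags &= ~ENQUEUE_BOOST;
    }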

Found in scope of https://jira.sw.ru/browse/PSBM-62208

Signed-off-by: Kirill Tkhai 
---
 kernel/sched/fair.c |2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4fde0d42a95..a8cf67c977b 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4289,7 +4289,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, 
int flags)
if (boost)
boost = enqueue_boosted_entity(cfs_rq, se);
 
-   flags = ENQUEUE_WAKEUP;
+   flags &= ~ENQUEUE_BOOST;
}
 
for_each_sched_entity(se) {



Re: [Devel] [PATCH] sched/numa: Fix use-after-free bug in the task_numa_compare

2017-04-13 Thread Konstantin Khorenko

Maybe apply the following patch as well?
It mostly reverts your patch.

commit bac7857319bcf7fed329a10bb760053e761115c0
Author: Oleg Nesterov 
Date:   Wed May 18 21:57:33 2016 +0200

sched/fair: Use task_rcu_dereference()

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 04/13/2017 01:09 PM, Kirill Tkhai wrote:

ms commit 1dff76b92f69 by Gavin Guo 

The following message can be observed on the Ubuntu v3.13.0-65 with KASan
backported:

  ==
  BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr 
880dd393ecd8
  Read of size 8 by task qemu-system-x86/3998900
  =
  BUG kmalloc-128 (Tainted: GB): kasan: bad access detected
  -

  INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890
__slab_alloc+0x4f8/0x560
__kmalloc+0x1eb/0x280
task_numa_fault+0xc1b/0xed0
do_numa_page+0x192/0x200
handle_mm_fault+0x808/0x1160
__do_page_fault+0x218/0x750
do_page_fault+0x1a/0x70
page_fault+0x28/0x30
SyS_poll+0x66/0x1a0
system_call_fastpath+0x1a/0x1f
  INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0
__slab_free+0x2ab/0x3f0
kfree+0x161/0x170
task_numa_free+0x1d2/0x200
finish_task_switch+0x1d2/0x210
__schedule+0x5d4/0xc60
schedule_preempt_disabled+0x40/0xc0
cpu_startup_entry+0x2da/0x340
start_secondary+0x28f/0x360
  Call Trace:
   [] dump_stack+0x45/0x56
   [] print_trailer+0xfd/0x170
   [] object_err+0x36/0x40
   [] kasan_report_error+0x1e9/0x3a0
   [] kasan_report+0x40/0x50
   [] ? task_numa_find_cpu+0x64c/0x890
   [] __asan_load8+0x69/0xa0
   [] ? find_next_bit+0xd8/0x120
   [] task_numa_find_cpu+0x64c/0x890
   [] task_numa_migrate+0x4ac/0x7b0
   [] numa_migrate_preferred+0xb3/0xc0
   [] task_numa_fault+0xb88/0xed0
   [] do_numa_page+0x192/0x200
   [] handle_mm_fault+0x808/0x1160
   [] ? sched_clock_cpu+0x10d/0x160
   [] ? native_load_tls+0x82/0xa0
   [] __do_page_fault+0x218/0x750
   [] ? hrtimer_try_to_cancel+0x76/0x160
   [] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0
   [] do_page_fault+0x1a/0x70
   [] page_fault+0x28/0x30
   [] ? do_sys_poll+0x1c4/0x6d0
   [] ? enqueue_task_fair+0x4b6/0xaa0
   [] ? sched_clock+0x9/0x10
   [] ? resched_task+0x7a/0xc0
   [] ? check_preempt_curr+0xb3/0x130
   [] ? poll_select_copy_remaining+0x170/0x170
   [] ? wake_up_state+0x10/0x20
   [] ? drop_futex_key_refs.isra.14+0x1f/0x90
   [] ? futex_requeue+0x3de/0xba0
   [] ? do_futex+0xbe/0x8f0
   [] ? read_tsc+0x9/0x20
   [] ? ktime_get_ts+0x12d/0x170
   [] ? timespec_add_safe+0x59/0xe0
   [] SyS_poll+0x66/0x1a0
   [] system_call_fastpath+0x1a/0x1f

As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in
task_numa_assign()") points out, the rcu_read_lock() cannot protect the
task_struct from being freed in the finish_task_switch(). And the bug
happens in the process of calculation of imp which requires the access of
p->numa_faults being freed in the following path:

do_exit()
current->flags |= PF_EXITING;
release_task()
~~delayed_put_task_struct()~~
schedule()
...
...
rq->curr = next;
context_switch()
finish_task_switch()
put_task_struct()
__put_task_struct()
task_numa_free()

The fix here to get_task_struct() early before end of dst_rq->lock to
protect the calculation process and also put_task_struct() in the
corresponding point if finally the dst_rq->curr somehow cannot be
assigned.

Additional credit to Liang Chen who helped fix the error logic and add the
put_task_struct() to the place it missed.

Signed-off-by: Gavin Guo 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andrea Arcangeli 
Cc: Andrew Morton 
Cc: Hugh Dickins 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: jay.vosbu...@canonical.com
Cc: liang.c...@canonical.com
Link: 
http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin@canonical.com
Signed-off-by: Ingo Molnar 

In scope of https://jira.sw.ru/browse/PSBM-62208

Signed-off-by: Kirill Tkhai 
---
 kernel/sched/fair.c |   29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index daf709e40b6..4fde0d42a95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1388,8 +1388,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
if (env->best_task)
put_task_struct(env->best_task);
-   if (p)
-   get_task_struct(p);

env->best_task = p;
env->best_imp = imp;
@@ -1411,19 +1409,29 @@ static void task_numa_compare(struct task_numa_env *env,
long dst_load, src

[Devel] [PATCH RHEL7 COMMIT] fs/block_dev: always invalidate cleancache in invalidate_bdev()

2017-04-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-514.10.2.vz7.29.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.10.2.vz7.29.15
-->
commit 5ffa7e4f1451a245f9fc4ba94df77c2a0e911add
Author: Andrey Ryabinin 
Date:   Thu Apr 13 14:41:06 2017 +0400

fs/block_dev: always invalidate cleancache in invalidate_bdev()

invalidate_bdev() calls cleancache_invalidate_inode() only when ->nrpages != 0,
which doesn't make any sense.
Make invalidate_bdev() always invalidate the cleancache data.

https://jira.sw.ru/browse/PSBM-63908

Signed-off-by: Andrey Ryabinin 
---
 fs/block_dev.c | 12 ++--
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/fs/block_dev.c b/fs/block_dev.c
index 967588e..8f2c6ee 100644
--- a/fs/block_dev.c
+++ b/fs/block_dev.c
@@ -108,12 +108,12 @@ void invalidate_bdev(struct block_device *bdev)
 {
struct address_space *mapping = bdev->bd_inode->i_mapping;
 
-   if (mapping->nrpages == 0)
-   return;
-
-   invalidate_bh_lrus();
-   lru_add_drain_all();/* make sure all lru add caches are flushed */
-   invalidate_mapping_pages(mapping, 0, -1);
+   /* FIXME: Shouldn't we add '|| mapping->nrexceptional' ? */
+   if (mapping->nrpages) {
+   invalidate_bh_lrus();
+   lru_add_drain_all();/* make sure all lru add caches are 
flushed */
+   invalidate_mapping_pages(mapping, 0, -1);
+   }
/* 99% of the time, we don't need to flush the cleancache on the bdev.
 * But, for the strange corners, lets be cautious
 */
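
In other words, after this change invalidate_bdev() looks roughly like the
sketch below (assembled from the hunk above plus the
cleancache_invalidate_inode() call that sits just below the changed lines;
an illustration, not a verbatim copy of the tree):

    void invalidate_bdev(struct block_device *bdev)
    {
            struct address_space *mapping = bdev->bd_inode->i_mapping;

            /* page-cache work is still skipped when nothing is cached ... */
            if (mapping->nrpages) {
                    invalidate_bh_lrus();
                    lru_add_drain_all();    /* flush all lru add caches */
                    invalidate_mapping_pages(mapping, 0, -1);
            }
            /* ... but the cleancache is now invalidated unconditionally */
            cleancache_invalidate_inode(mapping);
    }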


Re: [Devel] [PATCH] sched/numa: Fix use-after-free bug in the task_numa_compare

2017-04-13 Thread Konstantin Khorenko

On 04/13/2017 01:58 PM, Konstantin Khorenko wrote:

Maybe apply the following patch as well?

Of course, together with the preparation patch:

150593b sched/api: Introduce task_rcu_dereference() and try_get_task_struct()


It mostly reverts your patch.

commit bac7857319bcf7fed329a10bb760053e761115c0
Author: Oleg Nesterov 
Date:   Wed May 18 21:57:33 2016 +0200

 sched/fair: Use task_rcu_dereference()

--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 04/13/2017 01:09 PM, Kirill Tkhai wrote:

ms commit 1dff76b92f69 by Gavin Guo 

The following message can be observed on the Ubuntu v3.13.0-65 with KASan
backported:

  ==
  BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr 
880dd393ecd8
  Read of size 8 by task qemu-system-x86/3998900
  =
  BUG kmalloc-128 (Tainted: GB): kasan: bad access detected
  -

  INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890
__slab_alloc+0x4f8/0x560
__kmalloc+0x1eb/0x280
task_numa_fault+0xc1b/0xed0
do_numa_page+0x192/0x200
handle_mm_fault+0x808/0x1160
__do_page_fault+0x218/0x750
do_page_fault+0x1a/0x70
page_fault+0x28/0x30
SyS_poll+0x66/0x1a0
system_call_fastpath+0x1a/0x1f
  INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0
__slab_free+0x2ab/0x3f0
kfree+0x161/0x170
task_numa_free+0x1d2/0x200
finish_task_switch+0x1d2/0x210
__schedule+0x5d4/0xc60
schedule_preempt_disabled+0x40/0xc0
cpu_startup_entry+0x2da/0x340
start_secondary+0x28f/0x360
  Call Trace:
   [] dump_stack+0x45/0x56
   [] print_trailer+0xfd/0x170
   [] object_err+0x36/0x40
   [] kasan_report_error+0x1e9/0x3a0
   [] kasan_report+0x40/0x50
   [] ? task_numa_find_cpu+0x64c/0x890
   [] __asan_load8+0x69/0xa0
   [] ? find_next_bit+0xd8/0x120
   [] task_numa_find_cpu+0x64c/0x890
   [] task_numa_migrate+0x4ac/0x7b0
   [] numa_migrate_preferred+0xb3/0xc0
   [] task_numa_fault+0xb88/0xed0
   [] do_numa_page+0x192/0x200
   [] handle_mm_fault+0x808/0x1160
   [] ? sched_clock_cpu+0x10d/0x160
   [] ? native_load_tls+0x82/0xa0
   [] __do_page_fault+0x218/0x750
   [] ? hrtimer_try_to_cancel+0x76/0x160
   [] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0
   [] do_page_fault+0x1a/0x70
   [] page_fault+0x28/0x30
   [] ? do_sys_poll+0x1c4/0x6d0
   [] ? enqueue_task_fair+0x4b6/0xaa0
   [] ? sched_clock+0x9/0x10
   [] ? resched_task+0x7a/0xc0
   [] ? check_preempt_curr+0xb3/0x130
   [] ? poll_select_copy_remaining+0x170/0x170
   [] ? wake_up_state+0x10/0x20
   [] ? drop_futex_key_refs.isra.14+0x1f/0x90
   [] ? futex_requeue+0x3de/0xba0
   [] ? do_futex+0xbe/0x8f0
   [] ? read_tsc+0x9/0x20
   [] ? ktime_get_ts+0x12d/0x170
   [] ? timespec_add_safe+0x59/0xe0
   [] SyS_poll+0x66/0x1a0
   [] system_call_fastpath+0x1a/0x1f

As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in
task_numa_assign()") points out, the rcu_read_lock() cannot protect the
task_struct from being freed in the finish_task_switch(). And the bug
happens in the process of calculation of imp which requires the access of
p->numa_faults being freed in the following path:

do_exit()
current->flags |= PF_EXITING;
release_task()
~~delayed_put_task_struct()~~
schedule()
...
...
rq->curr = next;
context_switch()
finish_task_switch()
put_task_struct()
__put_task_struct()
task_numa_free()

The fix here to get_task_struct() early before end of dst_rq->lock to
protect the calculation process and also put_task_struct() in the
corresponding point if finally the dst_rq->curr somehow cannot be
assigned.

Additional credit to Liang Chen who helped fix the error logic and add the
put_task_struct() to the place it missed.

Signed-off-by: Gavin Guo 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andrea Arcangeli 
Cc: Andrew Morton 
Cc: Hugh Dickins 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: jay.vosbu...@canonical.com
Cc: liang.c...@canonical.com
Link: 
http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin@canonical.com
Signed-off-by: Ingo Molnar 

In scope of https://jira.sw.ru/browse/PSBM-62208

Signed-off-by: Kirill Tkhai 
---
 kernel/sched/fair.c |   29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index daf709e40b6..4fde0d42a95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1388,8 +1388,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
if (env->best_task)
put_task_struct(env->best_task);
-   if (p)
-   get_task_st

Re: [Devel] [PATCH] sched/numa: Fix use-after-free bug in the task_numa_compare

2017-04-13 Thread Kirill Tkhai
On 13.04.2017 13:58, Konstantin Khorenko wrote:
> Maybe apply the following patch as well?
> It mostly reverts your patch.

It depends on another patch, so I didn't pick it. Which way do you prefer?
 
> commit bac7857319bcf7fed329a10bb760053e761115c0
> Author: Oleg Nesterov 
> Date:   Wed May 18 21:57:33 2016 +0200
> 
> sched/fair: Use task_rcu_dereference()
> 
> -- 
> Best regards,
> 
> Konstantin Khorenko,
> Virtuozzo Linux Kernel Team
> 
> On 04/13/2017 01:09 PM, Kirill Tkhai wrote:
>> ms commit 1dff76b92f69 by Gavin Guo 
>>
>> The following message can be observed on the Ubuntu v3.13.0-65 with KASan
>> backported:
>>
>>   ==
>>   BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr 
>> 880dd393ecd8
>>   Read of size 8 by task qemu-system-x86/3998900
>>   
>> =
>>   BUG kmalloc-128 (Tainted: GB): kasan: bad access detected
>>   
>> -
>>
>>   INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890
>> __slab_alloc+0x4f8/0x560
>> __kmalloc+0x1eb/0x280
>> task_numa_fault+0xc1b/0xed0
>> do_numa_page+0x192/0x200
>> handle_mm_fault+0x808/0x1160
>> __do_page_fault+0x218/0x750
>> do_page_fault+0x1a/0x70
>> page_fault+0x28/0x30
>> SyS_poll+0x66/0x1a0
>> system_call_fastpath+0x1a/0x1f
>>   INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0
>> __slab_free+0x2ab/0x3f0
>> kfree+0x161/0x170
>> task_numa_free+0x1d2/0x200
>> finish_task_switch+0x1d2/0x210
>> __schedule+0x5d4/0xc60
>> schedule_preempt_disabled+0x40/0xc0
>> cpu_startup_entry+0x2da/0x340
>> start_secondary+0x28f/0x360
>>   Call Trace:
>>[] dump_stack+0x45/0x56
>>[] print_trailer+0xfd/0x170
>>[] object_err+0x36/0x40
>>[] kasan_report_error+0x1e9/0x3a0
>>[] kasan_report+0x40/0x50
>>[] ? task_numa_find_cpu+0x64c/0x890
>>[] __asan_load8+0x69/0xa0
>>[] ? find_next_bit+0xd8/0x120
>>[] task_numa_find_cpu+0x64c/0x890
>>[] task_numa_migrate+0x4ac/0x7b0
>>[] numa_migrate_preferred+0xb3/0xc0
>>[] task_numa_fault+0xb88/0xed0
>>[] do_numa_page+0x192/0x200
>>[] handle_mm_fault+0x808/0x1160
>>[] ? sched_clock_cpu+0x10d/0x160
>>[] ? native_load_tls+0x82/0xa0
>>[] __do_page_fault+0x218/0x750
>>[] ? hrtimer_try_to_cancel+0x76/0x160
>>[] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0
>>[] do_page_fault+0x1a/0x70
>>[] page_fault+0x28/0x30
>>[] ? do_sys_poll+0x1c4/0x6d0
>>[] ? enqueue_task_fair+0x4b6/0xaa0
>>[] ? sched_clock+0x9/0x10
>>[] ? resched_task+0x7a/0xc0
>>[] ? check_preempt_curr+0xb3/0x130
>>[] ? poll_select_copy_remaining+0x170/0x170
>>[] ? wake_up_state+0x10/0x20
>>[] ? drop_futex_key_refs.isra.14+0x1f/0x90
>>[] ? futex_requeue+0x3de/0xba0
>>[] ? do_futex+0xbe/0x8f0
>>[] ? read_tsc+0x9/0x20
>>[] ? ktime_get_ts+0x12d/0x170
>>[] ? timespec_add_safe+0x59/0xe0
>>[] SyS_poll+0x66/0x1a0
>>[] system_call_fastpath+0x1a/0x1f
>>
>> As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in
>> task_numa_assign()") points out, the rcu_read_lock() cannot protect the
>> task_struct from being freed in the finish_task_switch(). And the bug
>> happens in the process of calculation of imp which requires the access of
>> p->numa_faults being freed in the following path:
>>
>> do_exit()
>> current->flags |= PF_EXITING;
>> release_task()
>> ~~delayed_put_task_struct()~~
>> schedule()
>> ...
>> ...
>> rq->curr = next;
>> context_switch()
>> finish_task_switch()
>> put_task_struct()
>> __put_task_struct()
>> task_numa_free()
>>
>> The fix here to get_task_struct() early before end of dst_rq->lock to
>> protect the calculation process and also put_task_struct() in the
>> corresponding point if finally the dst_rq->curr somehow cannot be
>> assigned.
>>
>> Additional credit to Liang Chen who helped fix the error logic and add the
>> put_task_struct() to the place it missed.
>>
>> Signed-off-by: Gavin Guo 
>> Signed-off-by: Peter Zijlstra (Intel) 
>> Cc: Andrea Arcangeli 
>> Cc: Andrew Morton 
>> Cc: Hugh Dickins 
>> Cc: Linus Torvalds 
>> Cc: Mel Gorman 
>> Cc: Peter Zijlstra 
>> Cc: Rik van Riel 
>> Cc: Thomas Gleixner 
>> Cc: jay.vosbu...@canonical.com
>> Cc: liang.c...@canonical.com
>> Link: 
>> http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin@canonical.com
>> Signed-off-by: Ingo Molnar 
>>
>> In scope of https://jira.sw.ru/browse/PSBM-62208
>>
>> Signed-off-by: Kirill Tkhai 
>> ---
>>  kernel/sched/fair.c |   29 ++---
>>  1 file changed, 22 insertions(+), 7 deletions(-)
>>
>> di

[Devel] [PATCH RHEL7 COMMIT] fs/cleancache: fix data invalidation in the cleancache during direct_io

2017-04-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-514.10.2.vz7.29.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.10.2.vz7.29.15
-->
commit f6657e2f0a0a27991b09be0421320c3bef8a5a96
Author: Andrey Ryabinin 
Date:   Thu Apr 13 14:41:04 2017 +0400

fs/cleancache: fix data invalidation in the cleancache during direct_io

Currently some direct_io fs hooks call invalidate_inode_pages2_range()
only when mapping->nrpages is not zero. So if nrpages is zero, data in the
cleancache wouldn't be invalidated, and the next buffered read may get
stale data from the cleancache.

Fix this by calling invalidate_inode_pages2_range() regardless of nrpages
value. And if nrpages is zero, bail out from invalidate_inode_pages2_range()
only after cleancache_invalidate_inode(), so that we invalidate cleancache
but still avoid pointless page cache lookups.
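
The resulting control flow in the invalidation helpers is roughly the sketch
below (the actual mm/truncate.c hunks with the 'goto out' rework are in the
follow-up commit later in this digest; this is an illustration, not the
exact code):

    int invalidate_inode_pages2_range(struct address_space *mapping,
                                      pgoff_t start, pgoff_t end)
    {
            int ret = 0;

            if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
                    goto out;   /* no page-cache pages, but cleancache may hold data */

            /* ... walk the range and invalidate the page-cache pages ... */

    out:
            /* always drop the cleancache data, even if the page cache was empty */
            cleancache_invalidate_inode(mapping);
            return ret;
    }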

https://jira.sw.ru/browse/PSBM-63908

Signed-off-by: Andrey Ryabinin 
---
 fs/9p/vfs_file.c  |  4 ++--
 fs/nfs/direct.c   | 16 ++--
 fs/nfs/inode.c|  7 ---
 fs/xfs/xfs_file.c | 30 ++
 mm/filemap.c  | 28 
 mm/truncate.c |  4 
 6 files changed, 42 insertions(+), 47 deletions(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 7da03f8..afe0036 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -482,7 +482,7 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid 
*fid,
if (invalidate && (total > 0)) {
pg_start = origin >> PAGE_CACHE_SHIFT;
pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
-   if (inode->i_mapping && inode->i_mapping->nrpages)
+   if (inode->i_mapping)
invalidate_inode_pages2_range(inode->i_mapping,
  pg_start, pg_end);
*offset += total;
@@ -688,7 +688,7 @@ v9fs_direct_write(struct file *filp, const char __user * 
data,
 * about to write.  We do this *before* the write so that if we fail
 * here we fall back to buffered write
 */
-   if (mapping->nrpages) {
+   {
pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
pgoff_t pg_end   = (offset + count - 1) >> PAGE_CACHE_SHIFT;
 
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ab96f01..963 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -1132,12 +1132,10 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const 
struct iovec *iov,
if (result)
goto out_unlock;
 
-   if (mapping->nrpages) {
-   result = invalidate_inode_pages2_range(mapping,
-   pos >> PAGE_CACHE_SHIFT, end);
-   if (result)
-   goto out_unlock;
-   }
+   result = invalidate_inode_pages2_range(mapping,
+   pos >> PAGE_CACHE_SHIFT, end);
+   if (result)
+   goto out_unlock;
 
task_io_account_write(count);
 
@@ -1161,10 +1159,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const 
struct iovec *iov,
 
result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);
 
-   if (mapping->nrpages) {
-   invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
-   }
+   invalidate_inode_pages2_range(mapping,
+   pos >> PAGE_CACHE_SHIFT, end);
 
mutex_unlock(&inode->i_mutex);
 
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8c06aed..779b05c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1065,10 +1065,11 @@ static int nfs_invalidate_mapping(struct inode *inode, 
struct address_space *map
if (ret < 0)
return ret;
}
-   ret = invalidate_inode_pages2(mapping);
-   if (ret < 0)
-   return ret;
}
+   ret = invalidate_inode_pages2(mapping);
+   if (ret < 0)
+   return ret;
+
if (S_ISDIR(inode->i_mode)) {
spin_lock(&inode->i_lock);
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a2193b..0b7a35b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -346,7 +346,7 @@ xfs_file_aio_read(
 * serialisation.
 */
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-   if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+   if ((ioflags & XFS_IO_ISDIRECT)) {
xfs_rw_iunlock(ip, XFS_IOLOCK_SHARED);
xfs_rw_ilock(ip, XFS_IOLOCK_EXCL);
 
@@ -361,22 +361,20 @@ xfs_file_aio_read(
 * flush and reduce the chances of repeated iolock cycles going
 * forward.
 */
-   if (inode->i_ma

[Devel] [PATCH RHEL7 COMMIT] cleancache: avoid pointless cleancache_invalidate_inode() calls.

2017-04-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-514.10.2.vz7.29.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.10.2.vz7.29.15
-->
commit 456aa581f3cd05b156995b2b03a498133733b25f
Author: Andrey Ryabinin 
Date:   Thu Apr 13 14:41:05 2017 +0400

cleancache: avoid pointless cleancache_invalidate_inode() calls.

Per Alexey: there is no point in invalidating the cleancache on entry to the
truncate_inode_pages_range()/invalidate_inode_pages2_range() routines.
It is a waste of time: the cleancache will just be repopulated during the
invalidation itself (which is stupid, of course). It is enough to do this once, on exit.

https://jira.sw.ru/browse/PSBM-63908

Signed-off-by: Andrey Ryabinin 
---
 mm/truncate.c | 13 +++--
 1 file changed, 7 insertions(+), 6 deletions(-)

diff --git a/mm/truncate.c b/mm/truncate.c
index ce4b1d8..1db0425 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -285,9 +285,8 @@ void truncate_inode_pages_range(struct address_space 
*mapping,
int i;
int bug_if_page_has_bh = 0;
 
-   cleancache_invalidate_inode(mapping);
if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
-   return;
+   goto out;
 
/* Offsets within partial pages */
partial_start = lstart & (PAGE_CACHE_SIZE - 1);
@@ -390,7 +389,7 @@ void truncate_inode_pages_range(struct address_space 
*mapping,
 * will be released, just zeroed, so we can bail out now.
 */
if (start >= end)
-   return;
+   goto out;
 
index = start;
for ( ; ; ) {
@@ -431,6 +430,8 @@ void truncate_inode_pages_range(struct address_space 
*mapping,
pagevec_release(&pvec);
index++;
}
+
+out:
cleancache_invalidate_inode(mapping);
 }
 EXPORT_SYMBOL(truncate_inode_pages_range);
@@ -627,10 +628,8 @@ int invalidate_inode_pages2_range(struct address_space 
*mapping,
int ret2 = 0;
int did_range_unmap = 0;
 
-   cleancache_invalidate_inode(mapping);
-
if (mapping->nrpages == 0 && mapping->nrexceptional == 0)
-   return 0;
+   goto out;
 
pagevec_init(&pvec, 0);
index = start;
@@ -692,6 +691,8 @@ int invalidate_inode_pages2_range(struct address_space 
*mapping,
cond_resched();
index++;
}
+
+out:
cleancache_invalidate_inode(mapping);
return ret;
 }


[Devel] [PATCH] sched/numa: Fix use-after-free bug in the task_numa_compare

2017-04-13 Thread Kirill Tkhai
ms commit 1dff76b92f69 by Gavin Guo 

The following message can be observed on the Ubuntu v3.13.0-65 with KASan
backported:

  ==
  BUG: KASan: use after free in task_numa_find_cpu+0x64c/0x890 at addr 
880dd393ecd8
  Read of size 8 by task qemu-system-x86/3998900
  =
  BUG kmalloc-128 (Tainted: GB): kasan: bad access detected
  -

  INFO: Allocated in task_numa_fault+0xc1b/0xed0 age=41980 cpu=18 pid=3998890
__slab_alloc+0x4f8/0x560
__kmalloc+0x1eb/0x280
task_numa_fault+0xc1b/0xed0
do_numa_page+0x192/0x200
handle_mm_fault+0x808/0x1160
__do_page_fault+0x218/0x750
do_page_fault+0x1a/0x70
page_fault+0x28/0x30
SyS_poll+0x66/0x1a0
system_call_fastpath+0x1a/0x1f
  INFO: Freed in task_numa_free+0x1d2/0x200 age=62 cpu=18 pid=0
__slab_free+0x2ab/0x3f0
kfree+0x161/0x170
task_numa_free+0x1d2/0x200
finish_task_switch+0x1d2/0x210
__schedule+0x5d4/0xc60
schedule_preempt_disabled+0x40/0xc0
cpu_startup_entry+0x2da/0x340
start_secondary+0x28f/0x360
  Call Trace:
   [] dump_stack+0x45/0x56
   [] print_trailer+0xfd/0x170
   [] object_err+0x36/0x40
   [] kasan_report_error+0x1e9/0x3a0
   [] kasan_report+0x40/0x50
   [] ? task_numa_find_cpu+0x64c/0x890
   [] __asan_load8+0x69/0xa0
   [] ? find_next_bit+0xd8/0x120
   [] task_numa_find_cpu+0x64c/0x890
   [] task_numa_migrate+0x4ac/0x7b0
   [] numa_migrate_preferred+0xb3/0xc0
   [] task_numa_fault+0xb88/0xed0
   [] do_numa_page+0x192/0x200
   [] handle_mm_fault+0x808/0x1160
   [] ? sched_clock_cpu+0x10d/0x160
   [] ? native_load_tls+0x82/0xa0
   [] __do_page_fault+0x218/0x750
   [] ? hrtimer_try_to_cancel+0x76/0x160
   [] ? schedule_hrtimeout_range_clock.part.24+0xf7/0x1c0
   [] do_page_fault+0x1a/0x70
   [] page_fault+0x28/0x30
   [] ? do_sys_poll+0x1c4/0x6d0
   [] ? enqueue_task_fair+0x4b6/0xaa0
   [] ? sched_clock+0x9/0x10
   [] ? resched_task+0x7a/0xc0
   [] ? check_preempt_curr+0xb3/0x130
   [] ? poll_select_copy_remaining+0x170/0x170
   [] ? wake_up_state+0x10/0x20
   [] ? drop_futex_key_refs.isra.14+0x1f/0x90
   [] ? futex_requeue+0x3de/0xba0
   [] ? do_futex+0xbe/0x8f0
   [] ? read_tsc+0x9/0x20
   [] ? ktime_get_ts+0x12d/0x170
   [] ? timespec_add_safe+0x59/0xe0
   [] SyS_poll+0x66/0x1a0
   [] system_call_fastpath+0x1a/0x1f

As commit 1effd9f19324 ("sched/numa: Fix unsafe get_task_struct() in
task_numa_assign()") points out, the rcu_read_lock() cannot protect the
task_struct from being freed in the finish_task_switch(). And the bug
happens in the process of calculation of imp which requires the access of
p->numa_faults being freed in the following path:

do_exit()
current->flags |= PF_EXITING;
release_task()
~~delayed_put_task_struct()~~
schedule()
...
...
rq->curr = next;
context_switch()
finish_task_switch()
put_task_struct()
__put_task_struct()
task_numa_free()

The fix here to get_task_struct() early before end of dst_rq->lock to
protect the calculation process and also put_task_struct() in the
corresponding point if finally the dst_rq->curr somehow cannot be
assigned.
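
For readers skimming the thread, a condensed sketch of the resulting get/put
pattern in task_numa_compare() (simplified; the exact exit conditions differ,
and task_numa_assign() is also changed in the diff below to stop taking its
own reference, since the caller now hands over ownership):

    static void task_numa_compare(struct task_numa_env *env,
                                  long taskimp, long groupimp)
    {
            struct rq *dst_rq = cpu_rq(env->dst_cpu);
            struct task_struct *cur;
            bool assigned = false;
            long imp = (groupimp > 0) ? groupimp : taskimp;

            rcu_read_lock();
            raw_spin_lock_irq(&dst_rq->lock);
            cur = dst_rq->curr;
            /*
             * Pin the task while dst_rq->lock still guarantees that
             * dst_rq->curr has not gone through finish_task_switch(),
             * so its ->numa_faults cannot be freed under us below.
             */
            if (cur && !(cur->flags & PF_EXITING))
                    get_task_struct(cur);
            else
                    cur = NULL;
            raw_spin_unlock_irq(&dst_rq->lock);

            /* ... load comparison dereferencing cur->numa_faults ... */

            if (cur && imp > env->best_imp) {
                    /* task_numa_assign() takes over our reference */
                    task_numa_assign(env, cur, imp);
                    assigned = true;
            }

            if (cur && !assigned)
                    put_task_struct(cur);   /* nobody took ownership, drop it */
            rcu_read_unlock();
    }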

Additional credit to Liang Chen who helped fix the error logic and add the
put_task_struct() to the place it missed.

Signed-off-by: Gavin Guo 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andrea Arcangeli 
Cc: Andrew Morton 
Cc: Hugh Dickins 
Cc: Linus Torvalds 
Cc: Mel Gorman 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: jay.vosbu...@canonical.com
Cc: liang.c...@canonical.com
Link: 
http://lkml.kernel.org/r/1453264618-17645-1-git-send-email-gavin@canonical.com
Signed-off-by: Ingo Molnar 

In scope of https://jira.sw.ru/browse/PSBM-62208

Signed-off-by: Kirill Tkhai 
---
 kernel/sched/fair.c |   29 ++---
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index daf709e40b6..4fde0d42a95 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1388,8 +1388,6 @@ static void task_numa_assign(struct task_numa_env *env,
 {
if (env->best_task)
put_task_struct(env->best_task);
-   if (p)
-   get_task_struct(p);
 
env->best_task = p;
env->best_imp = imp;
@@ -1411,19 +1409,29 @@ static void task_numa_compare(struct task_numa_env *env,
long dst_load, src_load;
long load;
long imp = (groupimp > 0) ? groupimp : taskimp;
+   bool assigned = false;
 
rcu_read_lock();
raw_spin_lock_irq(&dst_rq->lock);
cur = dst_rq->curr;
/*
-* No need to move the exiting task, and this ensures that ->curr
-* wasn't reaped and thus get_task_stru

[Devel] [PATCH RHEL7 COMMIT] kvm/x86: do not clear hyperv synic pages when setting MSRs

2017-04-13 Thread Konstantin Khorenko
The commit is pushed to "branch-rh7-3.10.0-514.10.2.vz7.29.x-ovz" and will 
appear at https://src.openvz.org/scm/ovz/vzkernel.git
after rh7-3.10.0-514.10.2.vz7.29.15
-->
commit fb03ad578cab3cab8904079ec687e76e4da8f3d8
Author: Evgeny Yakovlev 
Date:   Thu Apr 13 13:13:01 2017 +0400

kvm/x86: do not clear hyperv synic pages when setting MSRs

The existing code resets the guest SynIC message and event flags pages to
zero when the guest or the host updates their guest PAs by writing to the
corresponding MSRs.

This turned out to be a problem for the migration code when the guest had a
SynIC IRQ to inject before suspending: after resuming, qemu resets the MSRs
to the saved values, and KVM zeroes out the migrated guest memory as a side
effect. The guest then sees an IRQ but no event flags in the event flags
page, skips the IRQ and hangs indefinitely.

This memory is owned by the guest, so let us not assume any additional
responsibility for it and remove the zeroing calls, which also fixes the
failing migration scenario described above.

https://jira.sw.ru/browse/PSBM-63164

Signed-off-by: Evgeny Yakovlev 
Reviewed-by: Roman Kagan 
---
 arch/x86/kvm/hyperv.c | 12 
 1 file changed, 12 deletions(-)

diff --git a/arch/x86/kvm/hyperv.c b/arch/x86/kvm/hyperv.c
index eae314b..563c54fb 100644
--- a/arch/x86/kvm/hyperv.c
+++ b/arch/x86/kvm/hyperv.c
@@ -219,23 +219,11 @@ static int synic_set_msr(struct kvm_vcpu_hv_synic *synic,
synic->version = data;
break;
case HV_X64_MSR_SIEFP:
-   if (data & HV_SYNIC_SIEFP_ENABLE)
-   if (kvm_clear_guest(vcpu->kvm,
-   data & PAGE_MASK, PAGE_SIZE)) {
-   ret = 1;
-   break;
-   }
synic->evt_page = data;
if (!host)
synic_exit(synic, msr);
break;
case HV_X64_MSR_SIMP:
-   if (data & HV_SYNIC_SIMP_ENABLE)
-   if (kvm_clear_guest(vcpu->kvm,
-   data & PAGE_MASK, PAGE_SIZE)) {
-   ret = 1;
-   break;
-   }
synic->msg_page = data;
if (!host)
synic_exit(synic, msr);


Re: [Devel] [PATCH rh7 v2 1/3] fs/cleancache: fix data invalidation in the cleancache during direct_io

2017-04-13 Thread Konstantin Khorenko



--
Best regards,

Konstantin Khorenko,
Virtuozzo Linux Kernel Team

On 04/13/2017 09:51 AM, Dmitry Monakhov wrote:

Andrey Ryabinin  writes:


Currently some direct_io fs hooks call invalidate_inode_pages2_range()
only when mapping->nrpages is not zero. So if nrpages is zero, data in the
cleancache wouldn't be invalidated, and the next buffered read may get
stale data from the cleancache.




Fix this by calling invalidate_inode_pages2_range() regardless of nrpages
value. And if nrpages is zero, bail out from invalidate_inode_pages2_range()
only after cleancache_invalidate_inode(), so that we invalidate cleancache
but still avoid pointless page cache lookups.

BTW, can we please make tcache pluggable, so that those who do not want
fancy caching features can simply disable it, as we do with pfcache?


tcache/tswap have a disable/enable tweak (vz7, patch "Subject: [PATCH rh7]
tcache/tswap: enable by default").
tcache/tswap can be disabled/enabled using the following commands:
echo 'N' > /sys/module/{tcache,tswap}/parameters/active
echo 'Y' > /sys/module/{tcache,tswap}/parameters/active

Per cgroup, on the fly: echo 1 > @memory.disable_cleancache

To disable tcache at boot time, use the "tcache.enabled=0" kernel option.





https://jira.sw.ru/browse/PSBM-63908
Signed-off-by: Andrey Ryabinin 
---
 fs/9p/vfs_file.c  |  4 ++--
 fs/nfs/direct.c   | 16 ++--
 fs/nfs/inode.c|  7 ---
 fs/xfs/xfs_file.c | 30 ++
 mm/filemap.c  | 28 
 mm/truncate.c |  4 
 6 files changed, 42 insertions(+), 47 deletions(-)

diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c
index 7da03f8..afe0036 100644
--- a/fs/9p/vfs_file.c
+++ b/fs/9p/vfs_file.c
@@ -482,7 +482,7 @@ v9fs_file_write_internal(struct inode *inode, struct p9_fid 
*fid,
if (invalidate && (total > 0)) {
pg_start = origin >> PAGE_CACHE_SHIFT;
pg_end = (origin + total - 1) >> PAGE_CACHE_SHIFT;
-   if (inode->i_mapping && inode->i_mapping->nrpages)
+   if (inode->i_mapping)
invalidate_inode_pages2_range(inode->i_mapping,
  pg_start, pg_end);
*offset += total;
@@ -688,7 +688,7 @@ v9fs_direct_write(struct file *filp, const char __user * 
data,
 * about to write.  We do this *before* the write so that if we fail
 * here we fall back to buffered write
 */
-   if (mapping->nrpages) {
+   {
pgoff_t pg_start = offset >> PAGE_CACHE_SHIFT;
pgoff_t pg_end   = (offset + count - 1) >> PAGE_CACHE_SHIFT;

diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index ab96f01..963 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -1132,12 +1132,10 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const 
struct iovec *iov,
if (result)
goto out_unlock;

-   if (mapping->nrpages) {
-   result = invalidate_inode_pages2_range(mapping,
-   pos >> PAGE_CACHE_SHIFT, end);
-   if (result)
-   goto out_unlock;
-   }
+   result = invalidate_inode_pages2_range(mapping,
+   pos >> PAGE_CACHE_SHIFT, end);
+   if (result)
+   goto out_unlock;

task_io_account_write(count);

@@ -1161,10 +1159,8 @@ ssize_t nfs_file_direct_write(struct kiocb *iocb, const 
struct iovec *iov,

result = nfs_direct_write_schedule_iovec(dreq, iov, nr_segs, pos, uio);

-   if (mapping->nrpages) {
-   invalidate_inode_pages2_range(mapping,
- pos >> PAGE_CACHE_SHIFT, end);
-   }
+   invalidate_inode_pages2_range(mapping,
+   pos >> PAGE_CACHE_SHIFT, end);

mutex_unlock(&inode->i_mutex);

diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 8c06aed..779b05c 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1065,10 +1065,11 @@ static int nfs_invalidate_mapping(struct inode *inode, 
struct address_space *map
if (ret < 0)
return ret;
}
-   ret = invalidate_inode_pages2(mapping);
-   if (ret < 0)
-   return ret;
}
+   ret = invalidate_inode_pages2(mapping);
+   if (ret < 0)
+   return ret;
+
if (S_ISDIR(inode->i_mode)) {
spin_lock(&inode->i_lock);
memset(nfsi->cookieverf, 0, sizeof(nfsi->cookieverf));
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 9a2193b..0b7a35b 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -346,7 +346,7 @@ xfs_file_aio_read(
 * serialisation.
 */
xfs_rw_ilock(ip, XFS_IOLOCK_SHARED);
-   if ((ioflags & XFS_IO_ISDIRECT) && inode->i_mapping->nrpages) {
+   if ((ioflags & XFS_IO_ISDIRECT)) {
xfs_rw_iunlock(ip, XFS_IOLOCK