[Devel] [PATCH rh7 v2 08/21] ms/mm: memcontrol: rearrange charging fast path

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

The charging path currently starts out with OOM condition checks when
OOM is the rarest possible case.

Rearrange this code to run OOM/task dying checks only after trying the
percpu charge and the res_counter charge and bail out before entering
reclaim.  Attempting a charge does not hurt an (oom-)killed task as much
as every charge attempt having to check OOM conditions.  Also, only
check __GFP_NOFAIL when the charge would actually fail.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 06b078fc065fe1fe7097675c8ee416aa2ef94fb3)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 32 +---
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f904257..f006cdd 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2767,21 +2767,6 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 
if (mem_cgroup_is_root(memcg))
goto done;
-   /*
-* Unlike in global OOM situations, memcg is not in a physical
-* memory shortage.  Allow dying and OOM-killed tasks to
-* bypass the last charges so that they can exit quickly and
-* free their memory.
-*/
-   if (unlikely(test_thread_flag(TIF_MEMDIE) ||
-fatal_signal_pending(current)))
-   goto bypass;
-
-   if (unlikely(task_in_memcg_oom(current)))
-   goto nomem;
-
-   if (gfp_mask & __GFP_NOFAIL)
-   oom = false;
 retry:
if (consume_stock(memcg, nr_pages))
goto done;
@@ -2802,6 +2787,20 @@ retry:
goto retry;
}
 
+   /*
+* Unlike in global OOM situations, memcg is not in a physical
+* memory shortage.  Allow dying and OOM-killed tasks to
+* bypass the last charges so that they can exit quickly and
+* free their memory.
+*/
+   if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+fatal_signal_pending(current) ||
+current->flags & PF_EXITING))
+   goto bypass;
+
+   if (unlikely(task_in_memcg_oom(current)))
+   goto nomem;
+
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
 
@@ -2830,6 +2829,9 @@ retry:
if (mem_cgroup_wait_acct_move(mem_over_limit))
goto retry;
 
+   if (gfp_mask & __GFP_NOFAIL)
+   goto bypass;
+
if (fatal_signal_pending(current))
goto bypass;
 
-- 
2.10.2



[Devel] [PATCH rh7 v2 04/21] ms/memcg: get_mem_cgroup_from_mm()

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

Instead of returning NULL from try_get_mem_cgroup_from_mm() when the mm
owner is exiting, just return root_mem_cgroup.  This makes sense for all
callsites and gets rid of some of them having to fallback manually.
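
For illustration only (an editorial sketch, not part of the patch): with the
new helper the caller-side pattern becomes a plain get/put pair, because the
returned memcg is always non-NULL and already referenced.  The function below
is hypothetical and only shows the shape.

static void example_inspect_mm_memcg(struct mm_struct *mm)
{
	struct mem_cgroup *memcg;

	/* Never NULL: an exiting owner resolves to root_mem_cgroup. */
	memcg = get_mem_cgroup_from_mm(mm);

	/* ... charge against or otherwise inspect memcg ... */

	css_put(&memcg->css);	/* drop the reference taken by the helper */
}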

[fengguang...@intel.com: fix warnings]
Signed-off-by: Johannes Weiner 
Signed-off-by: Fengguang Wu 
Acked-by: Michal Hocko 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit df381975463996178d685f6ef7d3555c5f887201)
Signed-off-by: Andrey Ryabinin 
---
 include/linux/memcontrol.h | 12 ++--
 mm/memcontrol.c| 37 +
 mm/oom_kill.c  |  4 ++--
 net/packet/af_packet.c |  2 +-
 4 files changed, 26 insertions(+), 29 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index f1b599f..bcf7752 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -85,7 +85,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 
 extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page);
 extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p);
-extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm);
+extern struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm);
 
 extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg);
 extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont);
@@ -304,17 +304,17 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page)
return NULL;
 }
 
-static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
-{
-   return NULL;
-}
-
 static inline bool mm_match_cgroup(struct mm_struct *mm,
struct mem_cgroup *memcg)
 {
return true;
 }
 
+static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
+{
+   return NULL;
+}
+
 static inline int task_in_mem_cgroup(struct task_struct *task,
 const struct mem_cgroup *memcg)
 {
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d09d55d9..40ac81b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -1166,15 +1166,24 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p)
return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id));
 }
 
-struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm)
+struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm)
 {
struct mem_cgroup *memcg = NULL;
 
rcu_read_lock();
do {
-   memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
-   if (unlikely(!memcg))
-   break;
+   /*
+* Page cache insertions can happen without an
+* actual mm context, e.g. during disk probing
+* on boot, loopback IO, acct() writes etc.
+*/
+   if (unlikely(!mm))
+   memcg = root_mem_cgroup;
+   else {
+   memcg = mem_cgroup_from_task(rcu_dereference(mm->owner));
+   if (unlikely(!memcg))
+   memcg = root_mem_cgroup;
+   }
} while (!css_tryget(&memcg->css));
rcu_read_unlock();
return memcg;
@@ -1546,7 +1555,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
 
p = find_lock_task_mm(task);
if (p) {
-   curr = try_get_mem_cgroup_from_mm(p->mm);
+   curr = get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
/*
@@ -1560,8 +1569,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg)
css_get(&curr->css);
task_unlock(task);
}
-   if (!curr)
-   return 0;
/*
 * We should check use_hierarchy of "memcg" not "curr". Because checking
 * use_hierarchy of "curr" here make this function true if hierarchy is
@@ -1693,7 +1700,7 @@ void mem_cgroup_note_oom_kill(struct mem_cgroup *root_memcg,
 
p = find_lock_task_mm(task);
if (p) {
-   memcg = try_get_mem_cgroup_from_mm(p->mm);
+   memcg = get_mem_cgroup_from_mm(p->mm);
task_unlock(p);
} else {
rcu_read_lock();
@@ -3458,9 +3465,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep,
if (!current->mm || current->memcg_kmem_skip_account)
return cachep;
 
-   memcg = try_get_mem_cgroup_from_mm(current->mm);
-   if (unlikely(!memcg))
-   return cachep;
+   memcg = 

Re: [Devel] [PATCH vz7] fuse: fuse_writepage_locked must check for FUSE_INVALIDATE_FILES (v2)

2017-01-12 Thread Dmitry Monakhov
Maxim Patlasov  writes:

> The patch fixes another race dealing with fuse_invalidate_files,
> this time when it races with truncate(2):
>
> Thread A: the flusher performs writeback as usual:
>
>   fuse_writepages -->
> fuse_send_writepages -->
>   end_page_writeback
>
> but before fuse_send_writepages acquires fc->lock and calls fuse_flush_writepages,
> some innocent user process re-dirties the page.
>
> Thread B: truncate(2) attempts to truncate (shrink) file as usual:
>
>   fuse_do_setattr -->
> invalidate_inode_pages2
>
> (This is possible because Thread A has not incremented fi->writectr yet.) But
> invalidate_inode_pages2 finds that re-dirty-ed page and sticks in:
>
>   invalidate_inode_pages2 -->
> fuse_launder_page -->
>   fuse_writepage_locked -->
>   fuse_wait_on_page_writeback
>
> Thread A: the flusher proceeds with fuse_flush_writepages, sends write request
> to userspace fuse daemon, but the daemon is not obliged to fulfill it 
> immediately.
> So, thread B waits now for thread A, while thread A waits for userspace.
>
> Now fuse_invalidate_files steps in sticking in filemap_write_and_wait on the
> page locked by Thread B (launder_page always work on a locked page). Deadlock.
>
> The patch fixes deadlock by waking up fuse_writepage_locked after marking
> files with FAIL_IMMEDIATELY flag.
>
> Changed in v2:
>   - instead of flagging "fail_immediately", let fuse_writepage_locked return
> fuse_file pointer, then the caller (fuse_launder_page) can use it for
> conditional wait on __fuse_wait_on_page_writeback_or_invalidate. This is
> important because otherwise fuse_invalidate_files may deadlock when
> launder waits for fuse writeback.
ACK-by: dmonak...@openvz.org
>
> Signed-off-by: Maxim Patlasov 
> ---
>  fs/fuse/file.c |   51 +--
>  1 file changed, 45 insertions(+), 6 deletions(-)
>
> diff --git a/fs/fuse/file.c b/fs/fuse/file.c
> index 0ffc806..34e75c2 100644
> --- a/fs/fuse/file.c
> +++ b/fs/fuse/file.c
> @@ -1963,7 +1963,8 @@ static struct fuse_file *fuse_write_file(struct 
> fuse_conn *fc,
>  }
>  
>  static int fuse_writepage_locked(struct page *page,
> -  struct writeback_control *wbc)
> +  struct writeback_control *wbc,
> +  struct fuse_file **ff_pp)
>  {
>   struct address_space *mapping = page->mapping;
>   struct inode *inode = mapping->host;
> @@ -1971,13 +1972,30 @@ static int fuse_writepage_locked(struct page *page,
>   struct fuse_inode *fi = get_fuse_inode(inode);
>   struct fuse_req *req;
>   struct page *tmp_page;
> + struct fuse_file *ff;
> + int err = 0;
>  
>   if (fuse_page_is_writeback(inode, page->index)) {
>   if (wbc->sync_mode != WB_SYNC_ALL) {
>   redirty_page_for_writepage(wbc, page);
>   return 0;
>   }
> - fuse_wait_on_page_writeback(inode, page->index);
> +
> + /* we can acquire ff here because we do have locked pages here! */
> + ff = fuse_write_file(fc, get_fuse_inode(inode));
> + if (!ff)
> + goto dummy_end_page_wb_err;
> +
> + /* FUSE_NOTIFY_INVAL_FILES must be able to wake us up */
> + __fuse_wait_on_page_writeback_or_invalidate(inode, ff, page->index);
> +
> + if (test_bit(FUSE_S_FAIL_IMMEDIATELY, >ff_state)) {
> + if (ff_pp)
> + *ff_pp = ff;
> + goto dummy_end_page_wb;
> + }
> +
> + fuse_release_ff(inode, ff);
>   }
>  
>   if (test_set_page_writeback(page))
> @@ -1995,6 +2013,8 @@ static int fuse_writepage_locked(struct page *page,
>   req->ff = fuse_write_file(fc, fi);
>   if (!req->ff)
>   goto err_nofile;
> + if (ff_pp)
> + *ff_pp = fuse_file_get(req->ff);
>   fuse_write_fill(req, req->ff, page_offset(page), 0);
>   fuse_account_request(fc, PAGE_CACHE_SIZE);
>  
> @@ -2029,13 +2049,23 @@ err_free:
>  err:
>   end_page_writeback(page);
>   return -ENOMEM;
> +
> +dummy_end_page_wb_err:
> + printk("FUSE: page under fwb dirtied on dead file\n");
> + err = -EIO;
> + /* fall through ... */
> +dummy_end_page_wb:
> + if (test_set_page_writeback(page))
> + BUG();
> + end_page_writeback(page);
> + return err;
>  }
>  
>  static int fuse_writepage(struct page *page, struct writeback_control *wbc)
>  {
>   int err;
>  
> - err = fuse_writepage_locked(page, wbc);
> + err = fuse_writepage_locked(page, wbc, NULL);
>   unlock_page(page);
>  
>   return err;
> @@ -2423,9 +2453,18 @@ static int fuse_launder_page(struct page *page)
>   struct writeback_control wbc = {
>   .sync_mode = WB_SYNC_ALL,
>   

[Devel] [PATCH rh7 v2 17/21] ms/mm: memcontrol: rewrite uncharge API

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

The memcg uncharging code that is involved towards the end of a page's
lifetime - truncation, reclaim, swapout, migration - is impressively
complicated and fragile.

Because anonymous and file pages were always charged before they had their
page->mapping established, uncharges had to happen when the page type
could still be known from the context; as in unmap for anonymous, page
cache removal for file and shmem pages, and swap cache truncation for swap
pages.  However, these operations happen well before the page is actually
freed, and so a lot of synchronization is necessary:

- Charging, uncharging, page migration, and charge migration all need
  to take a per-page bit spinlock as they could race with uncharging.

- Swap cache truncation happens during both swap-in and swap-out, and
  possibly repeatedly before the page is actually freed.  This means
  that the memcg swapout code is called from many contexts that make
  no sense and it has to figure out the direction from page state to
  make sure memory and memory+swap are always correctly charged.

- On page migration, the old page might be unmapped but then reused,
  so memcg code has to prevent untimely uncharging in that case.
  Because this code - which should be a simple charge transfer - is so
  special-cased, it is not reusable for replace_page_cache().

But now that charged pages always have a page->mapping, introduce
mem_cgroup_uncharge(), which is called after the final put_page(), when we
know for sure that nobody is looking at the page anymore.
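
As an editorial illustration of the new lifetime rule (the function below is a
sketch, not code from this patch; the real hooks live in the mm/ files listed
in the diffstat):

static void example_release_user_page(struct page *page)
{
	if (!put_page_testzero(page))
		return;

	/* The last reference is gone: nobody can see the page anymore. */
	mem_cgroup_uncharge(page);

	/* ... hand the page back to the page allocator ... */
}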

For page migration, introduce mem_cgroup_migrate(), which is called after
the migration is successful and the new page is fully rmapped.  Because
the old page is no longer uncharged after migration, prevent double
charges by decoupling the page's memcg association (PCG_USED and
pc->mem_cgroup) from the page holding an actual charge.  The new bits
PCG_MEM and PCG_MEMSW represent the respective charges and are transferred
to the new page during migration.

mem_cgroup_migrate() is suitable for replace_page_cache() as well,
which gets rid of mem_cgroup_replace_page_cache().  However, care
needs to be taken because both the source and the target page can
already be charged and on the LRU when fuse is splicing: grab the page
lock on the charge moving side to prevent changing pc->mem_cgroup of a
page under migration.  Also, the lruvecs of both pages change as we
uncharge the old and charge the new during migration, and putback may
race with us, so grab the lru lock and isolate the pages iff on LRU to
prevent races and ensure the pages are on the right lruvec afterward.

Swap accounting is massively simplified: because the page is no longer
uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can
transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry
before the final put_page() in page reclaim.
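
Sketched for illustration (assuming the signature matches the mainline
mem_cgroup_swapout(); the exact call site lives in the reclaim/swap files
touched below):

static void example_reclaim_swapout_tail(struct page *page)
{
	swp_entry_t entry = { .val = page_private(page) };

	/* Hand the memory+swap charge (PCG_MEMSW) over to the swap entry. */
	mem_cgroup_swapout(page, entry);

	put_page(page);		/* final put; the remaining charge is dropped here */
}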

Finally, page_cgroup changes are now protected by whatever protection the
page itself offers: anonymous pages are charged under the page table lock,
whereas page cache insertions, swapin, and migration hold the page lock.
Uncharging happens under full exclusion with no outstanding references.
Charging and uncharging also ensure that the page is off-LRU, which
serializes against charge migration.  Remove the very costly page_cgroup
lock and set pc->flags non-atomically.

[mho...@suse.cz: mem_cgroup_charge_statistics needs preempt_disable]
[vdavy...@parallels.com: fix flags definition]
Signed-off-by: Johannes Weiner 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Tested-by: Jet Chen 
Acked-by: Michal Hocko 
Tested-by: Felipe Balbi 
Signed-off-by: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 0a31bc97c80c3fa87b32c091d9a930ac19cd0c40)
Signed-off-by: Andrey Ryabinin 
---
 Documentation/cgroups/memcg_test.txt | 128 +-
 drivers/staging/zcache/zcache-main.c |   2 +-
 include/linux/memcontrol.h   |  49 +-
 include/linux/page_cgroup.h  |  43 +-
 include/linux/swap.h |  12 +-
 mm/filemap.c |   4 +-
 mm/hmm_migrate.c |  12 -
 mm/memcontrol.c  | 850 ++-
 mm/memory.c  |   2 -
 mm/migrate.c |  39 +-
 mm/rmap.c|   1 -
 mm/shmem.c   |   8 +-
 mm/swap.c|   5 +
 mm/swap_state.c  |   8 +-
 mm/swapfile.c|   9 +-
 mm/truncate.c|   9 -
 mm/tswap.c   |   2 +-
 mm/vmscan.c  

[Devel] [PATCH rh7 v2 06/21] ms/memcg: sanitize __mem_cgroup_try_charge() call protocol

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

Some callsites pass a memcg directly, some callsites pass an mm that
then has to be translated to a memcg.  This makes for a terrible
function interface.

Just push the mm-to-memcg translation into the respective callsites and
always pass a memcg to mem_cgroup_try_charge().
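
For illustration, the charge-mm helper noted below ([mho...@suse.cz: add
charge mm helper]) can be pictured like this; the body is a sketch that
mirrors the description, not a verbatim copy of the hunk:

static struct mem_cgroup *example_try_charge_mm(struct mm_struct *mm,
						gfp_t gfp_mask,
						unsigned int nr_pages,
						bool oom)
{
	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(mm);
	int ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);

	css_put(&memcg->css);
	if (ret == -EINTR)
		memcg = root_mem_cgroup;	/* charge was bypassed to root */
	else if (ret)
		memcg = NULL;			/* -ENOMEM */
	return memcg;
}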

[mho...@suse.cz: add charge mm helper]
Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 6d1fdc48938cd51a3964778d78f27cb26c8eb55d)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 207 
 1 file changed, 102 insertions(+), 105 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d6413c5..1e5d914 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2745,7 +2745,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
 }
 
 
-/* See __mem_cgroup_try_charge() for details */
+/* See mem_cgroup_try_charge() for details */
 enum {
CHARGE_OK,  /* success */
CHARGE_RETRY,   /* need to retry but retry is not bad */
@@ -2824,45 +2824,35 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
return CHARGE_NOMEM;
 }
 
-/*
- * __mem_cgroup_try_charge() does
- * 1. detect memcg to be charged against from passed *mm and *ptr,
- * 2. update page_counter
- * 3. call memory reclaim if necessary.
- *
- * In some special case, if the task is fatal, fatal_signal_pending() or
- * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup
- * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon
- * as possible without any hazards. 2: all pages should have a valid
- * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg
- * pointer, that is treated as a charge to root_mem_cgroup.
- *
- * So __mem_cgroup_try_charge() will return
- *  0   ...  on success, filling *ptr with a valid memcg pointer.
- *  -ENOMEM ...  charge failure because of resource limits.
- *  -EINTR  ...  if thread is fatal. *ptr is filled with root_mem_cgroup.
+/**
+ * mem_cgroup_try_charge - try charging a memcg
+ * @memcg: memcg to charge
+ * @nr_pages: number of pages to charge
+ * @oom: trigger OOM if reclaim fails
  *
- * Unlike the exported interface, an "oom" parameter is added. if oom==true,
- * the oom-killer can be invoked.
+ * Returns 0 if @memcg was charged successfully, -EINTR if the charge
+ * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
  */
-static int __mem_cgroup_try_charge(struct mm_struct *mm,
-  gfp_t gfp_mask,
-  unsigned int nr_pages,
-  struct mem_cgroup **ptr,
-  bool oom)
+static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
+gfp_t gfp_mask,
+unsigned int nr_pages,
+bool oom)
 {
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
-   struct mem_cgroup *memcg = NULL, *iter;
+   struct mem_cgroup *iter;
int ret;
 
+   if (mem_cgroup_is_root(memcg))
+   goto done;
/*
-* Unlike gloval-vm's OOM-kill, we're not in memory shortage
-* in system level. So, allow to go ahead dying process in addition to
-* MEMDIE process.
+* Unlike in global OOM situations, memcg is not in a physical
+* memory shortage.  Allow dying and OOM-killed tasks to
+* bypass the last charges so that they can exit quickly and
+* free their memory.
 */
-   if (unlikely(test_thread_flag(TIF_MEMDIE)
-|| fatal_signal_pending(current)))
+   if (unlikely(test_thread_flag(TIF_MEMDIE) ||
+fatal_signal_pending(current)))
goto bypass;
 
if (unlikely(task_in_memcg_oom(current)))
@@ -2871,14 +2861,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
if (gfp_mask & __GFP_NOFAIL)
oom = false;
 again:
-   if (*ptr) { /* css should be a valid one */
-   memcg = *ptr;
-   css_get(&memcg->css);
-   } else {
-   memcg = get_mem_cgroup_from_mm(mm);
-   }
-   if (mem_cgroup_is_root(memcg))
-   goto done;
if (consume_stock(memcg, nr_pages))
goto done;
 
@@ -2887,10 +2869,8 @@ again:
 
/* If killed, bypass charge */
if (test_thread_flag(TIF_MEMDIE) ||
-   fatal_signal_pending(current)) {
-   css_put(&memcg->css);
+   fatal_signal_pending(current))
  

[Devel] [PATCH rh7 v2 18/21] ms/mm: memcontrol: use page lists for uncharge batching

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

Pages are now uncharged at release time, and all sources of batched
uncharges operate on lists of pages.  Directly use those lists, and
get rid of the per-task batching state.

This also batches statistics accounting, in addition to the res
counter charges, to reduce IRQ-disabling and re-enabling.
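
For illustration (a hypothetical bulk-free routine, not part of the patch):
callers that already batch pages on a list can now drop all memcg charges in
a single pass before returning the pages to the allocator.

static void example_free_page_list(struct list_head *pages_to_free)
{
	struct page *page, *next;

	/* One batched pass uncharges every page on the list. */
	mem_cgroup_uncharge_list(pages_to_free);

	list_for_each_entry_safe(page, next, pages_to_free, lru) {
		list_del(&page->lru);
		/* ... hand the page back to the buddy allocator ... */
	}
}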

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Cc: Naoya Horiguchi 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 747db954cab64c6b7a95b121b517165f34751898)
Signed-off-by: Andrey Ryabinin 
---
 include/linux/memcontrol.h |  12 +--
 include/linux/sched.h  |   6 --
 kernel/fork.c  |   4 -
 mm/memcontrol.c| 204 -
 mm/swap.c  |  12 +--
 mm/vmscan.c|  12 ++-
 6 files changed, 122 insertions(+), 128 deletions(-)

diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h
index ca343bf..681f320 100644
--- a/include/linux/memcontrol.h
+++ b/include/linux/memcontrol.h
@@ -51,12 +51,8 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm,
 void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg,
  bool lrucare);
 void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg);
-
 void mem_cgroup_uncharge(struct page *page);
-
-/* Batched uncharging */
-void mem_cgroup_uncharge_start(void);
-void mem_cgroup_uncharge_end(void);
+void mem_cgroup_uncharge_list(struct list_head *page_list);
 
 void mem_cgroup_migrate(struct page *oldpage, struct page *newpage,
bool lrucare);
@@ -244,11 +240,7 @@ static inline void mem_cgroup_uncharge(struct page *page)
 {
 }
 
-static inline void mem_cgroup_uncharge_start(void)
-{
-}
-
-static inline void mem_cgroup_uncharge_end(void)
+static inline void mem_cgroup_uncharge_list(struct list_head *page_list)
 {
 }
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index 05a4b12..c945d93 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1721,12 +1721,6 @@ struct task_struct {
struct ve_struct *task_ve;
 #endif
 #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */
-   struct memcg_batch_info {
-   int do_batch;   /* incremented when batch uncharge started */
-   struct mem_cgroup *memcg; /* target memcg of uncharge */
-   unsigned long nr_pages; /* uncharged usage */
-   unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
-   } memcg_batch;
unsigned int memcg_kmem_skip_account;
 #endif
 #ifdef CONFIG_HAVE_HW_BREAKPOINT
diff --git a/kernel/fork.c b/kernel/fork.c
index 99e39cc..329dc42 100644
--- a/kernel/fork.c
+++ b/kernel/fork.c
@@ -1453,10 +1453,6 @@ static struct task_struct *copy_process(unsigned long clone_flags,
 #ifdef CONFIG_DEBUG_MUTEXES
p->blocked_on = NULL; /* not blocked yet */
 #endif
-#ifdef CONFIG_MEMCG
-   p->memcg_batch.do_batch = 0;
-   p->memcg_batch.memcg = NULL;
-#endif
 #ifdef CONFIG_BCACHE
p->sequential_io= 0;
p->sequential_io_avg= 0;
diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1eb0e85..16bb6aa 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3625,51 +3625,6 @@ out:
return ret;
 }
 
-/*
- * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate.
- * In that cases, pages are freed continuously and we can expect pages
- * are in the same memcg. All these calls itself limits the number of
- * pages freed at once, then uncharge_start/end() is called properly.
- * This may be called prural(2) times in a context,
- */
-
-void mem_cgroup_uncharge_start(void)
-{
-   unsigned long flags;
-
-   local_irq_save(flags);
-   current->memcg_batch.do_batch++;
-   /* We can do nest. */
-   if (current->memcg_batch.do_batch == 1) {
-   current->memcg_batch.memcg = NULL;
-   current->memcg_batch.nr_pages = 0;
-   current->memcg_batch.memsw_nr_pages = 0;
-   }
-   local_irq_restore(flags);
-}
-
-void mem_cgroup_uncharge_end(void)
-{
-   struct memcg_batch_info *batch = &current->memcg_batch;
-   unsigned long flags;
-
-   local_irq_save(flags);
-   VM_BUG_ON(!batch->do_batch);
-   if (--batch->do_batch) /* If stacked, do nothing */
-   goto out;
-   /*
-* This "batch->memcg" is valid without any css_get/put etc...
-* bacause we hide charges behind us.
-*/
-   if (batch->nr_pages)
-   page_counter_uncharge(&batch->memcg->memory, 

[Devel] [PATCH rh7 v2 16/21] ms/mm: memcontrol: rewrite charge API

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages.  This drastically simplifies the code and
reduces charging and uncharging overhead.  The most expensive part of
charging and uncharging is the page_cgroup bit spinlock, which is removed
entirely after this series.

Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup (i.e.
 executing in the root memcg).  Before:

15.36%  cat  [kernel.kallsyms]   [k] copy_user_generic_string
13.31%  cat  [kernel.kallsyms]   [k] memset
11.48%  cat  [kernel.kallsyms]   [k] do_mpage_readpage
 4.23%  cat  [kernel.kallsyms]   [k] get_page_from_freelist
 2.38%  cat  [kernel.kallsyms]   [k] put_page
 2.32%  cat  [kernel.kallsyms]   [k] __mem_cgroup_commit_charge
 2.18%  kswapd0  [kernel.kallsyms]   [k] __mem_cgroup_uncharge_common
 1.92%  kswapd0  [kernel.kallsyms]   [k] shrink_page_list
 1.86%  cat  [kernel.kallsyms]   [k] __radix_tree_lookup
 1.62%  cat  [kernel.kallsyms]   [k] __pagevec_lru_add_fn

After:

15.67%   cat  [kernel.kallsyms]   [k] copy_user_generic_string
13.48%   cat  [kernel.kallsyms]   [k] memset
11.42%   cat  [kernel.kallsyms]   [k] do_mpage_readpage
 3.98%   cat  [kernel.kallsyms]   [k] get_page_from_freelist
 2.46%   cat  [kernel.kallsyms]   [k] put_page
 2.13%   kswapd0  [kernel.kallsyms]   [k] shrink_page_list
 1.88%   cat  [kernel.kallsyms]   [k] __radix_tree_lookup
 1.67%   cat  [kernel.kallsyms]   [k] __pagevec_lru_add_fn
 1.39%   kswapd0  [kernel.kallsyms]   [k] free_pcppages_bulk
 1.30%   cat  [kernel.kallsyms]   [k] kfree

As you can see, the memcg footprint has shrunk quite a bit.

   text    data     bss     dec     hex filename
  37970    9892     400   48262    bc86 mm/memcontrol.o.old
  35239    9892     400   45531    b1db mm/memcontrol.o

This patch (of 4):

The memcg charge API charges pages before they are rmapped - i.e.  have an
actual "type" - and so every callsite needs its own set of charge and
uncharge functions to know what type is being operated on.  Worse,
uncharge has to happen from a context that is still type-specific, rather
than at the end of the page's lifetime with exclusive access, and so
requires a lot of synchronization.

Rewrite the charge API to provide a generic set of try_charge(),
commit_charge() and cancel_charge() transaction operations, much like
what's currently done for swap-in:

  mem_cgroup_try_charge() attempts to reserve a charge, reclaiming
  pages from the memcg if necessary.

  mem_cgroup_commit_charge() commits the page to the charge once it
  has a valid page->mapping and PageAnon() reliably tells the type.

  mem_cgroup_cancel_charge() aborts the transaction.

This reduces the charge API and enables subsequent patches to
drastically simplify uncharging.
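
For illustration, a condensed caller-side view of the transaction; the
function below is an editorial sketch of an anonymous-fault style user, not
code from this patch, though the three memcg calls use the signatures this
patch introduces:

static int example_charge_new_anon_page(struct page *page,
					struct vm_area_struct *vma,
					struct mm_struct *mm,
					unsigned long address)
{
	struct mem_cgroup *memcg;

	/* 1. Reserve the charge; this may reclaim from the memcg. */
	if (mem_cgroup_try_charge(page, mm, GFP_KERNEL, &memcg))
		return -ENOMEM;
	/*
	 * If anything fails between here and the commit, the caller
	 * undoes the reservation with mem_cgroup_cancel_charge().
	 */

	/* 2. Give the page its type by rmapping it... */
	page_add_new_anon_rmap(page, vma, address);

	/* 3. ...then bind the reserved charge to the now-typed page. */
	mem_cgroup_commit_charge(page, memcg, false);
	lru_cache_add_active_or_unevictable(page, vma);
	return 0;
}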

As pages need to be committed after rmap is established but before they
are added to the LRU, page_add_new_anon_rmap() must stop doing LRU
additions again.  Revive lru_cache_add_active_or_unevictable().

[hu...@google.com: fix shmem_unuse]
[hu...@google.com: Add comments on the private use of -EAGAIN]
Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Hugh Dickins 
Cc: Naoya Horiguchi 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 00501b531c4723972aa11d6d4ebcf8d6552007c8)
Signed-off-by: Andrey Ryabinin 
---
 Documentation/cgroups/memcg_test.txt |  32 +--
 include/linux/memcontrol.h   |  53 ++---
 include/linux/swap.h |   2 +
 kernel/events/uprobes.c  |   8 +
 mm/filemap.c |  23 +-
 mm/huge_memory.c |  56 +++--
 mm/memcontrol.c  | 404 +++
 mm/memory.c  |  46 ++--
 mm/rmap.c|   5 -
 mm/shmem.c   |  38 ++--
 mm/swap.c|  34 +++
 mm/swapfile.c|  14 +-
 mm/userfaultfd.c |   7 +-
 13 files changed, 348 insertions(+), 374 deletions(-)

diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt
index ce94a83..bcf750d 100644
--- a/Documentation/cgroups/memcg_test.txt
+++ b/Documentation/cgroups/memcg_test.txt
@@ -24,24 +24,7 @@ Please note that implementation details can be changed.
 

[Devel] [PATCH rh7 v2 13/21] ms/mm: memcontrol: simplify move precharge function

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

The move precharge function does some baroque things: it tries raw
res_counter charging of the entire amount first, and then falls back to
a loop of one-by-one charges, with checks for pending signals and
cond_resched() batching.

Just use mem_cgroup_try_charge() without __GFP_WAIT for the first bulk
charge attempt.  In the one-by-one loop, remove the signal check (this
is already checked in try_charge), and simply call cond_resched() after
every charge - it's not that expensive.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 9476db974d9e18885123fcebc09f4596bb922e5f)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 48 +++-
 1 file changed, 15 insertions(+), 33 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a1aab9f..32533bf 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6722,56 +6722,38 @@ static void mem_cgroup_css_free(struct cgroup *cont)
 
 #ifdef CONFIG_MMU
 /* Handlers for move charge at task migration. */
-#define PRECHARGE_COUNT_AT_ONCE256
 static int mem_cgroup_do_precharge(unsigned long count)
 {
int ret = 0;
-   int batch_count = PRECHARGE_COUNT_AT_ONCE;
-   struct mem_cgroup *memcg = mc.to;
 
-   if (mem_cgroup_is_root(memcg)) {
+   if (mem_cgroup_is_root(mc.to)) {
mc.precharge += count;
/* we don't need css_get for root */
return ret;
}
-   /* try to charge at once */
-   if (count > 1) {
-   struct page_counter *dummy;
-   /*
-* "memcg" cannot be under rmdir() because we've already checked
-* by cgroup_lock_live_cgroup() that it is not removed and we
-* are still under the same cgroup_mutex. So we can postpone
-* css_get().
-*/
-   if (page_counter_try_charge(&memcg->memory, count, &dummy))
-   goto one_by_one;
-   if (do_swap_account &&
-   page_counter_try_charge(&memcg->memsw, count, &dummy)) {
-   page_counter_uncharge(&memcg->memory, count);
-   goto one_by_one;
-   }
+
+   /* Try a single bulk charge without reclaim first */
+   ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count);
+   if (!ret) {
mc.precharge += count;
return ret;
}
-one_by_one:
-   /* fall back to one by one charge */
+
+   /* Try charges one by one with reclaim */
while (count--) {
-   if (signal_pending(current)) {
-   ret = -EINTR;
-   break;
-   }
-   if (!batch_count--) {
-   batch_count = PRECHARGE_COUNT_AT_ONCE;
-   cond_resched();
-   }
-   ret = mem_cgroup_try_charge(memcg,
+   ret = mem_cgroup_try_charge(mc.to,
GFP_KERNEL & ~__GFP_NORETRY, 1);
+   /*
+* In case of failure, any residual charges against
+* mc.to will be dropped by mem_cgroup_clear_mc()
+* later on.
+*/
if (ret)
-   /* mem_cgroup_clear_mc() will do uncharge later */
return ret;
mc.precharge++;
+   cond_resched();
}
-   return ret;
+   return 0;
 }
 
 /**
-- 
2.10.2



[Devel] [PATCH rh7 v2 20/21] ms/mm: memcontrol: teach uncharge_list to deal with kmem pages

2017-01-12 Thread Andrey Ryabinin
From: Vladimir Davydov 

Page table pages are batched-freed in release_pages on most
architectures.  If we want to charge them to kmemcg (this is what is
done later in this series), we need to teach mem_cgroup_uncharge_list to
handle kmem pages.

Link: http://lkml.kernel.org/r/18d5c09e97f80074ed25b97a7d0f32b95d875717.1464079538.git.vdavy...@virtuozzo.com
Signed-off-by: Vladimir Davydov 
Cc: Johannes Weiner 
Cc: Michal Hocko 
Cc: Eric Dumazet 
Cc: Minchan Kim 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 5e8d35f849b1969b900695ae191326bfacf6bfc6)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 37 +
 1 file changed, 21 insertions(+), 16 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 6c11788..0183a9c 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6922,15 +6922,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg)
 static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout,
   unsigned long nr_mem, unsigned long nr_memsw,
   unsigned long nr_anon, unsigned long nr_file,
-  unsigned long nr_huge, struct page *dummy_page)
+  unsigned long nr_huge, unsigned long nr_kmem,
+  struct page *dummy_page)
 {
unsigned long flags;
 
if (!mem_cgroup_is_root(memcg)) {
if (nr_mem)
-   page_counter_uncharge(&memcg->memory, nr_mem);
+   page_counter_uncharge(&memcg->memory, nr_mem + nr_kmem);
if (nr_memsw)
-   page_counter_uncharge(&memcg->memsw, nr_memsw);
+   page_counter_uncharge(&memcg->memsw, nr_memsw + nr_kmem);
 
memcg_oom_recover(memcg);
}
@@ -6952,6 +6953,7 @@ static void uncharge_list(struct list_head *page_list)
unsigned long nr_anon = 0;
unsigned long nr_file = 0;
unsigned long nr_huge = 0;
+   unsigned long nr_kmem = 0;
unsigned long pgpgout = 0;
unsigned long nr_mem = 0;
struct list_head *next;
@@ -6981,23 +6983,26 @@ static void uncharge_list(struct list_head *page_list)
if (memcg != pc->mem_cgroup) {
if (memcg) {
uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
-  nr_anon, nr_file, nr_huge, page);
-   pgpgout = nr_mem = nr_memsw = 0;
+   nr_anon, nr_file, nr_huge, nr_kmem, page);
+   pgpgout = nr_mem = nr_memsw = nr_kmem = 0;
nr_anon = nr_file = nr_huge = 0;
}
memcg = pc->mem_cgroup;
}
 
-   if (PageTransHuge(page)) {
-   nr_pages <<= compound_order(page);
-   VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-   nr_huge += nr_pages;
-   }
-
-   if (PageAnon(page))
-   nr_anon += nr_pages;
-   else
-   nr_file += nr_pages;
+   if (!PageKmemcg(page)) {
+   if (PageTransHuge(page)) {
+   nr_pages <<= compound_order(page);
+   VM_BUG_ON_PAGE(!PageTransHuge(page), page);
+   nr_huge += nr_pages;
+   }
+   if (PageAnon(page))
+   nr_anon += nr_pages;
+   else
+   nr_file += nr_pages;
+   pgpgout++;
+   } else
+   nr_kmem += 1 << compound_order(page);
 
if (pc->flags & PCG_MEM)
nr_mem += nr_pages;
@@ -7010,7 +7015,7 @@ static void uncharge_list(struct list_head *page_list)
 
if (memcg)
uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw,
-  nr_anon, nr_file, nr_huge, page);
+  nr_anon, nr_file, nr_huge, nr_kmem, page);
 }
 
 /**
-- 
2.10.2



[Devel] [PATCH rh7 v2 09/21] ms/mm: memcontrol: reclaim at least once for __GFP_NORETRY

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

Currently, __GFP_NORETRY tries charging once and gives up before even
trying to reclaim.  Bring the behavior on par with the page allocator
and reclaim at least once before giving up.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 28c34c291e746aab1c2bfd6d6609b2e47fa0978b)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index f006cdd..3608d80 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2804,13 +2804,13 @@ retry:
if (!(gfp_mask & __GFP_WAIT))
goto nomem;
 
-   if (gfp_mask & __GFP_NORETRY)
-   goto nomem;
-
nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
 
if (mem_cgroup_margin(mem_over_limit) >= batch)
goto retry;
+
+   if (gfp_mask & __GFP_NORETRY)
+   goto nomem;
/*
 * Even though the limit is exceeded at this point, reclaim
 * may have been able to free some pages.  Retry the charge
-- 
2.10.2



[Devel] [PATCH rh7 v2 14/21] ms/mm: memcontrol: catch root bypass in move precharge

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

When mem_cgroup_try_charge() returns -EINTR, it bypassed the charge to
the root memcg.  But move precharging does not catch this and treats
this case as if no charge had happened, thus leaking a charge against
root.  Because of an old optimization, the root memcg's res_counter is
not actually charged right now, but it's still an imbalance and
subsequent patches will charge the root memcg again.

Catch those bypasses to the root memcg and properly cancel them before
giving up the move.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 692e7c45d95ad1064b6911800e2cfec7fc0236db)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 32533bf..cddfb93 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -6738,6 +6738,10 @@ static int mem_cgroup_do_precharge(unsigned long count)
mc.precharge += count;
return ret;
}
+   if (ret == -EINTR) {
+   __mem_cgroup_cancel_charge(root_mem_cgroup, count);
+   return ret;
+   }
 
/* Try charges one by one with reclaim */
while (count--) {
@@ -6746,8 +6750,11 @@ static int mem_cgroup_do_precharge(unsigned long count)
/*
 * In case of failure, any residual charges against
 * mc.to will be dropped by mem_cgroup_clear_mc()
-* later on.
+* later on.  However, cancel any charges that are
+* bypassed to root right away or they'll be lost.
 */
+   if (ret == -EINTR)
+   __mem_cgroup_cancel_charge(root_mem_cgroup, 1);
if (ret)
return ret;
mc.precharge++;
-- 
2.10.2



[Devel] [PATCH rh7 v2 15/21] ms/mm: memcontrol: remove ordering between pc->mem_cgroup and PageCgroupUsed

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

There is a write barrier between setting pc->mem_cgroup and
PageCgroupUsed, which was added to allow LRU operations to lookup the
memcg LRU list of a page without acquiring the page_cgroup lock.

But ever since commit 38c5d72f3ebe ("memcg: simplify LRU handling by new
rule"), pages are ensured to be off-LRU while charging, so nobody else
is changing LRU state while pc->mem_cgroup is being written, and there
are no read barriers anymore.

Remove the unnecessary write barrier.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 9a2385eef9f28fb5260c48c45fc8fe01f1da70a6)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 9 -
 1 file changed, 9 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index cddfb93..2b04b1e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2977,14 +2977,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg,
}
 
pc->mem_cgroup = memcg;
-   /*
-* We access a page_cgroup asynchronously without lock_page_cgroup().
-* Especially when a page_cgroup is taken from a page, pc->mem_cgroup
-* is accessed after testing USED bit. To make pc->mem_cgroup visible
-* before USED bit, we need memory barrier here.
-* See mem_cgroup_add_lru_list(), etc.
-*/
-   smp_wmb();
SetPageCgroupUsed(pc);
 
if (lrucare) {
@@ -3520,7 +3512,6 @@ void mem_cgroup_split_huge_fixup(struct page *head)
for (i = 1; i < HPAGE_PMD_NR; i++) {
pc = head_pc + i;
pc->mem_cgroup = memcg;
-   smp_wmb();/* see __commit_charge() */
pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT;
}
__this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE],
-- 
2.10.2



[Devel] [PATCH rh7 v2 21/21] ms/mm: memcontrol: only mark charged pages with PageKmemcg

2017-01-12 Thread Andrey Ryabinin
From: Vladimir Davydov 

To distinguish non-slab pages charged to kmemcg we mark them PageKmemcg,
which sets page->_mapcount to -512.  Currently, we set/clear PageKmemcg
in __alloc_pages_nodemask()/free_pages_prepare() for any page allocated
with __GFP_ACCOUNT, including those that aren't actually charged to any
cgroup, i.e. allocated from the root cgroup context.  To avoid overhead
in case cgroups are not used, we only do that if memcg_kmem_enabled() is
true.  The latter is set iff there are kmem-enabled memory cgroups
(online or offline).  The root cgroup is not considered kmem-enabled.

As a result, if a page is allocated with __GFP_ACCOUNT for the root
cgroup when there are kmem-enabled memory cgroups and is freed after all
kmem-enabled memory cgroups were removed, e.g.

  # no memory cgroups has been created yet, create one
  mkdir /sys/fs/cgroup/memory/test
  # run something allocating pages with __GFP_ACCOUNT, e.g.
  # a program using pipe
  dmesg | tail
  # remove the memory cgroup
  rmdir /sys/fs/cgroup/memory/test

we'll get bad page state bug complaining about page->_mapcount != -1:

  BUG: Bad page state in process swapper/0  pfn:1fd945c
  page:ea007f651700 count:0 mapcount:-511 mapping:  (null) index:0x0
  flags: 0x1000()

To avoid that, let's mark with PageKmemcg only those pages that are
actually charged to and hence pin a non-root memory cgroup.
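
For context, the allocation-side counterpart (the hunk below only touches the
uncharge side) can be pictured as the sketch below.  The helper name, the use
of memcg_charge_kmem() with a page count, and the overall structure are
assumptions for illustration, not code from this patch.

static int example_charge_kmem_page(struct page *page, gfp_t gfp, int order)
{
	struct mem_cgroup *memcg = get_mem_cgroup_from_mm(current->mm);
	int ret = 0;

	if (!mem_cgroup_is_root(memcg)) {
		ret = memcg_charge_kmem(memcg, gfp, 1 << order);
		if (!ret)
			__SetPageKmemcg(page);	/* page now pins a non-root memcg */
	}
	css_put(&memcg->css);
	return ret;
}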

Fixes: 4949148ad433 ("mm: charge/uncharge kmemcg from generic page allocator paths")
Reported-and-tested-by: Eric Dumazet 
Signed-off-by: Vladimir Davydov 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit c4159a75b64c0e67caededf4d7372c1b58a5f42a)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 0183a9c..dc83f4e 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -7001,8 +7001,10 @@ static void uncharge_list(struct list_head *page_list)
else
nr_file += nr_pages;
pgpgout++;
-   } else
+   } else {
nr_kmem += 1 << compound_order(page);
+   __ClearPageKmemcg(page);
+   }
 
if (pc->flags & PCG_MEM)
nr_mem += nr_pages;
-- 
2.10.2



[Devel] [PATCH rh7 v2 12/21] ms/mm: memcontrol: remove explicit OOM parameter in charge path

2017-01-12 Thread Andrey Ryabinin
From: Michal Hocko 

For the page allocator, __GFP_NORETRY implies that no OOM should be
triggered, whereas memcg has an explicit parameter to disable OOM.

The only callsites that want OOM disabled are THP charges and charge
moving.  THP already uses __GFP_NORETRY and charge moving can use it as
well - one full reclaim cycle should be plenty.  Switch it over, then
remove the OOM parameter.
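
For illustration, a callsite that must not trigger the OOM killer now simply
encodes that in the gfp mask (sketch only; GFP_TRANSHUGE already carries
__GFP_NORETRY, as patch 10/21 of this series notes):

static int example_charge_thp(struct mem_cgroup *memcg)
{
	/* __GFP_NORETRY in the mask suppresses the memcg OOM killer. */
	return mem_cgroup_try_charge(memcg, GFP_TRANSHUGE, HPAGE_PMD_NR);
}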

Signed-off-by: Johannes Weiner 
Signed-off-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 0029e19ebf84dcd70b226820daa7747b28d5956d)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 51 +--
 1 file changed, 13 insertions(+), 38 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index a53d55d..a1aab9f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2748,15 +2748,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
  * mem_cgroup_try_charge - try charging a memcg
  * @memcg: memcg to charge
  * @nr_pages: number of pages to charge
- * @oom: trigger OOM if reclaim fails
  *
  * Returns 0 if @memcg was charged successfully, -EINTR if the charge
  * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed.
  */
 static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 gfp_t gfp_mask,
-unsigned int nr_pages,
-bool oom)
+unsigned int nr_pages)
 {
unsigned int batch = max(CHARGE_BATCH, nr_pages);
int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
@@ -2838,9 +2836,6 @@ retry:
if (fatal_signal_pending(current))
goto bypass;
 
-   if (!oom)
-   goto nomem;
-
mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch));
mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
 
@@ -2868,15 +2863,14 @@ done:
  */
 static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm,
 gfp_t gfp_mask,
-unsigned int nr_pages,
-bool oom)
+unsigned int nr_pages)
 
 {
struct mem_cgroup *memcg;
int ret;
 
memcg = get_mem_cgroup_from_mm(mm);
-   ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom);
+   ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages);
css_put(&memcg->css);
if (ret == -EINTR)
memcg = root_mem_cgroup;
@@ -3044,15 +3038,8 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp,
 {
struct page_counter *counter;
int ret = 0;
-   bool may_oom;
 
-   /*
-* Conditions under which we can wait for the oom_killer. Those are
-* the same conditions tested by the core page allocator
-*/
-   may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY);
-
-   ret = mem_cgroup_try_charge(memcg, gfp, nr_pages, may_oom);
+   ret = mem_cgroup_try_charge(memcg, gfp, nr_pages);
if (ret == -EINTR)  {
/*
 * mem_cgroup_try_charge() chosed to bypass to root due to
@@ -3686,7 +3673,6 @@ int mem_cgroup_newpage_charge(struct page *page,
 {
unsigned int nr_pages = 1;
struct mem_cgroup *memcg;
-   bool oom = true;
 
if (mem_cgroup_disabled())
return 0;
@@ -3698,14 +3684,9 @@ int mem_cgroup_newpage_charge(struct page *page,
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
-   /*
-* Never OOM-kill a process for a huge page.  The
-* fault handler will fall back to regular pages.
-*/
-   oom = false;
}
 
-   memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom);
+   memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages);
if (!memcg)
return -ENOMEM;
__mem_cgroup_commit_charge(memcg, page, nr_pages,
@@ -3742,7 +3723,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm,
memcg = try_get_mem_cgroup_from_page(page);
if (!memcg)
memcg = get_mem_cgroup_from_mm(mm);
-   ret = mem_cgroup_try_charge(memcg, mask, 1, true);
+   ret = mem_cgroup_try_charge(memcg, mask, 1);
css_put(&memcg->css);
if (ret == -EINTR)
memcg = root_mem_cgroup;
@@ -3769,7 +3750,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page,
if (!PageSwapCache(page)) {
struct 

[Devel] [PATCH rh7 v2 10/21] ms/mm: huge_memory: use GFP_TRANSHUGE when charging huge pages

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

Transparent huge page charges prefer falling back to regular pages
rather than spending a lot of time in direct reclaim.

Desired reclaim behavior is usually declared in the gfp mask, but THP
charges use GFP_KERNEL and then rely on the fact that OOM is disabled
for THP charges, and that OOM-disabled charges don't retry reclaim.
Needless to say, this is anything but obvious and quite error prone.

Convert THP charges to use GFP_TRANSHUGE instead, which implies
__GFP_NORETRY, to indicate the low-latency requirement.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit d51d885bbb137cc8e1704e76be1846c5e0d5e8b4)
Signed-off-by: Andrey Ryabinin 
---
 mm/huge_memory.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index c406494..14ed98b 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -708,7 +708,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm,
 
VM_BUG_ON_PAGE(!PageCompound(page), page);
 
-   if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) {
+   if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_TRANSHUGE))) {
put_page(page);
count_vm_event(THP_FAULT_FALLBACK);
return VM_FAULT_FALLBACK;
@@ -1241,7 +1241,7 @@ alloc:
goto out;
}
 
-   if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) {
+   if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_TRANSHUGE))) {
put_page(new_page);
if (page) {
split_huge_page(page);
@@ -2524,7 +2524,7 @@ static void collapse_huge_page(struct mm_struct *mm,
if (!new_page)
return;
 
-   if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL)))
+   if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_TRANSHUGE)))
return;
 
/*
-- 
2.10.2



[Devel] [PATCH rh7 v2 11/21] ms/mm: memcontrol: retry reclaim for oom-disabled and __GFP_NOFAIL charges

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

There is no reason why oom-disabled and __GFP_NOFAIL charges should try
to reclaim only once when every other charge tries several times before
giving up.  Make them all retry the same number of times.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 9b1306192d335759a6cf2f3b404c49e811e5f953)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 3608d80..a53d55d 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2759,7 +2759,7 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg,
 bool oom)
 {
unsigned int batch = max(CHARGE_BATCH, nr_pages);
-   int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES;
+   int nr_retries = MEM_CGROUP_RECLAIM_RETRIES;
struct mem_cgroup *mem_over_limit;
struct page_counter *counter;
unsigned long nr_reclaimed;
@@ -2829,6 +2829,9 @@ retry:
if (mem_cgroup_wait_acct_move(mem_over_limit))
goto retry;
 
+   if (nr_retries--)
+   goto retry;
+
if (gfp_mask & __GFP_NOFAIL)
goto bypass;
 
@@ -2838,9 +2841,6 @@ retry:
if (!oom)
goto nomem;
 
-   if (nr_oom_retries--)
-   goto retry;
-
mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch));
mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
 
-- 
2.10.2



[Devel] [PATCH rh7 v2 07/21] ms/mm: memcontrol: fold mem_cgroup_do_charge()

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

These patches rework memcg charge lifetime to integrate more naturally
with the lifetime of user pages.  This drastically simplifies the code
and reduces charging and uncharging overhead.  The most expensive part
of charging and uncharging is the page_cgroup bit spinlock, which is
removed entirely after this series.

Here are the top-10 profile entries of a stress test that reads a 128G
sparse file on a freshly booted box, without even a dedicated cgroup
(i.e. executing in the root memcg).  Before:

15.36%  cat  [kernel.kallsyms]   [k] copy_user_generic_string
13.31%  cat  [kernel.kallsyms]   [k] memset
11.48%  cat  [kernel.kallsyms]   [k] do_mpage_readpage
 4.23%  cat  [kernel.kallsyms]   [k] get_page_from_freelist
 2.38%  cat  [kernel.kallsyms]   [k] put_page
 2.32%  cat  [kernel.kallsyms]   [k] __mem_cgroup_commit_charge
 2.18%  kswapd0  [kernel.kallsyms]   [k] __mem_cgroup_uncharge_common
 1.92%  kswapd0  [kernel.kallsyms]   [k] shrink_page_list
 1.86%  cat  [kernel.kallsyms]   [k] __radix_tree_lookup
 1.62%  cat  [kernel.kallsyms]   [k] __pagevec_lru_add_fn

After:

15.67%   cat  [kernel.kallsyms]   [k] copy_user_generic_string
13.48%   cat  [kernel.kallsyms]   [k] memset
11.42%   cat  [kernel.kallsyms]   [k] do_mpage_readpage
 3.98%   cat  [kernel.kallsyms]   [k] get_page_from_freelist
 2.46%   cat  [kernel.kallsyms]   [k] put_page
 2.13%   kswapd0  [kernel.kallsyms]   [k] shrink_page_list
 1.88%   cat  [kernel.kallsyms]   [k] __radix_tree_lookup
 1.67%   cat  [kernel.kallsyms]   [k] __pagevec_lru_add_fn
 1.39%   kswapd0  [kernel.kallsyms]   [k] free_pcppages_bulk
 1.30%   cat  [kernel.kallsyms]   [k] kfree

As you can see, the memcg footprint has shrunk quite a bit.

   text    data     bss     dec     hex filename
  37970    9892     400   48262    bc86 mm/memcontrol.o.old
  35239    9892     400   45531    b1db mm/memcontrol.o

This patch (of 13):

This function was split out because mem_cgroup_try_charge() got too big.
But having essentially one sequence of operations arbitrarily split in
half is not good for reworking the code.  Fold it back in.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Cc: Hugh Dickins 
Cc: Tejun Heo 
Cc: Vladimir Davydov 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 6539cc053869bd32a2db731b215b7c73b11f68d3)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 191 +---
 1 file changed, 57 insertions(+), 134 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 1e5d914..f904257 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2744,86 +2744,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb,
return NOTIFY_OK;
 }
 
-
-/* See mem_cgroup_try_charge() for details */
-enum {
-   CHARGE_OK,  /* success */
-   CHARGE_RETRY,   /* need to retry but retry is not bad */
-   CHARGE_NOMEM,   /* we can't do more. return -ENOMEM */
-   CHARGE_WOULDBLOCK,  /* GFP_WAIT wasn't set and no enough res. */
-};
-
-static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask,
-   unsigned int nr_pages, unsigned int min_pages,
-   bool invoke_oom)
-{
-   struct mem_cgroup *mem_over_limit;
-   struct page_counter *counter;
-   unsigned long flags = 0;
-   int ret;
-
-   ret = page_counter_try_charge(&memcg->memory, nr_pages, &counter);
-
-   if (likely(!ret)) {
-   if (!do_swap_account)
-   return CHARGE_OK;
-   ret = page_counter_try_charge(&memcg->memsw, nr_pages, &counter);
-   if (likely(!ret))
-   return CHARGE_OK;
-
-   page_counter_uncharge(&memcg->memory, nr_pages);
-   mem_over_limit = mem_cgroup_from_counter(counter, memsw);
-   flags |= MEM_CGROUP_RECLAIM_NOSWAP;
-   } else
-   mem_over_limit = mem_cgroup_from_counter(counter, memory);
-   /*
-* Never reclaim on behalf of optional batching, retry with a
-* single page instead.
-*/
-   if (nr_pages > min_pages)
-   return CHARGE_RETRY;
-
-   if (!(gfp_mask & __GFP_WAIT)) {
-   mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages);
-   return CHARGE_WOULDBLOCK;
-   }
-
-   ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags);
-   if 

[Devel] [PATCH rh7 v2 05/21] ms/memcg: do not replicate get_mem_cgroup_from_mm in __mem_cgroup_try_charge

2017-01-12 Thread Andrey Ryabinin
From: Michal Hocko 

__mem_cgroup_try_charge duplicates get_mem_cgroup_from_mm for charges
which came without a memcg.  The only reason seems to be a tiny
optimization when css_tryget is not called if the charge can be consumed
from the stock.  Nevertheless css_tryget is very cheap since it has been
reworked to use per-cpu counting so this optimization doesn't give us
anything these days.

So let's drop the code duplication so that the code is more readable.

Signed-off-by: Michal Hocko 
Signed-off-by: Johannes Weiner 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit b6b6cc72bc404c952968530d7df4c3a4ab82b65b)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 50 ++
 1 file changed, 6 insertions(+), 44 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 40ac81b..d6413c5 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2873,52 +2873,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 again:
if (*ptr) { /* css should be a valid one */
memcg = *ptr;
-   if (mem_cgroup_is_root(memcg))
-   goto done;
-   if (consume_stock(memcg, nr_pages))
-   goto done;
-   css_get(&memcg->css);
} else {
-   struct task_struct *p;
-
-   rcu_read_lock();
-   p = rcu_dereference(mm->owner);
-   /*
-* Because we don't have task_lock(), "p" can exit.
-* In that case, "memcg" can point to root or p can be NULL with
-* race with swapoff. Then, we have small risk of mis-accouning.
-* But such kind of mis-account by race always happens because
-* we don't have cgroup_mutex(). It's overkill and we allo that
-* small race, here.
-* (*) swapoff at el will charge against mm-struct not against
-* task-struct. So, mm->owner can be NULL.
-*/
-   memcg = mem_cgroup_from_task(p);
-   if (!memcg)
-   memcg = root_mem_cgroup;
-   if (mem_cgroup_is_root(memcg)) {
-   rcu_read_unlock();
-   goto done;
-   }
-   if (consume_stock(memcg, nr_pages)) {
-   /*
-* It seems dagerous to access memcg without css_get().
-* But considering how consume_stok works, it's not
-* necessary. If consume_stock success, some charges
-* from this memcg are cached on this cpu. So, we
-* don't need to call css_get()/css_tryget() before
-* calling consume_stock().
-*/
-   rcu_read_unlock();
-   goto done;
-   }
-   /* after here, we may be blocked. we need to get refcnt */
-   if (!css_tryget(&memcg->css)) {
-   rcu_read_unlock();
-   goto again;
-   }
-   rcu_read_unlock();
+   memcg = get_mem_cgroup_from_mm(mm);
}
+   if (mem_cgroup_is_root(memcg))
+   goto done;
+   if (consume_stock(memcg, nr_pages))
+   goto done;
 
do {
bool invoke_oom = oom && !nr_oom_retries;
@@ -2986,8 +2948,8 @@ again:
try_to_free_mem_cgroup_pages(iter, nr_pages, gfp_mask, false);
} while ((iter = parent_mem_cgroup(iter)));
 
-   css_put(&memcg->css);
 done:
+   css_put(&memcg->css);
*ptr = memcg;
return 0;
 nomem:
-- 
2.10.2



[Devel] [PATCH rh7 v2 02/21] ms/mm: memcg: push !mm handling out to page cache charge function

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

Only page cache charges can happen without an mm context, so push this
special case out of the inner core and into the cache charge function.

An ancient comment explains that the mm can also be NULL in case the
task is currently being migrated, but that is not actually true with the
current case, so just remove it.

Signed-off-by: Johannes Weiner 
Cc: Michal Hocko 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 284f39afeaa4ab1409b8f43b29cdea3007960ee3)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 15 ++-
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index be1c492..7b2a99f 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2870,15 +2870,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm,
 
if (gfp_mask & __GFP_NOFAIL)
oom = false;
-
-   /*
-* We always charge the cgroup the mm_struct belongs to.
-* The mm_struct's mem_cgroup changes on task migration if the
-* thread group leader migrates. It's possible that mm is not
-* set, if so charge the root memcg (happens for pagecache usage).
-*/
-   if (!*ptr && !mm)
-   *ptr = root_mem_cgroup;
 again:
if (*ptr) { /* css should be a valid one */
memcg = *ptr;
@@ -3971,6 +3962,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
return 0;
 
if (!PageSwapCache(page)) {
+   /*
+* Page cache insertions can happen without an actual
+* task context, e.g. during disk probing on boot.
+*/
+   if (!mm)
+   memcg = root_mem_cgroup;
+   ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
if (ret != -ENOMEM)
__mem_cgroup_commit_charge(memcg, page, 1, type, false);
-- 
2.10.2



[Devel] [PATCH rh7 v2 01/21] ms/mm: memcg: inline mem_cgroup_charge_common()

2017-01-12 Thread Andrey Ryabinin
From: Johannes Weiner 

mem_cgroup_charge_common() is used by both cache and anon pages, but
most of its body only applies to anon pages and the remainder is not
worth having in a separate function.

Signed-off-by: Johannes Weiner 
Acked-by: Michal Hocko 
Signed-off-by: Andrew Morton 
Signed-off-by: Linus Torvalds 

https://jira.sw.ru/browse/PSBM-51558
(cherry picked from commit 1bec6b333e241a9db47d3939fb08a4e174ece02f)
Signed-off-by: Andrey Ryabinin 
---
 mm/memcontrol.c | 40 
 1 file changed, 16 insertions(+), 24 deletions(-)

diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index d23ca87..be1c492 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -3818,20 +3818,21 @@ out:
return ret;
 }
 
-/*
- * Charge the memory controller for page usage.
- * Return
- * 0 if the charge was successful
- * < 0 if the cgroup is over its limit
- */
-static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
-   gfp_t gfp_mask, enum charge_type ctype)
+int mem_cgroup_newpage_charge(struct page *page,
+ struct mm_struct *mm, gfp_t gfp_mask)
 {
struct mem_cgroup *memcg = NULL;
unsigned int nr_pages = 1;
bool oom = true;
int ret;
 
+   if (mem_cgroup_disabled())
+   return 0;
+
+   VM_BUG_ON_PAGE(page_mapped(page), page);
+   VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
+   VM_BUG_ON(!mm);
+
if (PageTransHuge(page)) {
nr_pages <<= compound_order(page);
VM_BUG_ON_PAGE(!PageTransHuge(page), page);
@@ -3845,22 +3846,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm,
ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, &memcg, oom);
if (ret == -ENOMEM)
return ret;
-   __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false);
+   __mem_cgroup_commit_charge(memcg, page, nr_pages,
+  MEM_CGROUP_CHARGE_TYPE_ANON, false);
return 0;
 }
 
-int mem_cgroup_newpage_charge(struct page *page,
- struct mm_struct *mm, gfp_t gfp_mask)
-{
-   if (mem_cgroup_disabled())
-   return 0;
-   VM_BUG_ON_PAGE(page_mapped(page), page);
-   VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page);
-   VM_BUG_ON(!mm);
-   return mem_cgroup_charge_common(page, mm, gfp_mask,
-   MEM_CGROUP_CHARGE_TYPE_ANON);
-}
-
 /*
  * While swap-in, try_charge -> commit or cancel, the page is locked.
  * And when try_charge() successfully returns, one refcnt to memcg without
@@ -3980,9 +3970,11 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm,
if (PageCompound(page))
return 0;
 
-   if (!PageSwapCache(page))
-   ret = mem_cgroup_charge_common(page, mm, gfp_mask, type);
-   else { /* page is swapcache/shmem */
+   if (!PageSwapCache(page)) {
+   ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true);
+   if (ret != -ENOMEM)
+   __mem_cgroup_commit_charge(memcg, page, 1, type, false);
+   } else { /* page is swapcache/shmem */
ret = __mem_cgroup_try_charge_swapin(mm, page,
 gfp_mask, &memcg);
if (!ret)
-- 
2.10.2
