[Devel] [PATCH rh7 v2 08/21] ms/mm: memcontrol: rearrange charging fast path
From: Johannes WeinerThe charging path currently starts out with OOM condition checks when OOM is the rarest possible case. Rearrange this code to run OOM/task dying checks only after trying the percpu charge and the res_counter charge and bail out before entering reclaim. Attempting a charge does not hurt an (oom-)killed task as much as every charge attempt having to check OOM conditions. Also, only check __GFP_NOFAIL when the charge would actually fail. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 06b078fc065fe1fe7097675c8ee416aa2ef94fb3) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 32 +--- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f904257..f006cdd 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2767,21 +2767,6 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, if (mem_cgroup_is_root(memcg)) goto done; - /* -* Unlike in global OOM situations, memcg is not in a physical -* memory shortage. Allow dying and OOM-killed tasks to -* bypass the last charges so that they can exit quickly and -* free their memory. -*/ - if (unlikely(test_thread_flag(TIF_MEMDIE) || -fatal_signal_pending(current))) - goto bypass; - - if (unlikely(task_in_memcg_oom(current))) - goto nomem; - - if (gfp_mask & __GFP_NOFAIL) - oom = false; retry: if (consume_stock(memcg, nr_pages)) goto done; @@ -2802,6 +2787,20 @@ retry: goto retry; } + /* +* Unlike in global OOM situations, memcg is not in a physical +* memory shortage. Allow dying and OOM-killed tasks to +* bypass the last charges so that they can exit quickly and +* free their memory. 
+*/ + if (unlikely(test_thread_flag(TIF_MEMDIE) || +fatal_signal_pending(current) || +current->flags & PF_EXITING)) + goto bypass; + + if (unlikely(task_in_memcg_oom(current))) + goto nomem; + if (!(gfp_mask & __GFP_WAIT)) goto nomem; @@ -2830,6 +2829,9 @@ retry: if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; + if (gfp_mask & __GFP_NOFAIL) + goto bypass; + if (fatal_signal_pending(current)) goto bypass; -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 04/21] ms/memcg: get_mem_cgroup_from_mm()
From: Johannes WeinerInstead of returning NULL from try_get_mem_cgroup_from_mm() when the mm owner is exiting, just return root_mem_cgroup. This makes sense for all callsites and gets rid of some of them having to fallback manually. [fengguang...@intel.com: fix warnings] Signed-off-by: Johannes Weiner Signed-off-by: Fengguang Wu Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit df381975463996178d685f6ef7d3555c5f887201) Signed-off-by: Andrey Ryabinin --- include/linux/memcontrol.h | 12 ++-- mm/memcontrol.c| 37 + mm/oom_kill.c | 4 ++-- net/packet/af_packet.c | 2 +- 4 files changed, 26 insertions(+), 29 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index f1b599f..bcf7752 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -85,7 +85,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) extern struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page); extern struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p); -extern struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm); +extern struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm); extern struct mem_cgroup *parent_mem_cgroup(struct mem_cgroup *memcg); extern struct mem_cgroup *mem_cgroup_from_cont(struct cgroup *cont); @@ -304,17 +304,17 @@ static inline struct mem_cgroup *try_get_mem_cgroup_from_page(struct page *page) return NULL; } -static inline struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) -{ - return NULL; -} - static inline bool mm_match_cgroup(struct mm_struct *mm, struct mem_cgroup *memcg) { return true; } +static inline struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) +{ + return NULL; +} + static inline int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) { diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 
d09d55d9..40ac81b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1166,15 +1166,24 @@ struct mem_cgroup *mem_cgroup_from_task(struct task_struct *p) return mem_cgroup_from_css(task_subsys_state(p, mem_cgroup_subsys_id)); } -struct mem_cgroup *try_get_mem_cgroup_from_mm(struct mm_struct *mm) +struct mem_cgroup *get_mem_cgroup_from_mm(struct mm_struct *mm) { struct mem_cgroup *memcg = NULL; rcu_read_lock(); do { - memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); - if (unlikely(!memcg)) - break; + /* +* Page cache insertions can happen withou an +* actual mm context, e.g. during disk probing +* on boot, loopback IO, acct() writes etc. +*/ + if (unlikely(!mm)) + memcg = root_mem_cgroup; + else { + memcg = mem_cgroup_from_task(rcu_dereference(mm->owner)); + if (unlikely(!memcg)) + memcg = root_mem_cgroup; + } } while (!css_tryget(>css)); rcu_read_unlock(); return memcg; @@ -1546,7 +1555,7 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) p = find_lock_task_mm(task); if (p) { - curr = try_get_mem_cgroup_from_mm(p->mm); + curr = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { /* @@ -1560,8 +1569,6 @@ int task_in_mem_cgroup(struct task_struct *task, const struct mem_cgroup *memcg) css_get(>css); task_unlock(task); } - if (!curr) - return 0; /* * We should check use_hierarchy of "memcg" not "curr". Because checking * use_hierarchy of "curr" here make this function true if hierarchy is @@ -1693,7 +1700,7 @@ void mem_cgroup_note_oom_kill(struct mem_cgroup *root_memcg, p = find_lock_task_mm(task); if (p) { - memcg = try_get_mem_cgroup_from_mm(p->mm); + memcg = get_mem_cgroup_from_mm(p->mm); task_unlock(p); } else { rcu_read_lock(); @@ -3458,9 +3465,7 @@ struct kmem_cache *__memcg_kmem_get_cache(struct kmem_cache *cachep, if (!current->mm || current->memcg_kmem_skip_account) return cachep; - memcg = try_get_mem_cgroup_from_mm(current->mm); - if (unlikely(!memcg)) - return cachep; + memcg =
Re: [Devel] [PATCH vz7] fuse: fuse_writepage_locked must check for FUSE_INVALIDATE_FILES (v2)
Maxim Patlasovwrites: > The patch fixes another race dealing with fuse_invalidate_files, > this time when it races with truncate(2): > > Thread A: the flusher performs writeback as usual: > > fuse_writepages --> > fuse_send_writepages --> > end_page_writeback > > but before fuse_send_writepages acquires fc->lock and calls > fuse_flush_writepages, > some innocent user process re-dirty-es the page. > > Thread B: truncate(2) attempts to truncate (shrink) file as usual: > > fuse_do_setattr --> > invalidate_inode_pages2 > > (This is possible because Thread A has not incremented fi->writectr yet.) But > invalidate_inode_pages2 finds that re-dirty-ed page and sticks in: > > invalidate_inode_pages2 --> > fuse_launder_page --> > fuse_writepage_locked --> > fuse_wait_on_page_writeback > > Thread A: the flusher proceeds with fuse_flush_writepages, sends write request > to userspace fuse daemon, but the daemon is not obliged to fulfill it > immediately. > So, thread B waits now for thread A, while thread A waits for userspace. > > Now fuse_invalidate_files steps in sticking in filemap_write_and_wait on the > page locked by Thread B (launder_page always work on a locked page). Deadlock. > > The patch fixes deadlock by waking up fuse_writepage_locked after marking > files with FAIL_IMMEDIATELY flag. > > Changed in v2: > - instead of flagging "fail_immediately", let fuse_writepage_locked return > fuse_file pointer, then the caller (fuse_launder_page) can use it for > conditional wait on __fuse_wait_on_page_writeback_or_invalidate. This is > important because otherwise fuse_invalidate_files may deadlock when > launder waits for fuse writeback. 
ACK-by: dmonak...@openvz.org > > Signed-off-by: Maxim Patlasov > --- > fs/fuse/file.c | 51 +-- > 1 file changed, 45 insertions(+), 6 deletions(-) > > diff --git a/fs/fuse/file.c b/fs/fuse/file.c > index 0ffc806..34e75c2 100644 > --- a/fs/fuse/file.c > +++ b/fs/fuse/file.c > @@ -1963,7 +1963,8 @@ static struct fuse_file *fuse_write_file(struct > fuse_conn *fc, > } > > static int fuse_writepage_locked(struct page *page, > - struct writeback_control *wbc) > + struct writeback_control *wbc, > + struct fuse_file **ff_pp) > { > struct address_space *mapping = page->mapping; > struct inode *inode = mapping->host; > @@ -1971,13 +1972,30 @@ static int fuse_writepage_locked(struct page *page, > struct fuse_inode *fi = get_fuse_inode(inode); > struct fuse_req *req; > struct page *tmp_page; > + struct fuse_file *ff; > + int err = 0; > > if (fuse_page_is_writeback(inode, page->index)) { > if (wbc->sync_mode != WB_SYNC_ALL) { > redirty_page_for_writepage(wbc, page); > return 0; > } > - fuse_wait_on_page_writeback(inode, page->index); > + > + /* we can acquire ff here because we do have locked pages here! 
> */ > + ff = fuse_write_file(fc, get_fuse_inode(inode)); > + if (!ff) > + goto dummy_end_page_wb_err; > + > + /* FUSE_NOTIFY_INVAL_FILES must be able to wake us up */ > + __fuse_wait_on_page_writeback_or_invalidate(inode, ff, > page->index); > + > + if (test_bit(FUSE_S_FAIL_IMMEDIATELY, >ff_state)) { > + if (ff_pp) > + *ff_pp = ff; > + goto dummy_end_page_wb; > + } > + > + fuse_release_ff(inode, ff); > } > > if (test_set_page_writeback(page)) > @@ -1995,6 +2013,8 @@ static int fuse_writepage_locked(struct page *page, > req->ff = fuse_write_file(fc, fi); > if (!req->ff) > goto err_nofile; > + if (ff_pp) > + *ff_pp = fuse_file_get(req->ff); > fuse_write_fill(req, req->ff, page_offset(page), 0); > fuse_account_request(fc, PAGE_CACHE_SIZE); > > @@ -2029,13 +2049,23 @@ err_free: > err: > end_page_writeback(page); > return -ENOMEM; > + > +dummy_end_page_wb_err: > + printk("FUSE: page under fwb dirtied on dead file\n"); > + err = -EIO; > + /* fall through ... */ > +dummy_end_page_wb: > + if (test_set_page_writeback(page)) > + BUG(); > + end_page_writeback(page); > + return err; > } > > static int fuse_writepage(struct page *page, struct writeback_control *wbc) > { > int err; > > - err = fuse_writepage_locked(page, wbc); > + err = fuse_writepage_locked(page, wbc, NULL); > unlock_page(page); > > return err; > @@ -2423,9 +2453,18 @@ static int fuse_launder_page(struct page *page) > struct writeback_control wbc = { > .sync_mode = WB_SYNC_ALL, >
[Devel] [PATCH rh7 v2 17/21] ms/mm: memcontrol: rewrite uncharge API
From: Johannes WeinerThe memcg uncharging code that is involved towards the end of a page's lifetime - truncation, reclaim, swapout, migration - is impressively complicated and fragile. Because anonymous and file pages were always charged before they had their page->mapping established, uncharges had to happen when the page type could still be known from the context; as in unmap for anonymous, page cache removal for file and shmem pages, and swap cache truncation for swap pages. However, these operations happen well before the page is actually freed, and so a lot of synchronization is necessary: - Charging, uncharging, page migration, and charge migration all need to take a per-page bit spinlock as they could race with uncharging. - Swap cache truncation happens during both swap-in and swap-out, and possibly repeatedly before the page is actually freed. This means that the memcg swapout code is called from many contexts that make no sense and it has to figure out the direction from page state to make sure memory and memory+swap are always correctly charged. - On page migration, the old page might be unmapped but then reused, so memcg code has to prevent untimely uncharging in that case. Because this code - which should be a simple charge transfer - is so special-cased, it is not reusable for replace_page_cache(). But now that charged pages always have a page->mapping, introduce mem_cgroup_uncharge(), which is called after the final put_page(), when we know for sure that nobody is looking at the page anymore. For page migration, introduce mem_cgroup_migrate(), which is called after the migration is successful and the new page is fully rmapped. Because the old page is no longer uncharged after migration, prevent double charges by decoupling the page's memcg association (PCG_USED and pc->mem_cgroup) from the page holding an actual charge. The new bits PCG_MEM and PCG_MEMSW represent the respective charges and are transferred to the new page during migration. 
mem_cgroup_migrate() is suitable for replace_page_cache() as well, which gets rid of mem_cgroup_replace_page_cache(). However, care needs to be taken because both the source and the target page can already be charged and on the LRU when fuse is splicing: grab the page lock on the charge moving side to prevent changing pc->mem_cgroup of a page under migration. Also, the lruvecs of both pages change as we uncharge the old and charge the new during migration, and putback may race with us, so grab the lru lock and isolate the pages iff on LRU to prevent races and ensure the pages are on the right lruvec afterward. Swap accounting is massively simplified: because the page is no longer uncharged as early as swap cache deletion, a new mem_cgroup_swapout() can transfer the page's memory+swap charge (PCG_MEMSW) to the swap entry before the final put_page() in page reclaim. Finally, page_cgroup changes are now protected by whatever protection the page itself offers: anonymous pages are charged under the page table lock, whereas page cache insertions, swapin, and migration hold the page lock. Uncharging happens under full exclusion with no outstanding references. Charging and uncharging also ensure that the page is off-LRU, which serializes against charge migration. Remove the very costly page_cgroup lock and set pc->flags non-atomically. 
[mho...@suse.cz: mem_cgroup_charge_statistics needs preempt_disable] [vdavy...@parallels.com: fix flags definition] Signed-off-by: Johannes Weiner Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Tested-by: Jet Chen Acked-by: Michal Hocko Tested-by: Felipe Balbi Signed-off-by: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 0a31bc97c80c3fa87b32c091d9a930ac19cd0c40) Signed-off-by: Andrey Ryabinin --- Documentation/cgroups/memcg_test.txt | 128 +- drivers/staging/zcache/zcache-main.c | 2 +- include/linux/memcontrol.h | 49 +- include/linux/page_cgroup.h | 43 +- include/linux/swap.h | 12 +- mm/filemap.c | 4 +- mm/hmm_migrate.c | 12 - mm/memcontrol.c | 850 ++- mm/memory.c | 2 - mm/migrate.c | 39 +- mm/rmap.c| 1 - mm/shmem.c | 8 +- mm/swap.c| 5 + mm/swap_state.c | 8 +- mm/swapfile.c| 9 +- mm/truncate.c| 9 - mm/tswap.c | 2 +- mm/vmscan.c
[Devel] [PATCH rh7 v2 06/21] ms/memcg: sanitize __mem_cgroup_try_charge() call protocol
From: Johannes WeinerSome callsites pass a memcg directly, some callsites pass an mm that then has to be translated to a memcg. This makes for a terrible function interface. Just push the mm-to-memcg translation into the respective callsites and always pass a memcg to mem_cgroup_try_charge(). [mho...@suse.cz: add charge mm helper] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 6d1fdc48938cd51a3964778d78f27cb26c8eb55d) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 207 1 file changed, 102 insertions(+), 105 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d6413c5..1e5d914 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2745,7 +2745,7 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, } -/* See __mem_cgroup_try_charge() for details */ +/* See mem_cgroup_try_charge() for details */ enum { CHARGE_OK, /* success */ CHARGE_RETRY, /* need to retry but retry is not bad */ @@ -2824,45 +2824,35 @@ static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, return CHARGE_NOMEM; } -/* - * __mem_cgroup_try_charge() does - * 1. detect memcg to be charged against from passed *mm and *ptr, - * 2. update page_counter - * 3. call memory reclaim if necessary. - * - * In some special case, if the task is fatal, fatal_signal_pending() or - * has TIF_MEMDIE, this function returns -EINTR while writing root_mem_cgroup - * to *ptr. There are two reasons for this. 1: fatal threads should quit as soon - * as possible without any hazards. 2: all pages should have a valid - * pc->mem_cgroup. If mm is NULL and the caller doesn't pass a valid memcg - * pointer, that is treated as a charge to root_mem_cgroup. - * - * So __mem_cgroup_try_charge() will return - * 0 ... on success, filling *ptr with a valid memcg pointer. - * -ENOMEM ... charge failure because of resource limits. - * -EINTR ... 
if thread is fatal. *ptr is filled with root_mem_cgroup. +/** + * mem_cgroup_try_charge - try charging a memcg + * @memcg: memcg to charge + * @nr_pages: number of pages to charge + * @oom: trigger OOM if reclaim fails * - * Unlike the exported interface, an "oom" parameter is added. if oom==true, - * the oom-killer can be invoked. + * Returns 0 if @memcg was charged successfully, -EINTR if the charge + * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. */ -static int __mem_cgroup_try_charge(struct mm_struct *mm, - gfp_t gfp_mask, - unsigned int nr_pages, - struct mem_cgroup **ptr, - bool oom) +static int mem_cgroup_try_charge(struct mem_cgroup *memcg, +gfp_t gfp_mask, +unsigned int nr_pages, +bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; - struct mem_cgroup *memcg = NULL, *iter; + struct mem_cgroup *iter; int ret; + if (mem_cgroup_is_root(memcg)) + goto done; /* -* Unlike gloval-vm's OOM-kill, we're not in memory shortage -* in system level. So, allow to go ahead dying process in addition to -* MEMDIE process. +* Unlike in global OOM situations, memcg is not in a physical +* memory shortage. Allow dying and OOM-killed tasks to +* bypass the last charges so that they can exit quickly and +* free their memory. 
*/ - if (unlikely(test_thread_flag(TIF_MEMDIE) -|| fatal_signal_pending(current))) + if (unlikely(test_thread_flag(TIF_MEMDIE) || +fatal_signal_pending(current))) goto bypass; if (unlikely(task_in_memcg_oom(current))) @@ -2871,14 +2861,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (gfp_mask & __GFP_NOFAIL) oom = false; again: - if (*ptr) { /* css should be a valid one */ - memcg = *ptr; - css_get(>css); - } else { - memcg = get_mem_cgroup_from_mm(mm); - } - if (mem_cgroup_is_root(memcg)) - goto done; if (consume_stock(memcg, nr_pages)) goto done; @@ -2887,10 +2869,8 @@ again: /* If killed, bypass charge */ if (test_thread_flag(TIF_MEMDIE) || - fatal_signal_pending(current)) { - css_put(>css); + fatal_signal_pending(current))
[Devel] [PATCH rh7 v2 18/21] ms/mm: memcontrol: use page lists for uncharge batching
From: Johannes WeinerPages are now uncharged at release time, and all sources of batched uncharges operate on lists of pages. Directly use those lists, and get rid of the per-task batching state. This also batches statistics accounting, in addition to the res counter charges, to reduce IRQ-disabling and re-enabling. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Cc: Naoya Horiguchi Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 747db954cab64c6b7a95b121b517165f34751898) Signed-off-by: Andrey Ryabinin --- include/linux/memcontrol.h | 12 +-- include/linux/sched.h | 6 -- kernel/fork.c | 4 - mm/memcontrol.c| 204 - mm/swap.c | 12 +-- mm/vmscan.c| 12 ++- 6 files changed, 122 insertions(+), 128 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index ca343bf..681f320 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -51,12 +51,8 @@ int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, void mem_cgroup_commit_charge(struct page *page, struct mem_cgroup *memcg, bool lrucare); void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg); - void mem_cgroup_uncharge(struct page *page); - -/* Batched uncharging */ -void mem_cgroup_uncharge_start(void); -void mem_cgroup_uncharge_end(void); +void mem_cgroup_uncharge_list(struct list_head *page_list); void mem_cgroup_migrate(struct page *oldpage, struct page *newpage, bool lrucare); @@ -244,11 +240,7 @@ static inline void mem_cgroup_uncharge(struct page *page) { } -static inline void mem_cgroup_uncharge_start(void) -{ -} - -static inline void mem_cgroup_uncharge_end(void) +static inline void mem_cgroup_uncharge_list(struct list_head *page_list) { } diff --git a/include/linux/sched.h b/include/linux/sched.h index 05a4b12..c945d93 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ 
-1721,12 +1721,6 @@ struct task_struct { struct ve_struct *task_ve; #endif #ifdef CONFIG_MEMCG /* memcg uses this to do batch job */ - struct memcg_batch_info { - int do_batch; /* incremented when batch uncharge started */ - struct mem_cgroup *memcg; /* target memcg of uncharge */ - unsigned long nr_pages; /* uncharged usage */ - unsigned long memsw_nr_pages; /* uncharged mem+swap usage */ - } memcg_batch; unsigned int memcg_kmem_skip_account; #endif #ifdef CONFIG_HAVE_HW_BREAKPOINT diff --git a/kernel/fork.c b/kernel/fork.c index 99e39cc..329dc42 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -1453,10 +1453,6 @@ static struct task_struct *copy_process(unsigned long clone_flags, #ifdef CONFIG_DEBUG_MUTEXES p->blocked_on = NULL; /* not blocked yet */ #endif -#ifdef CONFIG_MEMCG - p->memcg_batch.do_batch = 0; - p->memcg_batch.memcg = NULL; -#endif #ifdef CONFIG_BCACHE p->sequential_io= 0; p->sequential_io_avg= 0; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1eb0e85..16bb6aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3625,51 +3625,6 @@ out: return ret; } -/* - * Batch_start/batch_end is called in unmap_page_range/invlidate/trucate. - * In that cases, pages are freed continuously and we can expect pages - * are in the same memcg. All these calls itself limits the number of - * pages freed at once, then uncharge_start/end() is called properly. - * This may be called prural(2) times in a context, - */ - -void mem_cgroup_uncharge_start(void) -{ - unsigned long flags; - - local_irq_save(flags); - current->memcg_batch.do_batch++; - /* We can do nest. 
*/ - if (current->memcg_batch.do_batch == 1) { - current->memcg_batch.memcg = NULL; - current->memcg_batch.nr_pages = 0; - current->memcg_batch.memsw_nr_pages = 0; - } - local_irq_restore(flags); -} - -void mem_cgroup_uncharge_end(void) -{ - struct memcg_batch_info *batch = >memcg_batch; - unsigned long flags; - - local_irq_save(flags); - VM_BUG_ON(!batch->do_batch); - if (--batch->do_batch) /* If stacked, do nothing */ - goto out; - /* -* This "batch->memcg" is valid without any css_get/put etc... -* bacause we hide charges behind us. -*/ - if (batch->nr_pages) - page_counter_uncharge(>memcg->memory,
[Devel] [PATCH rh7 v2 16/21] ms/mm: memcontrol: rewrite charge API
From: Johannes WeinerThese patches rework memcg charge lifetime to integrate more naturally with the lifetime of user pages. This drastically simplifies the code and reduces charging and uncharging overhead. The most expensive part of charging and uncharging is the page_cgroup bit spinlock, which is removed entirely after this series. Here are the top-10 profile entries of a stress test that reads a 128G sparse file on a freshly booted box, without even a dedicated cgroup (i.e. executing in the root memcg). Before: 15.36% cat [kernel.kallsyms] [k] copy_user_generic_string 13.31% cat [kernel.kallsyms] [k] memset 11.48% cat [kernel.kallsyms] [k] do_mpage_readpage 4.23% cat [kernel.kallsyms] [k] get_page_from_freelist 2.38% cat [kernel.kallsyms] [k] put_page 2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge 2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common 1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn After: 15.67% cat [kernel.kallsyms] [k] copy_user_generic_string 13.48% cat [kernel.kallsyms] [k] memset 11.42% cat [kernel.kallsyms] [k] do_mpage_readpage 3.98% cat [kernel.kallsyms] [k] get_page_from_freelist 2.46% cat [kernel.kallsyms] [k] put_page 2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn 1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk 1.30% cat [kernel.kallsyms] [k] kfree As you can see, the memcg footprint has shrunk quite a bit. textdata bss dec hex filename 379709892 400 48262bc86 mm/memcontrol.o.old 352399892 400 45531b1db mm/memcontrol.o This patch (of 4): The memcg charge API charges pages before they are rmapped - i.e. have an actual "type" - and so every callsite needs its own set of charge and uncharge functions to know what type is being operated on. 
Worse, uncharge has to happen from a context that is still type-specific, rather than at the end of the page's lifetime with exclusive access, and so requires a lot of synchronization. Rewrite the charge API to provide a generic set of try_charge(), commit_charge() and cancel_charge() transaction operations, much like what's currently done for swap-in: mem_cgroup_try_charge() attempts to reserve a charge, reclaiming pages from the memcg if necessary. mem_cgroup_commit_charge() commits the page to the charge once it has a valid page->mapping and PageAnon() reliably tells the type. mem_cgroup_cancel_charge() aborts the transaction. This reduces the charge API and enables subsequent patches to drastically simplify uncharging. As pages need to be committed after rmap is established but before they are added to the LRU, page_add_new_anon_rmap() must stop doing LRU additions again. Revive lru_cache_add_active_or_unevictable(). [hu...@google.com: fix shmem_unuse] [hu...@google.com: Add comments on the private use of -EAGAIN] Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Hugh Dickins Cc: Naoya Horiguchi Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 00501b531c4723972aa11d6d4ebcf8d6552007c8) Signed-off-by: Andrey Ryabinin --- Documentation/cgroups/memcg_test.txt | 32 +-- include/linux/memcontrol.h | 53 ++--- include/linux/swap.h | 2 + kernel/events/uprobes.c | 8 + mm/filemap.c | 23 +- mm/huge_memory.c | 56 +++-- mm/memcontrol.c | 404 +++ mm/memory.c | 46 ++-- mm/rmap.c| 5 - mm/shmem.c | 38 ++-- mm/swap.c| 34 +++ mm/swapfile.c| 14 +- mm/userfaultfd.c | 7 +- 13 files changed, 348 insertions(+), 374 deletions(-) diff --git a/Documentation/cgroups/memcg_test.txt b/Documentation/cgroups/memcg_test.txt index ce94a83..bcf750d 100644 --- a/Documentation/cgroups/memcg_test.txt +++ b/Documentation/cgroups/memcg_test.txt @@ -24,24 +24,7 @@ Please note 
that implementation details can be changed.
[Devel] [PATCH rh7 v2 13/21] ms/mm: memcontrol: simplify move precharge function
From: Johannes WeinerThe move precharge function does some baroque things: it tries raw res_counter charging of the entire amount first, and then falls back to a loop of one-by-one charges, with checks for pending signals and cond_resched() batching. Just use mem_cgroup_try_charge() without __GFP_WAIT for the first bulk charge attempt. In the one-by-one loop, remove the signal check (this is already checked in try_charge), and simply call cond_resched() after every charge - it's not that expensive. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 9476db974d9e18885123fcebc09f4596bb922e5f) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 48 +++- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a1aab9f..32533bf 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6722,56 +6722,38 @@ static void mem_cgroup_css_free(struct cgroup *cont) #ifdef CONFIG_MMU /* Handlers for move charge at task migration. */ -#define PRECHARGE_COUNT_AT_ONCE256 static int mem_cgroup_do_precharge(unsigned long count) { int ret = 0; - int batch_count = PRECHARGE_COUNT_AT_ONCE; - struct mem_cgroup *memcg = mc.to; - if (mem_cgroup_is_root(memcg)) { + if (mem_cgroup_is_root(mc.to)) { mc.precharge += count; /* we don't need css_get for root */ return ret; } - /* try to charge at once */ - if (count > 1) { - struct page_counter *dummy; - /* -* "memcg" cannot be under rmdir() because we've already checked -* by cgroup_lock_live_cgroup() that it is not removed and we -* are still under the same cgroup_mutex. So we can postpone -* css_get(). 
-*/ - if (page_counter_try_charge(>memory, count, )) - goto one_by_one; - if (do_swap_account && - page_counter_try_charge(>memsw, count, )) { - page_counter_uncharge(>memory, count); - goto one_by_one; - } + + /* Try a single bulk charge without reclaim first */ + ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_WAIT, count); + if (!ret) { mc.precharge += count; return ret; } -one_by_one: - /* fall back to one by one charge */ + + /* Try charges one by one with reclaim */ while (count--) { - if (signal_pending(current)) { - ret = -EINTR; - break; - } - if (!batch_count--) { - batch_count = PRECHARGE_COUNT_AT_ONCE; - cond_resched(); - } - ret = mem_cgroup_try_charge(memcg, + ret = mem_cgroup_try_charge(mc.to, GFP_KERNEL & ~__GFP_NORETRY, 1); + /* +* In case of failure, any residual charges against +* mc.to will be dropped by mem_cgroup_clear_mc() +* later on. +*/ if (ret) - /* mem_cgroup_clear_mc() will do uncharge later */ return ret; mc.precharge++; + cond_resched(); } - return ret; + return 0; } /** -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 20/21] ms/mm: memcontrol: teach uncharge_list to deal with kmem pages
From: Vladimir DavydovPage table pages are batched-freed in release_pages on most architectures. If we want to charge them to kmemcg (this is what is done later in this series), we need to teach mem_cgroup_uncharge_list to handle kmem pages. Link: http://lkml.kernel.org/r/18d5c09e97f80074ed25b97a7d0f32b95d875717.1464079538.git.vdavy...@virtuozzo.com Signed-off-by: Vladimir Davydov Cc: Johannes Weiner Cc: Michal Hocko Cc: Eric Dumazet Cc: Minchan Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 5e8d35f849b1969b900695ae191326bfacf6bfc6) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 37 + 1 file changed, 21 insertions(+), 16 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 6c11788..0183a9c 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6922,15 +6922,16 @@ void mem_cgroup_cancel_charge(struct page *page, struct mem_cgroup *memcg) static void uncharge_batch(struct mem_cgroup *memcg, unsigned long pgpgout, unsigned long nr_mem, unsigned long nr_memsw, unsigned long nr_anon, unsigned long nr_file, - unsigned long nr_huge, struct page *dummy_page) + unsigned long nr_huge, unsigned long nr_kmem, + struct page *dummy_page) { unsigned long flags; if (!mem_cgroup_is_root(memcg)) { if (nr_mem) - page_counter_uncharge(>memory, nr_mem); + page_counter_uncharge(>memory, nr_mem + nr_kmem); if (nr_memsw) - page_counter_uncharge(>memsw, nr_memsw); + page_counter_uncharge(>memsw, nr_memsw + nr_kmem); memcg_oom_recover(memcg); } @@ -6952,6 +6953,7 @@ static void uncharge_list(struct list_head *page_list) unsigned long nr_anon = 0; unsigned long nr_file = 0; unsigned long nr_huge = 0; + unsigned long nr_kmem = 0; unsigned long pgpgout = 0; unsigned long nr_mem = 0; struct list_head *next; @@ -6981,23 +6983,26 @@ static void uncharge_list(struct list_head *page_list) if (memcg != pc->mem_cgroup) { if (memcg) { uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, - nr_anon, nr_file, 
nr_huge, page); - pgpgout = nr_mem = nr_memsw = 0; + nr_anon, nr_file, nr_huge, nr_kmem, page); + pgpgout = nr_mem = nr_memsw = nr_kmem = 0; nr_anon = nr_file = nr_huge = 0; } memcg = pc->mem_cgroup; } - if (PageTransHuge(page)) { - nr_pages <<= compound_order(page); - VM_BUG_ON_PAGE(!PageTransHuge(page), page); - nr_huge += nr_pages; - } - - if (PageAnon(page)) - nr_anon += nr_pages; - else - nr_file += nr_pages; + if (!PageKmemcg(page)) { + if (PageTransHuge(page)) { + nr_pages <<= compound_order(page); + VM_BUG_ON_PAGE(!PageTransHuge(page), page); + nr_huge += nr_pages; + } + if (PageAnon(page)) + nr_anon += nr_pages; + else + nr_file += nr_pages; + pgpgout++; + } else + nr_kmem += 1 << compound_order(page); if (pc->flags & PCG_MEM) nr_mem += nr_pages; @@ -7010,7 +7015,7 @@ static void uncharge_list(struct list_head *page_list) if (memcg) uncharge_batch(memcg, pgpgout, nr_mem, nr_memsw, - nr_anon, nr_file, nr_huge, page); + nr_anon, nr_file, nr_huge, nr_kmem, page); } /** -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 09/21] ms/mm: memcontrol: reclaim at least once for __GFP_NORETRY
From: Johannes WeinerCurrently, __GFP_NORETRY tries charging once and gives up before even trying to reclaim. Bring the behavior on par with the page allocator and reclaim at least once before giving up. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 28c34c291e746aab1c2bfd6d6609b2e47fa0978b) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index f006cdd..3608d80 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2804,13 +2804,13 @@ retry: if (!(gfp_mask & __GFP_WAIT)) goto nomem; - if (gfp_mask & __GFP_NORETRY) - goto nomem; - nr_reclaimed = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); if (mem_cgroup_margin(mem_over_limit) >= batch) goto retry; + + if (gfp_mask & __GFP_NORETRY) + goto nomem; /* * Even though the limit is exceeded at this point, reclaim * may have been able to free some pages. Retry the charge -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 14/21] ms/mm: memcontrol: catch root bypass in move precharge
From: Johannes WeinerWhen mem_cgroup_try_charge() returns -EINTR, it bypassed the charge to the root memcg. But move precharging does not catch this and treats this case as if no charge had happened, thus leaking a charge against root. Because of an old optimization, the root memcg's res_counter is not actually charged right now, but it's still an imbalance and subsequent patches will charge the root memcg again. Catch those bypasses to the root memcg and properly cancel them before giving up the move. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 692e7c45d95ad1064b6911800e2cfec7fc0236db) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 9 - 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 32533bf..cddfb93 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6738,6 +6738,10 @@ static int mem_cgroup_do_precharge(unsigned long count) mc.precharge += count; return ret; } + if (ret == -EINTR) { + __mem_cgroup_cancel_charge(root_mem_cgroup, count); + return ret; + } /* Try charges one by one with reclaim */ while (count--) { @@ -6746,8 +6750,11 @@ static int mem_cgroup_do_precharge(unsigned long count) /* * In case of failure, any residual charges against * mc.to will be dropped by mem_cgroup_clear_mc() -* later on. +* later on. However, cancel any charges that are +* bypassed to root right away or they'll be lost. */ + if (ret == -EINTR) + __mem_cgroup_cancel_charge(root_mem_cgroup, 1); if (ret) return ret; mc.precharge++; -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 15/21] ms/mm: memcontrol: remove ordering between pc->mem_cgroup and PageCgroupUsed
From: Johannes WeinerThere is a write barrier between setting pc->mem_cgroup and PageCgroupUsed, which was added to allow LRU operations to lookup the memcg LRU list of a page without acquiring the page_cgroup lock. But ever since commit 38c5d72f3ebe ("memcg: simplify LRU handling by new rule"), pages are ensured to be off-LRU while charging, so nobody else is changing LRU state while pc->mem_cgroup is being written, and there are no read barriers anymore. Remove the unnecessary write barrier. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 9a2385eef9f28fb5260c48c45fc8fe01f1da70a6) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 9 - 1 file changed, 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index cddfb93..2b04b1e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2977,14 +2977,6 @@ static void __mem_cgroup_commit_charge(struct mem_cgroup *memcg, } pc->mem_cgroup = memcg; - /* -* We access a page_cgroup asynchronously without lock_page_cgroup(). -* Especially when a page_cgroup is taken from a page, pc->mem_cgroup -* is accessed after testing USED bit. To make pc->mem_cgroup visible -* before USED bit, we need memory barrier here. -* See mem_cgroup_add_lru_list(), etc. -*/ - smp_wmb(); SetPageCgroupUsed(pc); if (lrucare) { @@ -3520,7 +3512,6 @@ void mem_cgroup_split_huge_fixup(struct page *head) for (i = 1; i < HPAGE_PMD_NR; i++) { pc = head_pc + i; pc->mem_cgroup = memcg; - smp_wmb();/* see __commit_charge() */ pc->flags = head_pc->flags & ~PCGF_NOCOPY_AT_SPLIT; } __this_cpu_sub(memcg->stat->count[MEM_CGROUP_STAT_RSS_HUGE], -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 21/21] ms/mm: memcontrol: only mark charged pages with PageKmemcg
From: Vladimir DavydovTo distinguish non-slab pages charged to kmemcg we mark them PageKmemcg, which sets page->_mapcount to -512. Currently, we set/clear PageKmemcg in __alloc_pages_nodemask()/free_pages_prepare() for any page allocated with __GFP_ACCOUNT, including those that aren't actually charged to any cgroup, i.e. allocated from the root cgroup context. To avoid overhead in case cgroups are not used, we only do that if memcg_kmem_enabled() is true. The latter is set iff there are kmem-enabled memory cgroups (online or offline). The root cgroup is not considered kmem-enabled. As a result, if a page is allocated with __GFP_ACCOUNT for the root cgroup when there are kmem-enabled memory cgroups and is freed after all kmem-enabled memory cgroups were removed, e.g. # no memory cgroups has been created yet, create one mkdir /sys/fs/cgroup/memory/test # run something allocating pages with __GFP_ACCOUNT, e.g. # a program using pipe dmesg | tail # remove the memory cgroup rmdir /sys/fs/cgroup/memory/test we'll get bad page state bug complaining about page->_mapcount != -1: BUG: Bad page state in process swapper/0 pfn:1fd945c page:ea007f651700 count:0 mapcount:-511 mapping: (null) index:0x0 flags: 0x1000() To avoid that, let's mark with PageKmemcg only those pages that are actually charged to and hence pin a non-root memory cgroup. 
Fixes: 4949148ad433 ("mm: charge/uncharge kmemcg from generic page allocator paths") Reported-and-tested-by: Eric Dumazet Signed-off-by: Vladimir Davydov Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit c4159a75b64c0e67caededf4d7372c1b58a5f42a) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 0183a9c..dc83f4e 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -7001,8 +7001,10 @@ static void uncharge_list(struct list_head *page_list) else nr_file += nr_pages; pgpgout++; - } else + } else { nr_kmem += 1 << compound_order(page); + __ClearPageKmemcg(page); + } if (pc->flags & PCG_MEM) nr_mem += nr_pages; -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 12/21] ms/mm: memcontrol: remove explicit OOM parameter in charge path
From: Michal HockoFor the page allocator, __GFP_NORETRY implies that no OOM should be triggered, whereas memcg has an explicit parameter to disable OOM. The only callsites that want OOM disabled are THP charges and charge moving. THP already uses __GFP_NORETRY and charge moving can use it as well - one full reclaim cycle should be plenty. Switch it over, then remove the OOM parameter. Signed-off-by: Johannes Weiner Signed-off-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 0029e19ebf84dcd70b226820daa7747b28d5956d) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 51 +-- 1 file changed, 13 insertions(+), 38 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index a53d55d..a1aab9f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2748,15 +2748,13 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, * mem_cgroup_try_charge - try charging a memcg * @memcg: memcg to charge * @nr_pages: number of pages to charge - * @oom: trigger OOM if reclaim fails * * Returns 0 if @memcg was charged successfully, -EINTR if the charge * was bypassed to root_mem_cgroup, and -ENOMEM if the charge failed. 
*/ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, -unsigned int nr_pages, -bool oom) +unsigned int nr_pages) { unsigned int batch = max(CHARGE_BATCH, nr_pages); int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; @@ -2838,9 +2836,6 @@ retry: if (fatal_signal_pending(current)) goto bypass; - if (!oom) - goto nomem; - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages); @@ -2868,15 +2863,14 @@ done: */ static struct mem_cgroup *mem_cgroup_try_charge_mm(struct mm_struct *mm, gfp_t gfp_mask, -unsigned int nr_pages, -bool oom) +unsigned int nr_pages) { struct mem_cgroup *memcg; int ret; memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages, oom); + ret = mem_cgroup_try_charge(memcg, gfp_mask, nr_pages); css_put(>css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -3044,15 +3038,8 @@ int memcg_charge_kmem(struct mem_cgroup *memcg, gfp_t gfp, { struct page_counter *counter; int ret = 0; - bool may_oom; - /* -* Conditions under which we can wait for the oom_killer. Those are -* the same conditions tested by the core page allocator -*/ - may_oom = (gfp & __GFP_FS) && !(gfp & __GFP_NORETRY); - - ret = mem_cgroup_try_charge(memcg, gfp, nr_pages, may_oom); + ret = mem_cgroup_try_charge(memcg, gfp, nr_pages); if (ret == -EINTR) { /* * mem_cgroup_try_charge() chosed to bypass to root due to @@ -3686,7 +3673,6 @@ int mem_cgroup_newpage_charge(struct page *page, { unsigned int nr_pages = 1; struct mem_cgroup *memcg; - bool oom = true; if (mem_cgroup_disabled()) return 0; @@ -3698,14 +3684,9 @@ int mem_cgroup_newpage_charge(struct page *page, if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); - /* -* Never OOM-kill a process for a huge page. The -* fault handler will fall back to regular pages. 
-*/ - oom = false; } - memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages, oom); + memcg = mem_cgroup_try_charge_mm(mm, gfp_mask, nr_pages); if (!memcg) return -ENOMEM; __mem_cgroup_commit_charge(memcg, page, nr_pages, @@ -3742,7 +3723,7 @@ static int __mem_cgroup_try_charge_swapin(struct mm_struct *mm, memcg = try_get_mem_cgroup_from_page(page); if (!memcg) memcg = get_mem_cgroup_from_mm(mm); - ret = mem_cgroup_try_charge(memcg, mask, 1, true); + ret = mem_cgroup_try_charge(memcg, mask, 1); css_put(&memcg->css); if (ret == -EINTR) memcg = root_mem_cgroup; @@ -3769,7 +3750,7 @@ int mem_cgroup_try_charge_swapin(struct mm_struct *mm, struct page *page, if (!PageSwapCache(page)) { struct
[Devel] [PATCH rh7 v2 10/21] ms/mm: huge_memory: use GFP_TRANSHUGE when charging huge pages
From: Johannes WeinerTransparent huge page charges prefer falling back to regular pages rather than spending a lot of time in direct reclaim. Desired reclaim behavior is usually declared in the gfp mask, but THP charges use GFP_KERNEL and then rely on the fact that OOM is disabled for THP charges, and that OOM-disabled charges don't retry reclaim. Needless to say, this is anything but obvious and quite error prone. Convert THP charges to use GFP_TRANSHUGE instead, which implies __GFP_NORETRY, to indicate the low-latency requirement. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit d51d885bbb137cc8e1704e76be1846c5e0d5e8b4) Signed-off-by: Andrey Ryabinin --- mm/huge_memory.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index c406494..14ed98b 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -708,7 +708,7 @@ static int __do_huge_pmd_anonymous_page(struct mm_struct *mm, VM_BUG_ON_PAGE(!PageCompound(page), page); - if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_newpage_charge(page, mm, GFP_TRANSHUGE))) { put_page(page); count_vm_event(THP_FAULT_FALLBACK); return VM_FAULT_FALLBACK; @@ -1241,7 +1241,7 @@ alloc: goto out; } - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) { + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_TRANSHUGE))) { put_page(new_page); if (page) { split_huge_page(page); @@ -2524,7 +2524,7 @@ static void collapse_huge_page(struct mm_struct *mm, if (!new_page) return; - if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_KERNEL))) + if (unlikely(mem_cgroup_newpage_charge(new_page, mm, GFP_TRANSHUGE))) return; /* -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 11/21] ms/mm: memcontrol: retry reclaim for oom-disabled and __GFP_NOFAIL charges
From: Johannes WeinerThere is no reason why oom-disabled and __GFP_NOFAIL charges should try to reclaim only once when every other charge tries several times before giving up. Make them all retry the same number of times. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 9b1306192d335759a6cf2f3b404c49e811e5f953) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 3608d80..a53d55d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2759,7 +2759,7 @@ static int mem_cgroup_try_charge(struct mem_cgroup *memcg, bool oom) { unsigned int batch = max(CHARGE_BATCH, nr_pages); - int nr_oom_retries = MEM_CGROUP_RECLAIM_RETRIES; + int nr_retries = MEM_CGROUP_RECLAIM_RETRIES; struct mem_cgroup *mem_over_limit; struct page_counter *counter; unsigned long nr_reclaimed; @@ -2829,6 +2829,9 @@ retry: if (mem_cgroup_wait_acct_move(mem_over_limit)) goto retry; + if (nr_retries--) + goto retry; + if (gfp_mask & __GFP_NOFAIL) goto bypass; @@ -2838,9 +2841,6 @@ retry: if (!oom) goto nomem; - if (nr_oom_retries--) - goto retry; - mem_cgroup_oom(mem_over_limit, gfp_mask, get_order(batch)); mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages); -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 07/21] ms/mm: memcontrol: fold mem_cgroup_do_charge()
From: Johannes WeinerThese patches rework memcg charge lifetime to integrate more naturally with the lifetime of user pages. This drastically simplifies the code and reduces charging and uncharging overhead. The most expensive part of charging and uncharging is the page_cgroup bit spinlock, which is removed entirely after this series. Here are the top-10 profile entries of a stress test that reads a 128G sparse file on a freshly booted box, without even a dedicated cgroup (i.e. executing in the root memcg). Before: 15.36% cat [kernel.kallsyms] [k] copy_user_generic_string 13.31% cat [kernel.kallsyms] [k] memset 11.48% cat [kernel.kallsyms] [k] do_mpage_readpage 4.23% cat [kernel.kallsyms] [k] get_page_from_freelist 2.38% cat [kernel.kallsyms] [k] put_page 2.32% cat [kernel.kallsyms] [k] __mem_cgroup_commit_charge 2.18% kswapd0 [kernel.kallsyms] [k] __mem_cgroup_uncharge_common 1.92% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.86% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.62% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn After: 15.67% cat [kernel.kallsyms] [k] copy_user_generic_string 13.48% cat [kernel.kallsyms] [k] memset 11.42% cat [kernel.kallsyms] [k] do_mpage_readpage 3.98% cat [kernel.kallsyms] [k] get_page_from_freelist 2.46% cat [kernel.kallsyms] [k] put_page 2.13% kswapd0 [kernel.kallsyms] [k] shrink_page_list 1.88% cat [kernel.kallsyms] [k] __radix_tree_lookup 1.67% cat [kernel.kallsyms] [k] __pagevec_lru_add_fn 1.39% kswapd0 [kernel.kallsyms] [k] free_pcppages_bulk 1.30% cat [kernel.kallsyms] [k] kfree As you can see, the memcg footprint has shrunk quite a bit. textdata bss dec hex filename 379709892 400 48262bc86 mm/memcontrol.o.old 352399892 400 45531b1db mm/memcontrol.o This patch (of 13): This function was split out because mem_cgroup_try_charge() got too big. But having essentially one sequence of operations arbitrarily split in half is not good for reworking the code. Fold it back in. 
Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Cc: Hugh Dickins Cc: Tejun Heo Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 6539cc053869bd32a2db731b215b7c73b11f68d3) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 191 +--- 1 file changed, 57 insertions(+), 134 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1e5d914..f904257 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2744,86 +2744,6 @@ static int memcg_cpu_hotplug_callback(struct notifier_block *nb, return NOTIFY_OK; } - -/* See mem_cgroup_try_charge() for details */ -enum { - CHARGE_OK, /* success */ - CHARGE_RETRY, /* need to retry but retry is not bad */ - CHARGE_NOMEM, /* we can't do more. return -ENOMEM */ - CHARGE_WOULDBLOCK, /* GFP_WAIT wasn't set and no enough res. */ -}; - -static int mem_cgroup_do_charge(struct mem_cgroup *memcg, gfp_t gfp_mask, - unsigned int nr_pages, unsigned int min_pages, - bool invoke_oom) -{ - struct mem_cgroup *mem_over_limit; - struct page_counter *counter; - unsigned long flags = 0; - int ret; - - ret = page_counter_try_charge(>memory, nr_pages, ); - - if (likely(!ret)) { - if (!do_swap_account) - return CHARGE_OK; - ret = page_counter_try_charge(>memsw, nr_pages, ); - if (likely(!ret)) - return CHARGE_OK; - - page_counter_uncharge(>memory, nr_pages); - mem_over_limit = mem_cgroup_from_counter(counter, memsw); - flags |= MEM_CGROUP_RECLAIM_NOSWAP; - } else - mem_over_limit = mem_cgroup_from_counter(counter, memory); - /* -* Never reclaim on behalf of optional batching, retry with a -* single page instead. -*/ - if (nr_pages > min_pages) - return CHARGE_RETRY; - - if (!(gfp_mask & __GFP_WAIT)) { - mem_cgroup_inc_failcnt(mem_over_limit, gfp_mask, nr_pages); - return CHARGE_WOULDBLOCK; - } - - ret = mem_cgroup_reclaim(mem_over_limit, gfp_mask, flags); - if
[Devel] [PATCH rh7 v2 05/21] ms/memcg: do not replicate get_mem_cgroup_from_mm in __mem_cgroup_try_charge
From: Michal Hocko__mem_cgroup_try_charge duplicates get_mem_cgroup_from_mm for charges which came without a memcg. The only reason seems to be a tiny optimization when css_tryget is not called if the charge can be consumed from the stock. Nevertheless css_tryget is very cheap since it has been reworked to use per-cpu counting so this optimization doesn't give us anything these days. So let's drop the code duplication so that the code is more readable. Signed-off-by: Michal Hocko Signed-off-by: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit b6b6cc72bc404c952968530d7df4c3a4ab82b65b) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 50 ++ 1 file changed, 6 insertions(+), 44 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 40ac81b..d6413c5 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2873,52 +2873,14 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; - if (mem_cgroup_is_root(memcg)) - goto done; - if (consume_stock(memcg, nr_pages)) - goto done; css_get(>css); } else { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(mm->owner); - /* -* Because we don't have task_lock(), "p" can exit. -* In that case, "memcg" can point to root or p can be NULL with -* race with swapoff. Then, we have small risk of mis-accouning. -* But such kind of mis-account by race always happens because -* we don't have cgroup_mutex(). It's overkill and we allo that -* small race, here. -* (*) swapoff at el will charge against mm-struct not against -* task-struct. So, mm->owner can be NULL. -*/ - memcg = mem_cgroup_from_task(p); - if (!memcg) - memcg = root_mem_cgroup; - if (mem_cgroup_is_root(memcg)) { - rcu_read_unlock(); - goto done; - } - if (consume_stock(memcg, nr_pages)) { - /* -* It seems dagerous to access memcg without css_get(). 
-* But considering how consume_stok works, it's not -* necessary. If consume_stock success, some charges -* from this memcg are cached on this cpu. So, we -* don't need to call css_get()/css_tryget() before -* calling consume_stock(). -*/ - rcu_read_unlock(); - goto done; - } - /* after here, we may be blocked. we need to get refcnt */ - if (!css_tryget(&memcg->css)) { - rcu_read_unlock(); - goto again; - } - rcu_read_unlock(); + memcg = get_mem_cgroup_from_mm(mm); } + if (mem_cgroup_is_root(memcg)) + goto done; + if (consume_stock(memcg, nr_pages)) + goto done; do { bool invoke_oom = oom && !nr_oom_retries; @@ -2986,8 +2948,8 @@ again: try_to_free_mem_cgroup_pages(iter, nr_pages, gfp_mask, false); } while ((iter = parent_mem_cgroup(iter))); - css_put(&memcg->css); done: + css_put(&memcg->css); *ptr = memcg; return 0; nomem: -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 02/21] ms/mm: memcg: push !mm handling out to page cache charge function
From: Johannes WeinerOnly page cache charges can happen without an mm context, so push this special case out of the inner core and into the cache charge function. An ancient comment explains that the mm can also be NULL in case the task is currently being migrated, but that is not actually true with the current case, so just remove it. Signed-off-by: Johannes Weiner Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 284f39afeaa4ab1409b8f43b29cdea3007960ee3) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 15 ++- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index be1c492..7b2a99f 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2870,15 +2870,6 @@ static int __mem_cgroup_try_charge(struct mm_struct *mm, if (gfp_mask & __GFP_NOFAIL) oom = false; - - /* -* We always charge the cgroup the mm_struct belongs to. -* The mm_struct's mem_cgroup changes on task migration if the -* thread group leader migrates. It's possible that mm is not -* set, if so charge the root memcg (happens for pagecache usage). -*/ - if (!*ptr && !mm) - *ptr = root_mem_cgroup; again: if (*ptr) { /* css should be a valid one */ memcg = *ptr; @@ -3971,6 +3962,12 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, return 0; if (!PageSwapCache(page)) { + /* +* Page cache insertions can happen without an actual +* task context, e.g. during disk probing on boot. +*/ + if (!mm) + memcg = root_mem_cgroup; ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, , true); if (ret != -ENOMEM) __mem_cgroup_commit_charge(memcg, page, 1, type, false); -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel
[Devel] [PATCH rh7 v2 01/21] ms/mm: memcg: inline mem_cgroup_charge_common()
From: Johannes Weinermem_cgroup_charge_common() is used by both cache and anon pages, but most of its body only applies to anon pages and the remainder is not worth having in a separate function. Signed-off-by: Johannes Weiner Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds https://jira.sw.ru/browse/PSBM-51558 (cherry picked from commit 1bec6b333e241a9db47d3939fb08a4e174ece02f) Signed-off-by: Andrey Ryabinin --- mm/memcontrol.c | 40 1 file changed, 16 insertions(+), 24 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d23ca87..be1c492 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3818,20 +3818,21 @@ out: return ret; } -/* - * Charge the memory controller for page usage. - * Return - * 0 if the charge was successful - * < 0 if the cgroup is over its limit - */ -static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, - gfp_t gfp_mask, enum charge_type ctype) +int mem_cgroup_newpage_charge(struct page *page, + struct mm_struct *mm, gfp_t gfp_mask) { struct mem_cgroup *memcg = NULL; unsigned int nr_pages = 1; bool oom = true; int ret; + if (mem_cgroup_disabled()) + return 0; + + VM_BUG_ON_PAGE(page_mapped(page), page); + VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); + VM_BUG_ON(!mm); + if (PageTransHuge(page)) { nr_pages <<= compound_order(page); VM_BUG_ON_PAGE(!PageTransHuge(page), page); @@ -3845,22 +3846,11 @@ static int mem_cgroup_charge_common(struct page *page, struct mm_struct *mm, ret = __mem_cgroup_try_charge(mm, gfp_mask, nr_pages, , oom); if (ret == -ENOMEM) return ret; - __mem_cgroup_commit_charge(memcg, page, nr_pages, ctype, false); + __mem_cgroup_commit_charge(memcg, page, nr_pages, + MEM_CGROUP_CHARGE_TYPE_ANON, false); return 0; } -int mem_cgroup_newpage_charge(struct page *page, - struct mm_struct *mm, gfp_t gfp_mask) -{ - if (mem_cgroup_disabled()) - return 0; - VM_BUG_ON_PAGE(page_mapped(page), page); - VM_BUG_ON_PAGE(page->mapping && !PageAnon(page), page); - 
VM_BUG_ON(!mm); - return mem_cgroup_charge_common(page, mm, gfp_mask, - MEM_CGROUP_CHARGE_TYPE_ANON); -} - /* * While swap-in, try_charge -> commit or cancel, the page is locked. * And when try_charge() successfully returns, one refcnt to memcg without * being charged can be registered as precharge here. * "commit()" or removal by "cancel()" */ @@ -3980,9 +3970,11 @@ int mem_cgroup_cache_charge(struct page *page, struct mm_struct *mm, if (PageCompound(page)) return 0; - if (!PageSwapCache(page)) - ret = mem_cgroup_charge_common(page, mm, gfp_mask, type); - else { /* page is swapcache/shmem */ + if (!PageSwapCache(page)) { + ret = __mem_cgroup_try_charge(mm, gfp_mask, 1, &memcg, true); + if (ret != -ENOMEM) + __mem_cgroup_commit_charge(memcg, page, 1, type, false); + } else { /* page is swapcache/shmem */ ret = __mem_cgroup_try_charge_swapin(mm, page, gfp_mask, &memcg); if (!ret) -- 2.10.2 ___ Devel mailing list Devel@openvz.org https://lists.openvz.org/mailman/listinfo/devel