Re: [PATCH v4 4/4] mm: introduce MADV_PAGEOUT
On Fri, Jul 12, 2019 at 09:58:09AM -0400, Johannes Weiner wrote:
> On Fri, Jul 12, 2019 at 02:18:28PM +0900, Minchan Kim wrote:
> > Hi Johannes,
> > 
> > On Thu, Jul 11, 2019 at 02:42:23PM -0400, Johannes Weiner wrote:
> > > On Thu, Jul 11, 2019 at 10:25:28AM +0900, Minchan Kim wrote:
> > > > @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma,
> > > >  	return 0;
> > > >  }
> > > > 
> > > > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > > > +				unsigned long end, struct mm_walk *walk)
> > > > +{
> > > > +	struct mmu_gather *tlb = walk->private;
> > > > +	struct mm_struct *mm = tlb->mm;
> > > > +	struct vm_area_struct *vma = walk->vma;
> > > > +	pte_t *orig_pte, *pte, ptent;
> > > > +	spinlock_t *ptl;
> > > > +	LIST_HEAD(page_list);
> > > > +	struct page *page;
> > > > +	unsigned long next;
> > > > +
> > > > +	if (fatal_signal_pending(current))
> > > > +		return -EINTR;
> > > > +
> > > > +	next = pmd_addr_end(addr, end);
> > > > +	if (pmd_trans_huge(*pmd)) {
> > > > +		pmd_t orig_pmd;
> > > > +
> > > > +		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> > > > +		ptl = pmd_trans_huge_lock(pmd, vma);
> > > > +		if (!ptl)
> > > > +			return 0;
> > > > +
> > > > +		orig_pmd = *pmd;
> > > > +		if (is_huge_zero_pmd(orig_pmd))
> > > > +			goto huge_unlock;
> > > > +
> > > > +		if (unlikely(!pmd_present(orig_pmd))) {
> > > > +			VM_BUG_ON(thp_migration_supported() &&
> > > > +					!is_pmd_migration_entry(orig_pmd));
> > > > +			goto huge_unlock;
> > > > +		}
> > > > +
> > > > +		page = pmd_page(orig_pmd);
> > > > +		if (next - addr != HPAGE_PMD_SIZE) {
> > > > +			int err;
> > > > +
> > > > +			if (page_mapcount(page) != 1)
> > > > +				goto huge_unlock;
> > > > +			get_page(page);
> > > > +			spin_unlock(ptl);
> > > > +			lock_page(page);
> > > > +			err = split_huge_page(page);
> > > > +			unlock_page(page);
> > > > +			put_page(page);
> > > > +			if (!err)
> > > > +				goto regular_page;
> > > > +			return 0;
> > > > +		}
> > > > +
> > > > +		if (isolate_lru_page(page))
> > > > +			goto huge_unlock;
> > > > +
> > > > +		if (pmd_young(orig_pmd)) {
> > > > +			pmdp_invalidate(vma, addr, pmd);
> > > > +			orig_pmd = pmd_mkold(orig_pmd);
> > > > +
> > > > +			set_pmd_at(mm, addr, pmd, orig_pmd);
> > > > +			tlb_remove_tlb_entry(tlb, pmd, addr);
> > > > +		}
> > > > +
> > > > +		ClearPageReferenced(page);
> > > > +		test_and_clear_page_young(page);
> > > > +		list_add(&page->lru, &page_list);
> > > > +huge_unlock:
> > > > +		spin_unlock(ptl);
> > > > +		reclaim_pages(&page_list);
> > > > +		return 0;
> > > > +	}
> > > > +
> > > > +	if (pmd_trans_unstable(pmd))
> > > > +		return 0;
> > > > +regular_page:
> > > > +	tlb_change_page_size(tlb, PAGE_SIZE);
> > > > +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > > > +	flush_tlb_batched_pending(mm);
> > > > +	arch_enter_lazy_mmu_mode();
> > > > +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> > > > +		ptent = *pte;
> > > > +		if (!pte_present(ptent))
> > > > +			continue;
> > > > +
> > > > +		page = vm_normal_page(vma, addr, ptent);
> > > > +		if (!page)
> > > > +			continue;
> > > > +
> > > > +		/*
> > > > +		 * creating a THP page is expensive so split it only if we
> > > > +		 * are sure it's worth. Split it if we are only owner.
> > > > +		 */
> > > > +		if (PageTransCompound(page)) {
> > > > +			if (page_mapcount(page) != 1)
> > > > +				break;
> > > > +			get_page(page);
> > > > +			if (!trylock_page(page)) {
> > > > +				put_page(page);
> > > > +				break;
> > > > +			}
> > > > +			pte_unmap_unlock(orig_pte, ptl);
> > > > +			if (split_huge_page(page)) {
> > > > +				unlock_page(page);
> > > > +
Re: [PATCH v4 4/4] mm: introduce MADV_PAGEOUT
On Fri 12-07-19 09:58:09, Johannes Weiner wrote:
[...]
> > @@ -423,6 +445,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> > 
> >  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
> > 
> > +		if (pageout) {
> > +			if (isolate_lru_page(page))
> > +				continue;
> > +			list_add(&page->lru, &page_list);
> > +		}
> > +
> >  		if (pte_young(ptent)) {
> >  			ptent = ptep_get_and_clear_full(mm, addr, pte,
> >  							tlb->fullmm);
> 
> One thought on the ordering here.
> 
> When LRU isolation fails, it would still make sense to clear the young
> bit: we cannot reclaim the page as we wanted to, but the user still
> provided a clear hint that the page is cold and she won't be touching
> it for a while. MADV_PAGEOUT is basically MADV_COLD + try_to_reclaim.
> So IMO isolation should go to the end next to deactivate_page().

Makes sense to me.

-- 
Michal Hocko
SUSE Labs
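A minimal sketch of the suggested ordering (illustrative only, not code from the posted series): the pte-level path would apply the aging unconditionally and only then attempt LRU isolation, so a failed isolation still leaves the page marked cold:

		if (pte_young(ptent)) {
			ptent = ptep_get_and_clear_full(mm, addr, pte,
							tlb->fullmm);
			ptent = pte_mkold(ptent);
			set_pte_at(mm, addr, pte, ptent);
			tlb_remove_tlb_entry(tlb, pte, addr);
		}

		/* Age the page whether or not the isolation below succeeds. */
		ClearPageReferenced(page);
		test_and_clear_page_young(page);

		if (pageout) {
			if (!isolate_lru_page(page))
				list_add(&page->lru, &page_list);
		} else {
			deactivate_page(page);
		}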
Re: [PATCH v4 4/4] mm: introduce MADV_PAGEOUT
On Fri, Jul 12, 2019 at 02:18:28PM +0900, Minchan Kim wrote:
> Hi Johannes,
> 
> On Thu, Jul 11, 2019 at 02:42:23PM -0400, Johannes Weiner wrote:
> > On Thu, Jul 11, 2019 at 10:25:28AM +0900, Minchan Kim wrote:
> > > @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma,
> > >  	return 0;
> > >  }
> > > 
> > > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > > +				unsigned long end, struct mm_walk *walk)
> > > +{
> > > +	struct mmu_gather *tlb = walk->private;
> > > +	struct mm_struct *mm = tlb->mm;
> > > +	struct vm_area_struct *vma = walk->vma;
> > > +	pte_t *orig_pte, *pte, ptent;
> > > +	spinlock_t *ptl;
> > > +	LIST_HEAD(page_list);
> > > +	struct page *page;
> > > +	unsigned long next;
> > > +
> > > +	if (fatal_signal_pending(current))
> > > +		return -EINTR;
> > > +
> > > +	next = pmd_addr_end(addr, end);
> > > +	if (pmd_trans_huge(*pmd)) {
> > > +		pmd_t orig_pmd;
> > > +
> > > +		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> > > +		ptl = pmd_trans_huge_lock(pmd, vma);
> > > +		if (!ptl)
> > > +			return 0;
> > > +
> > > +		orig_pmd = *pmd;
> > > +		if (is_huge_zero_pmd(orig_pmd))
> > > +			goto huge_unlock;
> > > +
> > > +		if (unlikely(!pmd_present(orig_pmd))) {
> > > +			VM_BUG_ON(thp_migration_supported() &&
> > > +					!is_pmd_migration_entry(orig_pmd));
> > > +			goto huge_unlock;
> > > +		}
> > > +
> > > +		page = pmd_page(orig_pmd);
> > > +		if (next - addr != HPAGE_PMD_SIZE) {
> > > +			int err;
> > > +
> > > +			if (page_mapcount(page) != 1)
> > > +				goto huge_unlock;
> > > +			get_page(page);
> > > +			spin_unlock(ptl);
> > > +			lock_page(page);
> > > +			err = split_huge_page(page);
> > > +			unlock_page(page);
> > > +			put_page(page);
> > > +			if (!err)
> > > +				goto regular_page;
> > > +			return 0;
> > > +		}
> > > +
> > > +		if (isolate_lru_page(page))
> > > +			goto huge_unlock;
> > > +
> > > +		if (pmd_young(orig_pmd)) {
> > > +			pmdp_invalidate(vma, addr, pmd);
> > > +			orig_pmd = pmd_mkold(orig_pmd);
> > > +
> > > +			set_pmd_at(mm, addr, pmd, orig_pmd);
> > > +			tlb_remove_tlb_entry(tlb, pmd, addr);
> > > +		}
> > > +
> > > +		ClearPageReferenced(page);
> > > +		test_and_clear_page_young(page);
> > > +		list_add(&page->lru, &page_list);
> > > +huge_unlock:
> > > +		spin_unlock(ptl);
> > > +		reclaim_pages(&page_list);
> > > +		return 0;
> > > +	}
> > > +
> > > +	if (pmd_trans_unstable(pmd))
> > > +		return 0;
> > > +regular_page:
> > > +	tlb_change_page_size(tlb, PAGE_SIZE);
> > > +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > > +	flush_tlb_batched_pending(mm);
> > > +	arch_enter_lazy_mmu_mode();
> > > +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> > > +		ptent = *pte;
> > > +		if (!pte_present(ptent))
> > > +			continue;
> > > +
> > > +		page = vm_normal_page(vma, addr, ptent);
> > > +		if (!page)
> > > +			continue;
> > > +
> > > +		/*
> > > +		 * creating a THP page is expensive so split it only if we
> > > +		 * are sure it's worth. Split it if we are only owner.
> > > +		 */
> > > +		if (PageTransCompound(page)) {
> > > +			if (page_mapcount(page) != 1)
> > > +				break;
> > > +			get_page(page);
> > > +			if (!trylock_page(page)) {
> > > +				put_page(page);
> > > +				break;
> > > +			}
> > > +			pte_unmap_unlock(orig_pte, ptl);
> > > +			if (split_huge_page(page)) {
> > > +				unlock_page(page);
> > > +				put_page(page);
> > > +				pte_offset_map_lock(mm, pmd, addr, &ptl);
> > > +				break;
> > > +			}
> > > +			unlock_page(page);
> > > +			put_page(page);
> > > +			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> > > +			pte--;
> > > +			addr -= PAGE_SIZE;
> > > +			continue;
> > > +		}
> > > +
> > > +		VM_BUG_ON_PAGE(PageTransCompound(page), page);
> > > +
> > > +		if (isolate_lru_page(page))
> > > +			continue;
> > > +
> > > +		if (pte_young(ptent)) {
> > > +			ptent = ptep_get_and_clear_full(mm, addr, pte,
> > > +							tlb->fullmm);
> > > +			ptent = pte_mkold(ptent);
> > > +
Re: [PATCH v4 4/4] mm: introduce MADV_PAGEOUT
On Fri 12-07-19 14:18:28, Minchan Kim wrote:
[...]
> >From 41592f23e876ec21e49dc3c76dc89538e2bb16be Mon Sep 17 00:00:00 2001
> From: Minchan Kim
> Date: Fri, 12 Jul 2019 14:05:36 +0900
> Subject: [PATCH] mm: factor out common parts between MADV_COLD and
>  MADV_PAGEOUT
> 
> There are many common parts between MADV_COLD and MADV_PAGEOUT.
> This patch factors them out to save code duplication.

This looks better indeed. I still hope that this can get improved even
further but let's do that in a follow up patch.

> Signed-off-by: Minchan Kim

Acked-by: Michal Hocko

> ---
>  mm/madvise.c | 201 +--
>  1 file changed, 52 insertions(+), 149 deletions(-)
> 
> diff --git a/mm/madvise.c b/mm/madvise.c
> index bc2f0138982e..3d3d14517cc8 100644
> --- a/mm/madvise.c
> +++ b/mm/madvise.c
> @@ -30,6 +30,11 @@
> 
>  #include "internal.h"
> 
> +struct madvise_walk_private {
> +	struct mmu_gather *tlb;
> +	bool pageout;
> +};
> +
>  /*
>   * Any behaviour which results in changes to the vma->vm_flags needs to
>   * take mmap_sem for writing. Others, which simply traverse vmas, need
> @@ -310,16 +315,23 @@ static long madvise_willneed(struct vm_area_struct *vma,
>  	return 0;
>  }
> 
> -static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> -				unsigned long end, struct mm_walk *walk)
> +static int madvise_cold_or_pageout_pte_range(pmd_t *pmd,
> +				unsigned long addr, unsigned long end,
> +				struct mm_walk *walk)
>  {
> -	struct mmu_gather *tlb = walk->private;
> +	struct madvise_walk_private *private = walk->private;
> +	struct mmu_gather *tlb = private->tlb;
> +	bool pageout = private->pageout;
>  	struct mm_struct *mm = tlb->mm;
>  	struct vm_area_struct *vma = walk->vma;
>  	pte_t *orig_pte, *pte, ptent;
>  	spinlock_t *ptl;
> -	struct page *page;
>  	unsigned long next;
> +	struct page *page = NULL;
> +	LIST_HEAD(page_list);
> +
> +	if (fatal_signal_pending(current))
> +		return -EINTR;
> 
>  	next = pmd_addr_end(addr, end);
>  	if (pmd_trans_huge(*pmd)) {
> @@ -358,6 +370,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  			return 0;
>  		}
> 
> +		if (pageout) {
> +			if (isolate_lru_page(page))
> +				goto huge_unlock;
> +			list_add(&page->lru, &page_list);
> +		}
> +
>  		if (pmd_young(orig_pmd)) {
>  			pmdp_invalidate(vma, addr, pmd);
>  			orig_pmd = pmd_mkold(orig_pmd);
> 
> @@ -366,10 +384,14 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  			tlb_remove_pmd_tlb_entry(tlb, pmd, addr);
>  		}
> 
> +		ClearPageReferenced(page);
>  		test_and_clear_page_young(page);
> -		deactivate_page(page);
>  huge_unlock:
>  		spin_unlock(ptl);
> +		if (pageout)
> +			reclaim_pages(&page_list);
> +		else
> +			deactivate_page(page);
>  		return 0;
>  	}
> 
> @@ -423,6 +445,12 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
> 
>  		VM_BUG_ON_PAGE(PageTransCompound(page), page);
> 
> +		if (pageout) {
> +			if (isolate_lru_page(page))
> +				continue;
> +			list_add(&page->lru, &page_list);
> +		}
> +
>  		if (pte_young(ptent)) {
>  			ptent = ptep_get_and_clear_full(mm, addr, pte,
>  							tlb->fullmm);
> @@ -437,12 +465,16 @@ static int madvise_cold_pte_range(pmd_t *pmd, unsigned long addr,
>  		 * As a side effect, it makes confuse idle-page tracking
>  		 * because they will miss recent referenced history.
>  		 */
> +		ClearPageReferenced(page);
>  		test_and_clear_page_young(page);
> -		deactivate_page(page);
> +		if (!pageout)
> +			deactivate_page(page);
>  	}
> 
>  	arch_enter_lazy_mmu_mode();
>  	pte_unmap_unlock(orig_pte, ptl);
> +	if (pageout)
> +		reclaim_pages(&page_list);
>  	cond_resched();
> 
>  	return 0;
> @@ -452,10 +484,15 @@ static void madvise_cold_page_range(struct mmu_gather *tlb,
>  			struct vm_area_struct *vma,
>  			unsigned long addr, unsigned long end)
>  {
> +	struct madvise_walk_private walk_private = {
> +		.tlb = tlb,
> +		.pageout = false,
> +	};
> +
>  	struct mm_walk cold_walk = {
> -		.pmd_entry = madvise_cold_pte_range,
> +		.pmd_entry = madvise_cold_or_pageout_pte_range,
>  		.mm = vma->vm_mm,
> -
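For reference, the pageout side of this refactor could then be a thin wrapper that just flips the new flag. The sketch below is illustrative only: the quoted hunk is truncated before the walk setup, so the tlb_start_vma()/walk_page_range()/tlb_end_vma() sequence and the struct mm_walk fields are assumed to mirror madvise_cold_page_range() as used elsewhere in this series:

static void madvise_pageout_page_range(struct mmu_gather *tlb,
			struct vm_area_struct *vma,
			unsigned long addr, unsigned long end)
{
	struct madvise_walk_private walk_private = {
		.pageout = true,	/* reclaim instead of just deactivating */
		.tlb = tlb,
	};

	struct mm_walk pageout_walk = {
		.pmd_entry = madvise_cold_or_pageout_pte_range,
		.mm = vma->vm_mm,
		.private = &walk_private,
	};

	tlb_start_vma(tlb, vma);
	walk_page_range(addr, end, &pageout_walk);
	tlb_end_vma(tlb, vma);
}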
Re: [PATCH v4 4/4] mm: introduce MADV_PAGEOUT
Hi Johannes,

On Thu, Jul 11, 2019 at 02:42:23PM -0400, Johannes Weiner wrote:
> On Thu, Jul 11, 2019 at 10:25:28AM +0900, Minchan Kim wrote:
> > @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma,
> >  	return 0;
> >  }
> > 
> > +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> > +				unsigned long end, struct mm_walk *walk)
> > +{
> > +	struct mmu_gather *tlb = walk->private;
> > +	struct mm_struct *mm = tlb->mm;
> > +	struct vm_area_struct *vma = walk->vma;
> > +	pte_t *orig_pte, *pte, ptent;
> > +	spinlock_t *ptl;
> > +	LIST_HEAD(page_list);
> > +	struct page *page;
> > +	unsigned long next;
> > +
> > +	if (fatal_signal_pending(current))
> > +		return -EINTR;
> > +
> > +	next = pmd_addr_end(addr, end);
> > +	if (pmd_trans_huge(*pmd)) {
> > +		pmd_t orig_pmd;
> > +
> > +		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> > +		ptl = pmd_trans_huge_lock(pmd, vma);
> > +		if (!ptl)
> > +			return 0;
> > +
> > +		orig_pmd = *pmd;
> > +		if (is_huge_zero_pmd(orig_pmd))
> > +			goto huge_unlock;
> > +
> > +		if (unlikely(!pmd_present(orig_pmd))) {
> > +			VM_BUG_ON(thp_migration_supported() &&
> > +					!is_pmd_migration_entry(orig_pmd));
> > +			goto huge_unlock;
> > +		}
> > +
> > +		page = pmd_page(orig_pmd);
> > +		if (next - addr != HPAGE_PMD_SIZE) {
> > +			int err;
> > +
> > +			if (page_mapcount(page) != 1)
> > +				goto huge_unlock;
> > +			get_page(page);
> > +			spin_unlock(ptl);
> > +			lock_page(page);
> > +			err = split_huge_page(page);
> > +			unlock_page(page);
> > +			put_page(page);
> > +			if (!err)
> > +				goto regular_page;
> > +			return 0;
> > +		}
> > +
> > +		if (isolate_lru_page(page))
> > +			goto huge_unlock;
> > +
> > +		if (pmd_young(orig_pmd)) {
> > +			pmdp_invalidate(vma, addr, pmd);
> > +			orig_pmd = pmd_mkold(orig_pmd);
> > +
> > +			set_pmd_at(mm, addr, pmd, orig_pmd);
> > +			tlb_remove_tlb_entry(tlb, pmd, addr);
> > +		}
> > +
> > +		ClearPageReferenced(page);
> > +		test_and_clear_page_young(page);
> > +		list_add(&page->lru, &page_list);
> > +huge_unlock:
> > +		spin_unlock(ptl);
> > +		reclaim_pages(&page_list);
> > +		return 0;
> > +	}
> > +
> > +	if (pmd_trans_unstable(pmd))
> > +		return 0;
> > +regular_page:
> > +	tlb_change_page_size(tlb, PAGE_SIZE);
> > +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> > +	flush_tlb_batched_pending(mm);
> > +	arch_enter_lazy_mmu_mode();
> > +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> > +		ptent = *pte;
> > +		if (!pte_present(ptent))
> > +			continue;
> > +
> > +		page = vm_normal_page(vma, addr, ptent);
> > +		if (!page)
> > +			continue;
> > +
> > +		/*
> > +		 * creating a THP page is expensive so split it only if we
> > +		 * are sure it's worth. Split it if we are only owner.
> > +		 */
> > +		if (PageTransCompound(page)) {
> > +			if (page_mapcount(page) != 1)
> > +				break;
> > +			get_page(page);
> > +			if (!trylock_page(page)) {
> > +				put_page(page);
> > +				break;
> > +			}
> > +			pte_unmap_unlock(orig_pte, ptl);
> > +			if (split_huge_page(page)) {
> > +				unlock_page(page);
> > +				put_page(page);
> > +				pte_offset_map_lock(mm, pmd, addr, &ptl);
> > +				break;
> > +			}
> > +			unlock_page(page);
> > +			put_page(page);
> > +			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> > +			pte--;
> > +			addr -= PAGE_SIZE;
> > +			continue;
> > +		}
> > +
> > +		VM_BUG_ON_PAGE(PageTransCompound(page), page);
> > +
> > +		if (isolate_lru_page(page))
> > +			continue;
> > +
> > +		if (pte_young(ptent)) {
> > +			ptent = ptep_get_and_clear_full(mm, addr, pte,
> > +							tlb->fullmm);
> > +			ptent = pte_mkold(ptent);
> > +			set_pte_at(mm, addr, pte, ptent);
> > +			tlb_remove_tlb_entry(tlb, pte, addr);
> > +
Re: [PATCH v4 4/4] mm: introduce MADV_PAGEOUT
On Thu, Jul 11, 2019 at 10:25:28AM +0900, Minchan Kim wrote:
> @@ -480,6 +482,198 @@ static long madvise_cold(struct vm_area_struct *vma,
>  	return 0;
>  }
> 
> +static int madvise_pageout_pte_range(pmd_t *pmd, unsigned long addr,
> +				unsigned long end, struct mm_walk *walk)
> +{
> +	struct mmu_gather *tlb = walk->private;
> +	struct mm_struct *mm = tlb->mm;
> +	struct vm_area_struct *vma = walk->vma;
> +	pte_t *orig_pte, *pte, ptent;
> +	spinlock_t *ptl;
> +	LIST_HEAD(page_list);
> +	struct page *page;
> +	unsigned long next;
> +
> +	if (fatal_signal_pending(current))
> +		return -EINTR;
> +
> +	next = pmd_addr_end(addr, end);
> +	if (pmd_trans_huge(*pmd)) {
> +		pmd_t orig_pmd;
> +
> +		tlb_change_page_size(tlb, HPAGE_PMD_SIZE);
> +		ptl = pmd_trans_huge_lock(pmd, vma);
> +		if (!ptl)
> +			return 0;
> +
> +		orig_pmd = *pmd;
> +		if (is_huge_zero_pmd(orig_pmd))
> +			goto huge_unlock;
> +
> +		if (unlikely(!pmd_present(orig_pmd))) {
> +			VM_BUG_ON(thp_migration_supported() &&
> +					!is_pmd_migration_entry(orig_pmd));
> +			goto huge_unlock;
> +		}
> +
> +		page = pmd_page(orig_pmd);
> +		if (next - addr != HPAGE_PMD_SIZE) {
> +			int err;
> +
> +			if (page_mapcount(page) != 1)
> +				goto huge_unlock;
> +			get_page(page);
> +			spin_unlock(ptl);
> +			lock_page(page);
> +			err = split_huge_page(page);
> +			unlock_page(page);
> +			put_page(page);
> +			if (!err)
> +				goto regular_page;
> +			return 0;
> +		}
> +
> +		if (isolate_lru_page(page))
> +			goto huge_unlock;
> +
> +		if (pmd_young(orig_pmd)) {
> +			pmdp_invalidate(vma, addr, pmd);
> +			orig_pmd = pmd_mkold(orig_pmd);
> +
> +			set_pmd_at(mm, addr, pmd, orig_pmd);
> +			tlb_remove_tlb_entry(tlb, pmd, addr);
> +		}
> +
> +		ClearPageReferenced(page);
> +		test_and_clear_page_young(page);
> +		list_add(&page->lru, &page_list);
> +huge_unlock:
> +		spin_unlock(ptl);
> +		reclaim_pages(&page_list);
> +		return 0;
> +	}
> +
> +	if (pmd_trans_unstable(pmd))
> +		return 0;
> +regular_page:
> +	tlb_change_page_size(tlb, PAGE_SIZE);
> +	orig_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, addr, &ptl);
> +	flush_tlb_batched_pending(mm);
> +	arch_enter_lazy_mmu_mode();
> +	for (; addr < end; pte++, addr += PAGE_SIZE) {
> +		ptent = *pte;
> +		if (!pte_present(ptent))
> +			continue;
> +
> +		page = vm_normal_page(vma, addr, ptent);
> +		if (!page)
> +			continue;
> +
> +		/*
> +		 * creating a THP page is expensive so split it only if we
> +		 * are sure it's worth. Split it if we are only owner.
> +		 */
> +		if (PageTransCompound(page)) {
> +			if (page_mapcount(page) != 1)
> +				break;
> +			get_page(page);
> +			if (!trylock_page(page)) {
> +				put_page(page);
> +				break;
> +			}
> +			pte_unmap_unlock(orig_pte, ptl);
> +			if (split_huge_page(page)) {
> +				unlock_page(page);
> +				put_page(page);
> +				pte_offset_map_lock(mm, pmd, addr, &ptl);
> +				break;
> +			}
> +			unlock_page(page);
> +			put_page(page);
> +			pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
> +			pte--;
> +			addr -= PAGE_SIZE;
> +			continue;
> +		}
> +
> +		VM_BUG_ON_PAGE(PageTransCompound(page), page);
> +
> +		if (isolate_lru_page(page))
> +			continue;
> +
> +		if (pte_young(ptent)) {
> +			ptent = ptep_get_and_clear_full(mm, addr, pte,
> +							tlb->fullmm);
> +			ptent = pte_mkold(ptent);
> +			set_pte_at(mm, addr, pte, ptent);
> +			tlb_remove_tlb_entry(tlb, pte, addr);
> +		}
> +		ClearPageReferenced(page);
> +		test_and_clear_page_young(page);
> +		list_add(&page->lru,
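As background for the interface under review, a minimal userspace sketch of how the new hint would be used. The numeric fallback value below is an assumption taken from the uapi headers rather than from this posting; prefer the definition shipped with a kernel that carries the series:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/mman.h>

#ifndef MADV_PAGEOUT
#define MADV_PAGEOUT 21	/* fallback; use the value from <linux/mman.h> if available */
#endif

int main(void)
{
	size_t len = 64UL << 20;	/* 64MB of anonymous memory */
	char *buf = mmap(NULL, len, PROT_READ | PROT_WRITE,
			 MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);

	if (buf == MAP_FAILED) {
		perror("mmap");
		return 1;
	}

	/* Touch the pages so there is something resident to reclaim. */
	for (size_t i = 0; i < len; i += 4096)
		buf[i] = 1;

	/* Hint that the range is cold and should be reclaimed right away. */
	if (madvise(buf, len, MADV_PAGEOUT))
		perror("madvise(MADV_PAGEOUT)");

	return 0;
}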