Re: [PATCHv6 01/37] mm, shmem: switch huge tmpfs to multi-order radix-tree entries

2017-02-13 Thread Kirill A. Shutemov
On Thu, Feb 09, 2017 at 07:58:20PM +0300, Kirill A. Shutemov wrote:
> I'll look into it.

I ended up with this (I'll test it more later):

void filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct radix_tree_iter iter;
	void **slot;
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	loff_t size;
	struct page *page;
	bool mapped;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
			start_pgoff) {
		unsigned long index = iter.index;
		if (index < start_pgoff)
			index = start_pgoff;
		if (index > end_pgoff)
			break;
		mapped = false;	/* no PTE set up for this entry yet */
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page))
				slot = radix_tree_iter_retry(&iter);
			continue;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(page);
			goto repeat;
		}

		/* For multi-order entries, find relevant subpage */
		page = find_subpage(page, index);

		if (!PageUptodate(page) || PageReadahead(page))
			goto skip;
		if (!trylock_page(page))
			goto skip;

		/* Recheck under the page lock: truncation may have raced. */
		if (page_mapping(page) != mapping || !PageUptodate(page))
			goto skip_unlock;

		size = round_up(i_size_read(mapping->host), PAGE_SIZE);
		if (compound_head(page)->index >= size >> PAGE_SHIFT)
			goto skip_unlock;

		if (file->f_ra.mmap_miss > 0)
			file->f_ra.mmap_miss--;
map_next_subpage:
		if (PageHWPoison(page))
			goto next;

		vmf->address += (index - last_pgoff) << PAGE_SHIFT;
		if (vmf->pte)
			vmf->pte += index - last_pgoff;
		last_pgoff = index;
		mapped = !alloc_set_pte(vmf, NULL, page);

		/* Huge page is mapped or last index? No need to proceed. */
		if (pmd_trans_huge(*vmf->pmd) ||
				index == end_pgoff) {
			unlock_page(page);
			break;
		}
next:
		if (page && PageCompound(page)) {
			/* Last subpage handled? */
			if ((index & (compound_nr_pages(page) - 1)) ==
					compound_nr_pages(page) - 1)
				goto skip_unlock;
			index++;
			page++;

			/*
			 * One page reference goes to the page table mapping.
			 * An additional reference is needed if the last
			 * alloc_set_pte() succeeded.
			 */
			if (mapped)
				get_page(page);
			goto map_next_subpage;
		}
skip_unlock:
		unlock_page(page);
skip:
		iter.index = compound_head(page)->index +
			compound_nr_pages(page) - 1;
		/* Only give up the reference if alloc_set_pte() failed. */
		if (!mapped)
			put_page(page);
	}
	rcu_read_unlock();
}
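
For reference, compound_nr_pages() used above isn't defined in this mail;
a minimal sketch of what the code assumes (my reconstruction; the real
helper presumably comes from elsewhere in the series):

	/* Number of base pages in the compound page containing @page
	 * (1 for an order-0 page). */
	static inline unsigned long compound_nr_pages(struct page *page)
	{
		return 1UL << compound_order(compound_head(page));
	}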

-- 
 Kirill A. Shutemov


Re: [PATCHv6 01/37] mm, shmem: switch huge tmpfs to multi-order radix-tree entries

2017-02-09 Thread Kirill A. Shutemov
On Wed, Feb 08, 2017 at 07:57:27PM -0800, Matthew Wilcox wrote:
> On Thu, Jan 26, 2017 at 02:57:43PM +0300, Kirill A. Shutemov wrote:
> > +++ b/include/linux/pagemap.h
> > @@ -332,6 +332,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
> > mapping_gfp_mask(mapping));
> >  }
> >  
> > +static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> > +{
> > +   VM_BUG_ON_PAGE(PageTail(page), page);
> > +   VM_BUG_ON_PAGE(page->index > offset, page);
> > +   VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) < offset,
> > +   page);
> > +   return page - page->index + offset;
> > +}
> 
> What would you think to:
> 
> static inline void check_page_index(struct page *page, pgoff_t offset)
> {
>   VM_BUG_ON_PAGE(PageTail(page), page);
>   VM_BUG_ON_PAGE(page->index > offset, page);
>   VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) <= offset,
>   page);
> }
> 
> (I think I fixed an off-by-one error up there ...  if
> index + (1 << order) == offset, this is also a bug, right?
> because offset would then refer to the next page, not this page)

Right, thanks.
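
To make the off-by-one concrete, here's a userspace toy model of the
corrected check (illustrative only; the helper name is made up):

	#include <assert.h>

	/* Models check_page_index(): @offset must fall inside the compound
	 * page that starts at @pg_index and spans 1 << @order base pages. */
	static void check_page_index_model(unsigned long pg_index,
					   unsigned int order,
					   unsigned long offset)
	{
		assert(pg_index <= offset);
		/* '<=' in the VM_BUG_ON means offset == pg_index + (1 << order)
		 * is caught too: that offset already belongs to the *next* page. */
		assert(pg_index + (1UL << order) > offset);
	}

	int main(void)
	{
		check_page_index_model(4, 2, 7);  /* order-2 page at 4 covers 4..7: ok */
		/* check_page_index_model(4, 2, 8) would trip: 8 is the next page */
		return 0;
	}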

> 
> static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> {
>   check_page_index(page, offset);
>   return page + (offset - page->index);
> }
> 
> ... then you can use check_page_index down ...

Okay, makes sense.

> 
> > @@ -1250,7 +1233,6 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
> > put_page(page);
> > goto repeat;
> > }
> > -   VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
> 
> ... here?

Ok.

> > @@ -1472,25 +1451,35 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
> > goto repeat;
> > }
> >  
> > +   /* For multi-order entries, find relevant subpage */
> > +   if (PageTransHuge(page)) {
> > +   VM_BUG_ON(index - page->index < 0);
> > +   VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> > +   page += index - page->index;
> > +   }
> 
> Use find_subpage() here?

Ok.
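
i.e. the open-coded block above becomes simply:

	/* For multi-order entries, find relevant subpage */
	page = find_subpage(page, index);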

> > pages[ret] = page;
> > if (++ret == nr_pages)
> > break;
> > +   if (!PageTransCompound(page))
> > +   continue;
> > +   for (refs = 0; ret < nr_pages &&
> > +   (index + 1) % HPAGE_PMD_NR;
> > +   ret++, refs++, index++)
> > +   pages[ret] = ++page;
> > +   if (refs)
> > +   page_ref_add(compound_head(page), refs);
> > +   if (ret == nr_pages)
> > +   break;
> 
> Can we avoid referencing huge pages specifically in the page cache?  I'd
> like us to get to the point where we can put arbitrary compound pages into
> the page cache.  For example, I think this can be written as:
> 
>   if (!PageCompound(page))
>   continue;
>   for (refs = 0; ret < nr_pages; refs++, index++) {
>   if (index > page->index + (1 << compound_order(page)))
>   break;
>   pages[ret++] = ++page;
>   }
>   if (refs)
>   page_ref_add(compound_head(page), refs);
>   if (ret == nr_pages)
>   break;

That's slightly more costly (it rereads compound_order() on every
iteration instead of a cheap modulo by the constant HPAGE_PMD_NR), but I
guess that's fine.

> > @@ -1541,19 +1533,12 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
> >  
> > +   /* For multi-order entries, find relevant subpage */
> > +   if (PageTransHuge(page)) {
> > +   VM_BUG_ON(index - page->index < 0);
> > +   VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> > +   page += index - page->index;
> > +   }
> > +
> > pages[ret] = page;
> > if (++ret == nr_pages)
> > break;
> > +   if (!PageTransCompound(page))
> > +   continue;
> > +   for (refs = 0; ret < nr_pages &&
> > +   (index + 1) % HPAGE_PMD_NR;
> > +   ret++, refs++, index++)
> > +   pages[ret] = ++page;
> > +   if (refs)
> > +   page_ref_add(compound_head(page), refs);
> > +   if (ret == nr_pages)
> > +   break;
> > }
> > rcu_read_unlock();
> > return ret;
> 
> Ugh, the same code again.  Hmm ... we only need to modify 'ret' as a result
> of this ... so could we split it out like this?
> 
> static unsigned populate_pages(struct page **pages, unsigned i, unsigned max,
> struct page *page)
> {
> unsigned refs = 0;
> for (;;) {
> pages[i++] = page;
>   

Re: [PATCHv6 01/37] mm, shmem: switch huge tmpfs to multi-order radix-tree entries

2017-02-08 Thread Matthew Wilcox
On Thu, Jan 26, 2017 at 02:57:43PM +0300, Kirill A. Shutemov wrote:
> +++ b/include/linux/pagemap.h
> @@ -332,6 +332,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
>   mapping_gfp_mask(mapping));
>  }
>  
> +static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> +{
> + VM_BUG_ON_PAGE(PageTail(page), page);
> + VM_BUG_ON_PAGE(page->index > offset, page);
> + VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) < offset,
> + page);
> + return page - page->index + offset;
> +}

What would you think to:

static inline void check_page_index(struct page *page, pgoff_t offset)
{
VM_BUG_ON_PAGE(PageTail(page), page);
VM_BUG_ON_PAGE(page->index > offset, page);
VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) <= offset,
page);
}

(I think I fixed an off-by-one error up there ...  if
index + (1 << order) == offset, this is also a bug, right?
because offset would then refer to the next page, not this page)

static inline struct page *find_subpage(struct page *page, pgoff_t offset)
{
check_page_index(page, offset);
return page + (offset - page->index);
}

... then you can use check_page_index down ...

> @@ -1250,7 +1233,6 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
>   put_page(page);
>   goto repeat;
>   }
> - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);

... here?

> @@ -1472,25 +1451,35 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
>   goto repeat;
>   }
>  
> + /* For multi-order entries, find relevant subpage */
> + if (PageTransHuge(page)) {
> + VM_BUG_ON(index - page->index < 0);
> + VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> + page += index - page->index;
> + }

Use find_subpage() here?

>   pages[ret] = page;
>   if (++ret == nr_pages)
>   break;
> + if (!PageTransCompound(page))
> + continue;
> + for (refs = 0; ret < nr_pages &&
> + (index + 1) % HPAGE_PMD_NR;
> + ret++, refs++, index++)
> + pages[ret] = ++page;
> + if (refs)
> + page_ref_add(compound_head(page), refs);
> + if (ret == nr_pages)
> + break;

Can we avoid referencing huge pages specifically in the page cache?  I'd
like us to get to the point where we can put arbitrary compound pages into
the page cache.  For example, I think this can be written as:

if (!PageCompound(page))
continue;
for (refs = 0; ret < nr_pages; refs++, index++) {
if (index > page->index + (1 << compound_order(page)))
break;
pages[ret++] = ++page;
}
if (refs)
page_ref_add(compound_head(page), refs);
if (ret == nr_pages)
break;

> @@ -1541,19 +1533,12 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
>  
> + /* For multi-order entries, find relevant subpage */
> + if (PageTransHuge(page)) {
> + VM_BUG_ON(index - page->index < 0);
> + VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> + page += index - page->index;
> + }
> +
>   pages[ret] = page;
>   if (++ret == nr_pages)
>   break;
> + if (!PageTransCompound(page))
> + continue;
> + for (refs = 0; ret < nr_pages &&
> + (index + 1) % HPAGE_PMD_NR;
> + ret++, refs++, index++)
> + pages[ret] = ++page;
> + if (refs)
> + page_ref_add(compound_head(page), refs);
> + if (ret == nr_pages)
> + break;
>   }
>   rcu_read_unlock();
>   return ret;

Ugh, the same code again.  Hmm ... we only need to modify 'ret' as a result
of this ... so could we split it out like this?

static unsigned populate_pages(struct page **pages, unsigned i, unsigned max,
struct page *page)
{
unsigned refs = 0;
for (;;) {
pages[i++] = page;
if (i == max)
break;
if (PageHead(page + 1))
break;
page++;
refs++;
}
if (refs)
page_ref_add(compound_head(page), refs);
return i;
}
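
Both find_get_pages() and find_get_pages_contig() could then collapse
their duplicated blocks to a single call, roughly (hypothetical sketch of
the refactor, not posted code):

	/* stores @page plus any following tail pages, bumps refcounts */
	ret = populate_pages(pages, ret, nr_pages, page);
	if (ret == nr_pages)
		break;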


[PATCHv6 01/37] mm, shmem: switch huge tmpfs to multi-order radix-tree entries

2017-01-26 Thread Kirill A. Shutemov
We would need to use multi-order radix-tree entries for ext4 and other
filesystems to have a coherent view of tags (dirty/towrite) in the tree.

This patch converts the huge tmpfs implementation to multi-order entries,
so we will be able to use the same code path for all filesystems.

We also change the interface of the page-cache lookup functions (see the
illustration after the references below):

  - functions that look up pages[1] return the subpage of a THP relevant
to the requested index;

  - functions that look up entries[2] return one entry per THP, with the
index pointing to the head page (basically, rounded down to a multiple
of HPAGE_PMD_NR);

This provides balanced exposure of multi-order entries to the rest of
the kernel.

[1] find_get_pages(), pagecache_get_page(), pagevec_lookup(), etc.
[2] find_get_entry(), find_get_entries(), pagevec_lookup_entries(), etc.
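
For illustration, given a THP occupying page-cache indices 512..1023, the
two lookup families behave like this (my example, assuming the semantics
described above; not code from the patch):

	struct page *p;

	/* Page lookups return the subpage for the requested index ... */
	p = find_get_page(mapping, 515);   /* tail page, page_to_pgoff(p) == 515 */

	/* ... while entry lookups return a single entry for the whole THP,
	 * with the index rounded down to the head page. */
	p = find_get_entry(mapping, 515);  /* head page, p->index == 512 */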

Signed-off-by: Kirill A. Shutemov 
---
 include/linux/pagemap.h |   9 ++
 mm/filemap.c            | 236 ++--
 mm/huge_memory.c        |  48 +++---
 mm/khugepaged.c         |  26 ++
 mm/shmem.c              | 117 ++--
 mm/truncate.c           |  15 ++-
 6 files changed, 235 insertions(+), 216 deletions(-)

diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h
index 324c8dbad1e1..ad63a7be5a5e 100644
--- a/include/linux/pagemap.h
+++ b/include/linux/pagemap.h
@@ -332,6 +332,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
mapping_gfp_mask(mapping));
 }
 
+static inline struct page *find_subpage(struct page *page, pgoff_t offset)
+{
+   VM_BUG_ON_PAGE(PageTail(page), page);
+   VM_BUG_ON_PAGE(page->index > offset, page);
+   VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) < offset,
+   page);
+   return page - page->index + offset;
+}
+
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset);
 struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset);
 unsigned find_get_entries(struct address_space *mapping, pgoff_t start,
diff --git a/mm/filemap.c b/mm/filemap.c
index b772a33ef640..837a71a2a412 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -150,7 +150,9 @@ static int page_cache_tree_insert(struct address_space *mapping,
 static void page_cache_tree_delete(struct address_space *mapping,
   struct page *page, void *shadow)
 {
-   int i, nr;
+   struct radix_tree_node *node;
+   void **slot;
+   int nr;
 
/* hugetlb pages are represented by one entry in the radix tree */
nr = PageHuge(page) ? 1 : hpage_nr_pages(page);
@@ -159,19 +161,12 @@ static void page_cache_tree_delete(struct address_space *mapping,
 	VM_BUG_ON_PAGE(PageTail(page), page);
 	VM_BUG_ON_PAGE(nr != 1 && shadow, page);
 
-   for (i = 0; i < nr; i++) {
-   struct radix_tree_node *node;
-   void **slot;
+   __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot);
+   VM_BUG_ON_PAGE(!node && nr != 1, page);
 
-   __radix_tree_lookup(&mapping->page_tree, page->index + i,
-   &node, &slot);
-
-   VM_BUG_ON_PAGE(!node && nr != 1, page);
-
-   radix_tree_clear_tags(&mapping->page_tree, node, slot);
-   __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
-   workingset_update_node, mapping);
-   }
+   radix_tree_clear_tags(&mapping->page_tree, node, slot);
+   __radix_tree_replace(&mapping->page_tree, node, slot, shadow,
+   workingset_update_node, mapping);
 
if (shadow) {
mapping->nrexceptional += nr;
@@ -285,12 +280,7 @@ void delete_from_page_cache(struct page *page)
if (freepage)
freepage(page);
 
-   if (PageTransHuge(page) && !PageHuge(page)) {
-   page_ref_sub(page, HPAGE_PMD_NR);
-   VM_BUG_ON_PAGE(page_count(page) <= 0, page);
-   } else {
-   put_page(page);
-   }
+   put_page(page);
 }
 EXPORT_SYMBOL(delete_from_page_cache);
 
@@ -1172,7 +1162,7 @@ EXPORT_SYMBOL(page_cache_prev_hole);
 struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
 {
void **pagep;
-   struct page *head, *page;
+   struct page *page;
 
rcu_read_lock();
 repeat:
@@ -1193,15 +1183,8 @@ struct page *find_get_entry(struct address_space *mapping, pgoff_t offset)
goto out;
}
 
-   head = compound_head(page);
-   if (!page_cache_get_speculative(head))
-   goto repeat;
-
-   /* The page was split under us? */
-   if (compound_head(page) != head) {
-   put_page(head);
+   if (!page_cache_get_speculative(page))
goto repeat;
-   }
 
/*
 * Has the