Re: [PATCHv6 01/37] mm, shmem: switch huge tmpfs to multi-order radix-tree entries

2017-02-13 Thread Kirill A. Shutemov
On Thu, Feb 09, 2017 at 07:58:20PM +0300, Kirill A. Shutemov wrote:
> I'll look into it.

I ended up with this (I'll test it more later):

void filemap_map_pages(struct vm_fault *vmf,
		pgoff_t start_pgoff, pgoff_t end_pgoff)
{
	struct radix_tree_iter iter;
	void **slot;
	struct file *file = vmf->vma->vm_file;
	struct address_space *mapping = file->f_mapping;
	pgoff_t last_pgoff = start_pgoff;
	loff_t size;
	struct page *page;
	bool mapped;

	rcu_read_lock();
	radix_tree_for_each_slot(slot, &mapping->page_tree, &iter,
			start_pgoff) {
		unsigned long index = iter.index;
		if (index < start_pgoff)
			index = start_pgoff;
		if (index > end_pgoff)
			break;
repeat:
		page = radix_tree_deref_slot(slot);
		if (unlikely(!page))
			continue;
		if (radix_tree_exception(page)) {
			if (radix_tree_deref_retry(page))
				slot = radix_tree_iter_retry(&iter);
			continue;
		}

		if (!page_cache_get_speculative(page))
			goto repeat;

		/* Has the page moved? */
		if (unlikely(page != *slot)) {
			put_page(page);
			goto repeat;
		}

		/* For multi-order entries, find relevant subpage */
		page = find_subpage(page, index);

		/* No PTE has been set up yet: skip paths must drop our ref */
		mapped = false;

		if (!PageUptodate(page) || PageReadahead(page))
			goto skip;
		if (!trylock_page(page))
			goto skip;

		if (page_mapping(page) != mapping || !PageUptodate(page))
			goto skip_unlock;

		size = round_up(i_size_read(mapping->host), PAGE_SIZE);
		if (compound_head(page)->index >= size >> PAGE_SHIFT)
			goto skip_unlock;

		if (file->f_ra.mmap_miss > 0)
			file->f_ra.mmap_miss--;
map_next_subpage:
		if (PageHWPoison(page)) {
			/* Reference for this subpage was not consumed */
			mapped = false;
			goto next;
		}

		vmf->address += (index - last_pgoff) << PAGE_SHIFT;
		if (vmf->pte)
			vmf->pte += index - last_pgoff;
		last_pgoff = index;
		mapped = !alloc_set_pte(vmf, NULL, page);

		/* Huge page is mapped or last index? No need to proceed. */
		if (pmd_trans_huge(*vmf->pmd) ||
				index == end_pgoff) {
			unlock_page(page);
			/* Only keep the reference if alloc_set_pte() took it */
			if (!mapped)
				put_page(page);
			break;
		}
next:
		if (PageCompound(page)) {
			/* Last subpage handled? */
			if ((index & (compound_nr_pages(page) - 1)) ==
					compound_nr_pages(page) - 1)
				goto skip_unlock;
			index++;
			page++;

			/*
			 * One page reference goes to the page table mapping.
			 * We need an additional reference if the last
			 * alloc_set_pte() succeeded.
			 */
			if (mapped)
				get_page(page);
			goto map_next_subpage;
		}
skip_unlock:
		unlock_page(page);
skip:
		iter.index = compound_head(page)->index +
			compound_nr_pages(page) - 1;
		/* Only give up the reference if alloc_set_pte() failed. */
		if (!mapped)
			put_page(page);
	}
	rcu_read_unlock();
}

-- 
 Kirill A. Shutemov


Re: [PATCHv6 01/37] mm, shmem: switch huge tmpfs to multi-order radix-tree entries

2017-02-09 Thread Kirill A. Shutemov
On Wed, Feb 08, 2017 at 07:57:27PM -0800, Matthew Wilcox wrote:
> On Thu, Jan 26, 2017 at 02:57:43PM +0300, Kirill A. Shutemov wrote:
> > +++ b/include/linux/pagemap.h
> > @@ -332,6 +332,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
> > mapping_gfp_mask(mapping));
> >  }
> >  
> > +static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> > +{
> > +   VM_BUG_ON_PAGE(PageTail(page), page);
> > +   VM_BUG_ON_PAGE(page->index > offset, page);
> > +   VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) < offset,
> > +   page);
> > +   return page - page->index + offset;
> > +}
> 
> What would you think to:
> 
> static inline void check_page_index(struct page *page, pgoff_t offset)
> {
>   VM_BUG_ON_PAGE(PageTail(page), page);
>   VM_BUG_ON_PAGE(page->index > offset, page);
>   VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) <= offset,
>   page);
> }
> 
> (I think I fixed an off-by-one error up there ...  if
> index + (1 << order) == offset, this is also a bug, right?
> because offset would then refer to the next page, not this page)

Right, thanks.
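
To make the boundary concrete: a standalone userspace sketch (not kernel
code; the index and order values here are made up) of why "<=" is the
comparison that catches the bad offset. A compound page of order 2
starting at index 8 covers offsets 8 through 11, so offset
12 == index + (1 << order) already belongs to the next page:

#include <assert.h>

int main(void)
{
	unsigned long index = 8;	/* head page->index (made up) */
	unsigned int order = 2;		/* compound_order(page) (made up) */
	unsigned long offset;

	/* Offsets 8..11 are inside the page: the check must stay silent. */
	for (offset = index; offset < index + (1UL << order); offset++)
		assert(!(index + (1UL << order) <= offset));

	/* Offset 12 is outside the page. */
	offset = index + (1UL << order);
	assert(!(index + (1UL << order) < offset));	/* old '<' check misses it */
	assert(index + (1UL << order) <= offset);	/* '<=' catches it */
	return 0;
}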

> 
> static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> {
>   check_page_index(page, offset);
>   return page + (offset - page->index);
> }
> 
> ... then you can use check_page_index down ...

Okay, makes sense.

> 
> > @@ -1250,7 +1233,6 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
> > put_page(page);
> > goto repeat;
> > }
> > -   VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);
> 
> ... here?

Ok.

> > @@ -1472,25 +1451,35 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
> > goto repeat;
> > }
> >  
> > +   /* For multi-order entries, find relevant subpage */
> > +   if (PageTransHuge(page)) {
> > +   VM_BUG_ON(index - page->index < 0);
> > +   VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> > +   page += index - page->index;
> > +   }
> 
> Use find_subpage() here?

Ok.

> > pages[ret] = page;
> > if (++ret == nr_pages)
> > break;
> > +   if (!PageTransCompound(page))
> > +   continue;
> > +   for (refs = 0; ret < nr_pages &&
> > +   (index + 1) % HPAGE_PMD_NR;
> > +   ret++, refs++, index++)
> > +   pages[ret] = ++page;
> > +   if (refs)
> > +   page_ref_add(compound_head(page), refs);
> > +   if (ret == nr_pages)
> > +   break;
> 
> Can we avoid referencing huge pages specifically in the page cache?  I'd
> like us to get to the point where we can put arbitrary compound pages into
> the page cache.  For example, I think this can be written as:
> 
> 	if (!PageCompound(page))
> 		continue;
> 	for (refs = 0; ret < nr_pages; refs++, index++) {
> 		if (index > page->index + (1 << compound_order(page)))
> 			break;
> 		pages[ret++] = ++page;
> 	}
> 	if (refs)
> 		page_ref_add(compound_head(page), refs);
> 	if (ret == nr_pages)
> 		break;

That's slightly more costly, but I guess that's fine.
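
FWIW, the extra cost is mostly compound_order() being re-evaluated for
every subpage. A variant that hoists the page's end index out of the loop
(my sketch, untested; "end" is a new pgoff_t local not in the patch)
should cost about the same as the HPAGE_PMD_NR version:

	if (!PageCompound(page))
		continue;
	/* Evaluate compound_order() once per compound page. */
	end = compound_head(page)->index +
		(1UL << compound_order(compound_head(page)));
	for (refs = 0; ret < nr_pages && index + 1 < end;
			ret++, refs++, index++)
		pages[ret] = ++page;
	if (refs)
		page_ref_add(compound_head(page), refs);
	if (ret == nr_pages)
		break;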

> > @@ -1541,19 +1533,12 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
> >  
> > +   /* For multi-order entries, find relevant subpage */
> > +   if (PageTransHuge(page)) {
> > +   VM_BUG_ON(index - page->index < 0);
> > +   VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> > +   page += index - page->index;
> > +   }
> > +
> > pages[ret] = page;
> > if (++ret == nr_pages)
> > break;
> > +   if (!PageTransCompound(page))
> > +   continue;
> > +   for (refs = 0; ret < nr_pages &&
> > +   (index + 1) % HPAGE_PMD_NR;
> > +   ret++, refs++, index++)
> > +   pages[ret] = ++page;
> > +   if (refs)
> > +   page_ref_add(compound_head(page), refs);
> > +   if (ret == nr_pages)
> > +   break;
> > }
> > rcu_read_unlock();
> > return ret;
> 
> Ugh, the same code again.  Hmm ... we only need to modify 'ret' as a result
> of this ... so could we split it out like this?
> 
> static unsigned populate_pages(struct page **pages, unsigned i, unsigned max,
> 		struct page *page)
> {
> 	unsigned refs = 0;
> 	for (;;) {
> 		pages[i++] = page;
>   

Re: [PATCHv6 01/37] mm, shmem: switch huge tmpfs to multi-order radix-tree entries

2017-02-08 Thread Matthew Wilcox
On Thu, Jan 26, 2017 at 02:57:43PM +0300, Kirill A. Shutemov wrote:
> +++ b/include/linux/pagemap.h
> @@ -332,6 +332,15 @@ static inline struct page *grab_cache_page_nowait(struct address_space *mapping,
>   mapping_gfp_mask(mapping));
>  }
>  
> +static inline struct page *find_subpage(struct page *page, pgoff_t offset)
> +{
> + VM_BUG_ON_PAGE(PageTail(page), page);
> + VM_BUG_ON_PAGE(page->index > offset, page);
> + VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) < offset,
> + page);
> + return page - page->index + offset;
> +}

What would you think to:

static inline void check_page_index(struct page *page, pgoff_t offset)
{
	VM_BUG_ON_PAGE(PageTail(page), page);
	VM_BUG_ON_PAGE(page->index > offset, page);
	VM_BUG_ON_PAGE(page->index + (1 << compound_order(page)) <= offset,
			page);
}

(I think I fixed an off-by-one error up there ...  if
index + (1 << order) == offset, this is also a bug, right?
because offset would then refer to the next page, not this page)

static inline struct page *find_subpage(struct page *page, pgoff_t offset)
{
	check_page_index(page, offset);
	return page + (offset - page->index);
}

... then you can use check_page_index down ...

> @@ -1250,7 +1233,6 @@ struct page *find_lock_entry(struct address_space *mapping, pgoff_t offset)
>   put_page(page);
>   goto repeat;
>   }
> - VM_BUG_ON_PAGE(page_to_pgoff(page) != offset, page);

... here?

> @@ -1472,25 +1451,35 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
>   goto repeat;
>   }
>  
> + /* For multi-order entries, find relevant subpage */
> + if (PageTransHuge(page)) {
> + VM_BUG_ON(index - page->index < 0);
> + VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> + page += index - page->index;
> + }

Use find_subpage() here?

>   pages[ret] = page;
>   if (++ret == nr_pages)
>   break;
> + if (!PageTransCompound(page))
> + continue;
> + for (refs = 0; ret < nr_pages &&
> + (index + 1) % HPAGE_PMD_NR;
> + ret++, refs++, index++)
> + pages[ret] = ++page;
> + if (refs)
> + page_ref_add(compound_head(page), refs);
> + if (ret == nr_pages)
> + break;

Can we avoid referencing huge pages specifically in the page cache?  I'd
like us to get to the point where we can put arbitrary compound pages into
the page cache.  For example, I think this can be written as:

	if (!PageCompound(page))
		continue;
	for (refs = 0; ret < nr_pages; refs++, index++) {
		if (index > page->index + (1 << compound_order(page)))
			break;
		pages[ret++] = ++page;
	}
	if (refs)
		page_ref_add(compound_head(page), refs);
	if (ret == nr_pages)
		break;

> @@ -1541,19 +1533,12 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
>  
> + /* For multi-order entries, find relevant subpage */
> + if (PageTransHuge(page)) {
> + VM_BUG_ON(index - page->index < 0);
> + VM_BUG_ON(index - page->index >= HPAGE_PMD_NR);
> + page += index - page->index;
> + }
> +
>   pages[ret] = page;
>   if (++ret == nr_pages)
>   break;
> + if (!PageTransCompound(page))
> + continue;
> + for (refs = 0; ret < nr_pages &&
> + (index + 1) % HPAGE_PMD_NR;
> + ret++, refs++, index++)
> + pages[ret] = ++page;
> + if (refs)
> + page_ref_add(compound_head(page), refs);
> + if (ret == nr_pages)
> + break;
>   }
>   rcu_read_unlock();
>   return ret;

Ugh, the same code again.  Hmm ... we only need to modify 'ret' as a result
of this ... so could we split it out like this?

static unsigned populate_pages(struct page **pages, unsigned i, unsigned max,
		struct page *page)
{
	unsigned refs = 0;
	for (;;) {
		pages[i++] = page;
		if (i == max)
			break;
		if (PageHead(page + 1))
			break;
		page++;
		refs++;
	}
	if (refs)
		page_ref_add(compound_head(page), refs);
	return i;
}
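
Presumably (my reconstruction, not from the thread) both call sites would
then hand populate_pages() the subpage they already hold a reference on,
so that refs only has to cover the extra subpages it walks onto:

	/* Hypothetical call site; index bookkeeping elided. */
	if (!PageCompound(page))
		pages[ret++] = page;
	else
		ret = populate_pages(pages, ret, nr_pages, page);
	if (ret == nr_pages)
		break;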