On Thu, Feb 06, 2014 at 05:31:55PM -0800, Linus Torvalds wrote:
> On Thu, Feb 6, 2014 at 2:24 PM, Kirill A. Shutemov <[email protected]> 
> wrote:
> >
> > If we want to reduce the number of page faults with less overhead, we
> > should probably concentrate on minor page faults -- populate ptes around
> > the fault address for pages that are already in the page cache. It should
> > cover the scripting use-case pretty well.
> 
> That's what my patch largely does. Except I screwed up and didn't use
> FAULT_FLAG_ALLOW_RETRY in fault_around().

I fail to see how you avoid touching backing storage.

> Anyway, my patch kind of works, but I'm starting to hate it. I think I
> want to try to extend the "->fault()" interface to allow
> filemap_fault() to just fill in multiple pages.
> 
> We already have that "vmf->page" thing, we could make it a small array
> easily. That would allow proper gang lookup, and much more efficient
> "fill in multiple entries in one go" in mm/memory.c.

See patch below.

I extended the ->fault() interface: added a pointer to an array of pages
plus nr_pages. If ->fault() knows about the new interface and uses it, it
fills the array and sets the VM_FAULT_AROUND bit in the return code. Only
filemap_fault() is converted at the moment.
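
Roughly, the contract looks like this from the caller's side (a sketch only;
the names match the patch below):

	struct page *pages[FAULT_AROUND_PAGES];
	struct vm_fault vmf;

	/* ... fill in the usual vmf fields (pgoff, virtual_address, ...) ... */
	vmf.flags |= FAULT_FLAG_AROUND;
	vmf.nr_pages = FAULT_AROUND_PAGES;	/* must be a power of two */
	vmf.pages = pages;

	ret = vma->vm_ops->fault(vma, &vmf);
	if (ret & VM_FAULT_AROUND) {
		/* vmf.pages[] is filled; a NULL slot means no page at that index */
	} else {
		/* old-style ->fault(): only vmf.page is valid */
	}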

I haven't tested it much, but my kvm boots. There are a few places where the
code still needs to be fixed. __do_fault() and filemap_fault() are too ugly
and need to be cleaned up.

I don't have any performance data yet.

Any thoughts?

---
 include/linux/mm.h |  11 +++++
 mm/filemap.c       | 142 ++++++++++++++++++++++++++++++++++++++++++++++++++++-
 mm/memory.c        |  58 ++++++++++++++++++++--
 3 files changed, 204 insertions(+), 7 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index 35527173cf50..deb65c0850a9 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -181,6 +181,9 @@ extern pgprot_t protection_map[16];
 #define FAULT_FLAG_KILLABLE    0x20    /* The fault task is in SIGKILL killable region */
 #define FAULT_FLAG_TRIED       0x40    /* second try */
 #define FAULT_FLAG_USER                0x80    /* The fault originated in userspace */
+#define FAULT_FLAG_AROUND      0x100   /* Get ->nr_pages pages around fault
+                                        * address
+                                        */
 
 /*
  * vm_fault is filled by the the pagefault handler and passed to the vma's
@@ -200,6 +203,13 @@ struct vm_fault {
                                         * is set (which is also implied by
                                         * VM_FAULT_ERROR).
                                         */
+
+       int nr_pages;                   /* Number of pages to fault in, naturally
+                                        * aligned around virtual_address
+                                        */
+       struct page **pages;            /* Pointer to array to store nr_pages
+                                        * pages.
+                                        */
 };
 
 /*
@@ -962,6 +972,7 @@ static inline int page_mapped(struct page *page)
 #define VM_FAULT_LOCKED        0x0200  /* ->fault locked the returned page */
 #define VM_FAULT_RETRY 0x0400  /* ->fault blocked, must retry */
 #define VM_FAULT_FALLBACK 0x0800       /* huge page fault failed, fall back to small */
+#define VM_FAULT_AROUND 0x1000 /* ->fault() filled in vmf->pages[] */
 
 #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
 
diff --git a/mm/filemap.c b/mm/filemap.c
index b7749a92021c..1cabb15a5847 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -884,6 +884,64 @@ repeat:
        return ret;
 }
 
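+/*
+ * Fill @pages[0..nr_pages-1] with the pages present in @mapping starting
+ * at index @start, taking a reference on each page returned. Indices with
+ * no page in the page cache are left NULL.
+ */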
+void find_get_pages_or_null(struct address_space *mapping, pgoff_t start,
+                           unsigned int nr_pages, struct page **pages)
+{
+       struct radix_tree_iter iter;
+       void **slot;
+
+       if (unlikely(!nr_pages))
+               return;
+
+       memset(pages, 0, sizeof(struct page *) * nr_pages);
+
+       rcu_read_lock();
+restart:
+       radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot(slot);
+               if (unlikely(!page))
+                       continue;
+
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page)) {
+                               /*
+                                * Transient condition which can only trigger
+                                * when entry at index 0 moves out of or back
+                                * to root: none yet gotten, safe to restart.
+                                */
+                               WARN_ON(iter.index);
+                               goto restart;
+                       }
+                       /*
+                        * Otherwise, shmem/tmpfs must be storing a swap entry
+                        * here as an exceptional entry: skip over it and
+                        * leave its slot in the output array NULL.
+                        */
+                       continue;
+               }
+
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+
+               /* Has the page moved? */
+               if (unlikely(page != *slot)) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+
+               if (page->index - start >= nr_pages) {
+                       page_cache_release(page);
+                       break;
+               }
+
+               pages[page->index - start] = page;
+       }
+
+       rcu_read_unlock();
+}
+
 /**
  * find_get_pages_contig - gang contiguous pagecache lookup
  * @mapping:   The address_space to search
@@ -1595,6 +1653,67 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma,
                                           page, offset, ra->ra_pages);
 }
 
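+/*
+ * Try to lock every secondary page in vmf->pages[] and check that it is
+ * still usable: within the vma, not truncated, up-to-date and inside
+ * i_size. A page that fails any check is released and its slot is set
+ * to NULL. The primary page is left to the normal fault path.
+ */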
+static void lock_secondary_pages(struct vm_area_struct *vma,
+               struct vm_fault *vmf)
+{
+       struct file *file = vma->vm_file;
+       struct address_space *mapping = file->f_mapping;
+       unsigned long address = (unsigned long)vmf->virtual_address;
+       int primary_idx = (address >> PAGE_SHIFT) & (vmf->nr_pages - 1);
+       pgoff_t size;
+       int i;
+
+       for (i = 0; i < vmf->nr_pages; i++) {
+               unsigned long addr = address + PAGE_SIZE * (i - primary_idx);
+
+               if (i == primary_idx || !vmf->pages[i])
+                       continue;
+
+               if (addr < vma->vm_start || addr >= vma->vm_end)
+                       goto put;
+               if (!trylock_page(vmf->pages[i]))
+                       goto put;
+               /* Truncated? */
+               if (unlikely(vmf->pages[i]->mapping != mapping))
+                       goto unlock;
+
+               if (unlikely(!PageUptodate(vmf->pages[i])))
+                       goto unlock;
+
+               /*
+                * We have a locked, up-to-date page. Check that it is still
+                * within i_size: the file may have been truncated while we
+                * were not holding the page lock.
+                */
+               size = (i_size_read(mapping->host) + PAGE_CACHE_SIZE - 1)
+                       >> PAGE_CACHE_SHIFT;
+               if (unlikely(vmf->pages[i]->index >= size))
+                       goto unlock;
+
+               continue;
+unlock:
+               unlock_page(vmf->pages[i]);
+put:
+               put_page(vmf->pages[i]);
+               vmf->pages[i] = NULL;
+       }
+}
+
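+/*
+ * Undo lock_secondary_pages(): unlock and release every secondary page,
+ * leaving the primary page alone.
+ */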
+static void unlock_and_put_secondary_pages(struct vm_fault *vmf)
+{
+       unsigned long address = (unsigned long)vmf->virtual_address;
+       int primary_idx = (address >> PAGE_SHIFT) & (vmf->nr_pages - 1);
+       int i;
+
+       for (i = 0; i < vmf->nr_pages; i++) {
+               if (i == primary_idx || !vmf->pages[i])
+                       continue;
+               unlock_page(vmf->pages[i]);
+               page_cache_release(vmf->pages[i]);
+               vmf->pages[i] = NULL;
+       }
+}
+
 /**
  * filemap_fault - read in file data for page fault handling
  * @vma:       vma in which the fault was taken
@@ -1617,6 +1736,8 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        pgoff_t offset = vmf->pgoff;
        struct page *page;
        pgoff_t size;
+       unsigned long address = (unsigned long)vmf->virtual_address;
+       int primary_idx = (address >> PAGE_SHIFT) & (vmf->nr_pages - 1);
        int ret = 0;
 
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
@@ -1626,7 +1747,15 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
        /*
         * Do we have something in the page cache already?
         */
-       page = find_get_page(mapping, offset);
+       if (vmf->flags & FAULT_FLAG_AROUND) {
+               find_get_pages_or_null(mapping, offset - primary_idx,
+                               vmf->nr_pages, vmf->pages);
+               page = vmf->pages[primary_idx];
+               lock_secondary_pages(vma, vmf);
+               ret |= VM_FAULT_AROUND;
+       } else {
+               page = find_get_page(mapping, offset);
+       }
+
        if (likely(page) && !(vmf->flags & FAULT_FLAG_TRIED)) {
                /*
                 * We found the page, so try async readahead before
@@ -1638,14 +1767,18 @@ int filemap_fault(struct vm_area_struct *vma, struct vm_fault *vmf)
                do_sync_mmap_readahead(vma, ra, file, offset);
                count_vm_event(PGMAJFAULT);
                mem_cgroup_count_vm_event(vma->vm_mm, PGMAJFAULT);
-               ret = VM_FAULT_MAJOR;
+               ret |= VM_FAULT_MAJOR;
 retry_find:
                page = find_get_page(mapping, offset);
                if (!page)
                        goto no_cached_page;
+               if (vmf->flags & FAULT_FLAG_AROUND)
+                       vmf->pages[primary_idx] = page;
        }
 
        if (!lock_page_or_retry(page, vma->vm_mm, vmf->flags)) {
+               unlock_and_put_secondary_pages(vmf);
+               ret &= ~VM_FAULT_AROUND;
                page_cache_release(page);
                return ret | VM_FAULT_RETRY;
        }
@@ -1671,6 +1804,7 @@ retry_find:
         */
        size = (i_size_read(inode) + PAGE_CACHE_SIZE - 1) >> PAGE_CACHE_SHIFT;
        if (unlikely(offset >= size)) {
+               unlock_and_put_secondary_pages(vmf);
                unlock_page(page);
                page_cache_release(page);
                return VM_FAULT_SIGBUS;
@@ -1694,6 +1828,8 @@ no_cached_page:
        if (error >= 0)
                goto retry_find;
 
+       unlock_and_put_secondary_pages(vmf);
+
        /*
         * An error return from page_cache_read can result if the
         * system is low on memory, or a problem occurs while trying
@@ -1722,6 +1858,8 @@ page_not_uptodate:
        if (!error || error == AOP_TRUNCATED_PAGE)
                goto retry_find;
 
+       unlock_and_put_secondary_pages(vmf);
+
        /* Things didn't work out. Return zero to tell the mm layer so. */
        shrink_readahead_size_eio(file, ra);
        return VM_FAULT_SIGBUS;
diff --git a/mm/memory.c b/mm/memory.c
index 6768ce9e57d2..e94d86ac7d5a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -3283,6 +3283,9 @@ oom:
        return VM_FAULT_OOM;
 }
 
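+/*
+ * Number of pages to try to map around a read fault. Must be a power of
+ * two: the window is naturally aligned and FAULT_AROUND_MASK gives the
+ * primary page's index within it.
+ */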
+#define FAULT_AROUND_PAGES 32
+#define FAULT_AROUND_MASK (FAULT_AROUND_PAGES - 1)
+
 /*
  * __do_fault() tries to create a new page mapping. It aggressively
  * tries to share with existing pages, but makes a separate copy if
@@ -3303,12 +3306,13 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        pte_t *page_table;
        spinlock_t *ptl;
        struct page *page;
+       struct page *pages[FAULT_AROUND_PAGES];
        struct page *cow_page;
        pte_t entry;
        int anon = 0;
        struct page *dirty_page = NULL;
        struct vm_fault vmf;
-       int ret;
+       int i, ret;
        int page_mkwrite = 0;
 
        /*
@@ -3336,14 +3340,35 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
        vmf.flags = flags;
        vmf.page = NULL;
 
+       /* Do fault-around only for read faults on linear mappings */
+       if (flags & (FAULT_FLAG_WRITE | FAULT_FLAG_NONLINEAR)) {
+               vmf.nr_pages = 0;
+               vmf.pages = NULL;
+       } else {
+               vmf.nr_pages = FAULT_AROUND_PAGES;
+               vmf.pages = pages;
+               vmf.flags |= FAULT_FLAG_AROUND;
+       }
        ret = vma->vm_ops->fault(vma, &vmf);
+
+       /* ->fault() doesn't know about FAULT_FLAG_AROUND, fall back */
+       if ((vmf.flags & FAULT_FLAG_AROUND) && !(ret & VM_FAULT_AROUND))
+               vmf.flags &= ~FAULT_FLAG_AROUND;
        if (unlikely(ret & (VM_FAULT_ERROR | VM_FAULT_NOPAGE |
                            VM_FAULT_RETRY)))
                goto uncharge_out;
 
        if (unlikely(PageHWPoison(vmf.page))) {
-               if (ret & VM_FAULT_LOCKED)
-                       unlock_page(vmf.page);
+               if (ret & VM_FAULT_LOCKED) {
+                       if (vmf.flags & FAULT_FLAG_AROUND) {
+                               for (i = 0; i < FAULT_AROUND_PAGES; i++) {
+                                       if (pages[i])
+                                               unlock_page(pages[i]);
+                               }
+                       } else {
+                               unlock_page(vmf.page);
+                       }
+               }
                ret = VM_FAULT_HWPOISON;
                goto uncharge_out;
        }
@@ -3352,9 +3377,10 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
         * For consistency in subsequent calls, make the faulted page always
         * locked.
         */
-       if (unlikely(!(ret & VM_FAULT_LOCKED)))
+       if (unlikely(!(ret & VM_FAULT_LOCKED))) {
+               BUG_ON(ret & VM_FAULT_AROUND); /* FIXME */
                lock_page(vmf.page);
-       else
+       } else
                VM_BUG_ON(!PageLocked(vmf.page));
 
        /*
@@ -3401,6 +3427,28 @@ static int __do_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 
        page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 
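+       /*
+        * Map the secondary pages around the fault address. page_table
+        * points to the pte of the primary page; index i is relative to
+        * the start of the naturally aligned FAULT_AROUND_PAGES window.
+        */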
+       for (i = 0; (vmf.flags & FAULT_FLAG_AROUND) && i < FAULT_AROUND_PAGES;
+                       i++) {
+               int primary_idx = (address >> PAGE_SHIFT) & FAULT_AROUND_MASK;
+               pte_t *pte = page_table - primary_idx + i;
+               unsigned long addr = address + PAGE_SIZE * (i - primary_idx);
+
+               /* The primary page is mapped by the normal path below */
+               if (i == primary_idx || !pages[i])
+                       continue;
+               if (!pte_none(*pte))
+                       goto skip;
+               if (PageHWPoison(pages[i]))
+                       goto skip;
+               flush_icache_page(vma, pages[i]);
+               entry = mk_pte(pages[i], vma->vm_page_prot);
+               inc_mm_counter_fast(mm, MM_FILEPAGES);
+               page_add_file_rmap(pages[i]);
+               set_pte_at(mm, addr, pte, entry);
+               update_mmu_cache(vma, addr, pte);
+               /* the get from find_get_pages_or_null() now belongs to the pte */
+               unlock_page(pages[i]);
+               continue;
+skip:
+               unlock_page(pages[i]);
+               page_cache_release(pages[i]);
+       }
+
        /*
         * This silly early PAGE_DIRTY setting removes a race
         * due to the bad i386 page protection. But it's valid
-- 
 Kirill A. Shutemov