Per the vrange(2) semantics, a user should see SIGBUS when accessing
a purged page without first marking the memory non-volatile again
(ie, vrange(...VRANGE_NOVOLATILE)).

This allows optimistic traversal of volatile pages without having
to mark them non-volatile first, and the SIGBUS lets applications
trap and fix up the purged range before accessing it again.
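
For illustration, here is a minimal userspace sketch of that
trap-and-fixup pattern (the __NR_vrange number, the raw syscall()
invocation and the VRANGE_NOVOLATILE value below are placeholders
for this example, not the definitive interface):

  #include <signal.h>
  #include <setjmp.h>
  #include <string.h>
  #include <unistd.h>
  #include <sys/syscall.h>

  #ifndef __NR_vrange
  #define __NR_vrange       313   /* placeholder syscall number */
  #endif
  #define VRANGE_NOVOLATILE 1     /* assumed mode value */

  static sigjmp_buf fixup_env;

  static void sigbus_handler(int sig)
  {
          siglongjmp(fixup_env, 1);       /* jump to the fixup path */
  }

  /* Optimistically read a buffer that was earlier marked volatile. */
  static char read_first_byte(char *buf, size_t len)
  {
          struct sigaction sa = { .sa_handler = sigbus_handler };

          sigaction(SIGBUS, &sa, NULL);

          if (sigsetjmp(fixup_env, 1)) {
                  /*
                   * We touched a purged page: make the range
                   * non-volatile again, regenerate the lost
                   * contents, then fall through and retry.
                   */
                  syscall(__NR_vrange, (unsigned long)buf, len,
                          VRANGE_NOVOLATILE, NULL);
                  memset(buf, 0, len);    /* stand-in for regeneration */
          }

          return buf[0];  /* may SIGBUS if the page was purged */
  }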

This patch implements it by adding SWP_VRANGE, which consumes one
entry type from MAX_SWAPFILES. That means the worst case for
MAX_SWAPFILES on 32-bit is 32 - 2 - 1 - 1 = 28, which I think is
still enough for everybody.
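
Spelled out with the relevant constants (a sketch of the arithmetic;
the SWP_MIGRATION_NUM and SWP_HWPOISON_NUM values assume
CONFIG_MIGRATION and CONFIG_MEMORY_FAILURE are enabled):

  #define MAX_SWAPFILES_SHIFT   5       /* 1 << 5 = 32 entry types */
  #define SWP_MIGRATION_NUM     2       /* read + write migration */
  #define SWP_HWPOISON_NUM      1
  #define SWP_VRANGE_NUM        1       /* added by this patch */

  /* 32 - 2 - 1 - 1 = 28 types left for real swap files */
  #define MAX_SWAPFILES \
          ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM \
                  - SWP_HWPOISON_NUM - SWP_VRANGE_NUM)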

Cc: Mel Gorman <m...@csn.ul.ie>
Cc: Hugh Dickins <hu...@google.com>
Cc: Dave Hansen <dave.han...@intel.com>
Cc: Rik van Riel <r...@redhat.com>
Cc: Dmitry Adamushko <dmitry.adamus...@gmail.com>
Cc: Dave Chinner <da...@fromorbit.com>
Cc: Neil Brown <ne...@suse.de>
Cc: Andrea Righi <and...@betterlinux.com>
Cc: Andrea Arcangeli <aarca...@redhat.com>
Cc: Aneesh Kumar K.V <aneesh.ku...@linux.vnet.ibm.com>
Cc: Mike Hommey <m...@glandium.org>
Cc: Taras Glek <tg...@mozilla.com>
Cc: Dhaval Giani <dhaval.gi...@gmail.com>
Cc: Jan Kara <j...@suse.cz>
Cc: KOSAKI Motohiro <kosaki.motoh...@gmail.com>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Michel Lespinasse <wal...@google.com>
Cc: Rob Clark <robdcl...@gmail.com>
Cc: linux...@kvack.org
Cc: John Stultz <john.stu...@linaro.org>
Signed-off-by: Minchan Kim <minc...@kernel.org>
---
 include/linux/swap.h   |    6 +++++-
 include/linux/vrange.h |   17 ++++++++++++++++-
 mm/memory.c            |   35 +++++++++++++++++++++++++++++++++--
 mm/mincore.c           |    5 ++++-
 mm/vrange.c            |   19 +++++++++++++++++++
 5 files changed, 77 insertions(+), 5 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 46ba0c6c219f..39b3d4c6aec9 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -49,6 +49,9 @@ static inline int current_is_kswapd(void)
  * actions on faults.
  */
 
+#define SWP_VRANGE_NUM 1
+#define SWP_VRANGE     (MAX_SWAPFILES + SWP_HWPOISON_NUM + SWP_MIGRATION_NUM)
+
 /*
  * NUMA node memory migration support
  */
@@ -71,7 +74,8 @@ static inline int current_is_kswapd(void)
 #endif
 
 #define MAX_SWAPFILES \
-       ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM)
+       ((1 << MAX_SWAPFILES_SHIFT) - SWP_MIGRATION_NUM - SWP_HWPOISON_NUM \
+                       - SWP_VRANGE_NUM)
 
 /*
  * Magic header for a swap area. The first part of the union is
diff --git a/include/linux/vrange.h b/include/linux/vrange.h
index 778902d9cc30..d9ce2ec53a34 100644
--- a/include/linux/vrange.h
+++ b/include/linux/vrange.h
@@ -3,6 +3,8 @@
 
 #include <linux/vrange_types.h>
 #include <linux/mm.h>
+#include <linux/swap.h>
+#include <linux/swapops.h>
 
 #define vrange_from_node(node_ptr) \
        container_of(node_ptr, struct vrange, node)
@@ -12,6 +14,16 @@
 
 #ifdef CONFIG_MMU
 
+static inline swp_entry_t make_vrange_entry(void)
+{
+       return swp_entry(SWP_VRANGE, 0);
+}
+
+static inline int is_vrange_entry(swp_entry_t entry)
+{
+       return swp_type(entry) == SWP_VRANGE;
+}
+
 static inline void vrange_root_init(struct vrange_root *vroot, int type,
                                                                void *object)
 {
@@ -43,7 +55,8 @@ extern int vrange_fork(struct mm_struct *new,
                                        struct mm_struct *old);
 int discard_vpage(struct page *page);
 bool vrange_addr_volatile(struct vm_area_struct *vma, unsigned long addr);
-
+extern bool vrange_addr_purged(struct vm_area_struct *vma,
+                               unsigned long address);
 #else
 
 static inline void vrange_root_init(struct vrange_root *vroot,
@@ -60,5 +73,7 @@ static inline bool vrange_addr_volatile(struct vm_area_struct *vma,
        return false;
 }
 static inline int discard_vpage(struct page *page) { return 0; }
+static inline bool vrange_addr_purged(struct vm_area_struct *vma,
+                               unsigned long address) { return false; }
 #endif
 #endif /* _LINIUX_VRANGE_H */
diff --git a/mm/memory.c b/mm/memory.c
index d176154c243f..86231180f01f 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -59,6 +59,7 @@
 #include <linux/gfp.h>
 #include <linux/migrate.h>
 #include <linux/string.h>
+#include <linux/vrange.h>
 
 #include <asm/io.h>
 #include <asm/pgalloc.h>
@@ -807,6 +808,8 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
        if (unlikely(!pte_present(pte))) {
                if (!pte_file(pte)) {
                        swp_entry_t entry = pte_to_swp_entry(pte);
+                       if (is_vrange_entry(entry))
+                               goto out_set_pte;
 
                        if (swap_duplicate(entry) < 0)
                                return entry.val;
@@ -1152,6 +1155,8 @@ again:
                                print_bad_pte(vma, addr, ptent, NULL);
                } else {
                        swp_entry_t entry = pte_to_swp_entry(ptent);
+                       if (is_vrange_entry(entry))
+                               goto out;
 
                        if (!non_swap_entry(entry))
                                rss[MM_SWAPENTS]--;
@@ -1168,6 +1173,7 @@ again:
                        if (unlikely(!free_swap_and_cache(entry)))
                                print_bad_pte(vma, addr, ptent, NULL);
                }
+out:
                pte_clear_not_present_full(mm, addr, pte, tlb->fullmm);
        } while (pte++, addr += PAGE_SIZE, addr != end);
 
@@ -3695,15 +3701,40 @@ static int handle_pte_fault(struct mm_struct *mm,
 
        entry = *pte;
        if (!pte_present(entry)) {
+               swp_entry_t vrange_entry;
+retry:
                if (pte_none(entry)) {
                        if (vma->vm_ops) {
                                if (likely(vma->vm_ops->fault))
                                        return do_linear_fault(mm, vma, address,
-                                               pte, pmd, flags, entry);
+                                                       pte, pmd, flags, entry);
                        }
                        return do_anonymous_page(mm, vma, address,
-                                                pte, pmd, flags);
+                                       pte, pmd, flags);
+               }
+
+               vrange_entry = pte_to_swp_entry(entry);
+               if (unlikely(is_vrange_entry(vrange_entry))) {
+                       if (!vrange_addr_purged(vma, address)) {
+                       /*
+                        * If the address is not in a purged vrange,
+                        * the user already made the range non-volatile
+                        * via vrange(...VRANGE_NOVOLATILE), so don't
+                        * send a SIGBUS. Instead, zap it and retry.
+                        */
+                               ptl = pte_lockptr(mm, pmd);
+                               spin_lock(ptl);
+                               if (unlikely(!pte_same(*pte, entry)))
+                                       goto unlock;
+                               flush_cache_page(vma, address, pte_pfn(*pte));
+                               ptep_clear_flush(vma, address, pte);
+                               pte_unmap_unlock(pte, ptl);
+                               goto retry;
+                       }
+
+                       return VM_FAULT_SIGBUS;
                }
+
                if (pte_file(entry))
                        return do_nonlinear_fault(mm, vma, address,
                                        pte, pmd, flags, entry);
diff --git a/mm/mincore.c b/mm/mincore.c
index da2be56a7b8f..e6138048d735 100644
--- a/mm/mincore.c
+++ b/mm/mincore.c
@@ -15,6 +15,7 @@
 #include <linux/swap.h>
 #include <linux/swapops.h>
 #include <linux/hugetlb.h>
+#include <linux/vrange.h>
 
 #include <asm/uaccess.h>
 #include <asm/pgtable.h>
@@ -129,7 +130,9 @@ static void mincore_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
                } else { /* pte is a swap entry */
                        swp_entry_t entry = pte_to_swp_entry(pte);
 
-                       if (is_migration_entry(entry)) {
+                       if (is_vrange_entry(entry)) {
+                               *vec = 0;
+                       } else if (is_migration_entry(entry)) {
                                /* migration entries are always uptodate */
                                *vec = 1;
                        } else {
diff --git a/mm/vrange.c b/mm/vrange.c
index 18afe94d3f13..f86ed33434d8 100644
--- a/mm/vrange.c
+++ b/mm/vrange.c
@@ -431,6 +431,24 @@ bool vrange_addr_volatile(struct vm_area_struct *vma, unsigned long addr)
        return ret;
 }
 
+bool vrange_addr_purged(struct vm_area_struct *vma, unsigned long addr)
+{
+       struct vrange_root *vroot;
+       struct vrange *range;
+       unsigned long vstart_idx;
+       bool ret = false;
+
+       vroot = __vma_to_vroot(vma);
+       vstart_idx = __vma_addr_to_index(vma, addr);
+
+       vrange_lock(vroot);
+       range = __vrange_find(vroot, vstart_idx, vstart_idx);
+       if (range && range->purged)
+               ret = true;
+       vrange_unlock(vroot);
+       return ret;
+}
+
 /* Caller should hold vrange_lock */
 static void do_purge(struct vrange_root *vroot,
                unsigned long start_idx, unsigned long end_idx)
@@ -474,6 +492,7 @@ static void try_to_discard_one(struct vrange_root *vroot, struct page *page,
        page_remove_rmap(page);
        page_cache_release(page);
 
+       set_pte_at(mm, addr, pte, swp_entry_to_pte(make_vrange_entry()));
        pte_unmap_unlock(pte, ptl);
        mmu_notifier_invalidate_page(mm, addr);
 
-- 
1.7.9.5
