From: Fuad Tabba <ta...@google.com>

When a page is shared, its exclusive pin is dropped, but one
normal pin is retained. To make it possible to unshare a page,
add the ability to reacquire the exclusive pin, but only if
exactly one normal pin is held on the page, and only if the page
is marked as AnonExclusive.
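
For illustration, a sketch of how a caller might use the new helper
to unshare a page; the caller below is hypothetical, only
reexc_user_page() itself is added by this patch:

	/*
	 * Hypothetical unshare path: the caller holds the single
	 * normal pin left over from when the page was shared.
	 */
	static int unshare_backed_page(struct page *page)
	{
		int ret;

		/*
		 * Fails with -EBUSY if any other pin exists, or
		 * -EINVAL if the page is no longer AnonExclusive.
		 */
		ret = reexc_user_page(page);
		if (ret)
			return ret;

		/* The exclusive pin is held again. */
		return 0;
	}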

Co-developed-by: Elliot Berman <quic_eber...@quicinc.com>
Signed-off-by: Elliot Berman <quic_eber...@quicinc.com>
Signed-off-by: Fuad Tabba <ta...@google.com>
---
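Note: a worked example of the new expected_pins check, assuming a
small folio whose refcount is one base reference plus its pins:

	old_count = 1 + GUP_PIN_COUNTING_BIAS;	/* one pin: 1025 */
	old_count >> GUP_PIN_COUNTING_SHIFT;	/* 1025 >> 10 == 1 */

With expected_pins == 1 the check passes and GUP_PIN_EXCLUSIVE_BIAS
is added; a second pin would make the shifted value 2 and
page_ref_setexc() would fail. expected_pins == 0 keeps the old
behaviour of refusing whenever any normal pin is present.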
 include/linux/mm.h       |  1 +
 include/linux/page_ref.h | 18 ++++++++++++------
 mm/gup.c                 | 48 +++++++++++++++++++++++++++++++++++++-----------
 3 files changed, 50 insertions(+), 17 deletions(-)

diff --git a/include/linux/mm.h b/include/linux/mm.h
index d03d62bceba0..628ab936dd2b 100644
--- a/include/linux/mm.h
+++ b/include/linux/mm.h
@@ -1590,6 +1590,7 @@ void unpin_user_page_range_dirty_lock(struct page *page, unsigned long npages,
 void unpin_user_pages(struct page **pages, unsigned long npages);
 void unpin_exc_pages(struct page **pages, unsigned long npages);
 void unexc_user_page(struct page *page);
+int reexc_user_page(struct page *page);
 
 static inline bool is_cow_mapping(vm_flags_t flags)
 {
diff --git a/include/linux/page_ref.h b/include/linux/page_ref.h
index 9d16e1f4db09..e66130fe995d 100644
--- a/include/linux/page_ref.h
+++ b/include/linux/page_ref.h
@@ -92,7 +92,8 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
  * provides safe operation for get_user_pages(), page_mkclean() and
  * other calls that race to set up page table entries.
  */
-#define GUP_PIN_COUNTING_BIAS (1U << 10)
+#define GUP_PIN_COUNTING_SHIFT (10)
+#define GUP_PIN_COUNTING_BIAS (1U << GUP_PIN_COUNTING_SHIFT)
 
 /*
  * GUP_PIN_EXCLUSIVE_BIAS is used to grab an exclusive pin over a page.
@@ -100,7 +101,8 @@ static inline void __page_ref_unfreeze(struct page *page, int v)
  * exist for the page.
  * After it's taken, no other gup pins can be taken.
  */
-#define GUP_PIN_EXCLUSIVE_BIAS (1U << 30)
+#define GUP_PIN_EXCLUSIVE_SHIFT (30)
+#define GUP_PIN_EXCLUSIVE_BIAS (1U << GUP_PIN_EXCLUSIVE_SHIFT)
 
 static inline int page_ref_count(const struct page *page)
 {
@@ -155,7 +157,9 @@ static inline void init_page_count(struct page *page)
        set_page_count(page, 1);
 }
 
-static __must_check inline bool page_ref_setexc(struct page *page, unsigned int refs)
+static __must_check inline bool page_ref_setexc(struct page *page,
+                                               unsigned int expected_pins,
+                                               unsigned int refs)
 {
        unsigned int old_count, new_count;
 
@@ -165,7 +169,7 @@ static __must_check inline bool page_ref_setexc(struct page *page, unsigned int
        do {
                old_count = atomic_read(&page->_refcount);
 
-               if (old_count >= GUP_PIN_COUNTING_BIAS)
+               if ((old_count >> GUP_PIN_COUNTING_SHIFT) != expected_pins)
                        return false;
 
                if (check_add_overflow(old_count, refs + GUP_PIN_EXCLUSIVE_BIAS, &new_count))
@@ -178,9 +182,11 @@ static __must_check inline bool page_ref_setexc(struct page *page, unsigned int
        return true;
 }
 
-static __must_check inline bool folio_ref_setexc(struct folio *folio, unsigned int refs)
+static __must_check inline bool folio_ref_setexc(struct folio *folio,
+                                                unsigned int expected_pins,
+                                                unsigned int refs)
 {
-       return page_ref_setexc(&folio->page, refs);
+       return page_ref_setexc(&folio->page, expected_pins, refs);
 }
 
 static inline void page_ref_add(struct page *page, int nr)
diff --git a/mm/gup.c b/mm/gup.c
index 7f20de33221d..663030d03d95 100644
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -97,7 +97,9 @@ static inline struct folio *try_get_folio(struct page *page, int refs)
        return folio;
 }
 
-static bool large_folio_pin_setexc(struct folio *folio, unsigned int pins)
+static bool large_folio_pin_setexc(struct folio *folio,
+                                  unsigned int expected_pins,
+                                  unsigned int pins)
 {
        unsigned int old_pincount, new_pincount;
 
@@ -107,7 +109,7 @@ static bool large_folio_pin_setexc(struct folio *folio, unsigned int pins)
        do {
                old_pincount = atomic_read(&folio->_pincount);
 
-               if (old_pincount > 0)
+               if (old_pincount != expected_pins)
                        return false;
 
                if (check_add_overflow(old_pincount, pins + GUP_PIN_EXCLUSIVE_BIAS, &new_pincount))
@@ -117,15 +119,18 @@ static bool large_folio_pin_setexc(struct folio *folio, unsigned int pins)
        return true;
 }
 
-static bool __try_grab_folio_excl(struct folio *folio, int pincount, int refcount)
+static bool __try_grab_folio_excl(struct folio *folio,
+                                 unsigned int expected_pins,
+                                 int pincount,
+                                 int refcount)
 {
        if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_EXCLUSIVE_PIN)))
                return false;
 
        if (folio_test_large(folio)) {
-               if (!large_folio_pin_setexc(folio, pincount))
+               if (!large_folio_pin_setexc(folio, expected_pins, pincount))
                        return false;
-       } else if (!folio_ref_setexc(folio, refcount)) {
+       } else if (!folio_ref_setexc(folio, expected_pins, refcount)) {
                return false;
        }
 
@@ -135,7 +140,9 @@ static bool __try_grab_folio_excl(struct folio *folio, int pincount, int refcoun
        return true;
 }
 
-static bool try_grab_folio_excl(struct folio *folio, int refs)
+static bool try_grab_folio_excl(struct folio *folio,
+                               unsigned int expected_pins,
+                               int refs)
 {
        /*
         * When pinning a large folio, use an exact count to track it.
@@ -145,15 +152,17 @@ static bool try_grab_folio_excl(struct folio *folio, int refs)
         * is pinned.  That's why the refcount from the earlier
         * try_get_folio() is left intact.
         */
-       return __try_grab_folio_excl(folio, refs,
+       return __try_grab_folio_excl(folio, expected_pins, refs,
                                     refs * (GUP_PIN_COUNTING_BIAS - 1));
 }
 
-static bool try_grab_page_excl(struct page *page)
+static bool try_grab_page_excl(struct page *page,
+                              unsigned int expected_pins)
 {
        struct folio *folio = page_folio(page);
 
-       return __try_grab_folio_excl(folio, 1, GUP_PIN_COUNTING_BIAS);
+       return __try_grab_folio_excl(folio, expected_pins, 1,
+                                    GUP_PIN_COUNTING_BIAS);
 }
 
 /**
@@ -227,7 +236,7 @@ struct folio *try_grab_folio(struct page *page, int refs, unsigned int flags)
        }
 
        if (unlikely(flags & FOLL_EXCLUSIVE)) {
-               if (!try_grab_folio_excl(folio, refs))
+               if (!try_grab_folio_excl(folio, 0, refs))
                        return NULL;
        } else {
                /*
@@ -347,7 +356,7 @@ int __must_check try_grab_page(struct page *page, unsigned int flags)
                        return -EBUSY;
 
                if (unlikely(flags & FOLL_EXCLUSIVE)) {
-                       if (!try_grab_page_excl(page))
+                       if (!try_grab_page_excl(page, 0))
                                return -EBUSY;
                } else {
                        /*
@@ -661,6 +670,23 @@ void unexc_user_page(struct page *page)
 }
 EXPORT_SYMBOL(unexc_user_page);
 
+int reexc_user_page(struct page *page)
+{
+       if (WARN_ON_ONCE(!IS_ENABLED(CONFIG_EXCLUSIVE_PIN)))
+               return -EINVAL;
+
+       sanity_check_pinned_pages(&page, 1);
+
+       if (!PageAnonExclusive(page))
+               return -EINVAL;
+
+       if (!try_grab_page_excl(page, 1))
+               return -EBUSY;
+
+       return 0;
+}
+EXPORT_SYMBOL(reexc_user_page);
+
 /*
  * Set the MMF_HAS_PINNED if not set yet; after set it'll be there for the mm's
  * lifecycle.  Avoid setting the bit unless necessary, or it might cause write

-- 
2.34.1

