From: Dan Streetman <ddstr...@ieee.org>

commit adfab836f4908deb049a5128082719e689eed964 upstream.

The logic controlling the singly-linked list of swap_info_struct entries
for all active, i.e.  swapon'ed, swap targets is rather complex, because:

 - it stores the entries in priority order
 - there is a pointer to the highest priority entry
 - there is a pointer to the highest priority not-full entry
 - there is a highest_priority_index variable set outside the swap_lock
 - swap entries of equal priority should be used equally

this complexity leads to bugs such as: https://lkml.org/lkml/2014/2/13/181
where different priority swap targets are incorrectly used equally.

That bug probably could be solved with the existing singly-linked lists,
but I think it would only add more complexity to the already difficult to
understand get_swap_page() swap_list iteration logic.

The first patch changes from a singly-linked list to a doubly-linked list
using list_heads; the highest_priority_index and related code are removed
and get_swap_page() starts each iteration at the highest priority
swap_info entry, even if it's full.  While this does introduce unnecessary
list iteration (i.e.  Schlemiel the painter's algorithm) in the case where
one or more of the highest priority entries are full, the iteration and
manipulation code is much simpler and behaves correctly re: the above bug;
and the fourth patch removes the unnecessary iteration.

The second patch adds some minor plist helper functions; nothing new
really, just functions to match existing regular list functions.  These
are used by the next two patches.

The third patch adds plist_requeue(), which is used by get_swap_page() in
the next patch - it performs the requeueing of same-priority entries
(which moves the entry to the end of its priority in the plist), so that
all equal-priority swap_info_structs get used equally.

The fourth patch converts the main list into a plist, and adds a new plist
that contains only swap_info entries that are both active and not full.
As Mel suggested using plists allows removing all the ordering code from
swap - plists handle ordering automatically.  The list naming is also
clarified now that there are two lists, with the original list changed
from swap_list_head to swap_active_head and the new list named
swap_avail_head.  A new spinlock is also added for the new list, so
swap_info entries can be added or removed from the new list immediately as
they become full or not full.

This patch (of 4):

Replace the singly-linked list tracking active, i.e.  swapon'ed,
swap_info_struct entries with a doubly-linked list using struct
list_heads.  Simplify the logic iterating and manipulating the list of
entries, especially get_swap_page(), by using standard list_head
functions, and removing the highest priority iteration logic.

The change fixes the bug:
https://lkml.org/lkml/2014/2/13/181
in which different priority swap entries after the highest priority entry
are incorrectly used equally in pairs.  The swap behavior is now as
advertised, i.e. different priority swap entries are used in order, and
equal priority swap targets are used concurrently.

Signed-off-by: Dan Streetman <ddstr...@ieee.org>
Acked-by: Mel Gorman <mgor...@suse.de>
Cc: Shaohua Li <s...@fusionio.com>
Cc: Hugh Dickins <hu...@google.com>
Cc: Dan Streetman <ddstr...@ieee.org>
Cc: Michal Hocko <mho...@suse.cz>
Cc: Christian Ehrhardt <ehrha...@linux.vnet.ibm.com>
Cc: Weijie Yang <weiji...@gmail.com>
Cc: Rik van Riel <r...@redhat.com>
Cc: Johannes Weiner <han...@cmpxchg.org>
Cc: Bob Liu <bob....@oracle.com>
Cc: Steven Rostedt <rost...@goodmis.org>
Cc: Peter Zijlstra <pet...@infradead.org>
Cc: Paul Gortmaker <paul.gortma...@windriver.com>
Cc: Thomas Gleixner <t...@linutronix.de>
Signed-off-by: Andrew Morton <a...@linux-foundation.org>
Signed-off-by: Linus Torvalds <torva...@linux-foundation.org>
Signed-off-by: Mel Gorman <mgor...@suse.de>
---
 include/linux/swap.h     |   7 +-
 include/linux/swapfile.h |   2 +-
 mm/frontswap.c           |  13 ++--
 mm/swapfile.c            | 171 ++++++++++++++++++++---------------------------
 4 files changed, 78 insertions(+), 115 deletions(-)

diff --git a/include/linux/swap.h b/include/linux/swap.h
index 46ba0c6..8eaec00 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -214,8 +214,8 @@ struct percpu_cluster {
 struct swap_info_struct {
        unsigned long   flags;          /* SWP_USED etc: see above */
        signed short    prio;           /* swap priority of this type */
+       struct list_head list;          /* entry in swap list */
        signed char     type;           /* strange name for an index */
-       signed char     next;           /* next type on the swap list */
        unsigned int    max;            /* extent of the swap_map */
        unsigned char *swap_map;        /* vmalloc'ed array of usage counts */
        struct swap_cluster_info *cluster_info; /* cluster info. Only for SSD */
@@ -255,11 +255,6 @@ struct swap_info_struct {
        struct swap_cluster_info discard_cluster_tail; /* list tail of discard 
clusters */
 };
 
-struct swap_list_t {
-       int head;       /* head of priority-ordered swapfile list */
-       int next;       /* swapfile to be used next */
-};
-
 /* linux/mm/page_alloc.c */
 extern unsigned long totalram_pages;
 extern unsigned long totalreserve_pages;
diff --git a/include/linux/swapfile.h b/include/linux/swapfile.h
index e282624..2eab382 100644
--- a/include/linux/swapfile.h
+++ b/include/linux/swapfile.h
@@ -6,7 +6,7 @@
  * want to expose them to the dozens of source files that include swap.h
  */
 extern spinlock_t swap_lock;
-extern struct swap_list_t swap_list;
+extern struct list_head swap_list_head;
 extern struct swap_info_struct *swap_info[];
 extern int try_to_unuse(unsigned int, bool, unsigned long);
 
diff --git a/mm/frontswap.c b/mm/frontswap.c
index 1b24bdc..fae1160 100644
--- a/mm/frontswap.c
+++ b/mm/frontswap.c
@@ -327,15 +327,12 @@ EXPORT_SYMBOL(__frontswap_invalidate_area);
 
 static unsigned long __frontswap_curr_pages(void)
 {
-       int type;
        unsigned long totalpages = 0;
        struct swap_info_struct *si = NULL;
 
        assert_spin_locked(&swap_lock);
-       for (type = swap_list.head; type >= 0; type = si->next) {
-               si = swap_info[type];
+       list_for_each_entry(si, &swap_list_head, list)
                totalpages += atomic_read(&si->frontswap_pages);
-       }
        return totalpages;
 }
 
@@ -347,11 +344,9 @@ static int __frontswap_unuse_pages(unsigned long total, 
unsigned long *unused,
        int si_frontswap_pages;
        unsigned long total_pages_to_unuse = total;
        unsigned long pages = 0, pages_to_unuse = 0;
-       int type;
 
        assert_spin_locked(&swap_lock);
-       for (type = swap_list.head; type >= 0; type = si->next) {
-               si = swap_info[type];
+       list_for_each_entry(si, &swap_list_head, list) {
                si_frontswap_pages = atomic_read(&si->frontswap_pages);
                if (total_pages_to_unuse < si_frontswap_pages) {
                        pages = pages_to_unuse = total_pages_to_unuse;
@@ -366,7 +361,7 @@ static int __frontswap_unuse_pages(unsigned long total, 
unsigned long *unused,
                }
                vm_unacct_memory(pages);
                *unused = pages_to_unuse;
-               *swapid = type;
+               *swapid = si->type;
                ret = 0;
                break;
        }
@@ -413,7 +408,7 @@ void frontswap_shrink(unsigned long target_pages)
        /*
         * we don't want to hold swap_lock while doing a very
         * lengthy try_to_unuse, but swap_list may change
-        * so restart scan from swap_list.head each time
+        * so restart scan from swap_list_head each time
         */
        spin_lock(&swap_lock);
        ret = __frontswap_shrink(target_pages, &pages_to_unuse, &type);
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 0ec2eaf..d40f39e 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -51,14 +51,17 @@ atomic_long_t nr_swap_pages;
 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
 static int least_priority;
-static atomic_t highest_priority_index = ATOMIC_INIT(-1);
 
 static const char Bad_file[] = "Bad swap file entry ";
 static const char Unused_file[] = "Unused swap file entry ";
 static const char Bad_offset[] = "Bad swap offset entry ";
 static const char Unused_offset[] = "Unused swap offset entry ";
 
-struct swap_list_t swap_list = {-1, -1};
+/*
+ * all active swap_info_structs
+ * protected with swap_lock, and ordered by priority.
+ */
+LIST_HEAD(swap_list_head);
 
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
 
@@ -639,66 +642,54 @@ no_page:
 
 swp_entry_t get_swap_page(void)
 {
-       struct swap_info_struct *si;
+       struct swap_info_struct *si, *next;
        pgoff_t offset;
-       int type, next;
-       int wrapped = 0;
-       int hp_index;
+       struct list_head *tmp;
 
        spin_lock(&swap_lock);
        if (atomic_long_read(&nr_swap_pages) <= 0)
                goto noswap;
        atomic_long_dec(&nr_swap_pages);
 
-       for (type = swap_list.next; type >= 0 && wrapped < 2; type = next) {
-               hp_index = atomic_xchg(&highest_priority_index, -1);
-               /*
-                * highest_priority_index records current highest priority swap
-                * type which just frees swap entries. If its priority is
-                * higher than that of swap_list.next swap type, we use it.  It
-                * isn't protected by swap_lock, so it can be an invalid value
-                * if the corresponding swap type is swapoff. We double check
-                * the flags here. It's even possible the swap type is swapoff
-                * and swapon again and its priority is changed. In such rare
-                * case, low prority swap type might be used, but eventually
-                * high priority swap will be used after several rounds of
-                * swap.
-                */
-               if (hp_index != -1 && hp_index != type &&
-                   swap_info[type]->prio < swap_info[hp_index]->prio &&
-                   (swap_info[hp_index]->flags & SWP_WRITEOK)) {
-                       type = hp_index;
-                       swap_list.next = type;
-               }
-
-               si = swap_info[type];
-               next = si->next;
-               if (next < 0 ||
-                   (!wrapped && si->prio != swap_info[next]->prio)) {
-                       next = swap_list.head;
-                       wrapped++;
-               }
-
+       list_for_each(tmp, &swap_list_head) {
+               si = list_entry(tmp, typeof(*si), list);
                spin_lock(&si->lock);
-               if (!si->highest_bit) {
-                       spin_unlock(&si->lock);
-                       continue;
-               }
-               if (!(si->flags & SWP_WRITEOK)) {
+               if (!si->highest_bit || !(si->flags & SWP_WRITEOK)) {
                        spin_unlock(&si->lock);
                        continue;
                }
 
-               swap_list.next = next;
+               /*
+                * rotate the current swap_info that we're going to use
+                * to after any other swap_info that have the same prio,
+                * so that all equal-priority swap_info get used equally
+                */
+               next = si;
+               list_for_each_entry_continue(next, &swap_list_head, list) {
+                       if (si->prio != next->prio)
+                               break;
+                       list_rotate_left(&si->list);
+                       next = si;
+               }
 
                spin_unlock(&swap_lock);
                /* This is called for allocating swap entry for cache */
                offset = scan_swap_map(si, SWAP_HAS_CACHE);
                spin_unlock(&si->lock);
                if (offset)
-                       return swp_entry(type, offset);
+                       return swp_entry(si->type, offset);
                spin_lock(&swap_lock);
-               next = swap_list.next;
+               /*
+                * if we got here, it's likely that si was almost full before,
+                * and since scan_swap_map() can drop the si->lock, multiple
+                * callers probably all tried to get a page from the same si
+                * and it filled up before we could get one.  So we need to
+                * try again.  Since we dropped the swap_lock, there may now
+                * be non-full higher priority swap_infos, and this si may have
+                * even been removed from the list (although very unlikely).
+                * Let's start over.
+                */
+               tmp = &swap_list_head;
        }
 
        atomic_long_inc(&nr_swap_pages);
@@ -765,27 +756,6 @@ out:
        return NULL;
 }
 
-/*
- * This swap type frees swap entry, check if it is the highest priority swap
- * type which just frees swap entry. get_swap_page() uses
- * highest_priority_index to search highest priority swap type. The
- * swap_info_struct.lock can't protect us if there are multiple swap types
- * active, so we use atomic_cmpxchg.
- */
-static void set_highest_priority_index(int type)
-{
-       int old_hp_index, new_hp_index;
-
-       do {
-               old_hp_index = atomic_read(&highest_priority_index);
-               if (old_hp_index != -1 &&
-                       swap_info[old_hp_index]->prio >= swap_info[type]->prio)
-                       break;
-               new_hp_index = type;
-       } while (atomic_cmpxchg(&highest_priority_index,
-               old_hp_index, new_hp_index) != old_hp_index);
-}
-
 static unsigned char swap_entry_free(struct swap_info_struct *p,
                                     swp_entry_t entry, unsigned char usage)
 {
@@ -829,7 +799,6 @@ static unsigned char swap_entry_free(struct 
swap_info_struct *p,
                        p->lowest_bit = offset;
                if (offset > p->highest_bit)
                        p->highest_bit = offset;
-               set_highest_priority_index(p->type);
                atomic_long_inc(&nr_swap_pages);
                p->inuse_pages--;
                frontswap_invalidate_page(p->type, offset);
@@ -1764,7 +1733,7 @@ static void _enable_swap_info(struct swap_info_struct *p, 
int prio,
                                unsigned char *swap_map,
                                struct swap_cluster_info *cluster_info)
 {
-       int i, prev;
+       struct swap_info_struct *si;
 
        if (prio >= 0)
                p->prio = prio;
@@ -1776,18 +1745,28 @@ static void _enable_swap_info(struct swap_info_struct 
*p, int prio,
        atomic_long_add(p->pages, &nr_swap_pages);
        total_swap_pages += p->pages;
 
-       /* insert swap space into swap_list: */
-       prev = -1;
-       for (i = swap_list.head; i >= 0; i = swap_info[i]->next) {
-               if (p->prio >= swap_info[i]->prio)
-                       break;
-               prev = i;
+       assert_spin_locked(&swap_lock);
+       BUG_ON(!list_empty(&p->list));
+       /*
+        * insert into swap list; the list is in priority order,
+        * so that get_swap_page() can get a page from the highest
+        * priority swap_info_struct with available page(s), and
+        * swapoff can adjust the auto-assigned (i.e. negative) prio
+        * values for any lower-priority swap_info_structs when
+        * removing a negative-prio swap_info_struct
+        */
+       list_for_each_entry(si, &swap_list_head, list) {
+               if (p->prio >= si->prio) {
+                       list_add_tail(&p->list, &si->list);
+                       return;
+               }
        }
-       p->next = i;
-       if (prev < 0)
-               swap_list.head = swap_list.next = p->type;
-       else
-               swap_info[prev]->next = p->type;
+       /*
+        * this covers two cases:
+        * 1) p->prio is less than all existing prio
+        * 2) the swap list is empty
+        */
+       list_add_tail(&p->list, &swap_list_head);
 }
 
 static void enable_swap_info(struct swap_info_struct *p, int prio,
@@ -1822,8 +1801,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        struct address_space *mapping;
        struct inode *inode;
        struct filename *pathname;
-       int i, type, prev;
-       int err;
+       int err, found = 0;
        unsigned int old_block_size;
 
        if (!capable(CAP_SYS_ADMIN))
@@ -1841,17 +1819,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
specialfile)
                goto out;
 
        mapping = victim->f_mapping;
-       prev = -1;
        spin_lock(&swap_lock);
-       for (type = swap_list.head; type >= 0; type = swap_info[type]->next) {
-               p = swap_info[type];
+       list_for_each_entry(p, &swap_list_head, list) {
                if (p->flags & SWP_WRITEOK) {
-                       if (p->swap_file->f_mapping == mapping)
+                       if (p->swap_file->f_mapping == mapping) {
+                               found = 1;
                                break;
+                       }
                }
-               prev = type;
        }
-       if (type < 0) {
+       if (!found) {
                err = -EINVAL;
                spin_unlock(&swap_lock);
                goto out_dput;
@@ -1863,20 +1840,16 @@ SYSCALL_DEFINE1(swapoff, const char __user *, 
specialfile)
                spin_unlock(&swap_lock);
                goto out_dput;
        }
-       if (prev < 0)
-               swap_list.head = p->next;
-       else
-               swap_info[prev]->next = p->next;
-       if (type == swap_list.next) {
-               /* just pick something that's safe... */
-               swap_list.next = swap_list.head;
-       }
        spin_lock(&p->lock);
        if (p->prio < 0) {
-               for (i = p->next; i >= 0; i = swap_info[i]->next)
-                       swap_info[i]->prio = p->prio--;
+               struct swap_info_struct *si = p;
+
+               list_for_each_entry_continue(si, &swap_list_head, list) {
+                       si->prio++;
+               }
                least_priority++;
        }
+       list_del_init(&p->list);
        atomic_long_sub(p->pages, &nr_swap_pages);
        total_swap_pages -= p->pages;
        p->flags &= ~SWP_WRITEOK;
@@ -1884,7 +1857,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        spin_unlock(&swap_lock);
 
        set_current_oom_origin();
-       err = try_to_unuse(type, false, 0); /* force all pages to be unused */
+       err = try_to_unuse(p->type, false, 0); /* force unuse all pages */
        clear_current_oom_origin();
 
        if (err) {
@@ -1926,7 +1899,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        frontswap_map_set(p, NULL);
        spin_unlock(&p->lock);
        spin_unlock(&swap_lock);
-       frontswap_invalidate_area(type);
+       frontswap_invalidate_area(p->type);
        mutex_unlock(&swapon_mutex);
        free_percpu(p->percpu_cluster);
        p->percpu_cluster = NULL;
@@ -1934,7 +1907,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
        vfree(cluster_info);
        vfree(frontswap_map);
        /* Destroy swap account informatin */
-       swap_cgroup_swapoff(type);
+       swap_cgroup_swapoff(p->type);
 
        inode = mapping->host;
        if (S_ISBLK(inode->i_mode)) {
@@ -2141,8 +2114,8 @@ static struct swap_info_struct *alloc_swap_info(void)
                 */
        }
        INIT_LIST_HEAD(&p->first_swap_extent.list);
+       INIT_LIST_HEAD(&p->list);
        p->flags = SWP_USED;
-       p->next = -1;
        spin_unlock(&swap_lock);
        spin_lock_init(&p->lock);
 
-- 
1.8.4.5

--
To unsubscribe from this list: send the line "unsubscribe stable" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html

Reply via email to