Currently tcache looks up a page in the radix tree under
'tcache_node_tree->lock'. After the page is found, it has to be deleted from
the LRU list, which requires taking another lock, 'tcache_nodeinfo->lock',
while still holding 'tcache_node_tree->lock'.

So let's say we have gigabytes of data in the tcache and the node is under
memory pressure. And then direct_io() happens, so we need to invalidate
gigabytes of data. The tcache_node_tree and tcache_nodeinfo locks become so
contended that we might hit a lockup:
        Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2
         [<ffffffff811f559c>] __tcache_page_tree_delete+0x1c/0x1b0
         [<ffffffff811f6152>] tcache_invalidate_node_pages+0x72/0x1c0
         [<ffffffff811f6716>] tcache_cleancache_invalidate_inode+0x166/0x370
         [<ffffffff811f1b83>] __cleancache_invalidate_inode+0x83/0xa0
         [<ffffffff8118b311>] invalidate_inode_pages2_range+0x1c1/0x430
         [<ffffffff8118b597>] invalidate_inode_pages2+0x17/0x20
         [<ffffffffa0217673>] dio_invalidate_cache+0x23/0xf0 [pio_direct]
         [<ffffffffa0217818>] dio_prepare_merge+0xd8/0x1d0 [pio_direct]
         [<ffffffffa01ef35b>] ploop1_prepare_merge+0xab/0xd0 [pfmt_ploop1]
         [<ffffffffa02cd8d4>] ploop_ioctl+0x1194/0x2760 [ploop]
         [<ffffffff8120a222>] ? path_openat+0xc2/0x460
         [<ffffffff8120b8c2>] ? user_path_at_empty+0x72/0xc0
         [<ffffffff812d4a2f>] blkdev_ioctl+0x2df/0x770
         [<ffffffff81236bf1>] block_ioctl+0x41/0x50
         [<ffffffff8120da75>] do_vfs_ioctl+0x255/0x4f0
         [<ffffffff81218897>] ? __fd_install+0x47/0x60
         [<ffffffff8120dd64>] SyS_ioctl+0x54/0xa0
         [<ffffffff816449c9>] system_call_fastpath+0x16/0x1

Tcache is basically a copy of the page-cache, so we could just implement
something like the page-cache lockless protocol, but for tcache.

So lookups now only require the RCU read lock to be held, and the
tcache_node_tree lock is needed only for deletion from the radix tree.
This also allows us to get rid of the nested locking.

A simple test that runs in a small memcg and just reads big files in parallel
shows some improvement with this patch (elapsed time drops from ~275s to
~255s, roughly 7%).

Before:
      10826.662747      task-clock (msec)         #    0.039 CPUs utilized
            52,324      context-switches          #    0.005 M/sec
             1,178      cpu-migrations            #    0.109 K/sec
            19,392      page-faults               #    0.002 M/sec

     275.361955913 seconds time elapsed

After:
      11088.596109      task-clock (msec)         #    0.043 CPUs utilized
            51,313      context-switches          #    0.005 M/sec
             1,048      cpu-migrations            #    0.095 K/sec
            19,355      page-faults               #    0.002 M/sec

     255.423624009 seconds time elapsed

https://jira.sw.ru/browse/PSBM-64727
Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 mm/tcache.c | 206 +++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 129 insertions(+), 77 deletions(-)

diff --git a/mm/tcache.c b/mm/tcache.c
index 3778523bbce..a77e3cfc58d 100644
--- a/mm/tcache.c
+++ b/mm/tcache.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/rwsem.h>
+#include <linux/pagemap.h>
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
 #include <linux/idr.h>
@@ -703,17 +704,10 @@ static inline void tcache_init_page(struct page *page,
        page->index = index;
 }
 
-static inline void tcache_hold_page(struct page *page)
-{
-       get_page(page);
-}
-
 static inline void tcache_put_page(struct page *page)
 {
-       if (put_page_testzero(page)) {
-               page->mapping = NULL;   /* to make free_pages_check happy */
-               free_hot_cold_page(page, false);
-       }
+       page->mapping = NULL;
+       free_hot_cold_page(page, false);
 }
 
 static int tcache_page_tree_insert(struct tcache_node *node, pgoff_t index,
@@ -745,6 +739,11 @@ out:
 static struct page *__tcache_page_tree_delete(struct tcache_node *node,
                                              pgoff_t index, struct page *page)
 {
+       if (!page_ref_freeze(page, 2)) {
+               put_page(page);
+               return NULL;
+       }
+
        page = radix_tree_delete_item(&node->page_tree, index, page);
        if (page) {
                if (!--node->nr_pages)
@@ -779,12 +778,10 @@ tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
 
        spin_lock_irqsave(&node->tree_lock, flags);
        err = tcache_page_tree_insert(node, index, page);
-       if (!err) {
-               tcache_hold_page(page);
+       spin_unlock(&node->tree_lock);
+       if (!err)
                tcache_lru_add(node->pool, page);
-       }
-
-       spin_unlock_irqrestore(&node->tree_lock, flags);
+       local_irq_restore(flags);
        return err;
 }
 
@@ -795,61 +792,127 @@ tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
 static struct page *tcache_detach_page(struct tcache_node *node, pgoff_t index,
                                       bool reused)
 {
+       void **pagep;
        unsigned long flags;
        struct page *page;
 
-       local_irq_save(flags);
-       page = tcache_page_tree_delete(node, index, NULL);
-       if (page)
-               tcache_lru_del(node->pool, page, reused);
-       local_irq_restore(flags);
+       rcu_read_lock();
+repeat:
+       page = NULL;
+       pagep = radix_tree_lookup_slot(&node->page_tree, index);
+       if (pagep) {
+               page = radix_tree_deref_slot(pagep);
+               if (unlikely(!page))
+                       goto out;
+               if (radix_tree_exception(page)) {
+                       if (radix_tree_deref_retry(page))
+                               goto repeat;
+                       WARN_ON(1);
+               }
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+               /*
+                * Has the page moved?
+                * This is part of the lockless pagecache protocol. See
+                * include/linux/pagemap.h for details.
+                */
+               if (unlikely(page != *pagep)) {
+                       put_page(page);
+                       goto repeat;
+               }
+       }
+out:
+       rcu_read_unlock();
+
+       if (page) {
+               local_irq_save(flags);
+               page = tcache_page_tree_delete(node, index, page);
+               if (page)
+                       tcache_lru_del(node->pool, page, reused);
+               local_irq_restore(flags);
+       }
 
        return page;
 }
 
-static noinline_for_stack void
-tcache_invalidate_node_pages(struct tcache_node *node)
+static unsigned tcache_lookup(struct page **pages, struct tcache_node *node,
+                       pgoff_t start, unsigned int nr_pages, pgoff_t *indices)
 {
        struct radix_tree_iter iter;
-       struct page *page;
+       unsigned int ret = 0;
        void **slot;
-       pgoff_t index = 0;
 
-       spin_lock_irq(&node->tree_lock);
+       if (!nr_pages)
+               return 0;
+
+       rcu_read_lock();
+restart:
+       radix_tree_for_each_slot(slot, &node->page_tree, &iter, start) {
+               struct page *page;
+repeat:
+               page = radix_tree_deref_slot(slot);
+               if (unlikely(!page))
+                       continue;
+
+               if (radix_tree_exception(page) && radix_tree_deref_retry(page))
+                       goto restart;
+
+               if (!page_cache_get_speculative(page))
+                       goto repeat;
+
+               /* Has the page moved? */
+               if (unlikely(page != *slot)) {
+                       page_cache_release(page);
+                       goto repeat;
+               }
+
+               indices[ret] = iter.index;
+               pages[ret] = page;
+               if (++ret == nr_pages)
+                       break;
+       }
+       rcu_read_unlock();
+       return ret;
+}
+
+#define TCACHE_PAGEVEC_SIZE 16
+static noinline_for_stack void
+tcache_invalidate_node_pages(struct tcache_node *node)
+{
+       pgoff_t indices[TCACHE_PAGEVEC_SIZE];
+       struct page *pages[TCACHE_PAGEVEC_SIZE];
+       pgoff_t index = 0;
+       unsigned nr_pages;
+       int i;
 
        /*
         * First forbid new page insertions - see tcache_page_tree_replace.
         */
        node->invalidated = true;
 
-       /*
-        * Now truncate all pages. Be careful, because pages can still be
-        * deleted from this node by the shrinker or by concurrent lookups.
-        */
-restart:
-       radix_tree_for_each_slot(slot, &node->page_tree, &iter, index) {
-               page = radix_tree_deref_slot_protected(slot, &node->tree_lock);
-               BUG_ON(!__tcache_page_tree_delete(node, page->index, page));
-               tcache_lru_del(node->pool, page, false);
-               tcache_put_page(page);
-
-               if (need_resched()) {
-                       spin_unlock_irq(&node->tree_lock);
-                       cond_resched();
+       while ((nr_pages = tcache_lookup(pages, node, index,
+                                               TCACHE_PAGEVEC_SIZE, indices))) 
{
+               for (i = 0; i < nr_pages; i++) {
+                       struct page *page = pages[i];
+
+                       index = indices[i];
+
                        spin_lock_irq(&node->tree_lock);
-                       /*
-                        * Restart iteration over the radix tree, because the
-                        * current node could have been freed when we dropped
-                        * the lock.
-                        */
-                       index = iter.index + 1;
-                       goto restart;
+                       page = __tcache_page_tree_delete(node, page->index, 
page);
+                       spin_unlock(&node->tree_lock);
+
+                       if (page) {
+                               tcache_lru_del(node->pool, page, false);
+                               local_irq_enable();
+                               tcache_put_page(page);
+                       } else
+                               local_irq_enable();
                }
+               cond_resched();
+               index++;
        }
 
-       BUG_ON(node->nr_pages != 0);
-
-       spin_unlock_irq(&node->tree_lock);
+       WARN_ON(node->nr_pages != 0);
 }
 
 static noinline_for_stack void
@@ -932,12 +995,16 @@ __tcache_lru_isolate(struct tcache_nodeinfo *ni,
        struct tcache_node *node;
        struct page *page;
        int nr_isolated = 0;
+       int nr_scanned = nr_to_isolate;
 
-       while (nr_to_isolate > 0 && !list_empty(&pni->lru)) {
+       while (nr_to_isolate > 0 && !list_empty(&pni->lru) && nr_scanned--) {
                page = list_first_entry(&pni->lru, struct page, lru);
+
+               if (unlikely(!page_cache_get_speculative(page)))
+                       continue;
+
                __tcache_lru_del(ni, pni, page);
 
-               tcache_hold_page(page);
                /*
                 * A node can be destroyed only if all its pages have been
                 * removed both from the tree and the LRU list. Since we are
@@ -976,7 +1043,7 @@ again:
        if (!tcache_grab_pool(pni->pool))
                goto again;
 
-       nr = __tcache_lru_isolate(ni, pni, pages + nr_isolated, nr_to_isolate);
+       nr = __tcache_lru_isolate(ni, pni, pages, nr_to_isolate);
        nr_isolated += nr;
        nr_to_isolate -= nr;
 
@@ -984,9 +1051,6 @@ again:
                __tcache_insert_reclaim_node(ni, pni);
 
        tcache_put_pool(pni->pool);
-
-       if (nr_to_isolate > 0)
-               goto again;
 out:
        spin_unlock_irq(&ni->lock);
        return nr_isolated;
@@ -998,18 +1062,7 @@ static bool __tcache_reclaim_page(struct page *page)
        bool ret;
 
        node = tcache_page_node(page);
-       if (tcache_page_tree_delete(node, page->index, page)) {
-               /*
-                * We deleted the page from the tree - drop the
-                * corresponding reference.
-                */
-               tcache_put_page(page);
-               ret = true;
-       } else
-               /* The page was deleted by a concurrent thread - abort. */
-               ret = false;
-
-       /* Drop the reference taken in __tcache_lru_isolate. */
+       ret = tcache_page_tree_delete(node, page->index, page);
        tcache_put_node_and_pool(node);
        return ret;
 }
@@ -1021,9 +1074,10 @@ static int tcache_reclaim_pages(struct page **pages, int nr)
 
        local_irq_disable();
        for (i = 0; i < nr; i++) {
-               nr_reclaimed += !!__tcache_reclaim_page(pages[i]);
-               /* Drop the reference taken in __tcache_lru_isolate. */
-               tcache_put_page(pages[i]);
+               if (__tcache_reclaim_page(pages[i])) {
+                       nr_reclaimed++;
+                       tcache_put_page(pages[i]);
+               }
                pages[i] = NULL;
        }
        local_irq_enable();
@@ -1048,10 +1102,10 @@ tcache_try_to_reclaim_page(struct tcache_pool *pool, int nid)
        if (!ret)
                goto out;
 
-       if (!__tcache_reclaim_page(page)) {
-               tcache_put_page(page);
+       if (!__tcache_reclaim_page(page))
                page = NULL;
-       }
+       else
+               page_ref_unfreeze(page, 1);
 out:
        local_irq_restore(flags);
        return page;
@@ -1135,13 +1189,11 @@ static void tcache_cleancache_put_page(int pool_id,
                cache_page = tcache_alloc_page(node->pool);
                if (cache_page) {
                        copy_highpage(cache_page, page);
-                       /* cleancache does not care about failures */
-                       (void)tcache_attach_page(node, index, cache_page);
+                       if (tcache_attach_page(node, index, cache_page))
+                               if (put_page_testzero(cache_page))
+                                       tcache_put_page(page);
                }
                tcache_put_node_and_pool(node);
-               if (cache_page)
-                       tcache_put_page(cache_page);
-
        }
 }
 
-- 
2.13.0

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel

Reply via email to