Currently, tcache looks up a page in the radix tree under 'tcache_node_tree->lock'.
After the page is found, it has to be deleted from the LRU list, which requires
taking another lock, 'tcache_nodeinfo->lock', nested under 'tcache_node_tree->lock'.
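In code terms, the pre-patch pattern is roughly this (a simplified sketch of
the locking order only, not the actual code; 'ni' stands for the struct
tcache_nodeinfo):

	/* Sketch only: the old nested locking on the invalidation path. */
	spin_lock_irq(&node->tree_lock);	/* 'tcache_node_tree->lock' */
	page = radix_tree_lookup(&node->page_tree, index);
	if (page) {
		spin_lock(&ni->lock);		/* 'tcache_nodeinfo->lock', nested */
		list_del_init(&page->lru);	/* unlink from the LRU list */
		spin_unlock(&ni->lock);
	}
	spin_unlock_irq(&node->tree_lock);

Every page removed this way bounces both locks, which is what makes bulk
invalidation so painful.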
So let's say we have gigabytes of data in the tcache and the node is under
memory pressure. And then direct_io() happens, so we need to invalidate
gigabytes of data. The tcache_node_tree and tcache_nodeinfo locks become so
contended that we might hit a hard lockup:

Kernel panic - not syncing: Watchdog detected hard LOCKUP on cpu 2
 [<ffffffff811f559c>] __tcache_page_tree_delete+0x1c/0x1b0
 [<ffffffff811f6152>] tcache_invalidate_node_pages+0x72/0x1c0
 [<ffffffff811f6716>] tcache_cleancache_invalidate_inode+0x166/0x370
 [<ffffffff811f1b83>] __cleancache_invalidate_inode+0x83/0xa0
 [<ffffffff8118b311>] invalidate_inode_pages2_range+0x1c1/0x430
 [<ffffffff8118b597>] invalidate_inode_pages2+0x17/0x20
 [<ffffffffa0217673>] dio_invalidate_cache+0x23/0xf0 [pio_direct]
 [<ffffffffa0217818>] dio_prepare_merge+0xd8/0x1d0 [pio_direct]
 [<ffffffffa01ef35b>] ploop1_prepare_merge+0xab/0xd0 [pfmt_ploop1]
 [<ffffffffa02cd8d4>] ploop_ioctl+0x1194/0x2760 [ploop]
 [<ffffffff8120a222>] ? path_openat+0xc2/0x460
 [<ffffffff8120b8c2>] ? user_path_at_empty+0x72/0xc0
 [<ffffffff812d4a2f>] blkdev_ioctl+0x2df/0x770
 [<ffffffff81236bf1>] block_ioctl+0x41/0x50
 [<ffffffff8120da75>] do_vfs_ioctl+0x255/0x4f0
 [<ffffffff81218897>] ? __fd_install+0x47/0x60
 [<ffffffff8120dd64>] SyS_ioctl+0x54/0xa0
 [<ffffffff816449c9>] system_call_fastpath+0x16/0x1

Tcache is basically a copy of the page cache, so we can just implement
something like the lockless page-cache protocol, but for tcache. Lookups now
require only the RCU read lock to be held, and the tcache_node_tree lock is
needed only for deletion from the radix tree. This also allows us to get rid
of the nested-lock situation.

A simple test that runs in a small memcg and just reads big files in parallel
shows some improvement with this patch.

Before:
      10826.662747 task-clock (msec)        #    0.039 CPUs utilized
            52,324 context-switches         #    0.005 M/sec
             1,178 cpu-migrations           #    0.109 K/sec
            19,392 page-faults              #    0.002 M/sec
     275.361955913 seconds time elapsed

After:
      11088.596109 task-clock (msec)        #    0.043 CPUs utilized
            51,313 context-switches         #    0.005 M/sec
             1,048 cpu-migrations           #    0.095 K/sec
            19,355 page-faults              #    0.002 M/sec
     255.423624009 seconds time elapsed

https://jira.sw.ru/browse/PSBM-64727

Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 mm/tcache.c | 206 +++++++++++++++++++++++++++++++++++++-----------------------
 1 file changed, 129 insertions(+), 77 deletions(-)
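For reference, the lockless lookup adopted below boils down to roughly the
following (an illustrative sketch, not part of the patch; the helper name
tcache_lookup_lockless is made up for this sketch, the real code is in
tcache_detach_page()/tcache_lookup() below, cf. find_get_page() and
include/linux/pagemap.h):

	/* Sketch only: lockless-pagecache-style lookup under RCU. */
	static struct page *tcache_lookup_lockless(struct tcache_node *node,
						   pgoff_t index)
	{
		struct page *page;
		void **pagep;

		rcu_read_lock();
	repeat:
		page = NULL;
		pagep = radix_tree_lookup_slot(&node->page_tree, index);
		if (pagep) {
			page = radix_tree_deref_slot(pagep);
			/* The tree was reorganized under us - start over. */
			if (radix_tree_exception(page) &&
			    radix_tree_deref_retry(page))
				goto repeat;
			/* Take a reference unless the refcount hit zero. */
			if (page && !page_cache_get_speculative(page))
				goto repeat;
			/* Recheck: the slot may have been reassigned. */
			if (page && unlikely(page != *pagep)) {
				put_page(page);
				goto repeat;
			}
		}
		rcu_read_unlock();
		return page;	/* referenced page, or NULL */
	}

The deletion side pairs with this: __tcache_page_tree_delete() first freezes
the refcount with page_ref_freeze(page, 2) under node->tree_lock, so a page
being removed can no longer be grabbed by a speculative lookup, and the tree
lock no longer needs to nest inside the LRU lock.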
diff --git a/mm/tcache.c b/mm/tcache.c
index 3778523bbce..a77e3cfc58d 100644
--- a/mm/tcache.c
+++ b/mm/tcache.c
@@ -15,6 +15,7 @@
 #include <linux/spinlock.h>
 #include <linux/rcupdate.h>
 #include <linux/rwsem.h>
+#include <linux/pagemap.h>
 #include <linux/rbtree.h>
 #include <linux/radix-tree.h>
 #include <linux/idr.h>
@@ -703,17 +704,10 @@ static inline void tcache_init_page(struct page *page,
 	page->index = index;
 }
 
-static inline void tcache_hold_page(struct page *page)
-{
-	get_page(page);
-}
-
 static inline void tcache_put_page(struct page *page)
 {
-	if (put_page_testzero(page)) {
-		page->mapping = NULL; /* to make free_pages_check happy */
-		free_hot_cold_page(page, false);
-	}
+	page->mapping = NULL;
+	free_hot_cold_page(page, false);
 }
 
 static int tcache_page_tree_insert(struct tcache_node *node, pgoff_t index,
@@ -745,6 +739,11 @@ out:
 static struct page *__tcache_page_tree_delete(struct tcache_node *node,
 					      pgoff_t index, struct page *page)
 {
+	if (!page_ref_freeze(page, 2)) {
+		put_page(page);
+		return NULL;
+	}
+
 	page = radix_tree_delete_item(&node->page_tree, index, page);
 	if (page) {
 		if (!--node->nr_pages)
@@ -779,12 +778,10 @@ tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
 	spin_lock_irqsave(&node->tree_lock, flags);
 
 	err = tcache_page_tree_insert(node, index, page);
-	if (!err) {
-		tcache_hold_page(page);
+	spin_unlock(&node->tree_lock);
+	if (!err)
 		tcache_lru_add(node->pool, page);
-	}
-
-	spin_unlock_irqrestore(&node->tree_lock, flags);
+	local_irq_restore(flags);
 
 	return err;
 }
@@ -795,61 +792,127 @@ tcache_attach_page(struct tcache_node *node, pgoff_t index, struct page *page)
 static struct page *tcache_detach_page(struct tcache_node *node, pgoff_t index,
 				       bool reused)
 {
+	void **pagep;
 	unsigned long flags;
 	struct page *page;
 
-	local_irq_save(flags);
-	page = tcache_page_tree_delete(node, index, NULL);
-	if (page)
-		tcache_lru_del(node->pool, page, reused);
-	local_irq_restore(flags);
+	rcu_read_lock();
+repeat:
+	page = NULL;
+	pagep = radix_tree_lookup_slot(&node->page_tree, index);
+	if (pagep) {
+		page = radix_tree_deref_slot(pagep);
+		if (unlikely(!page))
+			goto out;
+		if (radix_tree_exception(page)) {
+			if (radix_tree_deref_retry(page))
+				goto repeat;
+			WARN_ON(1);
+		}
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/*
+		 * Has the page moved?
+		 * This is part of the lockless pagecache protocol. See
+		 * include/linux/pagemap.h for details.
+		 */
+		if (unlikely(page != *pagep)) {
+			put_page(page);
+			goto repeat;
+		}
+	}
+out:
+	rcu_read_unlock();
+
+	if (page) {
+		local_irq_save(flags);
+		page = tcache_page_tree_delete(node, index, page);
+		if (page)
+			tcache_lru_del(node->pool, page, reused);
+		local_irq_restore(flags);
+	}
 
 	return page;
 }
 
-static noinline_for_stack void
-tcache_invalidate_node_pages(struct tcache_node *node)
+static unsigned tcache_lookup(struct page **pages, struct tcache_node *node,
+			      pgoff_t start, unsigned int nr_pages, pgoff_t *indices)
 {
 	struct radix_tree_iter iter;
-	struct page *page;
+	unsigned int ret = 0;
 	void **slot;
-	pgoff_t index = 0;
 
-	spin_lock_irq(&node->tree_lock);
+	if (!nr_pages)
+		return 0;
+
+	rcu_read_lock();
+restart:
+	radix_tree_for_each_slot(slot, &node->page_tree, &iter, start) {
+		struct page *page;
+repeat:
+		page = radix_tree_deref_slot(slot);
+		if (unlikely(!page))
+			continue;
+
+		if (radix_tree_exception(page) && radix_tree_deref_retry(page))
+			goto restart;
+
+		if (!page_cache_get_speculative(page))
+			goto repeat;
+
+		/* Has the page moved? */
+		if (unlikely(page != *slot)) {
+			page_cache_release(page);
+			goto repeat;
+		}
+
+		indices[ret] = iter.index;
+		pages[ret] = page;
+		if (++ret == nr_pages)
+			break;
+	}
+	rcu_read_unlock();
+	return ret;
+}
+
+#define TCACHE_PAGEVEC_SIZE 16
+static noinline_for_stack void
+tcache_invalidate_node_pages(struct tcache_node *node)
+{
+	pgoff_t indices[TCACHE_PAGEVEC_SIZE];
+	struct page *pages[TCACHE_PAGEVEC_SIZE];
+	pgoff_t index = 0;
+	unsigned nr_pages;
+	int i;
 
 	/*
 	 * First forbid new page insertions - see tcache_page_tree_replace.
 	 */
 	node->invalidated = true;
 
-	/*
-	 * Now truncate all pages. Be careful, because pages can still be
-	 * deleted from this node by the shrinker or by concurrent lookups.
-	 */
-restart:
-	radix_tree_for_each_slot(slot, &node->page_tree, &iter, index) {
-		page = radix_tree_deref_slot_protected(slot, &node->tree_lock);
-		BUG_ON(!__tcache_page_tree_delete(node, page->index, page));
-		tcache_lru_del(node->pool, page, false);
-		tcache_put_page(page);
-
-		if (need_resched()) {
-			spin_unlock_irq(&node->tree_lock);
-			cond_resched();
+	while ((nr_pages = tcache_lookup(pages, node, index,
+					 TCACHE_PAGEVEC_SIZE, indices))) {
+		for (i = 0; i < nr_pages; i++) {
+			struct page *page = pages[i];
+
+			index = indices[i];
+
 			spin_lock_irq(&node->tree_lock);
-			/*
-			 * Restart iteration over the radix tree, because the
-			 * current node could have been freed when we dropped
-			 * the lock.
-			 */
-			index = iter.index + 1;
-			goto restart;
+			page = __tcache_page_tree_delete(node, page->index, page);
+			spin_unlock(&node->tree_lock);
+
+			if (page) {
+				tcache_lru_del(node->pool, page, false);
+				local_irq_enable();
+				tcache_put_page(page);
+			} else
+				local_irq_enable();
 		}
+		cond_resched();
+		index++;
 	}
 
-	BUG_ON(node->nr_pages != 0);
-
-	spin_unlock_irq(&node->tree_lock);
+	WARN_ON(node->nr_pages != 0);
 }
 
 static noinline_for_stack void
@@ -932,12 +995,16 @@ __tcache_lru_isolate(struct tcache_nodeinfo *ni,
 	struct tcache_node *node;
 	struct page *page;
 	int nr_isolated = 0;
+	int nr_scanned = nr_to_isolate;
 
-	while (nr_to_isolate > 0 && !list_empty(&pni->lru)) {
+	while (nr_to_isolate > 0 && !list_empty(&pni->lru) && nr_scanned--) {
 		page = list_first_entry(&pni->lru, struct page, lru);
+
+		if (unlikely(!page_cache_get_speculative(page)))
+			continue;
+
 		__tcache_lru_del(ni, pni, page);
-		tcache_hold_page(page);
 
 		/*
 		 * A node can be destroyed only if all its pages have been
 		 * removed both from the tree and the LRU list. Since we are
@@ -976,7 +1043,7 @@ again:
 	if (!tcache_grab_pool(pni->pool))
 		goto again;
 
-	nr = __tcache_lru_isolate(ni, pni, pages + nr_isolated, nr_to_isolate);
+	nr = __tcache_lru_isolate(ni, pni, pages, nr_to_isolate);
 	nr_isolated += nr;
 	nr_to_isolate -= nr;
 
@@ -984,9 +1051,6 @@
 		__tcache_insert_reclaim_node(ni, pni);
 
 	tcache_put_pool(pni->pool);
-
-	if (nr_to_isolate > 0)
-		goto again;
 out:
 	spin_unlock_irq(&ni->lock);
 	return nr_isolated;
@@ -998,18 +1062,7 @@ static bool __tcache_reclaim_page(struct page *page)
 	bool ret;
 
 	node = tcache_page_node(page);
-	if (tcache_page_tree_delete(node, page->index, page)) {
-		/*
-		 * We deleted the page from the tree - drop the
-		 * corresponding reference.
-		 */
-		tcache_put_page(page);
-		ret = true;
-	} else
-		/* The page was deleted by a concurrent thread - abort. */
-		ret = false;
-
-	/* Drop the reference taken in __tcache_lru_isolate. */
+	ret = tcache_page_tree_delete(node, page->index, page);
 	tcache_put_node_and_pool(node);
 	return ret;
 }
@@ -1021,9 +1074,10 @@ static int tcache_reclaim_pages(struct page **pages, int nr)
 
 	local_irq_disable();
 	for (i = 0; i < nr; i++) {
-		nr_reclaimed += !!__tcache_reclaim_page(pages[i]);
-		/* Drop the reference taken in __tcache_lru_isolate. */
-		tcache_put_page(pages[i]);
+		if (__tcache_reclaim_page(pages[i])) {
+			nr_reclaimed++;
+			tcache_put_page(pages[i]);
+		}
 		pages[i] = NULL;
 	}
 	local_irq_enable();
@@ -1048,10 +1102,10 @@ tcache_try_to_reclaim_page(struct tcache_pool *pool, int nid)
 	if (!ret)
 		goto out;
 
-	if (!__tcache_reclaim_page(page)) {
-		tcache_put_page(page);
+	if (!__tcache_reclaim_page(page))
 		page = NULL;
-	}
+	else
+		page_ref_unfreeze(page, 1);
 out:
 	local_irq_restore(flags);
 	return page;
@@ -1135,13 +1189,11 @@ static void tcache_cleancache_put_page(int pool_id,
 
 		cache_page = tcache_alloc_page(node->pool);
 		if (cache_page) {
 			copy_highpage(cache_page, page);
-			/* cleancache does not care about failures */
-			(void)tcache_attach_page(node, index, cache_page);
+			if (tcache_attach_page(node, index, cache_page))
+				if (put_page_testzero(cache_page))
+					tcache_put_page(page);
 		}
 		tcache_put_node_and_pool(node);
-		if (cache_page)
-			tcache_put_page(cache_page);
-
 	}
 }
-- 
2.13.0

_______________________________________________
Devel mailing list
Devel@openvz.org
https://lists.openvz.org/mailman/listinfo/devel