This is mostly a copy of the page cache implementation: record refault information (a shadow entry) when a page is swapped out, and read it back on swap-in to decide whether the refaulting page should be activated.
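For reviewers unfamiliar with the file-cache workingset code, below is a minimal standalone model of the refault-distance idea this patch extends to anonymous memory. It is illustrative only: the function names, the counter and the ACTIVE_LIST_SIZE threshold are made up for the example; the real bookkeeping lives in mm/workingset.c (inactive_age, workingset_eviction(), workingset_refault()).

#include <stdbool.h>
#include <stdio.h>

static unsigned long inactive_age;	/* advances as pages leave the LRU */

/* stand-in for the size of the active LRU list */
#define ACTIVE_LIST_SIZE 4UL

/*
 * Swap-out side: remember how old the LRU was when the page was evicted.
 * The returned value plays the role of the shadow entry that this patch
 * stores in the swap-cache radix tree slot.
 */
static unsigned long record_eviction(void)
{
	return ++inactive_age;
}

/*
 * Swap-in side: a refault distance smaller than the active list means the
 * page was evicted "recently" and is worth activating right away.
 */
static bool refault_should_activate(unsigned long shadow)
{
	unsigned long distance = inactive_age - shadow;

	return distance < ACTIVE_LIST_SIZE;
}

int main(void)
{
	unsigned long shadow = record_eviction();	/* page A swapped out */

	record_eviction();	/* other evictions age the LRU in between */
	record_eviction();

	printf("activate page A on refault: %s\n",
	       refault_should_activate(shadow) ? "yes" : "no");
	return 0;
}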
https://pmc.acronis.com/browse/VSTOR-19037

Signed-off-by: Andrey Ryabinin <aryabi...@virtuozzo.com>
---
 drivers/staging/zcache/zcache-main.c |   2 +-
 include/linux/swap.h                 |  10 +--
 mm/shmem.c                           |   2 +-
 mm/swap_state.c                      | 123 ++++++++++++++++++++++++---
 mm/swapfile.c                        |   2 +-
 mm/tswap.c                           |   2 +-
 mm/vmscan.c                          |   6 +-
 mm/workingset.c                      |   3 +-
 8 files changed, 125 insertions(+), 25 deletions(-)

diff --git a/drivers/staging/zcache/zcache-main.c b/drivers/staging/zcache/zcache-main.c
index 01e8446b04d0..732be2143e64 100644
--- a/drivers/staging/zcache/zcache-main.c
+++ b/drivers/staging/zcache/zcache-main.c
@@ -948,7 +948,7 @@ static int zcache_get_swap_cache_page(int type, pgoff_t offset,
 		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
 		__set_page_locked(new_page);
 		SetPageSwapBacked(new_page);
-		err = __add_to_swap_cache(new_page, entry);
+		err = __add_to_swap_cache(new_page, entry, NULL);
 		if (likely(!err)) {
 			radix_tree_preload_end();
 			lru_cache_add_anon(new_page);
diff --git a/include/linux/swap.h b/include/linux/swap.h
index 7797cb88870b..2985b5f90ce5 100644
--- a/include/linux/swap.h
+++ b/include/linux/swap.h
@@ -455,9 +455,9 @@ extern struct address_space *swapper_spaces[];
 extern unsigned long total_swapcache_pages(void);
 extern void show_swap_cache_info(void);
 extern int add_to_swap(struct page *, struct list_head *list);
-extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t);
-extern int __add_to_swap_cache(struct page *page, swp_entry_t entry);
-extern void __delete_from_swap_cache(struct page *);
+extern int add_to_swap_cache(struct page *, swp_entry_t, gfp_t, void **);
+extern int __add_to_swap_cache(struct page *page, swp_entry_t entry, void **shadow);
+extern void __delete_from_swap_cache(struct page *, void *shadow);
 extern void delete_from_swap_cache(struct page *);
 extern void free_page_and_swap_cache(struct page *);
 extern void free_pages_and_swap_cache(struct page **, int);
@@ -592,12 +592,12 @@ static inline int add_to_swap(struct page *page, struct list_head *list)
 }
 
 static inline int add_to_swap_cache(struct page *page, swp_entry_t entry,
-							gfp_t gfp_mask)
+							gfp_t gfp_mask, void **shadow)
 {
 	return -1;
 }
 
-static inline void __delete_from_swap_cache(struct page *page)
+static inline void __delete_from_swap_cache(struct page *page, void *shadow)
 {
 }
 
diff --git a/mm/shmem.c b/mm/shmem.c
index cda801a5496b..b25e1423d407 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -995,7 +995,7 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc)
 	if (list_empty(&info->swaplist))
 		list_add_tail(&info->swaplist, &shmem_swaplist);
 
-	if (add_to_swap_cache(page, swap, GFP_ATOMIC) == 0) {
+	if (add_to_swap_cache(page, swap, GFP_ATOMIC, NULL) == 0) {
 		spin_lock(&info->lock);
 		shmem_recalc_inode(inode);
 		info->swapped++;
diff --git a/mm/swap_state.c b/mm/swap_state.c
index 83e48a7edb28..3931364e78a3 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -91,10 +91,12 @@ void show_swap_cache_info(void)
  * __add_to_swap_cache resembles add_to_page_cache_locked on swapper_space,
  * but sets SwapCache flag and private instead of mapping and index.
  */
-int __add_to_swap_cache(struct page *page, swp_entry_t entry)
+int __add_to_swap_cache(struct page *page, swp_entry_t entry, void **shadow)
 {
 	int error;
+	void **slot;
 	struct address_space *address_space;
+	struct radix_tree_node *node;
 
 	VM_BUG_ON_PAGE(!PageLocked(page), page);
 	VM_BUG_ON_PAGE(PageSwapCache(page), page);
@@ -106,13 +108,46 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 
 	address_space = swap_address_space(entry);
 	spin_lock_irq(&address_space->tree_lock);
-	error = radix_tree_insert(&address_space->page_tree,
-					entry.val, page);
-	if (likely(!error)) {
-		address_space->nrpages++;
-		__inc_zone_page_state(page, NR_FILE_PAGES);
-		INC_CACHE_INFO(add_total);
+	error = __radix_tree_create(&address_space->page_tree, entry.val, 0,
+				    &node, &slot);
+	if (error)
+		goto out;
+	if (*slot) {
+		void *p;
+
+		p = radix_tree_deref_slot_protected(slot,
+						    &address_space->tree_lock);
+		if (!radix_tree_very_exceptional_entry(p)) {
+			error = -EEXIST;
+			goto out;
+		}
+
+		address_space->nrexceptional--;
+		if (shadow)
+			*shadow = p;
+		if (node)
+			workingset_node_shadows_dec(node);
 	}
+	radix_tree_replace_slot(slot, page);
+	address_space->nrpages++;
+	__inc_zone_page_state(page, NR_FILE_PAGES);
+	INC_CACHE_INFO(add_total);
+	if (node) {
+		workingset_node_pages_inc(node);
+		/*
+		 * Don't track node that contains actual pages.
+		 *
+		 * Avoid acquiring the list_lru lock if already
+		 * untracked.  The list_empty() test is safe as
+		 * node->private_list is protected by
+		 * mapping->tree_lock.
+		 */
+		if (!list_empty(&node->private_list))
+			list_lru_del(&workingset_shadow_nodes,
+				     &node->private_list);
+	}
+
+out:
 	spin_unlock_irq(&address_space->tree_lock);
 
 	if (unlikely(error)) {
@@ -131,23 +166,78 @@ int __add_to_swap_cache(struct page *page, swp_entry_t entry)
 }
 
-int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask)
+int add_to_swap_cache(struct page *page, swp_entry_t entry, gfp_t gfp_mask,
+		      void **shadow)
 {
 	int error;
 
 	error = radix_tree_maybe_preload(gfp_mask);
 	if (!error) {
-		error = __add_to_swap_cache(page, entry);
+		error = __add_to_swap_cache(page, entry, shadow);
 		radix_tree_preload_end();
 	}
 	return error;
 }
 
+static void page_swap_cache_delete(struct address_space *mapping,
+				   struct page *page, void *shadow)
+{
+	struct radix_tree_node *node;
+	void **slot;
+
+	VM_BUG_ON(!PageLocked(page));
+
+	__radix_tree_lookup(&mapping->page_tree, page_private(page), &node, &slot);
+	radix_tree_clear_tags(&mapping->page_tree, node, slot);
+
+	if (!node) {
+		/*
+		 * We need a node to properly account shadow
+		 * entries. Don't plant any without. XXX
+		 */
+		shadow = NULL;
+	}
+
+	radix_tree_replace_slot(slot, shadow);
+
+	if (shadow) {
+		mapping->nrexceptional++;
+		/*
+		 * Make sure the nrexceptional update is committed before
+		 * the nrpages update so that final truncate racing
+		 * with reclaim does not see both counters 0 at the
+		 * same time and miss a shadow entry.
+		 */
+		smp_wmb();
+	}
+
+	if (!node)
+		return;
+
+	workingset_node_pages_dec(node);
+	if (shadow)
+		workingset_node_shadows_inc(node);
+	else
+		if (__radix_tree_delete_node(&mapping->page_tree, node))
+			return;
+
+	/*
+	 * Track node that only contains shadow entries.
+	 *
+	 * Avoid acquiring the list_lru lock if already tracked.  The
+	 * list_empty() test is safe as node->private_list is
+	 * protected by mapping->tree_lock.
+	 */
+	if (!workingset_node_pages(node) && list_empty(&node->private_list)) {
+		node->private_data = mapping;
+		list_lru_add(&workingset_shadow_nodes, &node->private_list);
+	}
+}
 
 /*
  * This must be called only on pages that have
  * been verified to be in the swap cache.
  */
-void __delete_from_swap_cache(struct page *page)
+void __delete_from_swap_cache(struct page *page, void *shadow)
 {
 	swp_entry_t entry;
 	struct address_space *address_space;
@@ -158,7 +248,7 @@ void __delete_from_swap_cache(struct page *page)
 
 	entry.val = page_private(page);
 	address_space = swap_address_space(entry);
-	radix_tree_delete(&address_space->page_tree, page_private(page));
+	page_swap_cache_delete(address_space, page, shadow);
 	set_page_private(page, 0);
 	ClearPageSwapCache(page);
 	address_space->nrpages--;
@@ -203,7 +293,7 @@ int add_to_swap(struct page *page, struct list_head *list)
 	 * Add it to the swap cache.
 	 */
 	err = add_to_swap_cache(page, entry,
-			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN);
+			__GFP_HIGH|__GFP_NOMEMALLOC|__GFP_NOWARN, NULL);
 
 	if (!err) {
 		return 1;
@@ -232,7 +322,7 @@ void delete_from_swap_cache(struct page *page)
 
 	address_space = swap_address_space(entry);
 	spin_lock_irq(&address_space->tree_lock);
-	__delete_from_swap_cache(page);
+	__delete_from_swap_cache(page, NULL);
 	spin_unlock_irq(&address_space->tree_lock);
 
 	swapcache_free(entry);
@@ -323,6 +413,7 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 {
 	struct page *found_page, *new_page = NULL;
 	struct address_space *swapper_space = swap_address_space(entry);
+	void *shadow = NULL;
 	int err;
 
 	*new_page_allocated = false;
@@ -395,9 +486,13 @@ struct page *__read_swap_cache_async(swp_entry_t entry, gfp_t gfp_mask,
 		/* May fail (-ENOMEM) if radix-tree node allocation failed. */
 		__set_page_locked(new_page);
 		SetPageSwapBacked(new_page);
-		err = __add_to_swap_cache(new_page, entry);
+		err = __add_to_swap_cache(new_page, entry, &shadow);
 		if (likely(!err)) {
 			radix_tree_preload_end();
+			if (shadow && workingset_refault(shadow)) {
+				SetPageActive(new_page);
+				workingset_activation(new_page);
+			}
 			/*
 			 * Initiate read into locked page and return.
 			 */
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 14043e6bf776..ffc3981c8c60 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -1208,7 +1208,7 @@ int reuse_swap_page(struct page *page)
 
 		address_space = swap_address_space(entry);
 		spin_lock_irq(&address_space->tree_lock);
-		__delete_from_swap_cache(page);
+		__delete_from_swap_cache(page, NULL);
 		spin_unlock_irq(&address_space->tree_lock);
 
 		/* the page is still in use, do not uncharge */
diff --git a/mm/tswap.c b/mm/tswap.c
index 112a13d223d6..8b18bd17afcf 100644
--- a/mm/tswap.c
+++ b/mm/tswap.c
@@ -213,7 +213,7 @@ static int tswap_evict_page(struct page *page)
 		goto out_free_swapcache;
 
 	SetPageSwapBacked(page);
-	err = __add_to_swap_cache(page, entry);
+	err = __add_to_swap_cache(page, entry, NULL);
 	if (err) {
 		ClearPageSwapBacked(page);
 		/* __add_to_swap_cache clears page->private on failure */
diff --git a/mm/vmscan.c b/mm/vmscan.c
index 583ba1abfc44..fe034747bb31 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -781,8 +781,12 @@ static int __remove_mapping(struct address_space *mapping, struct page *page,
 
 	if (PageSwapCache(page)) {
 		swp_entry_t swap = { .val = page_private(page) };
+		void *shadow = NULL;
+
 		mem_cgroup_swapout(page, swap);
-		__delete_from_swap_cache(page);
+
+		shadow = workingset_eviction(mapping, page);
+		__delete_from_swap_cache(page, shadow);
 		spin_unlock_irq(&mapping->tree_lock);
 		swapcache_free(swap);
 	} else {
diff --git a/mm/workingset.c b/mm/workingset.c
index 0b4cf96bb026..46865ad551ce 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -275,7 +275,8 @@ bool workingset_refault(void *shadow)
 	}
 	lruvec = mem_cgroup_zone_lruvec(zone, memcg);
 	refault = atomic_long_read(&lruvec->inactive_age);
-	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE);
+	active_file = lruvec_lru_size(lruvec, LRU_ACTIVE_FILE) +
+		      lruvec_lru_size(lruvec, LRU_ACTIVE_ANON);
 	rcu_read_unlock();
 
 	/*
-- 
2.19.2