On Fri, Jan 10, 2014 at 01:10:43PM -0500, Johannes Weiner wrote:
> Previously, page cache radix tree nodes were freed after reclaim
> emptied out their page pointers.  But now reclaim stores shadow
> entries in their place, which are only reclaimed when the inodes
> themselves are reclaimed.  This is problematic for bigger files that
> are still in use after they have a significant amount of their cache
> reclaimed, without any of those pages actually refaulting.  The shadow
> entries will just sit there and waste memory.  In the worst case, the
> shadow entries will accumulate until the machine runs out of memory.
> 
> To get this under control, the VM will track radix tree nodes
> exclusively containing shadow entries on a per-NUMA node list.
> Per-NUMA rather than global because we expect the radix tree nodes
> themselves to be allocated node-locally and we want to reduce
> cross-node references of otherwise independent cache workloads.  A
> simple shrinker will then reclaim these nodes on memory pressure.
> 
> A few things need to be stored in the radix tree node to implement the
> shadow node LRU and allow tree deletions coming from the list:

Just a couple of things with the list_lru interfaces.

....
> @@ -123,9 +129,39 @@ static void page_cache_tree_delete(struct address_space *mapping,
>                * same time and miss a shadow entry.
>                */
>               smp_wmb();
> -     } else
> -             radix_tree_delete(&mapping->page_tree, page->index);
> +     }
>       mapping->nrpages--;
> +
> +     if (!node) {
> +             /* Clear direct pointer tags in root node */
> +             mapping->page_tree.gfp_mask &= __GFP_BITS_MASK;
> +             radix_tree_replace_slot(slot, shadow);
> +             return;
> +     }
> +
> +     /* Clear tree tags for the removed page */
> +     index = page->index;
> +     offset = index & RADIX_TREE_MAP_MASK;
> +     for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
> +             if (test_bit(offset, node->tags[tag]))
> +                     radix_tree_tag_clear(&mapping->page_tree, index, tag);
> +     }
> +
> +     /* Delete page, swap shadow entry */
> +     radix_tree_replace_slot(slot, shadow);
> +     node->count--;
> +     if (shadow)
> +             node->count += 1U << RADIX_TREE_COUNT_SHIFT;
> +     else
> +             if (__radix_tree_delete_node(&mapping->page_tree, node))
> +                     return;
> +
> +     /* Only shadow entries in there, keep track of this node */
> +     if (!(node->count & RADIX_TREE_COUNT_MASK) &&
> +         list_empty(&node->private_list)) {
> +             node->private_data = mapping;
> +             list_lru_add(&workingset_shadow_nodes, &node->private_list);
> +     }

You can't do this list_empty(&node->private_list) check safely
externally to the list_lru code - the only time that entry can be
checked safely is under the LRU list locks. This is the reason that
list_lru_add/list_lru_del return a boolean to indicate whether the
object was added to/removed from the list - they do this list_empty()
check internally. i.e. the correct, safe way to conditionally update
state iff the object was added to the LRU is:

        if (!(node->count & RADIX_TREE_COUNT_MASK)) {
                if (list_lru_add(&workingset_shadow_nodes, &node->private_list))
                        node->private_data = mapping;
        }

> +     radix_tree_replace_slot(slot, page);
> +     mapping->nrpages++;
> +     if (node) {
> +             node->count++;
> +             /* Installed page, can't be shadow-only anymore */
> +             if (!list_empty(&node->private_list))
> +                     list_lru_del(&workingset_shadow_nodes,
> +                                  &node->private_list);
> +     }

Same issue here:

        if (node) {
                node->count++;
                list_lru_del(&workingset_shadow_nodes, &node->private_list);
        }


> +     return 0;
>  }
>  
>  static int __add_to_page_cache_locked(struct page *page,
> diff --git a/mm/list_lru.c b/mm/list_lru.c
> index 72f9decb0104..47a9faf4070b 100644
> --- a/mm/list_lru.c
> +++ b/mm/list_lru.c
> @@ -88,10 +88,18 @@ restart:
>               ret = isolate(item, &nlru->lock, cb_arg);
>               switch (ret) {
>               case LRU_REMOVED:
> +             case LRU_REMOVED_RETRY:
>                       if (--nlru->nr_items == 0)
>                               node_clear(nid, lru->active_nodes);
>                       WARN_ON_ONCE(nlru->nr_items < 0);
>                       isolated++;
> +                     /*
> +                      * If the lru lock has been dropped, our list
> +                      * traversal is now invalid and so we have to
> +                      * restart from scratch.
> +                      */
> +                     if (ret == LRU_REMOVED_RETRY)
> +                             goto restart;
>                       break;
>               case LRU_ROTATE:
>                       list_move_tail(item, &nlru->list);

I think we need to assert that the list_lru lock is correctly held
here on return with LRU_REMOVED_RETRY. i.e.

                case LRU_REMOVED_RETRY:
                        assert_spin_locked(&nlru->lock);
                case LRU_REMOVED:
                .....
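
That catches an ->isolate callback that drops the lock and returns
LRU_REMOVED_RETRY without retaking it, before it can corrupt the
rest of the list traversal.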

> @@ -35,8 +38,21 @@ static void clear_exceptional_entry(struct address_space *mapping,
>        * without the tree itself locked.  These unlocked entries
>        * need verification under the tree lock.
>        */
> -     if (radix_tree_delete_item(&mapping->page_tree, index, entry) == entry)
> -             mapping->nrshadows--;
> +     if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot))
> +             goto unlock;
> +     if (*slot != entry)
> +             goto unlock;
> +     radix_tree_replace_slot(slot, NULL);
> +     mapping->nrshadows--;
> +     if (!node)
> +             goto unlock;
> +     node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
> +     /* No more shadow entries, stop tracking the node */
> +     if (!(node->count >> RADIX_TREE_COUNT_SHIFT) &&
> +         !list_empty(&node->private_list))
> +             list_lru_del(&workingset_shadow_nodes, &node->private_list);
> +     __radix_tree_delete_node(&mapping->page_tree, node);

Same issue with list_empty() check.
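
Again, list_lru_del() does that list_empty() check internally under
the lru lock, so the condition can just key off the shadow count and
the del can be made unconditional. Untested, but something along the
lines of:

        node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
        /* No more shadow entries, stop tracking the node */
        if (!(node->count >> RADIX_TREE_COUNT_SHIFT))
                list_lru_del(&workingset_shadow_nodes,
                             &node->private_list);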

> +
> +/*
> + * Page cache radix tree nodes containing only shadow entries can grow
> + * excessively on certain workloads.  That's why they are tracked on
> + * per-(NUMA)node lists and pushed back by a shrinker, but with a
> + * slightly higher threshold than regular shrinkers so we don't
> + * discard the entries too eagerly - after all, during light memory
> + * pressure is exactly when we need them.
> + */
> +
> +struct list_lru workingset_shadow_nodes;
> +
> +static unsigned long count_shadow_nodes(struct shrinker *shrinker,
> +                                     struct shrink_control *sc)
> +{
> +     return list_lru_count_node(&workingset_shadow_nodes, sc->nid);
> +}
> +
> +static enum lru_status shadow_lru_isolate(struct list_head *item,
> +                                       spinlock_t *lru_lock,
> +                                       void *arg)
> +{
> +     unsigned long *nr_reclaimed = arg;
> +     struct address_space *mapping;
> +     struct radix_tree_node *node;
> +     unsigned int i;
> +     int ret;
> +
> +     /*
> +      * Page cache insertions and deletions synchronously maintain
> +      * the shadow node LRU under the mapping->tree_lock and the
> +      * lru_lock.  Because the page cache tree is emptied before
> +      * the inode can be destroyed, holding the lru_lock pins any
> +      * address_space that has radix tree nodes on the LRU.
> +      *
> +      * We can then safely transition to the mapping->tree_lock to
> +      * pin only the address_space of the particular node we want
> +      * to reclaim, take the node off-LRU, and drop the lru_lock.
> +      */
> +
> +     node = container_of(item, struct radix_tree_node, private_list);
> +     mapping = node->private_data;
> +
> +     /* Coming from the list, invert the lock order */
> +     if (!spin_trylock_irq(&mapping->tree_lock)) {
> +             spin_unlock(lru_lock);
> +             ret = LRU_RETRY;
> +             goto out;
> +     }
> +
> +     list_del_init(item);
> +     spin_unlock(lru_lock);
> +
> +     /*
> +      * The nodes should only contain one or more shadow entries,
> +      * no pages, so we expect to be able to remove them all and
> +      * delete and free the empty node afterwards.
> +      */
> +
> +     BUG_ON(!node->count);
> +     BUG_ON(node->count & RADIX_TREE_COUNT_MASK);
> +
> +     for (i = 0; i < RADIX_TREE_MAP_SIZE; i++) {
> +             if (node->slots[i]) {
> +                     BUG_ON(!radix_tree_exceptional_entry(node->slots[i]));
> +                     node->slots[i] = NULL;
> +                     BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT));
> +                     node->count -= 1U << RADIX_TREE_COUNT_SHIFT;
> +                     BUG_ON(!mapping->nrshadows);
> +                     mapping->nrshadows--;
> +             }
> +     }
> +     BUG_ON(node->count);
> +     inc_zone_state(page_zone(virt_to_page(node)), WORKINGSET_NODERECLAIM);
> +     if (!__radix_tree_delete_node(&mapping->page_tree, node))
> +             BUG();
> +     (*nr_reclaimed)++;
> +
> +     spin_unlock_irq(&mapping->tree_lock);
> +     ret = LRU_REMOVED_RETRY;
> +out:
> +     cond_resched();
> +     spin_lock(lru_lock);
> +     return ret;
> +}
> +
> +static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
> +                                    struct shrink_control *sc)
> +{
> +     unsigned long nr_reclaimed = 0;
> +
> +     list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
> +                        shadow_lru_isolate, &nr_reclaimed, &sc->nr_to_scan);
> +
> +     return nr_reclaimed;

list_lru_walk_node() returns the number of reclaimed objects (i.e.
the number of objects for which the ->isolate callback returned
LRU_REMOVED or LRU_REMOVED_RETRY), so you don't need to count
nr_reclaimed yourself.
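
i.e. once the (*nr_reclaimed)++ accounting is dropped from
shadow_lru_isolate() (and the cb_arg with it), scan_shadow_nodes()
collapses to something like:

static unsigned long scan_shadow_nodes(struct shrinker *shrinker,
                                       struct shrink_control *sc)
{
        return list_lru_walk_node(&workingset_shadow_nodes, sc->nid,
                                  shadow_lru_isolate, NULL,
                                  &sc->nr_to_scan);
}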

> +}
> +
> +static struct shrinker workingset_shadow_shrinker = {
> +     .count_objects = count_shadow_nodes,
> +     .scan_objects = scan_shadow_nodes,
> +     .seeks = DEFAULT_SEEKS * 4,
> +     .flags = SHRINKER_NUMA_AWARE,
> +};

Can you add a comment explaining how you calculated the .seeks
value? It's important to document the weightings/importance
we give to slab reclaim so we can determine if it's actually
achieving the desired balance under different loads...
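
Even a short placeholder makes the intent reviewable, e.g. something
of the form:

        /*
         * <why shadow nodes are weighted at 4x DEFAULT_SEEKS
         *  relative to other slab caches>
         */
        .seeks = DEFAULT_SEEKS * 4,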

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com