On Fri, Nov 18, 2016 at 7:31 PM, Matthew Wilcox <[email protected]> wrote:
> From: Konstantin Khlebnikov [mailto:[email protected]]
>> On Thu, Nov 17, 2016 at 3:17 AM, Matthew Wilcox
>> <[email protected]> wrote:
>> > This fixes several interlinked problems with the iterators in the
>> > presence of multiorder entries.
>> >
>> > 1. radix_tree_iter_next() would only advance by one slot, which would
>> > result in the iterators returning the same entry more than once if there
>> > were sibling entries.
>>
>> Is this a problem? Do we have users who cannot evaluate the length of an
>> entry by looking at its head?
>
> At the moment we have no users in tree :-) The two users I know of are the
> page cache and DAX. The page cache stores a pointer to a struct page, which
> has compound_order() to tell you the size. DAX uses a couple of bits in the
> radix tree entry to describe whether this is a PTE/PMD/PUD and so also knows
> the size of the entry that it found. We also store swap cache entries in the
> same radix tree (tagged exceptional). These currently have no information in
> them to describe their size because each one represents only one page. The
> latest patchset to support swapping huge pages inserts 512 entries into the
> radix tree instead of taking advantage of the multiorder entry
> infrastructure. Hopefully that gets fixed soon, but it will require stealing
> a bit from either the number of swap files allowed or from the maximum size
> of each swap file (currently 32/128GB).
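>
> Just to illustrate, the page cache side could recover the order with
> something like this (untested sketch, not in any tree; DAX would look at
> its entry bits instead):
>
> 	/* hypothetical helper, for illustration only */
> 	static unsigned int entry_order(void *entry)
> 	{
> 		/* swap cache entries are always a single page today */
> 		if (radix_tree_exceptional_entry(entry))
> 			return 0;
> 		return compound_order((struct page *)entry);
> 	}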
>
> I think what you're suggesting is that we introduce a new API:
>
> slot = radix_tree_iter_save(&iter, order);
>
> where the caller tells us the order of the entry it just consumed. Or maybe
> you're suggesting
>
> slot = radix_tree_iter_advance(&iter, newindex)
Yes, something like that.
>
> which would allow us to skip to any index. Although ... isn't that just
> radix_tree_iter_init()?
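>
> (For reference, radix_tree_iter_init() is roughly just:
>
> 	static __always_inline void **
> 	radix_tree_iter_init(struct radix_tree_iter *iter, unsigned long start)
> 	{
> 		iter->index = 0;
> 		iter->next_index = start;
> 		/* ->tags gets filled in by radix_tree_next_chunk() */
> 		return NULL;
> 	}
>
> so "advance to newindex" is more or less a re-init starting from newindex,
> throwing away everything the iterator already knows.)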
The iterator could keep a pointer to the current node and reuse it for the
next iteration if possible.
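Something along these lines, say (just a sketch; the "node" field does not
exist today):

	struct radix_tree_iter {
		unsigned long	index;
		unsigned long	next_index;
		unsigned long	tags;
	#ifdef CONFIG_RADIX_TREE_MULTIORDER
		unsigned int	shift;
	#endif
		struct radix_tree_node *node;	/* hypothetical: node of the current chunk */
	};

Then radix_tree_next_chunk() could first check whether the cached node still
covers iter->next_index before walking down from the root again.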
>
> It does push a bit of complexity onto the callers. We have 7 callers of
> radix_tree_iter_next() in my current tree (after applying this patch set, so
> range_tag_if_tagged and locate_item have been pushed into their callers):
> btrfs, khugepaged, page-writeback and shmem. btrfs knows its objects occupy
> one slot. khugepaged knows that its page is order 0 at the time it calls
> radix_tree_iter_next(). Page-writeback has a struct page and can simply use
> compound_order(). It's shmem where things get sticky, although it's all
> solvable with some temporary variables.
>
Users who work only with single-slot entries don't get any complications;
all others already manage these multiorder entries somehow.
> MM people, what do you think of this patch? It's on top of my current IDR
> tree, although I'd fold it into patch 20 of the series if it's acceptable.
>
> diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
> index 6d3457a..7f864ec 100644
> --- a/fs/btrfs/tests/btrfs-tests.c
> +++ b/fs/btrfs/tests/btrfs-tests.c
> @@ -162,7 +162,7 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
> slot = radix_tree_iter_retry(&iter);
> continue;
> }
> - slot = radix_tree_iter_next(slot, &iter);
> + slot = radix_tree_iter_save(&iter, 0);
> spin_unlock(&fs_info->buffer_lock);
> free_extent_buffer_stale(eb);
> spin_lock(&fs_info->buffer_lock);
> diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h
> index 4e42d4d..4419325 100644
> --- a/include/linux/radix-tree.h
> +++ b/include/linux/radix-tree.h
> @@ -421,15 +421,22 @@ __radix_tree_iter_add(struct radix_tree_iter *iter, unsigned long slots)
> }
>
> /**
> - * radix_tree_iter_next - resume iterating when the chunk may be invalid
> - * @iter: iterator state
> + * radix_tree_iter_save - resume iterating when the chunk may be invalid
> + * @iter: iterator state
> + * @order: order of the entry that was just processed
> *
> - * If the iterator needs to release then reacquire a lock, the chunk may
> - * have been invalidated by an insertion or deletion. Call this function
> + * If the iteration needs to release then reacquire a lock, the chunk may
> + * be invalidated by an insertion or deletion. Call this function
> * before releasing the lock to continue the iteration from the next index.
> */
> -void **__must_check radix_tree_iter_next(void **slot,
> - struct radix_tree_iter *iter);
> +static inline void **__must_check
> +radix_tree_iter_save(struct radix_tree_iter *iter, unsigned order)
> +{
> + iter->next_index = round_up(iter->index, 1 << order);
> + iter->index = 0;
> + iter->tags = 0;
> + return NULL;
> +}
>
> /**
> * radix_tree_chunk_size - get current chunk size
> @@ -467,7 +474,7 @@ static inline void ** __radix_tree_next_slot(void **slot,
> * For tagged lookup it also eats @iter->tags.
> *
> * There are several cases where 'slot' can be passed in as NULL to this
> - * function. These cases result from the use of radix_tree_iter_next() or
> + * function. These cases result from the use of radix_tree_iter_save() or
> * radix_tree_iter_retry(). In these cases we don't end up dereferencing
> * 'slot' because either:
> * a) we are doing tagged iteration and iter->tags has been set to 0, or
> diff --git a/lib/radix-tree.c b/lib/radix-tree.c
> index 7e2469b..fcbadad 100644
> --- a/lib/radix-tree.c
> +++ b/lib/radix-tree.c
> @@ -1245,6 +1245,7 @@ static inline void __set_iter_shift(struct radix_tree_iter *iter,
> #endif
> }
>
> +#ifdef CONFIG_RADIX_TREE_MULTIORDER
> static void ** __radix_tree_iter_next(struct radix_tree_node **nodep,
> void **slot, struct radix_tree_iter *iter)
> {
> @@ -1263,7 +1264,6 @@ static void ** __radix_tree_iter_next(struct radix_tree_node **nodep,
> return NULL;
> }
>
> -#ifdef CONFIG_RADIX_TREE_MULTIORDER
> void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
> unsigned flags)
> {
> @@ -1321,20 +1321,6 @@ void ** __radix_tree_next_slot(void **slot, struct radix_tree_iter *iter,
> EXPORT_SYMBOL(__radix_tree_next_slot);
> #endif
>
> -void **radix_tree_iter_next(void **slot, struct radix_tree_iter *iter)
> -{
> - struct radix_tree_node *node;
> -
> - slot++;
> - iter->index = __radix_tree_iter_add(iter, 1);
> - node = rcu_dereference_raw(*slot);
> - __radix_tree_iter_next(&node, slot, iter);
> - iter->next_index = iter->index;
> - iter->tags = 0;
> - return NULL;
> -}
> -EXPORT_SYMBOL(radix_tree_iter_next);
> -
> /**
> * radix_tree_next_chunk - find next chunk of slots for iteration
> *
> diff --git a/mm/khugepaged.c b/mm/khugepaged.c
> index 46155d1..54446e6 100644
> --- a/mm/khugepaged.c
> +++ b/mm/khugepaged.c
> @@ -1614,7 +1614,7 @@ static void khugepaged_scan_shmem(struct mm_struct *mm,
> present++;
>
> if (need_resched()) {
> - slot = radix_tree_iter_next(slot, &iter);
> + slot = radix_tree_iter_save(&iter, 0);
> cond_resched_rcu();
> }
> }
> diff --git a/mm/page-writeback.c b/mm/page-writeback.c
> index c593715..7d6b870 100644
> --- a/mm/page-writeback.c
> +++ b/mm/page-writeback.c
> @@ -2119,7 +2119,7 @@ void tag_pages_for_writeback(struct address_space *mapping,
> tagged++;
> if ((tagged % WRITEBACK_TAG_BATCH) != 0)
> continue;
> - slot = radix_tree_iter_next(slot, &iter);
> + slot = radix_tree_iter_save(&iter, compound_order(*slot));
> spin_unlock_irq(&mapping->tree_lock);
> cond_resched();
> spin_lock_irq(&mapping->tree_lock);
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 8f9c9aa..3f2d07a 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -644,6 +644,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
> rcu_read_lock();
>
> radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> + unsigned int order = 0;
> if (iter.index >= end)
> break;
>
> @@ -656,9 +657,11 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping,
>
> if (radix_tree_exceptional_entry(page))
> swapped++;
> + else
> + order = compound_order(page);
>
> if (need_resched()) {
> - slot = radix_tree_iter_next(slot, &iter);
> + slot = radix_tree_iter_save(&iter, order);
> cond_resched_rcu();
> }
> }
> @@ -1062,7 +1065,7 @@ static unsigned long find_swap_entry(struct radix_tree_root *root, void *item)
> checked++;
> if ((checked % 4096) != 0)
> continue;
> - slot = radix_tree_iter_next(slot, &iter);
> + slot = radix_tree_iter_save(&iter, 0);
> cond_resched_rcu();
> }
>
> @@ -2444,21 +2447,25 @@ static void shmem_tag_pins(struct address_space *mapping)
> rcu_read_lock();
>
> radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> + unsigned int order = 0;
> page = radix_tree_deref_slot(slot);
> if (!page || radix_tree_exception(page)) {
> if (radix_tree_deref_retry(page)) {
> slot = radix_tree_iter_retry(&iter);
> continue;
> }
> - } else if (page_count(page) - page_mapcount(page) > 1) {
> - spin_lock_irq(&mapping->tree_lock);
> - radix_tree_tag_set(&mapping->page_tree, iter.index,
> - SHMEM_TAG_PINNED);
> - spin_unlock_irq(&mapping->tree_lock);
> + } else {
> + order = compound_order(page);
> + if (page_count(page) - page_mapcount(page) > 1) {
> + spin_lock_irq(&mapping->tree_lock);
> + radix_tree_tag_set(&mapping->page_tree,
> + iter.index, SHMEM_TAG_PINNED);
> + spin_unlock_irq(&mapping->tree_lock);
> + }
> }
>
> if (need_resched()) {
> - slot = radix_tree_iter_next(slot, &iter);
> + slot = radix_tree_iter_save(&iter, order);
> cond_resched_rcu();
> }
> }
> @@ -2528,7 +2535,10 @@ static int shmem_wait_for_pins(struct address_space *mapping)
> spin_unlock_irq(&mapping->tree_lock);
> continue_resched:
> if (need_resched()) {
> - slot = radix_tree_iter_next(slot, &iter);
> + unsigned int order = 0;
> + if (page)
> + order = compound_order(page);
> + slot = radix_tree_iter_save(&iter, order);
> cond_resched_rcu();
> }
> }