On Thu, Aug 11, 2005 at 10:28:04PM +1000, Nick Piggin wrote:
> 6/7
> 
> -- 
> SUSE Labs, Novell Inc.
> 

> Use the speculative get_page and the lockless radix tree lookups
> to introduce lockless page cache lookups (ie. no mapping->tree_lock).
> 
> The only atomicity changes this should introduce is the use of a
> non atomic pagevec lookup for truncate, however what atomicity
> guarantees there were are probably not too useful anyway.

I don't understand the placement of the rcu_read_lock() and
rcu_read_unlock() calls.  Again, possibly because I don't understand
the overall algorithm yet.  And again, search for blank lines.

> Index: linux-2.6/mm/filemap.c
> ===================================================================
> --- linux-2.6.orig/mm/filemap.c
> +++ linux-2.6/mm/filemap.c
> @@ -379,18 +379,25 @@ int add_to_page_cache(struct page *page,
>       int error = radix_tree_preload(gfp_mask & ~__GFP_HIGHMEM);
>  
>       if (error == 0) {
> +             page_cache_get(page);
> +             __SetPageLocked(page);
> +             page->mapping = mapping;
> +             page->index = offset;
> +
>               write_lock_irq(&mapping->tree_lock);
>               error = radix_tree_insert(&mapping->page_tree, offset, page);
>               if (!error) {
> -                     page_cache_get(page);
> -                     SetPageLocked(page);
> -                     page->mapping = mapping;
> -                     page->index = offset;
>                       mapping->nrpages++;
>                       pagecache_acct(1);
>               }
>               write_unlock_irq(&mapping->tree_lock);
>               radix_tree_preload_end();
> +
> +             if (error) {
> +                     page->mapping = NULL;
> +                     __put_page(page);
> +                     __ClearPageLocked(page);
> +             }
>       }
>       return error;
>  }
> @@ -500,13 +507,15 @@ EXPORT_SYMBOL(__lock_page);
>   */
>  struct page * find_get_page(struct address_space *mapping, unsigned long 
> offset)
>  {
> -     struct page *page;
> +     struct page **pagep;
> +     struct page *page = NULL;
>  
> -     read_lock_irq(&mapping->tree_lock);
> -     page = radix_tree_lookup(&mapping->page_tree, offset);
> -     if (page)
> -             page_cache_get(page);
> -     read_unlock_irq(&mapping->tree_lock);
> +     rcu_read_lock();
> +     pagep = (struct page **)radix_tree_lookup_slot(&mapping->page_tree,
> +                                                                     offset);
> +     if (pagep)
> +             page = page_cache_get_speculative(pagep);

The data structures accessed by page_cache_get_speculative() don't
seem to be freed via RCU.  My guess is that this is because these
data structures (struct page) never really go away.  However, they
do get re-used, and this re-use must be protected against.  It looks
to me that this protection takes the form of atomic instructions
in get_page_testone() and the like.  If this is the case, then it
should be possible to push the rcu_read_lock() and rcu_read_unlock()
down into the radix_tree_lookup_slot().

> +     rcu_read_unlock();
>       return page;
>  }
>  
> @@ -519,12 +528,24 @@ struct page *find_trylock_page(struct ad
>  {
>       struct page *page;
>  
> -     read_lock_irq(&mapping->tree_lock);
> -     page = radix_tree_lookup(&mapping->page_tree, offset);
> -     if (page && TestSetPageLocked(page))
> -             page = NULL;
> -     read_unlock_irq(&mapping->tree_lock);
> -     return page;
> +     page = find_get_page(mapping, offset);
> +     if (page) {
> +             if (TestSetPageLocked(page))
> +                     goto out_failed;
> +             /* Has the page been truncated before being locked? */
> +             if (page->mapping != mapping || page->index != offset) {
> +                     unlock_page(page);
> +                     goto out_failed;
> +             }
> +
> +             /* Silly interface requires us to drop the refcount */
> +             __put_page(page);
> +             return page;
> +
> +out_failed:
> +             page_cache_release(page);
> +     }
> +     return NULL;
>  }
>  
>  EXPORT_SYMBOL(find_trylock_page);
> @@ -545,25 +566,17 @@ struct page *find_lock_page(struct addre
>  {
>       struct page *page;
>  
> -     read_lock_irq(&mapping->tree_lock);
>  repeat:
> -     page = radix_tree_lookup(&mapping->page_tree, offset);
> +     page = find_get_page(mapping, offset);
>       if (page) {
> -             page_cache_get(page);
> -             if (TestSetPageLocked(page)) {
> -                     read_unlock_irq(&mapping->tree_lock);
> -                     lock_page(page);
> -                     read_lock_irq(&mapping->tree_lock);
> -
> -                     /* Has the page been truncated while we slept? */
> -                     if (page->mapping != mapping || page->index != offset) {
> -                             unlock_page(page);
> -                             page_cache_release(page);
> -                             goto repeat;
> -                     }
> +             lock_page(page);
> +             /* Has the page been truncated before being locked? */
> +             if (page->mapping != mapping || page->index != offset) {
> +                     unlock_page(page);
> +                     page_cache_release(page);
> +                     goto repeat;
>               }
>       }
> -     read_unlock_irq(&mapping->tree_lock);
>       return page;
>  }
>  
> @@ -646,6 +659,32 @@ unsigned find_get_pages(struct address_s
>       return ret;
>  }
>  
> +unsigned find_get_pages_nonatomic(struct address_space *mapping, pgoff_t 
> start,
> +                         unsigned int nr_pages, struct page **pages)
> +{
> +     unsigned int i;
> +     unsigned int ret;
> +     unsigned int ret2;
> +
> +     /*
> +      * We do some unsightly casting to use the array first for storing
> +      * pointers to the page pointers, and then for the pointers to
> +      * the pages themselves that the caller wants.
> +      */
> +     rcu_read_lock();
> +     ret = radix_tree_gang_lookup_slot(&mapping->page_tree,
> +                             (void ***)pages, start, nr_pages);
> +     ret2 = 0;
> +     for (i = 0; i < ret; i++) {
> +             struct page *page;
> +             page = page_cache_get_speculative(((struct page ***)pages)[i]);
> +             if (page)
> +                     pages[ret2++] = page;
> +     }

Same thing here -- I don't see why the rcu_read_lock() and rcu_read_unlock()
should not be pushed down into radix_tree_gang_lookup_slot().

> +     rcu_read_unlock();
> +     return ret2;
> +}
> +
>  /*
>   * Like find_get_pages, except we only return pages which are tagged with
>   * `tag'.   We update *index to index the next page for the traversal.
> Index: linux-2.6/mm/readahead.c
> ===================================================================
> --- linux-2.6.orig/mm/readahead.c
> +++ linux-2.6/mm/readahead.c
> @@ -272,27 +272,26 @@ __do_page_cache_readahead(struct address
>       /*
>        * Preallocate as many pages as we will need.
>        */
> -     read_lock_irq(&mapping->tree_lock);
>       for (page_idx = 0; page_idx < nr_to_read; page_idx++) {
>               unsigned long page_offset = offset + page_idx;
>               
>               if (page_offset > end_index)
>                       break;
>  
> +             /* Don't need mapping->tree_lock - lookup can be racy */
> +             rcu_read_lock();
>               page = radix_tree_lookup(&mapping->page_tree, page_offset);
> +             rcu_read_unlock();

Here, they could clearly be pushed down.  ;-)

>               if (page)
>                       continue;
>  
> -             read_unlock_irq(&mapping->tree_lock);
>               page = page_cache_alloc_cold(mapping);
> -             read_lock_irq(&mapping->tree_lock);
>               if (!page)
>                       break;
>               page->index = page_offset;
>               list_add(&page->lru, &page_pool);
>               ret++;
>       }
> -     read_unlock_irq(&mapping->tree_lock);
>  
>       /*
>        * Now start the IO.  We ignore I/O errors - if the page is not
> Index: linux-2.6/mm/swap_state.c
> ===================================================================
> --- linux-2.6.orig/mm/swap_state.c
> +++ linux-2.6/mm/swap_state.c
> @@ -76,19 +76,26 @@ static int __add_to_swap_cache(struct pa
>       BUG_ON(PagePrivate(page));
>       error = radix_tree_preload(gfp_mask);
>       if (!error) {
> +             page_cache_get(page);
> +             SetPageLocked(page);
> +             SetPageSwapCache(page);
> +             page->private = entry.val;
> +
>               write_lock_irq(&swapper_space.tree_lock);
>               error = radix_tree_insert(&swapper_space.page_tree,
>                                               entry.val, page);
>               if (!error) {
> -                     page_cache_get(page);
> -                     SetPageLocked(page);
> -                     SetPageSwapCache(page);
> -                     page->private = entry.val;
>                       total_swapcache_pages++;
>                       pagecache_acct(1);
>               }
>               write_unlock_irq(&swapper_space.tree_lock);
>               radix_tree_preload_end();
> +
> +             if (error) {
> +                     __put_page(page);
> +                     ClearPageLocked(page);
> +                     ClearPageSwapCache(page);
> +             }
>       }
>       return error;
>  }
> Index: linux-2.6/include/linux/page-flags.h
> ===================================================================
> --- linux-2.6.orig/include/linux/page-flags.h
> +++ linux-2.6/include/linux/page-flags.h
> @@ -167,16 +167,13 @@ extern void __mod_page_state(unsigned lo
>  /*
>   * Manipulation of page state flags
>   */
> -#define PageLocked(page)             \
> -             test_bit(PG_locked, &(page)->flags)
> -#define SetPageLocked(page)          \
> -             set_bit(PG_locked, &(page)->flags)
> -#define TestSetPageLocked(page)              \
> -             test_and_set_bit(PG_locked, &(page)->flags)
> -#define ClearPageLocked(page)                \
> -             clear_bit(PG_locked, &(page)->flags)
> -#define TestClearPageLocked(page)    \
> -             test_and_clear_bit(PG_locked, &(page)->flags)
> +#define PageLocked(page)     test_bit(PG_locked, &(page)->flags)
> +#define SetPageLocked(page)  set_bit(PG_locked, &(page)->flags)
> +#define __SetPageLocked(page)        __set_bit(PG_locked, &(page)->flags)
> +#define TestSetPageLocked(page)      test_and_set_bit(PG_locked, 
> &(page)->flags)
> +#define ClearPageLocked(page)        clear_bit(PG_locked, &(page)->flags)
> +#define __ClearPageLocked(page)      __clear_bit(PG_locked, &(page)->flags)
> +#define TestClearPageLocked(page) test_and_clear_bit(PG_locked, 
> &(page)->flags)
>  
>  #define PageError(page)              test_bit(PG_error, &(page)->flags)
>  #define SetPageError(page)   set_bit(PG_error, &(page)->flags)
> Index: linux-2.6/include/linux/pagemap.h
> ===================================================================
> --- linux-2.6.orig/include/linux/pagemap.h
> +++ linux-2.6/include/linux/pagemap.h
> @@ -130,6 +130,8 @@ extern struct page * find_or_create_page
>                               unsigned long index, unsigned int gfp_mask);
>  unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
>                       unsigned int nr_pages, struct page **pages);
> +unsigned find_get_pages_nonatomic(struct address_space *mapping, pgoff_t 
> start,
> +                     unsigned int nr_pages, struct page **pages);
>  unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
>                       int tag, unsigned int nr_pages, struct page **pages);
>  
> Index: linux-2.6/include/linux/pagevec.h
> ===================================================================
> --- linux-2.6.orig/include/linux/pagevec.h
> +++ linux-2.6/include/linux/pagevec.h
> @@ -25,6 +25,8 @@ void __pagevec_lru_add_active(struct pag
>  void pagevec_strip(struct pagevec *pvec);
>  unsigned pagevec_lookup(struct pagevec *pvec, struct address_space *mapping,
>               pgoff_t start, unsigned nr_pages);
> +unsigned pagevec_lookup_nonatomic(struct pagevec *pvec,
> +     struct address_space *mapping, pgoff_t start, unsigned nr_pages);
>  unsigned pagevec_lookup_tag(struct pagevec *pvec,
>               struct address_space *mapping, pgoff_t *index, int tag,
>               unsigned nr_pages);
> Index: linux-2.6/mm/swap.c
> ===================================================================
> --- linux-2.6.orig/mm/swap.c
> +++ linux-2.6/mm/swap.c
> @@ -380,6 +380,19 @@ unsigned pagevec_lookup(struct pagevec *
>       return pagevec_count(pvec);
>  }
>  
> +/**
> + * pagevec_lookup_nonatomic - non atomic pagevec_lookup
> + *
> + * This routine is non-atomic in that it may return blah.
> + */
> +unsigned pagevec_lookup_nonatomic(struct pagevec *pvec,
> +             struct address_space *mapping, pgoff_t start, unsigned nr_pages)
> +{
> +     pvec->nr = find_get_pages_nonatomic(mapping, start,
> +                                     nr_pages, pvec->pages);
> +     return pagevec_count(pvec);
> +}
> +
>  unsigned pagevec_lookup_tag(struct pagevec *pvec, struct address_space 
> *mapping,
>               pgoff_t *index, int tag, unsigned nr_pages)
>  {
> Index: linux-2.6/mm/truncate.c
> ===================================================================
> --- linux-2.6.orig/mm/truncate.c
> +++ linux-2.6/mm/truncate.c
> @@ -126,7 +126,7 @@ void truncate_inode_pages(struct address
>  
>       pagevec_init(&pvec, 0);
>       next = start;
> -     while (pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
> +     while (pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
>               for (i = 0; i < pagevec_count(&pvec); i++) {
>                       struct page *page = pvec.pages[i];
>                       pgoff_t page_index = page->index;
> @@ -160,7 +160,7 @@ void truncate_inode_pages(struct address
>       next = start;
>       for ( ; ; ) {
>               cond_resched();
> -             if (!pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
> +             if (!pagevec_lookup_nonatomic(&pvec, mapping, next, 
> PAGEVEC_SIZE)) {
>                       if (next == start)
>                               break;
>                       next = start;
> @@ -206,7 +206,7 @@ unsigned long invalidate_mapping_pages(s
>  
>       pagevec_init(&pvec, 0);
>       while (next <= end &&
> -                     pagevec_lookup(&pvec, mapping, next, PAGEVEC_SIZE)) {
> +             pagevec_lookup_nonatomic(&pvec, mapping, next, PAGEVEC_SIZE)) {
>               for (i = 0; i < pagevec_count(&pvec); i++) {
>                       struct page *page = pvec.pages[i];
>  
> Index: linux-2.6/mm/page-writeback.c
> ===================================================================
> --- linux-2.6.orig/mm/page-writeback.c
> +++ linux-2.6/mm/page-writeback.c
> @@ -811,6 +811,7 @@ int mapping_tagged(struct address_space 
>       unsigned long flags;
>       int ret;
>  
> +     /* XXX: radix_tree_tagged is safe to run without the lock? */
>       read_lock_irqsave(&mapping->tree_lock, flags);
>       ret = radix_tree_tagged(&mapping->page_tree, tag);
>       read_unlock_irqrestore(&mapping->tree_lock, flags);

-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to