On Sun, 20 Jul 2014, David Herrmann wrote:

> If we set SEAL_WRITE on a file, we must make sure there cannot be any
> ongoing write-operations on the file. For write() calls, we simply lock
> the inode mutex, for mmap() we simply verify there're no writable
> mappings. However, there might be pages pinned by AIO, Direct-IO and
> similar operations via GUP. We must make sure those do not write to the
> memfd file after we set SEAL_WRITE.
> 
> As there is no way to notify GUP users to drop pages or to wait for them
> to be done, we implement the wait ourself: When setting SEAL_WRITE, we
> check all pages for their ref-count. If it's bigger than 1, we know
> there's some user of the page. We then mark the page and wait for up to
> 150ms for those ref-counts to be dropped. If the ref-counts are not
> dropped in time, we refuse the seal operation.
> 
> Signed-off-by: David Herrmann <dh.herrm...@gmail.com>

Acked-by: Hugh Dickins <hu...@google.com>

I'd have moved this one up before the testing ones - except changing
the sequence in between postings can be confusing.  I'd be happy if
akpm happened to move it up - but unconcerned if he did not.

> ---
>  mm/shmem.c | 110 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++++++-
>  1 file changed, 109 insertions(+), 1 deletion(-)
> 
> diff --git a/mm/shmem.c b/mm/shmem.c
> index 770e072..df1aceb 100644
> --- a/mm/shmem.c
> +++ b/mm/shmem.c
> @@ -1780,9 +1780,117 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence)
>       return offset;
>  }
>  
> +/*
> + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes,
> + * so reuse a tag which we firmly believe is never set or cleared on shmem.
> + */
> +#define SHMEM_TAG_PINNED        PAGECACHE_TAG_TOWRITE
> +#define LAST_SCAN               4       /* about 150ms max */
> +
> +static void shmem_tag_pins(struct address_space *mapping)
> +{
> +     struct radix_tree_iter iter;
> +     void **slot;
> +     pgoff_t start;
> +     struct page *page;
> +
> +     lru_add_drain();
> +     start = 0;
> +     rcu_read_lock();
> +
> +restart:
> +     radix_tree_for_each_slot(slot, &mapping->page_tree, &iter, start) {
> +             page = radix_tree_deref_slot(slot);
> +             if (!page || radix_tree_exception(page)) {
> +                     if (radix_tree_deref_retry(page))
> +                             goto restart;
> +             } else if (page_count(page) - page_mapcount(page) > 1) {
> +                     spin_lock_irq(&mapping->tree_lock);
> +                     radix_tree_tag_set(&mapping->page_tree, iter.index,
> +                                        SHMEM_TAG_PINNED);
> +                     spin_unlock_irq(&mapping->tree_lock);
> +             }
> +
> +             if (need_resched()) {
> +                     cond_resched_rcu();
> +                     start = iter.index + 1;
> +                     goto restart;
> +             }
> +     }
> +     rcu_read_unlock();
> +}
> +
> +/*
> + * Setting SEAL_WRITE requires us to verify there's no pending writer. However,
> + * via get_user_pages(), drivers might have some pending I/O without any active
> + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages
> + * and see whether it has an elevated ref-count. If so, we tag them and wait for
> + * them to be dropped.
> + * The caller must guarantee that no new user will acquire writable references
> + * to those pages to avoid races.
> + */
>  static int shmem_wait_for_pins(struct address_space *mapping)
>  {
> -     return 0;
> +     struct radix_tree_iter iter;
> +     void **slot;
> +     pgoff_t start;
> +     struct page *page;
> +     int error, scan;
> +
> +     shmem_tag_pins(mapping);
> +
> +     error = 0;
> +     for (scan = 0; scan <= LAST_SCAN; scan++) {
> +             if (!radix_tree_tagged(&mapping->page_tree, SHMEM_TAG_PINNED))
> +                     break;
> +
> +             if (!scan)
> +                     lru_add_drain_all();
> +             else if (schedule_timeout_killable((HZ << scan) / 200))
> +                     scan = LAST_SCAN;
> +
> +             start = 0;
> +             rcu_read_lock();
> +restart:
> +             radix_tree_for_each_tagged(slot, &mapping->page_tree, &iter,
> +                                        start, SHMEM_TAG_PINNED) {
> +
> +                     page = radix_tree_deref_slot(slot);
> +                     if (radix_tree_exception(page)) {
> +                             if (radix_tree_deref_retry(page))
> +                                     goto restart;
> +
> +                             page = NULL;
> +                     }
> +
> +                     if (page &&
> +                         page_count(page) - page_mapcount(page) != 1) {
> +                             if (scan < LAST_SCAN)
> +                                     goto continue_resched;
> +
> +                             /*
> +                              * On the last scan, we clean up all those tags
> +                              * we inserted; but make a note that we still
> +                              * found pages pinned.
> +                              */
> +                             error = -EBUSY;
> +                     }
> +
> +                     spin_lock_irq(&mapping->tree_lock);
> +                     radix_tree_tag_clear(&mapping->page_tree,
> +                                          iter.index, SHMEM_TAG_PINNED);
> +                     spin_unlock_irq(&mapping->tree_lock);
> +continue_resched:
> +                     if (need_resched()) {
> +                             cond_resched_rcu();
> +                             start = iter.index + 1;
> +                             goto restart;
> +                     }
> +             }
> +             rcu_read_unlock();
> +     }
> +
> +     return error;
>  }
>  
>  #define F_ALL_SEALS (F_SEAL_SEAL | \
> -- 
> 2.0.2
> 
> 
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to