Hi Tang,

On Tue, Jun 11, 2013 at 05:42:31PM +0800, Tang Chen wrote:
> Hi Benjamin,
> 
> Are you still working on this problem ?
> 
> Thanks. :)

Below is a copy of the most recent version of this patch I have worked 
on.  This version works and stands up to my testing using move_pages() to 
force the migration of the aio ring buffer.  A test program is available 
at http://www.kvack.org/~bcrl/aio/aio-numa-test.c .  Please note that 
this version is not suitable for mainline as the modifications to the 
anon inode code are undesirable, so that part needs reworking.
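The core of the test is small enough to sketch here.  The fragment below 
is a simplified illustration, not a verbatim copy of aio-numa-test.c: the 
event count and target node are arbitrary, and it leans on the fact that 
the io_setup() context handle is the userspace address of the ring buffer 
mapping.  Build with -lnuma for move_pages().

/*
 * Simplified sketch of the test (not the full aio-numa-test.c).
 * The event count and target node below are arbitrary.
 */
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <numaif.h>		/* move_pages(); link with -lnuma */
#include <linux/aio_abi.h>	/* aio_context_t */

int main(void)
{
	aio_context_t ctx = 0;
	void *pages[1];
	int node = 1;		/* assumes a second NUMA node exists */
	int status[1] = { -1 };

	if (syscall(SYS_io_setup, 64, &ctx) < 0) {
		perror("io_setup");
		return 1;
	}

	/* The context handle is the mmap address of the aio ring. */
	pages[0] = (void *)(unsigned long)ctx;

	/* Ask the kernel to migrate the first ring page to 'node'. */
	if (move_pages(0, 1, pages, &node, status, MPOL_MF_MOVE) < 0)
		perror("move_pages");
	else
		printf("ring page 0 now on node %d\n", status[0]);

	syscall(SYS_io_destroy, ctx);
	return 0;
}

With the patch applied, move_pages() succeeds and the aio_migratepage() 
callback below updates ctx->ring_pages to point at the new page; without 
it, the extra references held on the ring pages prevent migration.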

                -ben


 fs/aio.c                |  113 ++++++++++++++++++++++++++++++++++++++++++++----
 fs/anon_inodes.c        |   14 ++++-
 include/linux/migrate.h |    3 +
 mm/migrate.c            |    2 
 mm/swap.c               |    1 
 5 files changed, 121 insertions(+), 12 deletions(-)

diff --git a/fs/aio.c b/fs/aio.c
index c5b1a8c..a951690 100644
--- a/fs/aio.c
+++ b/fs/aio.c
@@ -35,6 +35,9 @@
 #include <linux/eventfd.h>
 #include <linux/blkdev.h>
 #include <linux/compat.h>
+#include <linux/anon_inodes.h>
+#include <linux/migrate.h>
+#include <linux/ramfs.h>
 
 #include <asm/kmap_types.h>
 #include <asm/uaccess.h>
@@ -108,6 +111,7 @@ struct kioctx {
        } ____cacheline_aligned_in_smp;
 
        struct page             *internal_pages[AIO_RING_PAGES];
+       struct file             *ctx_file;
 };
 
 /*------ sysctl variables----*/
@@ -136,18 +140,80 @@ __initcall(aio_setup);
 
 static void aio_free_ring(struct kioctx *ctx)
 {
-       long i;
-
-       for (i = 0; i < ctx->nr_pages; i++)
-               put_page(ctx->ring_pages[i]);
+       int i;
 
        if (ctx->mmap_size)
                vm_munmap(ctx->mmap_base, ctx->mmap_size);
 
+       if (ctx->ctx_file)
+               truncate_setsize(ctx->ctx_file->f_inode, 0);
+
+       for (i = 0; i < ctx->nr_pages; i++) {
+               pr_debug("pid(%d) [%d] page->count=%d\n", current->pid, i,
+                        page_count(ctx->ring_pages[i]));
+               put_page(ctx->ring_pages[i]);
+       }
+
        if (ctx->ring_pages && ctx->ring_pages != ctx->internal_pages)
                kfree(ctx->ring_pages);
+
+       if (ctx->ctx_file) {
+               truncate_setsize(ctx->ctx_file->f_inode, 0);
+               pr_debug("pid(%d) i_nlink=%u d_count=%d, d_unhashed=%d i_count=%d\n",
+                        current->pid, ctx->ctx_file->f_inode->i_nlink,
+                        ctx->ctx_file->f_path.dentry->d_count,
+                        d_unhashed(ctx->ctx_file->f_path.dentry),
+                        atomic_read(&ctx->ctx_file->f_path.dentry->d_inode->i_count));
+               fput(ctx->ctx_file);
+               ctx->ctx_file = NULL;
+       }
+}
+
+static int aio_ctx_mmap(struct file *file, struct vm_area_struct *vma)
+{
+       vma->vm_ops = &generic_file_vm_ops;
+       return 0;
+}
+
+static const struct file_operations aio_ctx_fops = {
+       .mmap   = aio_ctx_mmap,
+};
+
+static int aio_set_page_dirty(struct page *page)
+{
+       return 0;
+}
+
+static int aio_migratepage(struct address_space *mapping, struct page *new,
+                          struct page *old, enum migrate_mode mode)
+{
+       struct kioctx *ctx = mapping->private_data;
+       unsigned long flags;
+       unsigned idx = old->index;
+       int rc;
+
+       BUG_ON(PageWriteback(old));    /* Writeback must be complete */
+       put_page(old);
+       rc = migrate_page_move_mapping(mapping, new, old, NULL, mode);
+       if (rc != MIGRATEPAGE_SUCCESS) {
+               get_page(old);
+               return rc;
+       }
+       get_page(new);
+
+       spin_lock_irqsave(&ctx->completion_lock, flags);
+       migrate_page_copy(new, old);
+       ctx->ring_pages[idx] = new;
+       spin_unlock_irqrestore(&ctx->completion_lock, flags);
+
+       return MIGRATEPAGE_SUCCESS;
 }
 
+static const struct address_space_operations aio_ctx_aops = {
+       .set_page_dirty = aio_set_page_dirty,
+       .migratepage    = aio_migratepage,
+};
+
 static int aio_setup_ring(struct kioctx *ctx)
 {
        struct aio_ring *ring;
@@ -155,6 +221,7 @@ static int aio_setup_ring(struct kioctx *ctx)
        struct mm_struct *mm = current->mm;
        unsigned long size, populate;
        int nr_pages;
+       int i;
 
        /* Compensate for the ring buffer's head/tail overlap entry */
        nr_events += 2; /* 1 is required, 2 for good luck */
@@ -166,6 +233,28 @@ static int aio_setup_ring(struct kioctx *ctx)
        if (nr_pages < 0)
                return -EINVAL;
 
+       ctx->ctx_file = anon_inode_getfile("[aio]", &aio_ctx_fops, ctx, O_RDWR);
+       if (IS_ERR(ctx->ctx_file)) {
+               ctx->ctx_file = NULL;
+               return -EAGAIN;
+       }
+       ctx->ctx_file->f_inode->i_mapping->a_ops = &aio_ctx_aops;
+       ctx->ctx_file->f_inode->i_mapping->private_data = ctx;
+       ctx->ctx_file->f_inode->i_size = PAGE_SIZE * (loff_t)nr_pages;
+
+       for (i=0; i<nr_pages; i++) {
+               struct page *page;
+               page = find_or_create_page(ctx->ctx_file->f_inode->i_mapping,
+                                          i, GFP_HIGHUSER | __GFP_ZERO);
+               if (!page)
+                       break;
+               pr_debug("pid(%d) page[%d]->count=%d\n",
+                        current->pid, i, page_count(page));
+               SetPageUptodate(page);
+               SetPageDirty(page);
+               unlock_page(page);
+       }
+
        nr_events = (PAGE_SIZE * nr_pages - sizeof(struct aio_ring)) / sizeof(struct io_event);
 
        ctx->nr_events = 0;
@@ -180,20 +269,25 @@ static int aio_setup_ring(struct kioctx *ctx)
        ctx->mmap_size = nr_pages * PAGE_SIZE;
        pr_debug("attempting mmap of %lu bytes\n", ctx->mmap_size);
        down_write(&mm->mmap_sem);
-       ctx->mmap_base = do_mmap_pgoff(NULL, 0, ctx->mmap_size,
-                                      PROT_READ|PROT_WRITE,
-                                      MAP_ANONYMOUS|MAP_PRIVATE, 0, &populate);
+       ctx->mmap_base = do_mmap_pgoff(ctx->ctx_file, 0, ctx->mmap_size,
+                                      PROT_READ | PROT_WRITE,
+                                      MAP_SHARED | MAP_POPULATE, 0,
+                                      &populate);
        if (IS_ERR((void *)ctx->mmap_base)) {
                up_write(&mm->mmap_sem);
                ctx->mmap_size = 0;
                aio_free_ring(ctx);
                return -EAGAIN;
        }
+       up_write(&mm->mmap_sem);
+       mm_populate(ctx->mmap_base, populate);
 
        pr_debug("mmap address: 0x%08lx\n", ctx->mmap_base);
        ctx->nr_pages = get_user_pages(current, mm, ctx->mmap_base, nr_pages,
                                       1, 0, ctx->ring_pages, NULL);
-       up_write(&mm->mmap_sem);
+       for (i=0; i<ctx->nr_pages; i++) {
+               put_page(ctx->ring_pages[i]);
+       }
 
        if (unlikely(ctx->nr_pages != nr_pages)) {
                aio_free_ring(ctx);
@@ -403,6 +497,8 @@ out_cleanup:
        err = -EAGAIN;
        aio_free_ring(ctx);
 out_freectx:
+       if (ctx->ctx_file)
+               fput(ctx->ctx_file);
        kmem_cache_free(kioctx_cachep, ctx);
        pr_debug("error allocating ioctx %d\n", err);
        return ERR_PTR(err);
@@ -852,6 +948,7 @@ SYSCALL_DEFINE2(io_setup, unsigned, nr_events, aio_context_t __user *, ctxp)
        ioctx = ioctx_alloc(nr_events);
        ret = PTR_ERR(ioctx);
        if (!IS_ERR(ioctx)) {
+               ctx = ioctx->user_id;
                ret = put_user(ioctx->user_id, ctxp);
                if (ret)
                        kill_ioctx(ioctx);
diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c
index 47a65df..376d289 100644
--- a/fs/anon_inodes.c
+++ b/fs/anon_inodes.c
@@ -131,6 +131,7 @@ struct file *anon_inode_getfile(const char *name,
        struct qstr this;
        struct path path;
        struct file *file;
+       struct inode *inode;
 
        if (IS_ERR(anon_inode_inode))
                return ERR_PTR(-ENODEV);
@@ -138,6 +139,12 @@ struct file *anon_inode_getfile(const char *name,
        if (fops->owner && !try_module_get(fops->owner))
                return ERR_PTR(-ENOENT);
 
+       inode = anon_inode_mkinode(anon_inode_inode->i_sb);
+       if (IS_ERR(inode)) {
+               file = ERR_PTR(-ENOMEM);
+               goto err_module;
+       }
+
        /*
         * Link the inode to a directory entry by creating a unique name
         * using the inode sequence number.
@@ -155,17 +162,18 @@ struct file *anon_inode_getfile(const char *name,
         * We know the anon_inode inode count is always greater than zero,
         * so ihold() is safe.
         */
-       ihold(anon_inode_inode);
+       //ihold(inode);
 
-       d_instantiate(path.dentry, anon_inode_inode);
+       d_instantiate(path.dentry, inode);
 
        file = alloc_file(&path, OPEN_FMODE(flags), fops);
        if (IS_ERR(file))
                goto err_dput;
-       file->f_mapping = anon_inode_inode->i_mapping;
+       file->f_mapping = inode->i_mapping;
 
        file->f_flags = flags & (O_ACCMODE | O_NONBLOCK);
        file->private_data = priv;
+       drop_nlink(inode);
 
        return file;
 
diff --git a/include/linux/migrate.h b/include/linux/migrate.h
index a405d3dc..b6f3289 100644
--- a/include/linux/migrate.h
+++ b/include/linux/migrate.h
@@ -55,6 +55,9 @@ extern int migrate_vmas(struct mm_struct *mm,
 extern void migrate_page_copy(struct page *newpage, struct page *page);
 extern int migrate_huge_page_move_mapping(struct address_space *mapping,
                                  struct page *newpage, struct page *page);
+extern int migrate_page_move_mapping(struct address_space *mapping,
+                struct page *newpage, struct page *page,
+                struct buffer_head *head, enum migrate_mode mode);
 #else
 
 static inline void putback_lru_pages(struct list_head *l) {}
diff --git a/mm/migrate.c b/mm/migrate.c
index 27ed225..ac9c3a9 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -294,7 +294,7 @@ static inline bool buffer_migrate_lock_buffers(struct buffer_head *head,
  * 2 for pages with a mapping
  * 3 for pages with a mapping and PagePrivate/PagePrivate2 set.
  */
-static int migrate_page_move_mapping(struct address_space *mapping,
+int migrate_page_move_mapping(struct address_space *mapping,
                struct page *newpage, struct page *page,
                struct buffer_head *head, enum migrate_mode mode)
 {
diff --git a/mm/swap.c b/mm/swap.c
index dfd7d71..bbfba0a 100644
--- a/mm/swap.c
+++ b/mm/swap.c
@@ -160,6 +160,7 @@ skip_lock_tail:
 
 void put_page(struct page *page)
 {
+       BUG_ON(page_count(page) <= 0);
        if (unlikely(PageCompound(page)))
                put_compound_page(page);
        else if (put_page_testzero(page))