On Fri, 2007-07-27 at 11:37 -0500, Adam Litke wrote:
> Hey... I am amazed at how quickly you came back with a patch for this :)
> Thanks for looking at it.  Unfortunately there is one show-stopper and I
> have some reservations (pun definitely intended) with your approach:
Thanks for your great comments.

> 
> First, your patch does not pass the libhugetlbfs test
> 'alloc-instantiate-race' which was written to tickle the the race which
> the mutex was introduced to solve.  Your patch works for shared
> mappings, but not for the private case.
My testing of the private-mapping case might not have been thorough. Function hugetlb_cow has
a race when multiple threads fault on the same private page index. But even after I fixed it,
alloc-instantiate-race still failed.

I tried to find the source tarball of the libhugetlbfs test suite via Google, but
couldn't locate it. Could you send me a copy of the test source code?

> 
> Second, the introduction of another pair of global counters triggers my
> internal warning system... These global counters are known to cause
> problems with NUMA and cpusets.  Have you considered these interactions?
flight_blocks is a per-superblock variable. As you know, quota is not used
frequently, so flight_blocks mostly doesn't cause cache-miss problems.

flight_huge_pages might be a potential source of cache misses with NUMA/cpusets. 

Currently, the global variables free_huge_pages and resv_huge_pages have the same
cache issue with NUMA, as does the hugetlb_instantiation_mutex. I assume the new
variable flight_huge_pages shares the same cache line with hugetlb_lock,
free_huge_pages, and resv_huge_pages.

A possible improvement about it is to add a simple check without lock
before calling alloc_huge_page in function hugetlb_no_page. The
check could be:
if ( (!free_huge_pages && flight_huge_pages) ||
          (!(vma->vm_flags & VM_MAYSHARE) &&
           free_huge_pages <= resv_huge_pages &&
           flight_huge_pages) ) {
               hugetlb_rollback_quota(mapping);
               cond_resched();
               goto retry;
}

Then the fast path becomes a read instead of a write, before the variables are actually changed.

> Additionally, the commit/rollback logic you are using closely parallels
> what we already do with the huge page reservation mechanism.  Is there
> any way you could integrate your stuff into the reservation system to
> avoid all the duplicated logic?
I added a new parameter, flight_delta, to function hugetlb_acct_memory and use
hugetlb_acct_memory to replace both hugetlb_commit_page and
hugetlb_rollback_page.
As for quota, it's hard to integrate into the current reservation mechanism.

Below is the new patch.

Yanmin

---

--- linux-2.6.22/fs/hugetlbfs/inode.c   2007-07-09 07:32:17.000000000 +0800
+++ linux-2.6.22_hugetlb/fs/hugetlbfs/inode.c   2007-07-26 14:52:04.000000000 
+0800
@@ -662,6 +662,7 @@ hugetlbfs_fill_super(struct super_block 
        spin_lock_init(&sbinfo->stat_lock);
        sbinfo->max_blocks = config.nr_blocks;
        sbinfo->free_blocks = config.nr_blocks;
+       sbinfo->flight_blocks = 0;
        sbinfo->max_inodes = config.nr_inodes;
        sbinfo->free_inodes = config.nr_inodes;
        sb->s_maxbytes = MAX_LFS_FILESIZE;
@@ -694,8 +695,11 @@ int hugetlb_get_quota(struct address_spa
 
        if (sbinfo->free_blocks > -1) {
                spin_lock(&sbinfo->stat_lock);
-               if (sbinfo->free_blocks > 0)
+               if (sbinfo->free_blocks > 0) {
                        sbinfo->free_blocks--;
+                       sbinfo->flight_blocks ++;
+               } else if (sbinfo->flight_blocks)
+                       ret = -EAGAIN;
                else
                        ret = -ENOMEM;
                spin_unlock(&sbinfo->stat_lock);
@@ -710,7 +714,30 @@ void hugetlb_put_quota(struct address_sp
 
        if (sbinfo->free_blocks > -1) {
                spin_lock(&sbinfo->stat_lock);
-               sbinfo->free_blocks++;
+               sbinfo->free_blocks ++;
+               spin_unlock(&sbinfo->stat_lock);
+       }
+}
+
+void hugetlb_commit_quota(struct address_space *mapping)
+{
+       struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
+
+       if (sbinfo->free_blocks > -1) {
+               spin_lock(&sbinfo->stat_lock);
+               sbinfo->flight_blocks --;
+               spin_unlock(&sbinfo->stat_lock);
+       }
+}
+
+void hugetlb_rollback_quota(struct address_space *mapping)
+{
+       struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb);
+
+       if (sbinfo->free_blocks > -1) {
+               spin_lock(&sbinfo->stat_lock);
+               sbinfo->free_blocks ++;
+               sbinfo->flight_blocks --;
                spin_unlock(&sbinfo->stat_lock);
        }
 }
--- linux-2.6.22/include/linux/hugetlb.h        2007-07-09 07:32:17.000000000 
+0800
+++ linux-2.6.22_hugetlb/include/linux/hugetlb.h        2007-07-24 
16:54:39.000000000 +0800
@@ -140,6 +140,7 @@ struct hugetlbfs_config {
 struct hugetlbfs_sb_info {
        long    max_blocks;   /* blocks allowed */
        long    free_blocks;  /* blocks free */
+       long    flight_blocks;/* blocks allocated but still not be used */
        long    max_inodes;   /* inodes allowed */
        long    free_inodes;  /* inodes free */
        spinlock_t      stat_lock;
@@ -166,6 +167,8 @@ extern struct vm_operations_struct huget
 struct file *hugetlb_file_setup(const char *name, size_t);
 int hugetlb_get_quota(struct address_space *mapping);
 void hugetlb_put_quota(struct address_space *mapping);
+void hugetlb_commit_quota(struct address_space *mapping);
+void hugetlb_rollback_quota(struct address_space *mapping);
 
 static inline int is_file_hugepages(struct file *file)
 {
--- linux-2.6.22/mm/hugetlb.c   2007-07-09 07:32:17.000000000 +0800
+++ linux-2.6.22_hugetlb/mm/hugetlb.c   2007-07-30 15:15:37.000000000 +0800
@@ -22,15 +22,16 @@
 #include "internal.h"
 
 const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL;
+/*
+ * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
+ */
+static DEFINE_SPINLOCK(hugetlb_lock);
 static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages;
+static unsigned long flight_huge_pages;
 unsigned long max_huge_pages;
 static struct list_head hugepage_freelists[MAX_NUMNODES];
 static unsigned int nr_huge_pages_node[MAX_NUMNODES];
 static unsigned int free_huge_pages_node[MAX_NUMNODES];
-/*
- * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages
- */
-static DEFINE_SPINLOCK(hugetlb_lock);
 
 static void clear_huge_page(struct page *page, unsigned long addr)
 {
@@ -123,27 +124,43 @@ static int alloc_fresh_huge_page(void)
 static struct page *alloc_huge_page(struct vm_area_struct *vma,
                                    unsigned long addr)
 {
-       struct page *page;
+       struct page *page = NULL;
 
        spin_lock(&hugetlb_lock);
-       if (vma->vm_flags & VM_MAYSHARE)
-               resv_huge_pages--;
-       else if (free_huge_pages <= resv_huge_pages)
-               goto fail;
+       if (vma->vm_flags & VM_MAYSHARE) {
+               if (resv_huge_pages)
+                       resv_huge_pages --;
+               else 
+                       goto out;
+       } else if (free_huge_pages <= resv_huge_pages + flight_huge_pages)
+               goto out;
 
        page = dequeue_huge_page(vma, addr);
-       if (!page)
-               goto fail;
+       if (page) {
+               set_page_refcounted(page);
+               flight_huge_pages ++;
+       } else if (vma->vm_flags & VM_MAYSHARE)
+                       resv_huge_pages ++;
 
+out:
+       if (!page && flight_huge_pages)
+               page = ERR_PTR(-EAGAIN);
        spin_unlock(&hugetlb_lock);
-       set_page_refcounted(page);
        return page;
+}
 
-fail:
-       if (vma->vm_flags & VM_MAYSHARE)
-               resv_huge_pages++;
+static int hugetlb_acct_memory(long resv_delta, long flight_delta)
+{
+       int ret = -ENOMEM;
+
+       spin_lock(&hugetlb_lock);
+       if ((resv_delta + resv_huge_pages) <= free_huge_pages) {
+               resv_huge_pages += resv_delta;
+               ret = 0;
+       }
+       flight_huge_pages += flight_delta;
        spin_unlock(&hugetlb_lock);
-       return NULL;
+       return ret;
 }
 
 static int __init hugetlb_init(void)
@@ -438,6 +455,7 @@ static int hugetlb_cow(struct mm_struct 
 {
        struct page *old_page, *new_page;
        int avoidcopy;
+       int ret = VM_FAULT_MINOR;
 
        old_page = pte_page(pte);
 
@@ -446,38 +464,51 @@ static int hugetlb_cow(struct mm_struct 
        avoidcopy = (page_count(old_page) == 1);
        if (avoidcopy) {
                set_huge_ptep_writable(vma, address, ptep);
-               return VM_FAULT_MINOR;
+               return ret;
        }
 
        page_cache_get(old_page);
+
+       spin_unlock(&mm->page_table_lock);
+retry:
        new_page = alloc_huge_page(vma, address);
 
-       if (!new_page) {
-               page_cache_release(old_page);
-               return VM_FAULT_OOM;
-       }
+       if (PTR_ERR(new_page) == -EAGAIN) {
+               if (likely(pte_same(*ptep, pte)) ) {
+                       cond_resched();
+                       goto retry;
+               } else
+                       spin_lock(&mm->page_table_lock);
+       } else if (!new_page) {
+               spin_lock(&mm->page_table_lock);
+               if (likely(pte_same(*ptep, pte)))
+                       ret = VM_FAULT_OOM;
+       } else {
+               copy_huge_page(new_page, old_page, address, vma);
+               spin_lock(&mm->page_table_lock);
 
-       spin_unlock(&mm->page_table_lock);
-       copy_huge_page(new_page, old_page, address, vma);
-       spin_lock(&mm->page_table_lock);
+               ptep = huge_pte_offset(mm, address & HPAGE_MASK);
+               if (likely(pte_same(*ptep, pte))) {
+                       /* Break COW */
+                       set_huge_pte_at(mm, address, ptep,
+                                       make_huge_pte(vma, new_page, 1));
+                       hugetlb_acct_memory(0, -1);
+                       new_page = old_page;
+               } else
+                       hugetlb_acct_memory(0, -1);
 
-       ptep = huge_pte_offset(mm, address & HPAGE_MASK);
-       if (likely(pte_same(*ptep, pte))) {
-               /* Break COW */
-               set_huge_pte_at(mm, address, ptep,
-                               make_huge_pte(vma, new_page, 1));
-               /* Make the old page be freed below */
-               new_page = old_page;
+               page_cache_release(new_page);
        }
-       page_cache_release(new_page);
+
        page_cache_release(old_page);
-       return VM_FAULT_MINOR;
+       return ret;
 }
 
 int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
                        unsigned long address, pte_t *ptep, int write_access)
 {
        int ret = VM_FAULT_SIGBUS;
+       int result, page_allocated;
        unsigned long idx;
        unsigned long size;
        struct page *page;
@@ -493,19 +524,49 @@ int hugetlb_no_page(struct mm_struct *mm
         * before we get page_table_lock.
         */
 retry:
+       result = -ENOMEM;
+       page_allocated = 0;
        page = find_lock_page(mapping, idx);
        if (!page) {
                size = i_size_read(mapping->host) >> HPAGE_SHIFT;
                if (idx >= size)
                        goto out;
-               if (hugetlb_get_quota(mapping))
+               result = hugetlb_get_quota(mapping);
+               if (result == -EAGAIN) {
+                       cond_resched();
+                       goto retry;
+               } else if (result) {
+                       page = find_lock_page(mapping, idx);
+                       if (page)
+                               goto get_page;
                        goto out;
+               }
+
+               /* To prevent too much cache miss, do some checking firstly */
+               if ( (!free_huge_pages && flight_huge_pages) ||
+                       (!(vma->vm_flags & VM_MAYSHARE) &&
+                       free_huge_pages <= resv_huge_pages &&
+                       flight_huge_pages) ) {
+                               hugetlb_rollback_quota(mapping);
+                               cond_resched();
+                               goto retry;
+               }
                page = alloc_huge_page(vma, address);
-               if (!page) {
-                       hugetlb_put_quota(mapping);
+               if (IS_ERR(page) || !page) {
+                       hugetlb_rollback_quota(mapping);
+                       result = -ENOMEM;
+                       if (PTR_ERR(page) == -EAGAIN) {
+                               cond_resched();
+                               goto retry;
+                       }
+
+                       page = find_lock_page(mapping, idx);
+                       if (page)
+                               goto get_page;
                        ret = VM_FAULT_OOM;
                        goto out;
                }
+               page_allocated = 1;
                clear_huge_page(page, address);
 
                if (vma->vm_flags & VM_SHARED) {
@@ -514,15 +575,22 @@ retry:
                        err = add_to_page_cache(page, mapping, idx, GFP_KERNEL);
                        if (err) {
                                put_page(page);
-                               hugetlb_put_quota(mapping);
+                               hugetlb_acct_memory(1, -1);
+                               hugetlb_rollback_quota(mapping);
                                if (err == -EEXIST)
                                        goto retry;
                                goto out;
+                       } else {
+                               hugetlb_acct_memory(0, -1);
+                               hugetlb_commit_quota(mapping);
+                               result = -ENOMEM;
+                               page_allocated = 0;
                        }
                } else
                        lock_page(page);
        }
 
+get_page:
        spin_lock(&mm->page_table_lock);
        size = i_size_read(mapping->host) >> HPAGE_SHIFT;
        if (idx >= size)
@@ -536,6 +604,16 @@ retry:
                                && (vma->vm_flags & VM_SHARED)));
        set_huge_pte_at(mm, address, ptep, new_pte);
 
+       /*
+        * If a new huge page is allocated for private mapping,
+        * need commit quota and page here
+        */
+       if (page_allocated)
+               hugetlb_acct_memory(0, -1);
+       if (!result)
+               hugetlb_commit_quota(mapping);
+
+       page_allocated = 0;
        if (write_access && !(vma->vm_flags & VM_SHARED)) {
                /* Optimization, do the COW without a second fault */
                ret = hugetlb_cow(mm, vma, address, ptep, new_pte);
@@ -548,9 +626,13 @@ out:
 
 backout:
        spin_unlock(&mm->page_table_lock);
-       hugetlb_put_quota(mapping);
        unlock_page(page);
        put_page(page);
+       if (page_allocated)
+               hugetlb_acct_memory(0, -1);
+       if (!result)
+               hugetlb_rollback_quota(mapping);
+
        goto out;
 }
 
@@ -560,7 +642,6 @@ int hugetlb_fault(struct mm_struct *mm, 
        pte_t *ptep;
        pte_t entry;
        int ret;
-       static DEFINE_MUTEX(hugetlb_instantiation_mutex);
 
        ptep = huge_pte_alloc(mm, address);
        if (!ptep)
@@ -571,11 +652,9 @@ int hugetlb_fault(struct mm_struct *mm, 
         * get spurious allocation failures if two CPUs race to instantiate
         * the same page in the page cache.
         */
-       mutex_lock(&hugetlb_instantiation_mutex);
        entry = *ptep;
        if (pte_none(entry)) {
                ret = hugetlb_no_page(mm, vma, address, ptep, write_access);
-               mutex_unlock(&hugetlb_instantiation_mutex);
                return ret;
        }
 
@@ -587,7 +666,6 @@ int hugetlb_fault(struct mm_struct *mm, 
                if (write_access && !pte_write(entry))
                        ret = hugetlb_cow(mm, vma, address, ptep, entry);
        spin_unlock(&mm->page_table_lock);
-       mutex_unlock(&hugetlb_instantiation_mutex);
 
        return ret;
 }
@@ -811,19 +889,6 @@ static long region_truncate(struct list_
        return chg;
 }
 
-static int hugetlb_acct_memory(long delta)
-{
-       int ret = -ENOMEM;
-
-       spin_lock(&hugetlb_lock);
-       if ((delta + resv_huge_pages) <= free_huge_pages) {
-               resv_huge_pages += delta;
-               ret = 0;
-       }
-       spin_unlock(&hugetlb_lock);
-       return ret;
-}
-
 int hugetlb_reserve_pages(struct inode *inode, long from, long to)
 {
        long ret, chg;
@@ -851,7 +916,7 @@ int hugetlb_reserve_pages(struct inode *
        if (chg > cpuset_mems_nr(free_huge_pages_node))
                return -ENOMEM;
 
-       ret = hugetlb_acct_memory(chg);
+       ret = hugetlb_acct_memory(chg, 0);
        if (ret < 0)
                return ret;
        region_add(&inode->i_mapping->private_list, from, to);
@@ -861,5 +926,6 @@ int hugetlb_reserve_pages(struct inode *
 void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed)
 {
        long chg = region_truncate(&inode->i_mapping->private_list, offset);
-       hugetlb_acct_memory(freed - chg);
+       hugetlb_acct_memory(freed - chg, 0);
 }
+
-
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to [EMAIL PROTECTED]
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to