On Fri, 2007-07-27 at 11:37 -0500, Adam Litke wrote: > Hey... I am amazed at how quickly you came back with a patch for this :) > Thanks for looking at it. Unfortunately there is one show-stopper and I > have some reservations (pun definitely intended) with your approach: Thanks for your great comments.
> > First, your patch does not pass the libhugetlbfs test > 'alloc-instantiate-race' which was written to tickle the race which > the mutex was introduced to solve. Your patch works for shared > mappings, but not for the private case. My testing of the private case might not have been thorough. Function hugetlb_cow has a race when multiple threads fault on the same private page index. But after I fixed it, alloc-instantiate-race still failed. I tried to google the source code tarball of the libhugetlbfs test suite, but couldn't find it. Could you send me a copy of the test source code? > > Second, the introduction of another pair of global counters triggers my > internal warning system... These global counters are known to cause > problems with NUMA and cpusets. Have you considered these interactions? flight_blocks is a variable per superblock. As you know, quota is not used frequently, so flight_blocks mostly isn't a cache-miss issue. flight_huge_pages might be a potential cache-miss issue with NUMA/cpusets. Currently, the global variables free_huge_pages and resv_huge_pages have the same cache issue with NUMA, as does the hugetlb_instantiation_mutex. I assume the new variable flight_huge_pages shares the same cache line with hugetlb_lock, free_huge_pages, and resv_huge_pages. A possible improvement is to add a simple check without the lock before calling alloc_huge_page in function hugetlb_no_page. The check could be: if ( (!free_huge_pages && flight_huge_pages) || (!(vma->vm_flags & VM_MAYSHARE) && free_huge_pages <= resv_huge_pages && flight_huge_pages) ) { hugetlb_rollback_quota(mapping); cond_resched(); goto retry; } Then, it becomes a read instead of a write before changing the variables. > Additionally, the commit/rollback logic you are using closely parallels > what we already do with the huge page reservation mechanism. Is there > any way you could integrate your stuff into the reservation system to > avoid all the duplicated logic? 
I add a new parameter flight_delta to function hugetlb_acct_memory and use hugetlb_acct_memory to replace both hugetlb_commit_page and hugetlb_rollback_page. As for quota, it's hard to integrate it into current reservation mechanism. Below is the new patch. Yanmin --- --- linux-2.6.22/fs/hugetlbfs/inode.c 2007-07-09 07:32:17.000000000 +0800 +++ linux-2.6.22_hugetlb/fs/hugetlbfs/inode.c 2007-07-26 14:52:04.000000000 +0800 @@ -662,6 +662,7 @@ hugetlbfs_fill_super(struct super_block spin_lock_init(&sbinfo->stat_lock); sbinfo->max_blocks = config.nr_blocks; sbinfo->free_blocks = config.nr_blocks; + sbinfo->flight_blocks = 0; sbinfo->max_inodes = config.nr_inodes; sbinfo->free_inodes = config.nr_inodes; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -694,8 +695,11 @@ int hugetlb_get_quota(struct address_spa if (sbinfo->free_blocks > -1) { spin_lock(&sbinfo->stat_lock); - if (sbinfo->free_blocks > 0) + if (sbinfo->free_blocks > 0) { sbinfo->free_blocks--; + sbinfo->flight_blocks ++; + } else if (sbinfo->flight_blocks) + ret = -EAGAIN; else ret = -ENOMEM; spin_unlock(&sbinfo->stat_lock); @@ -710,7 +714,30 @@ void hugetlb_put_quota(struct address_sp if (sbinfo->free_blocks > -1) { spin_lock(&sbinfo->stat_lock); - sbinfo->free_blocks++; + sbinfo->free_blocks ++; + spin_unlock(&sbinfo->stat_lock); + } +} + +void hugetlb_commit_quota(struct address_space *mapping) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); + + if (sbinfo->free_blocks > -1) { + spin_lock(&sbinfo->stat_lock); + sbinfo->flight_blocks --; + spin_unlock(&sbinfo->stat_lock); + } +} + +void hugetlb_rollback_quota(struct address_space *mapping) +{ + struct hugetlbfs_sb_info *sbinfo = HUGETLBFS_SB(mapping->host->i_sb); + + if (sbinfo->free_blocks > -1) { + spin_lock(&sbinfo->stat_lock); + sbinfo->free_blocks ++; + sbinfo->flight_blocks --; spin_unlock(&sbinfo->stat_lock); } } --- linux-2.6.22/include/linux/hugetlb.h 2007-07-09 07:32:17.000000000 +0800 +++ 
linux-2.6.22_hugetlb/include/linux/hugetlb.h 2007-07-24 16:54:39.000000000 +0800 @@ -140,6 +140,7 @@ struct hugetlbfs_config { struct hugetlbfs_sb_info { long max_blocks; /* blocks allowed */ long free_blocks; /* blocks free */ + long flight_blocks;/* blocks allocated but still not be used */ long max_inodes; /* inodes allowed */ long free_inodes; /* inodes free */ spinlock_t stat_lock; @@ -166,6 +167,8 @@ extern struct vm_operations_struct huget struct file *hugetlb_file_setup(const char *name, size_t); int hugetlb_get_quota(struct address_space *mapping); void hugetlb_put_quota(struct address_space *mapping); +void hugetlb_commit_quota(struct address_space *mapping); +void hugetlb_rollback_quota(struct address_space *mapping); static inline int is_file_hugepages(struct file *file) { --- linux-2.6.22/mm/hugetlb.c 2007-07-09 07:32:17.000000000 +0800 +++ linux-2.6.22_hugetlb/mm/hugetlb.c 2007-07-30 15:15:37.000000000 +0800 @@ -22,15 +22,16 @@ #include "internal.h" const unsigned long hugetlb_zero = 0, hugetlb_infinity = ~0UL; +/* + * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages + */ +static DEFINE_SPINLOCK(hugetlb_lock); static unsigned long nr_huge_pages, free_huge_pages, resv_huge_pages; +static unsigned long flight_huge_pages; unsigned long max_huge_pages; static struct list_head hugepage_freelists[MAX_NUMNODES]; static unsigned int nr_huge_pages_node[MAX_NUMNODES]; static unsigned int free_huge_pages_node[MAX_NUMNODES]; -/* - * Protects updates to hugepage_freelists, nr_huge_pages, and free_huge_pages - */ -static DEFINE_SPINLOCK(hugetlb_lock); static void clear_huge_page(struct page *page, unsigned long addr) { @@ -123,27 +124,43 @@ static int alloc_fresh_huge_page(void) static struct page *alloc_huge_page(struct vm_area_struct *vma, unsigned long addr) { - struct page *page; + struct page *page = NULL; spin_lock(&hugetlb_lock); - if (vma->vm_flags & VM_MAYSHARE) - resv_huge_pages--; - else if (free_huge_pages <= resv_huge_pages) - 
goto fail; + if (vma->vm_flags & VM_MAYSHARE) { + if (resv_huge_pages) + resv_huge_pages --; + else + goto out; + } else if (free_huge_pages <= resv_huge_pages + flight_huge_pages) + goto out; page = dequeue_huge_page(vma, addr); - if (!page) - goto fail; + if (page) { + set_page_refcounted(page); + flight_huge_pages ++; + } else if (vma->vm_flags & VM_MAYSHARE) + resv_huge_pages ++; +out: + if (!page && flight_huge_pages) + page = ERR_PTR(-EAGAIN); spin_unlock(&hugetlb_lock); - set_page_refcounted(page); return page; +} -fail: - if (vma->vm_flags & VM_MAYSHARE) - resv_huge_pages++; +static int hugetlb_acct_memory(long resv_delta, long flight_delta) +{ + int ret = -ENOMEM; + + spin_lock(&hugetlb_lock); + if ((resv_delta + resv_huge_pages) <= free_huge_pages) { + resv_huge_pages += resv_delta; + ret = 0; + } + flight_huge_pages += flight_delta; spin_unlock(&hugetlb_lock); - return NULL; + return ret; } static int __init hugetlb_init(void) @@ -438,6 +455,7 @@ static int hugetlb_cow(struct mm_struct { struct page *old_page, *new_page; int avoidcopy; + int ret = VM_FAULT_MINOR; old_page = pte_page(pte); @@ -446,38 +464,51 @@ static int hugetlb_cow(struct mm_struct avoidcopy = (page_count(old_page) == 1); if (avoidcopy) { set_huge_ptep_writable(vma, address, ptep); - return VM_FAULT_MINOR; + return ret; } page_cache_get(old_page); + + spin_unlock(&mm->page_table_lock); +retry: new_page = alloc_huge_page(vma, address); - if (!new_page) { - page_cache_release(old_page); - return VM_FAULT_OOM; - } + if (PTR_ERR(new_page) == -EAGAIN) { + if (likely(pte_same(*ptep, pte)) ) { + cond_resched(); + goto retry; + } else + spin_lock(&mm->page_table_lock); + } else if (!new_page) { + spin_lock(&mm->page_table_lock); + if (likely(pte_same(*ptep, pte))) + ret = VM_FAULT_OOM; + } else { + copy_huge_page(new_page, old_page, address, vma); + spin_lock(&mm->page_table_lock); - spin_unlock(&mm->page_table_lock); - copy_huge_page(new_page, old_page, address, vma); - 
spin_lock(&mm->page_table_lock); + ptep = huge_pte_offset(mm, address & HPAGE_MASK); + if (likely(pte_same(*ptep, pte))) { + /* Break COW */ + set_huge_pte_at(mm, address, ptep, + make_huge_pte(vma, new_page, 1)); + hugetlb_acct_memory(0, -1); + new_page = old_page; + } else + hugetlb_acct_memory(0, -1); - ptep = huge_pte_offset(mm, address & HPAGE_MASK); - if (likely(pte_same(*ptep, pte))) { - /* Break COW */ - set_huge_pte_at(mm, address, ptep, - make_huge_pte(vma, new_page, 1)); - /* Make the old page be freed below */ - new_page = old_page; + page_cache_release(new_page); } - page_cache_release(new_page); + page_cache_release(old_page); - return VM_FAULT_MINOR; + return ret; } int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, unsigned long address, pte_t *ptep, int write_access) { int ret = VM_FAULT_SIGBUS; + int result, page_allocated; unsigned long idx; unsigned long size; struct page *page; @@ -493,19 +524,49 @@ int hugetlb_no_page(struct mm_struct *mm * before we get page_table_lock. 
*/ retry: + result = -ENOMEM; + page_allocated = 0; page = find_lock_page(mapping, idx); if (!page) { size = i_size_read(mapping->host) >> HPAGE_SHIFT; if (idx >= size) goto out; - if (hugetlb_get_quota(mapping)) + result = hugetlb_get_quota(mapping); + if (result == -EAGAIN) { + cond_resched(); + goto retry; + } else if (result) { + page = find_lock_page(mapping, idx); + if (page) + goto get_page; goto out; + } + + /* To prevent too much cache miss, do some checking firstly */ + if ( (!free_huge_pages && flight_huge_pages) || + (!(vma->vm_flags & VM_MAYSHARE) && + free_huge_pages <= resv_huge_pages && + flight_huge_pages) ) { + hugetlb_rollback_quota(mapping); + cond_resched(); + goto retry; + } page = alloc_huge_page(vma, address); - if (!page) { - hugetlb_put_quota(mapping); + if (IS_ERR(page) || !page) { + hugetlb_rollback_quota(mapping); + result = -ENOMEM; + if (PTR_ERR(page) == -EAGAIN) { + cond_resched(); + goto retry; + } + + page = find_lock_page(mapping, idx); + if (page) + goto get_page; ret = VM_FAULT_OOM; goto out; } + page_allocated = 1; clear_huge_page(page, address); if (vma->vm_flags & VM_SHARED) { @@ -514,15 +575,22 @@ retry: err = add_to_page_cache(page, mapping, idx, GFP_KERNEL); if (err) { put_page(page); - hugetlb_put_quota(mapping); + hugetlb_acct_memory(1, -1); + hugetlb_rollback_quota(mapping); if (err == -EEXIST) goto retry; goto out; + } else { + hugetlb_acct_memory(0, -1); + hugetlb_commit_quota(mapping); + result = -ENOMEM; + page_allocated = 0; } } else lock_page(page); } +get_page: spin_lock(&mm->page_table_lock); size = i_size_read(mapping->host) >> HPAGE_SHIFT; if (idx >= size) @@ -536,6 +604,16 @@ retry: && (vma->vm_flags & VM_SHARED))); set_huge_pte_at(mm, address, ptep, new_pte); + /* + * If a new huge page is allocated for private mapping, + * need commit quota and page here + */ + if (page_allocated) + hugetlb_acct_memory(0, -1); + if (!result) + hugetlb_commit_quota(mapping); + + page_allocated = 0; if (write_access && 
!(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ ret = hugetlb_cow(mm, vma, address, ptep, new_pte); @@ -548,9 +626,13 @@ out: backout: spin_unlock(&mm->page_table_lock); - hugetlb_put_quota(mapping); unlock_page(page); put_page(page); + if (page_allocated) + hugetlb_acct_memory(0, -1); + if (!result) + hugetlb_rollback_quota(mapping); + goto out; } @@ -560,7 +642,6 @@ int hugetlb_fault(struct mm_struct *mm, pte_t *ptep; pte_t entry; int ret; - static DEFINE_MUTEX(hugetlb_instantiation_mutex); ptep = huge_pte_alloc(mm, address); if (!ptep) @@ -571,11 +652,9 @@ int hugetlb_fault(struct mm_struct *mm, * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - mutex_lock(&hugetlb_instantiation_mutex); entry = *ptep; if (pte_none(entry)) { ret = hugetlb_no_page(mm, vma, address, ptep, write_access); - mutex_unlock(&hugetlb_instantiation_mutex); return ret; } @@ -587,7 +666,6 @@ int hugetlb_fault(struct mm_struct *mm, if (write_access && !pte_write(entry)) ret = hugetlb_cow(mm, vma, address, ptep, entry); spin_unlock(&mm->page_table_lock); - mutex_unlock(&hugetlb_instantiation_mutex); return ret; } @@ -811,19 +889,6 @@ static long region_truncate(struct list_ return chg; } -static int hugetlb_acct_memory(long delta) -{ - int ret = -ENOMEM; - - spin_lock(&hugetlb_lock); - if ((delta + resv_huge_pages) <= free_huge_pages) { - resv_huge_pages += delta; - ret = 0; - } - spin_unlock(&hugetlb_lock); - return ret; -} - int hugetlb_reserve_pages(struct inode *inode, long from, long to) { long ret, chg; @@ -851,7 +916,7 @@ int hugetlb_reserve_pages(struct inode * if (chg > cpuset_mems_nr(free_huge_pages_node)) return -ENOMEM; - ret = hugetlb_acct_memory(chg); + ret = hugetlb_acct_memory(chg, 0); if (ret < 0) return ret; region_add(&inode->i_mapping->private_list, from, to); @@ -861,5 +926,6 @@ int hugetlb_reserve_pages(struct inode * void hugetlb_unreserve_pages(struct inode *inode, long offset, 
long freed) { long chg = region_truncate(&inode->i_mapping->private_list, offset); - hugetlb_acct_memory(freed - chg); + hugetlb_acct_memory(freed - chg, 0); } + - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/