Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/24/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: Aubrey Li wrote: > On 1/19/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: >> Hi Aubrey, >> >> I used your patch on my PPC64 box and I do not get expected >> behavior. As you had requested, I am attaching zoneinfo and meminfo >> dumps: >> >> Please let me know if you need any further data to help me out with >> the test/experiment. >> > > Although I have no PPC64 box in hand, I think the logic should be the same. > get_page_from_freelist() is called 5 times in __alloc_pages(). > > 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE; > 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE; > We should have the same result on the first two times get_page_from_freelist(). > > 3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) > && !in_interrupt()) >alloc_flags = ALLOC_NO_WATERMARKS > The case on my platform will never enter this branch. If the branch > occurs on your side, > The limit will be omitted. Because NO watermark, zone_watermark_ok() > will not be checked. memory will be allocated directly. > > 4)if (likely(did_some_progress)) { >alloc_flags should include ALLOC_PAGECACHE. > So we should have the same result on this call. > > 5)} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { >alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE > > This branch will not hit on my case. You may need to check it. > > If 3) or 5) occurs on your platform, I think you can easily fix it. > Please confirm it and let me know the result. None of the above condition was the problem in my PPC64 box. I added __GFP_PAGECACHE flag in pagecache_alloc_cold() and grab_cache_page_nowait() routines and the reclaim seemed to work. 
--- linux-2.6.20-rc5.orig/include/linux/pagemap.h +++ linux-2.6.20-rc5/include/linux/pagemap.h @@ -62,12 +62,12 @@ static inline struct page *__page_cache_ static inline struct page *page_cache_alloc(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD|__GFP_PAGECACHE); } typedef int filler_t(void *, struct page *); [snip] --- linux-2.6.20-rc5.orig/mm/filemap.c +++ linux-2.6.20-rc5/mm/filemap.c @@ -823,7 +823,7 @@ grab_cache_page_nowait(struct address_sp page_cache_release(page); return NULL; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS | __GFP_PAGECACHE); if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; pagecache_alloc_cold() is used in the read-ahead path which was being called in my case of large file operations. --Vaidy Thanks to point it out. There is another patch on the LKML which I think is better. Checking the zone->max_pagecache in the get_page_from_freelist() is better than checking the watermark in zone_watermark_ok(). Let me know if it works for you. Thanks, -Aubrey - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: > On 1/19/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: >> Hi Aubrey, >> >> I used your patch on my PPC64 box and I do not get expected >> behavior. As you had requested, I am attaching zoneinfo and meminfo >> dumps: >> >> Please let me know if you need any further data to help me out with >> the test/experiment. >> > > Although I have no PPC64 box in hand, I think the logic should be the same. > get_page_from_freelist() is called 5 times in __alloc_pages(). > > 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE; > 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE; > We should have the same result on the first two times > get_page_from_freelist(). > > 3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) > && !in_interrupt()) >alloc_flags = ALLOC_NO_WATERMARKS > The case on my platform will never enter this branch. If the branch > occurs on your side, > The limit will be omitted. Because NO watermark, zone_watermark_ok() > will not be checked. memory will be allocated directly. > > 4)if (likely(did_some_progress)) { >alloc_flags should include ALLOC_PAGECACHE. > So we should have the same result on this call. > > 5)} else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { >alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE > > This branch will not hit on my case. You may need to check it. > > If 3) or 5) occurs on your platform, I think you can easily fix it. > Please confirm it and let me know the result. None of the above condition was the problem in my PPC64 box. I added __GFP_PAGECACHE flag in pagecache_alloc_cold() and grab_cache_page_nowait() routines and the reclaim seemed to work. 
--- linux-2.6.20-rc5.orig/include/linux/pagemap.h +++ linux-2.6.20-rc5/include/linux/pagemap.h @@ -62,12 +62,12 @@ static inline struct page *__page_cache_ static inline struct page *page_cache_alloc(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD|__GFP_PAGECACHE); } typedef int filler_t(void *, struct page *); [snip] --- linux-2.6.20-rc5.orig/mm/filemap.c +++ linux-2.6.20-rc5/mm/filemap.c @@ -823,7 +823,7 @@ grab_cache_page_nowait(struct address_sp page_cache_release(page); return NULL; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS); + page = __page_cache_alloc(mapping_gfp_mask(mapping) & ~__GFP_FS | __GFP_PAGECACHE); if (page && add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; pagecache_alloc_cold() is used in the read-ahead path which was being called in my case of large file operations. --Vaidy - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: On 1/19/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: Hi Aubrey, I used your patch on my PPC64 box and I do not get expected behavior. As you had requested, I am attaching zoneinfo and meminfo dumps: Please let me know if you need any further data to help me out with the test/experiment. Although I have no PPC64 box in hand, I think the logic should be the same. get_page_from_freelist() is called 5 times in __alloc_pages(). 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE; 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE; We should have the same result on the first two times get_page_from_freelist(). 3) if (((p-flags PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) !in_interrupt()) alloc_flags = ALLOC_NO_WATERMARKS The case on my platform will never enter this branch. If the branch occurs on your side, The limit will be omitted. Because NO watermark, zone_watermark_ok() will not be checked. memory will be allocated directly. 4)if (likely(did_some_progress)) { alloc_flags should include ALLOC_PAGECACHE. So we should have the same result on this call. 5)} else if ((gfp_mask __GFP_FS) !(gfp_mask __GFP_NORETRY)) { alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE This branch will not hit on my case. You may need to check it. If 3) or 5) occurs on your platform, I think you can easily fix it. Please confirm it and let me know the result. None of the above condition was the problem in my PPC64 box. I added __GFP_PAGECACHE flag in pagecache_alloc_cold() and grab_cache_page_nowait() routines and the reclaim seemed to work. 
--- linux-2.6.20-rc5.orig/include/linux/pagemap.h +++ linux-2.6.20-rc5/include/linux/pagemap.h @@ -62,12 +62,12 @@ static inline struct page *__page_cache_ static inline struct page *page_cache_alloc(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD|__GFP_PAGECACHE); } typedef int filler_t(void *, struct page *); [snip] --- linux-2.6.20-rc5.orig/mm/filemap.c +++ linux-2.6.20-rc5/mm/filemap.c @@ -823,7 +823,7 @@ grab_cache_page_nowait(struct address_sp page_cache_release(page); return NULL; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) ~__GFP_FS); + page = __page_cache_alloc(mapping_gfp_mask(mapping) ~__GFP_FS | __GFP_PAGECACHE); if (page add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; pagecache_alloc_cold() is used in the read-ahead path which was being called in my case of large file operations. --Vaidy - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/24/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: Aubrey Li wrote: On 1/19/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: Hi Aubrey, I used your patch on my PPC64 box and I do not get expected behavior. As you had requested, I am attaching zoneinfo and meminfo dumps: Please let me know if you need any further data to help me out with the test/experiment. Although I have no PPC64 box in hand, I think the logic should be the same. get_page_from_freelist() is called 5 times in __alloc_pages(). 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE; 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE; We should have the same result on the first two times get_page_from_freelist(). 3) if (((p-flags PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) !in_interrupt()) alloc_flags = ALLOC_NO_WATERMARKS The case on my platform will never enter this branch. If the branch occurs on your side, The limit will be omitted. Because NO watermark, zone_watermark_ok() will not be checked. memory will be allocated directly. 4)if (likely(did_some_progress)) { alloc_flags should include ALLOC_PAGECACHE. So we should have the same result on this call. 5)} else if ((gfp_mask __GFP_FS) !(gfp_mask __GFP_NORETRY)) { alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE This branch will not hit on my case. You may need to check it. If 3) or 5) occurs on your platform, I think you can easily fix it. Please confirm it and let me know the result. None of the above condition was the problem in my PPC64 box. I added __GFP_PAGECACHE flag in pagecache_alloc_cold() and grab_cache_page_nowait() routines and the reclaim seemed to work. 
--- linux-2.6.20-rc5.orig/include/linux/pagemap.h +++ linux-2.6.20-rc5/include/linux/pagemap.h @@ -62,12 +62,12 @@ static inline struct page *__page_cache_ static inline struct page *page_cache_alloc(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE); } static inline struct page *page_cache_alloc_cold(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_COLD|__GFP_PAGECACHE); } typedef int filler_t(void *, struct page *); [snip] --- linux-2.6.20-rc5.orig/mm/filemap.c +++ linux-2.6.20-rc5/mm/filemap.c @@ -823,7 +823,7 @@ grab_cache_page_nowait(struct address_sp page_cache_release(page); return NULL; } - page = __page_cache_alloc(mapping_gfp_mask(mapping) ~__GFP_FS); + page = __page_cache_alloc(mapping_gfp_mask(mapping) ~__GFP_FS | __GFP_PAGECACHE); if (page add_to_page_cache_lru(page, mapping, index, GFP_KERNEL)) { page_cache_release(page); page = NULL; pagecache_alloc_cold() is used in the read-ahead path which was being called in my case of large file operations. --Vaidy Thanks to point it out. There is another patch on the LKML which I think is better. Checking the zone-max_pagecache in the get_page_from_freelist() is better than checking the watermark in zone_watermark_ok(). Let me know if it works for you. Thanks, -Aubrey - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On Sat, 20 Jan 2007, Aubrey Li wrote: > assume: > min = 123pages > pagecache_reserved = 200 pages > > if( alloc_flags & ALLOC_PAGECACHE) >watermark = min + pagecache_reserved ( 323 pages) > else >watermark = min ( 123 pages) > > So if request pagecache, when free pages < 323 pages, reclaim is triggered. > But at this time if request memory not pagecache, reclaim will be > triggered when free pages < 123 as the present reclaimer does. > > I verified it on my side, why do you think it doesn't work properly? The code does not check the page cache size but the number of free pages. The page cache size is available via zone_page_state(zone, NR_FILE_PAGES). In its current form your patch is making the system reclaim earlier for page cache allocations. And its reclaiming regardless of the number of pages in the page cache. If there are no pagecache pages but only anonymous pages in the zone then the code will still reclaim although the page cache size is zero. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On Sat, 20 Jan 2007, Nick Piggin wrote: > > It doesn't reduce the amount of memory available to the system. It > > just reduce the amount of memory available to the page cache. So that > > page cache is limited and the reserved memory can be allocated by the > > application. > > But the patch doesn't do that, as I explained. The patch could do it if he would be checking NR_FILE_PAGES against a limit instead of the free pages. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On Sat, 20 Jan 2007, Nick Piggin wrote: It doesn't reduce the amount of memory available to the system. It just reduce the amount of memory available to the page cache. So that page cache is limited and the reserved memory can be allocated by the application. But the patch doesn't do that, as I explained. The patch could do it if he would be checking NR_FILE_PAGES against a limit instead of the free pages. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On Sat, 20 Jan 2007, Aubrey Li wrote: assume: min = 123pages pagecache_reserved = 200 pages if( alloc_flags ALLOC_PAGECACHE) watermark = min + pagecache_reserved ( 323 pages) else watermark = min ( 123 pages) So if request pagecache, when free pages 323 pages, reclaim is triggered. But at this time if request memory not pagecache, reclaim will be triggered when free pages 123 as the present reclaimer does. I verified it on my side, why do you think it doesn't work properly? The code does not check the page cache size but the number of free pages. The page cache size is available via zone_page_state(zone, NR_FILE_PAGES). In its current form your patch is making the system reclaim earlier for page cache allocations. And its reclaiming regardless of the number of pages in the page cache. If there are no pagecache pages but only anonymous pages in the zone then the code will still reclaim although the page cache size is zero. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/20/07, Nick Piggin <[EMAIL PROTECTED]> wrote: Aubrey Li wrote: > So what's the right way to limit pagecache? Probably something a lot more complicated... if you can say there is a "right way". >> Secondly, your patch isn't actually very good. It unconditionally >> shrinks memory to below the given % mark each time a pagecache alloc >> occurs, regardless of how much pagecache is in the system. Effectively >> that seems to just reduce the amount of memory available to the system. > > > It doesn't reduce the amount of memory available to the system. It > just reduce the amount of memory available to the page cache. So that > page cache is limited and the reserved memory can be allocated by the > application. But the patch doesn't do that, as I explained. I'm not sure you read the correct patch. Let me explain the logic again. assume: min = 123pages pagecache_reserved = 200 pages if( alloc_flags & ALLOC_PAGECACHE) watermark = min + pagecache_reserved ( 323 pages) else watermark = min ( 123 pages) So if request pagecache, when free pages < 323 pages, reclaim is triggered. But at this time if request memory not pagecache, reclaim will be triggered when free pages < 123 as the present reclaimer does. I verified it on my side, why do you think it doesn't work properly? >> Luckily, there are actually good, robust solutions for your higher >> order allocation problem. Do higher order allocations at boot time, >> modifiy userspace applications, or set up otherwise-unused, or easily >> reclaimable reserve pools for higher order allocations. I don't >> understand why you are so resistant to all of these approaches? >> > > I think we have explained the reason too much. We are working on > no-mmu arch and provide a platform running linux to our customer. They > are doing very good things like mplayer, asterisk, ip camera, etc on > our platform, some applications was migrated from mmu arch. I think > that means in some cases no-mmu arch is somewhat better than mmu arch. 
> So we are taking effort to make the migration smooth or make no-mmu > linux stronger. > It's no way to let our customer modify their applications, we also > unwilling to do it. And we have not an existing mechanism to set up a > pools for the complex applications. So I'm trying to do some coding > hack in the kernel to satisfy these kinds of requirement. Oh, maybe you misunderstand the reserve pools idea: that is an entirely kernel based solution where you can preallocate a large, contiguous pool of memory at boot time which you can use to satisfy your nommu higher order anonymous memory allocations. This is something that will not get fragmented by pagecache, nor will it get fragmented by any other page allocation, slab allocation. Tt is a pretty good solution provided that you size the pool correctly for your application's needs. So if application malloc(1M), how does kernel know to allocate reserved pool not from buddy system? I didn't see any special code about this. Is there any doc or example? -Aubrey - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: So what's the right way to limit pagecache? Probably something a lot more complicated... if you can say there is a "right way". Secondly, your patch isn't actually very good. It unconditionally shrinks memory to below the given % mark each time a pagecache alloc occurs, regardless of how much pagecache is in the system. Effectively that seems to just reduce the amount of memory available to the system. It doesn't reduce the amount of memory available to the system. It just reduce the amount of memory available to the page cache. So that page cache is limited and the reserved memory can be allocated by the application. But the patch doesn't do that, as I explained. Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? I think we have explained the reason too much. We are working on no-mmu arch and provide a platform running linux to our customer. They are doing very good things like mplayer, asterisk, ip camera, etc on our platform, some applications was migrated from mmu arch. I think that means in some cases no-mmu arch is somewhat better than mmu arch. So we are taking effort to make the migration smooth or make no-mmu linux stronger. It's no way to let our customer modify their applications, we also unwilling to do it. And we have not an existing mechanism to set up a pools for the complex applications. So I'm trying to do some coding hack in the kernel to satisfy these kinds of requirement. Oh, maybe you misunderstand the reserve pools idea: that is an entirely kernel based solution where you can preallocate a large, contiguous pool of memory at boot time which you can use to satisfy your nommu higher order anonymous memory allocations. 
This is something that will not get fragmented by pagecache, nor will it get fragmented by any other page allocation, slab allocation. Tt is a pretty good solution provided that you size the pool correctly for your application's needs. -- SUSE Labs, Novell Inc. Send instant messages to your online friends http://au.messenger.yahoo.com - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/19/07, Nick Piggin <[EMAIL PROTECTED]> wrote: Maybe, if you are talking about my advice to fix userspace... but you *are* going to contribute those changes back for the nommu community to use, right? So the end result of that is _not_ actually tweaking the end solutions. not quite sure what you're referring to here, but our approach is to contribute everything back in an acceptable form -mike - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/20/07, Nick Piggin <[EMAIL PROTECTED]> wrote: Aubrey Li wrote: > On 1/20/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: >> If pagecache is overlimit, we expect old (cold) pagecache pages to >> be thrown out and reused for new file data. We do not expect to >> drop a few text or data pages to make room for new pagecache. >> > Well, actually I think this probably not necessary. Because the > reclaimer has no way to predict the behavior of user mode processes, > how do you make sure the pagecache will not be access again in a short It is not about predicting behaviour, it is about directing the reclaim effort at the actual resource that is under pressure. Even given a pagecache limiting patch which does the proper accounting to keep pagecache pages under a % limit (unlike yours), kicking off an undirected reclaim could (in theory) reclaim all slab and anonymous memory pages before bringing pagecache under the limit. So I think you need to be a bit more thorough than just assuming everything will be OK. Page reclaim behaviour is pretty strange and complex. So what's the right way to limit pagecache? Secondly, your patch isn't actually very good. It unconditionally shrinks memory to below the given % mark each time a pagecache alloc occurs, regardless of how much pagecache is in the system. Effectively that seems to just reduce the amount of memory available to the system. It doesn't reduce the amount of memory available to the system. It just reduce the amount of memory available to the page cache. So that page cache is limited and the reserved memory can be allocated by the application. Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? 
I think we have explained the reason too much. We are working on no-mmu arch and provide a platform running linux to our customer. They are doing very good things like mplayer, asterisk, ip camera, etc on our platform, some applications was migrated from mmu arch. I think that means in some cases no-mmu arch is somewhat better than mmu arch. So we are taking effort to make the migration smooth or make no-mmu linux stronger. It's no way to let our customer modify their applications, we also unwilling to do it. And we have not an existing mechanism to set up a pools for the complex applications. So I'm trying to do some coding hack in the kernel to satisfy these kinds of requirement. And as you see, the patch seems to solve the problems on my side. But I'm not sure it's the right way to limit vfs cache, So I'm asking for comments and suggestions and help, I'm not asking to clobber the kernel. -Aubrey - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
Mike Frysinger wrote: On 1/19/07, Nick Piggin <[EMAIL PROTECTED]> wrote: Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? in a nutshell ... the idea is to try and generalize these things your approach involves tweaking each end solution to maximize the performance Maybe, if you are talking about my advice to fix userspace... but you *are* going to contribute those changes back for the nommu community to use, right? So the end result of that is _not_ actually tweaking the end solutions. But actually, if you take the reserved pool approach, then that will work fine, in-kernel, and it is something that already needs to be done for dynamic hugepage allocations which is almost exactly the same situation. And everybody can use this as well (I think most of the code is written already, but not merged). our approach is to teach the kernel some more tricks so that each solution need not be tweaked these are at obvious odds as they tackle the problem by going in pretty much opposite directions ... yours leads to a tighter system in the end, but ours leads to much more rapid development and deployment OK that's fair enough, but considering that it doesn't actually fix the problem properly; and that it does weird and wonderful things with our already fragile page reclaim path, then it is not a good idea to merge it upstream. -- SUSE Labs, Novell Inc. Send instant messages to your online friends http://au.messenger.yahoo.com - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/19/07, Nick Piggin <[EMAIL PROTECTED]> wrote: Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? in a nutshell ... the idea is to try and generalize these things your approach involves tweaking each end solution to maximize the performance our approach is to teach the kernel some more tricks so that each solution need not be tweaked these are at obvious odds as they tackle the problem by going in pretty much opposite directions ... yours leads to a tighter system in the end, but ours leads to much more rapid development and deployment -mike - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: On 1/20/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: If pagecache is overlimit, we expect old (cold) pagecache pages to be thrown out and reused for new file data. We do not expect to drop a few text or data pages to make room for new pagecache. Well, actually I think this probably not necessary. Because the reclaimer has no way to predict the behavior of user mode processes, how do you make sure the pagecache will not be access again in a short It is not about predicting behaviour, it is about directing the reclaim effort at the actual resource that is under pressure. Even given a pagecache limiting patch which does the proper accounting to keep pagecache pages under a % limit (unlike yours), kicking off an undirected reclaim could (in theory) reclaim all slab and anonymous memory pages before bringing pagecache under the limit. So I think you need to be a bit more thorough than just assuming everything will be OK. Page reclaim behaviour is pretty strange and complex. Secondly, your patch isn't actually very good. It unconditionally shrinks memory to below the given % mark each time a pagecache alloc occurs, regardless of how much pagecache is in the system. Effectively that seems to just reduce the amount of memory available to the system. Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? -- SUSE Labs, Novell Inc. Send instant messages to your online friends http://au.messenger.yahoo.com - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RFC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/20/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: Aubrey Li wrote: > On 1/19/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: >> >> Hi Aubrey, >> >> The idea of creating separate flag for pagecache in page_alloc is >> interesting. The good part is that you flag watermark low and the >> zone reclaimer will do the rest of the job. >> >> However when the zone reclaimer starts to reclaim pages, it will >> remove all cold pages and not specifically pagecache pages. This >> may affect performance of applications. >> >> One possible solution to this reclaim is to use scan control fields >> and ask the shrink_page_list() and shrink_active_list() routines to >> target only pagecache pages. Pagecache pages are not mapped and >> they are easy to find on the LRU list. >> >> Please review my patch at http://lkml.org/lkml/2007/01/17/96 >> > > So you mean the existing reclaimer has the same issue, doesn't it? Well, the existing reclaimer will do the right job if the kernel really runs out of memory and need to recover pages for new allocations. The pages to be removed will be the coldest page in the system. However now with the introduction of pagecache limit, we are artificially creating a memory scarcity and forcing the reclaimer to throw away some pages while we actually have free usable RAM. In this context the choice of pages picked by the present reclaimer may not be the best ones. If pagecache is overlimit, we expect old (cold) pagecache pages to be thrown out and reused for new file data. We do not expect to drop a few text or data pages to make room for new pagecache. Well, actually I think this probably not necessary. Because the reclaimer has no way to predict the behavior of user mode processes, how do you make sure the pagecache will not be access again in a short time? So I think the present reclaimer is suitable. Limit pagecache must affect performance of applications. The key is what do you want to get? 
In my case, I get more memory to allocate, less fragmentation, it can solve my problem, :) Now the problem in the idea of the patch is, when the vfs cache limit is hit, the reclaimer doesn't reclaim all of the reclaimable pages, it just gives a few out. So the next time a vfs pagecache request comes in, it is quite possible the reclaimer is triggered again. That's the point in my mind affecting the performance of the applications. I'll continue to work on this issue to see if I can make an improvement. -Aubrey - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On Sat, 20 Jan 2007, Vaidyanathan Srinivasan wrote: > >> However when the zone reclaimer starts to reclaim pages, it will > >> remove all cold pages and not specifically pagecache pages. This > >> may affect performance of applications. The reclaimer is passed a control structure that can be used to disable write to swap (if that is the concern). > I am open to suggestions on reclaim logic. My view is that we need > to selectively reclaim pagecache pages and not just call the > traditional reclaimer to freeup arbitrary type of pages. The traditional reclaim works fine if told what to do. Introducing another LRU list to do reclaim is a significant change to the VM, creates lots of overhead etc. - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: > On 1/19/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: >> >> Hi Aubrey, >> >> The idea of creating separate flag for pagecache in page_alloc is >> interesting. The good part is that you flag watermark low and the >> zone reclaimer will do the rest of the job. >> >> However when the zone reclaimer starts to reclaim pages, it will >> remove all cold pages and not specifically pagecache pages. This >> may affect performance of applications. >> >> One possible solution to this reclaim is to use scan control fields >> and ask the shrink_page_list() and shrink_active_list() routines to >> target only pagecache pages. Pagecache pages are not mapped and >> they are easy to find on the LRU list. >> >> Please review my patch at http://lkml.org/lkml/2007/01/17/96 >> > > So you mean the existing reclaimer has the same issue, doesn't it? Well, the existing reclaimer will do the right job if the kernel really runs out of memory and need to recover pages for new allocations. The pages to be removed will be the coldest page in the system. However now with the introduction of pagecache limit, we are artificially creating a memory scarcity and forcing the reclaimer to throw away some pages while we actually have free usable RAM. In this context the choice of pages picked by the present reclaimer may not be the best ones. If pagecache is overlimit, we expect old (cold) pagecache pages to be thrown out and reused for new file data. We do not expect to drop a few text or data pages to make room for new pagecache. > In your and Roy's patch, balance_pagecache() routine is called on file > backed access. > So you still want to add this checking? or change the current > reclaimer completely? The balance_pagecache() routine is called for file backed access since that is when we would probably exceed the pagecache limit. The routine check if the limit has exceeded and calls the reclaimer. 
The reclaimer is an extension of the present reclaimer with more checks to remove only pagecache pages and not try to unmap any mapped pages and potentially affect application performance. I am open to suggestions on reclaim logic. My view is that we need to selectively reclaim pagecache pages and not just call the traditional reclaimer to freeup arbitrary type of pages. --Vaidy > -Aubrey > - > To unsubscribe from this list: send the line "unsubscribe linux-kernel" in > the body of a message to [EMAIL PROTECTED] > More majordomo info at http://vger.kernel.org/majordomo-info.html > Please read the FAQ at http://www.tux.org/lkml/ > - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On Thu, 18 Jan 2007, Aubrey Li wrote: > +int sysctl_pagecache_ratio = 10; > + Pagecache ratio is the ratio of memory to be left over? Would it not be better to twist this around and to be able to specify how much of the memory of a node may be used by the pagecache? Why limit the size of the pagecache? Some kind of rationale would be useful. Maybe it was there in earlier incarnations of the patch that I did not see? It should be kept with it. zone_reclaim already dynamically limits the size of the pagecache. > + if (alloc_flags & ALLOC_PAGECACHE) > + min = min + (sysctl_pagecache_ratio * z->present_pages) / 100; The calculation of the multiplication / division is usually not done in the hot allocation path. See f.e. how min_unmapped_pages is handled in mm/page_alloc.c - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/19/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: Hi Aubrey, The idea of creating separate flag for pagecache in page_alloc is interesting. The good part is that you flag watermark low and the zone reclaimer will do the rest of the job. However when the zone reclaimer starts to reclaim pages, it will remove all cold pages and not specifically pagecache pages. This may affect performance of applications. One possible solution to this reclaim is to use scan control fields and ask the shrink_page_list() and shrink_active_list() routines to target only pagecache pages. Pagecache pages are not mapped and they are easy to find on the LRU list. Please review my patch at http://lkml.org/lkml/2007/01/17/96 So you mean the existing reclaimer has the same issue, doesn't it? In your and Roy's patch, balance_pagecache() routine is called on file backed access. So you still want to add this checking? or change the current reclaimer completely? -Aubrey - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/19/07, Vaidyanathan Srinivasan <[EMAIL PROTECTED]> wrote: Hi Aubrey, I used your patch on my PPC64 box and I do not get expected behavior. As you had requested, I am attaching zoneinfo and meminfo dumps: Please let me know if you need any further data to help me out with the test/experiment. Although I have no PPC64 box in hand, I think the logic should be the same. get_page_from_freelist() is called 5 times in __alloc_pages(). 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE; 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE; We should have the same result on the first two times get_page_from_freelist(). 3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) alloc_flags = ALLOC_NO_WATERMARKS The case on my platform will never enter this branch. If the branch occurs on your side, The limit will be omitted. Because NO watermark, zone_watermark_ok() will not be checked. memory will be allocated directly. 4)if (likely(did_some_progress)) { alloc_flags should include ALLOC_PAGECACHE. So we should have the same result on this call. 5) } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE This branch will not hit on my case. You may need to check it. If 3) or 5) occurs on your platform, I think you can easily fix it. Please confirm it and let me know the result. Thanks, -Aubrey - To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: > Here is the newest patch against 2.6.20-rc5. > == > From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001 > From: Aubrey.Li <[EMAIL PROTECTED]> > Date: Thu, 18 Jan 2007 11:08:31 +0800 > Subject: [PATCH] Add an interface to limit total vfs page cache. > The default percent is using 90% memory for page cache. > > Signed-off-by: Aubrey.Li <[EMAIL PROTECTED]> > --- > include/linux/gfp.h |1 + > include/linux/pagemap.h |2 +- > include/linux/sysctl.h |2 ++ > kernel/sysctl.c | 11 +++ > mm/page_alloc.c | 17 +++-- > 5 files changed, 30 insertions(+), 3 deletions(-) > > diff --git a/include/linux/gfp.h b/include/linux/gfp.h > index 00c314a..531360e 100644 > --- a/include/linux/gfp.h > +++ b/include/linux/gfp.h > @@ -46,6 +46,7 @@ struct vm_area_struct; > #define __GFP_NOMEMALLOC ((__force gfp_t)0x1u) /* Don't use > emergency reserves */ > #define __GFP_HARDWALL ((__force gfp_t)0x2u) /* Enforce > hardwall cpuset memory allocs */ > #define __GFP_THISNODE ((__force gfp_t)0x4u)/* No fallback, no > policies */ > +#define __GFP_PAGECACHE ((__force gfp_t)0x8u) /* Is page cache > allocation ? 
*/ > > #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ > #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) > diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h > index c3e255b..890bb23 100644 > --- a/include/linux/pagemap.h > +++ b/include/linux/pagemap.h > @@ -62,7 +62,7 @@ static inline struct page *__page_cache_ > > static inline struct page *page_cache_alloc(struct address_space *x) > { > - return __page_cache_alloc(mapping_gfp_mask(x)); > + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE); > } > > static inline struct page *page_cache_alloc_cold(struct address_space *x) > diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h > index 81480e6..d3c9174 100644 > --- a/include/linux/sysctl.h > +++ b/include/linux/sysctl.h > @@ -202,6 +202,7 @@ enum > VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ > VM_VDSO_ENABLED=34, /* map VDSO into new processes? */ > VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ > + VM_PAGECACHE_RATIO=36, /* percent of RAM to use as page cache */ > }; > > > @@ -955,6 +956,7 @@ extern ctl_handler sysctl_string; > extern ctl_handler sysctl_intvec; > extern ctl_handler sysctl_jiffies; > extern ctl_handler sysctl_ms_jiffies; > +extern int sysctl_pagecache_ratio; > > > /* > diff --git a/kernel/sysctl.c b/kernel/sysctl.c > index 600b333..92db115 100644 > --- a/kernel/sysctl.c > +++ b/kernel/sysctl.c > @@ -1035,6 +1035,17 @@ static ctl_table vm_table[] = { > .extra1 = , > }, > #endif > + { > + .ctl_name = VM_PAGECACHE_RATIO, > + .procname = "pagecache_ratio", > + .data = _pagecache_ratio, > + .maxlen = sizeof(sysctl_pagecache_ratio), > + .mode = 0644, > + .proc_handler = _dointvec_minmax, > + .strategy = _intvec, > + .extra1 = , > +.extra2 = _hundred, > + }, > { .ctl_name = 0 } > }; > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > index fc5b544..5802b39 100644 > --- a/mm/page_alloc.c > +++ b/mm/page_alloc.c > @@ -82,6 +82,8 @@ int 
sysctl_lowmem_reserve_ratio[MAX_NR_Z > #endif > }; > > +int sysctl_pagecache_ratio = 10; > + > EXPORT_SYMBOL(totalram_pages); > > static char * const zone_names[MAX_NR_ZONES] = { > @@ -895,6 +897,7 @@ failed: > #define ALLOC_HARDER 0x10 /* try to alloc harder */ > #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ > #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ > +#define ALLOC_PAGECACHE 0x80 /* __GFP_PAGECACHE set */ > > #ifdef CONFIG_FAIL_PAGE_ALLOC > > @@ -998,6 +1001,9 @@ int zone_watermark_ok(struct zone *z, in > if (alloc_flags & ALLOC_HARDER) > min -= min / 4; > > + if (alloc_flags & ALLOC_PAGECACHE) > + min = min + (sysctl_pagecache_ratio * z->present_pages) / 100; > + > if (free_pages <= min + z->lowmem_reserve[classzone_idx]) > return 0; Hi Aubrey, The idea of creating separate flag for pagecache in page_alloc is interesting. The good part is that you flag watermark low and the zone reclaimer will do the rest of the job. However when the zone reclaimer starts to reclaim pages, it will remove all cold pages and not specifically pagecache pages. This may affect performance of applications. One possible solution to this reclaim is to use scan control fields and ask the shrink_page_list() and shrink_active_list() routines to target only pagecache pages. Pagecache pages are not mapped and they are easy to find on the LRU
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: > Here is the newest patch against 2.6.20-rc5. > == > From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001 > From: Aubrey.Li <[EMAIL PROTECTED]> > Date: Thu, 18 Jan 2007 11:08:31 +0800 > Subject: [PATCH] Add an interface to limit total vfs page cache. > The default percent is using 90% memory for page cache. Hi Aubrey, I used your patch on my PPC64 box and I do not get expected behavior. As you had requested, I am attaching zoneinfo and meminfo dumps: # cat /proc/sys/vm/pagecache_ratio 50 # cat /proc/meminfo MemTotal: 1014600 kB << 1GB Ram MemFree:960336 kB << Expect to see around 500MB free after Buffers: 8348 kB issue of DD command Cached: 8624 kB SwapCached: 8 kB Active: 20908 kB Inactive: 5680 kB SwapTotal: 1526164 kB SwapFree: 1526088 kB Dirty: 116 kB Writeback: 0 kB AnonPages:9544 kB Mapped: 7736 kB Slab:18920 kB SReclaimable: 5792 kB SUnreclaim: 13128 kB PageTables:972 kB NFS_Unstable:0 kB Bounce: 0 kB CommitLimit: 2033464 kB Committed_AS:46652 kB VmallocTotal: 8589934592 kB VmallocUsed: 2440 kB VmallocChunk: 8589932152 kB HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 Hugepagesize:16384 kB # cat /proc/zoneinfo Node 0, zone DMA pages free 130474 min 571 low 713 high 856 active 5010 inactive 775 scanned 0 (a: 24 i: 0) spanned 147456 present 145440 nr_anon_pages 2383 nr_mapped1932 nr_file_pages 3389 nr_slab_reclaimable 1094 nr_slab_unreclaimable 1819 nr_page_table_pages 243 nr_dirty 4 nr_writeback 0 nr_unstable 0 nr_bounce0 nr_vmscan_write 34 numa_hit 1428389 numa_miss0 numa_foreign 1048457 numa_interleave 1511 numa_local 1428389 numa_other 0 protection: (0, 0) pagesets cpu: 0 pcp: 0 count: 77 high: 186 batch: 31 cpu: 0 pcp: 1 count: 3 high: 62 batch: 15 vm stats threshold: 16 cpu: 1 pcp: 0 count: 171 high: 186 batch: 31 cpu: 1 pcp: 1 count: 11 high: 62 batch: 15 vm stats threshold: 16 all_unreclaimable: 0 prev_priority: 12 start_pfn: 0 Node 1, zone DMA pages free 109610 min 444 low 555 high 666 active 217 inactive 
655 scanned 0 (a: 21 i: 0) spanned 114688 present 113120 nr_anon_pages 3 nr_mapped2 nr_file_pages 869 nr_slab_reclaimable 354 nr_slab_unreclaimable 1454 nr_page_table_pages 0 nr_dirty 0 nr_writeback 0 nr_unstable 0 nr_bounce0 nr_vmscan_write 0 numa_hit 2220 numa_miss1048457 numa_foreign 0 numa_interleave 1519 numa_local 0 numa_other 1050677 protection: (0, 0) pagesets all_unreclaimable: 0 prev_priority: 12 start_pfn: 147456 The test: Write 1GB file in /tmp # dd if=/dev/zero of=/tmp/foo bs=1M count=1024 1024+0 records in 1024+0 records out 1073741824 bytes (1.1 GB) copied, 15.2301 seconds, 70.5 MB/s Expect around 500MB to be retained as free after the run? # cat /proc/meminfo MemTotal: 1014600 kB MemFree: 14080 kB <<< Buffers: 11164 kB Cached: 924536 kB <<< Almost all memory is consumed by SwapCached: 8 kB pagecache Active: 27500 kB Inactive: 917740 kB SwapTotal: 1526164 kB SwapFree: 1526088 kB Dirty: 100528 kB Writeback: 0 kB AnonPages:9544 kB Mapped: 7736 kB Slab:45264 kB SReclaimable:29652 kB SUnreclaim: 15612 kB PageTables:972 kB NFS_Unstable:0 kB Bounce: 0 kB CommitLimit: 2033464 kB Committed_AS:47732 kB VmallocTotal: 8589934592 kB VmallocUsed: 2440 kB VmallocChunk: 8589932152 kB HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 Hugepagesize:16384 kB # cat /proc/zoneinfo Node 0, zone DMA pages free 2063 min 571 low 713 high 856 active 6028 inactive 124552 scanned 0 (a: 5 i: 0) spanned 147456 present 145440 nr_anon_pages 2384 nr_mapped1932 nr_file_pages 128191 nr_slab_reclaimable 4312 nr_slab_unreclaimable 2102 nr_page_table_pages 243 nr_dirty 13724 nr_writeback 0 nr_unstable 0 nr_bounce0 nr_vmscan_write 34 numa_hit 1577905 numa_miss0 numa_foreign 1173147 numa_interleave 1511 numa_local 1577905 numa_other 0
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: Here is the newest patch against 2.6.20-rc5. == From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001 From: Aubrey.Li [EMAIL PROTECTED] Date: Thu, 18 Jan 2007 11:08:31 +0800 Subject: [PATCH] Add an interface to limit total vfs page cache. The default percent is using 90% memory for page cache. Hi Aubrey, I used your patch on my PPC64 box and I do not get expected behavior. As you had requested, I am attaching zoneinfo and meminfo dumps: # cat /proc/sys/vm/pagecache_ratio 50 # cat /proc/meminfo MemTotal: 1014600 kB 1GB Ram MemFree:960336 kB Expect to see around 500MB free after Buffers: 8348 kB issue of DD command Cached: 8624 kB SwapCached: 8 kB Active: 20908 kB Inactive: 5680 kB SwapTotal: 1526164 kB SwapFree: 1526088 kB Dirty: 116 kB Writeback: 0 kB AnonPages:9544 kB Mapped: 7736 kB Slab:18920 kB SReclaimable: 5792 kB SUnreclaim: 13128 kB PageTables:972 kB NFS_Unstable:0 kB Bounce: 0 kB CommitLimit: 2033464 kB Committed_AS:46652 kB VmallocTotal: 8589934592 kB VmallocUsed: 2440 kB VmallocChunk: 8589932152 kB HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 Hugepagesize:16384 kB # cat /proc/zoneinfo Node 0, zone DMA pages free 130474 min 571 low 713 high 856 active 5010 inactive 775 scanned 0 (a: 24 i: 0) spanned 147456 present 145440 nr_anon_pages 2383 nr_mapped1932 nr_file_pages 3389 nr_slab_reclaimable 1094 nr_slab_unreclaimable 1819 nr_page_table_pages 243 nr_dirty 4 nr_writeback 0 nr_unstable 0 nr_bounce0 nr_vmscan_write 34 numa_hit 1428389 numa_miss0 numa_foreign 1048457 numa_interleave 1511 numa_local 1428389 numa_other 0 protection: (0, 0) pagesets cpu: 0 pcp: 0 count: 77 high: 186 batch: 31 cpu: 0 pcp: 1 count: 3 high: 62 batch: 15 vm stats threshold: 16 cpu: 1 pcp: 0 count: 171 high: 186 batch: 31 cpu: 1 pcp: 1 count: 11 high: 62 batch: 15 vm stats threshold: 16 all_unreclaimable: 0 prev_priority: 12 start_pfn: 0 Node 1, zone DMA pages free 109610 min 444 low 555 high 666 active 217 inactive 655 scanned 0 (a: 21 i: 
0) spanned 114688 present 113120 nr_anon_pages 3 nr_mapped2 nr_file_pages 869 nr_slab_reclaimable 354 nr_slab_unreclaimable 1454 nr_page_table_pages 0 nr_dirty 0 nr_writeback 0 nr_unstable 0 nr_bounce0 nr_vmscan_write 0 numa_hit 2220 numa_miss1048457 numa_foreign 0 numa_interleave 1519 numa_local 0 numa_other 1050677 protection: (0, 0) pagesets all_unreclaimable: 0 prev_priority: 12 start_pfn: 147456 The test: Write 1GB file in /tmp # dd if=/dev/zero of=/tmp/foo bs=1M count=1024 1024+0 records in 1024+0 records out 1073741824 bytes (1.1 GB) copied, 15.2301 seconds, 70.5 MB/s Expect around 500MB to be retained as free after the run? # cat /proc/meminfo MemTotal: 1014600 kB MemFree: 14080 kB Buffers: 11164 kB Cached: 924536 kB Almost all memory is consumed by SwapCached: 8 kB pagecache Active: 27500 kB Inactive: 917740 kB SwapTotal: 1526164 kB SwapFree: 1526088 kB Dirty: 100528 kB Writeback: 0 kB AnonPages:9544 kB Mapped: 7736 kB Slab:45264 kB SReclaimable:29652 kB SUnreclaim: 15612 kB PageTables:972 kB NFS_Unstable:0 kB Bounce: 0 kB CommitLimit: 2033464 kB Committed_AS:47732 kB VmallocTotal: 8589934592 kB VmallocUsed: 2440 kB VmallocChunk: 8589932152 kB HugePages_Total: 0 HugePages_Free: 0 HugePages_Rsvd: 0 Hugepagesize:16384 kB # cat /proc/zoneinfo Node 0, zone DMA pages free 2063 min 571 low 713 high 856 active 6028 inactive 124552 scanned 0 (a: 5 i: 0) spanned 147456 present 145440 nr_anon_pages 2384 nr_mapped1932 nr_file_pages 128191 nr_slab_reclaimable 4312 nr_slab_unreclaimable 2102 nr_page_table_pages 243 nr_dirty 13724 nr_writeback 0 nr_unstable 0 nr_bounce0 nr_vmscan_write 34 numa_hit 1577905 numa_miss0 numa_foreign 1173147 numa_interleave 1511 numa_local 1577905 numa_other 0 protection:
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: Here is the newest patch against 2.6.20-rc5. == From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001 From: Aubrey.Li [EMAIL PROTECTED] Date: Thu, 18 Jan 2007 11:08:31 +0800 Subject: [PATCH] Add an interface to limit total vfs page cache. The default percent is using 90% memory for page cache. Signed-off-by: Aubrey.Li [EMAIL PROTECTED] --- include/linux/gfp.h |1 + include/linux/pagemap.h |2 +- include/linux/sysctl.h |2 ++ kernel/sysctl.c | 11 +++ mm/page_alloc.c | 17 +++-- 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 00c314a..531360e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -46,6 +46,7 @@ struct vm_area_struct; #define __GFP_NOMEMALLOC ((__force gfp_t)0x1u) /* Don't use emergency reserves */ #define __GFP_HARDWALL ((__force gfp_t)0x2u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x4u)/* No fallback, no policies */ +#define __GFP_PAGECACHE ((__force gfp_t)0x8u) /* Is page cache allocation ? */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 __GFP_BITS_SHIFT) - 1)) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3e255b..890bb23 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -62,7 +62,7 @@ static inline struct page *__page_cache_ static inline struct page *page_cache_alloc(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE); } static inline struct page *page_cache_alloc_cold(struct address_space *x) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 81480e6..d3c9174 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -202,6 +202,7 @@ enum VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? 
*/ VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ + VM_PAGECACHE_RATIO=36, /* percent of RAM to use as page cache */ }; @@ -955,6 +956,7 @@ extern ctl_handler sysctl_string; extern ctl_handler sysctl_intvec; extern ctl_handler sysctl_jiffies; extern ctl_handler sysctl_ms_jiffies; +extern int sysctl_pagecache_ratio; /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 600b333..92db115 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1035,6 +1035,17 @@ static ctl_table vm_table[] = { .extra1 = zero, }, #endif + { + .ctl_name = VM_PAGECACHE_RATIO, + .procname = pagecache_ratio, + .data = sysctl_pagecache_ratio, + .maxlen = sizeof(sysctl_pagecache_ratio), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .strategy = sysctl_intvec, + .extra1 = zero, +.extra2 = one_hundred, + }, { .ctl_name = 0 } }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc5b544..5802b39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -82,6 +82,8 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z #endif }; +int sysctl_pagecache_ratio = 10; + EXPORT_SYMBOL(totalram_pages); static char * const zone_names[MAX_NR_ZONES] = { @@ -895,6 +897,7 @@ failed: #define ALLOC_HARDER 0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET 0x40 /* check for correct cpuset */ +#define ALLOC_PAGECACHE 0x80 /* __GFP_PAGECACHE set */ #ifdef CONFIG_FAIL_PAGE_ALLOC @@ -998,6 +1001,9 @@ int zone_watermark_ok(struct zone *z, in if (alloc_flags ALLOC_HARDER) min -= min / 4; + if (alloc_flags ALLOC_PAGECACHE) + min = min + (sysctl_pagecache_ratio * z-present_pages) / 100; + if (free_pages = min + z-lowmem_reserve[classzone_idx]) return 0; Hi Aubrey, The idea of creating separate flag for pagecache in page_alloc is interesting. The good part is that you flag watermark low and the zone reclaimer will do the rest of the job. However when the zone reclaimer starts to reclaim pages, it will remove all cold pages and not specifically pagecache pages. 
This may affect performance of applications. One possible solution to this reclaim is to use scan control fields and ask the shrink_page_list() and shrink_active_list() routines to target only pagecache pages. Pagecache pages are not mapped and they are easy to find on the LRU list. Please review my patch at http://lkml.org/lkml/2007/01/17/96 --Vaidy [snip] - To unsubscribe
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/19/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: Hi Aubrey, I used your patch on my PPC64 box and I do not get expected behavior. As you had requested, I am attaching zoneinfo and meminfo dumps: Please let me know if you need any further data to help me out with the test/experiment. Although I have no PPC64 box in hand, I think the logic should be the same. get_page_from_freelist() is called 5 times in __alloc_pages(). 1) alloc_flags = ALLOC_WMARK_LOW | ALLOC_PAGECACHE; 2) alloc_flags = ALLOC_WMARK_MIN | ALLOC_PAGECACHE; We should have the same result on the first two times get_page_from_freelist(). 3) if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE))) && !in_interrupt()) alloc_flags = ALLOC_NO_WATERMARKS The case on my platform will never enter this branch. If the branch occurs on your side, The limit will be omitted. Because NO watermark, zone_watermark_ok() will not be checked. memory will be allocated directly. 4)if (likely(did_some_progress)) { alloc_flags should include ALLOC_PAGECACHE. So we should have the same result on this call. 5) } else if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) { alloc_flags = ALLOC_WMARK_HIGH, without ALLOC_PAGECACHE This branch will not hit on my case. You may need to check it. If 3) or 5) occurs on your platform, I think you can easily fix it. Please confirm it and let me know the result. Thanks, -Aubrey - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/19/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: Hi Aubrey, The idea of creating separate flag for pagecache in page_alloc is interesting. The good part is that you flag watermark low and the zone reclaimer will do the rest of the job. However when the zone reclaimer starts to reclaim pages, it will remove all cold pages and not specifically pagecache pages. This may affect performance of applications. One possible solution to this reclaim is to use scan control fields and ask the shrink_page_list() and shrink_active_list() routines to target only pagecache pages. Pagecache pages are not mapped and they are easy to find on the LRU list. Please review my patch at http://lkml.org/lkml/2007/01/17/96 So you mean the existing reclaimer has the same issue, doesn't it? In your and Roy's patch, balance_pagecache() routine is called on file backed access. So you still want to add this checking? or change the current reclaimer completely? -Aubrey - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On Thu, 18 Jan 2007, Aubrey Li wrote: +int sysctl_pagecache_ratio = 10; + Pagecache ratio is the ratio of memory to be left over? Would it not be better to twist this around and to be able to specify how much of the memory of a node may be used by the pagecache? Why limit the size of the pagecache? Some kind of rationale would be useful. Maybe it was there in earlier incarnations of the patch that I did not see? It should be kept with it. zone_reclaim already dynamically limits the size of the pagecache. + if (alloc_flags & ALLOC_PAGECACHE) + min = min + (sysctl_pagecache_ratio * z->present_pages) / 100; The calculation of the multiplication / division is usually not done in the hot allocation path. See f.e. how min_unmapped_pages is handled in mm/page_alloc.c - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: On 1/19/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: Hi Aubrey, The idea of creating separate flag for pagecache in page_alloc is interesting. The good part is that you flag watermark low and the zone reclaimer will do the rest of the job. However when the zone reclaimer starts to reclaim pages, it will remove all cold pages and not specifically pagecache pages. This may affect performance of applications. One possible solution to this reclaim is to use scan control fields and ask the shrink_page_list() and shrink_active_list() routines to target only pagecache pages. Pagecache pages are not mapped and they are easy to find on the LRU list. Please review my patch at http://lkml.org/lkml/2007/01/17/96 So you mean the existing reclaimer has the same issue, doesn't it? Well, the existing reclaimer will do the right job if the kernel really runs out of memory and need to recover pages for new allocations. The pages to be removed will be the coldest page in the system. However now with the introduction of pagecache limit, we are artificially creating a memory scarcity and forcing the reclaimer to throw away some pages while we actually have free usable RAM. In this context the choice of pages picked by the present reclaimer may not be the best ones. If pagecache is overlimit, we expect old (cold) pagecache pages to be thrown out and reused for new file data. We do not expect to drop a few text or data pages to make room for new pagecache. In your and Roy's patch, balance_pagecache() routine is called on file backed access. So you still want to add this checking? or change the current reclaimer completely? The balance_pagecache() routine is called for file backed access since that is when we would probably exceed the pagecache limit. The routine check if the limit has exceeded and calls the reclaimer. 
The reclaimer is an extension of the present reclaimer with more checks to remove only pagecache pages and not try to unmap any mapped pages and potentially affect application performance. I am open to suggestions on reclaim logic. My view is that we need to selectively reclaim pagecache pages and not just call the traditional reclaimer to freeup arbitrary type of pages. --Vaidy -Aubrey - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/ - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On Sat, 20 Jan 2007, Vaidyanathan Srinivasan wrote: However when the zone reclaimer starts to reclaim pages, it will remove all cold pages and not specifically pagecache pages. This may affect performance of applications. The reclaimer is passed a control structure that can be used to disable write to swap (if that is the concern). I am open to suggestions on reclaim logic. My view is that we need to selectively reclaim pagecache pages and not just call the traditional reclaimer to freeup arbitrary type of pages. The traditional reclaim works fine if told what to do. Introducing another LRU list to do reclaim is a significant change to the VM, creates lots of overhead etc. - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/20/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: Aubrey Li wrote: On 1/19/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: Hi Aubrey, The idea of creating separate flag for pagecache in page_alloc is interesting. The good part is that you flag watermark low and the zone reclaimer will do the rest of the job. However when the zone reclaimer starts to reclaim pages, it will remove all cold pages and not specifically pagecache pages. This may affect performance of applications. One possible solution to this reclaim is to use scan control fields and ask the shrink_page_list() and shrink_active_list() routines to target only pagecache pages. Pagecache pages are not mapped and they are easy to find on the LRU list. Please review my patch at http://lkml.org/lkml/2007/01/17/96 So you mean the existing reclaimer has the same issue, doesn't it? Well, the existing reclaimer will do the right job if the kernel really runs out of memory and need to recover pages for new allocations. The pages to be removed will be the coldest page in the system. However now with the introduction of pagecache limit, we are artificially creating a memory scarcity and forcing the reclaimer to throw away some pages while we actually have free usable RAM. In this context the choice of pages picked by the present reclaimer may not be the best ones. If pagecache is overlimit, we expect old (cold) pagecache pages to be thrown out and reused for new file data. We do not expect to drop a few text or data pages to make room for new pagecache. Well, actually I think this probably not necessary. Because the reclaimer has no way to predict the behavior of user mode processes, how do you make sure the pagecache will not be access again in a short time? So I think the present reclaimer is suitable. Limit pagecache must affect performance of applications. The key is what do you want to get? 
In my case, I get more memory to allocate, less fragmentation, it can solve my problem, :) Now the problem in the idea of the patch is, when vfs cache limit is hit, reclaimer doesn't reclaim all of the reclaimable pages, it just give few out. So next time vfs pagecache request, it is quite possible reclaimer is triggered again. That's the point in my mind affecting the performance of the applications. I'll continue to work on this issue to see if I can make a improvement. -Aubrey - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: On 1/20/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: If pagecache is overlimit, we expect old (cold) pagecache pages to be thrown out and reused for new file data. We do not expect to drop a few text or data pages to make room for new pagecache. Well, actually I think this probably not necessary. Because the reclaimer has no way to predict the behavior of user mode processes, how do you make sure the pagecache will not be access again in a short It is not about predicting behaviour, it is about directing the reclaim effort at the actual resource that is under pressure. Even given a pagecache limiting patch which does the proper accounting to keep pagecache pages under a % limit (unlike yours), kicking off an undirected reclaim could (in theory) reclaim all slab and anonymous memory pages before bringing pagecache under the limit. So I think you need to be a bit more thorough than just assuming everything will be OK. Page reclaim behaviour is pretty strange and complex. Secondly, your patch isn't actually very good. It unconditionally shrinks memory to below the given % mark each time a pagecache alloc occurs, regardless of how much pagecache is in the system. Effectively that seems to just reduce the amount of memory available to the system. Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? -- SUSE Labs, Novell Inc. Send instant messages to your online friends http://au.messenger.yahoo.com - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/19/07, Nick Piggin [EMAIL PROTECTED] wrote: Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? in a nutshell ... the idea is to try and generalize these things your approach involves tweaking each end solution to maximize the performance our approach is to teach the kernel some more tricks so that each solution need not be tweaked these are at obvious odds as they tackle the problem by going in pretty much opposite directions ... yours leads to a tighter system in the end, but ours leads to much more rapid development and deployment -mike - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Mike Frysinger wrote: On 1/19/07, Nick Piggin [EMAIL PROTECTED] wrote: Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? in a nutshell ... the idea is to try and generalize these things your approach involves tweaking each end solution to maximize the performance Maybe, if you are talking about my advice to fix userspace... but you *are* going to contribute those changes back for the nommu community to use, right? So the end result of that is _not_ actually tweaking the end solutions. But actually, if you take the reserved pool approach, then that will work fine, in-kernel, and it is something that already needs to be done for dynamic hugepage allocations which is almost exactly the same situation. And everybody can use this as well (I think most of the code is written already, but not merged). our approach is to teach the kernel some more tricks so that each solution need not be tweaked these are at obvious odds as they tackle the problem by going in pretty much opposite directions ... yours leads to a tighter system in the end, but ours leads to much more rapid development and deployment OK that's fair enough, but considering that it doesn't actually fix the problem properly; and that it does weird and wonderful things with our already fragile page reclaim path, then it is not a good idea to merge it upstream. -- SUSE Labs, Novell Inc. Send instant messages to your online friends http://au.messenger.yahoo.com - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/20/07, Nick Piggin [EMAIL PROTECTED] wrote: Aubrey Li wrote: On 1/20/07, Vaidyanathan Srinivasan [EMAIL PROTECTED] wrote: If pagecache is overlimit, we expect old (cold) pagecache pages to be thrown out and reused for new file data. We do not expect to drop a few text or data pages to make room for new pagecache. Well, actually I think this probably not necessary. Because the reclaimer has no way to predict the behavior of user mode processes, how do you make sure the pagecache will not be access again in a short It is not about predicting behaviour, it is about directing the reclaim effort at the actual resource that is under pressure. Even given a pagecache limiting patch which does the proper accounting to keep pagecache pages under a % limit (unlike yours), kicking off an undirected reclaim could (in theory) reclaim all slab and anonymous memory pages before bringing pagecache under the limit. So I think you need to be a bit more thorough than just assuming everything will be OK. Page reclaim behaviour is pretty strange and complex. So what's the right way to limit pagecache? Secondly, your patch isn't actually very good. It unconditionally shrinks memory to below the given % mark each time a pagecache alloc occurs, regardless of how much pagecache is in the system. Effectively that seems to just reduce the amount of memory available to the system. It doesn't reduce the amount of memory available to the system. It just reduce the amount of memory available to the page cache. So that page cache is limited and the reserved memory can be allocated by the application. Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? I think we have explained the reason too much. 
We are working on no-mmu arch and provide a platform running linux to our customer. They are doing very good things like mplayer, asterisk, ip camera, etc on our platform, some applications was migrated from mmu arch. I think that means in some cases no-mmu arch is somewhat better than mmu arch. So we are taking effort to make the migration smooth or make no-mmu linux stronger. It's no way to let our customer modify their applications, we also unwilling to do it. And we have not an existing mechanism to set up a pools for the complex applications. So I'm trying to do some coding hack in the kernel to satisfy these kinds of requirement. And as you see, the patch seems to solve the problems on my side. But I'm not sure it's the right way to limit vfs cache, So I'm asking for comments and suggestions and help, I'm not asking to clobber the kernel. -Aubrey - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/19/07, Nick Piggin [EMAIL PROTECTED] wrote: Maybe, if you are talking about my advice to fix userspace... but you *are* going to contribute those changes back for the nommu community to use, right? So the end result of that is _not_ actually tweaking the end solutions. not quite sure what you're referring to here, but our approach is to contribute everything back in an acceptable form -mike - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Aubrey Li wrote: So what's the right way to limit pagecache? Probably something a lot more complicated... if you can say there is a right way. Secondly, your patch isn't actually very good. It unconditionally shrinks memory to below the given % mark each time a pagecache alloc occurs, regardless of how much pagecache is in the system. Effectively that seems to just reduce the amount of memory available to the system. It doesn't reduce the amount of memory available to the system. It just reduce the amount of memory available to the page cache. So that page cache is limited and the reserved memory can be allocated by the application. But the patch doesn't do that, as I explained. Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? I think we have explained the reason too much. We are working on no-mmu arch and provide a platform running linux to our customer. They are doing very good things like mplayer, asterisk, ip camera, etc on our platform, some applications was migrated from mmu arch. I think that means in some cases no-mmu arch is somewhat better than mmu arch. So we are taking effort to make the migration smooth or make no-mmu linux stronger. It's no way to let our customer modify their applications, we also unwilling to do it. And we have not an existing mechanism to set up a pools for the complex applications. So I'm trying to do some coding hack in the kernel to satisfy these kinds of requirement. Oh, maybe you misunderstand the reserve pools idea: that is an entirely kernel based solution where you can preallocate a large, contiguous pool of memory at boot time which you can use to satisfy your nommu higher order anonymous memory allocations. 
This is something that will not get fragmented by pagecache, nor will it get fragmented by any other page allocation, slab allocation. It is a pretty good solution provided that you size the pool correctly for your application's needs. -- SUSE Labs, Novell Inc. Send instant messages to your online friends http://au.messenger.yahoo.com - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
Re: [RPC][PATCH 2.6.20-rc5] limit total vfs page cache
On 1/20/07, Nick Piggin [EMAIL PROTECTED] wrote: Aubrey Li wrote: So what's the right way to limit pagecache? Probably something a lot more complicated... if you can say there is a right way. Secondly, your patch isn't actually very good. It unconditionally shrinks memory to below the given % mark each time a pagecache alloc occurs, regardless of how much pagecache is in the system. Effectively that seems to just reduce the amount of memory available to the system. It doesn't reduce the amount of memory available to the system. It just reduce the amount of memory available to the page cache. So that page cache is limited and the reserved memory can be allocated by the application. But the patch doesn't do that, as I explained. I'm not sure you read the correct patch. Let me explain the logic again. assume: min = 123 pages pagecache_reserved = 200 pages if (alloc_flags & ALLOC_PAGECACHE) watermark = min + pagecache_reserved (= 323 pages) else watermark = min (= 123 pages) So if request pagecache, when free pages < 323 pages, reclaim is triggered. But at this time if request memory not pagecache, reclaim will be triggered when free pages < 123 as the present reclaimer does. I verified it on my side, why do you think it doesn't work properly? Luckily, there are actually good, robust solutions for your higher order allocation problem. Do higher order allocations at boot time, modifiy userspace applications, or set up otherwise-unused, or easily reclaimable reserve pools for higher order allocations. I don't understand why you are so resistant to all of these approaches? I think we have explained the reason too much. We are working on no-mmu arch and provide a platform running linux to our customer. They are doing very good things like mplayer, asterisk, ip camera, etc on our platform, some applications was migrated from mmu arch. I think that means in some cases no-mmu arch is somewhat better than mmu arch. 
So we are taking effort to make the migration smooth or make no-mmu linux stronger. It's no way to let our customer modify their applications, we also unwilling to do it. And we have not an existing mechanism to set up a pools for the complex applications. So I'm trying to do some coding hack in the kernel to satisfy these kinds of requirement. Oh, maybe you misunderstand the reserve pools idea: that is an entirely kernel based solution where you can preallocate a large, contiguous pool of memory at boot time which you can use to satisfy your nommu higher order anonymous memory allocations. This is something that will not get fragmented by pagecache, nor will it get fragmented by any other page allocation, slab allocation. It is a pretty good solution provided that you size the pool correctly for your application's needs. So if application malloc(1M), how does kernel know to allocate reserved pool not from buddy system? I didn't see any special code about this. Is there any doc or example? -Aubrey - To unsubscribe from this list: send the line unsubscribe linux-kernel in the body of a message to [EMAIL PROTECTED] More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/
[RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Here is the newest patch against 2.6.20-rc5. == From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001 From: Aubrey.Li <[EMAIL PROTECTED]> Date: Thu, 18 Jan 2007 11:08:31 +0800 Subject: [PATCH] Add an interface to limit total vfs page cache. The default percent is using 90% memory for page cache. Signed-off-by: Aubrey.Li <[EMAIL PROTECTED]> --- include/linux/gfp.h |1 + include/linux/pagemap.h |2 +- include/linux/sysctl.h |2 ++ kernel/sysctl.c | 11 +++ mm/page_alloc.c | 17 +++-- 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 00c314a..531360e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -46,6 +46,7 @@ struct vm_area_struct; #define __GFP_NOMEMALLOC ((__force gfp_t)0x1u) /* Don't use emergency reserves */ #define __GFP_HARDWALL ((__force gfp_t)0x2u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x4u)/* No fallback, no policies */ +#define __GFP_PAGECACHE((__force gfp_t)0x8u) /* Is page cache allocation ? */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3e255b..890bb23 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -62,7 +62,7 @@ static inline struct page *__page_cache_ static inline struct page *page_cache_alloc(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE); } static inline struct page *page_cache_alloc_cold(struct address_space *x) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 81480e6..d3c9174 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -202,6 +202,7 @@ enum VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? 
*/ VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ + VM_PAGECACHE_RATIO=36, /* percent of RAM to use as page cache */ }; @@ -955,6 +956,7 @@ extern ctl_handler sysctl_string; extern ctl_handler sysctl_intvec; extern ctl_handler sysctl_jiffies; extern ctl_handler sysctl_ms_jiffies; +extern int sysctl_pagecache_ratio; /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 600b333..92db115 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1035,6 +1035,17 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif + { + .ctl_name = VM_PAGECACHE_RATIO, + .procname = "pagecache_ratio", + .data = &sysctl_pagecache_ratio, + .maxlen = sizeof(sysctl_pagecache_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, +.extra2 = &one_hundred, + }, { .ctl_name = 0 } }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc5b544..5802b39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -82,6 +82,8 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z #endif }; +int sysctl_pagecache_ratio = 10; + EXPORT_SYMBOL(totalram_pages); static char * const zone_names[MAX_NR_ZONES] = { @@ -895,6 +897,7 @@ failed: #define ALLOC_HARDER0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET0x40 /* check for correct cpuset */ +#define ALLOC_PAGECACHE0x80 /* __GFP_PAGECACHE set */ #ifdef CONFIG_FAIL_PAGE_ALLOC @@ -998,6 +1001,9 @@ int zone_watermark_ok(struct zone *z, in if (alloc_flags & ALLOC_HARDER) min -= min / 4; + if (alloc_flags & ALLOC_PAGECACHE) + min = min + (sysctl_pagecache_ratio * z->present_pages) / 100; + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) return 0; for (o = 0; o < order; o++) { @@ -1236,8 +1242,12 @@ restart: return NULL; } - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (gfp_mask & __GFP_PAGECACHE) + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, 
ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_PAGECACHE); + else + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -1273,6 +1283,9 @@ restart: if (wait) alloc_flags |= ALLOC_CPUSET; + if (gfp_mask & __GFP_PAGECACHE) +
[RPC][PATCH 2.6.20-rc5] limit total vfs page cache
Here is the newest patch against 2.6.20-rc5. == From ad9ca9a32bdcaddce9988afbf0187bfd04685a0c Mon Sep 17 00:00:00 2001 From: Aubrey.Li <[EMAIL PROTECTED]> Date: Thu, 18 Jan 2007 11:08:31 +0800 Subject: [PATCH] Add an interface to limit total vfs page cache. The default percent is using 90% memory for page cache. Signed-off-by: Aubrey.Li <[EMAIL PROTECTED]> --- include/linux/gfp.h |1 + include/linux/pagemap.h |2 +- include/linux/sysctl.h |2 ++ kernel/sysctl.c | 11 +++ mm/page_alloc.c | 17 +++-- 5 files changed, 30 insertions(+), 3 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 00c314a..531360e 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -46,6 +46,7 @@ struct vm_area_struct; #define __GFP_NOMEMALLOC ((__force gfp_t)0x1u) /* Don't use emergency reserves */ #define __GFP_HARDWALL ((__force gfp_t)0x2u) /* Enforce hardwall cpuset memory allocs */ #define __GFP_THISNODE ((__force gfp_t)0x4u)/* No fallback, no policies */ +#define __GFP_PAGECACHE((__force gfp_t)0x8u) /* Is page cache allocation ? */ #define __GFP_BITS_SHIFT 20 /* Room for 20 __GFP_FOO bits */ #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index c3e255b..890bb23 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -62,7 +62,7 @@ static inline struct page *__page_cache_ static inline struct page *page_cache_alloc(struct address_space *x) { - return __page_cache_alloc(mapping_gfp_mask(x)); + return __page_cache_alloc(mapping_gfp_mask(x)|__GFP_PAGECACHE); } static inline struct page *page_cache_alloc_cold(struct address_space *x) diff --git a/include/linux/sysctl.h b/include/linux/sysctl.h index 81480e6..d3c9174 100644 --- a/include/linux/sysctl.h +++ b/include/linux/sysctl.h @@ -202,6 +202,7 @@ enum VM_PANIC_ON_OOM=33, /* panic at out-of-memory */ VM_VDSO_ENABLED=34, /* map VDSO into new processes? 
*/ VM_MIN_SLAB=35, /* Percent pages ignored by zone reclaim */ + VM_PAGECACHE_RATIO=36, /* percent of RAM to use as page cache */ }; @@ -955,6 +956,7 @@ extern ctl_handler sysctl_string; extern ctl_handler sysctl_intvec; extern ctl_handler sysctl_jiffies; extern ctl_handler sysctl_ms_jiffies; +extern int sysctl_pagecache_ratio; /* diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 600b333..92db115 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -1035,6 +1035,17 @@ static ctl_table vm_table[] = { .extra1 = &zero, }, #endif + { + .ctl_name = VM_PAGECACHE_RATIO, + .procname = "pagecache_ratio", + .data = &sysctl_pagecache_ratio, + .maxlen = sizeof(sysctl_pagecache_ratio), + .mode = 0644, + .proc_handler = &proc_dointvec_minmax, + .strategy = &sysctl_intvec, + .extra1 = &zero, +.extra2 = &one_hundred, + }, { .ctl_name = 0 } }; diff --git a/mm/page_alloc.c b/mm/page_alloc.c index fc5b544..5802b39 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -82,6 +82,8 @@ int sysctl_lowmem_reserve_ratio[MAX_NR_Z #endif }; +int sysctl_pagecache_ratio = 10; + EXPORT_SYMBOL(totalram_pages); static char * const zone_names[MAX_NR_ZONES] = { @@ -895,6 +897,7 @@ failed: #define ALLOC_HARDER0x10 /* try to alloc harder */ #define ALLOC_HIGH 0x20 /* __GFP_HIGH set */ #define ALLOC_CPUSET0x40 /* check for correct cpuset */ +#define ALLOC_PAGECACHE0x80 /* __GFP_PAGECACHE set */ #ifdef CONFIG_FAIL_PAGE_ALLOC @@ -998,6 +1001,9 @@ int zone_watermark_ok(struct zone *z, in if (alloc_flags & ALLOC_HARDER) min -= min / 4; + if (alloc_flags & ALLOC_PAGECACHE) + min = min + (sysctl_pagecache_ratio * z->present_pages) / 100; + if (free_pages <= min + z->lowmem_reserve[classzone_idx]) return 0; for (o = 0; o < order; o++) { @@ -1236,8 +1242,12 @@ restart: return NULL; } - page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, - zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); + if (gfp_mask & __GFP_PAGECACHE) + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, 
ALLOC_WMARK_LOW|ALLOC_CPUSET|ALLOC_PAGECACHE); + else + page = get_page_from_freelist(gfp_mask|__GFP_HARDWALL, order, + zonelist, ALLOC_WMARK_LOW|ALLOC_CPUSET); if (page) goto got_pg; @@ -1273,6 +1283,9 @@ restart: if (wait) alloc_flags |= ALLOC_CPUSET; + if (gfp_mask