Dear Ben,

> Please read Documentation/SubmittingPatches, use scripts/checkpatch.pl
> and try to provide a patch that is suitable for upstream inclusion.
> Also, your name belongs in the patch header, not in the code.
I changed the proposed patch accordingly; scripts/checkpatch.pl now produces
just a few warnings. I have had this patch in use for a while now, so I
believe it is suitably tested. Please let me know if I need to do anything
else.

Cheers, Paul

Paul Szabo   p...@maths.usyd.edu.au   http://www.maths.usyd.edu.au/u/psz/
School of Mathematics and Statistics   University of Sydney    Australia
Avoid OOM when filesystem caches fill lowmem and are not reclaimed, by
doing a drop_caches at that point. The issue is easily reproducible on
machines with over 32GB RAM; in my testing the patch reliably protects
against the OOM. The added call to drop_caches has been observed to
trigger "needlessly", but on quite rare occasions only.

Also included are several minor fixes:

- Comment about highmem_is_dirtyable, which seems to be used only to
  calculate limits and thresholds, not in any decisions.
- In determine_dirtyable_memory(), subtract min_free_kbytes from the
  returned value. I believe this is "right": min_free_kbytes is not
  intended for dirty pages.
- In bdi_position_ratio(), get the difference (setpoint - dirty) right
  even when it is negative, which happens often. Normally these numbers
  are "small", and even with the left-shift I never observed a 32-bit
  overflow. I believe it should be possible to re-write the whole
  function in 32-bit ints; maybe it is not worth the effort to make it
  "efficient", and seeing how this function was always wrong and we
  survived anyway, perhaps it should simply be removed. (A standalone
  demonstration of the wraparound appears after the patch.)
- Comment in bdi_max_pause() that it seems to always return a too-small
  value; maybe it should simply return a fixed value.
- Comment in balance_dirty_pages() about a test marked unlikely() which
  I observe to be quite common.
- Comment in __alloc_pages_slowpath() about did_some_progress being set
  twice but only checked after the second setting, so the first setting
  is lost and wasted.
- Comment in zone_reclaimable() that it maybe should return true with
  non-zero NR_SLAB_RECLAIMABLE.
- Comment about all_unreclaimable, which may be set wrongly.
- Comments in global_reclaimable_pages() and zone_reclaimable_pages()
  about maybe adding or including NR_SLAB_RECLAIMABLE.

Reported-by: Paul Szabo <p...@maths.usyd.edu.au>
Reference: http://bugs.debian.org/695182
Signed-off-by: Paul Szabo <p...@maths.usyd.edu.au>

---

--- fs/drop_caches.c.old	2012-10-17 13:50:15.000000000 +1100
+++ fs/drop_caches.c	2013-01-04 21:52:47.000000000 +1100
@@ -65,3 +65,10 @@ int drop_caches_sysctl_handler(ctl_table
 	}
 	return 0;
 }
+
+/* Easy call: do "echo 3 > /proc/sys/vm/drop_caches" */
+void easy_drop_caches(void)
+{
+	iterate_supers(drop_pagecache_sb, NULL);
+	drop_slab();
+}
--- mm/page-writeback.c.old	2012-10-17 13:50:15.000000000 +1100
+++ mm/page-writeback.c	2013-01-06 21:54:59.000000000 +1100
@@ -39,7 +39,8 @@
 /*
  * Sleep at most 200ms at a time in balance_dirty_pages().
  */
-#define MAX_PAUSE	max(HZ/5, 1)
+/* Might as well be max(HZ/5, 4) to ensure max_pause/4 > 0 always */
+#define MAX_PAUSE	max(HZ/5, 4)
 
 /*
  * Estimate write bandwidth at 200ms intervals.
@@ -343,11 +344,26 @@ static unsigned long highmem_dirtyable_m
 unsigned long determine_dirtyable_memory(void)
 {
 	unsigned long x;
+	int y = 0;
+	extern int min_free_kbytes;
 
 	x = global_page_state(NR_FREE_PAGES) + global_reclaimable_pages();
 
+	/*
+	 * Seems that highmem_is_dirtyable is only used here, in the
+	 * calculation of limits and thresholds of dirtiness, not in deciding
+	 * where to put dirty things. Is that so? Is that as should be?
+	 * What is the recommended setting of highmem_is_dirtyable?
+	 */
 	if (!vm_highmem_is_dirtyable)
 		x -= highmem_dirtyable_memory(x);
 
+	/* Subtract min_free_kbytes, converted from kilobytes to pages */
+	if (min_free_kbytes > 0)
+		y = min_free_kbytes >> (PAGE_SHIFT - 10);
+	if (x > y)
+		x -= y;
+	else
+		x = 0;
 	return x + 1;	/* Ensure that we never return 0 */
 }
@@ -541,6 +557,9 @@ static unsigned long bdi_position_ratio(
 	if (unlikely(dirty >= limit))
 		return 0;
 
+	/* Never seen this happen, just sanity-check paranoia */
+	if (unlikely(freerun >= limit))
+		return 16 << RATELIMIT_CALC_SHIFT;
 	/*
 	 * global setpoint
@@ -559,7 +578,7 @@
 	 * => fast response on large errors; small oscillation near setpoint
 	 */
 	setpoint = (freerun + limit) / 2;
-	x = div_s64((setpoint - dirty) << RATELIMIT_CALC_SHIFT,
+	x = div_s64(((s64)setpoint - (s64)dirty) << RATELIMIT_CALC_SHIFT,
 		    limit - setpoint + 1);
 	pos_ratio = x;
 	pos_ratio = pos_ratio * x >> RATELIMIT_CALC_SHIFT;
@@ -995,6 +1014,13 @@ static unsigned long bdi_max_pause(struc
 	 * The pause time will be settled within range (max_pause/4, max_pause).
 	 * Apply a minimal value of 4 to get a non-zero max_pause/4.
 	 */
+	/*
+	 * On a large machine it seems we always return 4; on a smaller
+	 * desktop machine we mostly return 5 (rarely 9 or 14).
+	 * Are those too small? Should we return something fixed, e.g.
+	 *	return (HZ/10);
+	 * instead of this wasted/useless calculation?
+	 */
 	return clamp_val(t, 4, MAX_PAUSE);
 }
@@ -1109,6 +1135,11 @@ static void balance_dirty_pages(struct a
 		}
 		pause = HZ * pages_dirtied / task_ratelimit;
 		if (unlikely(pause <= 0)) {
+			/*
+			 * Not unlikely: often we get zero, and on a large
+			 * machine it seems we always get 0.
+			 * Should we not do a pause of 1 here?
+			 */
 			trace_balance_dirty_pages(bdi,
 						  dirty_thresh,
 						  background_thresh,
--- mm/page_alloc.c.old	2012-10-17 13:50:15.000000000 +1100
+++ mm/page_alloc.c	2013-01-05 17:21:41.000000000 +1100
@@ -2207,6 +2207,10 @@ rebalance:
 	 * If we failed to make any progress reclaiming, then we are
 	 * running out of options and have to consider going OOM
 	 */
+	/*
+	 * did_some_progress may have been set twice, but it is only checked
+	 * here, so the first setting is lost and wasted. Is that as should be?
+	 */
 	if (!did_some_progress) {
 		if ((gfp_mask & __GFP_FS) && !(gfp_mask & __GFP_NORETRY)) {
 			if (oom_killer_disabled)
--- mm/vmscan.c.old	2012-10-17 13:50:15.000000000 +1100
+++ mm/vmscan.c	2013-01-06 09:50:49.000000000 +1100
@@ -213,6 +213,8 @@ static inline int do_shrinker_shrink(str
 /*
  * Call the shrink functions to age shrinkable caches
  *
+ * These comments seem to be about filesystem caches, though slabs may be
+ * used elsewhere also.
  * Here we assume it costs one seek to replace a lru page and that it also
  * takes a seek to recreate a cache object. With this in mind we age equal
  * percentages of the lru and ageable caches. This should balance the seeks
@@ -2244,6 +2246,9 @@ static bool shrink_zones(int priority, s
 
 static bool zone_reclaimable(struct zone *zone)
 {
+	/* Should we return true with non-zero NR_SLAB_RECLAIMABLE ? */
+	/* if (zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 0) return true; */
+	/* Wonder about the "correctness" of that *6 factor. */
 	return zone->pages_scanned < zone_reclaimable_pages(zone) * 6;
 }
@@ -2726,9 +2731,87 @@ loop_again:
 			nr_slab = shrink_slab(&shrink, sc.nr_scanned, lru_pages);
 			sc.nr_reclaimed += reclaim_state->reclaimed_slab;
 			total_scanned += sc.nr_scanned;
+			if (unlikely(
+			    i == 1 &&
+			    nr_slab < 10 &&
+			    (reclaim_state->reclaimed_slab) < 10 &&
+			    zone_page_state(zone, NR_SLAB_RECLAIMABLE) > 10 &&
+			    !zone_watermark_ok_safe(zone, order,
+				high_wmark_pages(zone), end_zone, 0))) {
+				/*
+				 * We are stressed (desperate): better to
+				 * drop file caches than to suffer an OOM
+				 * episode (where you need to press the
+				 * reset button); see
+				 * http://bugs.debian.org/695182
+				 *
+				 * Do something like
+				 *   echo 3 > /proc/sys/vm/drop_caches
+				 * See in fs/drop_caches.c:
+				 *	void easy_drop_caches(void)
+				 *	{
+				 *		iterate_supers(drop_pagecache_sb, NULL);
+				 *		drop_slab();
+				 *	}
+				 *
+				 * All slabs (other than filesystem) may
+				 * already be cleared, or may be cleared
+				 * now: is that fair or efficient?
+				 *
+				 * I noticed this issue on machines with
+				 * over 32GB RAM, but do not see anything
+				 * specific to large memory in the code.
+				 * I wonder if the issue might affect
+				 * machines with smaller memory, or with
+				 * a 64-bit build.
+				 *
+				 * Need to drop_pagecache; drop_slab alone
+				 * is ineffective, which is probably why
+				 * our shrink_slab above did not succeed.
+				 *
+				 * Should we drop_slab(), or do loops of
+				 * shrink_slab and keep counts as above?
+				 * Seems that reclaimed_slab is kept
+				 * automatically.
+				 *
+				 * I wonder why these slabs or caches are
+				 * in lowmem; should they not have been
+				 * allocated in highmem initially, instead?
+				 */
+				extern void easy_drop_caches(void);
+				reclaim_state->reclaimed_slab = 0;
+				printk(KERN_WARNING "drop_caches with zone=%d nr_slab=%lu reclaimed_slab=%lu RECLAIMABLE=%lu FREE=%lu\n",
+				       i, nr_slab, reclaim_state->reclaimed_slab,
+				       zone_page_state(zone, NR_SLAB_RECLAIMABLE),
+				       zone_page_state(zone, NR_FREE_PAGES));
+				easy_drop_caches();
+				printk(KERN_WARNING "after drop_caches reclaimed_slab=%lu RECLAIMABLE=%lu FREE=%lu\n",
+				       reclaim_state->reclaimed_slab,
+				       zone_page_state(zone, NR_SLAB_RECLAIMABLE),
+				       zone_page_state(zone, NR_FREE_PAGES));
+				if (reclaim_state->reclaimed_slab > 0) {
+					sc.nr_reclaimed += reclaim_state->reclaimed_slab;
+					if (nr_slab == 0)
+						nr_slab = 1;
+				}
+			}
 			if (nr_slab == 0 && !zone_reclaimable(zone))
 				zone->all_unreclaimable = 1;
+			/*
+			 * Beware of all_unreclaimable: we set it when
+			 * - shrink_slab() returns 0, which may happen
+			 *   because of temporary failure or because of
+			 *   some internal restrictions, and
+			 * - zone_reclaimable() returns false, which may
+			 *   happen though NR_SLAB_RECLAIMABLE is non-zero,
+			 * so it may be set "wrong" or prematurely.
+			 * And then we do not unset all_unreclaimable
+			 * until some page is freed (in page_alloc.c).
+			 */
 		}
 
 		/*
@@ -3055,6 +3138,7 @@ unsigned long global_reclaimable_pages(v
 {
 	int nr;
 
+	/* Should we add/include global_page_state(NR_SLAB_RECLAIMABLE) ? */
 	nr = global_page_state(NR_ACTIVE_FILE) +
 	     global_page_state(NR_INACTIVE_FILE);
 
@@ -3069,6 +3153,7 @@ unsigned long zone_reclaimable_pages(str
 {
 	int nr;
 
+	/* Should we add/include zone_page_state(zone, NR_SLAB_RECLAIMABLE) ? */
 	nr = zone_page_state(zone, NR_ACTIVE_FILE) +
 	     zone_page_state(zone, NR_INACTIVE_FILE);
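
For anyone who wants to see the arithmetic in isolation, here is a small
standalone userspace sketch (illustration only, not part of the patch; all
values are invented). It shows the unsigned wraparound that the s64 cast in
bdi_position_ratio() fixes, and the kilobytes-to-pages conversion used for
min_free_kbytes in determine_dirtyable_memory():

	#include <stdio.h>
	#include <stdint.h>

	#define RATELIMIT_CALC_SHIFT	10
	#define PAGE_SHIFT		12	/* assume 4 KiB pages */

	int main(void)
	{
		/* Invented values with dirty above the setpoint, as happens often. */
		unsigned long setpoint = 1000, dirty = 1500;

		/* Old expression: the unsigned subtraction wraps around first. */
		unsigned long wrapped = (setpoint - dirty) << RATELIMIT_CALC_SHIFT;

		/*
		 * Patched expression: cast to signed 64-bit before subtracting.
		 * (Written as a multiply here; the kernel shifts the s64 left.)
		 */
		int64_t diff = ((int64_t)setpoint - (int64_t)dirty)
				* (1 << RATELIMIT_CALC_SHIFT);

		printf("unsigned: %lu\n", wrapped);		/* huge positive value */
		printf("signed:   %lld\n", (long long)diff);	/* -512000, as intended */

		/* min_free_kbytes is in kilobytes; >> (PAGE_SHIFT - 10) gives pages. */
		int min_free_kbytes = 65536;			/* say, 64 MiB reserved */
		printf("pages:    %d\n", min_free_kbytes >> (PAGE_SHIFT - 10));
		return 0;
	}

Compile with e.g. "gcc demo.c" and run: the first line is the absurdly large
"difference" the old expression produced whenever dirty exceeded the setpoint,
which is why the div_s64() result was wrong so often.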