Dave,
It would be helpful if you could Cc: ckrm-tech on future
resource-management-related patches.
Generic comments (might already be in your list):
- Nicely sized patch.
- We need both a min and a max to do effective resource management.
  Having a limit is good, but not sufficient.
- What happens to accounting when a task is moved from one "group"
  to another?
- What happens when a "group" is removed?
- How do the limit and the hierarchy play together?
- What about shared pages?
Comments specific to this code:
- In the reclamation path, walking the whole page list in search of
  pages belonging to a naughty "group" will be costly.
- With the logic of (forcibly) limiting a "group" at allocation time,
  how will we find naughty "groups" during reclamation?
See the inline comments below.
regards,
chandra
On Tue, 2006-08-15 at 12:20 -0700, [EMAIL PROTECTED] wrote:
<snip>
> #endif /* _LINUX_CPUSET_H */
> diff -puN include/linux/gfp.h~challenged-memory-controller include/linux/gfp.h
> --- lxc/include/linux/gfp.h~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
> +++ lxc-dave/include/linux/gfp.h 2006-08-15 07:47:34.000000000 -0700
> @@ -108,10 +108,6 @@ static inline enum zone_type gfp_zone(gf
> * optimized to &contig_page_data at compile-time.
> */
>
> -#ifndef HAVE_ARCH_FREE_PAGE
> -static inline void arch_free_page(struct page *page, int order) { }
> -#endif
> -
Not a good idea. What happens on arches that _have_ an
arch_free_page()?
Maybe you can define a function that calls arch_free_page() and also
does what you want done.

Thinking about it more... arch_free_page() may not be the right place
to credit the "group" for pages being freed, since the page may be
freed immediately after arch_free_page(). IMO, a better place would be
some lower-level function like free_pages_bulk().
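A rough sketch of the wrapper idea (hypothetical helper name, untested):

	/*
	 * Keep the per-arch hook intact and do the accounting next to
	 * it, instead of redefining arch_free_page().
	 */
	static inline void free_page_account(struct page *page, int order)
	{
		arch_free_page(page, order);
		cpuset_dec_nr_pages(page->cpuset, 1 << order);
	}
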
> extern struct page *
> FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
>
> diff -puN include/linux/mm.h~challenged-memory-controller include/linux/mm.h
> diff -puN include/linux/sched.h~challenged-memory-controller include/linux/sched.h
> diff -puN include/linux/swap.h~challenged-memory-controller include/linux/swap.h
> --- lxc/include/linux/swap.h~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
> +++ lxc-dave/include/linux/swap.h 2006-08-15 07:47:34.000000000 -0700
> @@ -188,7 +188,7 @@ extern void swap_setup(void);
>
> /* linux/mm/vmscan.c */
> extern unsigned long try_to_free_pages(struct zone **, gfp_t);
> -extern unsigned long shrink_all_memory(unsigned long nr_pages);
> +extern unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs);
IMO shrink_all_memory() may not be the right function, because it also
tries to free up slab etc.; a better place might be shrink_zones() or
shrink_all_zones().
> extern int vm_swappiness;
> extern int remove_mapping(struct address_space *mapping, struct page *page);
> extern long vm_total_pages;
> diff -puN kernel/cpuset.c~challenged-memory-controller kernel/cpuset.c
> --- lxc/kernel/cpuset.c~challenged-memory-controller	2006-08-14 13:22:12.000000000 -0700
> +++ lxc-dave/kernel/cpuset.c 2006-08-15 08:00:40.000000000 -0700
> @@ -21,6 +21,7 @@
> #include <linux/cpu.h>
> #include <linux/cpumask.h>
> #include <linux/cpuset.h>
> +#include <linux/delay.h>
> #include <linux/err.h>
> #include <linux/errno.h>
> #include <linux/file.h>
> @@ -46,6 +47,7 @@
> #include <linux/spinlock.h>
> #include <linux/stat.h>
> #include <linux/string.h>
> +#include <linux/swap.h>
> #include <linux/time.h>
> #include <linux/backing-dev.h>
> #include <linux/sort.h>
> @@ -97,6 +99,8 @@ struct cpuset {
> * recent time this cpuset changed its mems_allowed.
> */
> int mems_generation;
> + int mems_nr_pages;
> + int mems_max_pages;
>
> struct fmeter fmeter; /* memory_pressure filter */
> };
> @@ -112,6 +116,55 @@ typedef enum {
> CS_SPREAD_SLAB,
> } cpuset_flagbits_t;
>
> +int shrink_cpuset(struct cpuset *cs, gfp_t gfpmask, int tries)
gfpmask is unused.
> +{
> + int nr_shrunk = 0;
> + while (cpuset_amount_over_memory_max(cs)) {
> + if (tries-- < 0)
> + break;
> + nr_shrunk += shrink_all_memory(10, cs);
What is the purpose of nr_shrunk? It is accumulated but never used.
> + }
> + return 0;
> +}
Since it always returns 0, why can't it be a void function?
> +
> +int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask)
> +{
> + int ret;
> + if (!cs)
> + return 0;
> + cs->mems_nr_pages += nr;
mems_nr_pages is not decremented in the failure case (the -ENOMEM
returns below). Better to increment it only in the success case.
> + if (cpuset_amount_over_memory_max(cs)) {
> + if (!(gfpmask & __GFP_WAIT))
> + return -ENOMEM;
This would fail allocations made in interrupt context (no __GFP_WAIT),
which may not be good.
> + ret = shrink_cpuset(cs, gfpmask, 50);
> + }
> + if (cpuset_amount_over_memory_max(cs))
> + return -ENOMEM;
> + return 0;
> +}
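Roughly what I have in mind (untested sketch, keeping the patch's
-1 == unlimited convention):

	int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask)
	{
		if (!cs)
			return 0;
		if (cs->mems_max_pages >= 0 &&
		    cs->mems_nr_pages + nr > cs->mems_max_pages) {
			if (!(gfpmask & __GFP_WAIT)) {
				/*
				 * Don't fail atomic/interrupt-context
				 * allocations; charge them anyway and let
				 * later reclaim pull the "group" back.
				 */
				cs->mems_nr_pages += nr;
				return 0;
			}
			shrink_cpuset(cs, gfpmask, 50);
			if (cs->mems_nr_pages + nr > cs->mems_max_pages)
				return -ENOMEM;
		}
		/* charge only on success, so the failure path needs no undo */
		cs->mems_nr_pages += nr;
		return 0;
	}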
> +void cpuset_dec_nr_pages(struct cpuset *cs, int nr)
> +{
> + if (!cs)
> + return;
> + cs->mems_nr_pages -= nr;
> +}
> +int cpuset_get_nr_pages(const struct cpuset *cs)
> +{
> + return cs->mems_nr_pages;
> +}
> +int cpuset_amount_over_memory_max(const struct cpuset *cs)
> +{
> + int amount;
> +
> + if (!cs || cs->mems_max_pages < 0)
> + return 0;
> + amount = cs->mems_nr_pages - cs->mems_max_pages;
> + if (amount < 0)
> + amount = 0;
> + return amount;
> +}
None of the callers use the exact amount; they all treat the return
value as a boolean. If there is no need for the exact number, can we
simply return (cs->mems_nr_pages > cs->mems_max_pages)?
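i.e., something like (sketch; hypothetical name, boolean semantics):

	int cpuset_over_memory_max(const struct cpuset *cs)
	{
		if (!cs || cs->mems_max_pages < 0)
			return 0;
		return cs->mems_nr_pages > cs->mems_max_pages;
	}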
> +
> +
> /* convenient tests for these bits */
> static inline int is_cpu_exclusive(const struct cpuset *cs)
> {
> @@ -173,6 +226,8 @@ static struct cpuset top_cpuset = {
> .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
> .cpus_allowed = CPU_MASK_ALL,
> .mems_allowed = NODE_MASK_ALL,
> + .mems_nr_pages = 0,
> + .mems_max_pages = -1,
Instead of having these as int with -1 meaning "no limit", why not
use unsigned long and ULONG_MAX?
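i.e. (sketch):

	unsigned long mems_nr_pages;
	unsigned long mems_max_pages;	/* ULONG_MAX == unlimited */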
> .count = ATOMIC_INIT(0),
> .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
> .children = LIST_HEAD_INIT(top_cpuset.children),
> @@ -1021,6 +1076,17 @@ static int update_memory_pressure_enable
> return 0;
> }
>
> +static int update_memory_max_nr_pages(struct cpuset *cs, char *buf)
> +{
> + int rate = simple_strtol(buf, NULL, 10);
> + int shrunk;
> + int loopnr = 0;
Unused variables: loopnr is never used, and shrunk is assigned but
never read.
> + cs->mems_max_pages = rate;
> + while (cpuset_amount_over_memory_max(cs))
> + shrunk = shrink_cpuset(cs, 0, 10);
> + return 0;
> +}
> +
> /*
> * update_flag - read a 0 or a 1 in a file and update associated flag
> * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
> @@ -1109,6 +1175,7 @@ static int update_flag(cpuset_flagbits_t
> */
>
> #define FM_COEF 933 /* coefficient for half-life of 10 secs */
> +#define FM_COEF 93 /* coefficient for half-life of 10 secs */
What is the purpose of this change? Redefining FM_COEF without an
#undef will trigger a redefinition warning; it looks like leftover
debugging.
> #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
> #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
> #define FM_SCALE 1000 /* faux fixed point scale */
> @@ -1263,6 +1330,8 @@ typedef enum {
> FILE_NOTIFY_ON_RELEASE,
> FILE_MEMORY_PRESSURE_ENABLED,
> FILE_MEMORY_PRESSURE,
> + FILE_MEMORY_MAX,
> + FILE_MEMORY_USED,
> FILE_SPREAD_PAGE,
> FILE_SPREAD_SLAB,
> FILE_TASKLIST,
> @@ -1321,6 +1390,9 @@ static ssize_t cpuset_common_file_write(
> case FILE_MEMORY_PRESSURE_ENABLED:
> retval = update_memory_pressure_enabled(cs, buffer);
> break;
> + case FILE_MEMORY_MAX:
> + retval = update_memory_max_nr_pages(cs, buffer);
> + break;
> case FILE_MEMORY_PRESSURE:
> retval = -EACCES;
> break;
> @@ -1441,6 +1513,12 @@ static ssize_t cpuset_common_file_read(s
> case FILE_MEMORY_PRESSURE:
> s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
> break;
> + case FILE_MEMORY_MAX:
> + s += sprintf(s, "%d", cs->mems_max_pages);
> + break;
> + case FILE_MEMORY_USED:
> + s += sprintf(s, "%d", cs->mems_nr_pages);
> + break;
> case FILE_SPREAD_PAGE:
> *s++ = is_spread_page(cs) ? '1' : '0';
> break;
> @@ -1785,6 +1863,16 @@ static struct cftype cft_cpu_exclusive =
> .private = FILE_CPU_EXCLUSIVE,
> };
>
> +static struct cftype cft_mem_used = {
> + .name = "memory_nr_pages",
> + .private = FILE_MEMORY_USED,
> +};
> +
> +static struct cftype cft_mem_max = {
> + .name = "memory_max_pages",
> + .private = FILE_MEMORY_MAX,
> +};
> +
> static struct cftype cft_mem_exclusive = {
> .name = "mem_exclusive",
> .private = FILE_MEM_EXCLUSIVE,
> @@ -1830,6 +1918,10 @@ static int cpuset_populate_dir(struct de
> return err;
> if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
> return err;
> + if ((err = cpuset_add_file(cs_dentry, &cft_mem_max)) < 0)
> + return err;
> + if ((err = cpuset_add_file(cs_dentry, &cft_mem_used)) < 0)
> + return err;
> if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
> return err;
> if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
> @@ -1880,6 +1972,8 @@ static long cpuset_create(struct cpuset
> INIT_LIST_HEAD(&cs->sibling);
> INIT_LIST_HEAD(&cs->children);
> cs->mems_generation = cpuset_mems_generation++;
> + cs->mems_max_pages = parent->mems_max_pages;
IMO this is not intuitive. Having the same default value (unlimited)
for all "groups" sounds more intuitive than inheriting the parent's
limit.
> + cs->mems_nr_pages = 0;
> fmeter_init(&cs->fmeter);
>
> cs->parent = parent;
> @@ -1986,6 +2080,8 @@ int __init cpuset_init_early(void)
>
> tsk->cpuset = &top_cpuset;
> tsk->cpuset->mems_generation = cpuset_mems_generation++;
> + tsk->cpuset->mems_max_pages = -1;
> + tsk->cpuset->mems_nr_pages = 0;
> return 0;
> }
>
> @@ -2005,6 +2101,8 @@ int __init cpuset_init(void)
>
> fmeter_init(&top_cpuset.fmeter);
> top_cpuset.mems_generation = cpuset_mems_generation++;
> + top_cpuset.mems_max_pages = -1;
> + top_cpuset.mems_nr_pages = 0;
>
> init_task.cpuset = &top_cpuset;
>
> @@ -2438,7 +2536,6 @@ int cpuset_memory_pressure_enabled __rea
> void __cpuset_memory_pressure_bump(void)
> {
> struct cpuset *cs;
> -
> task_lock(current);
> cs = current->cpuset;
> fmeter_markevent(&cs->fmeter);
> diff -puN mm/page_alloc.c~challenged-memory-controller mm/page_alloc.c
> --- lxc/mm/page_alloc.c~challenged-memory-controller	2006-08-14 13:24:16.000000000 -0700
> +++ lxc-dave/mm/page_alloc.c 2006-08-15 07:57:13.000000000 -0700
> @@ -470,6 +470,11 @@ static void free_one_page(struct zone *z
> free_pages_bulk(zone, 1, &list, order);
> }
>
> +void arch_free_page(struct page *page, int order)
> +{
> + cpuset_dec_nr_pages(page->cpuset, 1<<order);
> +}
> +
> static void __free_pages_ok(struct page *page, unsigned int order)
> {
> unsigned long flags;
> @@ -1020,6 +1025,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
>
> might_sleep_if(wait);
>
> + if (cpuset_inc_nr_pages(current->cpuset, 1<<order, gfp_mask))
> + return NULL;
> +
As pointed out above, interrupt-context allocations may need to be
handled differently here (see the cpuset_inc_nr_pages() sketch
earlier).
> restart:
> z = zonelist->zones; /* the list of zones suitable for gfp_mask */
>
> @@ -1159,6 +1167,10 @@ got_pg:
> if (page)
> set_page_owner(page, order, gfp_mask);
> #endif
> + if (page)
> + page->cpuset = current->cpuset;
This caches a pointer to the cpuset without incrementing the reference
count of current->cpuset, so the cpuset could go away while the page
still points at it.
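A sketch of what I mean, assuming cs->count is the right reference
count to take here:

	if (page) {
		atomic_inc(&current->cpuset->count);
		page->cpuset = current->cpuset;
	} else
		cpuset_dec_nr_pages(current->cpuset, 1 << order);

(with a matching atomic_dec() when the page is freed and page->cpuset
is dropped).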
> + else
> + cpuset_dec_nr_pages(current->cpuset, 1<<order);
> return page;
> }
>
> diff -puN mm/rmap.c~challenged-memory-controller mm/rmap.c
> --- lxc/mm/rmap.c~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
> +++ lxc-dave/mm/rmap.c 2006-08-15 08:01:26.000000000 -0700
> @@ -927,3 +927,8 @@ int try_to_unmap(struct page *page, int
> return ret;
> }
>
> +extern int cpuset_amount_over_memory_max(const struct cpuset *cs);
> +int page_has_naughty_cpuset(struct page *page)
> +{
> + return cpuset_amount_over_memory_max(page->cpuset);
> +}
Why not have this function in vmscan.c itself? It could be a static
inline there. BTW, is this function necessary at all?
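i.e. (sketch):

	/* in mm/vmscan.c */
	static inline int page_has_naughty_cpuset(struct page *page)
	{
		return cpuset_amount_over_memory_max(page->cpuset);
	}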
> diff -puN mm/vmscan.c~challenged-memory-controller mm/vmscan.c
> --- lxc/mm/vmscan.c~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
> +++ lxc-dave/mm/vmscan.c 2006-08-15 08:05:03.000000000 -0700
> @@ -63,8 +63,9 @@ struct scan_control {
> int swap_cluster_max;
>
> int swappiness;
> -
> int all_unreclaimable;
> + int only_pages_with_naughty_cpusets;
> + struct cpuset *only_this_cpuset;
> };
>
> #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
> @@ -445,6 +446,10 @@ static unsigned long shrink_page_list(st
>
> VM_BUG_ON(PageActive(page));
>
> + if (cpuset_amount_over_memory_max(sc->only_this_cpuset) &&
> + page->cpuset && page->cpuset != sc->only_this_cpuset) {
> + goto keep_locked;
> + }
Since sc->only_this_cpuset stays constant while page is what changes
in this loop, the cpuset_amount_over_memory_max() check is
loop-invariant; a small rearrangement (hoisting it out of the per-page
loop) would benefit performance.
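A sketch of the hoisted form (over_max is a hypothetical local):

	int over_max = cpuset_amount_over_memory_max(sc->only_this_cpuset);
	...
		if (over_max && page->cpuset &&
		    page->cpuset != sc->only_this_cpuset)
			goto keep_locked;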
> sc->nr_scanned++;
>
> if (!sc->may_swap && page_mapped(page))
> @@ -793,9 +798,20 @@ force_reclaim_mapped:
> spin_unlock_irq(&zone->lru_lock);
>
> while (!list_empty(&l_hold)) {
> + extern int page_has_naughty_cpuset(struct page *page);
> cond_resched();
> page = lru_to_page(&l_hold);
> list_del(&page->lru);
> + if (sc->only_this_cpuset &&
> + page->cpuset && page->cpuset != sc->only_this_cpuset) {
> + list_add(&page->lru, &l_active);
> + continue;
> + }
> + if (sc->only_pages_with_naughty_cpusets &&
> + !page_has_naughty_cpuset(page)) {
> + list_add(&page->lru, &l_active);
> + continue;
> + }
> if (page_mapped(page)) {
> if (!reclaim_mapped ||
> (total_swap_pages == 0 && PageAnon(page)) ||
> @@ -875,6 +891,7 @@ static unsigned long shrink_zone(int pri
> unsigned long nr_inactive;
> unsigned long nr_to_scan;
> unsigned long nr_reclaimed = 0;
> + int nr_scans = 0;
>
> atomic_inc(&zone->reclaim_in_progress);
>
> @@ -897,6 +914,11 @@ static unsigned long shrink_zone(int pri
> nr_inactive = 0;
>
> while (nr_active || nr_inactive) {
> + nr_scans++;
> + if (printk_ratelimit())
> + printk("%s() scan nr: %d\n", __func__, nr_scans);
> + if (nr_scans > 20)
> + sc->only_pages_with_naughty_cpusets = 0;
> if (nr_active) {
> nr_to_scan = min(nr_active,
> (unsigned long)sc->swap_cluster_max);
> @@ -993,6 +1015,7 @@ unsigned long try_to_free_pages(struct z
> .swap_cluster_max = SWAP_CLUSTER_MAX,
> .may_swap = 1,
> .swappiness = vm_swappiness,
> + .only_pages_with_naughty_cpusets = 1,
> };
>
> delay_swap_prefetch();
> @@ -1090,6 +1113,7 @@ static unsigned long balance_pgdat(pg_da
> .may_swap = 1,
> .swap_cluster_max = SWAP_CLUSTER_MAX,
> .swappiness = vm_swappiness,
> + .only_pages_with_naughty_cpusets = 1,
> };
>
> loop_again:
> @@ -1310,7 +1334,6 @@ void wakeup_kswapd(struct zone *zone, in
> wake_up_interruptible(&pgdat->kswapd_wait);
> }
>
> -#ifdef CONFIG_PM
> /*
> * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
> * from LRU lists system-wide, for given pass and priority, and returns the
> @@ -1363,7 +1386,7 @@ static unsigned long shrink_all_zones(un
> * LRU order by reclaiming preferentially
> * inactive > active > active referenced > active mapped
> */
> -unsigned long shrink_all_memory(unsigned long nr_pages)
> +unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs)
> {
> unsigned long lru_pages, nr_slab;
> unsigned long ret = 0;
> @@ -1376,6 +1399,8 @@ unsigned long shrink_all_memory(unsigned
> .swap_cluster_max = nr_pages,
> .may_writepage = 1,
> .swappiness = vm_swappiness,
> + .only_pages_with_naughty_cpusets = 1,
> + .only_this_cpuset = cs,
> };
>
> delay_swap_prefetch();
> @@ -1462,7 +1487,6 @@ out:
>
> return ret;
> }
> -#endif
>
> #ifdef CONFIG_HOTPLUG_CPU
> /* It's optimal to keep kswapds on the same CPUs as their memory, but
> @@ -1568,6 +1592,7 @@ static int __zone_reclaim(struct zone *z
> SWAP_CLUSTER_MAX),
> .gfp_mask = gfp_mask,
> .swappiness = vm_swappiness,
> + .only_pages_with_naughty_cpusets = 1,
> };
>
> disable_swap_token();
> _
>
--
----------------------------------------------------------------------
Chandra Seetharaman | Be careful what you choose....
- [EMAIL PROTECTED] | .......you may get it.
----------------------------------------------------------------------