Dave,
It would be helpful if you could Cc: ckrm-tech on future
resource-management-related patches.
Generic comments (might already be in your list):
- Nicely sized patch.
- We need both a min and a max to do effective resource management.
  Having a limit is good, but not sufficient.
- What happens to accounting when a task is moved from one "group"
  to another?
- What happens when a "group" is removed?
- How do the limit and the hierarchy play together?
- What about shared pages?
Comments specific to this code:
- In the reclamation path, walking the whole page list in search of
  pages belonging to a naughty "group" will be costly.
- With the logic of (forcibly) limiting a "group" at allocation time,
  how will we find naughty "groups" during reclamation?
See the inline comments below.
regards,
chandra
On Tue, 2006-08-15 at 12:20 -0700, [EMAIL PROTECTED] wrote:
<snip>
> #endif /* _LINUX_CPUSET_H */
> diff -puN include/linux/gfp.h~challenged-memory-controller include/linux/gfp.h
> --- lxc/include/linux/gfp.h~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
> +++ lxc-dave/include/linux/gfp.h 2006-08-15 07:47:34.000000000 -0700
> @@ -108,10 +108,6 @@ static inline enum zone_type gfp_zone(gf
> * optimized to &contig_page_data at compile-time.
> */
>
> -#ifndef HAVE_ARCH_FREE_PAGE
> -static inline void arch_free_page(struct page *page, int order) { }
> -#endif
> -
Not a good idea. What happens on arches that _have_ an
arch_free_page()?
Maybe you can define a function that calls arch_free_page() and also
does what you want done.

Thinking about it more... arch_free_page() may not be the right place
to credit the "group" for pages being freed, since the page may be
freed immediately after arch_free_page(). IMO, a better place would be
some lower-level function like free_pages_bulk().
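A rough sketch of the wrapper idea (hypothetical helper name, untested):

	/*
	 * Keep the per-arch hook intact and do the accounting next to
	 * it, instead of redefining arch_free_page().
	 */
	static inline void free_page_account(struct page *page, int order)
	{
		arch_free_page(page, order);
		cpuset_dec_nr_pages(page->cpuset, 1 << order);
	}
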
> extern struct page *
> FASTCALL(__alloc_pages(gfp_t, unsigned int, struct zonelist *));
>
> diff -puN include/linux/mm.h~challenged-memory-controller include/linux/mm.h
> diff -puN include/linux/sched.h~challenged-memory-controller include/linux/sched.h
> diff -puN include/linux/swap.h~challenged-memory-controller include/linux/swap.h
> --- lxc/include/linux/swap.h~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
> +++ lxc-dave/include/linux/swap.h 2006-08-15 07:47:34.000000000 -0700
> @@ -188,7 +188,7 @@ extern void swap_setup(void);
>
> /* linux/mm/vmscan.c */
> extern unsigned long try_to_free_pages(struct zone **, gfp_t);
> -extern unsigned long shrink_all_memory(unsigned long nr_pages);
> +extern unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs);
IMO shrink_all_memory() may not be the right function, because it also
tries to free up slab etc.; a better place might be shrink_zones() or
shrink_all_zones().
> extern int vm_swappiness;
> extern int remove_mapping(struct address_space *mapping, struct page *page);
> extern long vm_total_pages;
> diff -puN kernel/cpuset.c~challenged-memory-controller kernel/cpuset.c
> --- lxc/kernel/cpuset.c~challenged-memory-controller	2006-08-14 13:22:12.000000000 -0700
> +++ lxc-dave/kernel/cpuset.c 2006-08-15 08:00:40.000000000 -0700
> @@ -21,6 +21,7 @@
> #include <linux/cpu.h>
> #include <linux/cpumask.h>
> #include <linux/cpuset.h>
> +#include <linux/delay.h>
> #include <linux/err.h>
> #include <linux/errno.h>
> #include <linux/file.h>
> @@ -46,6 +47,7 @@
> #include <linux/spinlock.h>
> #include <linux/stat.h>
> #include <linux/string.h>
> +#include <linux/swap.h>
> #include <linux/time.h>
> #include <linux/backing-dev.h>
> #include <linux/sort.h>
> @@ -97,6 +99,8 @@ struct cpuset {
> * recent time this cpuset changed its mems_allowed.
> */
> int mems_generation;
> + int mems_nr_pages;
> + int mems_max_pages;
>
> struct fmeter fmeter; /* memory_pressure filter */
> };
> @@ -112,6 +116,55 @@ typedef enum {
> CS_SPREAD_SLAB,
> } cpuset_flagbits_t;
>
> +int shrink_cpuset(struct cpuset *cs, gfp_t gfpmask, int tries)
gfpmask is unused.
> +{
> + int nr_shrunk = 0;
> + while (cpuset_amount_over_memory_max(cs)) {
> + if (tries-- < 0)
> + break;
> + nr_shrunk += shrink_all_memory(10, cs);
What is the purpose of nr_shrunk? It is accumulated but never used.
> + }
> + return 0;
> +}
Since it always returns 0, why can't it be a void function?
> +
> +int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask)
> +{
> + int ret;
> + if (!cs)
> + return 0;
> + cs->mems_nr_pages += nr;
mems_nr_pages is not decremented in the failure case (the -ENOMEM
returns below). Better to increment it only in the success case.
> + if (cpuset_amount_over_memory_max(cs)) {
> + if (!(gfpmask & __GFP_WAIT))
> + return -ENOMEM;
This would fail allocations made in interrupt context (no __GFP_WAIT),
which may not be good.
> + ret = shrink_cpuset(cs, gfpmask, 50);
> + }
> + if (cpuset_amount_over_memory_max(cs))
> + return -ENOMEM;
> + return 0;
> +}
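Roughly what I have in mind (untested sketch, keeping the patch's
-1 == unlimited convention):

	int cpuset_inc_nr_pages(struct cpuset *cs, int nr, gfp_t gfpmask)
	{
		if (!cs)
			return 0;
		if (cs->mems_max_pages >= 0 &&
		    cs->mems_nr_pages + nr > cs->mems_max_pages) {
			if (!(gfpmask & __GFP_WAIT)) {
				/*
				 * Don't fail atomic/interrupt-context
				 * allocations; charge them anyway and let
				 * later reclaim pull the "group" back.
				 */
				cs->mems_nr_pages += nr;
				return 0;
			}
			shrink_cpuset(cs, gfpmask, 50);
			if (cs->mems_nr_pages + nr > cs->mems_max_pages)
				return -ENOMEM;
		}
		/* charge only on success, so the failure path needs no undo */
		cs->mems_nr_pages += nr;
		return 0;
	}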
> +void cpuset_dec_nr_pages(struct cpuset *cs, int nr)
> +{
> + if (!cs)
> + return;
> + cs->mems_nr_pages -= nr;
> +}
> +int cpuset_get_nr_pages(const struct cpuset *cs)
> +{
> + return cs->mems_nr_pages;
> +}
> +int cpuset_amount_over_memory_max(const struct cpuset *cs)
> +{
> + int amount;
> +
> + if (!cs || cs->mems_max_pages < 0)
> + return 0;
> + amount = cs->mems_nr_pages - cs->mems_max_pages;
> + if (amount < 0)
> + amount = 0;
> + return amount;
> +}
None of the callers use the exact amount; they all treat the return
value as a boolean. If there is no need for the exact number, can we
simply return (cs->mems_nr_pages > cs->mems_max_pages)?
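i.e., something like (sketch; hypothetical name, boolean semantics):

	int cpuset_over_memory_max(const struct cpuset *cs)
	{
		if (!cs || cs->mems_max_pages < 0)
			return 0;
		return cs->mems_nr_pages > cs->mems_max_pages;
	}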
> +
> +
> /* convenient tests for these bits */
> static inline int is_cpu_exclusive(const struct cpuset *cs)
> {
> @@ -173,6 +226,8 @@ static struct cpuset top_cpuset = {
> .flags = ((1 << CS_CPU_EXCLUSIVE) | (1 << CS_MEM_EXCLUSIVE)),
> .cpus_allowed = CPU_MASK_ALL,
> .mems_allowed = NODE_MASK_ALL,
> + .mems_nr_pages = 0,
> + .mems_max_pages = -1,
Instead of having these as int with -1 meaning "no limit", why not
use unsigned long and ULONG_MAX?
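i.e. (sketch):

	unsigned long mems_nr_pages;
	unsigned long mems_max_pages;	/* ULONG_MAX == unlimited */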
> .count = ATOMIC_INIT(0),
> .sibling = LIST_HEAD_INIT(top_cpuset.sibling),
> .children = LIST_HEAD_INIT(top_cpuset.children),
> @@ -1021,6 +1076,17 @@ static int update_memory_pressure_enable
> return 0;
> }
>
> +static int update_memory_max_nr_pages(struct cpuset *cs, char *buf)
> +{
> + int rate = simple_strtol(buf, NULL, 10);
> + int shrunk;
> + int loopnr = 0;
Unused variables: loopnr is never used, and shrunk is assigned but
never read.
> + cs->mems_max_pages = rate;
> + while (cpuset_amount_over_memory_max(cs))
> + shrunk = shrink_cpuset(cs, 0, 10);
> + return 0;
> +}
> +
> /*
> * update_flag - read a 0 or a 1 in a file and update associated flag
> * bit: the bit to update (CS_CPU_EXCLUSIVE, CS_MEM_EXCLUSIVE,
> @@ -1109,6 +1175,7 @@ static int update_flag(cpuset_flagbits_t
> */
>
> #define FM_COEF 933 /* coefficient for half-life of 10 secs */
> +#define FM_COEF 93 /* coefficient for half-life of 10 secs */
What is the purpose of this change? Redefining FM_COEF without an
#undef will trigger a redefinition warning; it looks like leftover
debugging.
> #define FM_MAXTICKS ((time_t)99) /* useless computing more ticks than this */
> #define FM_MAXCNT 1000000 /* limit cnt to avoid overflow */
> #define FM_SCALE 1000 /* faux fixed point scale */
> @@ -1263,6 +1330,8 @@ typedef enum {
> FILE_NOTIFY_ON_RELEASE,
> FILE_MEMORY_PRESSURE_ENABLED,
> FILE_MEMORY_PRESSURE,
> + FILE_MEMORY_MAX,
> + FILE_MEMORY_USED,
> FILE_SPREAD_PAGE,
> FILE_SPREAD_SLAB,
> FILE_TASKLIST,
> @@ -1321,6 +1390,9 @@ static ssize_t cpuset_common_file_write(
> case FILE_MEMORY_PRESSURE_ENABLED:
> retval = update_memory_pressure_enabled(cs, buffer);
> break;
> + case FILE_MEMORY_MAX:
> + retval = update_memory_max_nr_pages(cs, buffer);
> + break;
> case FILE_MEMORY_PRESSURE:
> retval = -EACCES;
> break;
> @@ -1441,6 +1513,12 @@ static ssize_t cpuset_common_file_read(s
> case FILE_MEMORY_PRESSURE:
> s += sprintf(s, "%d", fmeter_getrate(&cs->fmeter));
> break;
> + case FILE_MEMORY_MAX:
> + s += sprintf(s, "%d", cs->mems_max_pages);
> + break;
> + case FILE_MEMORY_USED:
> + s += sprintf(s, "%d", cs->mems_nr_pages);
> + break;
> case FILE_SPREAD_PAGE:
> *s++ = is_spread_page(cs) ? '1' : '0';
> break;
> @@ -1785,6 +1863,16 @@ static struct cftype cft_cpu_exclusive =
> .private = FILE_CPU_EXCLUSIVE,
> };
>
> +static struct cftype cft_mem_used = {
> + .name = "memory_nr_pages",
> + .private = FILE_MEMORY_USED,
> +};
> +
> +static struct cftype cft_mem_max = {
> + .name = "memory_max_pages",
> + .private = FILE_MEMORY_MAX,
> +};
> +
> static struct cftype cft_mem_exclusive = {
> .name = "mem_exclusive",
> .private = FILE_MEM_EXCLUSIVE,
> @@ -1830,6 +1918,10 @@ static int cpuset_populate_dir(struct de
> return err;
> if ((err = cpuset_add_file(cs_dentry, &cft_cpu_exclusive)) < 0)
> return err;
> + if ((err = cpuset_add_file(cs_dentry, &cft_mem_max)) < 0)
> + return err;
> + if ((err = cpuset_add_file(cs_dentry, &cft_mem_used)) < 0)
> + return err;
> if ((err = cpuset_add_file(cs_dentry, &cft_mem_exclusive)) < 0)
> return err;
> if ((err = cpuset_add_file(cs_dentry, &cft_notify_on_release)) < 0)
> @@ -1880,6 +1972,8 @@ static long cpuset_create(struct cpuset
> INIT_LIST_HEAD(&cs->sibling);
> INIT_LIST_HEAD(&cs->children);
> cs->mems_generation = cpuset_mems_generation++;
> + cs->mems_max_pages = parent->mems_max_pages;
IMO this is not intuitive. Having the same default value (unlimited)
for all "groups" sounds more intuitive than inheriting the parent's
limit.
> + cs->mems_nr_pages = 0;
> fmeter_init(&cs->fmeter);
>
> cs->parent = parent;
> @@ -1986,6 +2080,8 @@ int __init cpuset_init_early(void)
>
> tsk->cpuset = &top_cpuset;
> tsk->cpuset->mems_generation = cpuset_mems_generation++;
> + tsk->cpuset->mems_max_pages = -1;
> + tsk->cpuset->mems_nr_pages = 0;
> return 0;
> }
>
> @@ -2005,6 +2101,8 @@ int __init cpuset_init(void)
>
> fmeter_init(&top_cpuset.fmeter);
> top_cpuset.mems_generation = cpuset_mems_generation++;
> + top_cpuset.mems_max_pages = -1;
> + top_cpuset.mems_nr_pages = 0;
>
> init_task.cpuset = &top_cpuset;
>
> @@ -2438,7 +2536,6 @@ int cpuset_memory_pressure_enabled __rea
> void __cpuset_memory_pressure_bump(void)
> {
> struct cpuset *cs;
> -
> task_lock(current);
> cs = current->cpuset;
> fmeter_markevent(&cs->fmeter);
> diff -puN mm/page_alloc.c~challenged-memory-controller mm/page_alloc.c
> --- lxc/mm/page_alloc.c~challenged-memory-controller	2006-08-14 13:24:16.000000000 -0700
> +++ lxc-dave/mm/page_alloc.c 2006-08-15 07:57:13.000000000 -0700
> @@ -470,6 +470,11 @@ static void free_one_page(struct zone *z
> free_pages_bulk(zone, 1, &list, order);
> }
>
> +void arch_free_page(struct page *page, int order)
> +{
> + cpuset_dec_nr_pages(page->cpuset, 1<<order);
> +}
> +
> static void __free_pages_ok(struct page *page, unsigned int order)
> {
> unsigned long flags;
> @@ -1020,6 +1025,9 @@ __alloc_pages(gfp_t gfp_mask, unsigned i
>
> might_sleep_if(wait);
>
> + if (cpuset_inc_nr_pages(current->cpuset, 1<<order, gfp_mask))
> + return NULL;
> +
As pointed out above, interrupt-context allocations may need to be
handled differently here (see the cpuset_inc_nr_pages() sketch
earlier).
> restart:
> z = zonelist->zones; /* the list of zones suitable for gfp_mask */
>
> @@ -1159,6 +1167,10 @@ got_pg:
> if (page)
> set_page_owner(page, order, gfp_mask);
> #endif
> + if (page)
> + page->cpuset = current->cpuset;
This caches a pointer to the cpuset without incrementing the reference
count of current->cpuset, so the cpuset could go away while the page
still points at it.
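A sketch of what I mean, assuming cs->count is the right reference
count to take here:

	if (page) {
		atomic_inc(&current->cpuset->count);
		page->cpuset = current->cpuset;
	} else
		cpuset_dec_nr_pages(current->cpuset, 1 << order);

(with a matching atomic_dec() when the page is freed and page->cpuset
is dropped).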
> + else
> + cpuset_dec_nr_pages(current->cpuset, 1<<order);
> return page;
> }
>
> diff -puN mm/rmap.c~challenged-memory-controller mm/rmap.c
> --- lxc/mm/rmap.c~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
> +++ lxc-dave/mm/rmap.c 2006-08-15 08:01:26.000000000 -0700
> @@ -927,3 +927,8 @@ int try_to_unmap(struct page *page, int
> return ret;
> }
>
> +extern int cpuset_amount_over_memory_max(const struct cpuset *cs);
> +int page_has_naughty_cpuset(struct page *page)
> +{
> + return cpuset_amount_over_memory_max(page->cpuset);
> +}
Why not have this function in vmscan.c itself? It could be a static
inline there. BTW, is this function necessary at all?
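i.e. (sketch):

	/* in mm/vmscan.c */
	static inline int page_has_naughty_cpuset(struct page *page)
	{
		return cpuset_amount_over_memory_max(page->cpuset);
	}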
> diff -puN mm/vmscan.c~challenged-memory-controller mm/vmscan.c
> --- lxc/mm/vmscan.c~challenged-memory-controller	2006-08-15 07:47:28.000000000 -0700
> +++ lxc-dave/mm/vmscan.c 2006-08-15 08:05:03.000000000 -0700
> @@ -63,8 +63,9 @@ struct scan_control {
> int swap_cluster_max;
>
> int swappiness;
> -
> int all_unreclaimable;
> + int only_pages_with_naughty_cpusets;
> + struct cpuset *only_this_cpuset;
> };
>
> #define lru_to_page(_head) (list_entry((_head)->prev, struct page, lru))
> @@ -445,6 +446,10 @@ static unsigned long shrink_page_list(st
>
> VM_BUG_ON(PageActive(page));
>
> + if (cpuset_amount_over_memory_max(sc->only_this_cpuset) &&
> + page->cpuset && page->cpuset != sc->only_this_cpuset) {
> + goto keep_locked;
> + }
Since sc->only_this_cpuset stays constant while page is what changes
in this loop, the cpuset_amount_over_memory_max() check is
loop-invariant; a small rearrangement (hoisting it out of the per-page
loop) would benefit performance.
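A sketch of the hoisted form (over_max is a hypothetical local):

	int over_max = cpuset_amount_over_memory_max(sc->only_this_cpuset);
	...
		if (over_max && page->cpuset &&
		    page->cpuset != sc->only_this_cpuset)
			goto keep_locked;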
> sc->nr_scanned++;
>
> if (!sc->may_swap && page_mapped(page))
> @@ -793,9 +798,20 @@ force_reclaim_mapped:
> spin_unlock_irq(&zone->lru_lock);
>
> while (!list_empty(&l_hold)) {
> + extern int page_has_naughty_cpuset(struct page *page);
> cond_resched();
> page = lru_to_page(&l_hold);
> list_del(&page->lru);
> + if (sc->only_this_cpuset &&
> + page->cpuset && page->cpuset != sc->only_this_cpuset) {
> + list_add(&page->lru, &l_active);
> + continue;
> + }
> + if (sc->only_pages_with_naughty_cpusets &&
> + !page_has_naughty_cpuset(page)) {
> + list_add(&page->lru, &l_active);
> + continue;
> + }
> if (page_mapped(page)) {
> if (!reclaim_mapped ||
> (total_swap_pages == 0 && PageAnon(page)) ||
> @@ -875,6 +891,7 @@ static unsigned long shrink_zone(int pri
> unsigned long nr_inactive;
> unsigned long nr_to_scan;
> unsigned long nr_reclaimed = 0;
> + int nr_scans = 0;
>
> atomic_inc(&zone->reclaim_in_progress);
>
> @@ -897,6 +914,11 @@ static unsigned long shrink_zone(int pri
> nr_inactive = 0;
>
> while (nr_active || nr_inactive) {
> + nr_scans++;
> + if (printk_ratelimit())
> + printk("%s() scan nr: %d\n", __func__, nr_scans);
> + if (nr_scans > 20)
> + sc->only_pages_with_naughty_cpusets = 0;
> if (nr_active) {
> nr_to_scan = min(nr_active,
> (unsigned long)sc->swap_cluster_max);
> @@ -993,6 +1015,7 @@ unsigned long try_to_free_pages(struct z
> .swap_cluster_max = SWAP_CLUSTER_MAX,
> .may_swap = 1,
> .swappiness = vm_swappiness,
> + .only_pages_with_naughty_cpusets = 1,
> };
>
> delay_swap_prefetch();
> @@ -1090,6 +1113,7 @@ static unsigned long balance_pgdat(pg_da
> .may_swap = 1,
> .swap_cluster_max = SWAP_CLUSTER_MAX,
> .swappiness = vm_swappiness,
> + .only_pages_with_naughty_cpusets = 1,
> };
>
> loop_again:
> @@ -1310,7 +1334,6 @@ void wakeup_kswapd(struct zone *zone, in
> wake_up_interruptible(&pgdat->kswapd_wait);
> }
>
> -#ifdef CONFIG_PM
> /*
> * Helper function for shrink_all_memory(). Tries to reclaim 'nr_pages' pages
> * from LRU lists system-wide, for given pass and priority, and returns the
> @@ -1363,7 +1386,7 @@ static unsigned long shrink_all_zones(un
> * LRU order by reclaiming preferentially
> * inactive > active > active referenced > active mapped
> */
> -unsigned long shrink_all_memory(unsigned long nr_pages)
> +unsigned long shrink_all_memory(unsigned long nr_pages, struct cpuset *cs)
> {
> unsigned long lru_pages, nr_slab;
> unsigned long ret = 0;
> @@ -1376,6 +1399,8 @@ unsigned long shrink_all_memory(unsigned
> .swap_cluster_max = nr_pages,
> .may_writepage = 1,
> .swappiness = vm_swappiness,
> + .only_pages_with_naughty_cpusets = 1,
> + .only_this_cpuset = cs,
> };
>
> delay_swap_prefetch();
> @@ -1462,7 +1487,6 @@ out:
>
> return ret;
> }
> -#endif
>
> #ifdef CONFIG_HOTPLUG_CPU
> /* It's optimal to keep kswapds on the same CPUs as their memory, but
> @@ -1568,6 +1592,7 @@ static int __zone_reclaim(struct zone *z
> SWAP_CLUSTER_MAX),
> .gfp_mask = gfp_mask,
> .swappiness = vm_swappiness,
> + .only_pages_with_naughty_cpusets = 1,
> };
>
> disable_swap_token();
> _
>
--
----------------------------------------------------------------------
Chandra Seetharaman | Be careful what you choose....
- [EMAIL PROTECTED] | .......you may get it.
----------------------------------------------------------------------