Ok, full reset.

I care about kernel allocations only, in particular those that carry
PF_MEMALLOC semantics.

The thing I need is that any memory allocated below
  ALLOC_WMARK_MIN|ALLOC_HIGH|ALLOC_HARDER
is only ever used by processes that have ALLOC_NO_WATERMARKS rights,
for the duration of the memory distress.
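
For context (this is existing behaviour, not something the patch adds):
a task typically gains these rights by running under PF_MEMALLOC, e.g.
in the direct reclaim path. A made-up helper, just to show how
gfp_to_alloc_flags() ends up granting ALLOC_NO_WATERMARKS:

	#include <linux/sched.h>
	#include <linux/slab.h>

	/* illustration only; not a real kernel symbol */
	static void *example_emergency_alloc(struct kmem_cache *s)
	{
		void *obj;

		/*
		 * With PF_MEMALLOC set, gfp_to_alloc_flags() includes
		 * ALLOC_NO_WATERMARKS (provided we are not in interrupt
		 * context and __GFP_NOMEMALLOC is not passed).
		 * Assumes PF_MEMALLOC was not already set.
		 */
		current->flags |= PF_MEMALLOC;
		obj = kmem_cache_alloc(s, GFP_ATOMIC);
		current->flags &= ~PF_MEMALLOC;

		return obj;
	}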

What this patch does:
 - change the page allocator to first try
   ALLOC_WMARK_MIN|ALLOC_HIGH|ALLOC_HARDER if the allocation is entitled
   to ALLOC_NO_WATERMARKS, before doing the actual ALLOC_NO_WATERMARKS
   alloc (see the condensed sketch below)

 - set page->reserve nonzero for each page allocated with
   ALLOC_NO_WATERMARKS; by the previous point this implies that all
   available zones are below ALLOC_WMARK_MIN|ALLOC_HIGH|ALLOC_HARDER

 - when a page->reserve slab is allocated, store it in s->reserve_slab
   and do not install it in ->cpu_slab[] (this forces subsequent allocs
   to retry the slab allocation).
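
In condensed form (taken from the page_alloc.c hunk below, error
handling and the rebalance loop left out), the slow path now does:

	alloc_flags = gfp_to_alloc_flags(gfp_mask);

	/* regular attempt: never dip below the watermarks here */
	page = get_page_from_freelist(gfp_mask, order, zonelist,
			alloc_flags & ~ALLOC_NO_WATERMARKS);

	if (!page && (alloc_flags & ALLOC_NO_WATERMARKS)) {
		/* try to stay above the critical threshold, ignoring cpusets */
		page = get_page_from_freelist(gfp_mask, order, zonelist,
				ALLOC_WMARK_MIN|ALLOC_HIGH|ALLOC_HARDER);
		if (!page)
			/* only now touch the reserves */
			page = get_page_from_freelist(gfp_mask, order,
					zonelist, ALLOC_NO_WATERMARKS);
	}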

All ALLOC_NO_WATERMARKS-enabled slab allocations are served from
->reserve_slab, up until the point where a !page->reserve slab
allocation succeeds, at which point the ->reserve_slab is pushed back
onto the partial lists and ->reserve_slab is set to NULL.
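
To make that hand-back explicit, this is roughly what the __slab_alloc()
hunk below does once new_slab() returns a !page->reserve slab (the inner
variable is renamed here for readability):

	if (page && !reserve) {
		/* a regular slab came back: the distress is over */
		if (unlikely(s->reserve_slab)) {
			struct page *old;

			spin_lock(&reserve_lock);
			old = s->reserve_slab;
			s->reserve_slab = NULL;
			spin_unlock(&reserve_lock);

			if (old) {
				/* push the reserve slab back onto the
				 * partial lists */
				slab_lock(old);
				unfreeze_slab(s, old);
			}
		}
		/* and install the new slab as cpu_slab as usual */
	}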

Since only the allocation of a new slab uses the gfp zone flags (the
other allocations treat them merely as placement hints), they have to
be uniform over all slab allocs for a given kmem_cache. Thus the
s->reserve_slab/page->reserve status is kmem_cache wide.

Any holes left?

---

Index: linux-2.6-git/mm/internal.h
===================================================================
--- linux-2.6-git.orig/mm/internal.h
+++ linux-2.6-git/mm/internal.h
@@ -12,6 +12,7 @@
 #define __MM_INTERNAL_H
 
 #include <linux/mm.h>
+#include <linux/hardirq.h>
 
 static inline void set_page_count(struct page *page, int v)
 {
@@ -37,4 +38,50 @@ static inline void __put_page(struct pag
 extern void fastcall __init __free_pages_bootmem(struct page *page,
                                                unsigned int order);
 
+#define ALLOC_HARDER           0x01 /* try to alloc harder */
+#define ALLOC_HIGH             0x02 /* __GFP_HIGH set */
+#define ALLOC_WMARK_MIN                0x04 /* use pages_min watermark */
+#define ALLOC_WMARK_LOW                0x08 /* use pages_low watermark */
+#define ALLOC_WMARK_HIGH       0x10 /* use pages_high watermark */
+#define ALLOC_NO_WATERMARKS    0x20 /* don't check watermarks at all */
+#define ALLOC_CPUSET           0x40 /* check for correct cpuset */
+
+/*
+ * get the deepest reaching allocation flags for the given gfp_mask
+ */
+static inline int gfp_to_alloc_flags(gfp_t gfp_mask)
+{
+       struct task_struct *p = current;
+       int alloc_flags = ALLOC_WMARK_MIN | ALLOC_CPUSET;
+       const gfp_t wait = gfp_mask & __GFP_WAIT;
+
+       /*
+        * The caller may dip into page reserves a bit more if the caller
+        * cannot run direct reclaim, or if the caller has realtime scheduling
+        * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
+        * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
+        */
+       if (gfp_mask & __GFP_HIGH)
+               alloc_flags |= ALLOC_HIGH;
+
+       if (!wait) {
+               alloc_flags |= ALLOC_HARDER;
+               /*
+                * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
+                * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
+                */
+               alloc_flags &= ~ALLOC_CPUSET;
+       } else if (unlikely(rt_task(p)) && !in_interrupt())
+               alloc_flags |= ALLOC_HARDER;
+
+       if (likely(!(gfp_mask & __GFP_NOMEMALLOC))) {
+               if (!in_interrupt() &&
+                   ((p->flags & PF_MEMALLOC) ||
+                    unlikely(test_thread_flag(TIF_MEMDIE))))
+                       alloc_flags |= ALLOC_NO_WATERMARKS;
+       }
+
+       return alloc_flags;
+}
+
 #endif
Index: linux-2.6-git/mm/page_alloc.c
===================================================================
--- linux-2.6-git.orig/mm/page_alloc.c
+++ linux-2.6-git/mm/page_alloc.c
@@ -1175,14 +1175,6 @@ failed:
        return NULL;
 }
 
-#define ALLOC_NO_WATERMARKS    0x01 /* don't check watermarks at all */
-#define ALLOC_WMARK_MIN                0x02 /* use pages_min watermark */
-#define ALLOC_WMARK_LOW                0x04 /* use pages_low watermark */
-#define ALLOC_WMARK_HIGH       0x08 /* use pages_high watermark */
-#define ALLOC_HARDER           0x10 /* try to alloc harder */
-#define ALLOC_HIGH             0x20 /* __GFP_HIGH set */
-#define ALLOC_CPUSET           0x40 /* check for correct cpuset */
-
 #ifdef CONFIG_FAIL_PAGE_ALLOC
 
 static struct fail_page_alloc_attr {
@@ -1494,6 +1486,8 @@ zonelist_scan:
 
                page = buffered_rmqueue(zonelist, zone, order, gfp_mask);
-               if (page)
+               if (page) {
+                       page->reserve = (alloc_flags & ALLOC_NO_WATERMARKS);
                        break;
+               }
 this_zone_full:
                if (NUMA_BUILD)
@@ -1619,48 +1612,36 @@ restart:
         * OK, we're below the kswapd watermark and have kicked background
         * reclaim. Now things get more complex, so set up alloc_flags according
         * to how we want to proceed.
-        *
-        * The caller may dip into page reserves a bit more if the caller
-        * cannot run direct reclaim, or if the caller has realtime scheduling
-        * policy or is asking for __GFP_HIGH memory.  GFP_ATOMIC requests will
-        * set both ALLOC_HARDER (!wait) and ALLOC_HIGH (__GFP_HIGH).
         */
-       alloc_flags = ALLOC_WMARK_MIN;
-       if ((unlikely(rt_task(p)) && !in_interrupt()) || !wait)
-               alloc_flags |= ALLOC_HARDER;
-       if (gfp_mask & __GFP_HIGH)
-               alloc_flags |= ALLOC_HIGH;
-       if (wait)
-               alloc_flags |= ALLOC_CPUSET;
+       alloc_flags = gfp_to_alloc_flags(gfp_mask);
 
-       /*
-        * Go through the zonelist again. Let __GFP_HIGH and allocations
-        * coming from realtime tasks go deeper into reserves.
-        *
-        * This is the last chance, in general, before the goto nopage.
-        * Ignore cpuset if GFP_ATOMIC (!wait) rather than fail alloc.
-        * See also cpuset_zone_allowed() comment in kernel/cpuset.c.
-        */
-       page = get_page_from_freelist(gfp_mask, order, zonelist, alloc_flags);
+       /* This is the last chance, in general, before the goto nopage. */
+       page = get_page_from_freelist(gfp_mask, order, zonelist,
+                       alloc_flags & ~ALLOC_NO_WATERMARKS);
        if (page)
                goto got_pg;
 
        /* This allocation should allow future memory freeing. */
-
 rebalance:
-       if (((p->flags & PF_MEMALLOC) || unlikely(test_thread_flag(TIF_MEMDIE)))
-                       && !in_interrupt()) {
-               if (!(gfp_mask & __GFP_NOMEMALLOC)) {
+       if (alloc_flags & ALLOC_NO_WATERMARKS) {
 nofail_alloc:
-                       /* go through the zonelist yet again, ignoring mins */
-                       page = get_page_from_freelist(gfp_mask, order,
+               /*
+                * Before going bare metal, try to get a page above the
+                * critical threshold - ignoring CPU sets.
+                */
+               page = get_page_from_freelist(gfp_mask, order, zonelist,
+                               ALLOC_WMARK_MIN|ALLOC_HIGH|ALLOC_HARDER);
+               if (page)
+                       goto got_pg;
+
+               /* go through the zonelist yet again, ignoring mins */
+               page = get_page_from_freelist(gfp_mask, order,
                                zonelist, ALLOC_NO_WATERMARKS);
-                       if (page)
-                               goto got_pg;
-                       if (gfp_mask & __GFP_NOFAIL) {
-                               congestion_wait(WRITE, HZ/50);
-                               goto nofail_alloc;
-                       }
+               if (page)
+                       goto got_pg;
+               if (wait && (gfp_mask & __GFP_NOFAIL)) {
+                       congestion_wait(WRITE, HZ/50);
+                       goto nofail_alloc;
                }
                goto nopage;
        }
@@ -1669,6 +1650,10 @@ nofail_alloc:
        if (!wait)
                goto nopage;
 
+       /* Avoid recursion of direct reclaim */
+       if (p->flags & PF_MEMALLOC)
+               goto nopage;
+
        cond_resched();
 
        /* We now go into synchronous reclaim */
Index: linux-2.6-git/include/linux/mm_types.h
===================================================================
--- linux-2.6-git.orig/include/linux/mm_types.h
+++ linux-2.6-git/include/linux/mm_types.h
@@ -60,6 +60,7 @@ struct page {
        union {
                pgoff_t index;          /* Our offset within mapping. */
                void *freelist;         /* SLUB: freelist req. slab lock */
+               int reserve;            /* page_alloc: page is a reserve page */
        };
        struct list_head lru;           /* Pageout list, eg. active_list
                                         * protected by zone->lru_lock !
Index: linux-2.6-git/include/linux/slub_def.h
===================================================================
--- linux-2.6-git.orig/include/linux/slub_def.h
+++ linux-2.6-git/include/linux/slub_def.h
@@ -46,6 +46,8 @@ struct kmem_cache {
        struct list_head list;  /* List of slab caches */
        struct kobject kobj;    /* For sysfs */
 
+       struct page *reserve_slab;
+
 #ifdef CONFIG_NUMA
        int defrag_ratio;
        struct kmem_cache_node *node[MAX_NUMNODES];
Index: linux-2.6-git/mm/slub.c
===================================================================
--- linux-2.6-git.orig/mm/slub.c
+++ linux-2.6-git/mm/slub.c
@@ -20,11 +20,13 @@
 #include <linux/mempolicy.h>
 #include <linux/ctype.h>
 #include <linux/kallsyms.h>
+#include "internal.h"
 
 /*
  * Lock order:
- *   1. slab_lock(page)
- *   2. slab->list_lock
+ *   1. reserve_lock
+ *   2. slab_lock(page)
+ *   3. node->list_lock
  *
  *   The slab_lock protects operations on the object of a particular
  *   slab and its metadata in the page struct. If the slab lock
@@ -259,6 +261,8 @@ static int sysfs_slab_alias(struct kmem_
 static void sysfs_slab_remove(struct kmem_cache *s) {}
 #endif
 
+static DEFINE_SPINLOCK(reserve_lock);
+
 /********************************************************************
  *                     Core slab cache functions
  *******************************************************************/
@@ -1007,7 +1011,7 @@ static void setup_object(struct kmem_cac
                s->ctor(object, s, 0);
 }
 
-static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node)
+static struct page *new_slab(struct kmem_cache *s, gfp_t flags, int node, int *reserve)
 {
        struct page *page;
        struct kmem_cache_node *n;
@@ -1025,6 +1029,7 @@ static struct page *new_slab(struct kmem
        if (!page)
                goto out;
 
+       *reserve = page->reserve;
        n = get_node(s, page_to_nid(page));
        if (n)
                atomic_long_inc(&n->nr_slabs);
@@ -1395,6 +1400,7 @@ static void *__slab_alloc(struct kmem_ca
 {
        void **object;
        int cpu = smp_processor_id();
+       int reserve = 0;
 
        if (!page)
                goto new_slab;
@@ -1424,10 +1430,25 @@ new_slab:
        if (page) {
                s->cpu_slab[cpu] = page;
                goto load_freelist;
-       }
+       } else if (unlikely(gfp_to_alloc_flags(gfpflags) & ALLOC_NO_WATERMARKS))
+               goto try_reserve;
 
-       page = new_slab(s, gfpflags, node);
-       if (page) {
+alloc_slab:
+       page = new_slab(s, gfpflags, node, &reserve);
+       if (page && !reserve) {
+               if (unlikely(s->reserve_slab)) {
+                       struct page *reserve;
+
+                       spin_lock(&reserve_lock);
+                       reserve = s->reserve_slab;
+                       s->reserve_slab = NULL;
+                       spin_unlock(&reserve_lock);
+
+                       if (reserve) {
+                               slab_lock(reserve);
+                               unfreeze_slab(s, reserve);
+                       }
+               }
                cpu = smp_processor_id();
                if (s->cpu_slab[cpu]) {
                        /*
@@ -1455,6 +1476,18 @@ new_slab:
                SetSlabFrozen(page);
                s->cpu_slab[cpu] = page;
                goto load_freelist;
+       } else if (page) {
+               spin_lock(&reserve_lock);
+               if (s->reserve_slab) {
+                       discard_slab(s, page);
+                       page = s->reserve_slab;
+                       goto got_reserve;
+               }
+               slab_lock(page);
+               SetSlabFrozen(page);
+               s->reserve_slab = page;
+               spin_unlock(&reserve_lock);
+               goto use_reserve;
        }
        return NULL;
 debug:
@@ -1470,6 +1503,31 @@ debug:
        page->freelist = object[page->offset];
        slab_unlock(page);
        return object;
+
+try_reserve:
+       spin_lock(&reserve_lock);
+       page = s->reserve_slab;
+       if (!page) {
+               spin_unlock(&reserve_lock);
+               goto alloc_slab;
+       }
+
+got_reserve:
+       slab_lock(page);
+       if (!page->freelist) {
+               s->reserve_slab = NULL;
+               spin_unlock(&reserve_lock);
+               unfreeze_slab(s, page);
+               goto alloc_slab;
+       }
+       spin_unlock(&reserve_lock);
+
+use_reserve:
+       object = page->freelist;
+       page->inuse++;
+       page->freelist = object[page->offset];
+       slab_unlock(page);
+       return object;
 }
 
 /*
@@ -1807,10 +1865,11 @@ static struct kmem_cache_node * __init e
 {
        struct page *page;
        struct kmem_cache_node *n;
+       int reserve;
 
        BUG_ON(kmalloc_caches->size < sizeof(struct kmem_cache_node));
 
-       page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node);
+       page = new_slab(kmalloc_caches, gfpflags | GFP_THISNODE, node, &reserve);
        /* new_slab() disables interupts */
        local_irq_enable();
 
@@ -2018,6 +2077,8 @@ static int kmem_cache_open(struct kmem_c
 #ifdef CONFIG_NUMA
        s->defrag_ratio = 100;
 #endif
+       s->reserve_slab = NULL;
+
        if (init_kmem_cache_nodes(s, gfpflags & ~SLUB_DMA))
                return 1;
 error:

