On 02/14/2017 03:44 PM, Anshuman Khandual wrote:
> On 02/14/2017 01:58 PM, Vlastimil Babka wrote:
>> On 02/10/2017 11:06 AM, Anshuman Khandual wrote:
>>> This implements allocation isolation for CDM nodes in the buddy allocator
>>> by discarding CDM memory zones all the time, except when the gfp flag has
>>> __GFP_THISNODE set or when a non-NULL nodemask contains CDM nodes (explicit
>>> allocation requests in the kernel, or user process MPOL_BIND policy based
>>> requests).
>>>
>>> Signed-off-by: Anshuman Khandual <khand...@linux.vnet.ibm.com>
>>> ---
>>>  mm/page_alloc.c | 16 ++++++++++++++++
>>>  1 file changed, 16 insertions(+)
>>>
>>> diff --git a/mm/page_alloc.c b/mm/page_alloc.c
>>> index 84d61bb..392c24a 100644
>>> --- a/mm/page_alloc.c
>>> +++ b/mm/page_alloc.c
>>> @@ -64,6 +64,7 @@
>>>  #include <linux/page_owner.h>
>>>  #include <linux/kthread.h>
>>>  #include <linux/memcontrol.h>
>>> +#include <linux/node.h>
>>>  
>>>  #include <asm/sections.h>
>>>  #include <asm/tlbflush.h>
>>> @@ -2908,6 +2909,21 @@ get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
>>>             struct page *page;
>>>             unsigned long mark;
>>>  
>>> +           /*
>>> +            * CDM nodes get skipped if the requested gfp flag
>>> +            * does not have __GFP_THISNODE set or the nodemask
>>> +            * does not have any CDM nodes in case the nodemask
>>> +            * is non NULL (explicit allocation requests from
>>> +            * kernel or user process MPOL_BIND policy which has
>>> +            * CDM nodes).
>>> +            */
>>> +           if (is_cdm_node(zone->zone_pgdat->node_id)) {
>>> +                   if (!(gfp_mask & __GFP_THISNODE)) {
>>> +                           if (!ac->nodemask)
>>> +                                   continue;
>>> +                   }
>>> +           }
>>
>> With the current cpuset implementation, this will have a subtle corner
>> case when allocating from a cpuset that allows the cdm node, and there
>> is no (task or vma) mempolicy applied for the allocation. In the fast
>> path (__alloc_pages_nodemask()) we'll set ac->nodemask to
>> current->mems_allowed, so your code will wrongly assume that this
>> ac->nodemask is a policy that allows the CDM node. Probably not what you
>> want?
> 
> You are right, it's a problem and not what we want. We can make the
> function get_page_from_freelist() take another parameter "orig_nodemask",
> the nodemask that was passed into __alloc_pages_nodemask() in the first
> place. Then, inside the zonelist iterator, we can compare orig_nodemask
> with the current ac.nodemask to figure out whether a cpuset swap of the
> nodemask has happened, and skip the CDM node if necessary. That's a viable
> solution IMHO.

Hello Vlastimil,

As I mentioned yesterday, this solution works. I have tested it and verified
that there is no allocation leak into CDM even after cpuset_enabled() is
turned ON by changing the /sys/fs/cgroup/cpuset/ setup. The major part of the
change is just adding an additional parameter to the function
get_page_from_freelist() and updating all of its call sites.

- Anshuman

diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 392c24a..9f41e0f 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -2894,11 +2894,15 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
  */
 static struct page *
 get_page_from_freelist(gfp_t gfp_mask, unsigned int order, int alloc_flags,
-                                               const struct alloc_context *ac)
+                       const struct alloc_context *ac, nodemask_t *orig_mask)
 {
        struct zoneref *z = ac->preferred_zoneref;
        struct zone *zone;
        struct pglist_data *last_pgdat_dirty_limit = NULL;
+       bool cpuset_fallback = false;   /* did cpusets substitute the nodemask? */
+
+       if (ac->nodemask != orig_mask)
+               cpuset_fallback = true;
 
        /*
         * Scan zonelist, looking for a zone with enough free.
@@ -2919,6 +2923,9 @@ static bool zone_allows_reclaim(struct zone *local_zone, struct zone *zone)
                 */
                if (is_cdm_node(zone->zone_pgdat->node_id)) {
                        if (!(gfp_mask & __GFP_THISNODE)) {
+                               if (cpuset_fallback)
+                                       continue;
+
                                if (!ac->nodemask)
                                        continue;
                        }
@@ -3066,7 +3073,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 
 static inline struct page *
 __alloc_pages_may_oom(gfp_t gfp_mask, unsigned int order,
-       const struct alloc_context *ac, unsigned long *did_some_progress)
+       const struct alloc_context *ac, unsigned long *did_some_progress, nodemask_t *orig_mask)
 {
        struct oom_control oc = {
                .zonelist = ac->zonelist,
@@ -3095,7 +3102,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
         * we're still under heavy pressure.
         */
        page = get_page_from_freelist(gfp_mask | __GFP_HARDWALL, order,
-                                       ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac);
+                                       ALLOC_WMARK_HIGH|ALLOC_CPUSET, ac, orig_mask);
        if (page)
                goto out;
 
@@ -3131,14 +3138,14 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 
                if (gfp_mask & __GFP_NOFAIL) {
                        page = get_page_from_freelist(gfp_mask, order,
-                                       ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac);
+                                       ALLOC_NO_WATERMARKS|ALLOC_CPUSET, ac, orig_mask);
                        /*
                         * fallback to ignore cpuset restriction if our nodes
                         * are depleted
                         */
                        if (!page)
                                page = get_page_from_freelist(gfp_mask, order,
-                                       ALLOC_NO_WATERMARKS, ac);
+                                       ALLOC_NO_WATERMARKS, ac, orig_mask);
                }
        }
 out:
@@ -3157,7 +3164,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
-               enum compact_priority prio, enum compact_result *compact_result)
+               enum compact_priority prio, enum compact_result *compact_result, nodemask_t *orig_mask)
 {
        struct page *page;
 
@@ -3178,7 +3185,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
         */
        count_vm_event(COMPACTSTALL);
 
-       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
 
        if (page) {
                struct zone *zone = page_zone(page);
@@ -3263,7 +3270,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static inline struct page *
 __alloc_pages_direct_compact(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
-               enum compact_priority prio, enum compact_result *compact_result)
+               enum compact_priority prio, enum compact_result *compact_result, nodemask_t *orig_mask)
 {
        *compact_result = COMPACT_SKIPPED;
        return NULL;
@@ -3330,7 +3337,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
 static inline struct page *
 __alloc_pages_direct_reclaim(gfp_t gfp_mask, unsigned int order,
                unsigned int alloc_flags, const struct alloc_context *ac,
-               unsigned long *did_some_progress)
+               unsigned long *did_some_progress, nodemask_t *orig_mask)
 {
        struct page *page = NULL;
        bool drained = false;
@@ -3340,7 +3347,7 @@ void warn_alloc(gfp_t gfp_mask, const char *fmt, ...)
                return NULL;
 
 retry:
-       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
 
        /*
         * If an allocation failed after direct reclaim, it could be because
@@ -3533,7 +3540,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 
 static inline struct page *
 __alloc_pages_slowpath(gfp_t gfp_mask, unsigned int order,
-                                               struct alloc_context *ac)
+                               struct alloc_context *ac, nodemask_t *orig_mask)
 {
        bool can_direct_reclaim = gfp_mask & __GFP_DIRECT_RECLAIM;
        struct page *page = NULL;
@@ -3597,7 +3604,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
         * The adjusted alloc_flags might result in immediate success, so try
         * that first
         */
-       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
        if (page)
                goto got_pg;
 
@@ -3612,7 +3619,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
                page = __alloc_pages_direct_compact(gfp_mask, order,
                                                alloc_flags, ac,
                                                INIT_COMPACT_PRIORITY,
-                                               &compact_result);
+                                               &compact_result, orig_mask);
                if (page)
                        goto got_pg;
 
@@ -3661,7 +3668,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
        }
 
        /* Attempt with potentially adjusted zonelist and alloc_flags */
-       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac);
+       page = get_page_from_freelist(gfp_mask, order, alloc_flags, ac, orig_mask);
        if (page)
                goto got_pg;
 
@@ -3697,13 +3704,13 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
 
        /* Try direct reclaim and then allocating */
        page = __alloc_pages_direct_reclaim(gfp_mask, order, alloc_flags, ac,
-                                                       &did_some_progress);
+                                                       &did_some_progress, orig_mask);
        if (page)
                goto got_pg;
 
        /* Try direct compaction and then allocating */
        page = __alloc_pages_direct_compact(gfp_mask, order, alloc_flags, ac,
-                                       compact_priority, &compact_result);
+                                       compact_priority, &compact_result, orig_mask);
        if (page)
                goto got_pg;
 
@@ -3750,7 +3757,7 @@ bool gfp_pfmemalloc_allowed(gfp_t gfp_mask)
                goto retry_cpuset;
 
        /* Reclaim has failed us, start killing things */
-       page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress);
+       page = __alloc_pages_may_oom(gfp_mask, order, ac, &did_some_progress, orig_mask);
        if (page)
                goto got_pg;
 
@@ -3842,7 +3849,7 @@ struct page *
        }
 
        /* First allocation attempt */
-       page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac);
+       page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac, nodemask);
        if (likely(page))
                goto out;
 
@@ -3861,7 +3868,7 @@ struct page *
        if (unlikely(ac.nodemask != nodemask))
                ac.nodemask = nodemask;
 
-       page = __alloc_pages_slowpath(alloc_mask, order, &ac);
+       page = __alloc_pages_slowpath(alloc_mask, order, &ac, nodemask);
 
 out:
        if (memcg_kmem_enabled() && (gfp_mask & __GFP_ACCOUNT) && page &&
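
With orig_mask threaded through, the CDM skip decision reduces to the
following predicate (a condensed restatement for review only; the helper
cdm_zone_forbidden is a hypothetical name, not part of the patch):

	/* restates the skip logic in get_page_from_freelist() above */
	static inline bool cdm_zone_forbidden(struct zone *zone, gfp_t gfp_mask,
					      nodemask_t *req_mask, bool cpuset_fallback)
	{
		if (!is_cdm_node(zone->zone_pgdat->node_id))
			return false;	/* not a CDM zone, always eligible */
		if (gfp_mask & __GFP_THISNODE)
			return false;	/* explicit node request is honoured */
		if (cpuset_fallback)
			return true;	/* mask came from cpusets, not the caller */
		return req_mask == NULL;	/* allow only explicit nodemasks */
	}

So CDM zones stay out of every generic fallback path, including the cpuset
corner case discussed above.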
