The OOM killer must know whether killing a task can actually free memory such that pressure is reduced.
A private node only contributes to relieving pressure if it participates in both reclaim and demotion. Without this check, the OOM killer may select an undeserving victim. Introduce NP_OPS_OOM_ELIGIBLE and helpers node_oom_eligible() and zone_oom_eligible(). Replace cpuset_mems_allowed_intersects() in oom_cpuset_eligible() with oom_mems_intersect() that iterates N_MEMORY nodes and skips ineligible private nodes. Update constrained_alloc() to use zone_oom_eligible() for constraint detection and node_oom_eligible() to exclude ineligible nodes from totalpages accounting. Remove cpuset_mems_allowed_intersects() as it has no remaining callers. Signed-off-by: Gregory Price <[email protected]> --- include/linux/cpuset.h | 9 ------- include/linux/node_private.h | 3 +++ kernel/cgroup/cpuset.c | 17 ------------ mm/oom_kill.c | 52 ++++++++++++++++++++++++++++++++---- 4 files changed, 50 insertions(+), 31 deletions(-) diff --git a/include/linux/cpuset.h b/include/linux/cpuset.h index 7b2f3f6b68a9..53ccfb00b277 100644 --- a/include/linux/cpuset.h +++ b/include/linux/cpuset.h @@ -97,9 +97,6 @@ static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return true; } -extern int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, - const struct task_struct *tsk2); - #ifdef CONFIG_CPUSETS_V1 #define cpuset_memory_pressure_bump() \ do { \ @@ -241,12 +238,6 @@ static inline bool cpuset_zone_allowed(struct zone *z, gfp_t gfp_mask) return true; } -static inline int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, - const struct task_struct *tsk2) -{ - return 1; -} - static inline void cpuset_memory_pressure_bump(void) {} static inline void cpuset_task_status_allowed(struct seq_file *m, diff --git a/include/linux/node_private.h b/include/linux/node_private.h index 34be52383255..34d862f09e24 100644 --- a/include/linux/node_private.h +++ b/include/linux/node_private.h @@ -141,6 +141,9 @@ struct node_private_ops { /* Kernel reclaim (kswapd, 
direct reclaim, OOM) operates on this node */ #define NP_OPS_RECLAIM BIT(4) +/* Private node is OOM-eligible: reclaim can run and pages can be demoted here */ +#define NP_OPS_OOM_ELIGIBLE (NP_OPS_RECLAIM | NP_OPS_DEMOTION) + /** * struct node_private - Per-node container for N_MEMORY_PRIVATE nodes * diff --git a/kernel/cgroup/cpuset.c b/kernel/cgroup/cpuset.c index 1a597f0c7c6c..29789d544fd5 100644 --- a/kernel/cgroup/cpuset.c +++ b/kernel/cgroup/cpuset.c @@ -4530,23 +4530,6 @@ int cpuset_mem_spread_node(void) return cpuset_spread_node(¤t->cpuset_mem_spread_rotor); } -/** - * cpuset_mems_allowed_intersects - Does @tsk1's mems_allowed intersect @tsk2's? - * @tsk1: pointer to task_struct of some task. - * @tsk2: pointer to task_struct of some other task. - * - * Description: Return true if @tsk1's mems_allowed intersects the - * mems_allowed of @tsk2. Used by the OOM killer to determine if - * one of the task's memory usage might impact the memory available - * to the other. - **/ - -int cpuset_mems_allowed_intersects(const struct task_struct *tsk1, - const struct task_struct *tsk2) -{ - return nodes_intersects(tsk1->mems_allowed, tsk2->mems_allowed); -} - /** * cpuset_print_current_mems_allowed - prints current's cpuset and mems_allowed * diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 5eb11fbba704..cd0d65ccd1e8 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -74,7 +74,45 @@ static inline bool is_memcg_oom(struct oom_control *oc) return oc->memcg != NULL; } +/* Private nodes are only eligible if they support both reclaim and demotion */ +static inline bool node_oom_eligible(int nid) +{ + if (!node_state(nid, N_MEMORY_PRIVATE)) + return true; + return (node_private_flags(nid) & NP_OPS_OOM_ELIGIBLE) == + NP_OPS_OOM_ELIGIBLE; +} + +static inline bool zone_oom_eligible(struct zone *zone, gfp_t gfp_mask) +{ + if (!node_oom_eligible(zone_to_nid(zone))) + return false; + return cpuset_zone_allowed(zone, gfp_mask); +} + #ifdef CONFIG_NUMA +/* + * Killing a task can only 
relieve system pressure if freed memory can be + * demoted there and reclaim can operate on the node's pages, so we + * omit private nodes that aren't eligible. + */ +static bool oom_mems_intersect(const struct task_struct *tsk1, + const struct task_struct *tsk2) +{ + int nid; + + for_each_node_state(nid, N_MEMORY) { + if (!node_isset(nid, tsk1->mems_allowed)) + continue; + if (!node_isset(nid, tsk2->mems_allowed)) + continue; + if (!node_oom_eligible(nid)) + continue; + return true; + } + return false; +} + /** * oom_cpuset_eligible() - check task eligibility for kill * @start: task struct of which task to consider @@ -107,9 +145,10 @@ static bool oom_cpuset_eligible(struct task_struct *start, } else { /* * This is not a mempolicy constrained oom, so only - * check the mems of tsk's cpuset. + * check the mems of tsk's cpuset, excluding private + * nodes that do not participate in kernel reclaim. */ - ret = cpuset_mems_allowed_intersects(current, tsk); + ret = oom_mems_intersect(current, tsk); } if (ret) break; @@ -291,16 +330,19 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) return CONSTRAINT_MEMORY_POLICY; } - /* Check this allocation failure is caused by cpuset's wall function */ + /* Check this allocation failure is caused by cpuset or private node constraints */ for_each_zone_zonelist_nodemask(zone, z, oc->zonelist, highest_zoneidx, oc->nodemask) - if (!cpuset_zone_allowed(zone, oc->gfp_mask)) + if (!zone_oom_eligible(zone, oc->gfp_mask)) cpuset_limited = true; if (cpuset_limited) { oc->totalpages = total_swap_pages; - for_each_node_mask(nid, cpuset_current_mems_allowed) + for_each_node_mask(nid, cpuset_current_mems_allowed) { + if (!node_oom_eligible(nid)) + continue; oc->totalpages += node_present_pages(nid); + } return CONSTRAINT_CPUSET; } return CONSTRAINT_NONE; -- 2.53.0
