On Wed, Jun 27, 2018 at 12:18:37AM +0200, Jirka Hladky wrote:
> Hi Mel,
> 
> we have results for the "Fixes for sched/numa_balancing" series and overall
> it looks very promising.
> 
> We see improvements in the range 15-20% for the stream benchmark and
> up to 60% for the OpenMP NAS benchmark. While the NAS results are noisy
> (they show quite large variations), we see improvements on a wide range of
> 2 and 4 NUMA systems. More importantly, we don't see any regressions. I have
> posted screenshots of the median differences on 4 NUMA servers for the NAS
> benchmark with
> 
> 4x Gold 6126 CPU @ 2.60GHz
> 4x E5-4627 v2 @ 3.30GHz
> 
> to illustrate the typical results.
> 
> How are things looking at your side?
> 

I saw similar results in that it was a general win. I'm also trialing
the following monolithic patch on top, if you want to try it. The timing
of this will depend on when/if the numa_balancing fixes get picked up, and I'm
still waiting on test results to come through. Unfortunately, starting Friday
I'll be unavailable for two weeks, so this may drag on a bit.  The expanded
set of patches is at

git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git sched-numa-fast-crossnode-v1r12
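
For convenience, pulling that branch into an existing kernel tree would look
something like the following (the "mel" remote name is arbitrary):

  git remote add mel git://git.kernel.org/pub/scm/linux/kernel/git/mel/linux.git
  git fetch mel
  git checkout -b sched-numa-fast-crossnode-v1r12 mel/sched-numa-fast-crossnode-v1r12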

If the results are positive, I'll update the series and post it as an
RFC. The monolithic patch on top of the fixes for sched/numa_balancing is as follows.

diff --git a/include/linux/mmzone.h b/include/linux/mmzone.h
index 0dbe1d5bb936..eea5f82ca447 100644
--- a/include/linux/mmzone.h
+++ b/include/linux/mmzone.h
@@ -669,11 +669,6 @@ typedef struct pglist_data {
        struct task_struct *kcompactd;
 #endif
 #ifdef CONFIG_NUMA_BALANCING
-       /* Rate limiting time interval */
-       unsigned long numabalancing_migrate_next_window;
-
-       /* Number of pages migrated during the rate limiting time interval */
-       unsigned long numabalancing_migrate_nr_pages;
        int active_node_migrate;
 #endif
        /*
diff --git a/include/trace/events/migrate.h b/include/trace/events/migrate.h
index 711372845945..de8c73f9abcf 100644
--- a/include/trace/events/migrate.h
+++ b/include/trace/events/migrate.h
@@ -71,32 +71,6 @@ TRACE_EVENT(mm_migrate_pages,
                __print_symbolic(__entry->reason, MIGRATE_REASON))
 );
 
-TRACE_EVENT(mm_numa_migrate_ratelimit,
-
-       TP_PROTO(struct task_struct *p, int dst_nid, unsigned long nr_pages),
-
-       TP_ARGS(p, dst_nid, nr_pages),
-
-       TP_STRUCT__entry(
-               __array(        char,           comm,   TASK_COMM_LEN)
-               __field(        pid_t,          pid)
-               __field(        int,            dst_nid)
-               __field(        unsigned long,  nr_pages)
-       ),
-
-       TP_fast_assign(
-               memcpy(__entry->comm, p->comm, TASK_COMM_LEN);
-               __entry->pid            = p->pid;
-               __entry->dst_nid        = dst_nid;
-               __entry->nr_pages       = nr_pages;
-       ),
-
-       TP_printk("comm=%s pid=%d dst_nid=%d nr_pages=%lu",
-               __entry->comm,
-               __entry->pid,
-               __entry->dst_nid,
-               __entry->nr_pages)
-);
 #endif /* _TRACE_MIGRATE_H */
 
 /* This part must be outside protection */
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 6ca3be059872..c020af2c58ec 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -1394,6 +1394,17 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
        int last_cpupid, this_cpupid;
 
        this_cpupid = cpu_pid_to_cpupid(dst_cpu, current->pid);
+       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
+
+       /*
+        * Allow first faults or private faults to migrate immediately early in
+        * the lifetime of a task. The magic number 4 is based on waiting for
+        * two full passes of the "multi-stage node selection" test that is
+        * executed below.
+        */
+       if ((p->numa_preferred_nid == -1 || p->numa_scan_seq <= 4) &&
+           (cpupid_pid_unset(last_cpupid) || cpupid_match_pid(p, last_cpupid)))
+               return true;
 
        /*
         * Multi-stage node selection is used in conjunction with a periodic
@@ -1412,7 +1423,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
         * This quadric squishes small probabilities, making it less likely we
         * act on an unlikely task<->page relation.
         */
-       last_cpupid = page_cpupid_xchg_last(page, this_cpupid);
        if (!cpupid_pid_unset(last_cpupid) &&
                                cpupid_to_nid(last_cpupid) != dst_nid)
                return false;
@@ -6702,6 +6712,9 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unus
        p->se.exec_start = 0;
 
 #ifdef CONFIG_NUMA_BALANCING
+       if (!static_branch_likely(&sched_numa_balancing))
+               return;
+
        if (!p->mm || (p->flags & PF_EXITING))
                return;
 
@@ -6709,8 +6722,26 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu __maybe_unus
                int src_nid = cpu_to_node(task_cpu(p));
                int dst_nid = cpu_to_node(new_cpu);
 
-               if (src_nid != dst_nid)
-                       p->numa_scan_period = task_scan_start(p);
+               if (src_nid == dst_nid)
+                       return;
+
+               /*
+                * Allow resets if faults have been trapped before one scan
+                * has completed. This is most likely due to a new task that
+                * is pulled cross-node due to wakeups or load balancing.
+                */
+               if (p->numa_scan_seq) {
+                       /*
+                        * Avoid scan adjustments if moving to the preferred
+                        * node or if the task was not previously running on
+                        * the preferred node.
+                        */
+                       if (dst_nid == p->numa_preferred_nid ||
+                           (p->numa_preferred_nid != -1 && src_nid != p->numa_preferred_nid))
+                               return;
+               }
+
+               p->numa_scan_period = task_scan_start(p);
        }
 #endif
 }
diff --git a/mm/migrate.c b/mm/migrate.c
index c7749902a160..f935f4781036 100644
--- a/mm/migrate.c
+++ b/mm/migrate.c
@@ -1856,54 +1856,6 @@ static struct page *alloc_misplaced_dst_page(struct page *page,
        return newpage;
 }
 
-/*
- * page migration rate limiting control.
- * Do not migrate more than @pages_to_migrate in a @migrate_interval_millisecs
- * window of time. Default here says do not migrate more than 1280M per second.
- */
-static unsigned int migrate_interval_millisecs __read_mostly = 100;
-static unsigned int ratelimit_pages __read_mostly = 128 << (20 - PAGE_SHIFT);
-
-/* Returns true if the node is migrate rate-limited after the update */
-static bool numamigrate_update_ratelimit(pg_data_t *pgdat,
-                                       unsigned long nr_pages)
-{
-       unsigned long next_window, interval;
-
-       next_window = READ_ONCE(pgdat->numabalancing_migrate_next_window);
-       interval = msecs_to_jiffies(migrate_interval_millisecs);
-
-       /*
-        * Rate-limit the amount of data that is being migrated to a node.
-        * Optimal placement is no good if the memory bus is saturated and
-        * all the time is being spent migrating!
-        */
-       if (time_after(jiffies, next_window)) {
-               if (xchg(&pgdat->numabalancing_migrate_nr_pages, 0)) {
-                       do {
-                               next_window += interval;
-                       } while (unlikely(time_after(jiffies, next_window)));
-
-                       WRITE_ONCE(pgdat->numabalancing_migrate_next_window,
-                                                              next_window);
-               }
-       }
-       if (pgdat->numabalancing_migrate_nr_pages > ratelimit_pages) {
-               trace_mm_numa_migrate_ratelimit(current, pgdat->node_id,
-                                                               nr_pages);
-               return true;
-       }
-
-       /*
-        * This is an unlocked non-atomic update so errors are possible.
-        * The consequences are failing to migrate when we potentiall should
-        * have which is not severe enough to warrant locking. If it is ever
-        * a problem, it can be converted to a per-cpu counter.
-        */
-       pgdat->numabalancing_migrate_nr_pages += nr_pages;
-       return false;
-}
-
 static int numamigrate_isolate_page(pg_data_t *pgdat, struct page *page)
 {
        int page_lru;
@@ -1976,14 +1928,6 @@ int migrate_misplaced_page(struct page *page, struct vm_area_struct *vma,
        if (page_is_file_cache(page) && PageDirty(page))
                goto out;
 
-       /*
-        * Rate-limit the amount of data that is being migrated to a node.
-        * Optimal placement is no good if the memory bus is saturated and
-        * all the time is being spent migrating!
-        */
-       if (numamigrate_update_ratelimit(pgdat, 1))
-               goto out;
-
        isolated = numamigrate_isolate_page(pgdat, page);
        if (!isolated)
                goto out;
@@ -2030,14 +1974,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
        unsigned long mmun_start = address & HPAGE_PMD_MASK;
        unsigned long mmun_end = mmun_start + HPAGE_PMD_SIZE;
 
-       /*
-        * Rate-limit the amount of data that is being migrated to a node.
-        * Optimal placement is no good if the memory bus is saturated and
-        * all the time is being spent migrating!
-        */
-       if (numamigrate_update_ratelimit(pgdat, HPAGE_PMD_NR))
-               goto out_dropref;
-
        new_page = alloc_pages_node(node,
                (GFP_TRANSHUGE_LIGHT | __GFP_THISNODE),
                HPAGE_PMD_ORDER);
@@ -2134,7 +2070,6 @@ int migrate_misplaced_transhuge_page(struct mm_struct *mm,
 
 out_fail:
        count_vm_events(PGMIGRATE_FAIL, HPAGE_PMD_NR);
-out_dropref:
        ptl = pmd_lock(mm, pmd);
        if (pmd_same(*pmd, entry)) {
                entry = pmd_modify(entry, vma->vm_page_prot);
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index a4fc9b0798df..9049e7b26e92 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -6211,11 +6211,6 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat)
        int nid = pgdat->node_id;
 
        pgdat_resize_init(pgdat);
-#ifdef CONFIG_NUMA_BALANCING
-       pgdat->numabalancing_migrate_nr_pages = 0;
-       pgdat->active_node_migrate = 0;
-       pgdat->numabalancing_migrate_next_window = jiffies;
-#endif
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
        spin_lock_init(&pgdat->split_queue_lock);
        INIT_LIST_HEAD(&pgdat->split_queue);

-- 
Mel Gorman
SUSE Labs
