On Wed, Apr 10, 2013 at 10:29:12AM -0500, Russ Anderson wrote: > On Wed, Apr 10, 2013 at 08:10:05AM -0700, Linus Torvalds wrote: > > > > Yeah, we've had issues with ACPI in the past, so I do think we should > > always reboot using the BP. Even if it almost certainly works on 99+% > > of all machines on any random CPU. > > > > The optimal solution would be to just speed up the > > disable_nonboot_cpus() code so much that it isn't an issue. That would > > be good for suspending too, although I guess suspend isn't a big issue > > if you have a thousand CPU's. > > > > Has anybody checked whether we could do the cpu_down() on non-boot > > CPU's in parallel? Right now we serialize the thing completely, with > > one single > > > > for_each_online_cpu(cpu) { > > ... > > > > loop that does a synchronous _cpu_down() for each CPU. No wonder it > > takes forever. We do __stop_machine() over and over and over again: > > the whole thing is basically O(n**2) in CPU's. > > Yes, I have a test patch that replaces for_each_online_cpu(cpu) > with a cpu bitmask in disable_nonboot_cpus(). The lower level > routines already take a bitmask. It allows __stop_machine() to > be called just once. That change reduces shutdown time on a > 1024 cpu machine from 16 minutes to 4 minutes. Significant improvement, > but not good enough. > > The next significant bottleneck is __cpu_notify(). Tried creating > worker threads to parallelize the shutdown, but the problem is > __cpu_notify() is not thread safe. Putting a lock around it > caused all the worker threads to fight over the lock. > > Note that __cpu_notify() has to be called for all cpus being > shut down because the cpu_chain notifier call chain has cpu as a > parameter. The dilemma is that cpu_chain notifiers need to be called on > all cpus, but cannot be done in parallel due to __cpu_notify() not being > thread safe. Spinning through the notifier chain sequentially for all > cpus just takes a long time. 
> > The real fix would be to make the &cpu_chain notifier per cpu, or at > least thread safe, so that all the cpus being shut down could do so > in parallel. That is a significant change with ramifications on > other code. > > I will post a patch shortly with the cpu bitmask change. Changing > __cpu_notify() will take more discussion.
Here is the test patch with the cpu bitmask change. It results in calling __stop_machine() just once. After making feedback changes I'll formally submit the patch. --- kernel/cpu.c | 94 +++++++++++++++++++++++++++++++++++------------------------ 1 file changed, 56 insertions(+), 38 deletions(-) Index: linux/kernel/cpu.c =================================================================== --- linux.orig/kernel/cpu.c 2013-04-10 17:16:40.084960495 -0500 +++ linux/kernel/cpu.c 2013-04-10 17:17:27.988245160 -0500 @@ -262,10 +262,11 @@ static int __ref take_cpu_down(void *_pa } /* Requires cpu_add_remove_lock to be held */ -static int __ref _cpu_down(unsigned int cpu, int tasks_frozen) +static int __ref _cpu_down(const cpumask_t *cpus_to_offline, int tasks_frozen) { - int err, nr_calls = 0; + int cpu = 0, err = 0, nr_calls = 0; void *hcpu = (void *)(long)cpu; + cpumask_var_t cpus_offlined; unsigned long mod = tasks_frozen ? CPU_TASKS_FROZEN : 0; struct take_cpu_down_param tcd_param = { .mod = mod, @@ -278,46 +279,65 @@ static int __ref _cpu_down(unsigned int if (!cpu_online(cpu)) return -EINVAL; + if (!alloc_cpumask_var(&cpus_offlined, GFP_KERNEL)) + return -ENOMEM; + cpu_hotplug_begin(); + cpumask_clear(cpus_offlined); + cpumask_copy(cpus_offlined, cpus_to_offline); - err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); - if (err) { - nr_calls--; - __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); - printk("%s: attempt to take down CPU %u failed\n", + for_each_cpu_mask(cpu, *cpus_to_offline) { + hcpu = (void *)(long)cpu; + if (!cpu_online(cpu)) + continue; + tcd_param.hcpu = hcpu; + err = __cpu_notify(CPU_DOWN_PREPARE | mod, hcpu, -1, &nr_calls); + if (err) { + nr_calls--; + __cpu_notify(CPU_DOWN_FAILED | mod, hcpu, nr_calls, NULL); + pr_err("%s: attempt to take down CPU %u failed\n", __func__, cpu); - goto out_release; + goto out_release; + } + smpboot_park_threads(cpu); } - smpboot_park_threads(cpu); - err = __stop_machine(take_cpu_down, 
&tcd_param, cpumask_of(cpu)); + err = __stop_machine(take_cpu_down, &tcd_param, cpus_to_offline); if (err) { /* CPU didn't die: tell everyone. Can't complain. */ - smpboot_unpark_threads(cpu); - cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + for_each_cpu_mask(cpu, *cpus_to_offline) { + hcpu = (void *)(long)cpu; + smpboot_unpark_threads(cpu); + cpu_notify_nofail(CPU_DOWN_FAILED | mod, hcpu); + } goto out_release; } - BUG_ON(cpu_online(cpu)); /* * The migration_call() CPU_DYING callback will have removed all * runnable tasks from the cpu, there's only the idle task left now * that the migration thread is done doing the stop_machine thing. - * - * Wait for the stop thread to go away. */ - while (!idle_cpu(cpu)) - cpu_relax(); + for_each_cpu_mask(cpu, *cpus_offlined) { + BUG_ON(cpu_online(cpu)); - /* This actually kills the CPU. */ - __cpu_die(cpu); - - /* CPU is completely dead: tell everyone. Too late to complain. */ - cpu_notify_nofail(CPU_DEAD | mod, hcpu); - - check_for_tasks(cpu); + /* + * Wait for the stop thread to go away. + */ + while (!idle_cpu(cpu)) + cpu_relax(); + + /* This actually kills the CPU. */ + __cpu_die(cpu); + + /* CPU is completely dead: tell everyone. Too late to complain. 
*/ + hcpu = (void *)(long)cpu; + cpu_notify_nofail(CPU_DEAD | mod, hcpu); + check_for_tasks(cpu); + } out_release: + free_cpumask_var(cpus_offlined); cpu_hotplug_done(); if (!err) cpu_notify_nofail(CPU_POST_DEAD | mod, hcpu); @@ -327,6 +347,7 @@ out_release: int __ref cpu_down(unsigned int cpu) { int err; + cpumask_var_t cpumask; cpu_maps_update_begin(); @@ -335,7 +356,11 @@ int __ref cpu_down(unsigned int cpu) goto out; } - err = _cpu_down(cpu, 0); + if (!alloc_cpumask_var(&cpumask, GFP_KERNEL)) + return -ENOMEM; + cpumask_set_cpu(cpu, cpumask); + err = _cpu_down(cpumask, 0); + free_cpumask_var(cpumask); out: cpu_maps_update_done(); @@ -459,7 +484,7 @@ static cpumask_var_t frozen_cpus; int disable_nonboot_cpus(void) { - int cpu, first_cpu, error = 0; + int first_cpu, error = 0; cpu_maps_update_begin(); first_cpu = cpumask_first(cpu_online_mask); @@ -470,18 +495,11 @@ int disable_nonboot_cpus(void) cpumask_clear(frozen_cpus); printk("Disabling non-boot CPUs ...\n"); - for_each_online_cpu(cpu) { - if (cpu == first_cpu) - continue; - error = _cpu_down(cpu, 1); - if (!error) - cpumask_set_cpu(cpu, frozen_cpus); - else { - printk(KERN_ERR "Error taking CPU%d down: %d\n", - cpu, error); - break; - } - } + cpumask_copy(frozen_cpus, cpu_online_mask); + cpumask_clear_cpu(first_cpu, frozen_cpus); /* all but one cpu*/ + error = _cpu_down(frozen_cpus, 1); + if (error) + pr_err("Error %d stopping cpus\n", error); if (!error) { BUG_ON(num_online_cpus() > 1); -- Russ Anderson, OS RAS/Partitioning Project Lead SGI - Silicon Graphics Inc r...@sgi.com -- To unsubscribe from this list: send the line "unsubscribe linux-kernel" in the body of a message to majord...@vger.kernel.org More majordomo info at http://vger.kernel.org/majordomo-info.html Please read the FAQ at http://www.tux.org/lkml/