Oh boy oh boy. This thing runs at SCHED_FIFO MAX_USER_RT_PRIO/2 and
stops at mwait_idle_with_hints(). Why bother with /2?
There are a few things I haven't fully decoded. For instance why is it
looking at local_softirq_pending()?
The timer is probably here to wake the CPU back up in case mwait would
otherwise let it sleep too long.

I tried to convert it over to an smpboot thread so we no longer need the
CPU notifier to start and stop the per-CPU threads during hotplug events.

the smp_mb() barriers are not documented - I just shifted the code to
the left.

The code / logic itself could be a little more intelligent and only wake
up the threads for the CPUs that are about to idle, but as far as I can
tell it is done on all of them at once unless I missed something.

Cc: Zhang Rui <rui.zh...@intel.com>
Cc: Eduardo Valentin <edubez...@gmail.com>
Cc: linux...@vger.kernel.org
Signed-off-by: Sebastian Andrzej Siewior <bige...@linutronix.de>
---
 drivers/thermal/intel_powerclamp.c | 315 ++++++++++++++++---------------------
 1 file changed, 136 insertions(+), 179 deletions(-)

diff --git a/drivers/thermal/intel_powerclamp.c 
b/drivers/thermal/intel_powerclamp.c
index 6c79588251d5..e04f7631426a 100644
--- a/drivers/thermal/intel_powerclamp.c
+++ b/drivers/thermal/intel_powerclamp.c
@@ -51,6 +51,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/sched/rt.h>
+#include <linux/smpboot.h>
 
 #include <asm/nmi.h>
 #include <asm/msr.h>
@@ -85,9 +86,9 @@ static unsigned int control_cpu; /* The cpu assigned to 
collect stat and update
                                  * can be offlined.
                                  */
 static bool clamping;
+static DEFINE_PER_CPU(struct task_struct *, clamp_kthreads);
+static DEFINE_PER_CPU(struct timer_list, clamp_timer);
 
-
-static struct task_struct * __percpu *powerclamp_thread;
 static struct thermal_cooling_device *cooling_dev;
 static unsigned long *cpu_clamping_mask;  /* bit map for tracking per cpu
                                           * clamping thread
@@ -368,100 +369,82 @@ static bool powerclamp_adjust_controls(unsigned int 
target_ratio,
        return set_target_ratio + guard <= current_ratio;
 }
 
-static int clamp_thread(void *arg)
+static void clamp_thread_fn(unsigned int cpu)
 {
-       int cpunr = (unsigned long)arg;
-       DEFINE_TIMER(wakeup_timer, noop_timer, 0, 0);
-       static const struct sched_param param = {
-               .sched_priority = MAX_USER_RT_PRIO/2,
-       };
        unsigned int count = 0;
        unsigned int target_ratio;
+       int sleeptime;
+       unsigned long target_jiffies;
+       unsigned int guard;
+       unsigned int compensation = 0;
+       int interval; /* jiffies to sleep for each attempt */
+       unsigned int duration_jiffies = msecs_to_jiffies(duration);
+       unsigned int window_size_now;
+       struct timer_list *wake_timer = per_cpu_ptr(&clamp_timer, cpu);
 
-       set_bit(cpunr, cpu_clamping_mask);
-       set_freezable();
-       init_timer_on_stack(&wakeup_timer);
-       sched_setscheduler(current, SCHED_FIFO, &param);
+       /*
+        * make sure user selected ratio does not take effect until
+        * the next round. adjust target_ratio if user has changed
+        * target such that we can converge quickly.
+        */
+       target_ratio = set_target_ratio;
+       guard = 1 + target_ratio/20;
+       window_size_now = window_size;
+       count++;
 
-       while (true == clamping && !kthread_should_stop() &&
-               cpu_online(cpunr)) {
-               int sleeptime;
-               unsigned long target_jiffies;
-               unsigned int guard;
-               unsigned int compensation = 0;
-               int interval; /* jiffies to sleep for each attempt */
-               unsigned int duration_jiffies = msecs_to_jiffies(duration);
-               unsigned int window_size_now;
+       /*
+        * systems may have different ability to enter package level
+        * c-states, thus we need to compensate the injected idle ratio
+        * to achieve the actual target reported by the HW.
+        */
+       compensation = get_compensation(target_ratio);
+       interval = duration_jiffies*100/(target_ratio+compensation);
 
-               try_to_freeze();
-               /*
-                * make sure user selected ratio does not take effect until
-                * the next round. adjust target_ratio if user has changed
-                * target such that we can converge quickly.
-                */
-               target_ratio = set_target_ratio;
-               guard = 1 + target_ratio/20;
-               window_size_now = window_size;
-               count++;
-
-               /*
-                * systems may have different ability to enter package level
-                * c-states, thus we need to compensate the injected idle ratio
-                * to achieve the actual target reported by the HW.
-                */
-               compensation = get_compensation(target_ratio);
-               interval = duration_jiffies*100/(target_ratio+compensation);
-
-               /* align idle time */
-               target_jiffies = roundup(jiffies, interval);
-               sleeptime = target_jiffies - jiffies;
-               if (sleeptime <= 0)
-                       sleeptime = 1;
-               schedule_timeout_interruptible(sleeptime);
-               /*
-                * only elected controlling cpu can collect stats and update
-                * control parameters.
-                */
-               if (cpunr == control_cpu && !(count%window_size_now)) {
-                       should_skip =
-                               powerclamp_adjust_controls(target_ratio,
-                                                       guard, window_size_now);
-                       smp_mb();
-               }
-
-               if (should_skip)
-                       continue;
-
-               target_jiffies = jiffies + duration_jiffies;
-               mod_timer(&wakeup_timer, target_jiffies);
-               if (unlikely(local_softirq_pending()))
-                       continue;
-               /*
-                * stop tick sched during idle time, interrupts are still
-                * allowed. thus jiffies are updated properly.
-                */
-               preempt_disable();
-               /* mwait until target jiffies is reached */
-               while (time_before(jiffies, target_jiffies)) {
-                       unsigned long ecx = 1;
-                       unsigned long eax = target_mwait;
-
-                       /*
-                        * REVISIT: may call enter_idle() to notify drivers who
-                        * can save power during cpu idle. same for exit_idle()
-                        */
-                       local_touch_nmi();
-                       stop_critical_timings();
-                       mwait_idle_with_hints(eax, ecx);
-                       start_critical_timings();
-                       atomic_inc(&idle_wakeup_counter);
-               }
-               preempt_enable();
+       /* align idle time */
+       target_jiffies = roundup(jiffies, interval);
+       sleeptime = target_jiffies - jiffies;
+       if (sleeptime <= 0)
+               sleeptime = 1;
+       schedule_timeout_interruptible(sleeptime);
+       /*
+        * only elected controlling cpu can collect stats and update
+        * control parameters.
+        */
+       if (cpu == control_cpu && !(count%window_size_now)) {
+               should_skip =
+                       powerclamp_adjust_controls(target_ratio,
+                                                  guard, window_size_now);
+               smp_mb();
        }
-       del_timer_sync(&wakeup_timer);
-       clear_bit(cpunr, cpu_clamping_mask);
 
-       return 0;
+       if (should_skip)
+               return;
+
+       target_jiffies = jiffies + duration_jiffies;
+       mod_timer(wake_timer, target_jiffies);
+       if (unlikely(local_softirq_pending()))
+               return;
+       /*
+        * stop tick sched during idle time, interrupts are still
+        * allowed. thus jiffies are updated properly.
+        */
+       preempt_disable();
+       /* mwait until target jiffies is reached */
+       while (time_before(jiffies, target_jiffies)) {
+               unsigned long ecx = 1;
+               unsigned long eax = target_mwait;
+
+               /*
+                * REVISIT: may call enter_idle() to notify drivers who
+                * can save power during cpu idle. same for exit_idle()
+                */
+               local_touch_nmi();
+               stop_critical_timings();
+               mwait_idle_with_hints(eax, ecx);
+               start_critical_timings();
+               atomic_inc(&idle_wakeup_counter);
+       }
+       preempt_enable();
 }
 
 /*
@@ -505,10 +488,64 @@ static void poll_pkg_cstate(struct work_struct *dummy)
                schedule_delayed_work(&poll_pkg_cstate_work, HZ);
 }
 
+static void clamp_thread_setup(unsigned int cpu)
+{
+       struct timer_list *wake_timer;
+       static struct sched_param param = {
+               .sched_priority = MAX_USER_RT_PRIO/2,
+       };
+
+       sched_setscheduler(current, SCHED_FIFO, &param);
+       wake_timer = per_cpu_ptr(&clamp_timer, cpu);
+
+       setup_timer(wake_timer, noop_timer, 0);
+}
+
+static void clamp_thread_unpark(unsigned int cpu)
+{
+       set_bit(cpu, cpu_clamping_mask);
+       if (cpu == 0) {
+               control_cpu = 0;
+               smp_mb();
+       }
+}
+
+static void clamp_thread_park(unsigned int cpu)
+{
+       clear_bit(cpu, cpu_clamping_mask);
+       if (cpu == control_cpu) {
+               control_cpu = cpumask_any_but(cpu_online_mask, cpu);
+               smp_mb();
+       }
+       del_timer_sync(per_cpu_ptr(&clamp_timer, cpu));
+}
+
+static void clamp_thread_cleanup(unsigned int cpu, bool online)
+{
+       if (!online)
+               return;
+       clamp_thread_park(cpu);
+}
+
+static int clamp_thread_should_run(unsigned int cpu)
+{
+       return clamping == true;
+}
+
+static struct smp_hotplug_thread clamp_threads = {
+       .store                  = &clamp_kthreads,
+       .setup                  = clamp_thread_setup,
+       .cleanup                = clamp_thread_cleanup,
+       .thread_should_run      = clamp_thread_should_run,
+       .thread_fn              = clamp_thread_fn,
+       .park                   = clamp_thread_park,
+       .unpark                 = clamp_thread_unpark,
+       .thread_comm            = "kidle_inject/%u",
+};
+
 static int start_power_clamp(void)
 {
-       unsigned long cpu;
-       struct task_struct *thread;
+       unsigned int cpu;
 
        /* check if pkg cstate counter is completely 0, abort in this case */
        if (!has_pkg_state_counter()) {
@@ -528,23 +565,9 @@ static int start_power_clamp(void)
        clamping = true;
        schedule_delayed_work(&poll_pkg_cstate_work, 0);
 
-       /* start one thread per online cpu */
-       for_each_online_cpu(cpu) {
-               struct task_struct **p =
-                       per_cpu_ptr(powerclamp_thread, cpu);
+       for_each_online_cpu(cpu)
+               wake_up_process(per_cpu_ptr(clamp_kthreads, cpu));
 
-               thread = kthread_create_on_node(clamp_thread,
-                                               (void *) cpu,
-                                               cpu_to_node(cpu),
-                                               "kidle_inject/%ld", cpu);
-               /* bind to cpu here */
-               if (likely(!IS_ERR(thread))) {
-                       kthread_bind(thread, cpu);
-                       wake_up_process(thread);
-                       *p = thread;
-               }
-
-       }
        put_online_cpus();
 
        return 0;
@@ -552,9 +575,6 @@ static int start_power_clamp(void)
 
 static void end_power_clamp(void)
 {
-       int i;
-       struct task_struct *thread;
-
        clamping = false;
        /*
         * make clamping visible to other cpus and give per cpu clamping threads
@@ -562,63 +582,8 @@ static void end_power_clamp(void)
         */
        smp_mb();
        msleep(20);
-       if (bitmap_weight(cpu_clamping_mask, num_possible_cpus())) {
-               for_each_set_bit(i, cpu_clamping_mask, num_possible_cpus()) {
-                       pr_debug("clamping thread for cpu %d alive, kill\n", i);
-                       thread = *per_cpu_ptr(powerclamp_thread, i);
-                       kthread_stop(thread);
-               }
-       }
 }
 
-static int powerclamp_cpu_callback(struct notifier_block *nfb,
-                               unsigned long action, void *hcpu)
-{
-       unsigned long cpu = (unsigned long)hcpu;
-       struct task_struct *thread;
-       struct task_struct **percpu_thread =
-               per_cpu_ptr(powerclamp_thread, cpu);
-
-       if (false == clamping)
-               goto exit_ok;
-
-       switch (action) {
-       case CPU_ONLINE:
-               thread = kthread_create_on_node(clamp_thread,
-                                               (void *) cpu,
-                                               cpu_to_node(cpu),
-                                               "kidle_inject/%lu", cpu);
-               if (likely(!IS_ERR(thread))) {
-                       kthread_bind(thread, cpu);
-                       wake_up_process(thread);
-                       *percpu_thread = thread;
-               }
-               /* prefer BSP as controlling CPU */
-               if (cpu == 0) {
-                       control_cpu = 0;
-                       smp_mb();
-               }
-               break;
-       case CPU_DEAD:
-               if (test_bit(cpu, cpu_clamping_mask)) {
-                       pr_err("cpu %lu dead but powerclamping thread is not\n",
-                               cpu);
-                       kthread_stop(*percpu_thread);
-               }
-               if (cpu == control_cpu) {
-                       control_cpu = smp_processor_id();
-                       smp_mb();
-               }
-       }
-
-exit_ok:
-       return NOTIFY_OK;
-}
-
-static struct notifier_block powerclamp_cpu_notifier = {
-       .notifier_call = powerclamp_cpu_callback,
-};
-
 static int powerclamp_get_max_state(struct thermal_cooling_device *cdev,
                                 unsigned long *state)
 {
@@ -788,19 +753,15 @@ static int __init powerclamp_init(void)
 
        /* set default limit, maybe adjusted during runtime based on feedback */
        window_size = 2;
-       register_hotcpu_notifier(&powerclamp_cpu_notifier);
-
-       powerclamp_thread = alloc_percpu(struct task_struct *);
-       if (!powerclamp_thread) {
-               retval = -ENOMEM;
-               goto exit_unregister;
-       }
+       retval = smpboot_register_percpu_thread(&clamp_threads);
+       if (retval)
+               goto exit_free;
 
        cooling_dev = thermal_cooling_device_register("intel_powerclamp", NULL,
                                                &powerclamp_cooling_ops);
        if (IS_ERR(cooling_dev)) {
                retval = -ENODEV;
-               goto exit_free_thread;
+               goto exit_free_smp_thread;
        }
 
        if (!duration)
@@ -809,11 +770,8 @@ static int __init powerclamp_init(void)
        powerclamp_create_debug_files();
 
        return 0;
-
-exit_free_thread:
-       free_percpu(powerclamp_thread);
-exit_unregister:
-       unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
+exit_free_smp_thread:
+       smpboot_unregister_percpu_thread(&clamp_threads);
 exit_free:
        kfree(cpu_clamping_mask);
        return retval;
@@ -822,9 +780,8 @@ module_init(powerclamp_init);
 
 static void __exit powerclamp_exit(void)
 {
-       unregister_hotcpu_notifier(&powerclamp_cpu_notifier);
        end_power_clamp();
-       free_percpu(powerclamp_thread);
+       smpboot_unregister_percpu_thread(&clamp_threads);
        thermal_cooling_device_unregister(cooling_dev);
        kfree(cpu_clamping_mask);
 
-- 
2.7.0

Reply via email to