On Mon, Nov 09, 2015 at 04:21:23PM -0800, Jacob Pan wrote:
> +++ b/include/trace/events/sched.h

> +/*
> + * Tracepoint for idle injection
> + */
> +TRACE_EVENT(sched_cfs_idle_inject,
> +
> +     TP_PROTO(char *msg, int throttled),
> +
> +     TP_ARGS(msg, throttled),
> +
> +     TP_STRUCT__entry(
> +             __string(msg, msg)
> +             __field(int, throttled)
> +     ),
> +
> +     TP_fast_assign(
> +             __assign_str(msg, msg);
> +             __entry->throttled = throttled;
> +     ),
> +
> +     TP_printk("%s: throttled=%d", __get_str(msg), __entry->throttled)
> +);

So I hate tracepoints.. and I'd rather not see them. But at the very
least kill that @msg field and replace it with an enum or so.


> +/*
> + * Knobs for controlling percentage of time when idle is forced across all
> + * CPUs. This is a power management feature intended for achieving deepest
> + * and broadest idle without lower CPU frequencies to less optimal level.
> + * No action is taken if CPUs are natually idle.
> + */
> +#ifdef CONFIG_CFS_IDLE_INJECT
> +unsigned int sysctl_sched_cfs_idle_inject_pct;
> +unsigned int sysctl_sched_cfs_idle_inject_duration = 10UL;

since you're playing the ifdef game, you might as well also do:

static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
        if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
                cfs_rq->runnable = true;
}

static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
        return cfs_rq->runnable;
}

#else

static inline void cfs_rq_nr_running_inc(struct cfs_rq *cfs_rq)
{
        cfs_rq->nr_running++;
}

static inline bool cfs_rq_runnable(struct cfs_rq *cfs_rq)
{
        return !!cfs_rq->nr_running;
}

> +#endif
> +
>  static inline void update_load_add(struct load_weight *lw, unsigned long inc)
>  {
>       lw->weight += inc;
> @@ -2334,7 +2346,9 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct 
> sched_entity *se)
>               list_add(&se->group_node, &rq->cfs_tasks);
>       }
>  #endif
> -     cfs_rq->nr_running++;
> +
> +     if (!cfs_rq->nr_running++ && !cfs_rq->forced_idle)
> +             cfs_rq->runnable = true;

which makes that:
        cfs_rq_nr_running_inc();

>  }
>  
>  static void
> @@ -2347,7 +2361,9 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct 
> sched_entity *se)
>               account_numa_dequeue(rq_of(cfs_rq), task_of(se));
>               list_del_init(&se->group_node);
>       }
> -     cfs_rq->nr_running--;
> +
> +     if (!--cfs_rq->nr_running && !cfs_rq->forced_idle)
> +             cfs_rq->runnable = false;

        cfs_rq_nr_running_dec();

>  }
>  
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> @@ -5139,7 +5155,7 @@ pick_next_task_fair(struct rq *rq, struct task_struct 
> *prev)
>  
>  again:
>  #ifdef CONFIG_FAIR_GROUP_SCHED
> -     if (!cfs_rq->nr_running)
> +     if (!cfs_rq->runnable)

        if (!cfs_rq_runnable(cfs_rq))
>               goto idle;
>  
>       if (prev->sched_class != &fair_sched_class)

>  idle:
> +     if ((cfs_rq->forced_idle)) {
> +             if (unlikely(local_softirq_pending())) {
> +                     trace_sched_cfs_idle_inject("softirq pending", 1);

> +                     cfs_rq->forced_idle = false;
> +                     cfs_rq->runnable = cfs_rq->nr_running;

maybe:
                        __unthrottle_cfs_rq(cfs_rq); ?

> +                     goto again;
> +             }
> +             trace_sched_cfs_idle_inject("forced idle", 1);
> +             return NULL;
> +     }
>       /*
>        * This is OK, because current is on_cpu, which avoids it being picked
>        * for load-balance and preemption/IRQs are still disabled avoiding
> @@ -8318,3 +8344,350 @@ __init void init_sched_fair_class(void)
>  #endif /* SMP */
>  
>  }
> +
> +#ifdef CONFIG_CFS_IDLE_INJECT

> +static atomic_t idle_inject_active;

You only use atomic_{read,set} on this, therefore atomic_t is pointless.

> +static DEFINE_PER_CPU(struct hrtimer, idle_inject_timer);
> +static DEFINE_PER_CPU(bool, idle_injected);

I tend to prefer to not use bool as a storage class; it's ill-defined.

> +/* protect injection parameters from runtime changes */
> +static DEFINE_SPINLOCK(idle_inject_lock);

A global lock, yay :-). I think you want this to be a RAW_SPINLOCK
though, as on -RT this would want to actually run from IRQ context too.

> +static enum hrtimer_restart idle_inject_timer_fn(struct hrtimer *hrtimer)
> +{
> +     struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +     int cpu = smp_processor_id();
> +     ktime_t now, delta, period;
> +     bool status;
> +
> +     now = hrtimer_cb_get_time(hrt);

You're not interested in the current time.

> +
> +     status = raw_cpu_read(idle_injected);
> +     if (status) {
> +             /*
> +              * We were injecting idle in the last phase, let's forward the
> +              * timer to the next period
> +              *
> +              * status: 1             0                1        0
> +              * ____          ____________________           _______
> +              *     |________|                    |_________|
> +              *
> +              *     |duration|      interval      |
> +              *
> +              *              ^ we are here
> +              *                  forward to here: ^
> +              */
> +             delta = ktime_sub(now, inject_start_time);
> +             period = ktime_add(ms_to_ktime(duration),
> +                             ms_to_ktime(inject_interval));
> +             delta = ktime_roundup(delta, period);
> +             hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

This doesn't make any sense. Who cares what the current time is?

> +     } else {
> +             /*
> +              * We were not injecting idle in the last phase, let's forward
> +              * timer after forced idle duration
> +              * ____          ____________________           _______
> +              *     |________|                    |_________|
> +              *
> +              *     |duration|      interval      |
> +              *
> +              *     ^ we are here
> +              *              ^ forward timer to here
> +              */
> +             hrtimer_set_expires(hrt, ktime_add(ms_to_ktime(duration), now));

Same here, we don't care about the current time. The timer was at the
previous start of injection, just forward it a whole period to find the
next injection slot.

> +     }

It looks like what you want is:

        hrtimer_forward(hrt, period);

unconditionally.

> +     raw_cpu_write(idle_injected, !status);
> +     trace_sched_cfs_idle_inject("idle sync timer", !status);
> +     if (status)
> +             unthrottle_rq(cpu);
> +     else
> +             throttle_rq(cpu);
> +
> +     return HRTIMER_RESTART;
> +}
> +
> +static void idle_inject_timer_start(void *info)
> +{
> +     int cpu = smp_processor_id();
> +     struct hrtimer *hrt = this_cpu_ptr(&idle_inject_timer);
> +
> +     this_cpu_write(idle_injected, true);
> +     set_bit(cpu, idle_inject_cpumask);
> +     hrtimer_start(hrt, ms_to_ktime(duration), HRTIMER_MODE_ABS_PINNED);
> +     hrtimer_set_expires(hrt, *(ktime_t *)info);

This is broken, _first_ set an expiration time, then start the timer.

Now you insert the timer into the RB tree on a previous expiration time,
then you modify the expiration time under it, effectively wrecking the
RB tree.

> +}

> +static void stop_idle_inject(void)
> +{
> +     int i;
> +     struct hrtimer *hrt;
> +
> +     if (bitmap_weight(idle_inject_cpumask, num_possible_cpus())) {

I don't get the point of this bitmap; with the cpu notifier you
basically ensure this is equal to online_mask.

Also, this weight test is pointless, if the bitmap is empty the
for_each_set_bit() should be of equal cost -- and afaict nothing calling
this is performance critical in the first place.

> +             for_each_set_bit(i, idle_inject_cpumask, num_possible_cpus()) {

> +                     hrt = &per_cpu(idle_inject_timer, i);
> +                     hrtimer_cancel(hrt);
> +                     unthrottle_rq(i);
> +             }
> +     }
> +}
> +
> +static int idle_inject_cpu_callback(struct notifier_block *nfb,
> +                             unsigned long action, void *hcpu)
> +{
> +     unsigned long cpu = (unsigned long)hcpu;
> +     struct hrtimer *hrt = &per_cpu(idle_inject_timer, cpu);
> +     ktime_t now, delta, period;
> +
> +     if (!atomic_read(&idle_inject_active))
> +             goto exit_ok;

We should never get here if that weren't set, right? I mean you
register/unregister these callbacks around setting that variable.

> +
> +     switch (action) {
> +     case CPU_STARTING:
> +             raw_cpu_write(idle_injected, true);
> +
> +             hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> +             hrt->function = idle_inject_timer_fn;
> +             set_bit(cpu, idle_inject_cpumask);
> +
> +             now = hrtimer_cb_get_time(hrt);
> +             hrtimer_start(hrt, ms_to_ktime(duration),
> +                     HRTIMER_MODE_ABS_PINNED);
> +             /*
> +              * When a new CPU comes online, we need to make sure it aligns
> +              * its phase with the rest of the CPUs. So we set the
> +              * timer to the next period based on the common starting time,
> +              * then start injecting idle time.
> +              */
> +             spin_lock_irq(&idle_inject_lock);
> +             delta = ktime_sub(now, inject_start_time);
> +             period = ktime_add(ms_to_ktime(duration),
> +                             ms_to_ktime(inject_interval));
> +             delta = ktime_roundup(delta, period);
> +             spin_unlock_irq(&idle_inject_lock);
> +             hrtimer_set_expires(hrt, ktime_add(delta, inject_start_time));

Same breakage here: you cannot call that on a timer you've already started.

> +             break;
> +     case CPU_DYING:
> +             clear_bit(cpu, idle_inject_cpumask);
> +             hrtimer_cancel(hrt);
> +             raw_cpu_write(idle_injected, false);
> +             unthrottle_rq(cpu);
> +             break;
> +     default:
> +             return NOTIFY_DONE;
> +     }
> +exit_ok:
> +     return NOTIFY_OK;
> +}
> +
> +static int idle_inject_pm_callback(struct notifier_block *self,
> +                             unsigned long action, void *hcpu)
> +{
> +     switch (action) {
> +     case PM_HIBERNATION_PREPARE:
> +     case PM_SUSPEND_PREPARE:
> +             if (atomic_read(&idle_inject_active))
> +                     stop_idle_inject();

As with the above, if that were false, this whole callback would not be
called, seeing how you unregister before actually clearing that
idle_inject_active thing.

> +             break;
> +     case PM_POST_HIBERNATION:
> +     case PM_POST_SUSPEND:
> +             printk("POST SUSPEND restart idle injection\n");

Seems a tad inconsistent, printing here but not when stopping it.

> +             start_idle_inject();
> +             break;
> +     default:
> +             break;
> +     }
> +     return NOTIFY_OK;
> +}
> +
> +static struct notifier_block idle_inject_pm_notifier = {
> +     .notifier_call = idle_inject_pm_callback,
> +};
> +
> +static struct notifier_block idle_inject_cpu_notifier = {
> +     .notifier_call = idle_inject_cpu_callback,
> +};
> +
> +static void end_idle_inject(void) {
> +     unregister_hotcpu_notifier(&idle_inject_cpu_notifier);
> +     unregister_pm_notifier(&idle_inject_pm_notifier);

As per the above, these callbacks will not happen hereafter, and will
thus never see:

> +     atomic_set(&idle_inject_active, 0);
> +     kfree(idle_inject_cpumask);
> +}
> +
> +static int prepare_idle_inject(void)
> +{
> +     int retval = 0;
> +     int bitmap_size;
> +     int cpu;
> +     struct hrtimer *hrt;
> +
> +     bitmap_size = BITS_TO_LONGS(num_possible_cpus()) * sizeof(long);

This is incorrect, you want nr_cpu_ids. There is no guarantee the CPU
space does not contain holes. But seeing as I still don't see the point of
the mask, this might all fix itself by killing it altogether.

> +     idle_inject_cpumask = kzalloc(bitmap_size, GFP_KERNEL);
> +     if (!idle_inject_cpumask)
> +             return -ENOMEM;
> +
> +     retval = register_pm_notifier(&idle_inject_pm_notifier);
> +     if (retval)
> +             goto exit_free;
> +     retval = register_hotcpu_notifier(&idle_inject_cpu_notifier);
> +     if (retval)
> +             goto exit_unregister_pm;
> +     get_online_cpus();
> +     for_each_online_cpu(cpu) {
> +             hrt = &per_cpu(idle_inject_timer, cpu);
> +             hrtimer_init(hrt, CLOCK_MONOTONIC, HRTIMER_MODE_ABS_PINNED);
> +             hrt->function = idle_inject_timer_fn;
> +     }
> +     put_online_cpus();
> +
> +     if (!duration)
> +             duration = DEFAULT_DURATION_MSECS;
> +
> +     return 0;
> +exit_unregister_pm:
> +     unregister_pm_notifier(&idle_inject_pm_notifier);
> +exit_free:
> +     kfree(idle_inject_cpumask);
> +     return retval;
> +}
> +
> +int proc_sched_cfs_idle_inject_pct_handler(struct ctl_table *table,
> +                                     int write,
> +                                     void __user *buffer,
> +                                     size_t *length, loff_t *ppos)
> +{
> +     int ret;
> +
> +     ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +     if (ret)
> +             goto out;
> +
> +     if (idle_pct != sysctl_sched_cfs_idle_inject_pct) {
> +             if (!idle_pct)
> +                     start_idle_inject();
> +             else if (!sysctl_sched_cfs_idle_inject_pct) {
> +                     stop_idle_inject();
> +                     end_idle_inject();
> +             }
> +
> +             /* recompute injection parameters */
> +             spin_lock_irq(&idle_inject_lock);
> +             idle_pct = sysctl_sched_cfs_idle_inject_pct;
> +             /*
> +              * duration is fixed for each injection period, we adjust
> +              * non idle interval to satisfy the idle percentage set
> +              * by the user. e.g. if duration is 10 and we want 33% idle
> +              * then interval is 20.
> +              * 33% idle
> +              * ____          ___________________          _________
> +              *     |________|                   |________| 33% idle
> +              * ____          ________          _______
> +              *     |________|        |________|  50% idle
> +              *
> +              *     |duration|interval|
> +              */
> +             if (idle_pct)
> +                     inject_interval = (duration * (100 - idle_pct))
> +                             / idle_pct;

This needs {} (or just exceed the 80 char thing).

> +             spin_unlock_irq(&idle_inject_lock);
> +
> +     }
> +out:
> +     return ret;
> +}
> +
> +int proc_sched_cfs_idle_inject_duration_handler(struct ctl_table *table,
> +                                             int write,
> +                                             void __user *buffer,
> +                                             size_t *length, loff_t *ppos)
> +{
> +     int ret;
> +
> +     ret = proc_dointvec_minmax(table, write, buffer, length, ppos);
> +     if (ret)
> +             goto out;
> +
> +     if (duration == sysctl_sched_cfs_idle_inject_duration)
> +             goto out;
> +     /* recompute injection parameters */
> +     spin_lock_irq(&idle_inject_lock);
> +     duration = jiffies_to_msecs(sysctl_sched_cfs_idle_inject_duration);
> +     if (idle_pct)
> +             inject_interval = (duration * (100 - idle_pct)) / idle_pct;
> +
> +     spin_unlock_irq(&idle_inject_lock);
> +out:
> +     return ret;
> +}

And since you have proc handlers for both these, why not convert to
ktime here and avoid the endless ms_to_ktime() calls?

Also, maybe precompute the period, since that is what you really need.
--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to