Hi Peter,

Thanks for the comments.
On 2020/12/8 22:16, Peter Zijlstra wrote:
> On Tue, Dec 08, 2020 at 09:49:57AM +0800, Aubrey Li wrote:
>> diff --git a/kernel/sched/core.c b/kernel/sched/core.c
>> index c4da7e17b906..b8af602dea79 100644
>> --- a/kernel/sched/core.c
>> +++ b/kernel/sched/core.c
>> @@ -3999,6 +3999,7 @@ void scheduler_tick(void)
>>  	rq_lock(rq, &rf);
>>
>>  	update_rq_clock(rq);
>> +	update_idle_cpumask(rq, false);
>
> Does that really need to be done with rq->lock held?
>
>>  	thermal_pressure = arch_scale_thermal_pressure(cpu_of(rq));
>>  	update_thermal_load_avg(rq_clock_thermal(rq), rq, thermal_pressure);
>
>> @@ -6808,6 +6813,51 @@ balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
>>  }
>>  #endif /* CONFIG_SMP */
>>
>> +/*
>> + * Update cpu idle state and record this information
>> + * in sd_llc_shared->idle_cpus_span.
>> + */
>> +void update_idle_cpumask(struct rq *rq, bool set_idle)
>> +{
>> +	struct sched_domain *sd;
>> +	int cpu = cpu_of(rq);
>> +	int idle_state;
>> +
>> +	/*
>> +	 * If called from scheduler tick, only update
>> +	 * idle cpumask if the CPU is busy, as idle
>> +	 * cpumask is also updated on idle entry.
>> +	 *
>> +	 */
>> +	if (!set_idle && idle_cpu(cpu))
>> +		return;
>
> scheduler_tick() already calls idle_cpu() when SMP.
>
>> +	/*
>> +	 * Also set SCHED_IDLE cpu in idle cpumask to
>> +	 * allow SCHED_IDLE cpu as a wakeup target
>> +	 */
>> +	idle_state = set_idle || sched_idle_cpu(cpu);
>> +	/*
>> +	 * No need to update idle cpumask if the state
>> +	 * does not change.
>> +	 */
>> +	if (rq->last_idle_state == idle_state)
>> +		return;
>> +
>> +	rcu_read_lock();
>
> This is called with IRQs disabled, surely we can forgo rcu_read_lock()
> here.
>
>> +	sd = rcu_dereference(per_cpu(sd_llc, cpu));
>> +	if (!sd || !sd->shared)
>> +		goto unlock;
>
> I don't think !sd->shared is possible here.
>
>> +	if (idle_state)
>> +		cpumask_set_cpu(cpu, sds_idle_cpus(sd->shared));
>> +	else
>> +		cpumask_clear_cpu(cpu, sds_idle_cpus(sd->shared));
>> +
>> +	rq->last_idle_state = idle_state;
>> +unlock:
>> +	rcu_read_unlock();
>> +}
>> +
>>  static unsigned long wakeup_gran(struct sched_entity *se)
>>  {
>>  	unsigned long gran = sysctl_sched_wakeup_granularity;
>> diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
>> index f324dc36fc43..f995660edf2b 100644
>> --- a/kernel/sched/idle.c
>> +++ b/kernel/sched/idle.c
>> @@ -156,6 +156,11 @@ static void cpuidle_idle_call(void)
>>  		return;
>>  	}
>>
>> +	/*
>> +	 * The CPU is about to go idle, set it in idle cpumask
>> +	 * to be a wake up target.
>> +	 */
>> +	update_idle_cpumask(this_rq(), true);
>
> This should be in do_idle(), right around arch_cpu_idle_enter().
>
>>  	/*
>>  	 * The RCU framework needs to be told that we are entering an idle
>>  	 * section, so no more rcu read side critical sections and one more
>> diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
>> index 8d1ca65db3b0..db460b20217a 100644
>> --- a/kernel/sched/sched.h
>> +++ b/kernel/sched/sched.h
>> @@ -1004,6 +1004,7 @@ struct rq {
>>  	/* This is used to determine avg_idle's max value */
>>  	u64			max_idle_balance_cost;
>>  #endif /* CONFIG_SMP */
>> +	unsigned char		last_idle_state;
>
> All of that is pointless for UP. Also, is this the best location?

Good point, I should put all of these under SMP. I'll refine the patch soon.

Thanks,
-Aubrey
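
P.S. Roughly what I have in mind for the SMP guards -- an untested sketch
only. I'm assuming the declaration can stay in kernel/sched/sched.h and
that a !CONFIG_SMP stub is acceptable so core.c and idle.c can keep
calling it unconditionally:

	/* kernel/sched/sched.h -- sketch, not the final placement */
	#ifdef CONFIG_SMP
	extern void update_idle_cpumask(struct rq *rq, bool set_idle);
	#else
	/* UP has no per-LLC idle cpumask to maintain, make it a no-op */
	static inline void update_idle_cpumask(struct rq *rq, bool set_idle) { }
	#endif

and in struct rq, move last_idle_state up into the existing CONFIG_SMP
block, next to max_idle_balance_cost:

		/* This is used to determine avg_idle's max value */
		u64			max_idle_balance_cost;
		/* Cached idle state, to skip redundant cpumask updates */
		unsigned char		last_idle_state;
	#endif /* CONFIG_SMP */

With the stub in place, the definition in fair.c can also go under
CONFIG_SMP and UP builds compile all of this out.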