[tip:sched/core] sched: Rework pick_next_task() slow-path
Commit-ID: 67692435c411e5c53a1c588ecca2037aebd81f2e Gitweb: https://git.kernel.org/tip/67692435c411e5c53a1c588ecca2037aebd81f2e Author: Peter Zijlstra AuthorDate: Wed, 29 May 2019 20:36:44 + Committer: Peter Zijlstra CommitDate: Thu, 8 Aug 2019 09:09:31 +0200 sched: Rework pick_next_task() slow-path Avoid the RETRY_TASK case in the pick_next_task() slow path. By doing the put_prev_task() early, we get the rt/deadline pull done, and by testing rq->nr_running we know if we need newidle_balance(). This then gives a stable state to pick a task from. Since the fast-path is fair only; it means the other classes will always have pick_next_task(.prev=NULL, .rf=NULL) and we can simplify. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mi...@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/aa34d24b36547139248f32a30138791ac6c02bd6.1559129225.git.vpil...@digitalocean.com --- kernel/sched/core.c | 19 --- kernel/sched/deadline.c | 30 ++ kernel/sched/fair.c | 9 ++--- kernel/sched/idle.c | 4 +++- kernel/sched/rt.c| 29 + kernel/sched/sched.h | 13 - kernel/sched/stop_task.c | 3 ++- 7 files changed, 34 insertions(+), 73 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 7bbe78a31ba5..a6661852907b 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3791,7 +3791,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = fair_sched_class.pick_next_task(rq, prev, rf); if (unlikely(p == RETRY_TASK)) - goto again; + goto restart; /* Assumes fair_sched_class->next == idle_sched_class */ if (unlikely(!p)) @@ -3800,14 +3800,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -again: +restart: + /* +* Ensure that we put DL/RT tasks before the pick loop, such that they +* can PULL higher prio tasks when we lower the RQ 'priority'. 
+*/ + prev->sched_class->put_prev_task(rq, prev, rf); + if (!rq->nr_running) + newidle_balance(rq, rf); + for_each_class(class) { - p = class->pick_next_task(rq, prev, rf); - if (p) { - if (unlikely(p == RETRY_TASK)) - goto again; + p = class->pick_next_task(rq, NULL, NULL); + if (p) return p; - } } /* The idle class should always have a runnable task: */ diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2872e15a87cd..0b9cbfb2b1d4 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1761,39 +1761,13 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) struct task_struct *p; struct dl_rq *dl_rq; - dl_rq = >dl; - - if (need_pull_dl_task(rq, prev)) { - /* -* This is OK, because current is on_cpu, which avoids it being -* picked for load-balance and preemption/IRQs are still -* disabled avoiding further scheduler activity on it and we're -* being very careful to re-start the picking loop. -*/ - rq_unpin_lock(rq, rf); - pull_dl_task(rq); - rq_repin_lock(rq, rf); - /* -* pull_dl_task() can drop (and re-acquire) rq->lock; this -* means a stop task can slip in, in which case we need to -* re-start task selection. -*/ - if (rq->stop && task_on_rq_queued(rq->stop)) - return RETRY_TASK; - } + WARN_ON_ONCE(prev || rf); - /* -* When prev is DL, we may throttle it in put_prev_task(). -* So, we update time before we check for dl_nr_running. 
-*/ - if (prev->sched_class == _sched_class) - update_curr_dl(rq); + dl_rq = >dl; if (unlikely(!dl_rq->dl_nr_running)) return NULL; - put_prev_task(rq, prev); - dl_se = pick_next_dl_entity(rq, dl_rq); BUG_ON(!dl_se); diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 4418c1998e69..19c58599e967 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6770,7 +6770,7 @@ again: goto idle; #ifdef CONFIG_FAIR_GROUP_SCHED - if (prev->sched_class != _sched_class) + if (!prev || prev->sched_class != _sched_class) goto simple; /* @@ -6847,8 +6847,8 @@ again: goto done; simple: #endif - - put_prev_task(rq, prev); + if (prev) +
[tip:sched/core] sched: Allow put_prev_task() to drop rq->lock
Commit-ID: 5f2a45fc9e89e022233085e6f0f352eb6ff770bb Gitweb: https://git.kernel.org/tip/5f2a45fc9e89e022233085e6f0f352eb6ff770bb Author: Peter Zijlstra AuthorDate: Wed, 29 May 2019 20:36:43 + Committer: Peter Zijlstra CommitDate: Thu, 8 Aug 2019 09:09:31 +0200 sched: Allow put_prev_task() to drop rq->lock Currently the pick_next_task() loop is convoluted and ugly because of how it can drop the rq->lock and needs to restart the picking. For the RT/Deadline classes, it is put_prev_task() where we do balancing, and we could do this before the picking loop. Make this possible. Signed-off-by: Peter Zijlstra (Intel) Cc: Valentin Schneider Cc: Aaron Lu Cc: mi...@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/e4519f6850477ab7f3d257062796e6425ee4ba7c.1559129225.git.vpil...@digitalocean.com --- kernel/sched/core.c | 2 +- kernel/sched/deadline.c | 14 +- kernel/sched/fair.c | 2 +- kernel/sched/idle.c | 2 +- kernel/sched/rt.c| 14 +- kernel/sched/sched.h | 4 ++-- kernel/sched/stop_task.c | 2 +- 7 files changed, 32 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 0c4220789092..7bbe78a31ba5 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6090,7 +6090,7 @@ static struct task_struct *__pick_migrate_task(struct rq *rq) for_each_class(class) { next = class->pick_next_task(rq, NULL, NULL); if (next) { - next->sched_class->put_prev_task(rq, next); + next->sched_class->put_prev_task(rq, next, NULL); return next; } } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 6eae79350303..2872e15a87cd 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1804,13 +1804,25 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -static void put_prev_task_dl(struct rq *rq, struct task_struct *p) +static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_dl(rq); 
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1); if (on_dl_rq(>dl) && p->nr_cpus_allowed > 1) enqueue_pushable_dl_task(rq, p); + + if (rf && !on_dl_rq(>dl) && need_pull_dl_task(rq, p)) { + /* +* This is OK, because current is on_cpu, which avoids it being +* picked for load-balance and preemption/IRQs are still +* disabled avoiding further scheduler activity on it and we've +* not yet started the picking loop. +*/ + rq_unpin_lock(rq, rf); + pull_dl_task(rq); + rq_repin_lock(rq, rf); + } } /* diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index e7c27eda9f24..4418c1998e69 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -6901,7 +6901,7 @@ idle: /* * Account for a descheduled task: */ -static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) +static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { struct sched_entity *se = >se; struct cfs_rq *cfs_rq; diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c index 54194d41035c..8d59de2e4a6e 100644 --- a/kernel/sched/idle.c +++ b/kernel/sched/idle.c @@ -374,7 +374,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl resched_curr(rq); } -static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) +static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) { } diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index f71bcbe1a00c..dbdabd76f192 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1592,7 +1592,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) return p; } -static void put_prev_task_rt(struct rq *rq, struct task_struct *p) +static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct rq_flags *rf) { update_curr_rt(rq); @@ -1604,6 +1604,18 @@ static void put_prev_task_rt(struct rq *rq, struct task_struct *p) */ if (on_rt_rq(>rt) && p->nr_cpus_allowed > 1) enqueue_pushable_task(rq, p); + + if (rf && 
!on_rt_rq(>rt) && need_pull_rt_task(rq, p)) { + /* +* This is OK, because current is on_cpu, which avoids it being +* picked for load-balance and preemption/IRQs are still +* disabled avoiding further scheduler activity on it and we've +* not yet started the picking loop. +*/ + rq_unpin_lock(rq, rf); + pull_rt_task(rq); + rq_repin_lock(rq, rf); +
[tip:sched/core] sched: Add task_struct pointer to sched_class::set_curr_task
Commit-ID: 03b7fad167efca3b7a39733933f9df56e79c Gitweb: https://git.kernel.org/tip/03b7fad167efca3b7a39733933f9df56e79c Author: Peter Zijlstra AuthorDate: Wed, 29 May 2019 20:36:41 + Committer: Peter Zijlstra CommitDate: Thu, 8 Aug 2019 09:09:31 +0200 sched: Add task_struct pointer to sched_class::set_curr_task In preparation of further separating pick_next_task() and set_curr_task() we have to pass the actual task into it, while there, rename the thing to better pair with put_prev_task(). Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mi...@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/a96d1bcdd716db4a4c5da2fece647a1456c0ed78.1559129225.git.vpil...@digitalocean.com --- kernel/sched/core.c | 12 ++-- kernel/sched/deadline.c | 7 +-- kernel/sched/fair.c | 17 ++--- kernel/sched/idle.c | 27 +++ kernel/sched/rt.c| 7 +-- kernel/sched/sched.h | 7 --- kernel/sched/stop_task.c | 17 +++-- 7 files changed, 48 insertions(+), 46 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 364b6d7da2be..0c4220789092 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1494,7 +1494,7 @@ void do_set_cpus_allowed(struct task_struct *p, const struct cpumask *new_mask) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); } /* @@ -4325,7 +4325,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) if (queued) enqueue_task(rq, p, queue_flag); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); check_class_changed(rq, p, prev_class, oldprio); out_unlock: @@ -4392,7 +4392,7 @@ void set_user_nice(struct task_struct *p, long nice) resched_curr(rq); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); out_unlock: task_rq_unlock(rq, p, ); } @@ -4840,7 +4840,7 @@ change: enqueue_task(rq, p, queue_flags); } if (running) - set_curr_task(rq, p); + set_next_task(rq, p); 
check_class_changed(rq, p, prev_class, oldprio); @@ -6042,7 +6042,7 @@ void sched_setnuma(struct task_struct *p, int nid) if (queued) enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK); if (running) - set_curr_task(rq, p); + set_next_task(rq, p); task_rq_unlock(rq, p, ); } #endif /* CONFIG_NUMA_BALANCING */ @@ -6919,7 +6919,7 @@ void sched_move_task(struct task_struct *tsk) if (queued) enqueue_task(rq, tsk, queue_flags); if (running) - set_curr_task(rq, tsk); + set_next_task(rq, tsk); task_rq_unlock(rq, tsk, ); } diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 2dc2784b196c..6eae79350303 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1844,11 +1844,6 @@ static void task_fork_dl(struct task_struct *p) */ } -static void set_curr_task_dl(struct rq *rq) -{ - set_next_task_dl(rq, rq->curr); -} - #ifdef CONFIG_SMP /* Only try algorithms three times */ @@ -2466,6 +2461,7 @@ const struct sched_class dl_sched_class = { .pick_next_task = pick_next_task_dl, .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, @@ -2476,7 +2472,6 @@ const struct sched_class dl_sched_class = { .task_woken = task_woken_dl, #endif - .set_curr_task = set_curr_task_dl, .task_tick = task_tick_dl, .task_fork = task_fork_dl, diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 7d8043fc8317..8ce1b8893947 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -10150,9 +10150,19 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) * This routine is mostly called to set cfs_rq->curr field when a task * migrates between groups/classes. 
*/ -static void set_curr_task_fair(struct rq *rq) +static void set_next_task_fair(struct rq *rq, struct task_struct *p) { - struct sched_entity *se = &rq->curr->se; + struct sched_entity *se = &p->se; + +#ifdef CONFIG_SMP + if (task_on_rq_queued(p)) { + /* +* Move the next running task to the front of the list, so our +* cfs_tasks list becomes MRU one. +*/ + list_move(&se->group_node, &rq->cfs_tasks); + } +#endif for_each_sched_entity(se) { struct cfs_rq *cfs_rq
[tip:sched/core] sched/fair: Expose newidle_balance()
Commit-ID: 5ba553eff0c3a7c099b1e29a740277a82c0c3314 Gitweb: https://git.kernel.org/tip/5ba553eff0c3a7c099b1e29a740277a82c0c3314 Author: Peter Zijlstra AuthorDate: Wed, 29 May 2019 20:36:42 + Committer: Peter Zijlstra CommitDate: Thu, 8 Aug 2019 09:09:31 +0200 sched/fair: Expose newidle_balance() For pick_next_task_fair() it is the newidle balance that requires dropping the rq->lock; provided we do put_prev_task() early, we can also detect the condition for doing newidle early. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mi...@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/9e3eb1859b946f03d7e500453a885725b68957ba.1559129225.git.vpil...@digitalocean.com --- kernel/sched/fair.c | 18 -- kernel/sched/sched.h | 4 2 files changed, 12 insertions(+), 10 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 8ce1b8893947..e7c27eda9f24 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -3690,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq *cfs_rq) return cfs_rq->avg.load_avg; } -static int idle_balance(struct rq *this_rq, struct rq_flags *rf); - static inline unsigned long task_util(struct task_struct *p) { return READ_ONCE(p->se.avg.util_avg); @@ -6878,11 +6876,10 @@ done: __maybe_unused; return p; idle: - update_misfit_status(NULL, rq); - new_tasks = idle_balance(rq, rf); + new_tasks = newidle_balance(rq, rf); /* -* Because idle_balance() releases (and re-acquires) rq->lock, it is +* Because newidle_balance() releases (and re-acquires) rq->lock, it is * possible for any higher priority task to appear. In that case we * must re-start the pick_next_entity() loop. */ @@ -9045,10 +9042,10 @@ out_one_pinned: ld_moved = 0; /* -* idle_balance() disregards balance intervals, so we could repeatedly -* reach this code, which would lead to balance_interval skyrocketting -* in a short amount of time. 
Skip the balance_interval increase logic -* to avoid that. +* newidle_balance() disregards balance intervals, so we could +* repeatedly reach this code, which would lead to balance_interval +* skyrocketting in a short amount of time. Skip the balance_interval +* increase logic to avoid that. */ if (env.idle == CPU_NEWLY_IDLE) goto out; @@ -9758,7 +9755,7 @@ static inline void nohz_newidle_balance(struct rq *this_rq) { } * idle_balance is called by schedule() if this_cpu is about to become * idle. Attempts to pull tasks from other CPUs. */ -static int idle_balance(struct rq *this_rq, struct rq_flags *rf) +int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { unsigned long next_balance = jiffies + HZ; int this_cpu = this_rq->cpu; @@ -9766,6 +9763,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf) int pulled_task = 0; u64 curr_cost = 0; + update_misfit_status(NULL, this_rq); /* * We must set idle_stamp _before_ calling idle_balance(), such that we * measure the duration of idle_balance() as idle time. diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h index f3c50445bf22..304d98e712bf 100644 --- a/kernel/sched/sched.h +++ b/kernel/sched/sched.h @@ -1445,10 +1445,14 @@ static inline void unregister_sched_domain_sysctl(void) } #endif +extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf); + #else static inline void sched_ttwu_pending(void) { } +static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { return 0; } + #endif /* CONFIG_SMP */ #include "stats.h"
[tip:sched/core] sched/{rt,deadline}: Fix set_next_task vs pick_next_task
Commit-ID: f95d4eaee6d0207bff2dc93371133d31227d4cfb Gitweb: https://git.kernel.org/tip/f95d4eaee6d0207bff2dc93371133d31227d4cfb Author: Peter Zijlstra AuthorDate: Wed, 29 May 2019 20:36:40 + Committer: Peter Zijlstra CommitDate: Thu, 8 Aug 2019 09:09:30 +0200 sched/{rt,deadline}: Fix set_next_task vs pick_next_task Because pick_next_task() implies set_curr_task() and some of the details haven't mattered too much, some of what _should_ be in set_curr_task() ended up in pick_next_task, correct this. This prepares the way for a pick_next_task() variant that does not affect the current state; allowing remote picking. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mi...@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/38c61d5240553e043c27c5e00b9dd0d184dd6081.1559129225.git.vpil...@digitalocean.com --- kernel/sched/deadline.c | 22 +++--- kernel/sched/rt.c | 26 +- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index 039dde2b1dac..2dc2784b196c 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1727,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct task_struct *p) } #endif -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static void set_next_task_dl(struct rq *rq, struct task_struct *p) { p->se.exec_start = rq_clock_task(rq); /* You can't push away the running task */ dequeue_pushable_dl_task(rq, p); + + if (hrtick_enabled(rq)) + start_hrtick_dl(rq, p); + + if (rq->curr->sched_class != _sched_class) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); } static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq, @@ -1791,15 +1799,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = dl_task_of(dl_se); - set_next_task(rq, p); - - if (hrtick_enabled(rq)) - start_hrtick_dl(rq, p); - - 
deadline_queue_push_tasks(rq); - - if (rq->curr->sched_class != &dl_sched_class) - update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + set_next_task_dl(rq, p); return p; } @@ -1846,7 +1846,7 @@ static void task_fork_dl(struct task_struct *p) static void set_curr_task_dl(struct rq *rq) { - set_next_task(rq, rq->curr); + set_next_task_dl(rq, rq->curr); } #ifdef CONFIG_SMP diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c index a532558a5176..40bb71004325 100644 --- a/kernel/sched/rt.c +++ b/kernel/sched/rt.c @@ -1498,12 +1498,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct task_struct *p, int flag #endif } -static inline void set_next_task(struct rq *rq, struct task_struct *p) +static inline void set_next_task_rt(struct rq *rq, struct task_struct *p) { p->se.exec_start = rq_clock_task(rq); /* The running task is never eligible for pushing */ dequeue_pushable_task(rq, p); + + /* +* If prev task was rt, put_prev_task() has already updated the +* utilization. We only care of the case where we start to schedule a +* rt task +*/ + if (rq->curr->sched_class != &rt_sched_class) + update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + rt_queue_push_tasks(rq); } static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq, @@ -1577,17 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) p = _pick_next_task_rt(rq); - set_next_task(rq, p); - - rt_queue_push_tasks(rq); - - /* -* If prev task was rt, put_prev_task() has already updated the -* utilization. 
We only care of the case where we start to schedule a -* rt task -*/ - if (rq->curr->sched_class != &rt_sched_class) - update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0); + set_next_task_rt(rq, p); return p; } @@ -2356,7 +2356,7 @@ static void task_tick_rt(struct rq *rq, struct task_struct *p, int queued) static void set_curr_task_rt(struct rq *rq) { - set_next_task(rq, rq->curr); + set_next_task_rt(rq, rq->curr); } static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)
[tip:sched/core] stop_machine: Fix stop_cpus_in_progress ordering
Commit-ID: 99d84bf8c65a7a0dbc9e166ca0a58ed949ac4f37 Gitweb: https://git.kernel.org/tip/99d84bf8c65a7a0dbc9e166ca0a58ed949ac4f37 Author: Peter Zijlstra AuthorDate: Wed, 29 May 2019 20:36:37 + Committer: Peter Zijlstra CommitDate: Thu, 8 Aug 2019 09:09:30 +0200 stop_machine: Fix stop_cpus_in_progress ordering Make sure the entire for loop has stop_cpus_in_progress set. Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mi...@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/0fd8fd4b99b9b9aa88d8b2dff897f7fd0d88f72c.1559129225.git.vpil...@digitalocean.com --- kernel/stop_machine.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index b4f83f7bdf86..c7031a22aa7b 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, */ preempt_disable(); stop_cpus_in_progress = true; + barrier(); for_each_cpu(cpu, cpumask) { work = &per_cpu(cpu_stopper.stop_work, cpu); work->fn = fn; @@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask *cpumask, if (cpu_stop_queue_work(cpu, work)) queued = true; } + barrier(); stop_cpus_in_progress = false; preempt_enable();
[tip:sched/core] sched: Fix kerneldoc comment for ia64_set_curr_task
Commit-ID: 5feeb7837a448f659e0aaa19fb446b1d9a4b323a Gitweb: https://git.kernel.org/tip/5feeb7837a448f659e0aaa19fb446b1d9a4b323a Author: Peter Zijlstra AuthorDate: Wed, 29 May 2019 20:36:38 + Committer: Peter Zijlstra CommitDate: Thu, 8 Aug 2019 09:09:30 +0200 sched: Fix kerneldoc comment for ia64_set_curr_task Signed-off-by: Peter Zijlstra (Intel) Cc: Aaron Lu Cc: Valentin Schneider Cc: mi...@kernel.org Cc: Phil Auld Cc: Julien Desfossez Cc: Nishanth Aravamudan Link: https://lkml.kernel.org/r/fde3a65ea3091ec6b84dac3c19639f85f452c5d1.1559129225.git.vpil...@digitalocean.com --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index b4a44bc84749..9a821ff68502 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -6772,7 +6772,7 @@ struct task_struct *curr_task(int cpu) #ifdef CONFIG_IA64 /** - * set_curr_task - set the current task for a given CPU. + * ia64_set_curr_task - set the current task for a given CPU. * @cpu: the processor in question. * @p: the task pointer to set. *
[tip:core/urgent] objtool: Improve UACCESS coverage
Commit-ID: 882a0db9d143e5e8dac54b96e83135bccd1f68d1 Gitweb: https://git.kernel.org/tip/882a0db9d143e5e8dac54b96e83135bccd1f68d1 Author: Peter Zijlstra AuthorDate: Wed, 24 Jul 2019 17:47:26 -0500 Committer: Thomas Gleixner CommitDate: Thu, 25 Jul 2019 08:36:39 +0200 objtool: Improve UACCESS coverage A clang build reported an (obvious) double CLAC while a GCC build did not; it turns out that objtool only re-visits instructions if the first visit was with AC=0. If OTOH the first visit was with AC=1, it completely ignores any subsequent visit, even when it has AC=0. Fix this by using a visited mask instead of a boolean, and (explicitly) mark the AC state. $ ./objtool check -b --no-fp --retpoline --uaccess drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: .altinstr_replacement+0x22: redundant UACCESS disable drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: eb_copy_relocations.isra.34()+0xea: (alt) drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: .altinstr_replacement+0x: (branch) drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: eb_copy_relocations.isra.34()+0xd9: (alt) drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: eb_copy_relocations.isra.34()+0xb2: (branch) drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: eb_copy_relocations.isra.34()+0x39: (branch) drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: eb_copy_relocations.isra.34()+0x0: <=== (func) Reported-by: Josh Poimboeuf Reported-by: Thomas Gleixner Reported-by: Sedat Dilek Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Josh Poimboeuf Signed-off-by: Thomas Gleixner Tested-by: Nathan Chancellor Tested-by: Nick Desaulniers Tested-by: Sedat Dilek Link: https://github.com/ClangBuiltLinux/linux/issues/617 Link: https://lkml.kernel.org/r/5359166aad2d53f3145cd442d83d0e5115e0cd17.1564007838.git.jpoim...@redhat.com --- tools/objtool/check.c | 7 --- 
tools/objtool/check.h | 3 ++- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/tools/objtool/check.c b/tools/objtool/check.c index 5f26620f13f5..176f2f084060 100644 --- a/tools/objtool/check.c +++ b/tools/objtool/check.c @@ -1946,6 +1946,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, struct alternative *alt; struct instruction *insn, *next_insn; struct section *sec; + u8 visited; int ret; insn = first; @@ -1972,12 +1973,12 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, return 1; } + visited = 1 << state.uaccess; if (insn->visited) { if (!insn->hint && !insn_state_match(insn, &state)) return 1; - /* If we were here with AC=0, but now have AC=1, go again */ - if (insn->state.uaccess || !state.uaccess) + if (insn->visited & visited) return 0; } @@ -2024,7 +2025,7 @@ static int validate_branch(struct objtool_file *file, struct symbol *func, } else insn->state = state; - insn->visited = true; + insn->visited |= visited; if (!insn->ignore_alts) { bool skip_orig = false; diff --git a/tools/objtool/check.h b/tools/objtool/check.h index b881fafcf55d..6d875ca6fce0 100644 --- a/tools/objtool/check.h +++ b/tools/objtool/check.h @@ -33,8 +33,9 @@ struct instruction { unsigned int len; enum insn_type type; unsigned long immediate; - bool alt_group, visited, dead_end, ignore, hint, save, restore, ignore_alts; + bool alt_group, dead_end, ignore, hint, save, restore, ignore_alts; bool retpoline_safe; + u8 visited; struct symbol *call_dest; struct instruction *jump_dest; struct instruction *first_jump_src;
[tip:smp/urgent] smp: Warn on function calls from softirq context
Commit-ID: 19dbdcb8039cff16669a05136a29180778d16d0a Gitweb: https://git.kernel.org/tip/19dbdcb8039cff16669a05136a29180778d16d0a Author: Peter Zijlstra AuthorDate: Thu, 18 Jul 2019 11:20:09 +0200 Committer: Thomas Gleixner CommitDate: Sat, 20 Jul 2019 11:27:16 +0200 smp: Warn on function calls from softirq context It's clearly documented that smp function calls cannot be invoked from softirq handling context. Unfortunately nothing enforces that or emits a warning. A single function call can be invoked from softirq context only via smp_call_function_single_async(). The only legit context is task context, so add a warning to that effect. Reported-by: luferry Signed-off-by: Peter Zijlstra Signed-off-by: Thomas Gleixner Link: https://lkml.kernel.org/r/20190718160601.gp3...@hirez.programming.kicks-ass.net --- kernel/smp.c | 16 1 file changed, 16 insertions(+) diff --git a/kernel/smp.c b/kernel/smp.c index 616d4d114847..7dbcb402c2fc 100644 --- a/kernel/smp.c +++ b/kernel/smp.c @@ -291,6 +291,14 @@ int smp_call_function_single(int cpu, smp_call_func_t func, void *info, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress); + /* +* When @wait we can deadlock when we interrupt between llist_add() and +* arch_send_call_function_ipi*(); when !@wait we can deadlock due to +* csd_lock() on because the interrupt context uses the same csd +* storage. +*/ + WARN_ON_ONCE(!in_task()); + csd = &csd_stack; if (!wait) { csd = this_cpu_ptr(&csd_data); @@ -416,6 +424,14 @@ void smp_call_function_many(const struct cpumask *mask, WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled() && !oops_in_progress && !early_boot_irqs_disabled); + /* +* When @wait we can deadlock when we interrupt between llist_add() and +* arch_send_call_function_ipi*(); when !@wait we can deadlock due to +* csd_lock() on because the interrupt context uses the same csd +* storage. +*/ + WARN_ON_ONCE(!in_task()); + /* Try to fastpath. So, what's a CPU they want? Ignoring this one. 
*/ cpu = cpumask_first_and(mask, cpu_online_mask); if (cpu == this_cpu)
[tip:core/urgent] stacktrace: Force USER_DS for stack_trace_save_user()
Commit-ID: cac9b9a4b08304f11daace03b8b48659355e44c1 Gitweb: https://git.kernel.org/tip/cac9b9a4b08304f11daace03b8b48659355e44c1 Author: Peter Zijlstra AuthorDate: Thu, 18 Jul 2019 10:47:47 +0200 Committer: Thomas Gleixner CommitDate: Thu, 18 Jul 2019 16:47:24 +0200 stacktrace: Force USER_DS for stack_trace_save_user() When walking userspace stacks, USER_DS needs to be set, otherwise access_ok() will not function as expected. Reported-by: Vegard Nossum Reported-by: Eiichi Tsukata Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Vegard Nossum Reviewed-by: Joel Fernandes (Google) Link: https://lkml.kernel.org/r/20190718085754.gm3...@hirez.programming.kicks-ass.net --- kernel/stacktrace.c | 5 + 1 file changed, 5 insertions(+) diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c index e6a02b274b73..f5440abb7532 100644 --- a/kernel/stacktrace.c +++ b/kernel/stacktrace.c @@ -226,12 +226,17 @@ unsigned int stack_trace_save_user(unsigned long *store, unsigned int size) .store = store, .size = size, }; + mm_segment_t fs; /* Trace user stack if not a kernel thread */ if (current->flags & PF_KTHREAD) return 0; + fs = get_fs(); + set_fs(USER_DS); arch_stack_walk_user(consume_entry, &c, task_pt_regs(current)); + set_fs(fs); + return c.len; } #endif
[tip:x86/urgent] x86/mm, tracing: Fix CR2 corruption
Commit-ID: a0d14b8909de55139b8702fe0c7e80b69763dcfb Gitweb: https://git.kernel.org/tip/a0d14b8909de55139b8702fe0c7e80b69763dcfb Author: Peter Zijlstra AuthorDate: Thu, 11 Jul 2019 13:40:59 +0200 Committer: Thomas Gleixner CommitDate: Wed, 17 Jul 2019 23:17:38 +0200 x86/mm, tracing: Fix CR2 corruption Despite the current efforts to read CR2 before tracing happens there still exist a number of possible holes: idtentry page_fault do_page_fault has_error_code=1 call error_entry TRACE_IRQS_OFF call trace_hardirqs_off* #PF // modifies CR2 CALL_enter_from_user_mode __context_tracking_exit() trace_user_exit(0) #PF // modifies CR2 call do_page_fault address = read_cr2(); /* whoopsie */ And similar for i386. Fix it by pulling the CR2 read into the entry code, before any of that stuff gets a chance to run and ruin things. Reported-by: He Zhe Reported-by: Eiichi Tsukata Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Andy Lutomirski Cc: b...@alien8.de Cc: rost...@goodmis.org Cc: torva...@linux-foundation.org Cc: h...@zytor.com Cc: dave.han...@linux.intel.com Cc: jgr...@suse.com Cc: j...@joelfernandes.org Link: https://lkml.kernel.org/r/2019074336.116812...@infradead.org Debugged-by: Steven Rostedt --- arch/x86/entry/entry_32.S | 25 ++--- arch/x86/entry/entry_64.S | 35 ++- arch/x86/include/asm/kvm_para.h | 2 +- arch/x86/include/asm/traps.h| 4 ++-- arch/x86/kernel/kvm.c | 8 arch/x86/kernel/traps.c | 6 +- arch/x86/mm/fault.c | 30 +++--- 7 files changed, 59 insertions(+), 51 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 4d4b6100f0e8..2bb986f305ac 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, HYPERV_STIMER0_VECTOR, ENTRY(page_fault) ASM_CLAC - pushl $do_page_fault - ALIGN - jmp common_exception + pushl $0; /* %gs's slot on the stack */ + + SAVE_ALL switch_stacks=1 skip_gs=1 + + ENCODE_FRAME_POINTER + 
UNWIND_ESPFIX_STACK + + /* fixup %gs */ + GS_TO_REG %ecx + REG_TO_PTGS %ecx + SET_KERNEL_GS %ecx + + GET_CR2_INTO(%ecx) # might clobber %eax + + /* fixup orig %eax */ + movlPT_ORIG_EAX(%esp), %edx # get the error code + movl$-1, PT_ORIG_EAX(%esp) # no syscall to restart + + TRACE_IRQS_OFF + movl%esp, %eax # pt_regs pointer + calldo_page_fault + jmp ret_from_exception END(page_fault) common_exception: diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 95ae05f0edf2..7cb2e1f1ec09 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -864,7 +864,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) -.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0 +.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, shift_ist=-1, ist_offset=0 .if \paranoid callparanoid_entry @@ -874,12 +874,21 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt .endif UNWIND_HINT_REGS - .if \paranoid + .if \read_cr2 + GET_CR2_INTO(%rdx); /* can clobber %rax */ + .endif + .if \shift_ist != -1 TRACE_IRQS_OFF_DEBUG/* reload IDT in case of recursion */ .else TRACE_IRQS_OFF .endif + + .if \paranoid == 0 + testb $3, CS(%rsp) + jz .Lfrom_kernel_no_context_tracking_\@ + CALL_enter_from_user_mode +.Lfrom_kernel_no_context_tracking_\@: .endif movq%rsp, %rdi /* pt_regs pointer */ @@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * fresh stack. (This is for #DB, which has a nasty habit * of recursing.) * @create_gap:create a 6-word stack gap when coming from kernel mode. + * @read_cr2: load CR2 into the 3rd argument; done before calling any C code * * idtentry generates an IDT stub that sets up a usable kernel context, * creates struct pt_regs, and calls @do_sym. 
The stub has the following @@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt * @paranoid == 2 is special: the stub will
[tip:x86/urgent] x86/entry/64: Update comments and sanity tests for create_gap
Commit-ID: 4234653e882740cbf6625294e388b3176583 Gitweb: https://git.kernel.org/tip/4234653e882740cbf6625294e388b3176583 Author: Peter Zijlstra AuthorDate: Thu, 11 Jul 2019 13:40:58 +0200 Committer: Thomas Gleixner CommitDate: Wed, 17 Jul 2019 23:17:38 +0200 x86/entry/64: Update comments and sanity tests for create_gap Commit 2700fefdb2d9 ("x86_64: Add gap to int3 to allow for call emulation") forgot to update the comment, do so now. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Cc: b...@alien8.de Cc: torva...@linux-foundation.org Cc: h...@zytor.com Cc: dave.han...@linux.intel.com Cc: jgr...@suse.com Cc: zhe...@windriver.com Cc: j...@joelfernandes.org Cc: de...@etsukata.com Link: https://lkml.kernel.org/r/2019074336.059780...@infradead.org --- arch/x86/entry/entry_64.S | 17 +++-- 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 3db5fede743b..95ae05f0edf2 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -913,15 +913,16 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt /** * idtentry - Generate an IDT entry stub * @sym: Name of the generated entry point - * @do_sym:C function to be called - * @has_error_code:True if this IDT vector has an error code on the stack - * @paranoid: non-zero means that this vector may be invoked from + * @do_sym:C function to be called + * @has_error_code:True if this IDT vector has an error code on the stack + * @paranoid: non-zero means that this vector may be invoked from * kernel mode with user GSBASE and/or user CR3. * 2 is special -- see below. * @shift_ist: Set to an IST index if entries from kernel mode should - * decrement the IST stack so that nested entries get a + * decrement the IST stack so that nested entries get a * fresh stack. (This is for #DB, which has a nasty habit - * of recursing.) + * of recursing.) 
+ * @create_gap:create a 6-word stack gap when coming from kernel mode. * * idtentry generates an IDT stub that sets up a usable kernel context, * creates struct pt_regs, and calls @do_sym. The stub has the following @@ -951,10 +952,14 @@ ENTRY(\sym) UNWIND_HINT_IRET_REGS offset=\has_error_code*8 /* Sanity check */ - .if \shift_ist != -1 && \paranoid == 0 + .if \shift_ist != -1 && \paranoid != 1 .error "using shift_ist requires paranoid=1" .endif + .if \create_gap && \paranoid + .error "using create_gap requires paranoid=0" + .endif + ASM_CLAC .if \has_error_code == 0
[tip:x86/urgent] x86/entry/64: Simplify idtentry a little
Commit-ID: 2fd37912cfb019228bf246215938e6f7619516a2 Gitweb: https://git.kernel.org/tip/2fd37912cfb019228bf246215938e6f7619516a2 Author: Peter Zijlstra AuthorDate: Thu, 11 Jul 2019 13:40:57 +0200 Committer: Thomas Gleixner CommitDate: Wed, 17 Jul 2019 23:17:37 +0200 x86/entry/64: Simplify idtentry a little There's a bunch of duplication in idtentry, namely the .Lfrom_usermode_switch_stack is a paranoid=0 copy of the normal flow. Make this explicit by creating a idtentry_part helper macro. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Acked-by: Andy Lutomirski Cc: b...@alien8.de Cc: torva...@linux-foundation.org Cc: h...@zytor.com Cc: dave.han...@linux.intel.com Cc: jgr...@suse.com Cc: zhe...@windriver.com Cc: j...@joelfernandes.org Cc: de...@etsukata.com Link: https://lkml.kernel.org/r/2019074336.002429...@infradead.org --- arch/x86/entry/entry_64.S | 102 ++ 1 file changed, 48 insertions(+), 54 deletions(-) diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S index 0ea4831a72a4..3db5fede743b 100644 --- a/arch/x86/entry/entry_64.S +++ b/arch/x86/entry/entry_64.S @@ -864,6 +864,52 @@ apicinterrupt IRQ_WORK_VECTOR irq_work_interrupt smp_irq_work_interrupt */ #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8) +.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, ist_offset=0 + + .if \paranoid + callparanoid_entry + /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ + .else + callerror_entry + .endif + UNWIND_HINT_REGS + + .if \paranoid + .if \shift_ist != -1 + TRACE_IRQS_OFF_DEBUG/* reload IDT in case of recursion */ + .else + TRACE_IRQS_OFF + .endif + .endif + + movq%rsp, %rdi /* pt_regs pointer */ + + .if \has_error_code + movqORIG_RAX(%rsp), %rsi/* get error code */ + movq$-1, ORIG_RAX(%rsp) /* no syscall to restart */ + .else + xorl%esi, %esi /* no error code */ + .endif + + .if \shift_ist != -1 + subq$\ist_offset, 
CPU_TSS_IST(\shift_ist) + .endif + + call\do_sym + + .if \shift_ist != -1 + addq$\ist_offset, CPU_TSS_IST(\shift_ist) + .endif + + .if \paranoid + /* this procedure expect "no swapgs" flag in ebx */ + jmp paranoid_exit + .else + jmp error_exit + .endif + +.endm + /** * idtentry - Generate an IDT entry stub * @sym: Name of the generated entry point @@ -934,47 +980,7 @@ ENTRY(\sym) .Lfrom_usermode_no_gap_\@: .endif - .if \paranoid - callparanoid_entry - .else - callerror_entry - .endif - UNWIND_HINT_REGS - /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */ - - .if \paranoid - .if \shift_ist != -1 - TRACE_IRQS_OFF_DEBUG/* reload IDT in case of recursion */ - .else - TRACE_IRQS_OFF - .endif - .endif - - movq%rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movqORIG_RAX(%rsp), %rsi/* get error code */ - movq$-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl%esi, %esi /* no error code */ - .endif - - .if \shift_ist != -1 - subq$\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - call\do_sym - - .if \shift_ist != -1 - addq$\ist_offset, CPU_TSS_IST(\shift_ist) - .endif - - /* these procedures expect "no swapgs" flag in ebx */ - .if \paranoid - jmp paranoid_exit - .else - jmp error_exit - .endif + idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, \ist_offset .if \paranoid == 1 /* @@ -983,21 +989,9 @@ ENTRY(\sym) * run in real process context if user_mode(regs). */ .Lfrom_usermode_switch_stack_\@: - callerror_entry - - movq%rsp, %rdi /* pt_regs pointer */ - - .if \has_error_code - movqORIG_RAX(%rsp), %rsi/* get error code */ - movq$-1, ORIG_RAX(%rsp) /* no syscall to restart */ - .else - xorl%esi, %esi /* no error code */ + idtentry_part \do_sym, \has_error_code, paranoid=0 .endif - call\do_sym - - jmp error_exit - .endif _ASM_NOKPROBE(\sym) END(\sym) .endm
[tip:x86/urgent] x86/entry/32: Simplify common_exception
Commit-ID: e67f1c11e5ea7fa47449a16325ecc997dbbf9bdf Gitweb: https://git.kernel.org/tip/e67f1c11e5ea7fa47449a16325ecc997dbbf9bdf Author: Peter Zijlstra AuthorDate: Thu, 11 Jul 2019 13:40:56 +0200 Committer: Thomas Gleixner CommitDate: Wed, 17 Jul 2019 23:17:37 +0200 x86/entry/32: Simplify common_exception Adding one more option to SAVE_ALL can be used in common_exception to simplify things. This also saves duplication later where page_fault will no longer use common_exception. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Steven Rostedt (VMware) Reviewed-by: Andy Lutomirski Cc: b...@alien8.de Cc: torva...@linux-foundation.org Cc: h...@zytor.com Cc: dave.han...@linux.intel.com Cc: jgr...@suse.com Cc: zhe...@windriver.com Cc: j...@joelfernandes.org Cc: de...@etsukata.com Link: https://lkml.kernel.org/r/2019074335.945136...@infradead.org --- arch/x86/entry/entry_32.S | 36 +--- 1 file changed, 13 insertions(+), 23 deletions(-) diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S index 90b473297299..4d4b6100f0e8 100644 --- a/arch/x86/entry/entry_32.S +++ b/arch/x86/entry/entry_32.S @@ -294,9 +294,11 @@ .Lfinished_frame_\@: .endm -.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 +.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 cld +.if \skip_gs == 0 PUSH_GS +.endif FIXUP_FRAME pushl %fs pushl %es @@ -313,13 +315,13 @@ movl%edx, %es movl$(__KERNEL_PERCPU), %edx movl%edx, %fs +.if \skip_gs == 0 SET_KERNEL_GS %edx - +.endif /* Switch to kernel stack if necessary */ .if \switch_stacks > 0 SWITCH_TO_KERNEL_STACK .endif - .endm .macro SAVE_ALL_NMI cr3_reg:req @@ -1448,32 +1450,20 @@ END(page_fault) common_exception: /* the function address is in %gs's slot on the stack */ - FIXUP_FRAME - pushl %fs - pushl %es - pushl %ds - pushl %eax - movl$(__USER_DS), %eax - movl%eax, %ds - movl%eax, %es - movl$(__KERNEL_PERCPU), %eax - movl%eax, %fs - pushl %ebp - pushl %edi - pushl %esi - pushl %edx - pushl %ecx - pushl %ebx - 
SWITCH_TO_KERNEL_STACK + SAVE_ALL switch_stacks=1 skip_gs=1 ENCODE_FRAME_POINTER - cld UNWIND_ESPFIX_STACK + + /* fixup %gs */ GS_TO_REG %ecx movlPT_GS(%esp), %edi # get the function address - movlPT_ORIG_EAX(%esp), %edx # get the error code - movl$-1, PT_ORIG_EAX(%esp) # no syscall to restart REG_TO_PTGS %ecx SET_KERNEL_GS %ecx + + /* fixup orig %eax */ + movlPT_ORIG_EAX(%esp), %edx # get the error code + movl$-1, PT_ORIG_EAX(%esp) # no syscall to restart + TRACE_IRQS_OFF movl%esp, %eax # pt_regs pointer CALL_NOSPEC %edi
[tip:x86/urgent] x86/paravirt: Make read_cr2() CALLEE_SAVE
Commit-ID: 55aedddb6149ab71bec9f050846855113977b033 Gitweb: https://git.kernel.org/tip/55aedddb6149ab71bec9f050846855113977b033 Author: Peter Zijlstra AuthorDate: Thu, 11 Jul 2019 13:40:55 +0200 Committer: Thomas Gleixner CommitDate: Wed, 17 Jul 2019 23:17:37 +0200 x86/paravirt: Make read_cr2() CALLEE_SAVE The one paravirt read_cr2() implementation (Xen) is actually quite trivial and doesn't need to clobber anything other than the return register. Making read_cr2() CALLEE_SAVE avoids all the PUSH/POP nonsense and allows more convenient use from assembly. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Juergen Gross Cc: b...@alien8.de Cc: rost...@goodmis.org Cc: l...@kernel.org Cc: torva...@linux-foundation.org Cc: h...@zytor.com Cc: dave.han...@linux.intel.com Cc: zhe...@windriver.com Cc: j...@joelfernandes.org Cc: de...@etsukata.com Link: https://lkml.kernel.org/r/2019074335.887392...@infradead.org --- arch/x86/entry/calling.h | 6 ++ arch/x86/include/asm/paravirt.h | 22 +- arch/x86/include/asm/paravirt_types.h | 2 +- arch/x86/kernel/asm-offsets.c | 1 + arch/x86/kernel/head_64.S | 4 +--- arch/x86/kernel/paravirt.c| 2 +- arch/x86/xen/enlighten_pv.c | 3 ++- arch/x86/xen/mmu_pv.c | 12 +--- arch/x86/xen/xen-asm.S| 16 arch/x86/xen/xen-ops.h| 3 +++ 10 files changed, 45 insertions(+), 26 deletions(-) diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h index 9f1f9e3b8230..830bd984182b 100644 --- a/arch/x86/entry/calling.h +++ b/arch/x86/entry/calling.h @@ -343,3 +343,9 @@ For 32-bit we have the following conventions - kernel is built with .Lafter_call_\@: #endif .endm + +#ifdef CONFIG_PARAVIRT_XXL +#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg +#else +#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg +#endif diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h index c25c38a05c1c..5135282683d4 100644 --- a/arch/x86/include/asm/paravirt.h +++ b/arch/x86/include/asm/paravirt.h @@ -116,7 
+116,7 @@ static inline void write_cr0(unsigned long x) static inline unsigned long read_cr2(void) { - return PVOP_CALL0(unsigned long, mmu.read_cr2); + return PVOP_CALLEE0(unsigned long, mmu.read_cr2); } static inline void write_cr2(unsigned long x) @@ -909,13 +909,7 @@ extern void default_banner(void); ANNOTATE_RETPOLINE_SAFE; \ call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \ ) -#endif - -#define GET_CR2_INTO_RAX \ - ANNOTATE_RETPOLINE_SAFE;\ - call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); -#ifdef CONFIG_PARAVIRT_XXL #define USERGS_SYSRET64 \ PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64), \ ANNOTATE_RETPOLINE_SAFE; \ @@ -929,9 +923,19 @@ extern void default_banner(void); call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);\ PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);) #endif -#endif +#endif /* CONFIG_PARAVIRT_XXL */ +#endif /* CONFIG_X86_64 */ + +#ifdef CONFIG_PARAVIRT_XXL + +#define GET_CR2_INTO_AX \ + PARA_SITE(PARA_PATCH(PV_MMU_read_cr2), \ + ANNOTATE_RETPOLINE_SAFE; \ + call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2); \ +) + +#endif /* CONFIG_PARAVIRT_XXL */ -#endif /* CONFIG_X86_32 */ #endif /* __ASSEMBLY__ */ #else /* CONFIG_PARAVIRT */ diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h index 946f8f1f1efc..639b2df445ee 100644 --- a/arch/x86/include/asm/paravirt_types.h +++ b/arch/x86/include/asm/paravirt_types.h @@ -220,7 +220,7 @@ struct pv_mmu_ops { void (*exit_mmap)(struct mm_struct *mm); #ifdef CONFIG_PARAVIRT_XXL - unsigned long (*read_cr2)(void); + struct paravirt_callee_save read_cr2; void (*write_cr2)(unsigned long); unsigned long (*read_cr3)(void); diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c index da64452584b0..5c7ee3df4d0b 100644 --- a/arch/x86/kernel/asm-offsets.c +++ b/arch/x86/kernel/asm-offsets.c @@ -76,6 +76,7 @@ static void __used common(void) BLANK(); OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask); OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending); + 
OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2); #endif BLANK(); diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S index bcd206c8ac90..0e2d72929a8c 100644 ---
[tip:sched/urgent] sched/core: Fix preempt warning in ttwu
Commit-ID: e3d85487fba42206024bc3ed32e4b581c7cb46db Gitweb: https://git.kernel.org/tip/e3d85487fba42206024bc3ed32e4b581c7cb46db Author: Peter Zijlstra AuthorDate: Wed, 10 Jul 2019 12:57:36 +0200 Committer: Ingo Molnar CommitDate: Sat, 13 Jul 2019 11:23:27 +0200 sched/core: Fix preempt warning in ttwu John reported a DEBUG_PREEMPT warning caused by commit: aacedf26fb76 ("sched/core: Optimize try_to_wake_up() for local wakeups") I overlooked that ttwu_stat() requires preemption disabled. Reported-by: John Stultz Tested-by: John Stultz Signed-off-by: Peter Zijlstra (Intel) Cc: Andrew Morton Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: aacedf26fb76 ("sched/core: Optimize try_to_wake_up() for local wakeups") Link: https://lkml.kernel.org/r/20190710105736.gk3...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/core.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index fa43ce3962e7..2b037f195473 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -2399,6 +2399,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) unsigned long flags; int cpu, success = 0; + preempt_disable(); if (p == current) { /* * We're waking current, this means 'p->on_rq' and 'task_cpu(p) @@ -2412,7 +2413,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) *it disabling IRQs (this allows not taking ->pi_lock). */ if (!(p->state & state)) - return false; + goto out; success = 1; cpu = task_cpu(p); @@ -2526,6 +2527,7 @@ unlock: out: if (success) ttwu_stat(p, cpu, wake_flags); + preempt_enable(); return success; }
[tip:x86/urgent] x86/alternatives: Fix int3_emulate_call() selftest stack corruption
Commit-ID: ecc606103837b98a2b665e8f14e533a6c72bbdc0 Gitweb: https://git.kernel.org/tip/ecc606103837b98a2b665e8f14e533a6c72bbdc0 Author: Peter Zijlstra AuthorDate: Mon, 8 Jul 2019 15:55:30 -0500 Committer: Thomas Gleixner CommitDate: Tue, 9 Jul 2019 22:39:15 +0200 x86/alternatives: Fix int3_emulate_call() selftest stack corruption KASAN shows the following splat during boot: BUG: KASAN: unknown-crash in unwind_next_frame+0x3f6/0x490 Read of size 8 at addr 84007db0 by task swapper/0 CPU: 0 PID: 0 Comm: swapper Tainted: GT 5.2.0-rc6-00013-g7457c0d #1 Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 04/01/2014 Call Trace: dump_stack+0x19/0x1b print_address_description+0x1b0/0x2b2 __kasan_report+0x10f/0x171 kasan_report+0x12/0x1c __asan_load8+0x54/0x81 unwind_next_frame+0x3f6/0x490 unwind_next_frame+0x1b/0x23 arch_stack_walk+0x68/0xa5 stack_trace_save+0x7b/0xa0 save_trace+0x3c/0x93 mark_lock+0x1ef/0x9b1 lock_acquire+0x122/0x221 __mutex_lock+0xb6/0x731 mutex_lock_nested+0x16/0x18 _vm_unmap_aliases+0x141/0x183 vm_unmap_aliases+0x14/0x16 change_page_attr_set_clr+0x15e/0x2f2 set_memory_4k+0x2a/0x2c check_bugs+0x11fd/0x1298 start_kernel+0x793/0x7eb x86_64_start_reservations+0x55/0x76 x86_64_start_kernel+0x87/0xaa secondary_startup_64+0xa4/0xb0 Memory state around the buggy address: 84007c80: 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1 84007d00: f1 00 00 00 00 00 00 00 00 00 f2 f2 f2 f3 f3 f3 >84007d80: f3 79 be 52 49 79 be 00 00 00 00 00 00 00 00 f1 It turns out that int3_selftest() is corrupting the stack. The problem is that the KASAN-ified version of int3_magic() is much less trivial than the C code appears. It clobbers several unexpected registers. So when the selftest's INT3 is converted to an emulated call to int3_magic(), the registers are clobbered and Bad Things happen when the function returns. Fix this by converting int3_magic() to the trivial ASM function it should be, avoiding all calling convention issues. 
Also add ASM_CALL_CONSTRAINT to the INT3 ASM, since it contains a 'CALL'. [peterz: cribbed changelog from josh] Fixes: 7457c0da024b ("x86/alternatives: Add int3_emulate_call() selftest") Reported-by: kernel test robot Debugged-by: Josh Poimboeuf Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Josh Poimboeuf Cc: Linus Torvalds Cc: Andy Lutomirski Link: https://lkml.kernel.org/r/20190709125744.gb3...@hirez.programming.kicks-ass.net --- arch/x86/kernel/alternative.c | 25 - 1 file changed, 20 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c index 99ef8b6f9a1a..ccd32013c47a 100644 --- a/arch/x86/kernel/alternative.c +++ b/arch/x86/kernel/alternative.c @@ -625,10 +625,23 @@ extern struct paravirt_patch_site __start_parainstructions[], * * See entry_{32,64}.S for more details. */ -static void __init int3_magic(unsigned int *ptr) -{ - *ptr = 1; -} + +/* + * We define the int3_magic() function in assembly to control the calling + * convention such that we can 'call' it from assembly. + */ + +extern void int3_magic(unsigned int *ptr); /* defined in asm */ + +asm ( +" .pushsection.init.text, \"ax\", @progbits\n" +" .type int3_magic, @function\n" +"int3_magic:\n" +" movl$1, (%" _ASM_ARG1 ")\n" +" ret\n" +" .size int3_magic, .-int3_magic\n" +" .popsection\n" +); extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */ @@ -676,7 +689,9 @@ static void __init int3_selftest(void) "int3_selftest_ip:\n\t" __ASM_SEL(.long, .quad) " 1b\n\t" ".popsection\n\t" - : : __ASM_SEL_RAW(a, D) () : "memory"); + : ASM_CALL_CONSTRAINT + : __ASM_SEL_RAW(a, D) () + : "memory"); BUG_ON(val != 1);
[tip:locking/core] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()
Commit-ID: 0b9ccc0a9b146b49e83bf1e32f70d2396a694bfb Gitweb: https://git.kernel.org/tip/0b9ccc0a9b146b49e83bf1e32f70d2396a694bfb Author: Peter Zijlstra AuthorDate: Thu, 6 Dec 2018 12:24:33 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Jun 2019 12:43:40 +0200 x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}() Nadav Amit reported that commit: b59167ac7baf ("x86/percpu: Fix this_cpu_read()") added a bunch of constraints to all sorts of code; and while some of that was correct and desired, some of that seems superfluous. The thing is, the this_cpu_*() operations are defined IRQ-safe, this means the values are subject to change from IRQs, and thus must be reloaded. Also, the generic form: local_irq_save() __this_cpu_read() local_irq_restore() would not allow the re-use of previous values; if by nothing else, then the barrier()s implied by local_irq_*(). Which raises the point that percpu_from_op() and the others also need that volatile. OTOH __this_cpu_*() operations are not IRQ-safe and assume external preempt/IRQ disabling and could thus be allowed more room for optimization. This makes the this_cpu_*() vs __this_cpu_*() behaviour more consistent with other architectures. 
$ ./compare.sh defconfig-build defconfig-build1 vmlinux.o x86_pmu_cancel_txn 80 71 -9,+0 __text_poke 919964 +45,+0 do_user_addr_fault 1082 1058 -24,+0 __do_page_fault 1194 1178 -16,+0 do_exit 2995 3027 -43,+75 process_one_work 1008989 -67,+48 finish_task_switch524505 -19,+0 __schedule_bug103 98 -59,+54 __schedule_bug103 98 -59,+54 __sched_setscheduler 2015 2030 +15,+0 freeze_processes 203230 +31,-4 rcu_gp_kthread_wake 106 99 -7,+0 rcu_core 1841 1834 -7,+0 call_timer_fn 298286 -12,+0 can_stop_idle_tick146139 -31,+24 perf_pending_event253239 -14,+0 shmem_alloc_page 209213 +4,+0 __alloc_pages_slowpath 3284 3269 -15,+0 umount_tree 671694 +23,+0 advance_transaction 803798 -5,+0 con_put_char 71 51 -20,+0 xhci_urb_enqueue 1302 1295 -7,+0 xhci_urb_enqueue 1302 1295 -7,+0 tcp_sacktag_write_queue 2130 2075 -55,+0 tcp_try_undo_loss 229208 -21,+0 tcp_v4_inbound_md5_hash 438411 -31,+4 tcp_v4_inbound_md5_hash 438411 -31,+4 tcp_v6_inbound_md5_hash 469411 -33,-25 tcp_v6_inbound_md5_hash 469411 -33,-25 restricted_pointer434420 -14,+0 irq_exit 162154 -8,+0 get_perf_callchain638624 -14,+0 rt_mutex_trylock 169156 -13,+0 avc_has_extended_perms 1092 1089 -3,+0 avc_has_perm_noaudit 309306 -3,+0 __perf_sw_event 138122 -16,+0 perf_swevent_get_recursion_context116102 -14,+0 __local_bh_enable_ip 93 72 -21,+0 xfrm_input 4175 4161 -14,+0 avc_has_perm 446443 -3,+0 vm_events_fold_cpu 57 56 -1,+0 vfree 68 61 -7,+0 freeze_processes 203230 +31,-4 _local_bh_enable
[tip:locking/core] Documentation/atomic_t.txt: Clarify pure non-rmw usage
Commit-ID: fff9b6c7d26943a8eb32b58364b7ec6b9369746a Gitweb: https://git.kernel.org/tip/fff9b6c7d26943a8eb32b58364b7ec6b9369746a Author: Peter Zijlstra AuthorDate: Fri, 24 May 2019 13:52:31 +0200 Committer: Ingo Molnar CommitDate: Mon, 3 Jun 2019 12:32:57 +0200 Documentation/atomic_t.txt: Clarify pure non-rmw usage Clarify that pure non-RMW usage of atomic_t is pointless, there is nothing 'magical' about atomic_set() / atomic_read(). This is something that seems to confuse people, because I happen upon it semi-regularly. Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Greg Kroah-Hartman Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190524115231.gn2...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- Documentation/atomic_t.txt | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt index dca3fb0554db..89eae7f6b360 100644 --- a/Documentation/atomic_t.txt +++ b/Documentation/atomic_t.txt @@ -81,9 +81,11 @@ Non-RMW ops: The non-RMW ops are (typically) regular LOADs and STOREs and are canonically implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and -smp_store_release() respectively. +smp_store_release() respectively. Therefore, if you find yourself only using +the Non-RMW operations of atomic_t, you do not in fact need atomic_t at all +and are doing it wrong. -The one detail to this is that atomic_set{}() should be observable to the RMW +A subtle detail of atomic_set{}() is that it should be observable to the RMW ops. That is: C atomic-set
[tip:locking/core] locking/lock_events: Use raw_cpu_{add,inc}() for stats
Commit-ID: 24811637dbfd07c69da7e9db586d35d17e6afca3 Gitweb: https://git.kernel.org/tip/24811637dbfd07c69da7e9db586d35d17e6afca3 Author: Peter Zijlstra AuthorDate: Mon, 27 May 2019 10:23:26 +0200 Committer: Ingo Molnar CommitDate: Mon, 3 Jun 2019 12:32:56 +0200 locking/lock_events: Use raw_cpu_{add,inc}() for stats Instead of playing silly games with CONFIG_DEBUG_PREEMPT toggling between this_cpu_*() and __this_cpu_*() use raw_cpu_*(), which is exactly what we want here. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Cc: Borislav Petkov Cc: Davidlohr Bueso Cc: H. Peter Anvin Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Tim Chen Cc: Waiman Long Cc: Will Deacon Cc: huang ying Link: https://lkml.kernel.org/r/20190527082326.gp2...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/locking/lock_events.h | 45 1 file changed, 4 insertions(+), 41 deletions(-) diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h index 46b71af8eef2..8c7e7d25f09c 100644 --- a/kernel/locking/lock_events.h +++ b/kernel/locking/lock_events.h @@ -31,50 +31,13 @@ enum lock_events { DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]); /* - * The purpose of the lock event counting subsystem is to provide a low - * overhead way to record the number of specific locking events by using - * percpu counters. It is the percpu sum that matters, not specifically - * how many of them happens in each cpu. - * - * It is possible that the same percpu counter may be modified in both - * the process and interrupt contexts. For architectures that perform - * percpu operation with multiple instructions, it is possible to lose - * count if a process context percpu update is interrupted in the middle - * and the same counter is updated in the interrupt context. Therefore, - * the generated percpu sum may not be precise. The error, if any, should - * be small and insignificant. 
- * - * For those architectures that do multi-instruction percpu operation, - * preemption in the middle and moving the task to another cpu may cause - * a larger error in the count. Again, this will be few and far between. - * Given the imprecise nature of the count and the possibility of resetting - * the count and doing the measurement again, this is not really a big - * problem. - * - * To get a better picture of what is happening under the hood, it is - * suggested that a few measurements should be taken with the counts - * reset in between to stamp out outliner because of these possible - * error conditions. - * - * To minimize overhead, we use __this_cpu_*() in all cases except when - * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*() - * will be used to avoid the appearance of unwanted BUG messages. - */ -#ifdef CONFIG_DEBUG_PREEMPT -#define lockevent_percpu_inc(x)this_cpu_inc(x) -#define lockevent_percpu_add(x, v) this_cpu_add(x, v) -#else -#define lockevent_percpu_inc(x)__this_cpu_inc(x) -#define lockevent_percpu_add(x, v) __this_cpu_add(x, v) -#endif - -/* - * Increment the PV qspinlock statistical counters + * Increment the statistical counters. use raw_cpu_inc() because of lower + * overhead and we don't care if we loose the occasional update. */ static inline void __lockevent_inc(enum lock_events event, bool cond) { if (cond) - lockevent_percpu_inc(lockevents[event]); + raw_cpu_inc(lockevents[event]); } #define lockevent_inc(ev)__lockevent_inc(LOCKEVENT_ ##ev, true) @@ -82,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, bool cond) static inline void __lockevent_add(enum lock_events event, int inc) { - lockevent_percpu_add(lockevents[event], inc); + raw_cpu_add(lockevents[event], inc); } #define lockevent_add(ev, c) __lockevent_add(LOCKEVENT_ ##ev, c)
[tip:perf/urgent] perf/ring_buffer: Add ordering to rb->nest increment
Commit-ID: 3f9fbe9bd86c534eba2faf5d840fd44c6049f50e Gitweb: https://git.kernel.org/tip/3f9fbe9bd86c534eba2faf5d840fd44c6049f50e Author: Peter Zijlstra AuthorDate: Fri, 17 May 2019 13:52:32 +0200 Committer: Ingo Molnar CommitDate: Fri, 24 May 2019 09:00:10 +0200 perf/ring_buffer: Add ordering to rb->nest increment Similar to how decrementing rb->nest too early can cause data_head to (temporarily) be observed to go backward, so too can this happen when we increment too late. This barrier() ensures the rb->head load happens after the increment, both the one in the 'goto again' path, as the one from perf_output_get_handle() -- albeit very unlikely to matter for the latter. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: a...@kernel.org Cc: mark.rutl...@arm.com Cc: namhy...@kernel.org Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing IRQ-disables") Link: http://lkml.kernel.org/r/20190517115418.309516...@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 9 + 1 file changed, 9 insertions(+) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 009467a60578..4b5f8d932400 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -48,6 +48,15 @@ static void perf_output_put_handle(struct perf_output_handle *handle) unsigned long head; again: + /* +* In order to avoid publishing a head value that goes backwards, +* we must ensure the load of @rb->head happens after we've +* incremented @rb->nest. +* +* Otherwise we can observe a @rb->head value before one published +* by an IRQ/NMI happening between the load and the increment. +*/ + barrier(); head = local_read(&rb->head); /*
[tip:perf/urgent] perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data
Commit-ID: 4d839dd9e4356bbacf3eb0ab13a549b83b008c21 Gitweb: https://git.kernel.org/tip/4d839dd9e4356bbacf3eb0ab13a549b83b008c21 Author: Peter Zijlstra AuthorDate: Fri, 17 May 2019 13:52:33 +0200 Committer: Ingo Molnar CommitDate: Fri, 24 May 2019 09:00:11 +0200 perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data We must use {READ,WRITE}_ONCE() on rb->user_page data such that concurrent usage will see whole values. A few key sites were missing this. Suggested-by: Yabin Cui Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: a...@kernel.org Cc: mark.rutl...@arm.com Cc: namhy...@kernel.org Fixes: 7b732a750477 ("perf_counter: new output ABI - part 1") Link: http://lkml.kernel.org/r/20190517115418.394192...@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/ring_buffer.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 4b5f8d932400..7a0c73e4b3eb 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -100,7 +100,7 @@ again: * See perf_output_begin(). 
*/ smp_wmb(); /* B, matches C */ - rb->user_page->data_head = head; + WRITE_ONCE(rb->user_page->data_head, head); /* * We must publish the head before decrementing the nest count, @@ -496,7 +496,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, unsigned long size) perf_event_aux_event(handle->event, aux_head, size, handle->aux_flags); - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) wakeup = true; @@ -528,7 +528,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, unsigned long size) rb->aux_head += size; - rb->user_page->aux_head = rb->aux_head; + WRITE_ONCE(rb->user_page->aux_head, rb->aux_head); if (rb_need_aux_wakeup(rb)) { perf_output_wakeup(handle); handle->wakeup = rb->aux_wakeup + rb->aux_watermark;
[tip:perf/urgent] perf/ring-buffer: Use regular variables for nesting
Commit-ID: 5322ea58a06da2e69c5ef36a9b4d4b9255edd423 Gitweb: https://git.kernel.org/tip/5322ea58a06da2e69c5ef36a9b4d4b9255edd423 Author: Peter Zijlstra AuthorDate: Fri, 17 May 2019 13:52:34 +0200 Committer: Ingo Molnar CommitDate: Fri, 24 May 2019 09:00:11 +0200 perf/ring-buffer: Use regular variables for nesting While the IRQ/NMI will nest, the nest-count will be invariant over the actual exception, since it will decrement equal to increment. This means we can -- carefully -- use a regular variable since the typical LOAD-STORE race doesn't exist (similar to preempt_count). This optimizes the ring-buffer for all LOAD-STORE architectures, since they need to use atomic ops to implement local_t. Suggested-by: Alexander Shishkin Signed-off-by: Peter Zijlstra (Intel) Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: a...@kernel.org Cc: mark.rutl...@arm.com Cc: namhy...@kernel.org Cc: yab...@google.com Link: http://lkml.kernel.org/r/20190517115418.481392...@infradead.org Signed-off-by: Ingo Molnar --- kernel/events/internal.h| 4 ++-- kernel/events/ring_buffer.c | 41 ++--- 2 files changed, 28 insertions(+), 17 deletions(-) diff --git a/kernel/events/internal.h b/kernel/events/internal.h index 79c47076700a..3aef4191798c 100644 --- a/kernel/events/internal.h +++ b/kernel/events/internal.h @@ -24,7 +24,7 @@ struct ring_buffer { atomic_tpoll; /* POLL_ for wakeups */ local_t head; /* write position*/ - local_t nest; /* nested writers*/ + unsigned intnest; /* nested writers*/ local_t events; /* event limit */ local_t wakeup; /* wakeup stamp */ local_t lost; /* nr records lost */ @@ -41,7 +41,7 @@ struct ring_buffer { /* AUX area */ longaux_head; - local_t aux_nest; + unsigned intaux_nest; longaux_wakeup; /* last aux_watermark boundary crossed by aux_head */ unsigned long aux_pgoff; int aux_nr_pages; diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c index 
7a0c73e4b3eb..ffb59a4ef4ff 100644 --- a/kernel/events/ring_buffer.c +++ b/kernel/events/ring_buffer.c @@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle *handle) struct ring_buffer *rb = handle->rb; preempt_disable(); - local_inc(&rb->nest); + + /* +* Avoid an explicit LOAD/STORE such that architectures with memops +* can use them. +*/ + (*(volatile unsigned int *)&rb->nest)++; handle->wakeup = local_read(&rb->wakeup); } @@ -46,6 +51,17 @@ static void perf_output_put_handle(struct perf_output_handle *handle) { struct ring_buffer *rb = handle->rb; unsigned long head; + unsigned int nest; + + /* +* If this isn't the outermost nesting, we don't have to update +* @rb->user_page->data_head. +*/ + nest = READ_ONCE(rb->nest); + if (nest > 1) { + WRITE_ONCE(rb->nest, nest - 1); + goto out; + } again: /* @@ -64,15 +80,6 @@ again: * load above to be stale. */ - /* -* If this isn't the outermost nesting, we don't have to update -* @rb->user_page->data_head. -*/ - if (local_read(&rb->nest) > 1) { - local_dec(&rb->nest); - goto out; - } - /* * Since the mmap() consumer (userspace) can run on a different CPU: * @@ -108,7 +115,7 @@ again: * write will (temporarily) publish a stale value. */ barrier(); - local_set(&rb->nest, 0); + WRITE_ONCE(rb->nest, 0); /* * Ensure we decrement @rb->nest before we validate the @rb->head. @@ -116,7 +123,7 @@ again: */ barrier(); if (unlikely(head != local_read(&rb->head))) { - local_inc(&rb->nest); + WRITE_ONCE(rb->nest, 1); goto again; } @@ -355,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, struct perf_event *output_event = event; unsigned long aux_head, aux_tail; struct ring_buffer *rb; + unsigned int nest; if (output_event->parent) output_event = output_event->parent; @@ -385,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle *handle, if (!refcount_inc_not_zero(&rb->aux_refcount)) goto err; + nest =
[tip:sched/core] trace: Fix preempt_enable_no_resched() abuse
Commit-ID: e8bd5814989b994cf1b0cb179e1c777e40c0f02c Gitweb: https://git.kernel.org/tip/e8bd5814989b994cf1b0cb179e1c777e40c0f02c Author: Peter Zijlstra AuthorDate: Tue, 23 Apr 2019 22:03:18 +0200 Committer: Ingo Molnar CommitDate: Mon, 29 Apr 2019 08:27:09 +0200 trace: Fix preempt_enable_no_resched() abuse Unless there is a call into schedule() in the immediate (deterministic) future, one must not use preempt_enable_no_resched(). It can cause a preemption to go missing and thereby cause arbitrary delays, breaking the PREEMPT=y invariant. Signed-off-by: Peter Zijlstra (Intel) Cc: Alexei Starovoitov Cc: Daniel Borkmann Cc: Davidlohr Bueso Cc: Linus Torvalds Cc: Roman Gushchin Cc: Steven Rostedt Cc: Thomas Gleixner Cc: Tim Chen Cc: Waiman Long Cc: Will Deacon Cc: huang ying Fixes: 2c2d7329d8af ("tracing/ftrace: use preempt_enable_no_resched_notrace in ring_buffer_time_stamp()") Link: https://lkml.kernel.org/r/20190423200318.gy14...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/trace/ring_buffer.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c index 41b6f96e5366..4ee8d8aa3d0f 100644 --- a/kernel/trace/ring_buffer.c +++ b/kernel/trace/ring_buffer.c @@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int cpu) preempt_disable_notrace(); time = rb_time_stamp(buffer); - preempt_enable_no_resched_notrace(); + preempt_enable_notrace(); return time; }
[tip:core/objtool] x86/uaccess: Dont leak the AC flag into __put_user() argument evaluation
Commit-ID: 6ae865615fc43d014da2fd1f1bba7e81ee622d1b Gitweb: https://git.kernel.org/tip/6ae865615fc43d014da2fd1f1bba7e81ee622d1b Author: Peter Zijlstra AuthorDate: Wed, 24 Apr 2019 09:19:24 +0200 Committer: Ingo Molnar CommitDate: Wed, 24 Apr 2019 12:19:45 +0200 x86/uaccess: Dont leak the AC flag into __put_user() argument evaluation The __put_user() macro evaluates its @ptr argument inside the __uaccess_begin() / __uaccess_end() region. While this would normally not be expected to be an issue, an UBSAN bug (it ignored -fwrapv, fixed in GCC 8+) would transform the @ptr evaluation for: drivers/gpu/drm/i915/i915_gem_execbuffer.c: if (unlikely(__put_user(offset, &urelocs[r-stack].presumed_offset))) { into a signed-overflow-UB check and trigger the objtool AC validation. Finish this commit: 2a418cf3f5f1 ("x86/uaccess: Don't leak the AC flag into __put_user() value evaluation") and explicitly evaluate all 3 arguments early. Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Acked-by: Randy Dunlap # build-tested Acked-by: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: l...@kernel.org Fixes: 2a418cf3f5f1 ("x86/uaccess: Don't leak the AC flag into __put_user() value evaluation") Link: http://lkml.kernel.org/r/20190424072208.695962...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/uaccess.h | 7 --- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h index 5ca7b91faf67..bb21913885a3 100644 --- a/arch/x86/include/asm/uaccess.h +++ b/arch/x86/include/asm/uaccess.h @@ -427,10 +427,11 @@ do { \ ({ \ __label__ __pu_label; \ int __pu_err = -EFAULT; \ - __typeof__(*(ptr)) __pu_val;\ - __pu_val = x; \ + __typeof__(*(ptr)) __pu_val = (x); \ + __typeof__(ptr) __pu_ptr = (ptr); \ + __typeof__(size) __pu_size = (size);\ __uaccess_begin(); \ - __put_user_size(__pu_val, (ptr), (size), __pu_label); \ + __put_user_size(__pu_val, __pu_ptr, __pu_size, __pu_label); \ __pu_err = 0; \ 
__pu_label:\ __uaccess_end();\
[tip:core/objtool] mm/uaccess: Use 'unsigned long' to placate UBSAN warnings on older GCC versions
Commit-ID: 29da93fea3ea39ab9b12270cc6be1b70ef201c9e Gitweb: https://git.kernel.org/tip/29da93fea3ea39ab9b12270cc6be1b70ef201c9e Author: Peter Zijlstra AuthorDate: Wed, 24 Apr 2019 09:19:25 +0200 Committer: Ingo Molnar CommitDate: Wed, 24 Apr 2019 12:19:45 +0200 mm/uaccess: Use 'unsigned long' to placate UBSAN warnings on older GCC versions Randy reported objtool triggered on his (GCC-7.4) build: lib/strncpy_from_user.o: warning: objtool: strncpy_from_user()+0x315: call to __ubsan_handle_add_overflow() with UACCESS enabled lib/strnlen_user.o: warning: objtool: strnlen_user()+0x337: call to __ubsan_handle_sub_overflow() with UACCESS enabled This is due to UBSAN generating signed-overflow-UB warnings where it should not. Prior to GCC-8 UBSAN ignored -fwrapv (which the kernel uses through -fno-strict-overflow). Make the functions use 'unsigned long' throughout. Reported-by: Randy Dunlap Signed-off-by: Peter Zijlstra (Intel) Acked-by: Randy Dunlap # build-tested Acked-by: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: l...@kernel.org Link: http://lkml.kernel.org/r/20190424072208.754094...@infradead.org Signed-off-by: Ingo Molnar --- lib/strncpy_from_user.c | 5 +++-- lib/strnlen_user.c | 4 ++-- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c index 58eacd41526c..023ba9f3b99f 100644 --- a/lib/strncpy_from_user.c +++ b/lib/strncpy_from_user.c @@ -23,10 +23,11 @@ * hit it), 'max' is the address space maximum (and we return * -EFAULT if we hit it). 
*/ -static inline long do_strncpy_from_user(char *dst, const char __user *src, long count, unsigned long max) +static inline long do_strncpy_from_user(char *dst, const char __user *src, + unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - long res = 0; + unsigned long res = 0; /* * Truncate 'max' to the user-specified limit, so that diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c index 1c1a1b0e38a5..7f2db3fe311f 100644 --- a/lib/strnlen_user.c +++ b/lib/strnlen_user.c @@ -28,7 +28,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, unsigned long max) { const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS; - long align, res = 0; + unsigned long align, res = 0; unsigned long c; /* @@ -42,7 +42,7 @@ static inline long do_strnlen_user(const char __user *src, unsigned long count, * Do everything aligned. But that means that we * need to also expand the maximum.. */ - align = (sizeof(long) - 1) & (unsigned long)src; + align = (sizeof(unsigned long) - 1) & (unsigned long)src; src -= align; max += align;
[tip:perf/core] perf/x86: Support constraint ranges
Commit-ID: 63b79f6ebc464afb730bc45762c820795e276da1 Gitweb: https://git.kernel.org/tip/63b79f6ebc464afb730bc45762c820795e276da1 Author: Peter Zijlstra AuthorDate: Tue, 2 Apr 2019 12:45:04 -0700 Committer: Ingo Molnar CommitDate: Tue, 16 Apr 2019 12:26:17 +0200 perf/x86: Support constraint ranges Icelake extended the general counters to 8, even when SMT is enabled. However only a (large) subset of the events can be used on all 8 counters. The events that can or cannot be used on all counters are organized in ranges. A lot of scheduler constraints are required to handle all this. To avoid blowing up the tables add event code ranges to the constraint tables, and a new inline function to match them. Originally-by: Andi Kleen Signed-off-by: Peter Zijlstra (Intel) # developer hat on Signed-off-by: Kan Liang Signed-off-by: Peter Zijlstra (Intel) # maintainer hat on Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: a...@kernel.org Cc: jo...@kernel.org Link: https://lkml.kernel.org/r/20190402194509.2832-8-kan.li...@linux.intel.com Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 2 +- arch/x86/events/intel/ds.c | 2 +- arch/x86/events/perf_event.h | 43 +-- 3 files changed, 39 insertions(+), 8 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index bdc366d709aa..d4b52896f173 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2693,7 +2693,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int idx, if (x86_pmu.event_constraints) { for_each_event_constraint(c, x86_pmu.event_constraints) { - if ((event->hw.config & c->cmask) == c->code) { + if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; } diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c index 6436452d6342..4429bfa92fbc 100644 --- a/arch/x86/events/intel/ds.c +++ 
b/arch/x86/events/intel/ds.c @@ -858,7 +858,7 @@ struct event_constraint *intel_pebs_constraints(struct perf_event *event) if (x86_pmu.pebs_constraints) { for_each_event_constraint(c, x86_pmu.pebs_constraints) { - if ((event->hw.config & c->cmask) == c->code) { + if (constraint_match(c, event->hw.config)) { event->hw.flags |= c->flags; return c; } diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index dced91582147..0ff0c5ae8c29 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -49,13 +49,19 @@ struct event_constraint { unsigned long idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)]; u64 idxmsk64; }; - u64 code; - u64 cmask; - int weight; - int overlap; - int flags; + u64 code; + u64 cmask; + int weight; + int overlap; + int flags; + unsigned intsize; }; +static inline bool constraint_match(struct event_constraint *c, u64 ecode) +{ + return ((ecode & c->cmask) - c->code) <= (u64)c->size; +} + /* * struct hw_perf_event.flags flags */ @@ -280,18 +286,29 @@ struct cpu_hw_events { void*kfree_on_online[X86_PERF_KFREE_MAX]; }; -#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\ +#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) {\ { .idxmsk64 = (n) },\ .code = (c),\ + .size = (e) - (c), \ .cmask = (m), \ .weight = (w), \ .overlap = (o), \ .flags = f, \ } +#define __EVENT_CONSTRAINT(c, n, m, w, o, f) \ + __EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f) + #define EVENT_CONSTRAINT(c, n, m) \ __EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0) +/* + * The constraint_match() function only works for 'simple' event codes + * and not for extended (AMD64_EVENTSEL_EVENT) events codes. 
+ */ +#define EVENT_CONSTRAINT_RANGE(c, e, n, m) \ + __EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0) + #define INTEL_EXCLEVT_CONSTRAINT(c, n) \ __EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\ 0, PERF_X86_EVENT_EXCL) @@ -326,6 +343,12 @@ struct cpu_hw_events { #define INTEL_EVENT_CONSTRAINT(c, n) \ EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT) +/* + * Constraint on a range of Event codes + */ +#define INTEL_EVENT_CONSTRAINT_RANGE(c,
[tip:x86/urgent] x86/mm/tlb: Revert "x86/mm: Align TLB invalidation info"
Commit-ID: 780e0106d468a2962b16b52fdf42898f2639e0a0 Gitweb: https://git.kernel.org/tip/780e0106d468a2962b16b52fdf42898f2639e0a0 Author: Peter Zijlstra AuthorDate: Tue, 16 Apr 2019 10:03:35 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Apr 2019 10:10:13 +0200 x86/mm/tlb: Revert "x86/mm: Align TLB invalidation info" Revert the following commit: 515ab7c41306: ("x86/mm: Align TLB invalidation info") I found out (the hard way) that under some .config options (notably L1_CACHE_SHIFT=7) and compiler combinations this on-stack alignment leads to a 320 byte stack usage, which then triggers a KASAN stack warning elsewhere. Using 320 bytes of stack space for a 40 byte structure is ludicrous and clearly not right. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Linus Torvalds Acked-by: Nadav Amit Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: Peter Zijlstra Cc: Thomas Gleixner Fixes: 515ab7c41306 ("x86/mm: Align TLB invalidation info") Link: http://lkml.kernel.org/r/20190416080335.gm7...@worktop.programming.kicks-ass.net [ Minor changelog edits. ] Signed-off-by: Ingo Molnar --- arch/x86/mm/tlb.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index bc4bc7b2f075..487b8474c01c 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -728,7 +728,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, { int cpu; - struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = { + struct flush_tlb_info info = { .mm = mm, .stride_shift = stride_shift, .freed_tables = freed_tables,
[tip:perf/urgent] perf/x86/intel: Initialize TFA MSR
Commit-ID: d7262457e35dbe239659e62654e56f8ddb814bed Gitweb: https://git.kernel.org/tip/d7262457e35dbe239659e62654e56f8ddb814bed Author: Peter Zijlstra AuthorDate: Thu, 21 Mar 2019 13:38:49 +0100 Committer: Ingo Molnar CommitDate: Wed, 3 Apr 2019 11:40:32 +0200 perf/x86/intel: Initialize TFA MSR Stephane reported that the TFA MSR is not initialized by the kernel, but the TFA bit could be set by firmware or as a leftover from a kexec, which makes the state inconsistent. Reported-by: Stephane Eranian Tested-by: Nelson DSouza Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vince Weaver Cc: to...@suse.com Link: https://lkml.kernel.org/r/20190321123849.gn6...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 6 ++ 1 file changed, 6 insertions(+) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 1539647ea39d..f61dcbef20ff 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3575,6 +3575,12 @@ static void intel_pmu_cpu_starting(int cpu) cpuc->lbr_sel = NULL; + if (x86_pmu.flags & PMU_FL_TFA) { + WARN_ON_ONCE(cpuc->tfa_shadow); + cpuc->tfa_shadow = ~0ULL; + intel_set_tfa(cpuc, false); + } + if (x86_pmu.version > 1) flip_smm_bit(&x86_pmu.attr_freeze_on_smi);
[tip:sched/urgent] sched/cpufreq: Fix 32-bit math overflow
Commit-ID: a23314e9d88d89d49e69db08f60b7caa470f04e1 Gitweb: https://git.kernel.org/tip/a23314e9d88d89d49e69db08f60b7caa470f04e1 Author: Peter Zijlstra AuthorDate: Tue, 5 Mar 2019 09:32:02 +0100 Committer: Ingo Molnar CommitDate: Tue, 19 Mar 2019 12:06:11 +0100 sched/cpufreq: Fix 32-bit math overflow Vincent Wang reported that get_next_freq() has a mult overflow bug on 32-bit platforms in the IOWAIT boost case, since in that case {util,max} are in freq units instead of capacity units. Solve this by moving the IOWAIT boost to capacity units. And since this means @max is constant; simplify the code. Reported-by: Vincent Wang Tested-by: Vincent Wang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Chunyan Zhang Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190305083202.gu32...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 59 +--- 1 file changed, 25 insertions(+), 34 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033ec7c45f13..1ccf77f6d346 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -48,10 +48,10 @@ struct sugov_cpu { booliowait_boost_pending; unsigned intiowait_boost; - unsigned intiowait_boost_max; u64 last_update; unsigned long bw_dl; + unsigned long min; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -303,8 +303,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost - ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? 
sg_cpu->min : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -344,14 +343,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, /* Double the boost at each request */ if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + sg_cpu->iowait_boost = + min_t(unsigned int, sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); return; } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + sg_cpu->iowait_boost = sg_cpu->min; } /** @@ -373,47 +371,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long *util, unsigned long *max) +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long util, unsigned long max) { - unsigned int boost_util, boost_max; + unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return util; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return util; - /* -* An IO waiting task has just woken up: -* allow to further double the boost value -*/ - if (sg_cpu->iowait_boost_pending) { - sg_cpu->iowait_boost_pending = false; - } else { + if (!sg_cpu->iowait_boost_pending) { /* -* Otherwise: reduce the boost value and disable it when we -* reach the minimum. +* No boost pending; reduce the boost value. 
*/ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + if (sg_cpu->iowait_boost < sg_cpu->min) { sg_cpu->iowait_boost = 0; - return; + return util; } } + sg_cpu->iowait_boost_pending = false; + /* -* Apply the current boost value: a CPU is boosted only if its current -* utilization is smaller then the current IO boost level. +* @util is already in capacity scale; convert iowait_boost +* into the same
[tip:perf/urgent] perf/x86: Fixup typo in stub functions
Commit-ID: f764c58b7faa26f5714e6907f892abc2bc0de4f8 Gitweb: https://git.kernel.org/tip/f764c58b7faa26f5714e6907f892abc2bc0de4f8 Author: Peter Zijlstra AuthorDate: Fri, 15 Mar 2019 09:14:10 +0100 Committer: Ingo Molnar CommitDate: Fri, 15 Mar 2019 13:12:42 +0100 perf/x86: Fixup typo in stub functions Guenter reported a build warning for CONFIG_CPU_SUP_INTEL=n: > With allmodconfig-CONFIG_CPU_SUP_INTEL, this patch results in: > > In file included from arch/x86/events/amd/core.c:8:0: > arch/x86/events/amd/../perf_event.h:1036:45: warning: ‘struct cpu_hw_event’ declared inside parameter list will not be visible outside of this definition or declaration > static inline int intel_cpuc_prepare(struct cpu_hw_event *cpuc, int cpu) While harmless (an unused pointer is an unused pointer, no matter the type) it needs fixing. Reported-by: Guenter Roeck Signed-off-by: Peter Zijlstra (Intel) Cc: Greg Kroah-Hartman Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: sta...@vger.kernel.org Fixes: d01b1f96a82e ("perf/x86/intel: Make cpuc allocations consistent") Link: http://lkml.kernel.org/r/20190315081410.gr5...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- arch/x86/events/perf_event.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h index b04ae6c8775e..a75955741c50 100644 --- a/arch/x86/events/perf_event.h +++ b/arch/x86/events/perf_event.h @@ -1033,12 +1033,12 @@ static inline int intel_pmu_init(void) return 0; } -static inline int intel_cpuc_prepare(struct cpu_hw_event *cpuc, int cpu) +static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu) { return 0; } -static inline void intel_cpuc_finish(struct cpu_hw_event *cpuc) +static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc) { }
[tip:perf/urgent] perf/x86/intel: Fix memory corruption
Commit-ID: ede271b059463731cbd6dffe55ffd70d7dbe8392 Gitweb: https://git.kernel.org/tip/ede271b059463731cbd6dffe55ffd70d7dbe8392 Author: Peter Zijlstra AuthorDate: Thu, 14 Mar 2019 14:01:14 +0100 Committer: Thomas Gleixner CommitDate: Fri, 15 Mar 2019 12:22:51 +0100 perf/x86/intel: Fix memory corruption Through: validate_event() x86_pmu.get_event_constraints(.idx=-1) tfa_get_event_constraints() dyn_constraint() cpuc->constraint_list[-1] is used, which is an obvious out-of-bound access. In this case, simply skip the TFA constraint code, there is no event constraint with just PMC3, therefore the code will never result in the empty set. Fixes: 400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort") Reported-by: Tony Jones Reported-by: "DSouza, Nelson" Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Tony Jones Tested-by: "DSouza, Nelson" Cc: eran...@google.com Cc: jo...@redhat.com Cc: sta...@kernel.org Link: https://lkml.kernel.org/r/20190314130705.441549...@infradead.org --- arch/x86/events/intel/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 35102ecdfc8d..92dfeb343a6a 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3410,7 +3410,7 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int idx, /* * Without TFA we must not use PMC3. */ - if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) { + if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) { c = dyn_constraint(cpuc, c, idx); c->idxmsk64 &= ~(1ULL << 3); c->weight--;
[tip:sched/urgent] sched/cpufreq: Fix 32-bit math overflow
Commit-ID: f1212844e9dc3a31d41f99713c5522acf92ff291 Gitweb: https://git.kernel.org/tip/f1212844e9dc3a31d41f99713c5522acf92ff291 Author: Peter Zijlstra AuthorDate: Tue, 5 Mar 2019 09:32:02 +0100 Committer: Ingo Molnar CommitDate: Sat, 9 Mar 2019 14:03:51 +0100 sched/cpufreq: Fix 32-bit math overflow Vincent Wang reported that get_next_freq() has a mult overflow bug on 32-bit platforms in the IOWAIT boost case, since in that case {util,max} are in freq units instead of capacity units. Solve this by moving the IOWAIT boost to capacity units. And since this means @max is constant; simplify the code. Reported-by: Vincent Wang Tested-by: Vincent Wang Signed-off-by: Peter Zijlstra (Intel) Acked-by: Rafael J. Wysocki Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Chunyan Zhang Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Quentin Perret Cc: Rafael J. Wysocki Cc: Rik van Riel Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190305083202.gu32...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 58 +--- 1 file changed, 24 insertions(+), 34 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index 033ec7c45f13..5a8932ee5112 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -48,10 +48,10 @@ struct sugov_cpu { booliowait_boost_pending; unsigned intiowait_boost; - unsigned intiowait_boost_max; u64 last_update; unsigned long bw_dl; + unsigned long min; unsigned long max; /* The field below is for single-CPU policies only: */ @@ -303,8 +303,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, u64 time, if (delta_ns <= TICK_NSEC) return false; - sg_cpu->iowait_boost = set_iowait_boost - ? sg_cpu->sg_policy->policy->min : 0; + sg_cpu->iowait_boost = set_iowait_boost ? 
sg_cpu->min : 0; sg_cpu->iowait_boost_pending = set_iowait_boost; return true; @@ -344,14 +343,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, /* Double the boost at each request */ if (sg_cpu->iowait_boost) { - sg_cpu->iowait_boost <<= 1; - if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max) - sg_cpu->iowait_boost = sg_cpu->iowait_boost_max; + sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1, SCHED_CAPACITY_SCALE); return; } /* First wakeup after IO: start with minimum boost */ - sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min; + sg_cpu->iowait_boost = sg_cpu->min; } /** @@ -373,47 +370,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, u64 time, * This mechanism is designed to boost high frequently IO waiting tasks, while * being more conservative on tasks which does sporadic IO operations. */ -static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - unsigned long *util, unsigned long *max) +static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, + unsigned long util, unsigned long max) { - unsigned int boost_util, boost_max; + unsigned long boost; /* No boost currently required */ if (!sg_cpu->iowait_boost) - return; + return util; /* Reset boost if the CPU appears to have been idle enough */ if (sugov_iowait_reset(sg_cpu, time, false)) - return; + return util; - /* -* An IO waiting task has just woken up: -* allow to further double the boost value -*/ - if (sg_cpu->iowait_boost_pending) { - sg_cpu->iowait_boost_pending = false; - } else { + if (!sg_cpu->iowait_boost_pending) { /* -* Otherwise: reduce the boost value and disable it when we -* reach the minimum. +* No boost pending; reduce the boost value. 
*/ sg_cpu->iowait_boost >>= 1; - if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) { + if (sg_cpu->iowait_boost < sg_cpu->min) { sg_cpu->iowait_boost = 0; - return; + return util; } } + sg_cpu->iowait_boost_pending = false; + /* -* Apply the current boost value: a CPU is boosted only if its current -* utilization is smaller then the current IO boost level. +* @util is already in capacity scale; convert iowait_boost +* into the same scale so we can compare. */ -
[tip:x86/urgent] x86/mm/cpa: Fix set_mce_nospec()
Commit-ID: 0521e8be211cd20d547bff9da2534b7ed6f2c1b9 Gitweb: https://git.kernel.org/tip/0521e8be211cd20d547bff9da2534b7ed6f2c1b9 Author: Peter Zijlstra AuthorDate: Fri, 8 Feb 2019 13:08:59 +0100 Committer: Thomas Gleixner CommitDate: Fri, 8 Feb 2019 14:31:56 +0100 x86/mm/cpa: Fix set_mce_nospec() The recent commit fe0937b24ff5 ("x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single cpa_flush() function") accidentally made the call to make_addr_canonical_again() go away, which breaks set_mce_nospec(). Re-instate the call to convert the address back into canonical form right before invoking either CLFLUSH or INVLPG. Rename the function while at it to be shorter (and less MAGA). Fixes: fe0937b24ff5 ("x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single cpa_flush() function") Reported-by: Tony Luck Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Tony Luck Cc: Linus Torvalds Cc: Dan Williams Cc: Dave Hansen Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Rik van Riel Link: https://lkml.kernel.org/r/20190208120859.gh32...@hirez.programming.kicks-ass.net --- arch/x86/mm/pageattr.c | 50 +- 1 file changed, 25 insertions(+), 25 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4f8972311a77..14e6119838a6 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -230,6 +230,29 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn) #endif +/* + * See set_mce_nospec(). + * + * Machine check recovery code needs to change cache mode of poisoned pages to + * UC to avoid speculative access logging another error. But passing the + * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a + * speculative access. So we cheat and flip the top bit of the address. This + * works fine for the code that updates the page tables. 
But at the end of the + * process we need to flush the TLB and cache and the non-canonical address + * causes a #GP fault when used by the INVLPG and CLFLUSH instructions. + * + * But in the common case we already have a canonical address. This code + * will fix the top bit if needed and is a no-op otherwise. + */ +static inline unsigned long fix_addr(unsigned long addr) +{ +#ifdef CONFIG_X86_64 + return (long)(addr << 1) >> 1; +#else + return addr; +#endif +} + static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) { if (cpa->flags & CPA_PAGES_ARRAY) { @@ -313,7 +336,7 @@ void __cpa_flush_tlb(void *data) unsigned int i; for (i = 0; i < cpa->numpages; i++) - __flush_tlb_one_kernel(__cpa_addr(cpa, i)); + __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i))); } static void cpa_flush(struct cpa_data *data, int cache) @@ -347,7 +370,7 @@ static void cpa_flush(struct cpa_data *data, int cache) * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range_opt((void *)addr, PAGE_SIZE); + clflush_cache_range_opt((void *)fix_addr(addr), PAGE_SIZE); } mb(); } @@ -1627,29 +1650,6 @@ out: return ret; } -/* - * Machine check recovery code needs to change cache mode of poisoned - * pages to UC to avoid speculative access logging another error. But - * passing the address of the 1:1 mapping to set_memory_uc() is a fine - * way to encourage a speculative access. So we cheat and flip the top - * bit of the address. This works fine for the code that updates the - * page tables. But at the end of the process we need to flush the cache - * and the non-canonical address causes a #GP fault when used by the - * CLFLUSH instruction. - * - * But in the common case we already have a canonical address. This code - * will fix the top bit if needed and is a no-op otherwise. 
- */ -static inline unsigned long make_addr_canonical_again(unsigned long addr) -{ -#ifdef CONFIG_X86_64 - return (long)(addr << 1) >> 1; -#else - return addr; -#endif -} - - static int change_page_attr_set_clr(unsigned long *addr, int numpages, pgprot_t mask_set, pgprot_t mask_clr, int force_split, int in_flag,
[tip:perf/core] perf/x86/intel: Delay memory deallocation until x86_pmu_dead_cpu()
Commit-ID: 602cae04c4864bb3487dfe4c2126c8d9e7e1614a Gitweb: https://git.kernel.org/tip/602cae04c4864bb3487dfe4c2126c8d9e7e1614a Author: Peter Zijlstra AuthorDate: Wed, 19 Dec 2018 17:53:50 +0100 Committer: Ingo Molnar CommitDate: Mon, 4 Feb 2019 08:44:51 +0100 perf/x86/intel: Delay memory deallocation until x86_pmu_dead_cpu() intel_pmu_cpu_prepare() allocated memory for ->shared_regs among other members of struct cpu_hw_events. This memory is released in intel_pmu_cpu_dying() which is wrong. The counterpart of the intel_pmu_cpu_prepare() callback is x86_pmu_dead_cpu(). Otherwise if the CPU fails on the UP path between CPUHP_PERF_X86_PREPARE and CPUHP_AP_PERF_X86_STARTING then it won't release the memory but allocate new memory on the next attempt to online the CPU (leaking the old memory). Also, if the CPU down path fails between CPUHP_AP_PERF_X86_STARTING and CPUHP_PERF_X86_PREPARE then the CPU will go back online but never allocate the memory that was released in x86_pmu_dying_cpu(). Make the memory allocation/free symmetrical in regard to the CPU hotplug notifier by moving the deallocation to intel_pmu_cpu_dead(). This started in commit: a7e3ed1e47011 ("perf: Add support for supplementary event registers"). In principle the bug was introduced in v2.6.39 (!), but it will almost certainly not backport cleanly across the big CPU hotplug rewrite between v4.7-v4.15... [ bigeasy: Added patch description. ] [ mingo: Added backporting guidance. ] Reported-by: He Zhe Signed-off-by: Peter Zijlstra (Intel) # With developer hat on Signed-off-by: Sebastian Andrzej Siewior Signed-off-by: Peter Zijlstra (Intel) # With maintainer hat on Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: a...@kernel.org Cc: b...@alien8.de Cc: h...@zytor.com Cc: jo...@kernel.org Cc: kan.li...@linux.intel.com Cc: namhy...@kernel.org Cc: Fixes: a7e3ed1e47011 ("perf: Add support for supplementary event registers"). 
Link: https://lkml.kernel.org/r/20181219165350.6s3jvyxbibpvl...@linutronix.de Signed-off-by: Ingo Molnar --- arch/x86/events/intel/core.c | 16 +++- 1 file changed, 11 insertions(+), 5 deletions(-) diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 40e12cfc87f6..daafb893449b 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -3558,6 +3558,14 @@ static void free_excl_cntrs(int cpu) } static void intel_pmu_cpu_dying(int cpu) +{ + fini_debug_store_on_cpu(cpu); + + if (x86_pmu.counter_freezing) + disable_counter_freeze(); +} + +static void intel_pmu_cpu_dead(int cpu) { struct cpu_hw_events *cpuc = _cpu(cpu_hw_events, cpu); struct intel_shared_regs *pc; @@ -3570,11 +3578,6 @@ static void intel_pmu_cpu_dying(int cpu) } free_excl_cntrs(cpu); - - fini_debug_store_on_cpu(cpu); - - if (x86_pmu.counter_freezing) - disable_counter_freeze(); } static void intel_pmu_sched_task(struct perf_event_context *ctx, @@ -3663,6 +3666,7 @@ static __initconst const struct x86_pmu core_pmu = { .cpu_prepare= intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, + .cpu_dead = intel_pmu_cpu_dead, }; static struct attribute *intel_pmu_attrs[]; @@ -3703,6 +3707,8 @@ static __initconst const struct x86_pmu intel_pmu = { .cpu_prepare= intel_pmu_cpu_prepare, .cpu_starting = intel_pmu_cpu_starting, .cpu_dying = intel_pmu_cpu_dying, + .cpu_dead = intel_pmu_cpu_dead, + .guest_get_msrs = intel_guest_get_msrs, .sched_task = intel_pmu_sched_task, };
[tip:sched/core] sched/fair: Robustify CFS-bandwidth timer locking
Commit-ID: c0ad4aa4d8416a39ad262a2bd68b30acd951bf0e Gitweb: https://git.kernel.org/tip/c0ad4aa4d8416a39ad262a2bd68b30acd951bf0e Author: Peter Zijlstra AuthorDate: Mon, 7 Jan 2019 13:52:31 +0100 Committer: Ingo Molnar CommitDate: Sun, 27 Jan 2019 12:29:37 +0100 sched/fair: Robustify CFS-bandwidth timer locking Traditionally hrtimer callbacks were run with IRQs disabled, but with the introduction of HRTIMER_MODE_SOFT it is possible they run from SoftIRQ context, which does _NOT_ have IRQs disabled. Allow for the CFS bandwidth timers (period_timer and slack_timer) to be run from SoftIRQ context; this entails removing the assumption that IRQs are already disabled from the locking. While mainline doesn't strictly need this, -RT forces all timers not explicitly marked with MODE_HARD into MODE_SOFT and trips over this. And marking these timers as MODE_HARD doesn't make sense as they're not required for RT operation and can potentially be quite expensive. Reported-by: Tom Putzeys Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190107125231.ge14...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 -- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1374fbddd0d..3b61e19b504a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4565,7 +4565,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, struct rq *rq = rq_of(cfs_rq); struct rq_flags rf; - rq_lock(rq, ); + rq_lock_irqsave(rq, ); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4582,7 +4582,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - rq_unlock(rq, ); + rq_unlock_irqrestore(rq, ); if (!remaining) break; @@ -4598,7 +4598,7 @@ next: * period the timer is deactivated until scheduling resumes; cfs_b->idle is * 
used to track this state. */ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { u64 runtime, runtime_expires; int throttled; @@ -4640,11 +4640,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; cfs_b->distribute_running = 1; - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(_b->throttled_cfs_rq); @@ -4753,17 +4753,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); if (cfs_b->distribute_running) { - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); return; } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); return; } @@ -4774,18 +4775,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (runtime) cfs_b->distribute_running = 1; - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); if (!runtime) return; runtime = distribute_cfs_runtime(cfs_b, runtime, expires); - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); if (expires == cfs_b->runtime_expires) lsub_positive(_b->runtime, runtime); cfs_b->distribute_running = 0; - raw_spin_unlock(_b->lock); + 
raw_spin_unlock_irqrestore(_b->lock, flags); } /* @@ -4863,20 +4864,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b =
[tip:sched/core] sched/fair: Robustify CFS-bandwidth timer locking
Commit-ID: 3cd126af79ed5a4d6b06eba63d3349e143a3bd3b Gitweb: https://git.kernel.org/tip/3cd126af79ed5a4d6b06eba63d3349e143a3bd3b Author: Peter Zijlstra AuthorDate: Mon, 7 Jan 2019 13:52:31 +0100 Committer: Ingo Molnar CommitDate: Mon, 21 Jan 2019 14:40:28 +0100 sched/fair: Robustify CFS-bandwidth timer locking Traditionally hrtimer callbacks were run with IRQs disabled, but with the introduction of HRTIMER_MODE_SOFT it is possible they run from SoftIRQ context, which does _NOT_ have IRQs disabled. Allow for the CFS bandwidth timers (period_timer and slack_timer) to be run from SoftIRQ context; this entails removing the assumption that IRQs are already disabled from the locking. While mainline doesn't strictly need this, -RT forces all timers not explicitly marked with MODE_HARD into MODE_SOFT and trips over this. And marking these timers as MODE_HARD doesn't make sense as they're not required for RT operation and can potentially be quite expensive. Reported-by: Tom Putzeys Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190107125231.ge14...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 -- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index b1374fbddd0d..3b61e19b504a 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4565,7 +4565,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, struct rq *rq = rq_of(cfs_rq); struct rq_flags rf; - rq_lock(rq, ); + rq_lock_irqsave(rq, ); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4582,7 +4582,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - rq_unlock(rq, ); + rq_unlock_irqrestore(rq, ); if (!remaining) break; @@ -4598,7 +4598,7 @@ next: * period the timer is deactivated until scheduling resumes; cfs_b->idle is * 
used to track this state. */ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { u64 runtime, runtime_expires; int throttled; @@ -4640,11 +4640,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; cfs_b->distribute_running = 1; - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(_b->throttled_cfs_rq); @@ -4753,17 +4753,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); if (cfs_b->distribute_running) { - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); return; } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); return; } @@ -4774,18 +4775,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (runtime) cfs_b->distribute_running = 1; - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); if (!runtime) return; runtime = distribute_cfs_runtime(cfs_b, runtime, expires); - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); if (expires == cfs_b->runtime_expires) lsub_positive(_b->runtime, runtime); cfs_b->distribute_running = 0; - raw_spin_unlock(_b->lock); + 
raw_spin_unlock_irqrestore(_b->lock, flags); } /* @@ -4863,20 +4864,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b =
[tip:sched/core] sched/fair: Robustify CFS-bandwidth timer locking
Commit-ID: b733c2d2f2810ec8556d2d711d1b95f491bd7697 Gitweb: https://git.kernel.org/tip/b733c2d2f2810ec8556d2d711d1b95f491bd7697 Author: Peter Zijlstra AuthorDate: Mon, 7 Jan 2019 13:52:31 +0100 Committer: Ingo Molnar CommitDate: Mon, 21 Jan 2019 11:27:55 +0100 sched/fair: Robustify CFS-bandwidth timer locking Traditionally hrtimer callbacks were run with IRQs disabled, but with the introduction of HRTIMER_MODE_SOFT it is possible they run from SoftIRQ context, which does _NOT_ have IRQs disabled. Allow for the CFS bandwidth timers (period_timer and slack_timer) to be run from SoftIRQ context; this entails removing the assumption that IRQs are already disabled from the locking. While mainline doesn't strictly need this, -RT forces all timers not explicitly marked with MODE_HARD into MODE_SOFT and trips over this. And marking these timers as MODE_HARD doesn't make sense as they're not required for RT operation and can potentially be quite expensive. Reported-by: Tom Putzeys Tested-by: Mike Galbraith Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Sebastian Andrzej Siewior Cc: Thomas Gleixner Link: https://lkml.kernel.org/r/20190107125231.ge14...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/fair.c | 30 -- 1 file changed, 16 insertions(+), 14 deletions(-) diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c index 385725eb3bd6..90c7a7bf45d3 100644 --- a/kernel/sched/fair.c +++ b/kernel/sched/fair.c @@ -4565,7 +4565,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, struct rq *rq = rq_of(cfs_rq); struct rq_flags rf; - rq_lock(rq, ); + rq_lock_irqsave(rq, ); if (!cfs_rq_throttled(cfs_rq)) goto next; @@ -4582,7 +4582,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth *cfs_b, unthrottle_cfs_rq(cfs_rq); next: - rq_unlock(rq, ); + rq_unlock_irqrestore(rq, ); if (!remaining) break; @@ -4598,7 +4598,7 @@ next: * period the timer is deactivated until scheduling resumes; cfs_b->idle is * 
used to track this state. */ -static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) +static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, unsigned long flags) { u64 runtime, runtime_expires; int throttled; @@ -4640,11 +4640,11 @@ static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun) while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) { runtime = cfs_b->runtime; cfs_b->distribute_running = 1; - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); /* we can't nest cfs_b->lock while distributing bandwidth */ runtime = distribute_cfs_runtime(cfs_b, runtime, runtime_expires); - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); cfs_b->distribute_running = 0; throttled = !list_empty(_b->throttled_cfs_rq); @@ -4753,17 +4753,18 @@ static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq) static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) { u64 runtime = 0, slice = sched_cfs_bandwidth_slice(); + unsigned long flags; u64 expires; /* confirm we're still not at a refresh boundary */ - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); if (cfs_b->distribute_running) { - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); return; } if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) { - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); return; } @@ -4774,18 +4775,18 @@ static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b) if (runtime) cfs_b->distribute_running = 1; - raw_spin_unlock(_b->lock); + raw_spin_unlock_irqrestore(_b->lock, flags); if (!runtime) return; runtime = distribute_cfs_runtime(cfs_b, runtime, expires); - raw_spin_lock(_b->lock); + raw_spin_lock_irqsave(_b->lock, flags); if (expires == cfs_b->runtime_expires) lsub_positive(_b->runtime, runtime); cfs_b->distribute_running = 0; - raw_spin_unlock(_b->lock); + 
raw_spin_unlock_irqrestore(_b->lock, flags); } /* @@ -4863,20 +4864,21 @@ static enum hrtimer_restart sched_cfs_period_timer(struct hrtimer *timer) { struct cfs_bandwidth *cfs_b =
[tip:x86/mm] x86/mm/cpa: Rename @addrinarray to @numpages
Commit-ID: 3c567356dbe0da4fc310cfcffafc39526e1ca43a Gitweb: https://git.kernel.org/tip/3c567356dbe0da4fc310cfcffafc39526e1ca43a Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:53 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:30 +0100 x86/mm/cpa: Rename @addrinarray to @numpages The CPA_ARRAY interface works in single pages, and everywhere, except in these 'few' locations, this variable is called 'numpages'. Remove this 'addrinarray' aberration and use 'numpages' consistently. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.695039...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 52 +- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 7d05149995dc..df4340c8e293 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1808,14 +1808,14 @@ out_err: } EXPORT_SYMBOL(set_memory_uc); -static int _set_memory_array(unsigned long *addr, int addrinarray, +static int _set_memory_array(unsigned long *addr, int numpages, enum page_cache_mode new_type) { enum page_cache_mode set_type; int i, j; int ret; - for (i = 0; i < addrinarray; i++) { + for (i = 0; i < numpages; i++) { ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE, new_type, NULL); if (ret) @@ -1826,11 +1826,11 @@ static int _set_memory_array(unsigned long *addr, int addrinarray, set_type = (new_type == _PAGE_CACHE_MODE_WC) ? 
_PAGE_CACHE_MODE_UC_MINUS : new_type; - ret = change_page_attr_set(addr, addrinarray, + ret = change_page_attr_set(addr, numpages, cachemode2pgprot(set_type), 1); if (!ret && new_type == _PAGE_CACHE_MODE_WC) - ret = change_page_attr_set_clr(addr, addrinarray, + ret = change_page_attr_set_clr(addr, numpages, cachemode2pgprot( _PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), @@ -1847,21 +1847,21 @@ out_free: return ret; } -int set_memory_array_uc(unsigned long *addr, int addrinarray) +int set_memory_array_uc(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_UC_MINUS); } EXPORT_SYMBOL(set_memory_array_uc); -int set_memory_array_wc(unsigned long *addr, int addrinarray) +int set_memory_array_wc(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WC); } EXPORT_SYMBOL(set_memory_array_wc); -int set_memory_array_wt(unsigned long *addr, int addrinarray) +int set_memory_array_wt(unsigned long *addr, int numpages) { - return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT); + return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WT); } EXPORT_SYMBOL_GPL(set_memory_array_wt); @@ -1941,18 +1941,18 @@ int set_memory_wb(unsigned long addr, int numpages) } EXPORT_SYMBOL(set_memory_wb); -int set_memory_array_wb(unsigned long *addr, int addrinarray) +int set_memory_array_wb(unsigned long *addr, int numpages) { int i; int ret; /* WB cache mode is hard wired to all cache attribute bits being 0 */ - ret = change_page_attr_clear(addr, addrinarray, + ret = change_page_attr_clear(addr, numpages, __pgprot(_PAGE_CACHE_MASK), 1); if (ret) return ret; - for (i = 0; i < addrinarray; i++) + for (i = 0; i < numpages; i++) free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE); return 0; @@ -2082,7 +2082,7 @@ int set_pages_uc(struct page *page, int 
numpages) } EXPORT_SYMBOL(set_pages_uc); -static int _set_pages_array(struct page **pages, int addrinarray, +static int _set_pages_array(struct page **pages, int numpages, enum page_cache_mode new_type) { unsigned long start; @@ -2092,7 +2092,7 @@ static int _set_pages_array(struct page **pages, int addrinarray, int free_idx; int ret; - for (i = 0; i < addrinarray; i++) { + for (i = 0; i < numpages; i++) { if (PageHighMem(pages[i])) continue; start = page_to_pfn(pages[i]) << PAGE_SHIFT; @@ -2105,10 +2105,10 @@ static int
[tip:x86/mm] x86/mm/cpa: Better use CLFLUSHOPT
Commit-ID: c38116bb940ae37f51fccd315b420ee5961dcb76 Gitweb: https://git.kernel.org/tip/c38116bb940ae37f51fccd315b420ee5961dcb76 Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:52 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:29 +0100 x86/mm/cpa: Better use CLFLUSHOPT Currently we issue an MFENCE before and after flushing a range. This means that if we flush a bunch of single page ranges -- like with the cpa array, we issue a whole bunch of superfluous MFENCEs. Reorganize the code a little to avoid this. [ mingo: capitalize instructions, tweak changelog and comments. ] Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.626999...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 29 + 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 85ef53b86fa0..7d05149995dc 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -251,15 +251,7 @@ static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) * Flushing functions */ -/** - * clflush_cache_range - flush a cache range with clflush - * @vaddr: virtual start address - * @size: number of bytes to flush - * - * clflushopt is an unordered instruction which needs fencing with mfence or - * sfence to avoid ordering issues. 
- */ -void clflush_cache_range(void *vaddr, unsigned int size) +static void clflush_cache_range_opt(void *vaddr, unsigned int size) { const unsigned long clflush_size = boot_cpu_data.x86_clflush_size; void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1)); @@ -268,11 +260,22 @@ void clflush_cache_range(void *vaddr, unsigned int size) if (p >= vend) return; - mb(); - for (; p < vend; p += clflush_size) clflushopt(p); +} +/** + * clflush_cache_range - flush a cache range with clflush + * @vaddr: virtual start address + * @size: number of bytes to flush + * + * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or + * SFENCE to avoid ordering issues. + */ +void clflush_cache_range(void *vaddr, unsigned int size) +{ + mb(); + clflush_cache_range_opt(vaddr, size); mb(); } EXPORT_SYMBOL_GPL(clflush_cache_range); @@ -333,6 +336,7 @@ static void cpa_flush(struct cpa_data *data, int cache) if (!cache) return; + mb(); for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; @@ -343,8 +347,9 @@ static void cpa_flush(struct cpa_data *data, int cache) * Only flush present addresses: */ if (pte && (pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range((void *)addr, PAGE_SIZE); + clflush_cache_range_opt((void *)addr, PAGE_SIZE); } + mb(); } static bool overlaps(unsigned long r1_start, unsigned long r1_end,
[tip:x86/mm] x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single cpa_flush() function
Commit-ID: fe0937b24ff5d7b343b9922201e469f9a6009d9d Gitweb: https://git.kernel.org/tip/fe0937b24ff5d7b343b9922201e469f9a6009d9d Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:51 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:28 +0100 x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single cpa_flush() function Note that the cache flush loop in cpa_flush_*() is identical when we use __cpa_addr(); further observe that flush_tlb_kernel_range() is a special case of the cpa_flush_array() TLB invalidation code. This then means the two functions are virtually identical. Fold these two functions into a single cpa_flush() call. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.559855...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 92 ++ 1 file changed, 18 insertions(+), 74 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 12b69263e501..85ef53b86fa0 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -304,51 +304,7 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static bool __inv_flush_all(int cache) -{ - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - - if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { - cpa_flush_all(cache); - return true; - } - - return false; -} - -static void cpa_flush_range(unsigned long start, int numpages, int cache) -{ - unsigned int i, level; - unsigned long addr; - - WARN_ON(PAGE_ALIGN(start) != start); - - if (__inv_flush_all(cache)) - return; - - flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - - if (!cache) - return; - - /* -* We only need to flush on one CPU, -* clflush is a MESI-coherent instruction that -* will cause all 
other CPUs to flush the same -* cachelines: -*/ - for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) { - pte_t *pte = lookup_address(addr, ); - - /* -* Only flush present addresses: -*/ - if (pte && (pte_val(*pte) & _PAGE_PRESENT)) - clflush_cache_range((void *) addr, PAGE_SIZE); - } -} - -void __cpa_flush_array(void *data) +void __cpa_flush_tlb(void *data) { struct cpa_data *cpa = data; unsigned int i; @@ -357,33 +313,31 @@ void __cpa_flush_array(void *data) __flush_tlb_one_kernel(__cpa_addr(cpa, i)); } -static void cpa_flush_array(struct cpa_data *cpa, int cache) +static void cpa_flush(struct cpa_data *data, int cache) { + struct cpa_data *cpa = data; unsigned int i; - if (cpa_check_flush_all(cache)) + BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); return; + } if (cpa->numpages <= tlb_single_page_flush_ceiling) - on_each_cpu(__cpa_flush_array, cpa, 1); + on_each_cpu(__cpa_flush_tlb, cpa, 1); else flush_tlb_all(); if (!cache) return; - /* -* We only need to flush on one CPU, -* clflush is a MESI-coherent instruction that -* will cause all other CPUs to flush the same -* cachelines: -*/ for (i = 0; i < cpa->numpages; i++) { unsigned long addr = __cpa_addr(cpa, i); unsigned int level; - pte_t *pte; - pte = lookup_address(addr, ); + pte_t *pte = lookup_address(addr, ); /* * Only flush present addresses: @@ -1698,7 +1652,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, { struct cpa_data cpa; int ret, cache, checkalias; - unsigned long baddr = 0; memset(, 0, sizeof(cpa)); @@ -1732,11 +1685,6 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, */ WARN_ON_ONCE(1); } - /* -* Save address for cache flush. *addr is modified in the call -* to __change_page_attr_set_clr() below. 
-*/ - baddr = make_addr_canonical_again(*addr); } /* Must avoid aliasing mappings in the highmem code */ @@ -1784,11 +1732,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, goto out; }
[tip:x86/mm] x86/mm/cpa: Make cpa_data::numpages invariant
Commit-ID: 83b4e39146aa70913580966e0f2b78b7c3492760 Gitweb: https://git.kernel.org/tip/83b4e39146aa70913580966e0f2b78b7c3492760 Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:50 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:27 +0100 x86/mm/cpa: Make cpa_data::numpages invariant Make sure __change_page_attr_set_clr() doesn't modify cpa->numpages. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.493000...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 21 + 1 file changed, 13 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 351874259a71..12b69263e501 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1625,14 +1625,15 @@ static int cpa_process_alias(struct cpa_data *cpa) static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) { unsigned long numpages = cpa->numpages; - int ret; + unsigned long rempages = numpages; + int ret = 0; - while (numpages) { + while (rempages) { /* * Store the remaining nr of pages for the large page * preservation check. */ - cpa->numpages = numpages; + cpa->numpages = rempages; /* for array changes, we can't use large page */ if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY)) cpa->numpages = 1; @@ -1643,12 +1644,12 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) if (!debug_pagealloc_enabled()) spin_unlock(_lock); if (ret) - return ret; + goto out; if (checkalias) { ret = cpa_process_alias(cpa); if (ret) - return ret; + goto out; } /* @@ -1656,11 +1657,15 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) * CPA operation. Either a large page has been * preserved or a single page update happened. 
*/ - BUG_ON(cpa->numpages > numpages || !cpa->numpages); - numpages -= cpa->numpages; + BUG_ON(cpa->numpages > rempages || !cpa->numpages); + rempages -= cpa->numpages; cpa->curpage += cpa->numpages; } - return 0; + +out: + /* Restore the original numpages */ + cpa->numpages = numpages; + return ret; } /*
[tip:x86/mm] x86/mm/cpa: Optimize cpa_flush_array() TLB invalidation
Commit-ID: 935f5839827ef54b53406e80906f7c355eb73c1b Gitweb: https://git.kernel.org/tip/935f5839827ef54b53406e80906f7c355eb73c1b Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:49 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:26 +0100 x86/mm/cpa: Optimize cpa_flush_array() TLB invalidation Instead of punting and doing flush_tlb_all(), do the same as flush_tlb_kernel_range() does and use single page invalidations. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.430001...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/mm_internal.h | 2 ++ arch/x86/mm/pageattr.c| 42 -- arch/x86/mm/tlb.c | 4 +++- 3 files changed, 29 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h index 4e1f6e1b8159..319bde386d5f 100644 --- a/arch/x86/mm/mm_internal.h +++ b/arch/x86/mm/mm_internal.h @@ -19,4 +19,6 @@ extern int after_bootmem; void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache); +extern unsigned long tlb_single_page_flush_ceiling; + #endif /* __X86_MM_INTERNAL_H */ diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index afa98b7b6050..351874259a71 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -26,6 +26,8 @@ #include #include +#include "mm_internal.h" + /* * The current flushing context - we pass it instead of 5 arguments: */ @@ -346,16 +348,26 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void cpa_flush_array(unsigned long baddr, unsigned long *start, - int numpages, int cache, - int in_flags, struct page **pages) +void __cpa_flush_array(void *data) { - unsigned int i, level; + struct cpa_data *cpa = data; + unsigned int i; - if (__inv_flush_all(cache)) + for (i = 0; i < 
cpa->numpages; i++) + __flush_tlb_one_kernel(__cpa_addr(cpa, i)); +} + +static void cpa_flush_array(struct cpa_data *cpa, int cache) +{ + unsigned int i; + + if (cpa_check_flush_all(cache)) return; - flush_tlb_all(); + if (cpa->numpages <= tlb_single_page_flush_ceiling) + on_each_cpu(__cpa_flush_array, cpa, 1); + else + flush_tlb_all(); if (!cache) return; @@ -366,15 +378,11 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, * will cause all other CPUs to flush the same * cachelines: */ - for (i = 0; i < numpages; i++) { - unsigned long addr; + for (i = 0; i < cpa->numpages; i++) { + unsigned long addr = __cpa_addr(cpa, i); + unsigned int level; pte_t *pte; - if (in_flags & CPA_PAGES_ARRAY) - addr = (unsigned long)page_address(pages[i]); - else - addr = start[i]; - pte = lookup_address(addr, ); /* @@ -1771,12 +1779,10 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, goto out; } - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(baddr, addr, numpages, cache, - cpa.flags, pages); - } else { + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) + cpa_flush_array(, cache); + else cpa_flush_range(baddr, numpages, cache); - } out: return ret; diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c index 03b6b4c2238d..999d6d8f0bef 100644 --- a/arch/x86/mm/tlb.c +++ b/arch/x86/mm/tlb.c @@ -15,6 +15,8 @@ #include #include +#include "mm_internal.h" + /* * TLB flushing, formerly SMP-only * c/o Linus Torvalds. @@ -721,7 +723,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask, * * This is in units of pages. */ -static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; +unsigned long tlb_single_page_flush_ceiling __read_mostly = 33; void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, unsigned long end, unsigned int stride_shift,
[tip:x86/mm] x86/mm/cpa: Simplify the code after making cpa->vaddr invariant
Commit-ID: 5fe26b7a8f4693d532c7a3c3632e47e7d7016238 Gitweb: https://git.kernel.org/tip/5fe26b7a8f4693d532c7a3c3632e47e7d7016238 Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:48 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:25 +0100 x86/mm/cpa: Simplify the code after making cpa->vaddr invariant Since cpa->vaddr is invariant, this means we can remove all workarounds that deal with it changing. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.366619...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 7 ++- arch/x86/mm/pageattr.c | 13 - 2 files changed, 6 insertions(+), 14 deletions(-) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index b6b6468530f1..facce271e8b9 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -124,7 +124,6 @@ static int pageattr_test(void) unsigned int level; int i, k; int err; - unsigned long test_addr; if (print) printk(KERN_INFO "CPA self-test:\n"); @@ -181,8 +180,7 @@ static int pageattr_test(void) switch (i % 3) { case 0: - test_addr = addr[i]; - err = change_page_attr_set(_addr, len[i], PAGE_CPA_TEST, 0); + err = change_page_attr_set([i], len[i], PAGE_CPA_TEST, 0); break; case 1: @@ -226,8 +224,7 @@ static int pageattr_test(void) failed++; continue; } - test_addr = addr[i]; - err = change_page_attr_clear(_addr, len[i], PAGE_CPA_TEST, 0); + err = change_page_attr_clear([i], len[i], PAGE_CPA_TEST, 0); if (err < 0) { printk(KERN_ERR "CPA reverting failed: %d\n", err); failed++; diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index ce8af3f08628..afa98b7b6050 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -1908,15 +1908,13 @@ EXPORT_SYMBOL_GPL(set_memory_array_wt); int 
_set_memory_wc(unsigned long addr, int numpages) { int ret; - unsigned long addr_copy = addr; ret = change_page_attr_set(, numpages, cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS), 0); if (!ret) { - ret = change_page_attr_set_clr(_copy, numpages, - cachemode2pgprot( - _PAGE_CACHE_MODE_WC), + ret = change_page_attr_set_clr(, numpages, + cachemode2pgprot(_PAGE_CACHE_MODE_WC), __pgprot(_PAGE_CACHE_MASK), 0, 0, NULL); } @@ -2064,7 +2062,6 @@ int set_memory_global(unsigned long addr, int numpages) static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) { struct cpa_data cpa; - unsigned long start; int ret; /* Nothing to do if memory encryption is not active */ @@ -2075,8 +2072,6 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr)) addr &= PAGE_MASK; - start = addr; - memset(, 0, sizeof(cpa)); cpa.vaddr = cpa.numpages = numpages; @@ -2091,7 +2086,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - cpa_flush_range(start, numpages, 1); + cpa_flush_range(addr, numpages, 1); ret = __change_page_attr_set_clr(, 1); @@ -2102,7 +2097,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) * in case TLB flushing gets optimized in the cpa_flush_range() * path use the same logic as above. */ - cpa_flush_range(start, numpages, 0); + cpa_flush_range(addr, numpages, 0); return ret; }
[tip:x86/mm] x86/mm/cpa: Make cpa_data::vaddr invariant
Commit-ID: 98bfc9b038cde1ce108f69a50720e394fe774cb7 Gitweb: https://git.kernel.org/tip/98bfc9b038cde1ce108f69a50720e394fe774cb7 Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:47 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:24 +0100 x86/mm/cpa: Make cpa_data::vaddr invariant Currently __change_page_attr_set_clr() will modify cpa->vaddr when !(CPA_ARRAY | CPA_PAGES_ARRAY), whereas in the array cases it will increment cpa->curpage. Change __cpa_addr() such that its @idx argument also works in the !array case and use cpa->curpage increments for all cases. NOTE: since cpa_data::numpages is 'unsigned long' so should cpa_data::curpage be. NOTE: after this only cpa->numpages is still modified. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.295174...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 18 -- 1 file changed, 8 insertions(+), 10 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 6e6900ebea30..ce8af3f08628 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -35,11 +35,11 @@ struct cpa_data { pgprot_tmask_set; pgprot_tmask_clr; unsigned long numpages; - int flags; + unsigned long curpage; unsigned long pfn; - unsignedforce_split : 1, + unsigned intflags; + unsigned intforce_split : 1, force_static_prot : 1; - int curpage; struct page **pages; }; @@ -228,7 +228,7 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn) #endif -static unsigned long __cpa_addr(struct cpa_data *cpa, int idx) +static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx) { if (cpa->flags & CPA_PAGES_ARRAY) { struct page *page = cpa->pages[idx]; @@ -242,7 +242,7 @@ static unsigned long __cpa_addr(struct cpa_data *cpa, int idx) if (cpa->flags & 
CPA_ARRAY) return cpa->vaddr[idx]; - return *cpa->vaddr; + return *cpa->vaddr + idx * PAGE_SIZE; } /* @@ -1581,6 +1581,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa = *cpa; alias_cpa.vaddr = alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + alias_cpa.curpage = 0; ret = __change_page_attr_set_clr(_cpa, 0); if (ret) @@ -1600,6 +1601,7 @@ static int cpa_process_alias(struct cpa_data *cpa) alias_cpa = *cpa; alias_cpa.vaddr = _cpa_vaddr; alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY); + alias_cpa.curpage = 0; /* * The high mapping range is imprecise, so ignore the @@ -1648,11 +1650,7 @@ static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias) */ BUG_ON(cpa->numpages > numpages || !cpa->numpages); numpages -= cpa->numpages; - if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) - cpa->curpage++; - else - *cpa->vaddr += cpa->numpages * PAGE_SIZE; - + cpa->curpage += cpa->numpages; } return 0; }
[tip:x86/mm] x86/mm/cpa: Add ARRAY and PAGES_ARRAY selftests
Commit-ID: ecc729f1f47142ad31741549f400b611435c1af7 Gitweb: https://git.kernel.org/tip/ecc729f1f47142ad31741549f400b611435c1af7 Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:45 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:22 +0100 x86/mm/cpa: Add ARRAY and PAGES_ARRAY selftests The current pageattr-test code only uses the regular range interface, add code that also tests the array and pages interface. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.162771...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr-test.c | 28 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c index 08f8f76a4852..b6b6468530f1 100644 --- a/arch/x86/mm/pageattr-test.c +++ b/arch/x86/mm/pageattr-test.c @@ -23,7 +23,8 @@ static __read_mostly int print = 1; enum { - NTEST = 400, + NTEST = 3 * 100, + NPAGES = 100, #ifdef CONFIG_X86_64 LPS = (1 << PMD_SHIFT), #elif defined(CONFIG_X86_PAE) @@ -110,6 +111,9 @@ static int print_split(struct split_state *s) static unsigned long addr[NTEST]; static unsigned int len[NTEST]; +static struct page *pages[NPAGES]; +static unsigned long addrs[NPAGES]; + /* Change the global bit on random pages in the direct mapping */ static int pageattr_test(void) { @@ -137,7 +141,7 @@ static int pageattr_test(void) unsigned long pfn = prandom_u32() % max_pfn_mapped; addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT); - len[i] = prandom_u32() % 100; + len[i] = prandom_u32() % NPAGES; len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1); if (len[i] == 0) @@ -167,14 +171,30 @@ static int pageattr_test(void) break; } __set_bit(pfn + k, bm); + addrs[k] = addr[i] + k*PAGE_SIZE; + pages[k] = pfn_to_page(pfn + k); } if 
(!addr[i] || !pte || !k) { addr[i] = 0; continue; } - test_addr = addr[i]; - err = change_page_attr_set(_addr, len[i], PAGE_CPA_TEST, 0); + switch (i % 3) { + case 0: + test_addr = addr[i]; + err = change_page_attr_set(_addr, len[i], PAGE_CPA_TEST, 0); + break; + + case 1: + err = change_page_attr_set(addrs, len[1], PAGE_CPA_TEST, 1); + break; + + case 2: + err = cpa_set_pages_array(pages, len[i], PAGE_CPA_TEST); + break; + } + + if (err < 0) { printk(KERN_ERR "CPA %d failed %d\n", i, err); failed++;
[tip:x86/mm] x86/mm/cpa: Add __cpa_addr() helper
Commit-ID: 16ebf031e8ab73779a382c9f2b097891da6af923 Gitweb: https://git.kernel.org/tip/16ebf031e8ab73779a382c9f2b097891da6af923 Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:46 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:54:23 +0100 x86/mm/cpa: Add __cpa_addr() helper The code to compute the virtual address of a cpa_data is duplicated; introduce a helper before more copies happen. Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: tom.stde...@amd.com Cc: dave.han...@intel.com Link: http://lkml.kernel.org/r/20181203171043.229119...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 38 +++--- 1 file changed, 19 insertions(+), 19 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a1bcde35db4c..6e6900ebea30 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -228,6 +228,23 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn) #endif +static unsigned long __cpa_addr(struct cpa_data *cpa, int idx) +{ + if (cpa->flags & CPA_PAGES_ARRAY) { + struct page *page = cpa->pages[idx]; + + if (unlikely(PageHighMem(page))) + return 0; + + return (unsigned long)page_address(page); + } + + if (cpa->flags & CPA_ARRAY) + return cpa->vaddr[idx]; + + return *cpa->vaddr; +} + /* * Flushing functions */ @@ -1476,15 +1493,7 @@ static int __change_page_attr(struct cpa_data *cpa, int primary) unsigned int level; pte_t *kpte, old_pte; - if (cpa->flags & CPA_PAGES_ARRAY) { - struct page *page = cpa->pages[cpa->curpage]; - if (unlikely(PageHighMem(page))) - return 0; - address = (unsigned long)page_address(page); - } else if (cpa->flags & CPA_ARRAY) - address = cpa->vaddr[cpa->curpage]; - else - address = *cpa->vaddr; + address = __cpa_addr(cpa, cpa->curpage); repeat: kpte = _lookup_address_cpa(cpa, address, ); if (!kpte) @@ -1565,16 +1574,7 @@ static int 
cpa_process_alias(struct cpa_data *cpa) * No need to redo, when the primary call touched the direct * mapping already: */ - if (cpa->flags & CPA_PAGES_ARRAY) { - struct page *page = cpa->pages[cpa->curpage]; - if (unlikely(PageHighMem(page))) - return 0; - vaddr = (unsigned long)page_address(page); - } else if (cpa->flags & CPA_ARRAY) - vaddr = cpa->vaddr[cpa->curpage]; - else - vaddr = *cpa->vaddr; - + vaddr = __cpa_addr(cpa, cpa->curpage); if (!(within(vaddr, PAGE_OFFSET, PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT {
[tip:x86/mm] x86/mm/cpa: Fix cpa_flush_array() TLB invalidation
Commit-ID: 721066dfd4d5c0fee5772c777d6930d0f423b4eb Gitweb: https://git.kernel.org/tip/721066dfd4d5c0fee5772c777d6930d0f423b4eb Author: Peter Zijlstra AuthorDate: Mon, 3 Dec 2018 18:03:44 +0100 Committer: Ingo Molnar CommitDate: Mon, 17 Dec 2018 18:48:09 +0100 x86/mm/cpa: Fix cpa_flush_array() TLB invalidation In commit: a7295fd53c39 ("x86/mm/cpa: Use flush_tlb_kernel_range()") I misread the CPA array code and incorrectly used flush_tlb_kernel_range(), resulting in missing TLB flushes and consequent failures. Instead do a full invalidate in this case -- for now. Reported-by: StDenis, Tom Signed-off-by: Peter Zijlstra (Intel) Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Dave Hansen Cc: H. Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Rik van Riel Cc: Thomas Gleixner Cc: dave.han...@intel.com Fixes: a7295fd53c39 ("x86/mm/cpa: Use flush_tlb_kernel_range()") Link: http://lkml.kernel.org/r/20181203171043.089868...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/mm/pageattr.c | 24 1 file changed, 16 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index db7a10082238..a1bcde35db4c 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,20 +285,16 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static bool __cpa_flush_range(unsigned long start, int numpages, int cache) +static bool __inv_flush_all(int cache) { BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - WARN_ON(PAGE_ALIGN(start) != start); - if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return true; } - flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - - return !cache; + return false; } static void cpa_flush_range(unsigned long start, int numpages, int cache) @@ -306,7 +302,14 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) unsigned int i, level; unsigned long addr; - if (__cpa_flush_range(start, numpages, cache)) + 
WARN_ON(PAGE_ALIGN(start) != start); + + if (__inv_flush_all(cache)) + return; + + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); + + if (!cache) return; /* @@ -332,7 +335,12 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, { unsigned int i, level; - if (__cpa_flush_range(baddr, numpages, cache)) + if (__inv_flush_all(cache)) + return; + + flush_tlb_all(); + + if (!cache) return; /*
[tip:x86/pti] sched/smt: Make sched_smt_present track topology
Commit-ID: c5511d03ec090980732e929c318a7a6374b5550e Gitweb: https://git.kernel.org/tip/c5511d03ec090980732e929c318a7a6374b5550e Author: Peter Zijlstra (Intel) AuthorDate: Sun, 25 Nov 2018 19:33:36 +0100 Committer: Thomas Gleixner CommitDate: Wed, 28 Nov 2018 11:57:06 +0100 sched/smt: Make sched_smt_present track topology Currently the 'sched_smt_present' static key is enabled when at CPU bringup SMT topology is observed, but it is never disabled. However there is demand to also disable the key when the topology changes such that there is no SMT present anymore. Implement this by making the key count the number of cores that have SMT enabled. In particular, the SMT topology bits are set before interrupts are enabled and similarly, are cleared after interrupts are disabled for the last time and the CPU dies. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.246110...@linutronix.de --- kernel/sched/core.c | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 091e089063be..6fedf3a98581 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu) #ifdef CONFIG_SCHED_SMT /* -* The sched_smt_present static key needs to be evaluated on every -* hotplug event because at boot time SMT might be disabled when -* the number of booted CPUs is limited. -* -* If then later a sibling gets hotplugged, then the key would stay -* off and SMT scheduling would never be functional. 
+* When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) > 1) - static_branch_enable_cpuslocked(_smt_present); + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(_smt_present); #endif set_cpu_active(cpu, true); @@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu_mult(call_rcu, call_rcu_sched); +#ifdef CONFIG_SCHED_SMT + /* +* When going down, decrement the number of cores with SMT present. +*/ + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(_smt_present); +#endif + if (!sched_smp_initialized) return 0;
[tip:x86/pti] sched/smt: Make sched_smt_present track topology
Commit-ID: c5511d03ec090980732e929c318a7a6374b5550e Gitweb: https://git.kernel.org/tip/c5511d03ec090980732e929c318a7a6374b5550e Author: Peter Zijlstra (Intel) AuthorDate: Sun, 25 Nov 2018 19:33:36 +0100 Committer: Thomas Gleixner CommitDate: Wed, 28 Nov 2018 11:57:06 +0100 sched/smt: Make sched_smt_present track topology Currently the 'sched_smt_present' static key is enabled when at CPU bringup SMT topology is observed, but it is never disabled. However there is demand to also disable the key when the topology changes such that there is no SMT present anymore. Implement this by making the key count the number of cores that have SMT enabled. In particular, the SMT topology bits are set before interrupts are enabled and similarly, are cleared after interrupts are disabled for the last time and the CPU dies. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Ingo Molnar Cc: Andy Lutomirski Cc: Linus Torvalds Cc: Jiri Kosina Cc: Tom Lendacky Cc: Josh Poimboeuf Cc: Andrea Arcangeli Cc: David Woodhouse Cc: Tim Chen Cc: Andi Kleen Cc: Dave Hansen Cc: Casey Schaufler Cc: Asit Mallick Cc: Arjan van de Ven Cc: Jon Masters Cc: Waiman Long Cc: Greg KH Cc: Dave Stewart Cc: Kees Cook Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20181125185004.246110...@linutronix.de --- kernel/sched/core.c | 19 +++ 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 091e089063be..6fedf3a98581 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu) #ifdef CONFIG_SCHED_SMT /* -* The sched_smt_present static key needs to be evaluated on every -* hotplug event because at boot time SMT might be disabled when -* the number of booted CPUs is limited. -* -* If then later a sibling gets hotplugged, then the key would stay -* off and SMT scheduling would never be functional. 
+* When going up, increment the number of cores with SMT present. */ - if (cpumask_weight(cpu_smt_mask(cpu)) > 1) - static_branch_enable_cpuslocked(_smt_present); + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_inc_cpuslocked(_smt_present); #endif set_cpu_active(cpu, true); @@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu) */ synchronize_rcu_mult(call_rcu, call_rcu_sched); +#ifdef CONFIG_SCHED_SMT + /* +* When going down, decrement the number of cores with SMT present. +*/ + if (cpumask_weight(cpu_smt_mask(cpu)) == 2) + static_branch_dec_cpuslocked(_smt_present); +#endif + if (!sched_smp_initialized) return 0;
[tip:perf/urgent] perf/x86/intel: Fix regression by default disabling perfmon v4 interrupt handling
Commit-ID: 2a5bf23d5b795d5df33dc284e8f5cf8b6a5b4042 Gitweb: https://git.kernel.org/tip/2a5bf23d5b795d5df33dc284e8f5cf8b6a5b4042 Author: Peter Zijlstra AuthorDate: Tue, 20 Nov 2018 18:08:42 +0100 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 18:57:48 +0100 perf/x86/intel: Fix regression by default disabling perfmon v4 interrupt handling Kyle Huey reported that 'rr', a replay debugger, broke due to the following commit: af3bdb991a5c ("perf/x86/intel: Add a separate Arch Perfmon v4 PMI handler") Rework the 'disable_counter_freezing' __setup() parameter such that we can explicitly enable/disable it and switch to default disabled. To this purpose, rename the parameter to "perf_v4_pmi=" which is a much better description and allows requiring a bool argument. [ mingo: Improved the changelog some more. ] Reported-by: Kyle Huey Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert O'Callahan Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: a...@kernel.org Link: http://lkml.kernel.org/r/20181120170842.gz2...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 3 ++- arch/x86/events/intel/core.c| 12 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 81d1d5a74728..5463d5a4d85c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -856,7 +856,8 @@ causing system reset or hang due to sending INIT from AP to BSP. - disable_counter_freezing [HW] + perf_v4_pmi=[X86,INTEL] + Format: Disable Intel PMU counter freezing feature. The feature only exists starting from Arch Perfmon v4 (Skylake and newer). 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 273c62e81546..af8bea9d4006 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2306,14 +2306,18 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) return handled; } -static bool disable_counter_freezing; +static bool disable_counter_freezing = true; static int __init intel_perf_counter_freezing_setup(char *s) { - disable_counter_freezing = true; - pr_info("Intel PMU Counter freezing feature disabled\n"); + bool res; + + if (kstrtobool(s, )) + return -EINVAL; + + disable_counter_freezing = !res; return 1; } -__setup("disable_counter_freezing", intel_perf_counter_freezing_setup); +__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup); /* * Simplified handler for Arch Perfmon v4:
[tip:perf/urgent] perf/x86/intel: Fix regression by default disabling perfmon v4 interrupt handling
Commit-ID: 2a5bf23d5b795d5df33dc284e8f5cf8b6a5b4042 Gitweb: https://git.kernel.org/tip/2a5bf23d5b795d5df33dc284e8f5cf8b6a5b4042 Author: Peter Zijlstra AuthorDate: Tue, 20 Nov 2018 18:08:42 +0100 Committer: Ingo Molnar CommitDate: Tue, 20 Nov 2018 18:57:48 +0100 perf/x86/intel: Fix regression by default disabling perfmon v4 interrupt handling Kyle Huey reported that 'rr', a replay debugger, broke due to the following commit: af3bdb991a5c ("perf/x86/intel: Add a separate Arch Perfmon v4 PMI handler") Rework the 'disable_counter_freezing' __setup() parameter such that we can explicitly enable/disable it and switch to default disabled. To this purpose, rename the parameter to "perf_v4_pmi=" which is a much better description and allows requiring a bool argument. [ mingo: Improved the changelog some more. ] Reported-by: Kyle Huey Signed-off-by: Peter Zijlstra (Intel) Cc: Alexander Shishkin Cc: Arnaldo Carvalho de Melo Cc: Jiri Olsa Cc: Kan Liang Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Robert O'Callahan Cc: Stephane Eranian Cc: Thomas Gleixner Cc: Vince Weaver Cc: a...@kernel.org Link: http://lkml.kernel.org/r/20181120170842.gz2...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- Documentation/admin-guide/kernel-parameters.txt | 3 ++- arch/x86/events/intel/core.c| 12 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt index 81d1d5a74728..5463d5a4d85c 100644 --- a/Documentation/admin-guide/kernel-parameters.txt +++ b/Documentation/admin-guide/kernel-parameters.txt @@ -856,7 +856,8 @@ causing system reset or hang due to sending INIT from AP to BSP. - disable_counter_freezing [HW] + perf_v4_pmi=[X86,INTEL] + Format: Disable Intel PMU counter freezing feature. The feature only exists starting from Arch Perfmon v4 (Skylake and newer). 
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c index 273c62e81546..af8bea9d4006 100644 --- a/arch/x86/events/intel/core.c +++ b/arch/x86/events/intel/core.c @@ -2306,14 +2306,18 @@ static int handle_pmi_common(struct pt_regs *regs, u64 status) return handled; } -static bool disable_counter_freezing; +static bool disable_counter_freezing = true; static int __init intel_perf_counter_freezing_setup(char *s) { - disable_counter_freezing = true; - pr_info("Intel PMU Counter freezing feature disabled\n"); + bool res; + + if (kstrtobool(s, )) + return -EINVAL; + + disable_counter_freezing = !res; return 1; } -__setup("disable_counter_freezing", intel_perf_counter_freezing_setup); +__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup); /* * Simplified handler for Arch Perfmon v4:
[tip:locking/core] x86/asm: 'Simplify' GEN_*_RMWcc() macros
Commit-ID: 288e4521f0f6717909933116563e66bb894ae2af Gitweb: https://git.kernel.org/tip/288e4521f0f6717909933116563e66bb894ae2af Author: Peter Zijlstra AuthorDate: Wed, 3 Oct 2018 12:34:10 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 17:33:54 +0200 x86/asm: 'Simplify' GEN_*_RMWcc() macros Currently the GEN_*_RMWcc() macros include a return statement, which pretty much mandates we directly wrap them in an (inline) function. Macros with return statements are tricky and, as per the above, limit use, so remove the return statement and make them statement-expressions. This allows them to be used more widely. Also, shuffle the arguments a bit. Place the @cc argument as 3rd, this makes it consistent between UNARY and BINARY, but more importantly, it makes the @arg0 argument last. Since the @arg0 argument is now last, we can do CPP trickery and make it an optional argument, simplifying the users; 17 out of 18 occurrences do not need this argument. Finally, change to asm symbolic names, instead of the numeric ordering of operands, which allows us to get rid of __BINARY_RMWcc_ARG and get cleaner code overall. Signed-off-by: Peter Zijlstra (Intel) Cc: jbeul...@suse.com Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: b...@alien8.de Cc: h...@linux.intel.com Link: https://lkml.kernel.org/r/20181003130957.108960...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic.h | 8 ++--- arch/x86/include/asm/atomic64_64.h | 8 ++--- arch/x86/include/asm/bitops.h | 9 ++--- arch/x86/include/asm/local.h | 8 ++--- arch/x86/include/asm/preempt.h | 2 +- arch/x86/include/asm/refcount.h| 13 +++ arch/x86/include/asm/rmwcc.h | 69 ++ 7 files changed, 64 insertions(+), 53 deletions(-) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index ce84388e540c..ea3d95275b43 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -82,7 +82,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) */ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test @@ -122,7 +122,7 @@ static __always_inline void arch_atomic_dec(atomic_t *v) */ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test @@ -136,7 +136,7 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) */ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test @@ -151,7 +151,7 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX 
"addl", v->counter, "er", i, "%0", s); + return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 5f851d92eecd..dadc20adba21 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -73,7 +73,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) */ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test @@ -115,7 +115,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) */ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test @@ -129,7 +129,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) */ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test @@ -144,7 +144,7 @@ static inline bool
[tip:locking/core] x86/asm: 'Simplify' GEN_*_RMWcc() macros
Commit-ID: 288e4521f0f6717909933116563e66bb894ae2af Gitweb: https://git.kernel.org/tip/288e4521f0f6717909933116563e66bb894ae2af Author: Peter Zijlstra AuthorDate: Wed, 3 Oct 2018 12:34:10 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 17:33:54 +0200 x86/asm: 'Simplify' GEN_*_RMWcc() macros Currently the GEN_*_RMWcc() macros include a return statement, which pretty much mandates we directly wrap them in an (inline) function. Macros with return statements are tricky and, as per the above, limit use, so remove the return statement and make them statement-expressions. This allows them to be used more widely. Also, shuffle the arguments a bit. Place the @cc argument as 3rd, this makes it consistent between UNARY and BINARY, but more importantly, it makes the @arg0 argument last. Since the @arg0 argument is now last, we can do CPP trickery and make it an optional argument, simplifying the users; 17 out of 18 occurrences do not need this argument. Finally, change to asm symbolic names, instead of the numeric ordering of operands, which allows us to get rid of __BINARY_RMWcc_ARG and get cleaner code overall. Signed-off-by: Peter Zijlstra (Intel) Cc: jbeul...@suse.com Cc: Linus Torvalds Cc: Andy Lutomirski Cc: Borislav Petkov Cc: Brian Gerst Cc: Denys Vlasenko Cc: H. 
Peter Anvin Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: b...@alien8.de Cc: h...@linux.intel.com Link: https://lkml.kernel.org/r/20181003130957.108960...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/atomic.h | 8 ++--- arch/x86/include/asm/atomic64_64.h | 8 ++--- arch/x86/include/asm/bitops.h | 9 ++--- arch/x86/include/asm/local.h | 8 ++--- arch/x86/include/asm/preempt.h | 2 +- arch/x86/include/asm/refcount.h| 13 +++ arch/x86/include/asm/rmwcc.h | 69 ++ 7 files changed, 64 insertions(+), 53 deletions(-) diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h index ce84388e540c..ea3d95275b43 100644 --- a/arch/x86/include/asm/atomic.h +++ b/arch/x86/include/asm/atomic.h @@ -82,7 +82,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t *v) */ static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i); } #define arch_atomic_sub_and_test arch_atomic_sub_and_test @@ -122,7 +122,7 @@ static __always_inline void arch_atomic_dec(atomic_t *v) */ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e); } #define arch_atomic_dec_and_test arch_atomic_dec_and_test @@ -136,7 +136,7 @@ static __always_inline bool arch_atomic_dec_and_test(atomic_t *v) */ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e); } #define arch_atomic_inc_and_test arch_atomic_inc_and_test @@ -151,7 +151,7 @@ static __always_inline bool arch_atomic_inc_and_test(atomic_t *v) */ static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX 
"addl", v->counter, "er", i, "%0", s); + return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i); } #define arch_atomic_add_negative arch_atomic_add_negative diff --git a/arch/x86/include/asm/atomic64_64.h b/arch/x86/include/asm/atomic64_64.h index 5f851d92eecd..dadc20adba21 100644 --- a/arch/x86/include/asm/atomic64_64.h +++ b/arch/x86/include/asm/atomic64_64.h @@ -73,7 +73,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v) */ static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v) { - GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e); + return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i); } #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test @@ -115,7 +115,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v) */ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e); } #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test @@ -129,7 +129,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v) */ static inline bool arch_atomic64_inc_and_test(atomic64_t *v) { - GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e); + return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e); } #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test @@ -144,7 +144,7 @@ static inline bool
[tip:locking/core] locking/qspinlock, x86: Provide liveness guarantee
Commit-ID: 7aa54be2976550f17c11a1c3e3630002dea39303 Gitweb: https://git.kernel.org/tip/7aa54be2976550f17c11a1c3e3630002dea39303 Author: Peter Zijlstra AuthorDate: Wed, 26 Sep 2018 13:01:20 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 17:33:54 +0200 locking/qspinlock, x86: Provide liveness guarantee On x86 we cannot do fetch_or() with a single instruction and thus end up using a cmpxchg loop, this reduces determinism. Replace the fetch_or() with a composite operation: tas-pending + load. Using two instructions of course opens a window we previously did not have. Consider the scenario:

        CPU0            CPU1            CPU2

 1)     lock
          trylock -> (0,0,1)

 2)                     lock
                          trylock /* fail */

 3)     unlock -> (0,0,0)

 4)                                     lock
                                          trylock -> (0,0,1)

 5)                       tas-pending -> (0,1,1)
                          load-val <- (0,1,0) from 3

 6)                       clear-pending-set-locked -> (0,0,1)

                          FAIL: _2_ owners

where 5) is our new composite operation. When we consider each part of the qspinlock state as a separate variable (as we can when _Q_PENDING_BITS == 8) then the above is entirely possible, because tas-pending will only RmW the pending byte, so the later load is able to observe prior tail and lock state (but not earlier than its own trylock, which operates on the whole word, due to coherence). To avoid this we need 2 things: - the load must come after the tas-pending (obviously, otherwise it can trivially observe prior state). - the tas-pending must be a full word RmW instruction, it cannot be an XCHGB for example, such that we cannot observe other state prior to setting pending. On x86 we can realize this by using "LOCK BTS m32, r32" for tas-pending followed by a regular load. Note that observing later state is not a problem: - if we fail to observe a later unlock, we'll simply spin-wait for that store to become visible. - if we observe a later xchg_tail(), there is no difference from that xchg_tail() having taken place before the tas-pending. 
Suggested-by: Will Deacon Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: andrea.pa...@amarulasolutions.com Cc: long...@redhat.com Fixes: 59fb586b4a07 ("locking/qspinlock: Remove unbounded cmpxchg() loop from locking slowpath") Link: https://lkml.kernel.org/r/20181003130957.183726...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/qspinlock.h | 15 +++ kernel/locking/qspinlock.c | 16 +++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 3e70bed8a978..87623c6b13db 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -6,9 +6,24 @@ #include #include #include +#include #define _Q_PENDING_LOOPS (1 << 9) +#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + u32 val = 0; + + if (GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c, +"I", _Q_PENDING_OFFSET)) + val |= _Q_PENDING_VAL; + + val |= atomic_read(>val) & ~_Q_PENDING_MASK; + + return val; +} + #ifdef CONFIG_PARAVIRT_SPINLOCKS extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 47cb99787e4d..341ca666bc60 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -231,6 +231,20 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) } #endif /* _Q_PENDING_BITS == 8 */ +/** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return 
atomic_fetch_or_acquire(_Q_PENDING_VAL, >val); +} +#endif + /** * set_locked - Set the lock bit and own the lock * @lock: Pointer to queued spinlock structure @@ -328,7 +342,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock */ - val = atomic_fetch_or_acquire(_Q_PENDING_VAL, >val); + val = queued_fetch_set_pending_acquire(lock); /* * If we observe contention, there is a concurrent locker.
[tip:locking/core] locking/qspinlock, x86: Provide liveness guarantee
Commit-ID: 7aa54be2976550f17c11a1c3e3630002dea39303 Gitweb: https://git.kernel.org/tip/7aa54be2976550f17c11a1c3e3630002dea39303 Author: Peter Zijlstra AuthorDate: Wed, 26 Sep 2018 13:01:20 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 17:33:54 +0200 locking/qspinlock, x86: Provide liveness guarantee On x86 we cannot do fetch_or() with a single instruction and thus end up using a cmpxchg loop, this reduces determinism. Replace the fetch_or() with a composite operation: tas-pending + load. Using two instructions of course opens a window we previously did not have. Consider the scenario:

        CPU0            CPU1            CPU2

 1)     lock
          trylock -> (0,0,1)

 2)                     lock
                          trylock /* fail */

 3)     unlock -> (0,0,0)

 4)                                     lock
                                          trylock -> (0,0,1)

 5)                       tas-pending -> (0,1,1)
                          load-val <- (0,1,0) from 3

 6)                       clear-pending-set-locked -> (0,0,1)

                          FAIL: _2_ owners

where 5) is our new composite operation. When we consider each part of the qspinlock state as a separate variable (as we can when _Q_PENDING_BITS == 8) then the above is entirely possible, because tas-pending will only RmW the pending byte, so the later load is able to observe prior tail and lock state (but not earlier than its own trylock, which operates on the whole word, due to coherence). To avoid this we need 2 things: - the load must come after the tas-pending (obviously, otherwise it can trivially observe prior state). - the tas-pending must be a full word RmW instruction, it cannot be an XCHGB for example, such that we cannot observe other state prior to setting pending. On x86 we can realize this by using "LOCK BTS m32, r32" for tas-pending followed by a regular load. Note that observing later state is not a problem: - if we fail to observe a later unlock, we'll simply spin-wait for that store to become visible. - if we observe a later xchg_tail(), there is no difference from that xchg_tail() having taken place before the tas-pending. 
Suggested-by: Will Deacon Reported-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Reviewed-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: andrea.pa...@amarulasolutions.com Cc: long...@redhat.com Fixes: 59fb586b4a07 ("locking/qspinlock: Remove unbounded cmpxchg() loop from locking slowpath") Link: https://lkml.kernel.org/r/20181003130957.183726...@infradead.org Signed-off-by: Ingo Molnar --- arch/x86/include/asm/qspinlock.h | 15 +++ kernel/locking/qspinlock.c | 16 +++- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h index 3e70bed8a978..87623c6b13db 100644 --- a/arch/x86/include/asm/qspinlock.h +++ b/arch/x86/include/asm/qspinlock.h @@ -6,9 +6,24 @@ #include #include #include +#include #define _Q_PENDING_LOOPS (1 << 9) +#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + u32 val = 0; + + if (GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c, +"I", _Q_PENDING_OFFSET)) + val |= _Q_PENDING_VAL; + + val |= atomic_read(>val) & ~_Q_PENDING_MASK; + + return val; +} + #ifdef CONFIG_PARAVIRT_SPINLOCKS extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val); extern void __pv_init_lock_hash(void); diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index 47cb99787e4d..341ca666bc60 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -231,6 +231,20 @@ static __always_inline u32 xchg_tail(struct qspinlock *lock, u32 tail) } #endif /* _Q_PENDING_BITS == 8 */ +/** + * queued_fetch_set_pending_acquire - fetch the whole lock value and set pending + * @lock : Pointer to queued spinlock structure + * Return: The previous lock value + * + * *,*,* -> *,1,* + */ +#ifndef queued_fetch_set_pending_acquire +static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock *lock) +{ + return 
atomic_fetch_or_acquire(_Q_PENDING_VAL, >val); +} +#endif + /** * set_locked - Set the lock bit and own the lock * @lock: Pointer to queued spinlock structure @@ -328,7 +342,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock */ - val = atomic_fetch_or_acquire(_Q_PENDING_VAL, >val); + val = queued_fetch_set_pending_acquire(lock); /* * If we observe contention, there is a concurrent locker.
[tip:locking/core] locking/qspinlock: Rework some comments
Commit-ID: 756b1df4c2c82a1cdffeafa9d2aa76c92e7fb405 Gitweb: https://git.kernel.org/tip/756b1df4c2c82a1cdffeafa9d2aa76c92e7fb405 Author: Peter Zijlstra AuthorDate: Wed, 26 Sep 2018 13:01:19 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 17:33:54 +0200 locking/qspinlock: Rework some comments While working my way through the code again; I felt the comments could use help. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrea.pa...@amarulasolutions.com Cc: long...@redhat.com Link: https://lkml.kernel.org/r/20181003130257.156322...@infradead.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 36 ++-- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ec343276f975..47cb99787e4d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -326,16 +326,23 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) /* * trylock || pending * -* 0,0,0 -> 0,0,1 ; trylock -* 0,0,1 -> 0,1,1 ; pending +* 0,0,* -> 0,1,* -> 0,0,1 pending, trylock */ val = atomic_fetch_or_acquire(_Q_PENDING_VAL, >val); + /* -* If we observe any contention; undo and queue. +* If we observe contention, there is a concurrent locker. +* +* Undo and queue; our setting of PENDING might have made the +* n,0,0 -> 0,0,0 transition fail and it will now be waiting +* on @next to become !NULL. */ if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ if (!(val & _Q_PENDING_MASK)) clear_pending(lock); + goto queue; } @@ -474,16 +481,25 @@ locked: */ /* -* In the PV case we might already have _Q_LOCKED_VAL set. +* In the PV case we might already have _Q_LOCKED_VAL set, because +* of lock stealing; therefore we must also allow: * -* The atomic_cond_read_acquire() call above has provided the -* necessary acquire semantics required for locking. 
+* n,0,1 -> 0,0,1 +* +* Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the +* above wait condition, therefore any concurrent setting of +* PENDING will make the uncontended transition fail. */ - if (((val & _Q_TAIL_MASK) == tail) && - atomic_try_cmpxchg_relaxed(>val, , _Q_LOCKED_VAL)) - goto release; /* No contention */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(>val, , _Q_LOCKED_VAL)) + goto release; /* No contention */ + } - /* Either somebody is queued behind us or _Q_PENDING_VAL is set */ + /* +* Either somebody is queued behind us or _Q_PENDING_VAL got set +* which will then detect the remaining tail and queue behind us +* ensuring we'll see a @next. +*/ set_locked(lock); /*
[tip:locking/core] locking/qspinlock: Rework some comments
Commit-ID: 756b1df4c2c82a1cdffeafa9d2aa76c92e7fb405 Gitweb: https://git.kernel.org/tip/756b1df4c2c82a1cdffeafa9d2aa76c92e7fb405 Author: Peter Zijlstra AuthorDate: Wed, 26 Sep 2018 13:01:19 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 17:33:54 +0200 locking/qspinlock: Rework some comments While working my way through the code again; I felt the comments could use help. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrea.pa...@amarulasolutions.com Cc: long...@redhat.com Link: https://lkml.kernel.org/r/20181003130257.156322...@infradead.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 36 ++-- 1 file changed, 26 insertions(+), 10 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index ec343276f975..47cb99787e4d 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -326,16 +326,23 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) /* * trylock || pending * -* 0,0,0 -> 0,0,1 ; trylock -* 0,0,1 -> 0,1,1 ; pending +* 0,0,* -> 0,1,* -> 0,0,1 pending, trylock */ val = atomic_fetch_or_acquire(_Q_PENDING_VAL, >val); + /* -* If we observe any contention; undo and queue. +* If we observe contention, there is a concurrent locker. +* +* Undo and queue; our setting of PENDING might have made the +* n,0,0 -> 0,0,0 transition fail and it will now be waiting +* on @next to become !NULL. */ if (unlikely(val & ~_Q_LOCKED_MASK)) { + + /* Undo PENDING if we set it. */ if (!(val & _Q_PENDING_MASK)) clear_pending(lock); + goto queue; } @@ -474,16 +481,25 @@ locked: */ /* -* In the PV case we might already have _Q_LOCKED_VAL set. +* In the PV case we might already have _Q_LOCKED_VAL set, because +* of lock stealing; therefore we must also allow: * -* The atomic_cond_read_acquire() call above has provided the -* necessary acquire semantics required for locking. 
+* n,0,1 -> 0,0,1 +* +* Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the +* above wait condition, therefore any concurrent setting of +* PENDING will make the uncontended transition fail. */ - if (((val & _Q_TAIL_MASK) == tail) && - atomic_try_cmpxchg_relaxed(>val, , _Q_LOCKED_VAL)) - goto release; /* No contention */ + if ((val & _Q_TAIL_MASK) == tail) { + if (atomic_try_cmpxchg_relaxed(>val, , _Q_LOCKED_VAL)) + goto release; /* No contention */ + } - /* Either somebody is queued behind us or _Q_PENDING_VAL is set */ + /* +* Either somebody is queued behind us or _Q_PENDING_VAL got set +* which will then detect the remaining tail and queue behind us +* ensuring we'll see a @next. +*/ set_locked(lock); /*
[tip:locking/core] locking/qspinlock: Re-order code
Commit-ID: 53bf57fab7321fb42b703056a4c80fc9d986d170 Gitweb: https://git.kernel.org/tip/53bf57fab7321fb42b703056a4c80fc9d986d170 Author: Peter Zijlstra AuthorDate: Wed, 26 Sep 2018 13:01:18 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 17:33:53 +0200 locking/qspinlock: Re-order code Flip the branch condition after atomic_fetch_or_acquire(_Q_PENDING_VAL) such that we lose the indent. This also results in a more natural code flow IMO. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrea.pa...@amarulasolutions.com Cc: long...@redhat.com Link: https://lkml.kernel.org/r/20181003130257.156322...@infradead.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 56 ++ 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index bfaeb05123ff..ec343276f975 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -330,39 +330,37 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * 0,0,1 -> 0,1,1 ; pending */ val = atomic_fetch_or_acquire(_Q_PENDING_VAL, >val); - if (!(val & ~_Q_LOCKED_MASK)) { - /* -* We're pending, wait for the owner to go away. -* -* *,1,1 -> *,1,0 -* -* this wait loop must be a load-acquire such that we match the -* store-release that clears the locked bit and create lock -* sequentiality; this is because not all -* clear_pending_set_locked() implementations imply full -* barriers. -*/ - if (val & _Q_LOCKED_MASK) { - atomic_cond_read_acquire(>val, -!(VAL & _Q_LOCKED_MASK)); - } - - /* -* take ownership and clear the pending bit. -* -* *,1,0 -> *,0,1 -*/ - clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); - return; + /* +* If we observe any contention; undo and queue. 
+*/ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + goto queue; } /* -* If pending was clear but there are waiters in the queue, then -* we need to undo our setting of pending before we queue ourselves. +* We're pending, wait for the owner to go away. +* +* 0,1,1 -> 0,1,0 +* +* this wait loop must be a load-acquire such that we match the +* store-release that clears the locked bit and create lock +* sequentiality; this is because not all +* clear_pending_set_locked() implementations imply full +* barriers. +*/ + if (val & _Q_LOCKED_MASK) + atomic_cond_read_acquire(>val, !(VAL & _Q_LOCKED_MASK)); + + /* +* take ownership and clear the pending bit. +* +* 0,1,0 -> 0,0,1 */ - if (!(val & _Q_PENDING_MASK)) - clear_pending(lock); + clear_pending_set_locked(lock); + qstat_inc(qstat_lock_pending, true); + return; /* * End of pending bit optimistic spinning and beginning of MCS
[tip:locking/core] locking/qspinlock: Re-order code
Commit-ID: 53bf57fab7321fb42b703056a4c80fc9d986d170 Gitweb: https://git.kernel.org/tip/53bf57fab7321fb42b703056a4c80fc9d986d170 Author: Peter Zijlstra AuthorDate: Wed, 26 Sep 2018 13:01:18 +0200 Committer: Ingo Molnar CommitDate: Tue, 16 Oct 2018 17:33:53 +0200 locking/qspinlock: Re-order code Flip the branch condition after atomic_fetch_or_acquire(_Q_PENDING_VAL) such that we lose the indent. This also results in a more natural code flow IMO. Signed-off-by: Peter Zijlstra (Intel) Acked-by: Will Deacon Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: andrea.pa...@amarulasolutions.com Cc: long...@redhat.com Link: https://lkml.kernel.org/r/20181003130257.156322...@infradead.org Signed-off-by: Ingo Molnar --- kernel/locking/qspinlock.c | 56 ++ 1 file changed, 27 insertions(+), 29 deletions(-) diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c index bfaeb05123ff..ec343276f975 100644 --- a/kernel/locking/qspinlock.c +++ b/kernel/locking/qspinlock.c @@ -330,39 +330,37 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 val) * 0,0,1 -> 0,1,1 ; pending */ val = atomic_fetch_or_acquire(_Q_PENDING_VAL, >val); - if (!(val & ~_Q_LOCKED_MASK)) { - /* -* We're pending, wait for the owner to go away. -* -* *,1,1 -> *,1,0 -* -* this wait loop must be a load-acquire such that we match the -* store-release that clears the locked bit and create lock -* sequentiality; this is because not all -* clear_pending_set_locked() implementations imply full -* barriers. -*/ - if (val & _Q_LOCKED_MASK) { - atomic_cond_read_acquire(>val, -!(VAL & _Q_LOCKED_MASK)); - } - - /* -* take ownership and clear the pending bit. -* -* *,1,0 -> *,0,1 -*/ - clear_pending_set_locked(lock); - qstat_inc(qstat_lock_pending, true); - return; + /* +* If we observe any contention; undo and queue. 
+*/ + if (unlikely(val & ~_Q_LOCKED_MASK)) { + if (!(val & _Q_PENDING_MASK)) + clear_pending(lock); + goto queue; } /* -* If pending was clear but there are waiters in the queue, then -* we need to undo our setting of pending before we queue ourselves. +* We're pending, wait for the owner to go away. +* +* 0,1,1 -> 0,1,0 +* +* this wait loop must be a load-acquire such that we match the +* store-release that clears the locked bit and create lock +* sequentiality; this is because not all +* clear_pending_set_locked() implementations imply full +* barriers. +*/ + if (val & _Q_LOCKED_MASK) + atomic_cond_read_acquire(>val, !(VAL & _Q_LOCKED_MASK)); + + /* +* take ownership and clear the pending bit. +* +* 0,1,0 -> 0,0,1 */ - if (!(val & _Q_PENDING_MASK)) - clear_pending(lock); + clear_pending_set_locked(lock); + qstat_inc(qstat_lock_pending, true); + return; /* * End of pending bit optimistic spinning and beginning of MCS
[tip:x86/urgent] x86/tsc: Force inlining of cyc2ns bits
Commit-ID: 4907c68abd3f60f650f98d5a69d4ec77c0bde44f Gitweb: https://git.kernel.org/tip/4907c68abd3f60f650f98d5a69d4ec77c0bde44f Author: Peter Zijlstra AuthorDate: Thu, 11 Oct 2018 12:38:26 +0200 Committer: Thomas Gleixner CommitDate: Sun, 14 Oct 2018 11:11:22 +0200 x86/tsc: Force inlining of cyc2ns bits Looking at the asm for native_sched_clock() I noticed we don't inline enough. Mostly caused by sharing code with cyc2ns_read_begin(), which we didn't use to do. So mark all that __always_inline to make it DTRT. Fixes: 59eaef78bfea ("x86/tsc: Remodel cyc2ns to use seqcount_latch()") Reported-by: Eric Dumazet Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: h...@zytor.com Cc: eric.duma...@gmail.com Cc: b...@alien8.de Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20181011104019.695196...@infradead.org --- arch/x86/kernel/tsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b52bd2b6cdb4..6d5dc5dabfd7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -58,7 +58,7 @@ struct cyc2ns { static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -void cyc2ns_read_begin(struct cyc2ns_data *data) +void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -75,7 +75,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -void cyc2ns_read_end(void) +void __always_inline cyc2ns_read_end(void) { preempt_enable_notrace(); } @@ -104,7 +104,7 @@ void cyc2ns_read_end(void) * -johns...@us.ibm.com "math is hard, lets go shopping!" */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns;
[tip:x86/urgent] x86/tsc: Force inlining of cyc2ns bits
Commit-ID: 4907c68abd3f60f650f98d5a69d4ec77c0bde44f Gitweb: https://git.kernel.org/tip/4907c68abd3f60f650f98d5a69d4ec77c0bde44f Author: Peter Zijlstra AuthorDate: Thu, 11 Oct 2018 12:38:26 +0200 Committer: Thomas Gleixner CommitDate: Sun, 14 Oct 2018 11:11:22 +0200 x86/tsc: Force inlining of cyc2ns bits Looking at the asm for native_sched_clock() I noticed we don't inline enough. Mostly caused by sharing code with cyc2ns_read_begin(), which we didn't use to do. So mark all that __always_inline to make it DTRT. Fixes: 59eaef78bfea ("x86/tsc: Remodel cyc2ns to use seqcount_latch()") Reported-by: Eric Dumazet Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: h...@zytor.com Cc: eric.duma...@gmail.com Cc: b...@alien8.de Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20181011104019.695196...@infradead.org --- arch/x86/kernel/tsc.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c index b52bd2b6cdb4..6d5dc5dabfd7 100644 --- a/arch/x86/kernel/tsc.c +++ b/arch/x86/kernel/tsc.c @@ -58,7 +58,7 @@ struct cyc2ns { static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns); -void cyc2ns_read_begin(struct cyc2ns_data *data) +void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data) { int seq, idx; @@ -75,7 +75,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data) } while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence))); } -void cyc2ns_read_end(void) +void __always_inline cyc2ns_read_end(void) { preempt_enable_notrace(); } @@ -104,7 +104,7 @@ void cyc2ns_read_end(void) * -johns...@us.ibm.com "math is hard, lets go shopping!" */ -static inline unsigned long long cycles_2_ns(unsigned long long cyc) +static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc) { struct cyc2ns_data data; unsigned long long ns;
[tip:x86/urgent] x86/percpu: Fix this_cpu_read()
Commit-ID: b59167ac7bafd804c91e49ad53c6d33a7394d4c8 Gitweb: https://git.kernel.org/tip/b59167ac7bafd804c91e49ad53c6d33a7394d4c8 Author: Peter Zijlstra AuthorDate: Thu, 11 Oct 2018 12:38:27 +0200 Committer: Thomas Gleixner CommitDate: Sun, 14 Oct 2018 11:11:22 +0200 x86/percpu: Fix this_cpu_read() Eric reported that a sequence count loop using this_cpu_read() got optimized out. This is wrong, this_cpu_read() must imply READ_ONCE() because the interface is IRQ-safe, therefore an interrupt can have changed the per-cpu value. Fixes: 7c3576d261ce ("[PATCH] i386: Convert PDA into the percpu section") Reported-by: Eric Dumazet Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Eric Dumazet Cc: h...@zytor.com Cc: eric.duma...@gmail.com Cc: b...@alien8.de Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20181011104019.748208...@infradead.org --- arch/x86/include/asm/percpu.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e9202a0de8f0..1a19d11cfbbd 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -185,22 +185,22 @@ do { \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(1)",%0" \ + asm volatile(op "b "__percpu_arg(1)",%0"\ : "=q" (pfo_ret__) \ : "m" (var)); \ break; \ case 2: \ - asm(op "w "__percpu_arg(1)",%0" \ + asm volatile(op "w "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 4: \ - asm(op "l "__percpu_arg(1)",%0" \ + asm volatile(op "l "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 8: \ - asm(op "q "__percpu_arg(1)",%0" \ + asm volatile(op "q "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \
[tip:x86/urgent] x86/percpu: Fix this_cpu_read()
Commit-ID: b59167ac7bafd804c91e49ad53c6d33a7394d4c8 Gitweb: https://git.kernel.org/tip/b59167ac7bafd804c91e49ad53c6d33a7394d4c8 Author: Peter Zijlstra AuthorDate: Thu, 11 Oct 2018 12:38:27 +0200 Committer: Thomas Gleixner CommitDate: Sun, 14 Oct 2018 11:11:22 +0200 x86/percpu: Fix this_cpu_read() Eric reported that a sequence count loop using this_cpu_read() got optimized out. This is wrong, this_cpu_read() must imply READ_ONCE() because the interface is IRQ-safe, therefore an interrupt can have changed the per-cpu value. Fixes: 7c3576d261ce ("[PATCH] i386: Convert PDA into the percpu section") Reported-by: Eric Dumazet Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Acked-by: Eric Dumazet Cc: h...@zytor.com Cc: eric.duma...@gmail.com Cc: b...@alien8.de Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20181011104019.748208...@infradead.org --- arch/x86/include/asm/percpu.h | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h index e9202a0de8f0..1a19d11cfbbd 100644 --- a/arch/x86/include/asm/percpu.h +++ b/arch/x86/include/asm/percpu.h @@ -185,22 +185,22 @@ do { \ typeof(var) pfo_ret__; \ switch (sizeof(var)) { \ case 1: \ - asm(op "b "__percpu_arg(1)",%0" \ + asm volatile(op "b "__percpu_arg(1)",%0"\ : "=q" (pfo_ret__) \ : "m" (var)); \ break; \ case 2: \ - asm(op "w "__percpu_arg(1)",%0" \ + asm volatile(op "w "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 4: \ - asm(op "l "__percpu_arg(1)",%0" \ + asm volatile(op "l "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \ case 8: \ - asm(op "q "__percpu_arg(1)",%0" \ + asm volatile(op "q "__percpu_arg(1)",%0"\ : "=r" (pfo_ret__) \ : "m" (var)); \ break; \
[tip:x86/boot] x86/kaslr, ACPI/NUMA: Fix KASLR build error
Commit-ID: 9d94e8b1d4f94a3c4cee5ad11a1be460cd070839 Gitweb: https://git.kernel.org/tip/9d94e8b1d4f94a3c4cee5ad11a1be460cd070839 Author: Peter Zijlstra (Intel) AuthorDate: Wed, 3 Oct 2018 14:41:27 +0200 Committer: Borislav Petkov CommitDate: Tue, 9 Oct 2018 12:30:25 +0200 x86/kaslr, ACPI/NUMA: Fix KASLR build error There is no point in trying to compile KASLR-specific code when there is no KASLR. [ bp: Move the whole crap into kaslr.c and make rand_mem_physical_padding static. Make kaslr_check_padding() weak to avoid build breakage on other architectures. ] Reported-by: Naresh Kamboju Reported-by: Mark Brown Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Cc: Cc: Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/20181003123402.ga15...@hirez.programming.kicks-ass.net --- arch/x86/include/asm/setup.h | 2 -- arch/x86/mm/kaslr.c | 19 ++- drivers/acpi/numa.c | 17 + 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 65a5bf8f6aba..ae13bc974416 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -80,8 +80,6 @@ static inline unsigned long kaslr_offset(void) return (unsigned long)&_text - __START_KERNEL; } -extern int rand_mem_physical_padding; - /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 00cf4cae38f5..b3471388288d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -40,7 +41,7 @@ */ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; -int __initdata rand_mem_physical_padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; +static int __initdata rand_mem_physical_padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* * Memory regions randomized by KASLR (except modules that use a separate logic * earlier during boot). 
The list is ordered based on virtual addresses. This @@ -70,6 +71,22 @@ static inline bool kaslr_memory_enabled(void) return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); } +/* + * Check the padding size for KASLR is enough. + */ +void __init kaslr_check_padding(void) +{ + u64 max_possible_phys, max_actual_phys, threshold; + + max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40); + max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40); + threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40); + + if (max_possible_phys > threshold) + pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n", + (max_possible_phys - max_actual_phys) >> 40); +} + static int __init rand_mem_physical_padding_setup(char *str) { int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1; diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 3d69834c692f..ba62004f4d86 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -32,7 +32,6 @@ #include #include #include -#include static nodemask_t nodes_found_map = NODE_MASK_NONE; @@ -433,10 +432,12 @@ acpi_table_parse_srat(enum acpi_srat_type id, handler, max_entries); } +/* To be overridden by architectures */ +void __init __weak kaslr_check_padding(void) { } + int __init acpi_numa_init(void) { int cnt = 0; - u64 max_possible_phys, max_actual_phys, threshold; if (acpi_disabled) return -EINVAL; @@ -466,17 +467,9 @@ int __init acpi_numa_init(void) cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, 0); - /* check the padding size for KASLR is enough. 
*/ - if (parsed_numa_memblks && kaslr_enabled()) { - max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40); - max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40); - threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40); + if (parsed_numa_memblks) + kaslr_check_padding(); - if (max_possible_phys > threshold) { - pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n", - (max_possible_phys - max_actual_phys) >> 40); - } - } } /* SLIT: System Locality Information Table */
[tip:x86/boot] x86/kaslr, ACPI/NUMA: Fix KASLR build error
Commit-ID: 9d94e8b1d4f94a3c4cee5ad11a1be460cd070839 Gitweb: https://git.kernel.org/tip/9d94e8b1d4f94a3c4cee5ad11a1be460cd070839 Author: Peter Zijlstra (Intel) AuthorDate: Wed, 3 Oct 2018 14:41:27 +0200 Committer: Borislav Petkov CommitDate: Tue, 9 Oct 2018 12:30:25 +0200 x86/kaslr, ACPI/NUMA: Fix KASLR build error There is no point in trying to compile KASLR-specific code when there is no KASLR. [ bp: Move the whole crap into kaslr.c and make rand_mem_physical_padding static. Make kaslr_check_padding() weak to avoid build breakage on other architectures. ] Reported-by: Naresh Kamboju Reported-by: Mark Brown Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Cc: Cc: Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/20181003123402.ga15...@hirez.programming.kicks-ass.net --- arch/x86/include/asm/setup.h | 2 -- arch/x86/mm/kaslr.c | 19 ++- drivers/acpi/numa.c | 17 + 3 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 65a5bf8f6aba..ae13bc974416 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -80,8 +80,6 @@ static inline unsigned long kaslr_offset(void) return (unsigned long)&_text - __START_KERNEL; } -extern int rand_mem_physical_padding; - /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 00cf4cae38f5..b3471388288d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -40,7 +41,7 @@ */ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; -int __initdata rand_mem_physical_padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; +static int __initdata rand_mem_physical_padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* * Memory regions randomized by KASLR (except modules that use a separate logic * earlier during boot). 
The list is ordered based on virtual addresses. This @@ -70,6 +71,22 @@ static inline bool kaslr_memory_enabled(void) return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); } +/* + * Check the padding size for KASLR is enough. + */ +void __init kaslr_check_padding(void) +{ + u64 max_possible_phys, max_actual_phys, threshold; + + max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40); + max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40); + threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40); + + if (max_possible_phys > threshold) + pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n", + (max_possible_phys - max_actual_phys) >> 40); +} + static int __init rand_mem_physical_padding_setup(char *str) { int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1; diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 3d69834c692f..ba62004f4d86 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -32,7 +32,6 @@ #include #include #include -#include static nodemask_t nodes_found_map = NODE_MASK_NONE; @@ -433,10 +432,12 @@ acpi_table_parse_srat(enum acpi_srat_type id, handler, max_entries); } +/* To be overridden by architectures */ +void __init __weak kaslr_check_padding(void) { } + int __init acpi_numa_init(void) { int cnt = 0; - u64 max_possible_phys, max_actual_phys, threshold; if (acpi_disabled) return -EINVAL; @@ -466,17 +467,9 @@ int __init acpi_numa_init(void) cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, acpi_parse_memory_affinity, 0); - /* check the padding size for KASLR is enough. 
*/ - if (parsed_numa_memblks && kaslr_enabled()) { - max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40); - max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40); - threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40); + if (parsed_numa_memblks) + kaslr_check_padding(); - if (max_possible_phys > threshold) { - pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n", - (max_possible_phys - max_actual_phys) >> 40); - } - } } /* SLIT: System Locality Information Table */
[tip:x86/boot] x86/kaslr, ACPI/NUMA: Fix KASLR build error
Commit-ID: 3a387c6d96e69f1710a3804eb68e1253263298f2 Gitweb: https://git.kernel.org/tip/3a387c6d96e69f1710a3804eb68e1253263298f2 Author: Peter Zijlstra (Intel) AuthorDate: Wed, 3 Oct 2018 14:41:27 +0200 Committer: Borislav Petkov CommitDate: Wed, 3 Oct 2018 16:15:49 +0200 x86/kaslr, ACPI/NUMA: Fix KASLR build error There is no point in trying to compile KASLR-specific code when there is no KASLR. [ bp: Move the whole crap into kaslr.c and make rand_mem_physical_padding static. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Cc: Cc: Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/20181003123402.ga15...@hirez.programming.kicks-ass.net --- arch/x86/include/asm/kaslr.h | 2 ++ arch/x86/include/asm/setup.h | 2 -- arch/x86/mm/kaslr.c | 19 ++- drivers/acpi/numa.c | 15 +++ 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index db7ba2feb947..95ef3fc01d12 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -6,8 +6,10 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY void kernel_randomize_memory(void); +void kaslr_check_padding(void); #else static inline void kernel_randomize_memory(void) { } +static inline void kaslr_check_padding(void) { } #endif /* CONFIG_RANDOMIZE_MEMORY */ #endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 65a5bf8f6aba..ae13bc974416 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -80,8 +80,6 @@ static inline unsigned long kaslr_offset(void) return (unsigned long)&_text - __START_KERNEL; } -extern int rand_mem_physical_padding; - /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. 
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 00cf4cae38f5..b3471388288d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -40,7 +41,7 @@ */ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; -int __initdata rand_mem_physical_padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; +static int __initdata rand_mem_physical_padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* * Memory regions randomized by KASLR (except modules that use a separate logic * earlier during boot). The list is ordered based on virtual addresses. This @@ -70,6 +71,22 @@ static inline bool kaslr_memory_enabled(void) return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); } +/* + * Check the padding size for KASLR is enough. + */ +void __init kaslr_check_padding(void) +{ + u64 max_possible_phys, max_actual_phys, threshold; + + max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40); + max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40); + threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40); + + if (max_possible_phys > threshold) + pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n", + (max_possible_phys - max_actual_phys) >> 40); +} + static int __init rand_mem_physical_padding_setup(char *str) { int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1; diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 3d69834c692f..4408e37600ef 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include static nodemask_t nodes_found_map = NODE_MASK_NONE; @@ -436,7 +436,6 @@ acpi_table_parse_srat(enum acpi_srat_type id, int __init acpi_numa_init(void) { int cnt = 0; - u64 max_possible_phys, max_actual_phys, threshold; if (acpi_disabled) return -EINVAL; @@ -466,17 +465,9 @@ int __init acpi_numa_init(void) cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, 
acpi_parse_memory_affinity, 0); - /* check the padding size for KASLR is enough. */ - if (parsed_numa_memblks && kaslr_enabled()) { - max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40); - max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40); - threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40); + if (parsed_numa_memblks) + kaslr_check_padding(); - if (max_possible_phys > threshold) { - pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n", - (max_possible_phys - max_actual_phys) >> 40); - } - } } /* SLIT: System Locality
[tip:x86/boot] x86/kaslr, ACPI/NUMA: Fix KASLR build error
Commit-ID: 3a387c6d96e69f1710a3804eb68e1253263298f2 Gitweb: https://git.kernel.org/tip/3a387c6d96e69f1710a3804eb68e1253263298f2 Author: Peter Zijlstra (Intel) AuthorDate: Wed, 3 Oct 2018 14:41:27 +0200 Committer: Borislav Petkov CommitDate: Wed, 3 Oct 2018 16:15:49 +0200 x86/kaslr, ACPI/NUMA: Fix KASLR build error There is no point in trying to compile KASLR-specific code when there is no KASLR. [ bp: Move the whole crap into kaslr.c and make rand_mem_physical_padding static. ] Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Borislav Petkov Cc: Cc: Cc: Cc: Cc: Cc: Link: http://lkml.kernel.org/r/20181003123402.ga15...@hirez.programming.kicks-ass.net --- arch/x86/include/asm/kaslr.h | 2 ++ arch/x86/include/asm/setup.h | 2 -- arch/x86/mm/kaslr.c | 19 ++- drivers/acpi/numa.c | 15 +++ 4 files changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h index db7ba2feb947..95ef3fc01d12 100644 --- a/arch/x86/include/asm/kaslr.h +++ b/arch/x86/include/asm/kaslr.h @@ -6,8 +6,10 @@ unsigned long kaslr_get_random_long(const char *purpose); #ifdef CONFIG_RANDOMIZE_MEMORY void kernel_randomize_memory(void); +void kaslr_check_padding(void); #else static inline void kernel_randomize_memory(void) { } +static inline void kaslr_check_padding(void) { } #endif /* CONFIG_RANDOMIZE_MEMORY */ #endif diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h index 65a5bf8f6aba..ae13bc974416 100644 --- a/arch/x86/include/asm/setup.h +++ b/arch/x86/include/asm/setup.h @@ -80,8 +80,6 @@ static inline unsigned long kaslr_offset(void) return (unsigned long)&_text - __START_KERNEL; } -extern int rand_mem_physical_padding; - /* * Do NOT EVER look at the BIOS memory size location. * It does not work on many machines. 
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c index 00cf4cae38f5..b3471388288d 100644 --- a/arch/x86/mm/kaslr.c +++ b/arch/x86/mm/kaslr.c @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -40,7 +41,7 @@ */ static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE; -int __initdata rand_mem_physical_padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; +static int __initdata rand_mem_physical_padding = CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING; /* * Memory regions randomized by KASLR (except modules that use a separate logic * earlier during boot). The list is ordered based on virtual addresses. This @@ -70,6 +71,22 @@ static inline bool kaslr_memory_enabled(void) return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN); } +/* + * Check the padding size for KASLR is enough. + */ +void __init kaslr_check_padding(void) +{ + u64 max_possible_phys, max_actual_phys, threshold; + + max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40); + max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40); + threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40); + + if (max_possible_phys > threshold) + pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n", + (max_possible_phys - max_actual_phys) >> 40); +} + static int __init rand_mem_physical_padding_setup(char *str) { int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1; diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c index 3d69834c692f..4408e37600ef 100644 --- a/drivers/acpi/numa.c +++ b/drivers/acpi/numa.c @@ -32,7 +32,7 @@ #include #include #include -#include +#include static nodemask_t nodes_found_map = NODE_MASK_NONE; @@ -436,7 +436,6 @@ acpi_table_parse_srat(enum acpi_srat_type id, int __init acpi_numa_init(void) { int cnt = 0; - u64 max_possible_phys, max_actual_phys, threshold; if (acpi_disabled) return -EINVAL; @@ -466,17 +465,9 @@ int __init acpi_numa_init(void) cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY, 
acpi_parse_memory_affinity, 0); - /* check the padding size for KASLR is enough. */ - if (parsed_numa_memblks && kaslr_enabled()) { - max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40); - max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40); - threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40); + if (parsed_numa_memblks) + kaslr_check_padding(); - if (max_possible_phys > threshold) { - pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory hotadd failure.\n", - (max_possible_phys - max_actual_phys) >> 40); - } - } } /* SLIT: System Locality
[tip:x86/mm] x86/mm/cpa: Optimize __cpa_flush_range()
Commit-ID: 7904ba8a66f400182a204893c92098994e22a88d Gitweb: https://git.kernel.org/tip/7904ba8a66f400182a204893c92098994e22a88d Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:24 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:42 +0200 x86/mm/cpa: Optimize __cpa_flush_range() If we IPI for WBINDV, then we might as well kill the entire TLB too. But if we don't have to invalidate cache, there is no reason not to use a range TLB flush. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.195633...@infradead.org --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dc552824e86a..62bb30b4bd2a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -291,7 +291,7 @@ static bool __cpa_flush_range(unsigned long start, int numpages, int cache) WARN_ON(PAGE_ALIGN(start) != start); - if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return true; }
[tip:x86/mm] x86/mm/cpa: Optimize __cpa_flush_range()
Commit-ID: 7904ba8a66f400182a204893c92098994e22a88d Gitweb: https://git.kernel.org/tip/7904ba8a66f400182a204893c92098994e22a88d Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:24 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:42 +0200 x86/mm/cpa: Optimize __cpa_flush_range() If we IPI for WBINDV, then we might as well kill the entire TLB too. But if we don't have to invalidate cache, there is no reason not to use a range TLB flush. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.195633...@infradead.org --- arch/x86/mm/pageattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index dc552824e86a..62bb30b4bd2a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -291,7 +291,7 @@ static bool __cpa_flush_range(unsigned long start, int numpages, int cache) WARN_ON(PAGE_ALIGN(start) != start); - if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); return true; }
[tip:x86/mm] x86/mm/cpa: Factor common code between cpa_flush_*()
Commit-ID: 47e262ac5b84015c4a101ff51767c464fb7497a6 Gitweb: https://git.kernel.org/tip/47e262ac5b84015c4a101ff51767c464fb7497a6 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:23 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:42 +0200 x86/mm/cpa: Factor common code between cpa_flush_*() The start of cpa_flush_range() and cpa_flush_array() is the same, use a common function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.138859...@infradead.org --- arch/x86/mm/pageattr.c | 29 + 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 33d89d505f93..dc552824e86a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,22 +285,28 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void cpa_flush_range(unsigned long start, int numpages, int cache) +static bool __cpa_flush_range(unsigned long start, int numpages, int cache) { - unsigned int i, level; - unsigned long addr; - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + WARN_ON(PAGE_ALIGN(start) != start); if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); - return; + return true; } flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - if (!cache) + return !cache; +} + +static void cpa_flush_range(unsigned long start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long addr; + + if (__cpa_flush_range(start, numpages, cache)) return; /* @@ -326,16 +332,7 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, { unsigned int i, level; - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - - if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { - cpa_flush_all(cache); - return; - } - - flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); - - if (!cache) + if 
(__cpa_flush_range(baddr, numpages, cache)) return; /*
[tip:x86/mm] x86/mm/cpa: Factor common code between cpa_flush_*()
Commit-ID: 47e262ac5b84015c4a101ff51767c464fb7497a6 Gitweb: https://git.kernel.org/tip/47e262ac5b84015c4a101ff51767c464fb7497a6 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:23 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:42 +0200 x86/mm/cpa: Factor common code between cpa_flush_*() The start of cpa_flush_range() and cpa_flush_array() is the same, use a common function. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.138859...@infradead.org --- arch/x86/mm/pageattr.c | 29 + 1 file changed, 13 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 33d89d505f93..dc552824e86a 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,22 +285,28 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void cpa_flush_range(unsigned long start, int numpages, int cache) +static bool __cpa_flush_range(unsigned long start, int numpages, int cache) { - unsigned int i, level; - unsigned long addr; - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + WARN_ON(PAGE_ALIGN(start) != start); if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { cpa_flush_all(cache); - return; + return true; } flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); - if (!cache) + return !cache; +} + +static void cpa_flush_range(unsigned long start, int numpages, int cache) +{ + unsigned int i, level; + unsigned long addr; + + if (__cpa_flush_range(start, numpages, cache)) return; /* @@ -326,16 +332,7 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, { unsigned int i, level; - BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - - if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { - cpa_flush_all(cache); - return; - } - - flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); - - if (!cache) + if 
(__cpa_flush_range(baddr, numpages, cache)) return; /*
[tip:x86/mm] x86/mm/cpa: Move CLFLUSH test into cpa_flush_array()
Commit-ID: fce2ce9544e9f098ba828442221ce99c2a5ecb0f Gitweb: https://git.kernel.org/tip/fce2ce9544e9f098ba828442221ce99c2a5ecb0f Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:22 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:42 +0200 x86/mm/cpa: Move CLFLUSH test into cpa_flush_array() Rather than guarding cpa_flush_array() users with a CLFLUSH test, put it inside. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.087848...@infradead.org --- arch/x86/mm/pageattr.c | 27 --- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cc4a2ae4dbb..33d89d505f93 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -328,6 +328,11 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return; + } + flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); if (!cache) @@ -1756,19 +1761,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cache = !!pgprot2cachemode(mask_set); /* -* On success we use CLFLUSH, when the CPU supports it to -* avoid the WBINVD. If the CPU does not support it and in the -* error case we fall back to cpa_flush_all (which uses -* WBINVD): +* On error; flush everything to be sure. */ - if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(baddr, addr, numpages, cache, - cpa.flags, pages); - } else - cpa_flush_range(baddr, numpages, cache); - } else + if (ret) { cpa_flush_all(cache); + goto out; + } + + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(baddr, addr, numpages, cache, + cpa.flags, pages); + } else { + cpa_flush_range(baddr, numpages, cache); + } out: return ret;
[tip:x86/mm] x86/mm/cpa: Move CLFLUSH test into cpa_flush_array()
Commit-ID: fce2ce9544e9f098ba828442221ce99c2a5ecb0f Gitweb: https://git.kernel.org/tip/fce2ce9544e9f098ba828442221ce99c2a5ecb0f Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:22 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:42 +0200 x86/mm/cpa: Move CLFLUSH test into cpa_flush_array() Rather than guarding cpa_flush_array() users with a CLFLUSH test, put it inside. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.087848...@infradead.org --- arch/x86/mm/pageattr.c | 27 --- 1 file changed, 16 insertions(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 3cc4a2ae4dbb..33d89d505f93 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -328,6 +328,11 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start, BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); + if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return; + } + flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); if (!cache) @@ -1756,19 +1761,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, cache = !!pgprot2cachemode(mask_set); /* -* On success we use CLFLUSH, when the CPU supports it to -* avoid the WBINVD. If the CPU does not support it and in the -* error case we fall back to cpa_flush_all (which uses -* WBINVD): +* On error; flush everything to be sure. */ - if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { - if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(baddr, addr, numpages, cache, - cpa.flags, pages); - } else - cpa_flush_range(baddr, numpages, cache); - } else + if (ret) { cpa_flush_all(cache); + goto out; + } + + if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { + cpa_flush_array(baddr, addr, numpages, cache, + cpa.flags, pages); + } else { + cpa_flush_range(baddr, numpages, cache); + } out: return ret;
[tip:x86/mm] x86/mm/cpa: Move CLFLUSH test into cpa_flush_range()
Commit-ID: 5f464b33b17219a233af1267c621632225bc7acc Gitweb: https://git.kernel.org/tip/5f464b33b17219a233af1267c621632225bc7acc Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:21 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:41 +0200 x86/mm/cpa: Move CLFLUSH test into cpa_flush_range() Rather than guarding all cpa_flush_range() uses with a CLFLUSH test, put it inside. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.036195...@infradead.org --- arch/x86/mm/pageattr.c | 15 +++ 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 02eb18403594..3cc4a2ae4dbb 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -293,6 +293,11 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); + if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return; + } + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); if (!cache) @@ -2078,10 +2083,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 1); - else - cpa_flush_all(1); + cpa_flush_range(start, numpages, 1); ret = __change_page_attr_set_clr(&cpa, 1); @@ -2092,10 +2094,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) * in case TLB flushing gets optimized in the cpa_flush_range() * path use the same logic as above. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 0); - else - cpa_flush_all(0); + cpa_flush_range(start, numpages, 0); return ret; }
[tip:x86/mm] x86/mm/cpa: Move CLFLUSH test into cpa_flush_range()
Commit-ID: 5f464b33b17219a233af1267c621632225bc7acc Gitweb: https://git.kernel.org/tip/5f464b33b17219a233af1267c621632225bc7acc Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:21 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:41 +0200 x86/mm/cpa: Move CLFLUSH test into cpa_flush_range() Rather than guarding all cpa_flush_range() uses with a CLFLUSH test, put it inside. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085948.036195...@infradead.org --- arch/x86/mm/pageattr.c | 15 +++ 1 file changed, 7 insertions(+), 8 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 02eb18403594..3cc4a2ae4dbb 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -293,6 +293,11 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); + if (!static_cpu_has(X86_FEATURE_CLFLUSH)) { + cpa_flush_all(cache); + return; + } + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); if (!cache) @@ -2078,10 +2083,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) /* * Before changing the encryption attribute, we need to flush caches. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 1); - else - cpa_flush_all(1); + cpa_flush_range(start, numpages, 1); ret = __change_page_attr_set_clr(&cpa, 1); @@ -2092,10 +2094,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc) * in case TLB flushing gets optimized in the cpa_flush_range() * path use the same logic as above. */ - if (static_cpu_has(X86_FEATURE_CLFLUSH)) - cpa_flush_range(start, numpages, 0); - else - cpa_flush_all(0); + cpa_flush_range(start, numpages, 0); return ret; }
[tip:x86/mm] x86/mm/cpa: Use flush_tlb_kernel_range()
Commit-ID: a7295fd53c39ce781a9792c9dd2c8747bf274160 Gitweb: https://git.kernel.org/tip/a7295fd53c39ce781a9792c9dd2c8747bf274160 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:20 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:41 +0200 x86/mm/cpa: Use flush_tlb_kernel_range() Both cpa_flush_range() and cpa_flush_array() have a well specified range, use that to do a range based TLB invalidate. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.985193...@infradead.org --- arch/x86/mm/pageattr.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bd9b0ac07352..02eb18403594 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -293,7 +293,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); - flush_tlb_all(); + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); if (!cache) return; @@ -315,14 +315,15 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void cpa_flush_array(unsigned long *start, int numpages, int cache, +static void cpa_flush_array(unsigned long baddr, unsigned long *start, + int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - flush_tlb_all(); + flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); if (!cache) return; @@ -1757,7 +1758,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, */ if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(addr, numpages, cache, + cpa_flush_array(baddr, addr, numpages, cache, cpa.flags, pages); } else cpa_flush_range(baddr, numpages, cache);
[tip:x86/mm] x86/mm/cpa: Use flush_tlb_kernel_range()
Commit-ID: a7295fd53c39ce781a9792c9dd2c8747bf274160 Gitweb: https://git.kernel.org/tip/a7295fd53c39ce781a9792c9dd2c8747bf274160 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:20 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:41 +0200 x86/mm/cpa: Use flush_tlb_kernel_range() Both cpa_flush_range() and cpa_flush_array() have a well specified range, use that to do a range based TLB invalidate. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.985193...@infradead.org --- arch/x86/mm/pageattr.c | 9 + 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index bd9b0ac07352..02eb18403594 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -293,7 +293,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); - flush_tlb_all(); + flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages); if (!cache) return; @@ -315,14 +315,15 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) } } -static void cpa_flush_array(unsigned long *start, int numpages, int cache, +static void cpa_flush_array(unsigned long baddr, unsigned long *start, + int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - flush_tlb_all(); + flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages); if (!cache) return; @@ -1757,7 +1758,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages, */ if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) { if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) { - cpa_flush_array(addr, numpages, cache, + cpa_flush_array(baddr, addr, numpages, cache, cpa.flags, pages); } else cpa_flush_range(baddr, numpages, cache);
[tip:x86/mm] x86/mm/cpa: Unconditionally avoid WBINDV when we can
Commit-ID: ddd07b750382adc2b78fdfbec47af8a6e0d8ef37 Gitweb: https://git.kernel.org/tip/ddd07b750382adc2b78fdfbec47af8a6e0d8ef37 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:19 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:41 +0200 x86/mm/cpa: Unconditionally avoid WBINDV when we can CAT has happened, WBINDV is bad (even before CAT blowing away the entire cache on a multi-core platform wasn't nice), try not to use it ever. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.933674...@infradead.org --- arch/x86/mm/pageattr.c | 18 ++ 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index b6a4c638f086..bd9b0ac07352 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -319,26 +319,12 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; -#ifdef CONFIG_PREEMPT - /* -* Avoid wbinvd() because it causes latencies on all CPUs, -* regardless of any CPU isolation that may be in effect. -* -* This should be extended for CAT enabled systems independent of -* PREEMPT because wbinvd() does not respect the CAT partitions and -* this is exposed to unpriviledged users through the graphics -* subsystem. -*/ - unsigned long do_wbinvd = 0; -#else - unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ -#endif BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); + flush_tlb_all(); - if (!cache || do_wbinvd) + if (!cache) return; /*
[tip:x86/mm] x86/mm/cpa: Unconditionally avoid WBINDV when we can
Commit-ID: ddd07b750382adc2b78fdfbec47af8a6e0d8ef37 Gitweb: https://git.kernel.org/tip/ddd07b750382adc2b78fdfbec47af8a6e0d8ef37 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:19 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:41 +0200 x86/mm/cpa: Unconditionally avoid WBINDV when we can CAT has happened, WBINDV is bad (even before CAT blowing away the entire cache on a multi-core platform wasn't nice), try not to use it ever. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.933674...@infradead.org --- arch/x86/mm/pageattr.c | 18 ++ 1 file changed, 2 insertions(+), 16 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index b6a4c638f086..bd9b0ac07352 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -319,26 +319,12 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache, int in_flags, struct page **pages) { unsigned int i, level; -#ifdef CONFIG_PREEMPT - /* -* Avoid wbinvd() because it causes latencies on all CPUs, -* regardless of any CPU isolation that may be in effect. -* -* This should be extended for CAT enabled systems independent of -* PREEMPT because wbinvd() does not respect the CAT partitions and -* this is exposed to unpriviledged users through the graphics -* subsystem. -*/ - unsigned long do_wbinvd = 0; -#else - unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */ -#endif BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); - on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1); + flush_tlb_all(); - if (!cache || do_wbinvd) + if (!cache) return; /*
[tip:x86/mm] x86/mm/cpa: Move flush_tlb_all()
Commit-ID: c0a759abf5a686a37b9204c13b7e281fe516c8f0 Gitweb: https://git.kernel.org/tip/c0a759abf5a686a37b9204c13b7e281fe516c8f0 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:18 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:40 +0200 x86/mm/cpa: Move flush_tlb_all() There is an atom errata, where we do a local TLB invalidate right before we return and then do a global TLB invalidate. Move the global invalidate up a little bit and avoid the local invalidate entirely. This does put the global invalidate under pgd_lock, but that shouldn't matter. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.882287...@infradead.org --- arch/x86/mm/pageattr.c | 44 +--- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a22f6b71a308..b6a4c638f086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -999,14 +999,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* -* Intel Atom errata AAH41 workaround. +* Do a global flush tlb after splitting the large page +* and before we do the actual change page attribute in the PTE. * -* The real fix should be in hw or in a microcode update, but -* we also probabilistically try to reduce the window of having -* a large TLB mixed with 4K TLBs while instruction fetches are -* going on. +* Without this, we violate the TLB application note, that says: +* "The TLBs may contain both ordinary and large-page +* translations for a 4-KByte range of linear addresses. This +* may occur if software modifies the paging structures so that +* the page size used for the address range changes. 
If the two +* translations differ with respect to page frame or attributes +* (e.g., permissions), processor behavior is undefined and may +* be implementation-specific." +* +* We do this global tlb flush inside the cpa_lock, so that we +* don't allow any other cpu, with stale tlb entries change the +* page attribute in parallel, that also falls into the +* just split large page entry. */ - __flush_tlb_all(); + flush_tlb_all(); spin_unlock(_lock); return 0; @@ -1531,28 +1541,8 @@ repeat: * We have to split the large page: */ err = split_large_page(cpa, kpte, address); - if (!err) { - /* -* Do a global flush tlb after splitting the large page -* and before we do the actual change page attribute in the PTE. -* -* With out this, we violate the TLB application note, that says -* "The TLBs may contain both ordinary and large-page -* translations for a 4-KByte range of linear addresses. This -* may occur if software modifies the paging structures so that -* the page size used for the address range changes. If the two -* translations differ with respect to page frame or attributes -* (e.g., permissions), processor behavior is undefined and may -* be implementation-specific." -* -* We do this global tlb flush inside the cpa_lock, so that we -* don't allow any other cpu, with stale tlb entries change the -* page attribute in parallel, that also falls into the -* just split large page entry. -*/ - flush_tlb_all(); + if (!err) goto repeat; - } return err; }
[tip:x86/mm] x86/mm/cpa: Move flush_tlb_all()
Commit-ID: c0a759abf5a686a37b9204c13b7e281fe516c8f0 Gitweb: https://git.kernel.org/tip/c0a759abf5a686a37b9204c13b7e281fe516c8f0 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:18 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:40 +0200 x86/mm/cpa: Move flush_tlb_all() There is an atom errata, where we do a local TLB invalidate right before we return and then do a global TLB invalidate. Move the global invalidate up a little bit and avoid the local invalidate entirely. This does put the global invalidate under pgd_lock, but that shouldn't matter. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.882287...@infradead.org --- arch/x86/mm/pageattr.c | 44 +--- 1 file changed, 17 insertions(+), 27 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index a22f6b71a308..b6a4c638f086 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -999,14 +999,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address, __set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE))); /* -* Intel Atom errata AAH41 workaround. +* Do a global flush tlb after splitting the large page +* and before we do the actual change page attribute in the PTE. * -* The real fix should be in hw or in a microcode update, but -* we also probabilistically try to reduce the window of having -* a large TLB mixed with 4K TLBs while instruction fetches are -* going on. +* Without this, we violate the TLB application note, that says: +* "The TLBs may contain both ordinary and large-page +* translations for a 4-KByte range of linear addresses. This +* may occur if software modifies the paging structures so that +* the page size used for the address range changes. 
If the two +* translations differ with respect to page frame or attributes +* (e.g., permissions), processor behavior is undefined and may +* be implementation-specific." +* +* We do this global tlb flush inside the cpa_lock, so that we +* don't allow any other cpu, with stale tlb entries change the +* page attribute in parallel, that also falls into the +* just split large page entry. */ - __flush_tlb_all(); + flush_tlb_all(); spin_unlock(_lock); return 0; @@ -1531,28 +1541,8 @@ repeat: * We have to split the large page: */ err = split_large_page(cpa, kpte, address); - if (!err) { - /* -* Do a global flush tlb after splitting the large page -* and before we do the actual change page attribute in the PTE. -* -* With out this, we violate the TLB application note, that says -* "The TLBs may contain both ordinary and large-page -* translations for a 4-KByte range of linear addresses. This -* may occur if software modifies the paging structures so that -* the page size used for the address range changes. If the two -* translations differ with respect to page frame or attributes -* (e.g., permissions), processor behavior is undefined and may -* be implementation-specific." -* -* We do this global tlb flush inside the cpa_lock, so that we -* don't allow any other cpu, with stale tlb entries change the -* page attribute in parallel, that also falls into the -* just split large page entry. -*/ - flush_tlb_all(); + if (!err) goto repeat; - } return err; }
[tip:x86/mm] x86/mm/cpa: Use flush_tlb_all()
Commit-ID: c6185b1f21a47af94617fde3af7e803817b522a9 Gitweb: https://git.kernel.org/tip/c6185b1f21a47af94617fde3af7e803817b522a9 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:17 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:40 +0200 x86/mm/cpa: Use flush_tlb_all() Instead of open-coding it.. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.831102...@infradead.org --- arch/x86/mm/pageattr.c | 12 +--- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4e55ded01be5..a22f6b71a308 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,16 +285,6 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void __cpa_flush_range(void *arg) -{ - /* -* We could optimize that further and do individual per page -* tlb invalidates for a low number of pages. Caveat: we must -* flush the high aliases on 64bit as well. -*/ - __flush_tlb_all(); -} - static void cpa_flush_range(unsigned long start, int numpages, int cache) { unsigned int i, level; @@ -303,7 +293,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); - on_each_cpu(__cpa_flush_range, NULL, 1); + flush_tlb_all(); if (!cache) return;
[tip:x86/mm] x86/mm/cpa: Use flush_tlb_all()
Commit-ID: c6185b1f21a47af94617fde3af7e803817b522a9 Gitweb: https://git.kernel.org/tip/c6185b1f21a47af94617fde3af7e803817b522a9 Author: Peter Zijlstra AuthorDate: Wed, 19 Sep 2018 10:50:17 +0200 Committer: Thomas Gleixner CommitDate: Thu, 27 Sep 2018 20:39:40 +0200 x86/mm/cpa: Use flush_tlb_all() Instead of open-coding it.. Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Reviewed-by: Dave Hansen Cc: Bin Yang Cc: Mark Gross Link: https://lkml.kernel.org/r/20180919085947.831102...@infradead.org --- arch/x86/mm/pageattr.c | 12 +--- 1 file changed, 1 insertion(+), 11 deletions(-) diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c index 4e55ded01be5..a22f6b71a308 100644 --- a/arch/x86/mm/pageattr.c +++ b/arch/x86/mm/pageattr.c @@ -285,16 +285,6 @@ static void cpa_flush_all(unsigned long cache) on_each_cpu(__cpa_flush_all, (void *) cache, 1); } -static void __cpa_flush_range(void *arg) -{ - /* -* We could optimize that further and do individual per page -* tlb invalidates for a low number of pages. Caveat: we must -* flush the high aliases on 64bit as well. -*/ - __flush_tlb_all(); -} - static void cpa_flush_range(unsigned long start, int numpages, int cache) { unsigned int i, level; @@ -303,7 +293,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache) BUG_ON(irqs_disabled() && !early_boot_irqs_disabled); WARN_ON(PAGE_ALIGN(start) != start); - on_each_cpu(__cpa_flush_range, NULL, 1); + flush_tlb_all(); if (!cache) return;
[tip:locking/core] locking/lockdep, cpu/hotplug: Annotate AP thread
Commit-ID: cb92173d1f0474784c6171a9d3fdbbca0ee53554 Gitweb: https://git.kernel.org/tip/cb92173d1f0474784c6171a9d3fdbbca0ee53554 Author: Peter Zijlstra AuthorDate: Tue, 11 Sep 2018 11:51:27 +0200 Committer: Ingo Molnar CommitDate: Tue, 11 Sep 2018 20:01:03 +0200 locking/lockdep, cpu/hotplug: Annotate AP thread Anybody trying to assert the cpu_hotplug_lock is held (lockdep_assert_cpus_held()) from AP callbacks will fail, because the lock is held by the BP. Stick in an explicit annotation in cpuhp_thread_fun() to make this work. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-tip-comm...@vger.kernel.org Fixes: cb538267ea1e ("jump_label/lockdep: Assert we hold the hotplug lock for _cpuslocked() operations") Link: http://lkml.kernel.org/r/20180911095127.gt24...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cpu.c | 28 1 file changed, 28 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 0097acec1c71..be4859f07153 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -315,6 +315,16 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(_hotplug_lock); } +static void lockdep_acquire_cpus_lock(void) +{ + rwsem_acquire(_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); +} + +static void lockdep_release_cpus_lock(void) +{ + rwsem_release(_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); +} + /* * Wait for currently running CPU hotplug operations to complete (if any) and * disable future CPU hotplug (from sysfs). 
The 'cpu_add_remove_lock' protects @@ -344,6 +354,17 @@ void cpu_hotplug_enable(void) cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); + +#else + +static void lockdep_acquire_cpus_lock(void) +{ +} + +static void lockdep_release_cpus_lock(void) +{ +} + #endif /* CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_HOTPLUG_SMT @@ -616,6 +637,12 @@ static void cpuhp_thread_fun(unsigned int cpu) */ smp_mb(); + /* +* The BP holds the hotplug lock, but we're now running on the AP, +* ensure that anybody asserting the lock is held, will actually find +* it so. +*/ + lockdep_acquire_cpus_lock(); cpuhp_lock_acquire(bringup); if (st->single) { @@ -661,6 +688,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } cpuhp_lock_release(bringup); + lockdep_release_cpus_lock(); if (!st->should_run) complete_ap_thread(st, bringup);
[tip:locking/core] locking/lockdep, cpu/hotplug: Annotate AP thread
Commit-ID: cb92173d1f0474784c6171a9d3fdbbca0ee53554 Gitweb: https://git.kernel.org/tip/cb92173d1f0474784c6171a9d3fdbbca0ee53554 Author: Peter Zijlstra AuthorDate: Tue, 11 Sep 2018 11:51:27 +0200 Committer: Ingo Molnar CommitDate: Tue, 11 Sep 2018 20:01:03 +0200 locking/lockdep, cpu/hotplug: Annotate AP thread Anybody trying to assert the cpu_hotplug_lock is held (lockdep_assert_cpus_held()) from AP callbacks will fail, because the lock is held by the BP. Stick in an explicit annotation in cpuhp_thread_fun() to make this work. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-tip-comm...@vger.kernel.org Fixes: cb538267ea1e ("jump_label/lockdep: Assert we hold the hotplug lock for _cpuslocked() operations") Link: http://lkml.kernel.org/r/20180911095127.gt24...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cpu.c | 28 1 file changed, 28 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 0097acec1c71..be4859f07153 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -315,6 +315,16 @@ void lockdep_assert_cpus_held(void) percpu_rwsem_assert_held(_hotplug_lock); } +static void lockdep_acquire_cpus_lock(void) +{ + rwsem_acquire(_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); +} + +static void lockdep_release_cpus_lock(void) +{ + rwsem_release(_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); +} + /* * Wait for currently running CPU hotplug operations to complete (if any) and * disable future CPU hotplug (from sysfs). 
The 'cpu_add_remove_lock' protects @@ -344,6 +354,17 @@ void cpu_hotplug_enable(void) cpu_maps_update_done(); } EXPORT_SYMBOL_GPL(cpu_hotplug_enable); + +#else + +static void lockdep_acquire_cpus_lock(void) +{ +} + +static void lockdep_release_cpus_lock(void) +{ +} + #endif /* CONFIG_HOTPLUG_CPU */ #ifdef CONFIG_HOTPLUG_SMT @@ -616,6 +637,12 @@ static void cpuhp_thread_fun(unsigned int cpu) */ smp_mb(); + /* +* The BP holds the hotplug lock, but we're now running on the AP, +* ensure that anybody asserting the lock is held, will actually find +* it so. +*/ + lockdep_acquire_cpus_lock(); cpuhp_lock_acquire(bringup); if (st->single) { @@ -661,6 +688,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } cpuhp_lock_release(bringup); + lockdep_release_cpus_lock(); if (!st->should_run) complete_ap_thread(st, bringup);
[tip:locking/core] locking/lockdep, cpu/hotplug: Annotate AP thread
Commit-ID: f1b2f6eccf99fc457221cc84c7550a8e3b17d4df Gitweb: https://git.kernel.org/tip/f1b2f6eccf99fc457221cc84c7550a8e3b17d4df Author: Peter Zijlstra AuthorDate: Tue, 11 Sep 2018 11:51:27 +0200 Committer: Ingo Molnar CommitDate: Tue, 11 Sep 2018 12:37:00 +0200 locking/lockdep, cpu/hotplug: Annotate AP thread Anybody trying to assert the cpu_hotplug_lock is held (lockdep_assert_cpus_held()) from AP callbacks will fail, because the lock is held by the BP. Stick in an explicit annotation in cpuhp_thread_fun() to make this work. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-tip-comm...@vger.kernel.org Fixes: cb538267ea1e ("jump_label/lockdep: Assert we hold the hotplug lock for _cpuslocked() operations") Link: http://lkml.kernel.org/r/20180911095127.gt24...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cpu.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 0097acec1c71..08c168b159da 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -616,6 +616,12 @@ static void cpuhp_thread_fun(unsigned int cpu) */ smp_mb(); + /* +* The BP holds the hotplug lock, but we're now running on the AP, +* ensure that anybody asserting the lock is held, will actually find +* it so. +*/ + rwsem_acquire(_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); cpuhp_lock_acquire(bringup); if (st->single) { @@ -661,6 +667,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } cpuhp_lock_release(bringup); + rwsem_release(_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); if (!st->should_run) complete_ap_thread(st, bringup);
[tip:locking/core] locking/lockdep, cpu/hotplug: Annotate AP thread
Commit-ID: f1b2f6eccf99fc457221cc84c7550a8e3b17d4df Gitweb: https://git.kernel.org/tip/f1b2f6eccf99fc457221cc84c7550a8e3b17d4df Author: Peter Zijlstra AuthorDate: Tue, 11 Sep 2018 11:51:27 +0200 Committer: Ingo Molnar CommitDate: Tue, 11 Sep 2018 12:37:00 +0200 locking/lockdep, cpu/hotplug: Annotate AP thread Anybody trying to assert the cpu_hotplug_lock is held (lockdep_assert_cpus_held()) from AP callbacks will fail, because the lock is held by the BP. Stick in an explicit annotation in cpuhp_thread_fun() to make this work. Reported-by: Ingo Molnar Signed-off-by: Peter Zijlstra (Intel) Cc: Linus Torvalds Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: linux-tip-comm...@vger.kernel.org Fixes: cb538267ea1e ("jump_label/lockdep: Assert we hold the hotplug lock for _cpuslocked() operations") Link: http://lkml.kernel.org/r/20180911095127.gt24...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/cpu.c | 7 +++ 1 file changed, 7 insertions(+) diff --git a/kernel/cpu.c b/kernel/cpu.c index 0097acec1c71..08c168b159da 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -616,6 +616,12 @@ static void cpuhp_thread_fun(unsigned int cpu) */ smp_mb(); + /* +* The BP holds the hotplug lock, but we're now running on the AP, +* ensure that anybody asserting the lock is held, will actually find +* it so. +*/ + rwsem_acquire(_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_); cpuhp_lock_acquire(bringup); if (st->single) { @@ -661,6 +667,7 @@ static void cpuhp_thread_fun(unsigned int cpu) } cpuhp_lock_release(bringup); + rwsem_release(_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_); if (!st->should_run) complete_ap_thread(st, bringup);
[tip:timers/urgent] clocksource: Revert "Remove kthread"
Commit-ID: e2c631ba75a7e727e8db0a9d30a06bfd434adb3a Gitweb: https://git.kernel.org/tip/e2c631ba75a7e727e8db0a9d30a06bfd434adb3a Author: Peter Zijlstra AuthorDate: Wed, 5 Sep 2018 10:41:58 +0200 Committer: Thomas Gleixner CommitDate: Thu, 6 Sep 2018 23:38:35 +0200 clocksource: Revert "Remove kthread" It turns out that the silly spawn kthread from worker was actually needed. clocksource_watchdog_kthread() cannot be called directly from clocksource_watchdog_work(), because clocksource_select() calls timekeeping_notify() which uses stop_machine(). One cannot use stop_machine() from a workqueue() due to lock inversions wrt CPU hotplug. Revert the patch but add a comment that explains why we jump through such apparently silly hoops. Fixes: 7197e77abcb6 ("clocksource: Remove kthread") Reported-by: Siegfried Metz Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Niklas Cassel Tested-by: Kevin Shanahan Tested-by: viktor_jaegerskuep...@freenet.de Tested-by: Siegfried Metz Cc: rafael.j.wyso...@intel.com Cc: len.br...@intel.com Cc: diego.vi...@gmail.com Cc: rui.zh...@intel.com Cc: bjorn.anders...@linaro.org Link: https://lkml.kernel.org/r/20180905084158.gr24...@hirez.programming.kicks-ass.net --- kernel/time/clocksource.c | 40 ++-- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f74fb00d8064..0e6e97a01942 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags) spin_unlock_irqrestore(_lock, *flags); } +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + /* * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* +* We cannot directly run
clocksource_watchdog_kthread() here, because +* clocksource_select() calls timekeeping_notify() which uses +* stop_machine(). One cannot use stop_machine() from a workqueue() due +* lock inversions wrt CPU hotplug. +* +* Also, we only ever run this work once or twice during the lifetime +* of the kernel, so there is no point in creating a more permanent +* kthread for this. +* +* If kthread_run fails the next watchdog scan over the +* watchdog_list will find the unstable clock again. +*/ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; /* -* If the clocksource is registered clocksource_watchdog_work() will +* If the clocksource is registered clocksource_watchdog_kthread() will * re-rate and re-select. */ if (list_empty(>list)) { @@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs) if (cs->mark_unstable) cs->mark_unstable(cs); - /* kick clocksource_watchdog_work() */ + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(_work); } @@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs) * @cs:clocksource to be marked unstable * * This function is called by the x86 TSC code to mark clocksources as unstable; - * it defers demotion and re-selection to a work. + * it defers demotion and re-selection to a kthread. 
*/ void clocksource_mark_unstable(struct clocksource *cs) { @@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) } } -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -static int __clocksource_watchdog_work(void) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; @@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void) return select; } -static void clocksource_watchdog_work(struct work_struct *work) +static int clocksource_watchdog_kthread(void *data) { mutex_lock(_mutex); - if (__clocksource_watchdog_work()) + if (__clocksource_watchdog_kthread()) clocksource_select(); mutex_unlock(_mutex); + return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static
[tip:timers/urgent] clocksource: Revert "Remove kthread"
Commit-ID: e2c631ba75a7e727e8db0a9d30a06bfd434adb3a Gitweb: https://git.kernel.org/tip/e2c631ba75a7e727e8db0a9d30a06bfd434adb3a Author: Peter Zijlstra AuthorDate: Wed, 5 Sep 2018 10:41:58 +0200 Committer: Thomas Gleixner CommitDate: Thu, 6 Sep 2018 23:38:35 +0200 clocksource: Revert "Remove kthread" It turns out that the silly spawn kthread from worker was actually needed. clocksource_watchdog_kthread() cannot be called directly from clocksource_watchdog_work(), because clocksource_select() calls timekeeping_notify() which uses stop_machine(). One cannot use stop_machine() from a workqueue() due to lock inversions wrt CPU hotplug. Revert the patch but add a comment that explains why we jump through such apparently silly hoops. Fixes: 7197e77abcb6 ("clocksource: Remove kthread") Reported-by: Siegfried Metz Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Tested-by: Niklas Cassel Tested-by: Kevin Shanahan Tested-by: viktor_jaegerskuep...@freenet.de Tested-by: Siegfried Metz Cc: rafael.j.wyso...@intel.com Cc: len.br...@intel.com Cc: diego.vi...@gmail.com Cc: rui.zh...@intel.com Cc: bjorn.anders...@linaro.org Link: https://lkml.kernel.org/r/20180905084158.gr24...@hirez.programming.kicks-ass.net --- kernel/time/clocksource.c | 40 ++-- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f74fb00d8064..0e6e97a01942 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags) spin_unlock_irqrestore(_lock, *flags); } +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + /* * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* +* We cannot directly run
clocksource_watchdog_kthread() here, because +* clocksource_select() calls timekeeping_notify() which uses +* stop_machine(). One cannot use stop_machine() from a workqueue() due +* lock inversions wrt CPU hotplug. +* +* Also, we only ever run this work once or twice during the lifetime +* of the kernel, so there is no point in creating a more permanent +* kthread for this. +* +* If kthread_run fails the next watchdog scan over the +* watchdog_list will find the unstable clock again. +*/ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; /* -* If the clocksource is registered clocksource_watchdog_work() will +* If the clocksource is registered clocksource_watchdog_kthread() will * re-rate and re-select. */ if (list_empty(>list)) { @@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs) if (cs->mark_unstable) cs->mark_unstable(cs); - /* kick clocksource_watchdog_work() */ + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(_work); } @@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs) * @cs:clocksource to be marked unstable * * This function is called by the x86 TSC code to mark clocksources as unstable; - * it defers demotion and re-selection to a work. + * it defers demotion and re-selection to a kthread. 
*/ void clocksource_mark_unstable(struct clocksource *cs) { @@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) } } -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -static int __clocksource_watchdog_work(void) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; @@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void) return select; } -static void clocksource_watchdog_work(struct work_struct *work) +static int clocksource_watchdog_kthread(void *data) { mutex_lock(_mutex); - if (__clocksource_watchdog_work()) + if (__clocksource_watchdog_kthread()) clocksource_select(); mutex_unlock(_mutex); + return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static
[tip:timers/urgent] clocksource: Revert "Remove kthread"
Commit-ID: 760902b24960679c2e8592de3a56359d2c205731 Gitweb: https://git.kernel.org/tip/760902b24960679c2e8592de3a56359d2c205731 Author: Peter Zijlstra AuthorDate: Wed, 5 Sep 2018 10:41:58 +0200 Committer: Thomas Gleixner CommitDate: Thu, 6 Sep 2018 12:42:28 +0200 clocksource: Revert "Remove kthread" It turns out that the silly spawn kthread from worker was actually needed. clocksource_watchdog_kthread() cannot be called directly from clocksource_watchdog_work(), because clocksource_select() calls timekeeping_notify() which uses stop_machine(). One cannot use stop_machine() from a workqueue() due to lock inversions wrt CPU hotplug. Revert the patch but add a comment that explains why we jump through such apparently silly hoops. Fixes: 7197e77abcb6 ("clocksource: Remove kthread") Reported-by: Siegfried Metz Signed-off-by: Peter Zijlstra (Intel) Tested-by: Niklas Cassel Tested-by: Kevin Shanahan Tested-by: viktor_jaegerskuep...@freenet.de Tested-by: Siegfried Metz Cc: rafael.j.wyso...@intel.com Cc: len.br...@intel.com Cc: diego.vi...@gmail.com Cc: rui.zh...@intel.com Cc: bjorn.anders...@linaro.org Link: https://lkml.kernel.org/r/20180905084158.gr24...@hirez.programming.kicks-ass.net --- kernel/time/clocksource.c | 40 ++-- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f74fb00d8064..0e6e97a01942 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags) spin_unlock_irqrestore(_lock, *flags); } +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + /* * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* +* We cannot directly run clocksource_watchdog_kthread() here, because +*
clocksource_select() calls timekeeping_notify() which uses +* stop_machine(). One cannot use stop_machine() from a workqueue() due +* lock inversions wrt CPU hotplug. +* +* Also, we only ever run this work once or twice during the lifetime +* of the kernel, so there is no point in creating a more permanent +* kthread for this. +* +* If kthread_run fails the next watchdog scan over the +* watchdog_list will find the unstable clock again. +*/ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; /* -* If the clocksource is registered clocksource_watchdog_work() will +* If the clocksource is registered clocksource_watchdog_kthread() will * re-rate and re-select. */ if (list_empty(>list)) { @@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs) if (cs->mark_unstable) cs->mark_unstable(cs); - /* kick clocksource_watchdog_work() */ + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(_work); } @@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs) * @cs:clocksource to be marked unstable * * This function is called by the x86 TSC code to mark clocksources as unstable; - * it defers demotion and re-selection to a work. + * it defers demotion and re-selection to a kthread. 
*/ void clocksource_mark_unstable(struct clocksource *cs) { @@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) } } -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -static int __clocksource_watchdog_work(void) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; @@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void) return select; } -static void clocksource_watchdog_work(struct work_struct *work) +static int clocksource_watchdog_kthread(void *data) { mutex_lock(_mutex); - if (__clocksource_watchdog_work()) + if (__clocksource_watchdog_kthread()) clocksource_select(); mutex_unlock(_mutex); + return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int
[tip:timers/urgent] clocksource: Revert "Remove kthread"
Commit-ID: 760902b24960679c2e8592de3a56359d2c205731 Gitweb: https://git.kernel.org/tip/760902b24960679c2e8592de3a56359d2c205731 Author: Peter Zijlstra AuthorDate: Wed, 5 Sep 2018 10:41:58 +0200 Committer: Thomas Gleixner CommitDate: Thu, 6 Sep 2018 12:42:28 +0200 clocksource: Revert "Remove kthread" It turns out that the silly spawn kthread from worker was actually needed. clocksource_watchdog_kthread() cannot be called directly from clocksource_watchdog_work(), because clocksource_select() calls timekeeping_notify() which uses stop_machine(). One cannot use stop_machine() from a workqueue() due to lock inversions wrt CPU hotplug. Revert the patch but add a comment that explains why we jump through such apparently silly hoops. Fixes: 7197e77abcb6 ("clocksource: Remove kthread") Reported-by: Siegfried Metz Signed-off-by: Peter Zijlstra (Intel) Tested-by: Niklas Cassel Tested-by: Kevin Shanahan Tested-by: viktor_jaegerskuep...@freenet.de Tested-by: Siegfried Metz Cc: rafael.j.wyso...@intel.com Cc: len.br...@intel.com Cc: diego.vi...@gmail.com Cc: rui.zh...@intel.com Cc: bjorn.anders...@linaro.org Link: https://lkml.kernel.org/r/20180905084158.gr24...@hirez.programming.kicks-ass.net --- kernel/time/clocksource.c | 40 ++-- 1 file changed, 30 insertions(+), 10 deletions(-) diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c index f74fb00d8064..0e6e97a01942 100644 --- a/kernel/time/clocksource.c +++ b/kernel/time/clocksource.c @@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags) spin_unlock_irqrestore(_lock, *flags); } +static int clocksource_watchdog_kthread(void *data); +static void __clocksource_change_rating(struct clocksource *cs, int rating); + /* * Interval: 0.5sec Threshold: 0.0625s */ #define WATCHDOG_INTERVAL (HZ >> 1) #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4) +static void clocksource_watchdog_work(struct work_struct *work) +{ + /* +* We cannot directly run clocksource_watchdog_kthread() here, because +*
clocksource_select() calls timekeeping_notify() which uses +* stop_machine(). One cannot use stop_machine() from a workqueue() due +* lock inversions wrt CPU hotplug. +* +* Also, we only ever run this work once or twice during the lifetime +* of the kernel, so there is no point in creating a more permanent +* kthread for this. +* +* If kthread_run fails the next watchdog scan over the +* watchdog_list will find the unstable clock again. +*/ + kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog"); +} + static void __clocksource_unstable(struct clocksource *cs) { cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG); cs->flags |= CLOCK_SOURCE_UNSTABLE; /* -* If the clocksource is registered clocksource_watchdog_work() will +* If the clocksource is registered clocksource_watchdog_kthread() will * re-rate and re-select. */ if (list_empty(>list)) { @@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs) if (cs->mark_unstable) cs->mark_unstable(cs); - /* kick clocksource_watchdog_work() */ + /* kick clocksource_watchdog_kthread() */ if (finished_booting) schedule_work(_work); } @@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs) * @cs:clocksource to be marked unstable * * This function is called by the x86 TSC code to mark clocksources as unstable; - * it defers demotion and re-selection to a work. + * it defers demotion and re-selection to a kthread. 
*/ void clocksource_mark_unstable(struct clocksource *cs) { @@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs) } } -static void __clocksource_change_rating(struct clocksource *cs, int rating); - -static int __clocksource_watchdog_work(void) +static int __clocksource_watchdog_kthread(void) { struct clocksource *cs, *tmp; unsigned long flags; @@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void) return select; } -static void clocksource_watchdog_work(struct work_struct *work) +static int clocksource_watchdog_kthread(void *data) { mutex_lock(_mutex); - if (__clocksource_watchdog_work()) + if (__clocksource_watchdog_kthread()) clocksource_select(); mutex_unlock(_mutex); + return 0; } static bool clocksource_is_watchdog(struct clocksource *cs) @@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs) static void clocksource_select_watchdog(bool fallback) { } static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { } static inline void clocksource_resume_watchdog(void) { } -static inline int
[tip:sched/core] stop_machine: Reflow cpu_stop_queue_two_works()
Commit-ID: b80a2bfce85e1051056d98d04ecb2d0b55cbbc1c Gitweb: https://git.kernel.org/tip/b80a2bfce85e1051056d98d04ecb2d0b55cbbc1c Author: Peter Zijlstra AuthorDate: Mon, 30 Jul 2018 13:21:40 +0200 Committer: Thomas Gleixner CommitDate: Thu, 2 Aug 2018 15:25:20 +0200 stop_machine: Reflow cpu_stop_queue_two_works() The code flow in cpu_stop_queue_two_works() is a little arcane; fix this by lifting the preempt_disable() to the top to create more natural nesting wrt the spinlocks and make the wake_up_q() and preempt_enable() unconditional at the end. Furthermore, enable preemption in the -EDEADLK case, such that we spin-wait with preemption enabled. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: isa...@codeaurora.org Cc: m...@codeblueprint.co.uk Cc: psoda...@codeaurora.org Cc: gre...@linuxfoundation.org Cc: pkond...@codeaurora.org Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20180730112140.gh2...@hirez.programming.kicks-ass.net --- kernel/stop_machine.c | 41 +++-- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e190d1ef3a23..34b6652e8677 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -236,13 +236,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; + retry: + /* +* The waking up of stopper threads has to happen in the same +* scheduling context as the queueing. Otherwise, there is a +* possibility of one of the above stoppers being woken up by another +* CPU, and preempting us. This will cause us to not wake up the other +* stopper forever. 
+*/ + preempt_disable(); raw_spin_lock_irq(>lock); raw_spin_lock_nested(>lock, SINGLE_DEPTH_NESTING); - err = -ENOENT; - if (!stopper1->enabled || !stopper2->enabled) + if (!stopper1->enabled || !stopper2->enabled) { + err = -ENOENT; goto unlock; + } + /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. @@ -253,36 +264,30 @@ retry: * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ - err = -EDEADLK; - if (unlikely(stop_cpus_in_progress)) - goto unlock; + if (unlikely(stop_cpus_in_progress)) { + err = -EDEADLK; + goto unlock; + } err = 0; __cpu_stop_queue_work(stopper1, work1, ); __cpu_stop_queue_work(stopper2, work2, ); - /* -* The waking up of stopper threads has to happen -* in the same scheduling context as the queueing. -* Otherwise, there is a possibility of one of the -* above stoppers being woken up by another CPU, -* and preempting us. This will cause us to n ot -* wake up the other stopper forever. -*/ - preempt_disable(); + unlock: raw_spin_unlock(>lock); raw_spin_unlock_irq(>lock); if (unlikely(err == -EDEADLK)) { + preempt_enable(); + while (stop_cpus_in_progress) cpu_relax(); + goto retry; } - if (!err) { - wake_up_q(); - preempt_enable(); - } + wake_up_q(); + preempt_enable(); return err; }
[tip:sched/core] stop_machine: Reflow cpu_stop_queue_two_works()
Commit-ID: b80a2bfce85e1051056d98d04ecb2d0b55cbbc1c Gitweb: https://git.kernel.org/tip/b80a2bfce85e1051056d98d04ecb2d0b55cbbc1c Author: Peter Zijlstra AuthorDate: Mon, 30 Jul 2018 13:21:40 +0200 Committer: Thomas Gleixner CommitDate: Thu, 2 Aug 2018 15:25:20 +0200 stop_machine: Reflow cpu_stop_queue_two_works() The code flow in cpu_stop_queue_two_works() is a little arcane; fix this by lifting the preempt_disable() to the top to create more natural nesting wrt the spinlocks and make the wake_up_q() and preempt_enable() unconditional at the end. Furthermore, enable preemption in the -EDEADLK case, such that we spin-wait with preemption enabled. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Sebastian Andrzej Siewior Cc: isa...@codeaurora.org Cc: m...@codeblueprint.co.uk Cc: psoda...@codeaurora.org Cc: gre...@linuxfoundation.org Cc: pkond...@codeaurora.org Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20180730112140.gh2...@hirez.programming.kicks-ass.net --- kernel/stop_machine.c | 41 +++-- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e190d1ef3a23..34b6652e8677 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -236,13 +236,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; + retry: + /* +* The waking up of stopper threads has to happen in the same +* scheduling context as the queueing. Otherwise, there is a +* possibility of one of the above stoppers being woken up by another +* CPU, and preempting us. This will cause us to not wake up the other +* stopper forever. 
+*/ + preempt_disable(); raw_spin_lock_irq(>lock); raw_spin_lock_nested(>lock, SINGLE_DEPTH_NESTING); - err = -ENOENT; - if (!stopper1->enabled || !stopper2->enabled) + if (!stopper1->enabled || !stopper2->enabled) { + err = -ENOENT; goto unlock; + } + /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. @@ -253,36 +264,30 @@ retry: * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ - err = -EDEADLK; - if (unlikely(stop_cpus_in_progress)) - goto unlock; + if (unlikely(stop_cpus_in_progress)) { + err = -EDEADLK; + goto unlock; + } err = 0; __cpu_stop_queue_work(stopper1, work1, ); __cpu_stop_queue_work(stopper2, work2, ); - /* -* The waking up of stopper threads has to happen -* in the same scheduling context as the queueing. -* Otherwise, there is a possibility of one of the -* above stoppers being woken up by another CPU, -* and preempting us. This will cause us to n ot -* wake up the other stopper forever. -*/ - preempt_disable(); + unlock: raw_spin_unlock(>lock); raw_spin_unlock_irq(>lock); if (unlikely(err == -EDEADLK)) { + preempt_enable(); + while (stop_cpus_in_progress) cpu_relax(); + goto retry; } - if (!err) { - wake_up_q(); - preempt_enable(); - } + wake_up_q(); + preempt_enable(); return err; }
[tip:sched/core] stop_machine: Reflow cpu_stop_queue_two_works()
Commit-ID: 2171ce2d470d6e389ebbef3edd22c7643918a02f Gitweb: https://git.kernel.org/tip/2171ce2d470d6e389ebbef3edd22c7643918a02f Author: Peter Zijlstra AuthorDate: Mon, 30 Jul 2018 13:21:40 +0200 Committer: Thomas Gleixner CommitDate: Thu, 2 Aug 2018 14:02:53 +0200 stop_machine: Reflow cpu_stop_queue_two_works() The code flow in cpu_stop_queue_two_works() is a little arcane; fix this by lifting the preempt_disable() to the top to create more natural nesting wrt the spinlocks and make the wake_up_q() and preempt_enable() unconditional at the end. Furthermore, enable preemption in the -EDEADLK case, such that we spin-wait with preemption enabled. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Sebastian Andrzej Siewior Cc: isa...@codeaurora.org Cc: m...@codeblueprint.co.uk Cc: psoda...@codeaurora.org Cc: gre...@linuxfoundation.org Cc: pkond...@codeaurora.org Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20180730112140.gh2...@hirez.programming.kicks-ass.net --- kernel/stop_machine.c | 41 +++-- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e190d1ef3a23..34b6652e8677 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -236,13 +236,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; + retry: + /* +* The waking up of stopper threads has to happen in the same +* scheduling context as the queueing. Otherwise, there is a +* possibility of one of the above stoppers being woken up by another +* CPU, and preempting us. This will cause us to not wake up the other +* stopper forever. 
+*/ + preempt_disable(); raw_spin_lock_irq(>lock); raw_spin_lock_nested(>lock, SINGLE_DEPTH_NESTING); - err = -ENOENT; - if (!stopper1->enabled || !stopper2->enabled) + if (!stopper1->enabled || !stopper2->enabled) { + err = -ENOENT; goto unlock; + } + /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. @@ -253,36 +264,30 @@ retry: * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ - err = -EDEADLK; - if (unlikely(stop_cpus_in_progress)) - goto unlock; + if (unlikely(stop_cpus_in_progress)) { + err = -EDEADLK; + goto unlock; + } err = 0; __cpu_stop_queue_work(stopper1, work1, ); __cpu_stop_queue_work(stopper2, work2, ); - /* -* The waking up of stopper threads has to happen -* in the same scheduling context as the queueing. -* Otherwise, there is a possibility of one of the -* above stoppers being woken up by another CPU, -* and preempting us. This will cause us to n ot -* wake up the other stopper forever. -*/ - preempt_disable(); + unlock: raw_spin_unlock(>lock); raw_spin_unlock_irq(>lock); if (unlikely(err == -EDEADLK)) { + preempt_enable(); + while (stop_cpus_in_progress) cpu_relax(); + goto retry; } - if (!err) { - wake_up_q(); - preempt_enable(); - } + wake_up_q(); + preempt_enable(); return err; }
[tip:sched/core] stop_machine: Reflow cpu_stop_queue_two_works()
Commit-ID: 2171ce2d470d6e389ebbef3edd22c7643918a02f Gitweb: https://git.kernel.org/tip/2171ce2d470d6e389ebbef3edd22c7643918a02f Author: Peter Zijlstra AuthorDate: Mon, 30 Jul 2018 13:21:40 +0200 Committer: Thomas Gleixner CommitDate: Thu, 2 Aug 2018 14:02:53 +0200 stop_machine: Reflow cpu_stop_queue_two_works() The code flow in cpu_stop_queue_two_works() is a little arcane; fix this by lifting the preempt_disable() to the top to create more natural nesting wrt the spinlocks and make the wake_up_q() and preempt_enable() unconditional at the end. Furthermore, enable preemption in the -EDEADLK case, such that we spin-wait with preemption enabled. Suggested-by: Thomas Gleixner Signed-off-by: Peter Zijlstra (Intel) Cc: Sebastian Andrzej Siewior Cc: isa...@codeaurora.org Cc: m...@codeblueprint.co.uk Cc: psoda...@codeaurora.org Cc: gre...@linuxfoundation.org Cc: pkond...@codeaurora.org Cc: sta...@vger.kernel.org Link: https://lkml.kernel.org/r/20180730112140.gh2...@hirez.programming.kicks-ass.net --- kernel/stop_machine.c | 41 +++-- 1 file changed, 23 insertions(+), 18 deletions(-) diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c index e190d1ef3a23..34b6652e8677 100644 --- a/kernel/stop_machine.c +++ b/kernel/stop_machine.c @@ -236,13 +236,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct cpu_stop_work *work1, struct cpu_stopper *stopper2 = per_cpu_ptr(_stopper, cpu2); DEFINE_WAKE_Q(wakeq); int err; + retry: + /* +* The waking up of stopper threads has to happen in the same +* scheduling context as the queueing. Otherwise, there is a +* possibility of one of the above stoppers being woken up by another +* CPU, and preempting us. This will cause us to not wake up the other +* stopper forever. 
+*/ + preempt_disable(); raw_spin_lock_irq(>lock); raw_spin_lock_nested(>lock, SINGLE_DEPTH_NESTING); - err = -ENOENT; - if (!stopper1->enabled || !stopper2->enabled) + if (!stopper1->enabled || !stopper2->enabled) { + err = -ENOENT; goto unlock; + } + /* * Ensure that if we race with __stop_cpus() the stoppers won't get * queued up in reverse order leading to system deadlock. @@ -253,36 +264,30 @@ retry: * It can be falsely true but it is safe to spin until it is cleared, * queue_stop_cpus_work() does everything under preempt_disable(). */ - err = -EDEADLK; - if (unlikely(stop_cpus_in_progress)) - goto unlock; + if (unlikely(stop_cpus_in_progress)) { + err = -EDEADLK; + goto unlock; + } err = 0; __cpu_stop_queue_work(stopper1, work1, ); __cpu_stop_queue_work(stopper2, work2, ); - /* -* The waking up of stopper threads has to happen -* in the same scheduling context as the queueing. -* Otherwise, there is a possibility of one of the -* above stoppers being woken up by another CPU, -* and preempting us. This will cause us to n ot -* wake up the other stopper forever. -*/ - preempt_disable(); + unlock: raw_spin_unlock(>lock); raw_spin_unlock_irq(>lock); if (unlikely(err == -EDEADLK)) { + preempt_enable(); + while (stop_cpus_in_progress) cpu_relax(); + goto retry; } - if (!err) { - wake_up_q(); - preempt_enable(); - } + wake_up_q(); + preempt_enable(); return err; }
[tip:x86/timers] sched/clock: Close a hole in sched_clock_init()
Commit-ID: 9407f5a7ee77c631d1e100436132437cf6237e45 Gitweb: https://git.kernel.org/tip/9407f5a7ee77c631d1e100436132437cf6237e45 Author: Peter Zijlstra AuthorDate: Fri, 20 Jul 2018 10:09:11 +0200 Committer: Thomas Gleixner CommitDate: Fri, 20 Jul 2018 11:58:00 +0200 sched/clock: Close a hole in sched_clock_init() All data required for the 'unstable' sched_clock must be set-up _before_ enabling it -- setting sched_clock_running. This includes the __gtod_offset but also a recent scd stamp. Make the gtod-offset update also set the scd stamp -- it requires the same two clock reads _anyway_. This doesn't hurt in the sched_clock_tick_stable() case and ensures sched_clock_init() gets everything set-up before use. Also switch to unconditional IRQ-disable/enable because the static key stuff already requires this is not run with IRQs disabled. Fixes: 857baa87b642 ("sched/clock: Enable sched clock early") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Pavel Tatashin Cc: steven.sist...@oracle.com Cc: daniel.m.jor...@oracle.com Cc: li...@armlinux.org.uk Cc: schwidef...@de.ibm.com Cc: heiko.carst...@de.ibm.com Cc: john.stu...@linaro.org Cc: sb...@codeaurora.org Cc: h...@zytor.com Cc: douly.f...@cn.fujitsu.com Cc: pra...@redhat.com Cc: feng.t...@intel.com Cc: pmla...@suse.com Cc: gno...@lxorguk.ukuu.org.uk Cc: linux-s...@vger.kernel.org Cc: boris.ostrov...@oracle.com Cc: jgr...@suse.com Cc: pbonz...@redhat.com Link: https://lkml.kernel.org/r/20180720080911.gm2...@hirez.programming.kicks-ass.net --- kernel/sched/clock.c | 16 ++-- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c5c47ad3f386..811a39aca1ce 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -197,13 +197,14 @@ void clear_sched_clock_stable(void) static void __sched_clock_gtod_offset(void) { - __gtod_offset = (sched_clock() + __sched_clock_offset) - ktime_get_ns(); + struct sched_clock_data *scd = this_scd(); + + 
__scd_stamp(scd); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; } void __init sched_clock_init(void) { - unsigned long flags; - /* * Set __gtod_offset such that once we mark sched_clock_running, * sched_clock_tick() continues where sched_clock() left off. @@ -211,16 +212,11 @@ void __init sched_clock_init(void) * Even if TSC is buggered, we're still UP at this point so it * can't really be out of sync. */ - local_irq_save(flags); + local_irq_disable(); __sched_clock_gtod_offset(); - local_irq_restore(flags); + local_irq_enable(); static_branch_inc(_clock_running); - - /* Now that sched_clock_running is set adjust scd */ - local_irq_save(flags); - sched_clock_tick(); - local_irq_restore(flags); } /* * We run this as late_initcall() such that it runs after all built-in drivers,
[tip:x86/timers] sched/clock: Close a hole in sched_clock_init()
Commit-ID: 9407f5a7ee77c631d1e100436132437cf6237e45 Gitweb: https://git.kernel.org/tip/9407f5a7ee77c631d1e100436132437cf6237e45 Author: Peter Zijlstra AuthorDate: Fri, 20 Jul 2018 10:09:11 +0200 Committer: Thomas Gleixner CommitDate: Fri, 20 Jul 2018 11:58:00 +0200 sched/clock: Close a hole in sched_clock_init() All data required for the 'unstable' sched_clock must be set-up _before_ enabling it -- setting sched_clock_running. This includes the __gtod_offset but also a recent scd stamp. Make the gtod-offset update also set the scd stamp -- it requires the same two clock reads _anyway_. This doesn't hurt in the sched_clock_tick_stable() case and ensures sched_clock_init() gets everything set-up before use. Also switch to unconditional IRQ-disable/enable because the static key stuff already requires this is not run with IRQs disabled. Fixes: 857baa87b642 ("sched/clock: Enable sched clock early") Signed-off-by: Peter Zijlstra (Intel) Signed-off-by: Thomas Gleixner Cc: Pavel Tatashin Cc: steven.sist...@oracle.com Cc: daniel.m.jor...@oracle.com Cc: li...@armlinux.org.uk Cc: schwidef...@de.ibm.com Cc: heiko.carst...@de.ibm.com Cc: john.stu...@linaro.org Cc: sb...@codeaurora.org Cc: h...@zytor.com Cc: douly.f...@cn.fujitsu.com Cc: pra...@redhat.com Cc: feng.t...@intel.com Cc: pmla...@suse.com Cc: gno...@lxorguk.ukuu.org.uk Cc: linux-s...@vger.kernel.org Cc: boris.ostrov...@oracle.com Cc: jgr...@suse.com Cc: pbonz...@redhat.com Link: https://lkml.kernel.org/r/20180720080911.gm2...@hirez.programming.kicks-ass.net --- kernel/sched/clock.c | 16 ++-- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c index c5c47ad3f386..811a39aca1ce 100644 --- a/kernel/sched/clock.c +++ b/kernel/sched/clock.c @@ -197,13 +197,14 @@ void clear_sched_clock_stable(void) static void __sched_clock_gtod_offset(void) { - __gtod_offset = (sched_clock() + __sched_clock_offset) - ktime_get_ns(); + struct sched_clock_data *scd = this_scd(); + + 
__scd_stamp(scd); + __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod; } void __init sched_clock_init(void) { - unsigned long flags; - /* * Set __gtod_offset such that once we mark sched_clock_running, * sched_clock_tick() continues where sched_clock() left off. @@ -211,16 +212,11 @@ void __init sched_clock_init(void) * Even if TSC is buggered, we're still UP at this point so it * can't really be out of sync. */ - local_irq_save(flags); + local_irq_disable(); __sched_clock_gtod_offset(); - local_irq_restore(flags); + local_irq_enable(); static_branch_inc(_clock_running); - - /* Now that sched_clock_running is set adjust scd */ - local_irq_save(flags); - sched_clock_tick(); - local_irq_restore(flags); } /* * We run this as late_initcall() such that it runs after all built-in drivers,
[tip:sched/core] sched/cpufreq: Clarify sugov_get_util()
Commit-ID: 45f5519ec55e75af3565dd737586d3b041834f71 Gitweb: https://git.kernel.org/tip/45f5519ec55e75af3565dd737586d3b041834f71 Author: Peter Zijlstra AuthorDate: Thu, 5 Jul 2018 14:36:17 +0200 Committer: Ingo Molnar CommitDate: Mon, 16 Jul 2018 00:16:29 +0200 sched/cpufreq: Clarify sugov_get_util() Add a few comments to (hopefully) clarify some of the magic in sugov_get_util(). Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: morten.rasmus...@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: clau...@evidence.eu.com Cc: daniel.lezc...@linaro.org Cc: dietmar.eggem...@arm.com Cc: j...@joelfernandes.org Cc: juri.le...@redhat.com Cc: luca.ab...@santannapisa.it Cc: patrick.bell...@arm.com Cc: quentin.per...@arm.com Cc: r...@rjwysocki.net Cc: valentin.schnei...@arm.com Link: http://lkml.kernel.org/r/20180705123617.gm2...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 75 +--- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c9622b3f183d..97dcd4472a0e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -177,6 +177,26 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. 
+ * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); @@ -188,47 +208,60 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) if (rt_rq_is_runnable(>rt)) return max; + /* +* Early check to see if IRQ/steal time saturates the CPU, can be +* because of inaccuracies in how we track these -- see +* update_irq_load_avg(). +*/ irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) return max; - /* Sum rq utilization */ + /* +* Because the time spend on RT/DL tasks is visible as 'lost' time to +* CFS tasks and we use the same metric to track the effective +* utilization (PELT windows are synchronized) we can directly add them +* to obtain the CPU's actual utilization. +*/ util = cpu_util_cfs(rq); util += cpu_util_rt(rq); /* -* Interrupt time is not seen by RQS utilization so we can compare -* them with the CPU capacity +* We do not make cpu_util_dl() a permanent part of this sum because we +* want to use cpu_bw_dl() later on, but we need to check if the +* CFS+RT+DL sum is saturated (ie. no idle time) such that we select +* f_max when there is no idle time. +* +* NOTE: numerical errors or stop class might cause us to not quite hit +* saturation when we should -- something for later. */ if ((util + cpu_util_dl(rq)) >= max) return max; /* -* As there is still idle time on the CPU, we need to compute the -* utilization level of the CPU. +* There is still idle time; further improve the number by using the +* irq metric. 
Because IRQ/steal time is hidden from the task clock we +* need to scale the task numbers: * +* 1 - irq +* U' = irq + --- * U +*max +*/ + util *= (max - irq); + util /= max; + util += irq; + + /* * Bandwidth required by DEADLINE must always be granted while, for * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism * to gracefully reduce the frequency when no tasks show up for longer * periods of time. * -* Ideally we would like to set util_dl as min/guaranteed freq and -* util_cfs + util_dl as requested freq. However, cpufreq is not yet -* ready for such an interface. So, we only do the latter for now. +* Ideally we would like to set bw_dl as min/guaranteed freq and util + +* bw_dl as requested freq. However,
[tip:sched/core] sched/cpufreq: Clarify sugov_get_util()
Commit-ID: 45f5519ec55e75af3565dd737586d3b041834f71 Gitweb: https://git.kernel.org/tip/45f5519ec55e75af3565dd737586d3b041834f71 Author: Peter Zijlstra AuthorDate: Thu, 5 Jul 2018 14:36:17 +0200 Committer: Ingo Molnar CommitDate: Mon, 16 Jul 2018 00:16:29 +0200 sched/cpufreq: Clarify sugov_get_util() Add a few comments to (hopefully) clarify some of the magic in sugov_get_util(). Signed-off-by: Peter Zijlstra (Intel) Acked-by: Viresh Kumar Cc: Linus Torvalds Cc: morten.rasmus...@arm.com Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Vincent Guittot Cc: clau...@evidence.eu.com Cc: daniel.lezc...@linaro.org Cc: dietmar.eggem...@arm.com Cc: j...@joelfernandes.org Cc: juri.le...@redhat.com Cc: luca.ab...@santannapisa.it Cc: patrick.bell...@arm.com Cc: quentin.per...@arm.com Cc: r...@rjwysocki.net Cc: valentin.schnei...@arm.com Link: http://lkml.kernel.org/r/20180705123617.gm2...@hirez.programming.kicks-ass.net Signed-off-by: Ingo Molnar --- kernel/sched/cpufreq_schedutil.c | 75 +--- 1 file changed, 54 insertions(+), 21 deletions(-) diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c index c9622b3f183d..97dcd4472a0e 100644 --- a/kernel/sched/cpufreq_schedutil.c +++ b/kernel/sched/cpufreq_schedutil.c @@ -177,6 +177,26 @@ static unsigned int get_next_freq(struct sugov_policy *sg_policy, return cpufreq_driver_resolve_freq(policy, freq); } +/* + * This function computes an effective utilization for the given CPU, to be + * used for frequency selection given the linear relation: f = u * f_max. + * + * The scheduler tracks the following metrics: + * + * cpu_util_{cfs,rt,dl,irq}() + * cpu_bw_dl() + * + * Where the cfs,rt and dl util numbers are tracked with the same metric and + * synchronized windows and are thus directly comparable. + * + * The cfs,rt,dl utilization are the running times measured with rq->clock_task + * which excludes things like IRQ and steal-time. These latter are then accrued + * in the irq utilization. 
+ * + * The DL bandwidth number otoh is not a measured metric but a value computed + * based on the task model parameters and gives the minimal utilization + * required to meet deadlines. + */ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) { struct rq *rq = cpu_rq(sg_cpu->cpu); @@ -188,47 +208,60 @@ static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu) if (rt_rq_is_runnable(>rt)) return max; + /* +* Early check to see if IRQ/steal time saturates the CPU, can be +* because of inaccuracies in how we track these -- see +* update_irq_load_avg(). +*/ irq = cpu_util_irq(rq); - if (unlikely(irq >= max)) return max; - /* Sum rq utilization */ + /* +* Because the time spend on RT/DL tasks is visible as 'lost' time to +* CFS tasks and we use the same metric to track the effective +* utilization (PELT windows are synchronized) we can directly add them +* to obtain the CPU's actual utilization. +*/ util = cpu_util_cfs(rq); util += cpu_util_rt(rq); /* -* Interrupt time is not seen by RQS utilization so we can compare -* them with the CPU capacity +* We do not make cpu_util_dl() a permanent part of this sum because we +* want to use cpu_bw_dl() later on, but we need to check if the +* CFS+RT+DL sum is saturated (ie. no idle time) such that we select +* f_max when there is no idle time. +* +* NOTE: numerical errors or stop class might cause us to not quite hit +* saturation when we should -- something for later. */ if ((util + cpu_util_dl(rq)) >= max) return max; /* -* As there is still idle time on the CPU, we need to compute the -* utilization level of the CPU. +* There is still idle time; further improve the number by using the +* irq metric. 
Because IRQ/steal time is hidden from the task clock we +* need to scale the task numbers: * +* 1 - irq +* U' = irq + --- * U +*max +*/ + util *= (max - irq); + util /= max; + util += irq; + + /* * Bandwidth required by DEADLINE must always be granted while, for * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism * to gracefully reduce the frequency when no tasks show up for longer * periods of time. * -* Ideally we would like to set util_dl as min/guaranteed freq and -* util_cfs + util_dl as requested freq. However, cpufreq is not yet -* ready for such an interface. So, we only do the latter for now. +* Ideally we would like to set bw_dl as min/guaranteed freq and util + +* bw_dl as requested freq. However,