[tip:sched/core] sched: Rework pick_next_task() slow-path

2019-08-08 Thread tip-bot for Peter Zijlstra
Commit-ID:  67692435c411e5c53a1c588ecca2037aebd81f2e
Gitweb: https://git.kernel.org/tip/67692435c411e5c53a1c588ecca2037aebd81f2e
Author: Peter Zijlstra 
AuthorDate: Wed, 29 May 2019 20:36:44 +
Committer:  Peter Zijlstra 
CommitDate: Thu, 8 Aug 2019 09:09:31 +0200

sched: Rework pick_next_task() slow-path

Avoid the RETRY_TASK case in the pick_next_task() slow path.

By doing the put_prev_task() early, we get the rt/deadline pull done,
and by testing rq->nr_running we know if we need newidle_balance().

This then gives a stable state to pick a task from.

Since the fast path handles the fair class only, the other classes will
always see pick_next_task(.prev=NULL, .rf=NULL), which lets us simplify them.
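
For readers skimming the diff below, the resulting slow path has roughly the
following shape (a simplified sketch paraphrasing the patch; the helper name
pick_next_task_slowpath() is made up for illustration):

static struct task_struct *
pick_next_task_slowpath(struct rq *rq, struct task_struct *prev, struct rq_flags *rf)
{
        const struct sched_class *class;
        struct task_struct *p;

        /*
         * Put the previous task first; the RT/DL put_prev_task() uses this
         * window to pull higher-priority tasks onto this runqueue.
         */
        prev->sched_class->put_prev_task(rq, prev, rf);

        /* With prev accounted for, an empty runqueue means newidle balance. */
        if (!rq->nr_running)
                newidle_balance(rq, rf);

        /* The runqueue state is now stable: a plain pick loop, no RETRY_TASK. */
        for_each_class(class) {
                p = class->pick_next_task(rq, NULL, NULL);
                if (p)
                        return p;
        }

        BUG();  /* the idle class should always have a runnable task */
}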

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Aaron Lu 
Cc: Valentin Schneider 
Cc: mi...@kernel.org
Cc: Phil Auld 
Cc: Julien Desfossez 
Cc: Nishanth Aravamudan 
Link: 
https://lkml.kernel.org/r/aa34d24b36547139248f32a30138791ac6c02bd6.1559129225.git.vpil...@digitalocean.com
---
 kernel/sched/core.c  | 19 ---
 kernel/sched/deadline.c  | 30 ++
 kernel/sched/fair.c  |  9 ++---
 kernel/sched/idle.c  |  4 +++-
 kernel/sched/rt.c| 29 +
 kernel/sched/sched.h | 13 -
 kernel/sched/stop_task.c |  3 ++-
 7 files changed, 34 insertions(+), 73 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 7bbe78a31ba5..a6661852907b 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3791,7 +3791,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
 
p = fair_sched_class.pick_next_task(rq, prev, rf);
if (unlikely(p == RETRY_TASK))
-   goto again;
+   goto restart;
 
/* Assumes fair_sched_class->next == idle_sched_class */
if (unlikely(!p))
@@ -3800,14 +3800,19 @@ pick_next_task(struct rq *rq, struct task_struct *prev, 
struct rq_flags *rf)
return p;
}
 
-again:
+restart:
+   /*
+* Ensure that we put DL/RT tasks before the pick loop, such that they
+* can PULL higher prio tasks when we lower the RQ 'priority'.
+*/
+   prev->sched_class->put_prev_task(rq, prev, rf);
+   if (!rq->nr_running)
+   newidle_balance(rq, rf);
+
for_each_class(class) {
-   p = class->pick_next_task(rq, prev, rf);
-   if (p) {
-   if (unlikely(p == RETRY_TASK))
-   goto again;
+   p = class->pick_next_task(rq, NULL, NULL);
+   if (p)
return p;
-   }
}
 
/* The idle class should always have a runnable task: */
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2872e15a87cd..0b9cbfb2b1d4 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1761,39 +1761,13 @@ pick_next_task_dl(struct rq *rq, struct task_struct 
*prev, struct rq_flags *rf)
struct task_struct *p;
struct dl_rq *dl_rq;
 
-   dl_rq = &rq->dl;
-
-   if (need_pull_dl_task(rq, prev)) {
-   /*
-* This is OK, because current is on_cpu, which avoids it being
-* picked for load-balance and preemption/IRQs are still
-* disabled avoiding further scheduler activity on it and we're
-* being very careful to re-start the picking loop.
-*/
-   rq_unpin_lock(rq, rf);
-   pull_dl_task(rq);
-   rq_repin_lock(rq, rf);
-   /*
-* pull_dl_task() can drop (and re-acquire) rq->lock; this
-* means a stop task can slip in, in which case we need to
-* re-start task selection.
-*/
-   if (rq->stop && task_on_rq_queued(rq->stop))
-   return RETRY_TASK;
-   }
+   WARN_ON_ONCE(prev || rf);
 
-   /*
-* When prev is DL, we may throttle it in put_prev_task().
-* So, we update time before we check for dl_nr_running.
-*/
-   if (prev->sched_class == &dl_sched_class)
-   update_curr_dl(rq);
+   dl_rq = &rq->dl;
 
if (unlikely(!dl_rq->dl_nr_running))
return NULL;
 
-   put_prev_task(rq, prev);
-
dl_se = pick_next_dl_entity(rq, dl_rq);
BUG_ON(!dl_se);
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 4418c1998e69..19c58599e967 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6770,7 +6770,7 @@ again:
goto idle;
 
 #ifdef CONFIG_FAIR_GROUP_SCHED
-   if (prev->sched_class != &fair_sched_class)
+   if (!prev || prev->sched_class != &fair_sched_class)
goto simple;
 
/*
@@ -6847,8 +6847,8 @@ again:
goto done;
 simple:
 #endif
-
-   put_prev_task(rq, prev);
+   if (prev)
+   

[tip:sched/core] sched: Allow put_prev_task() to drop rq->lock

2019-08-08 Thread tip-bot for Peter Zijlstra
Commit-ID:  5f2a45fc9e89e022233085e6f0f352eb6ff770bb
Gitweb: https://git.kernel.org/tip/5f2a45fc9e89e022233085e6f0f352eb6ff770bb
Author: Peter Zijlstra 
AuthorDate: Wed, 29 May 2019 20:36:43 +
Committer:  Peter Zijlstra 
CommitDate: Thu, 8 Aug 2019 09:09:31 +0200

sched: Allow put_prev_task() to drop rq->lock

Currently the pick_next_task() loop is convoluted and ugly because it
can drop rq->lock and then has to restart the picking.

For the RT/Deadline classes, it is put_prev_task() where we do
balancing, and we could do this before the picking loop. Make this
possible.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Valentin Schneider 
Cc: Aaron Lu 
Cc: mi...@kernel.org
Cc: Phil Auld 
Cc: Julien Desfossez 
Cc: Nishanth Aravamudan 
Link: 
https://lkml.kernel.org/r/e4519f6850477ab7f3d257062796e6425ee4ba7c.1559129225.git.vpil...@digitalocean.com
---
 kernel/sched/core.c  |  2 +-
 kernel/sched/deadline.c  | 14 +-
 kernel/sched/fair.c  |  2 +-
 kernel/sched/idle.c  |  2 +-
 kernel/sched/rt.c| 14 +-
 kernel/sched/sched.h |  4 ++--
 kernel/sched/stop_task.c |  2 +-
 7 files changed, 32 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 0c4220789092..7bbe78a31ba5 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6090,7 +6090,7 @@ static struct task_struct *__pick_migrate_task(struct rq 
*rq)
for_each_class(class) {
next = class->pick_next_task(rq, NULL, NULL);
if (next) {
-   next->sched_class->put_prev_task(rq, next);
+   next->sched_class->put_prev_task(rq, next, NULL);
return next;
}
}
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 6eae79350303..2872e15a87cd 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1804,13 +1804,25 @@ pick_next_task_dl(struct rq *rq, struct task_struct 
*prev, struct rq_flags *rf)
return p;
 }
 
-static void put_prev_task_dl(struct rq *rq, struct task_struct *p)
+static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct 
rq_flags *rf)
 {
update_curr_dl(rq);
 
update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 1);
if (on_dl_rq(&p->dl) && p->nr_cpus_allowed > 1)
enqueue_pushable_dl_task(rq, p);
+
+   if (rf && !on_dl_rq(&p->dl) && need_pull_dl_task(rq, p)) {
+   /*
+* This is OK, because current is on_cpu, which avoids it being
+* picked for load-balance and preemption/IRQs are still
+* disabled avoiding further scheduler activity on it and we've
+* not yet started the picking loop.
+*/
+   rq_unpin_lock(rq, rf);
+   pull_dl_task(rq);
+   rq_repin_lock(rq, rf);
+   }
 }
 
 /*
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index e7c27eda9f24..4418c1998e69 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -6901,7 +6901,7 @@ idle:
 /*
  * Account for a descheduled task:
  */
-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct 
rq_flags *rf)
 {
struct sched_entity *se = &prev->se;
struct cfs_rq *cfs_rq;
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 54194d41035c..8d59de2e4a6e 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -374,7 +374,7 @@ static void check_preempt_curr_idle(struct rq *rq, struct 
task_struct *p, int fl
resched_curr(rq);
 }
 
-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev)
+static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct 
rq_flags *rf)
 {
 }
 
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index f71bcbe1a00c..dbdabd76f192 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1592,7 +1592,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct 
*prev, struct rq_flags *rf)
return p;
 }
 
-static void put_prev_task_rt(struct rq *rq, struct task_struct *p)
+static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct 
rq_flags *rf)
 {
update_curr_rt(rq);
 
@@ -1604,6 +1604,18 @@ static void put_prev_task_rt(struct rq *rq, struct 
task_struct *p)
 */
if (on_rt_rq(&p->rt) && p->nr_cpus_allowed > 1)
enqueue_pushable_task(rq, p);
+
+   if (rf && !on_rt_rq(&p->rt) && need_pull_rt_task(rq, p)) {
+   /*
+* This is OK, because current is on_cpu, which avoids it being
+* picked for load-balance and preemption/IRQs are still
+* disabled avoiding further scheduler activity on it and we've
+* not yet started the picking loop.
+*/
+   rq_unpin_lock(rq, rf);
+   pull_rt_task(rq);
+   rq_repin_lock(rq, rf);
+   

[tip:sched/core] sched: Add task_struct pointer to sched_class::set_curr_task

2019-08-08 Thread tip-bot for Peter Zijlstra
Commit-ID:  03b7fad167efca3b7a39733933f9df56e79c
Gitweb: https://git.kernel.org/tip/03b7fad167efca3b7a39733933f9df56e79c
Author: Peter Zijlstra 
AuthorDate: Wed, 29 May 2019 20:36:41 +
Committer:  Peter Zijlstra 
CommitDate: Thu, 8 Aug 2019 09:09:31 +0200

sched: Add task_struct pointer to sched_class::set_curr_task

In preparation for further separating pick_next_task() and
set_curr_task(), pass the actual task into set_curr_task(); while there,
rename it to set_next_task() to better pair with put_prev_task().

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Aaron Lu 
Cc: Valentin Schneider 
Cc: mi...@kernel.org
Cc: Phil Auld 
Cc: Julien Desfossez 
Cc: Nishanth Aravamudan 
Link: 
https://lkml.kernel.org/r/a96d1bcdd716db4a4c5da2fece647a1456c0ed78.1559129225.git.vpil...@digitalocean.com
---
 kernel/sched/core.c  | 12 ++--
 kernel/sched/deadline.c  |  7 +--
 kernel/sched/fair.c  | 17 ++---
 kernel/sched/idle.c  | 27 +++
 kernel/sched/rt.c|  7 +--
 kernel/sched/sched.h |  7 ---
 kernel/sched/stop_task.c | 17 +++--
 7 files changed, 48 insertions(+), 46 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 364b6d7da2be..0c4220789092 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1494,7 +1494,7 @@ void do_set_cpus_allowed(struct task_struct *p, const 
struct cpumask *new_mask)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
-   set_curr_task(rq, p);
+   set_next_task(rq, p);
 }
 
 /*
@@ -4325,7 +4325,7 @@ void rt_mutex_setprio(struct task_struct *p, struct 
task_struct *pi_task)
if (queued)
enqueue_task(rq, p, queue_flag);
if (running)
-   set_curr_task(rq, p);
+   set_next_task(rq, p);
 
check_class_changed(rq, p, prev_class, oldprio);
 out_unlock:
@@ -4392,7 +4392,7 @@ void set_user_nice(struct task_struct *p, long nice)
resched_curr(rq);
}
if (running)
-   set_curr_task(rq, p);
+   set_next_task(rq, p);
 out_unlock:
task_rq_unlock(rq, p, &rf);
 }
@@ -4840,7 +4840,7 @@ change:
enqueue_task(rq, p, queue_flags);
}
if (running)
-   set_curr_task(rq, p);
+   set_next_task(rq, p);
 
check_class_changed(rq, p, prev_class, oldprio);
 
@@ -6042,7 +6042,7 @@ void sched_setnuma(struct task_struct *p, int nid)
if (queued)
enqueue_task(rq, p, ENQUEUE_RESTORE | ENQUEUE_NOCLOCK);
if (running)
-   set_curr_task(rq, p);
+   set_next_task(rq, p);
task_rq_unlock(rq, p, &rf);
 }
 #endif /* CONFIG_NUMA_BALANCING */
@@ -6919,7 +6919,7 @@ void sched_move_task(struct task_struct *tsk)
if (queued)
enqueue_task(rq, tsk, queue_flags);
if (running)
-   set_curr_task(rq, tsk);
+   set_next_task(rq, tsk);
 
task_rq_unlock(rq, tsk, &rf);
 }
diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 2dc2784b196c..6eae79350303 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1844,11 +1844,6 @@ static void task_fork_dl(struct task_struct *p)
 */
 }
 
-static void set_curr_task_dl(struct rq *rq)
-{
-   set_next_task_dl(rq, rq->curr);
-}
-
 #ifdef CONFIG_SMP
 
 /* Only try algorithms three times */
@@ -2466,6 +2461,7 @@ const struct sched_class dl_sched_class = {
 
.pick_next_task = pick_next_task_dl,
.put_prev_task  = put_prev_task_dl,
+   .set_next_task  = set_next_task_dl,
 
 #ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
@@ -2476,7 +2472,6 @@ const struct sched_class dl_sched_class = {
.task_woken = task_woken_dl,
 #endif
 
-   .set_curr_task  = set_curr_task_dl,
.task_tick  = task_tick_dl,
.task_fork  = task_fork_dl,
 
diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 7d8043fc8317..8ce1b8893947 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -10150,9 +10150,19 @@ static void switched_to_fair(struct rq *rq, struct 
task_struct *p)
  * This routine is mostly called to set cfs_rq->curr field when a task
  * migrates between groups/classes.
  */
-static void set_curr_task_fair(struct rq *rq)
+static void set_next_task_fair(struct rq *rq, struct task_struct *p)
 {
-   struct sched_entity *se = &rq->curr->se;
+   struct sched_entity *se = &p->se;
+
+#ifdef CONFIG_SMP
+   if (task_on_rq_queued(p)) {
+   /*
+* Move the next running task to the front of the list, so our
+* cfs_tasks list becomes MRU one.
+*/
+   list_move(&se->group_node, &rq->cfs_tasks);
+   }
+#endif
 
for_each_sched_entity(se) {
struct cfs_rq *cfs_rq 

[tip:sched/core] sched/fair: Expose newidle_balance()

2019-08-08 Thread tip-bot for Peter Zijlstra
Commit-ID:  5ba553eff0c3a7c099b1e29a740277a82c0c3314
Gitweb: https://git.kernel.org/tip/5ba553eff0c3a7c099b1e29a740277a82c0c3314
Author: Peter Zijlstra 
AuthorDate: Wed, 29 May 2019 20:36:42 +
Committer:  Peter Zijlstra 
CommitDate: Thu, 8 Aug 2019 09:09:31 +0200

sched/fair: Expose newidle_balance()

For pick_next_task_fair() it is the newidle balance that requires
dropping the rq->lock; provided we do put_prev_task() early, we can
also detect the condition for doing newidle early.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Aaron Lu 
Cc: Valentin Schneider 
Cc: mi...@kernel.org
Cc: Phil Auld 
Cc: Julien Desfossez 
Cc: Nishanth Aravamudan 
Link: 
https://lkml.kernel.org/r/9e3eb1859b946f03d7e500453a885725b68957ba.1559129225.git.vpil...@digitalocean.com
---
 kernel/sched/fair.c  | 18 --
 kernel/sched/sched.h |  4 
 2 files changed, 12 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 8ce1b8893947..e7c27eda9f24 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3690,8 +3690,6 @@ static inline unsigned long cfs_rq_load_avg(struct cfs_rq 
*cfs_rq)
return cfs_rq->avg.load_avg;
 }
 
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf);
-
 static inline unsigned long task_util(struct task_struct *p)
 {
return READ_ONCE(p->se.avg.util_avg);
@@ -6878,11 +6876,10 @@ done: __maybe_unused;
return p;
 
 idle:
-   update_misfit_status(NULL, rq);
-   new_tasks = idle_balance(rq, rf);
+   new_tasks = newidle_balance(rq, rf);
 
/*
-* Because idle_balance() releases (and re-acquires) rq->lock, it is
+* Because newidle_balance() releases (and re-acquires) rq->lock, it is
 * possible for any higher priority task to appear. In that case we
 * must re-start the pick_next_entity() loop.
 */
@@ -9045,10 +9042,10 @@ out_one_pinned:
ld_moved = 0;
 
/*
-* idle_balance() disregards balance intervals, so we could repeatedly
-* reach this code, which would lead to balance_interval skyrocketting
-* in a short amount of time. Skip the balance_interval increase logic
-* to avoid that.
+* newidle_balance() disregards balance intervals, so we could
+* repeatedly reach this code, which would lead to balance_interval
+* skyrocketting in a short amount of time. Skip the balance_interval
+* increase logic to avoid that.
 */
if (env.idle == CPU_NEWLY_IDLE)
goto out;
@@ -9758,7 +9755,7 @@ static inline void nohz_newidle_balance(struct rq 
*this_rq) { }
  * idle_balance is called by schedule() if this_cpu is about to become
  * idle. Attempts to pull tasks from other CPUs.
  */
-static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
+int newidle_balance(struct rq *this_rq, struct rq_flags *rf)
 {
unsigned long next_balance = jiffies + HZ;
int this_cpu = this_rq->cpu;
@@ -9766,6 +9763,7 @@ static int idle_balance(struct rq *this_rq, struct 
rq_flags *rf)
int pulled_task = 0;
u64 curr_cost = 0;
 
+   update_misfit_status(NULL, this_rq);
/*
 * We must set idle_stamp _before_ calling idle_balance(), such that we
 * measure the duration of idle_balance() as idle time.
diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index f3c50445bf22..304d98e712bf 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -1445,10 +1445,14 @@ static inline void unregister_sched_domain_sysctl(void)
 }
 #endif
 
+extern int newidle_balance(struct rq *this_rq, struct rq_flags *rf);
+
 #else
 
 static inline void sched_ttwu_pending(void) { }
 
+static inline int newidle_balance(struct rq *this_rq, struct rq_flags *rf) { 
return 0; }
+
 #endif /* CONFIG_SMP */
 
 #include "stats.h"


[tip:sched/core] sched/{rt,deadline}: Fix set_next_task vs pick_next_task

2019-08-08 Thread tip-bot for Peter Zijlstra
Commit-ID:  f95d4eaee6d0207bff2dc93371133d31227d4cfb
Gitweb: https://git.kernel.org/tip/f95d4eaee6d0207bff2dc93371133d31227d4cfb
Author: Peter Zijlstra 
AuthorDate: Wed, 29 May 2019 20:36:40 +
Committer:  Peter Zijlstra 
CommitDate: Thu, 8 Aug 2019 09:09:30 +0200

sched/{rt,deadline}: Fix set_next_task vs pick_next_task

Because pick_next_task() implies set_curr_task(), and because some of the
details haven't mattered much so far, some of what _should_ be in
set_curr_task() ended up in pick_next_task(); correct this.

This prepares the way for a pick_next_task() variant that does not
affect the current state; allowing remote picking.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Aaron Lu 
Cc: Valentin Schneider 
Cc: mi...@kernel.org
Cc: Phil Auld 
Cc: Julien Desfossez 
Cc: Nishanth Aravamudan 
Link: 
https://lkml.kernel.org/r/38c61d5240553e043c27c5e00b9dd0d184dd6081.1559129225.git.vpil...@digitalocean.com
---
 kernel/sched/deadline.c | 22 +++---
 kernel/sched/rt.c   | 26 +-
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index 039dde2b1dac..2dc2784b196c 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1727,12 +1727,20 @@ static void start_hrtick_dl(struct rq *rq, struct 
task_struct *p)
 }
 #endif
 
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static void set_next_task_dl(struct rq *rq, struct task_struct *p)
 {
p->se.exec_start = rq_clock_task(rq);
 
/* You can't push away the running task */
dequeue_pushable_dl_task(rq, p);
+
+   if (hrtick_enabled(rq))
+   start_hrtick_dl(rq, p);
+
+   if (rq->curr->sched_class != &dl_sched_class)
+   update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+   deadline_queue_push_tasks(rq);
 }
 
 static struct sched_dl_entity *pick_next_dl_entity(struct rq *rq,
@@ -1791,15 +1799,7 @@ pick_next_task_dl(struct rq *rq, struct task_struct 
*prev, struct rq_flags *rf)
 
p = dl_task_of(dl_se);
 
-   set_next_task(rq, p);
-
-   if (hrtick_enabled(rq))
-   start_hrtick_dl(rq, p);
-
-   deadline_queue_push_tasks(rq);
-
-   if (rq->curr->sched_class != &dl_sched_class)
-   update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+   set_next_task_dl(rq, p);
 
return p;
 }
@@ -1846,7 +1846,7 @@ static void task_fork_dl(struct task_struct *p)
 
 static void set_curr_task_dl(struct rq *rq)
 {
-   set_next_task(rq, rq->curr);
+   set_next_task_dl(rq, rq->curr);
 }
 
 #ifdef CONFIG_SMP
diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c
index a532558a5176..40bb71004325 100644
--- a/kernel/sched/rt.c
+++ b/kernel/sched/rt.c
@@ -1498,12 +1498,22 @@ static void check_preempt_curr_rt(struct rq *rq, struct 
task_struct *p, int flag
 #endif
 }
 
-static inline void set_next_task(struct rq *rq, struct task_struct *p)
+static inline void set_next_task_rt(struct rq *rq, struct task_struct *p)
 {
p->se.exec_start = rq_clock_task(rq);
 
/* The running task is never eligible for pushing */
dequeue_pushable_task(rq, p);
+
+   /*
+* If prev task was rt, put_prev_task() has already updated the
+* utilization. We only care of the case where we start to schedule a
+* rt task
+*/
+   if (rq->curr->sched_class != &rt_sched_class)
+   update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+
+   rt_queue_push_tasks(rq);
 }
 
 static struct sched_rt_entity *pick_next_rt_entity(struct rq *rq,
@@ -1577,17 +1587,7 @@ pick_next_task_rt(struct rq *rq, struct task_struct 
*prev, struct rq_flags *rf)
 
p = _pick_next_task_rt(rq);
 
-   set_next_task(rq, p);
-
-   rt_queue_push_tasks(rq);
-
-   /*
-* If prev task was rt, put_prev_task() has already updated the
-* utilization. We only care of the case where we start to schedule a
-* rt task
-*/
-   if (rq->curr->sched_class != &rt_sched_class)
-   update_rt_rq_load_avg(rq_clock_pelt(rq), rq, 0);
+   set_next_task_rt(rq, p);
 
return p;
 }
@@ -2356,7 +2356,7 @@ static void task_tick_rt(struct rq *rq, struct 
task_struct *p, int queued)
 
 static void set_curr_task_rt(struct rq *rq)
 {
-   set_next_task(rq, rq->curr);
+   set_next_task_rt(rq, rq->curr);
 }
 
 static unsigned int get_rr_interval_rt(struct rq *rq, struct task_struct *task)


[tip:sched/core] stop_machine: Fix stop_cpus_in_progress ordering

2019-08-08 Thread tip-bot for Peter Zijlstra
Commit-ID:  99d84bf8c65a7a0dbc9e166ca0a58ed949ac4f37
Gitweb: https://git.kernel.org/tip/99d84bf8c65a7a0dbc9e166ca0a58ed949ac4f37
Author: Peter Zijlstra 
AuthorDate: Wed, 29 May 2019 20:36:37 +
Committer:  Peter Zijlstra 
CommitDate: Thu, 8 Aug 2019 09:09:30 +0200

stop_machine: Fix stop_cpus_in_progress ordering

Make sure the entire for loop has stop_cpus_in_progress set.
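
As a standalone illustration of the pattern (plain C, not the kernel code;
barrier() is spelled out explicitly as the usual GCC compiler barrier):

#include <stdbool.h>

#define barrier() __asm__ __volatile__("" ::: "memory")

static bool stop_in_progress;

static void queue_all(int ncpus)
{
        stop_in_progress = true;
        barrier();      /* the flag store must not sink below this point */

        for (int cpu = 0; cpu < ncpus; cpu++) {
                /* queue the per-CPU stop work here */
        }

        barrier();      /* ...and the flag must stay set until the loop is done */
        stop_in_progress = false;
}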

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Aaron Lu 
Cc: Valentin Schneider 
Cc: mi...@kernel.org
Cc: Phil Auld 
Cc: Julien Desfossez 
Cc: Nishanth Aravamudan 
Link: 
https://lkml.kernel.org/r/0fd8fd4b99b9b9aa88d8b2dff897f7fd0d88f72c.1559129225.git.vpil...@digitalocean.com
---
 kernel/stop_machine.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index b4f83f7bdf86..c7031a22aa7b 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -383,6 +383,7 @@ static bool queue_stop_cpus_work(const struct cpumask 
*cpumask,
 */
preempt_disable();
stop_cpus_in_progress = true;
+   barrier();
for_each_cpu(cpu, cpumask) {
work = &per_cpu(cpu_stopper.stop_work, cpu);
work->fn = fn;
@@ -391,6 +392,7 @@ static bool queue_stop_cpus_work(const struct cpumask 
*cpumask,
if (cpu_stop_queue_work(cpu, work))
queued = true;
}
+   barrier();
stop_cpus_in_progress = false;
preempt_enable();
 


[tip:sched/core] sched: Fix kerneldoc comment for ia64_set_curr_task

2019-08-08 Thread tip-bot for Peter Zijlstra
Commit-ID:  5feeb7837a448f659e0aaa19fb446b1d9a4b323a
Gitweb: https://git.kernel.org/tip/5feeb7837a448f659e0aaa19fb446b1d9a4b323a
Author: Peter Zijlstra 
AuthorDate: Wed, 29 May 2019 20:36:38 +
Committer:  Peter Zijlstra 
CommitDate: Thu, 8 Aug 2019 09:09:30 +0200

sched: Fix kerneldoc comment for ia64_set_curr_task

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Aaron Lu 
Cc: Valentin Schneider 
Cc: mi...@kernel.org
Cc: Phil Auld 
Cc: Julien Desfossez 
Cc: Nishanth Aravamudan 
Link: 
https://lkml.kernel.org/r/fde3a65ea3091ec6b84dac3c19639f85f452c5d1.1559129225.git.vpil...@digitalocean.com
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index b4a44bc84749..9a821ff68502 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -6772,7 +6772,7 @@ struct task_struct *curr_task(int cpu)
 
 #ifdef CONFIG_IA64
 /**
- * set_curr_task - set the current task for a given CPU.
+ * ia64_set_curr_task - set the current task for a given CPU.
  * @cpu: the processor in question.
  * @p: the task pointer to set.
  *


[tip:core/urgent] objtool: Improve UACCESS coverage

2019-07-25 Thread tip-bot for Peter Zijlstra
Commit-ID:  882a0db9d143e5e8dac54b96e83135bccd1f68d1
Gitweb: https://git.kernel.org/tip/882a0db9d143e5e8dac54b96e83135bccd1f68d1
Author: Peter Zijlstra 
AuthorDate: Wed, 24 Jul 2019 17:47:26 -0500
Committer:  Thomas Gleixner 
CommitDate: Thu, 25 Jul 2019 08:36:39 +0200

objtool: Improve UACCESS coverage

A clang build reported an (obvious) double CLAC while a GCC build did not;
it turns out that objtool only re-visits instructions if the first visit
was with AC=0. If OTOH the first visit was with AC=1, it completely ignores
any subsequent visit, even when it has AC=0.

Fix this by using a visited mask instead of a boolean, and (explicitly)
mark the AC state.

$ ./objtool check -b --no-fp --retpoline --uaccess 
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool: 
.altinstr_replacement+0x22: redundant UACCESS disable
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool:   
eb_copy_relocations.isra.34()+0xea: (alt)
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool:   
.altinstr_replacement+0x: (branch)
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool:   
eb_copy_relocations.isra.34()+0xd9: (alt)
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool:   
eb_copy_relocations.isra.34()+0xb2: (branch)
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool:   
eb_copy_relocations.isra.34()+0x39: (branch)
drivers/gpu/drm/i915/gem/i915_gem_execbuffer.o: warning: objtool:   
eb_copy_relocations.isra.34()+0x0: <=== (func)
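
The fix replaces the boolean 'visited' flag with a small bitmask keyed on the
UACCESS (AC) state, so an instruction gets re-validated once per state it can
be reached in. A standalone model of that bookkeeping (illustrative only, not
objtool's actual data structures):

#include <stdbool.h>
#include <stdint.h>

struct insn {
        uint8_t visited;        /* bit 0: validated with AC=0, bit 1: with AC=1 */
};

/* Return true if this (instruction, AC-state) pair was already validated. */
static bool seen_before(struct insn *insn, bool uaccess)
{
        uint8_t mask = (uint8_t)1 << uaccess;

        if (insn->visited & mask)
                return true;

        insn->visited |= mask;
        return false;
}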

Reported-by: Josh Poimboeuf 
Reported-by: Thomas Gleixner 
Reported-by: Sedat Dilek 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Josh Poimboeuf 
Signed-off-by: Thomas Gleixner 
Tested-by: Nathan Chancellor 
Tested-by: Nick Desaulniers 
Tested-by: Sedat Dilek 
Link: https://github.com/ClangBuiltLinux/linux/issues/617
Link: 
https://lkml.kernel.org/r/5359166aad2d53f3145cd442d83d0e5115e0cd17.1564007838.git.jpoim...@redhat.com

---
 tools/objtool/check.c | 7 ---
 tools/objtool/check.h | 3 ++-
 2 files changed, 6 insertions(+), 4 deletions(-)

diff --git a/tools/objtool/check.c b/tools/objtool/check.c
index 5f26620f13f5..176f2f084060 100644
--- a/tools/objtool/check.c
+++ b/tools/objtool/check.c
@@ -1946,6 +1946,7 @@ static int validate_branch(struct objtool_file *file, 
struct symbol *func,
struct alternative *alt;
struct instruction *insn, *next_insn;
struct section *sec;
+   u8 visited;
int ret;
 
insn = first;
@@ -1972,12 +1973,12 @@ static int validate_branch(struct objtool_file *file, 
struct symbol *func,
return 1;
}
 
+   visited = 1 << state.uaccess;
if (insn->visited) {
if (!insn->hint && !insn_state_match(insn, &state))
return 1;
 
-   /* If we were here with AC=0, but now have AC=1, go again */
-   if (insn->state.uaccess || !state.uaccess)
+   if (insn->visited & visited)
return 0;
}
 
@@ -2024,7 +2025,7 @@ static int validate_branch(struct objtool_file *file, 
struct symbol *func,
} else
insn->state = state;
 
-   insn->visited = true;
+   insn->visited |= visited;
 
if (!insn->ignore_alts) {
bool skip_orig = false;
diff --git a/tools/objtool/check.h b/tools/objtool/check.h
index b881fafcf55d..6d875ca6fce0 100644
--- a/tools/objtool/check.h
+++ b/tools/objtool/check.h
@@ -33,8 +33,9 @@ struct instruction {
unsigned int len;
enum insn_type type;
unsigned long immediate;
-   bool alt_group, visited, dead_end, ignore, hint, save, restore, 
ignore_alts;
+   bool alt_group, dead_end, ignore, hint, save, restore, ignore_alts;
bool retpoline_safe;
+   u8 visited;
struct symbol *call_dest;
struct instruction *jump_dest;
struct instruction *first_jump_src;


[tip:smp/urgent] smp: Warn on function calls from softirq context

2019-07-20 Thread tip-bot for Peter Zijlstra
Commit-ID:  19dbdcb8039cff16669a05136a29180778d16d0a
Gitweb: https://git.kernel.org/tip/19dbdcb8039cff16669a05136a29180778d16d0a
Author: Peter Zijlstra 
AuthorDate: Thu, 18 Jul 2019 11:20:09 +0200
Committer:  Thomas Gleixner 
CommitDate: Sat, 20 Jul 2019 11:27:16 +0200

smp: Warn on function calls from softirq context

It's clearly documented that smp function calls cannot be invoked from
softirq handling context. Unfortunately nothing enforces that or emits a
warning.

A single function call can be invoked from softirq context only via
smp_call_function_single_async().

The only legit context is task context, so add a warning to that effect.
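
A kernel-context sketch of what the warning allows and forbids (illustrative
only; remote_fn and remote_csd are made-up names):

static void remote_fn(void *info)
{
        /* runs on the target CPU in IPI context */
}

static call_single_data_t remote_csd = { .func = remote_fn };

static void from_task_context(int cpu)
{
        /* Fine: task context may use the synchronous helpers. */
        smp_call_function_single(cpu, remote_fn, NULL, 1);
}

static void from_softirq_context(int cpu)
{
        /*
         * The only form permitted from softirq: asynchronous, with
         * caller-owned csd storage. The synchronous helpers now trigger
         * WARN_ON_ONCE(!in_task()).
         */
        smp_call_function_single_async(cpu, &remote_csd);
}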

Reported-by: luferry 
Signed-off-by: Peter Zijlstra 
Signed-off-by: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/20190718160601.gp3...@hirez.programming.kicks-ass.net
---
 kernel/smp.c | 16 
 1 file changed, 16 insertions(+)

diff --git a/kernel/smp.c b/kernel/smp.c
index 616d4d114847..7dbcb402c2fc 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -291,6 +291,14 @@ int smp_call_function_single(int cpu, smp_call_func_t 
func, void *info,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
 && !oops_in_progress);
 
+   /*
+* When @wait we can deadlock when we interrupt between llist_add() and
+* arch_send_call_function_ipi*(); when !@wait we can deadlock due to
+* csd_lock() on because the interrupt context uses the same csd
+* storage.
+*/
+   WARN_ON_ONCE(!in_task());
+
csd = &csd_stack;
if (!wait) {
csd = this_cpu_ptr(&csd_data);
@@ -416,6 +424,14 @@ void smp_call_function_many(const struct cpumask *mask,
WARN_ON_ONCE(cpu_online(this_cpu) && irqs_disabled()
 && !oops_in_progress && !early_boot_irqs_disabled);
 
+   /*
+* When @wait we can deadlock when we interrupt between llist_add() and
+* arch_send_call_function_ipi*(); when !@wait we can deadlock due to
+* csd_lock() on because the interrupt context uses the same csd
+* storage.
+*/
+   WARN_ON_ONCE(!in_task());
+
/* Try to fastpath.  So, what's a CPU they want? Ignoring this one. */
cpu = cpumask_first_and(mask, cpu_online_mask);
if (cpu == this_cpu)


[tip:core/urgent] stacktrace: Force USER_DS for stack_trace_save_user()

2019-07-18 Thread tip-bot for Peter Zijlstra
Commit-ID:  cac9b9a4b08304f11daace03b8b48659355e44c1
Gitweb: https://git.kernel.org/tip/cac9b9a4b08304f11daace03b8b48659355e44c1
Author: Peter Zijlstra 
AuthorDate: Thu, 18 Jul 2019 10:47:47 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 18 Jul 2019 16:47:24 +0200

stacktrace: Force USER_DS for stack_trace_save_user()

When walking userspace stacks, USER_DS needs to be set, otherwise
access_ok() will not function as expected.

Reported-by: Vegard Nossum 
Reported-by: Eiichi Tsukata 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Tested-by: Vegard Nossum 
Reviewed-by: Joel Fernandes (Google) 
Link: 
https://lkml.kernel.org/r/20190718085754.gm3...@hirez.programming.kicks-ass.net
---
 kernel/stacktrace.c | 5 +
 1 file changed, 5 insertions(+)

diff --git a/kernel/stacktrace.c b/kernel/stacktrace.c
index e6a02b274b73..f5440abb7532 100644
--- a/kernel/stacktrace.c
+++ b/kernel/stacktrace.c
@@ -226,12 +226,17 @@ unsigned int stack_trace_save_user(unsigned long *store, 
unsigned int size)
.store  = store,
.size   = size,
};
+   mm_segment_t fs;
 
/* Trace user stack if not a kernel thread */
if (current->flags & PF_KTHREAD)
return 0;
 
+   fs = get_fs();
+   set_fs(USER_DS);
arch_stack_walk_user(consume_entry, &c, task_pt_regs(current));
+   set_fs(fs);
+
return c.len;
 }
 #endif


[tip:x86/urgent] x86/mm, tracing: Fix CR2 corruption

2019-07-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  a0d14b8909de55139b8702fe0c7e80b69763dcfb
Gitweb: https://git.kernel.org/tip/a0d14b8909de55139b8702fe0c7e80b69763dcfb
Author: Peter Zijlstra 
AuthorDate: Thu, 11 Jul 2019 13:40:59 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 17 Jul 2019 23:17:38 +0200

x86/mm, tracing: Fix CR2 corruption

Despite the current efforts to read CR2 before tracing happens there still
exist a number of possible holes:

  idtentry page_fault do_page_fault   has_error_code=1
call error_entry
  TRACE_IRQS_OFF
call trace_hardirqs_off*
  #PF // modifies CR2

  CALL_enter_from_user_mode
__context_tracking_exit()
  trace_user_exit(0)
#PF // modifies CR2

call do_page_fault
  address = read_cr2(); /* whoopsie */

And similar for i386.

Fix it by pulling the CR2 read into the entry code, before any of that
stuff gets a chance to run and ruin things.
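
An illustrative C-level view of the result (parameter names paraphrased; the
actual change is in the idtentry/page_fault assembly below, which loads CR2
into the handler's third argument):

dotraplinkage void
do_page_fault(struct pt_regs *regs, unsigned long error_code, unsigned long address)
{
        /*
         * 'address' was read from CR2 by the entry stub, before
         * TRACE_IRQS_OFF or context tracking had a chance to fault and
         * clobber CR2. The handler must not re-read CR2 here.
         */
}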

Reported-by: He Zhe 
Reported-by: Eiichi Tsukata 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Andy Lutomirski 
Cc: b...@alien8.de
Cc: rost...@goodmis.org
Cc: torva...@linux-foundation.org
Cc: h...@zytor.com
Cc: dave.han...@linux.intel.com
Cc: jgr...@suse.com
Cc: j...@joelfernandes.org
Link: https://lkml.kernel.org/r/2019074336.116812...@infradead.org

Debugged-by: Steven Rostedt 
---
 arch/x86/entry/entry_32.S   | 25 ++---
 arch/x86/entry/entry_64.S   | 35 ++-
 arch/x86/include/asm/kvm_para.h |  2 +-
 arch/x86/include/asm/traps.h|  4 ++--
 arch/x86/kernel/kvm.c   |  8 
 arch/x86/kernel/traps.c |  6 +-
 arch/x86/mm/fault.c | 30 +++---
 7 files changed, 59 insertions(+), 51 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4d4b6100f0e8..2bb986f305ac 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -1443,9 +1443,28 @@ BUILD_INTERRUPT3(hv_stimer0_callback_vector, 
HYPERV_STIMER0_VECTOR,
 
 ENTRY(page_fault)
ASM_CLAC
-   pushl   $do_page_fault
-   ALIGN
-   jmp common_exception
+   pushl   $0; /* %gs's slot on the stack */
+
+   SAVE_ALL switch_stacks=1 skip_gs=1
+
+   ENCODE_FRAME_POINTER
+   UNWIND_ESPFIX_STACK
+
+   /* fixup %gs */
+   GS_TO_REG %ecx
+   REG_TO_PTGS %ecx
+   SET_KERNEL_GS %ecx
+
+   GET_CR2_INTO(%ecx)  # might clobber %eax
+
+   /* fixup orig %eax */
+   movlPT_ORIG_EAX(%esp), %edx # get the error code
+   movl$-1, PT_ORIG_EAX(%esp)  # no syscall to restart
+
+   TRACE_IRQS_OFF
+   movl%esp, %eax  # pt_regs pointer
+   calldo_page_fault
+   jmp ret_from_exception
 END(page_fault)
 
 common_exception:
diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 95ae05f0edf2..7cb2e1f1ec09 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -864,7 +864,7 @@ apicinterrupt IRQ_WORK_VECTOR   
irq_work_interrupt  smp_irq_work_interrupt
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
-.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, 
ist_offset=0
+.macro idtentry_part do_sym, has_error_code:req, read_cr2:req, paranoid:req, 
shift_ist=-1, ist_offset=0
 
.if \paranoid
callparanoid_entry
@@ -874,12 +874,21 @@ apicinterrupt IRQ_WORK_VECTOR 
irq_work_interrupt  smp_irq_work_interrupt
.endif
UNWIND_HINT_REGS
 
-   .if \paranoid
+   .if \read_cr2
+   GET_CR2_INTO(%rdx); /* can clobber %rax */
+   .endif
+
.if \shift_ist != -1
TRACE_IRQS_OFF_DEBUG/* reload IDT in case of 
recursion */
.else
TRACE_IRQS_OFF
.endif
+
+   .if \paranoid == 0
+   testb   $3, CS(%rsp)
+   jz  .Lfrom_kernel_no_context_tracking_\@
+   CALL_enter_from_user_mode
+.Lfrom_kernel_no_context_tracking_\@:
.endif
 
movq%rsp, %rdi  /* pt_regs pointer */
@@ -923,6 +932,7 @@ apicinterrupt IRQ_WORK_VECTOR   
irq_work_interrupt  smp_irq_work_interrupt
  * fresh stack.  (This is for #DB, which has a nasty habit
  * of recursing.)
  * @create_gap:create a 6-word stack gap when coming from 
kernel mode.
+ * @read_cr2:  load CR2 into the 3rd argument; done before calling any 
C code
  *
  * idtentry generates an IDT stub that sets up a usable kernel context,
  * creates struct pt_regs, and calls @do_sym.  The stub has the following
@@ -947,7 +957,7 @@ apicinterrupt IRQ_WORK_VECTOR   
irq_work_interrupt  smp_irq_work_interrupt
  * @paranoid == 2 is special: the stub will 

[tip:x86/urgent] x86/entry/64: Update comments and sanity tests for create_gap

2019-07-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  4234653e882740cbf6625294e388b3176583
Gitweb: https://git.kernel.org/tip/4234653e882740cbf6625294e388b3176583
Author: Peter Zijlstra 
AuthorDate: Thu, 11 Jul 2019 13:40:58 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 17 Jul 2019 23:17:38 +0200

x86/entry/64: Update comments and sanity tests for create_gap

Commit 2700fefdb2d9 ("x86_64: Add gap to int3 to allow for call
emulation") forgot to update the comment; do so now.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Steven Rostedt (VMware) 
Acked-by: Andy Lutomirski 
Cc: b...@alien8.de
Cc: torva...@linux-foundation.org
Cc: h...@zytor.com
Cc: dave.han...@linux.intel.com
Cc: jgr...@suse.com
Cc: zhe...@windriver.com
Cc: j...@joelfernandes.org
Cc: de...@etsukata.com
Link: https://lkml.kernel.org/r/2019074336.059780...@infradead.org

---
 arch/x86/entry/entry_64.S | 17 +++--
 1 file changed, 11 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 3db5fede743b..95ae05f0edf2 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -913,15 +913,16 @@ apicinterrupt IRQ_WORK_VECTOR 
irq_work_interrupt  smp_irq_work_interrupt
 /**
  * idtentry - Generate an IDT entry stub
  * @sym:   Name of the generated entry point
- * @do_sym:C function to be called
- * @has_error_code:True if this IDT vector has an error code on the stack
- * @paranoid:  non-zero means that this vector may be invoked from
+ * @do_sym:C function to be called
+ * @has_error_code:True if this IDT vector has an error code on the stack
+ * @paranoid:  non-zero means that this vector may be invoked from
  * kernel mode with user GSBASE and/or user CR3.
  * 2 is special -- see below.
  * @shift_ist: Set to an IST index if entries from kernel mode should
- * decrement the IST stack so that nested entries 
get a
+ * decrement the IST stack so that nested entries get a
  * fresh stack.  (This is for #DB, which has a nasty habit
- * of recursing.)
+ * of recursing.)
+ * @create_gap:create a 6-word stack gap when coming from 
kernel mode.
  *
  * idtentry generates an IDT stub that sets up a usable kernel context,
  * creates struct pt_regs, and calls @do_sym.  The stub has the following
@@ -951,10 +952,14 @@ ENTRY(\sym)
UNWIND_HINT_IRET_REGS offset=\has_error_code*8
 
/* Sanity check */
-   .if \shift_ist != -1 && \paranoid == 0
+   .if \shift_ist != -1 && \paranoid != 1
.error "using shift_ist requires paranoid=1"
.endif
 
+   .if \create_gap && \paranoid
+   .error "using create_gap requires paranoid=0"
+   .endif
+
ASM_CLAC
 
.if \has_error_code == 0


[tip:x86/urgent] x86/entry/64: Simplify idtentry a little

2019-07-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  2fd37912cfb019228bf246215938e6f7619516a2
Gitweb: https://git.kernel.org/tip/2fd37912cfb019228bf246215938e6f7619516a2
Author: Peter Zijlstra 
AuthorDate: Thu, 11 Jul 2019 13:40:57 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 17 Jul 2019 23:17:37 +0200

x86/entry/64: Simplify idtentry a little

There's a bunch of duplication in idtentry, namely that the
.Lfrom_usermode_switch_stack path is a paranoid=0 copy of the normal flow.

Make this explicit by creating a idtentry_part helper macro.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Steven Rostedt (VMware) 
Acked-by: Andy Lutomirski 
Cc: b...@alien8.de
Cc: torva...@linux-foundation.org
Cc: h...@zytor.com
Cc: dave.han...@linux.intel.com
Cc: jgr...@suse.com
Cc: zhe...@windriver.com
Cc: j...@joelfernandes.org
Cc: de...@etsukata.com
Link: https://lkml.kernel.org/r/2019074336.002429...@infradead.org

---
 arch/x86/entry/entry_64.S | 102 ++
 1 file changed, 48 insertions(+), 54 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index 0ea4831a72a4..3db5fede743b 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -864,6 +864,52 @@ apicinterrupt IRQ_WORK_VECTOR  
irq_work_interrupt  smp_irq_work_interrupt
  */
 #define CPU_TSS_IST(x) PER_CPU_VAR(cpu_tss_rw) + (TSS_ist + (x) * 8)
 
+.macro idtentry_part do_sym, has_error_code:req, paranoid:req, shift_ist=-1, 
ist_offset=0
+
+   .if \paranoid
+   callparanoid_entry
+   /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
+   .else
+   callerror_entry
+   .endif
+   UNWIND_HINT_REGS
+
+   .if \paranoid
+   .if \shift_ist != -1
+   TRACE_IRQS_OFF_DEBUG/* reload IDT in case of 
recursion */
+   .else
+   TRACE_IRQS_OFF
+   .endif
+   .endif
+
+   movq%rsp, %rdi  /* pt_regs pointer */
+
+   .if \has_error_code
+   movqORIG_RAX(%rsp), %rsi/* get error code */
+   movq$-1, ORIG_RAX(%rsp) /* no syscall to restart */
+   .else
+   xorl%esi, %esi  /* no error code */
+   .endif
+
+   .if \shift_ist != -1
+   subq$\ist_offset, CPU_TSS_IST(\shift_ist)
+   .endif
+
+   call\do_sym
+
+   .if \shift_ist != -1
+   addq$\ist_offset, CPU_TSS_IST(\shift_ist)
+   .endif
+
+   .if \paranoid
+   /* this procedure expect "no swapgs" flag in ebx */
+   jmp paranoid_exit
+   .else
+   jmp error_exit
+   .endif
+
+.endm
+
 /**
  * idtentry - Generate an IDT entry stub
  * @sym:   Name of the generated entry point
@@ -934,47 +980,7 @@ ENTRY(\sym)
 .Lfrom_usermode_no_gap_\@:
.endif
 
-   .if \paranoid
-   callparanoid_entry
-   .else
-   callerror_entry
-   .endif
-   UNWIND_HINT_REGS
-   /* returned flag: ebx=0: need swapgs on exit, ebx=1: don't need it */
-
-   .if \paranoid
-   .if \shift_ist != -1
-   TRACE_IRQS_OFF_DEBUG/* reload IDT in case of 
recursion */
-   .else
-   TRACE_IRQS_OFF
-   .endif
-   .endif
-
-   movq%rsp, %rdi  /* pt_regs pointer */
-
-   .if \has_error_code
-   movqORIG_RAX(%rsp), %rsi/* get error code */
-   movq$-1, ORIG_RAX(%rsp) /* no syscall to restart */
-   .else
-   xorl%esi, %esi  /* no error code */
-   .endif
-
-   .if \shift_ist != -1
-   subq$\ist_offset, CPU_TSS_IST(\shift_ist)
-   .endif
-
-   call\do_sym
-
-   .if \shift_ist != -1
-   addq$\ist_offset, CPU_TSS_IST(\shift_ist)
-   .endif
-
-   /* these procedures expect "no swapgs" flag in ebx */
-   .if \paranoid
-   jmp paranoid_exit
-   .else
-   jmp error_exit
-   .endif
+   idtentry_part \do_sym, \has_error_code, \paranoid, \shift_ist, 
\ist_offset
 
.if \paranoid == 1
/*
@@ -983,21 +989,9 @@ ENTRY(\sym)
 * run in real process context if user_mode(regs).
 */
 .Lfrom_usermode_switch_stack_\@:
-   callerror_entry
-
-   movq%rsp, %rdi  /* pt_regs pointer */
-
-   .if \has_error_code
-   movqORIG_RAX(%rsp), %rsi/* get error code */
-   movq$-1, ORIG_RAX(%rsp) /* no syscall to restart */
-   .else
-   xorl%esi, %esi  /* no error code */
+   idtentry_part \do_sym, \has_error_code, paranoid=0
.endif
 
-   call\do_sym
-
-   jmp error_exit
-   .endif
 _ASM_NOKPROBE(\sym)
 END(\sym)
 .endm


[tip:x86/urgent] x86/entry/32: Simplify common_exception

2019-07-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  e67f1c11e5ea7fa47449a16325ecc997dbbf9bdf
Gitweb: https://git.kernel.org/tip/e67f1c11e5ea7fa47449a16325ecc997dbbf9bdf
Author: Peter Zijlstra 
AuthorDate: Thu, 11 Jul 2019 13:40:56 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 17 Jul 2019 23:17:37 +0200

x86/entry/32: Simplify common_exception

Add one more option to SAVE_ALL that common_exception can use to simplify
things. This also avoids duplication later, when page_fault will no
longer use common_exception.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Steven Rostedt (VMware) 
Reviewed-by: Andy Lutomirski 
Cc: b...@alien8.de
Cc: torva...@linux-foundation.org
Cc: h...@zytor.com
Cc: dave.han...@linux.intel.com
Cc: jgr...@suse.com
Cc: zhe...@windriver.com
Cc: j...@joelfernandes.org
Cc: de...@etsukata.com
Link: https://lkml.kernel.org/r/2019074335.945136...@infradead.org

---
 arch/x86/entry/entry_32.S | 36 +---
 1 file changed, 13 insertions(+), 23 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 90b473297299..4d4b6100f0e8 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -294,9 +294,11 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0
cld
+.if \skip_gs == 0
PUSH_GS
+.endif
FIXUP_FRAME
pushl   %fs
pushl   %es
@@ -313,13 +315,13 @@
movl%edx, %es
movl$(__KERNEL_PERCPU), %edx
movl%edx, %fs
+.if \skip_gs == 0
SET_KERNEL_GS %edx
-
+.endif
/* Switch to kernel stack if necessary */
 .if \switch_stacks > 0
SWITCH_TO_KERNEL_STACK
 .endif
-
 .endm
 
 .macro SAVE_ALL_NMI cr3_reg:req
@@ -1448,32 +1450,20 @@ END(page_fault)
 
 common_exception:
/* the function address is in %gs's slot on the stack */
-   FIXUP_FRAME
-   pushl   %fs
-   pushl   %es
-   pushl   %ds
-   pushl   %eax
-   movl$(__USER_DS), %eax
-   movl%eax, %ds
-   movl%eax, %es
-   movl$(__KERNEL_PERCPU), %eax
-   movl%eax, %fs
-   pushl   %ebp
-   pushl   %edi
-   pushl   %esi
-   pushl   %edx
-   pushl   %ecx
-   pushl   %ebx
-   SWITCH_TO_KERNEL_STACK
+   SAVE_ALL switch_stacks=1 skip_gs=1
ENCODE_FRAME_POINTER
-   cld
UNWIND_ESPFIX_STACK
+
+   /* fixup %gs */
GS_TO_REG %ecx
movlPT_GS(%esp), %edi   # get the function address
-   movlPT_ORIG_EAX(%esp), %edx # get the error code
-   movl$-1, PT_ORIG_EAX(%esp)  # no syscall to restart
REG_TO_PTGS %ecx
SET_KERNEL_GS %ecx
+
+   /* fixup orig %eax */
+   movlPT_ORIG_EAX(%esp), %edx # get the error code
+   movl$-1, PT_ORIG_EAX(%esp)  # no syscall to restart
+
TRACE_IRQS_OFF
movl%esp, %eax  # pt_regs pointer
CALL_NOSPEC %edi


[tip:x86/urgent] x86/paravirt: Make read_cr2() CALLEE_SAVE

2019-07-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  55aedddb6149ab71bec9f050846855113977b033
Gitweb: https://git.kernel.org/tip/55aedddb6149ab71bec9f050846855113977b033
Author: Peter Zijlstra 
AuthorDate: Thu, 11 Jul 2019 13:40:55 +0200
Committer:  Thomas Gleixner 
CommitDate: Wed, 17 Jul 2019 23:17:37 +0200

x86/paravirt: Make read_cr2() CALLEE_SAVE

The one paravirt read_cr2() implementation (Xen) is actually quite trivial
and doesn't need to clobber anything other than the return register.

Making read_cr2() CALLEE_SAVE avoids all the PUSH/POP nonsense and allows
more convenient use from assembly.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Juergen Gross 
Cc: b...@alien8.de
Cc: rost...@goodmis.org
Cc: l...@kernel.org
Cc: torva...@linux-foundation.org
Cc: h...@zytor.com
Cc: dave.han...@linux.intel.com
Cc: zhe...@windriver.com
Cc: j...@joelfernandes.org
Cc: de...@etsukata.com
Link: https://lkml.kernel.org/r/2019074335.887392...@infradead.org

---
 arch/x86/entry/calling.h  |  6 ++
 arch/x86/include/asm/paravirt.h   | 22 +-
 arch/x86/include/asm/paravirt_types.h |  2 +-
 arch/x86/kernel/asm-offsets.c |  1 +
 arch/x86/kernel/head_64.S |  4 +---
 arch/x86/kernel/paravirt.c|  2 +-
 arch/x86/xen/enlighten_pv.c   |  3 ++-
 arch/x86/xen/mmu_pv.c | 12 +---
 arch/x86/xen/xen-asm.S| 16 
 arch/x86/xen/xen-ops.h|  3 +++
 10 files changed, 45 insertions(+), 26 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 9f1f9e3b8230..830bd984182b 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -343,3 +343,9 @@ For 32-bit we have the following conventions - kernel is 
built with
 .Lafter_call_\@:
 #endif
 .endm
+
+#ifdef CONFIG_PARAVIRT_XXL
+#define GET_CR2_INTO(reg) GET_CR2_INTO_AX ; _ASM_MOV %_ASM_AX, reg
+#else
+#define GET_CR2_INTO(reg) _ASM_MOV %cr2, reg
+#endif
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index c25c38a05c1c..5135282683d4 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -116,7 +116,7 @@ static inline void write_cr0(unsigned long x)
 
 static inline unsigned long read_cr2(void)
 {
-   return PVOP_CALL0(unsigned long, mmu.read_cr2);
+   return PVOP_CALLEE0(unsigned long, mmu.read_cr2);
 }
 
 static inline void write_cr2(unsigned long x)
@@ -909,13 +909,7 @@ extern void default_banner(void);
  ANNOTATE_RETPOLINE_SAFE;  \
  call PARA_INDIRECT(pv_ops+PV_CPU_swapgs); \
 )
-#endif
-
-#define GET_CR2_INTO_RAX   \
-   ANNOTATE_RETPOLINE_SAFE;\
-   call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);
 
-#ifdef CONFIG_PARAVIRT_XXL
 #define USERGS_SYSRET64
\
PARA_SITE(PARA_PATCH(PV_CPU_usergs_sysret64),   \
  ANNOTATE_RETPOLINE_SAFE;  \
@@ -929,9 +923,19 @@ extern void default_banner(void);
  call PARA_INDIRECT(pv_ops+PV_IRQ_save_fl);\
  PV_RESTORE_REGS(clobbers | CLBR_CALLEE_SAVE);)
 #endif
-#endif
+#endif /* CONFIG_PARAVIRT_XXL */
+#endif /* CONFIG_X86_64 */
+
+#ifdef CONFIG_PARAVIRT_XXL
+
+#define GET_CR2_INTO_AX
\
+   PARA_SITE(PARA_PATCH(PV_MMU_read_cr2),  \
+ ANNOTATE_RETPOLINE_SAFE;  \
+ call PARA_INDIRECT(pv_ops+PV_MMU_read_cr2);   \
+)
+
+#endif /* CONFIG_PARAVIRT_XXL */
 
-#endif /* CONFIG_X86_32 */
 
 #endif /* __ASSEMBLY__ */
 #else  /* CONFIG_PARAVIRT */
diff --git a/arch/x86/include/asm/paravirt_types.h 
b/arch/x86/include/asm/paravirt_types.h
index 946f8f1f1efc..639b2df445ee 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -220,7 +220,7 @@ struct pv_mmu_ops {
void (*exit_mmap)(struct mm_struct *mm);
 
 #ifdef CONFIG_PARAVIRT_XXL
-   unsigned long (*read_cr2)(void);
+   struct paravirt_callee_save read_cr2;
void (*write_cr2)(unsigned long);
 
unsigned long (*read_cr3)(void);
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index da64452584b0..5c7ee3df4d0b 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -76,6 +76,7 @@ static void __used common(void)
BLANK();
OFFSET(XEN_vcpu_info_mask, vcpu_info, evtchn_upcall_mask);
OFFSET(XEN_vcpu_info_pending, vcpu_info, evtchn_upcall_pending);
+   OFFSET(XEN_vcpu_info_arch_cr2, vcpu_info, arch.cr2);
 #endif
 
BLANK();
diff --git a/arch/x86/kernel/head_64.S b/arch/x86/kernel/head_64.S
index bcd206c8ac90..0e2d72929a8c 100644
--- 

[tip:sched/urgent] sched/core: Fix preempt warning in ttwu

2019-07-13 Thread tip-bot for Peter Zijlstra
Commit-ID:  e3d85487fba42206024bc3ed32e4b581c7cb46db
Gitweb: https://git.kernel.org/tip/e3d85487fba42206024bc3ed32e4b581c7cb46db
Author: Peter Zijlstra 
AuthorDate: Wed, 10 Jul 2019 12:57:36 +0200
Committer:  Ingo Molnar 
CommitDate: Sat, 13 Jul 2019 11:23:27 +0200

sched/core: Fix preempt warning in ttwu

John reported a DEBUG_PREEMPT warning caused by commit:

  aacedf26fb76 ("sched/core: Optimize try_to_wake_up() for local wakeups")

I overlooked that ttwu_stat() requires preemption disabled.

Reported-by: John Stultz 
Tested-by: John Stultz 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andrew Morton 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Fixes: aacedf26fb76 ("sched/core: Optimize try_to_wake_up() for local wakeups")
Link: 
https://lkml.kernel.org/r/20190710105736.gk3...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/core.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index fa43ce3962e7..2b037f195473 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -2399,6 +2399,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, 
int wake_flags)
unsigned long flags;
int cpu, success = 0;
 
+   preempt_disable();
if (p == current) {
/*
 * We're waking current, this means 'p->on_rq' and 'task_cpu(p)
@@ -2412,7 +2413,7 @@ try_to_wake_up(struct task_struct *p, unsigned int state, 
int wake_flags)
 *it disabling IRQs (this allows not taking ->pi_lock).
 */
if (!(p->state & state))
-   return false;
+   goto out;
 
success = 1;
cpu = task_cpu(p);
@@ -2526,6 +2527,7 @@ unlock:
 out:
if (success)
ttwu_stat(p, cpu, wake_flags);
+   preempt_enable();
 
return success;
 }


[tip:x86/urgent] x86/alternatives: Fix int3_emulate_call() selftest stack corruption

2019-07-09 Thread tip-bot for Peter Zijlstra
Commit-ID:  ecc606103837b98a2b665e8f14e533a6c72bbdc0
Gitweb: https://git.kernel.org/tip/ecc606103837b98a2b665e8f14e533a6c72bbdc0
Author: Peter Zijlstra 
AuthorDate: Mon, 8 Jul 2019 15:55:30 -0500
Committer:  Thomas Gleixner 
CommitDate: Tue, 9 Jul 2019 22:39:15 +0200

x86/alternatives: Fix int3_emulate_call() selftest stack corruption

KASAN shows the following splat during boot:

  BUG: KASAN: unknown-crash in unwind_next_frame+0x3f6/0x490
  Read of size 8 at addr 84007db0 by task swapper/0

  CPU: 0 PID: 0 Comm: swapper Tainted: GT 
5.2.0-rc6-00013-g7457c0d #1
  Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1 
04/01/2014
  Call Trace:
   dump_stack+0x19/0x1b
   print_address_description+0x1b0/0x2b2
   __kasan_report+0x10f/0x171
   kasan_report+0x12/0x1c
   __asan_load8+0x54/0x81
   unwind_next_frame+0x3f6/0x490
   unwind_next_frame+0x1b/0x23
   arch_stack_walk+0x68/0xa5
   stack_trace_save+0x7b/0xa0
   save_trace+0x3c/0x93
   mark_lock+0x1ef/0x9b1
   lock_acquire+0x122/0x221
   __mutex_lock+0xb6/0x731
   mutex_lock_nested+0x16/0x18
   _vm_unmap_aliases+0x141/0x183
   vm_unmap_aliases+0x14/0x16
   change_page_attr_set_clr+0x15e/0x2f2
   set_memory_4k+0x2a/0x2c
   check_bugs+0x11fd/0x1298
   start_kernel+0x793/0x7eb
   x86_64_start_reservations+0x55/0x76
   x86_64_start_kernel+0x87/0xaa
   secondary_startup_64+0xa4/0xb0

  Memory state around the buggy address:
   84007c80: 00 00 00 00 00 00 00 00 00 00 00 00 00 f1 f1 f1
   84007d00: f1 00 00 00 00 00 00 00 00 00 f2 f2 f2 f3 f3 f3
  >84007d80: f3 79 be 52 49 79 be 00 00 00 00 00 00 00 00 f1

It turns out that int3_selftest() is corrupting the stack.  The problem is
that the KASAN-ified version of int3_magic() is much less trivial than the
C code appears.  It clobbers several unexpected registers.  So when the
selftest's INT3 is converted to an emulated call to int3_magic(), the
registers are clobbered and Bad Things happen when the function returns.

Fix this by converting int3_magic() to the trivial ASM function it should
be, avoiding all calling convention issues. Also add ASM_CALL_CONSTRAINT to
the INT3 ASM, since it contains a 'CALL'.

[peterz: cribbed changelog from josh]

Fixes: 7457c0da024b ("x86/alternatives: Add int3_emulate_call() selftest")
Reported-by: kernel test robot 
Debugged-by: Josh Poimboeuf 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Josh Poimboeuf 
Cc: Linus Torvalds 
Cc: Andy Lutomirski 
Link: 
https://lkml.kernel.org/r/20190709125744.gb3...@hirez.programming.kicks-ass.net
---
 arch/x86/kernel/alternative.c | 25 -
 1 file changed, 20 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c
index 99ef8b6f9a1a..ccd32013c47a 100644
--- a/arch/x86/kernel/alternative.c
+++ b/arch/x86/kernel/alternative.c
@@ -625,10 +625,23 @@ extern struct paravirt_patch_site 
__start_parainstructions[],
  *
  * See entry_{32,64}.S for more details.
  */
-static void __init int3_magic(unsigned int *ptr)
-{
-   *ptr = 1;
-}
+
+/*
+ * We define the int3_magic() function in assembly to control the calling
+ * convention such that we can 'call' it from assembly.
+ */
+
+extern void int3_magic(unsigned int *ptr); /* defined in asm */
+
+asm (
+"  .pushsection.init.text, \"ax\", @progbits\n"
+"  .type   int3_magic, @function\n"
+"int3_magic:\n"
+"  movl$1, (%" _ASM_ARG1 ")\n"
+"  ret\n"
+"  .size   int3_magic, .-int3_magic\n"
+"  .popsection\n"
+);
 
 extern __initdata unsigned long int3_selftest_ip; /* defined in asm below */
 
@@ -676,7 +689,9 @@ static void __init int3_selftest(void)
  "int3_selftest_ip:\n\t"
  __ASM_SEL(.long, .quad) " 1b\n\t"
  ".popsection\n\t"
- : : __ASM_SEL_RAW(a, D) (&val) : "memory");
+ : ASM_CALL_CONSTRAINT
+ : __ASM_SEL_RAW(a, D) (&val)
+ : "memory");
 
BUG_ON(val != 1);
 


[tip:locking/core] x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

2019-06-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  0b9ccc0a9b146b49e83bf1e32f70d2396a694bfb
Gitweb: https://git.kernel.org/tip/0b9ccc0a9b146b49e83bf1e32f70d2396a694bfb
Author: Peter Zijlstra 
AuthorDate: Thu, 6 Dec 2018 12:24:33 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Jun 2019 12:43:40 +0200

x86/percpu: Differentiate this_cpu_{}() and __this_cpu_{}()

Nadav Amit reported that commit:

  b59167ac7baf ("x86/percpu: Fix this_cpu_read()")

added a bunch of constraints to all sorts of code; and while some of
that was correct and desired, some of that seems superfluous.

The thing is, the this_cpu_*() operations are defined IRQ-safe, this
means the values are subject to change from IRQs, and thus must be
reloaded.

Also, the generic form:

  local_irq_save()
  __this_cpu_read()
  local_irq_restore()

would not allow the re-use of previous values; if by nothing else,
then the barrier()s implied by local_irq_*().

Which raises the point that percpu_from_op() and the others also need
that volatile.

OTOH __this_cpu_*() operations are not IRQ-safe and assume external
preempt/IRQ disabling and could thus be allowed more room for
optimization.

This makes the this_cpu_*() vs __this_cpu_*() behaviour more
consistent with other architectures.
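
A kernel-context sketch of the resulting usage contract (illustrative only;
my_counter is a made-up per-CPU variable):

static DEFINE_PER_CPU(unsigned int, my_counter);

static void irq_safe_update(void)
{
        /* this_cpu_*(): IRQ-safe, the value is reloaded on every access. */
        this_cpu_inc(my_counter);
}

static void caller_protected_update(void)
{
        /*
         * __this_cpu_*(): not IRQ-safe; the caller provides the exclusion,
         * which leaves the compiler more room to optimize.
         */
        preempt_disable();
        __this_cpu_inc(my_counter);
        preempt_enable();
}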

  $ ./compare.sh defconfig-build defconfig-build1 vmlinux.o
  x86_pmu_cancel_txn                      80      71     -9,+0
  __text_poke                            919     964    +45,+0
  do_user_addr_fault                    1082    1058    -24,+0
  __do_page_fault                       1194    1178    -16,+0
  do_exit                               2995    3027   -43,+75
  process_one_work                      1008     989   -67,+48
  finish_task_switch                     524     505    -19,+0
  __schedule_bug                         103      98   -59,+54
  __schedule_bug                         103      98   -59,+54
  __sched_setscheduler                  2015    2030    +15,+0
  freeze_processes                       203     230    +31,-4
  rcu_gp_kthread_wake                    106      99     -7,+0
  rcu_core                              1841    1834     -7,+0
  call_timer_fn                          298     286    -12,+0
  can_stop_idle_tick                     146     139   -31,+24
  perf_pending_event                     253     239    -14,+0
  shmem_alloc_page                       209     213     +4,+0
  __alloc_pages_slowpath                3284    3269    -15,+0
  umount_tree                            671     694    +23,+0
  advance_transaction                    803     798     -5,+0
  con_put_char                            71      51    -20,+0
  xhci_urb_enqueue                      1302    1295     -7,+0
  xhci_urb_enqueue                      1302    1295     -7,+0
  tcp_sacktag_write_queue               2130    2075    -55,+0
  tcp_try_undo_loss                      229     208    -21,+0
  tcp_v4_inbound_md5_hash                438     411    -31,+4
  tcp_v4_inbound_md5_hash                438     411    -31,+4
  tcp_v6_inbound_md5_hash                469     411   -33,-25
  tcp_v6_inbound_md5_hash                469     411   -33,-25
  restricted_pointer                     434     420    -14,+0
  irq_exit                               162     154     -8,+0
  get_perf_callchain                     638     624    -14,+0
  rt_mutex_trylock                       169     156    -13,+0
  avc_has_extended_perms                1092    1089     -3,+0
  avc_has_perm_noaudit                   309     306     -3,+0
  __perf_sw_event                        138     122    -16,+0
  perf_swevent_get_recursion_context     116     102    -14,+0
  __local_bh_enable_ip                    93      72    -21,+0
  xfrm_input                            4175    4161    -14,+0
  avc_has_perm                           446     443     -3,+0
  vm_events_fold_cpu                      57      56     -1,+0
  vfree                                   68      61     -7,+0
  freeze_processes                       203     230    +31,-4
  _local_bh_enable

[tip:locking/core] Documentation/atomic_t.txt: Clarify pure non-rmw usage

2019-06-03 Thread tip-bot for Peter Zijlstra
Commit-ID:  fff9b6c7d26943a8eb32b58364b7ec6b9369746a
Gitweb: https://git.kernel.org/tip/fff9b6c7d26943a8eb32b58364b7ec6b9369746a
Author: Peter Zijlstra 
AuthorDate: Fri, 24 May 2019 13:52:31 +0200
Committer:  Ingo Molnar 
CommitDate: Mon, 3 Jun 2019 12:32:57 +0200

Documentation/atomic_t.txt: Clarify pure non-rmw usage

Clarify that pure non-RMW usage of atomic_t is pointless; there is
nothing 'magical' about atomic_set() / atomic_read().

This is something that seems to confuse people, because I happen upon it
semi-regularly.
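
A minimal kernel-style sketch of the point (the variables and functions
below are hypothetical, not from this patch): if only the non-RMW ops
are ever used, a plain variable with {READ,WRITE}_ONCE() expresses
exactly the same thing.

  /* If this is all the code ever does with it ... */
  static atomic_t ready;

  static void producer_a(void)
  {
          atomic_set(&ready, 1);
  }

  static int consumer_a(void)
  {
          return atomic_read(&ready) != 0;
  }

  /* ... then a plain int is equivalent, and clearer about the intent: */
  static int ready2;

  static void producer_b(void)
  {
          WRITE_ONCE(ready2, 1);
  }

  static int consumer_b(void)
  {
          return READ_ONCE(ready2) != 0;
  }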

Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Greg Kroah-Hartman 
Acked-by: Will Deacon 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/20190524115231.gn2...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 Documentation/atomic_t.txt | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/Documentation/atomic_t.txt b/Documentation/atomic_t.txt
index dca3fb0554db..89eae7f6b360 100644
--- a/Documentation/atomic_t.txt
+++ b/Documentation/atomic_t.txt
@@ -81,9 +81,11 @@ Non-RMW ops:
 
 The non-RMW ops are (typically) regular LOADs and STOREs and are canonically
 implemented using READ_ONCE(), WRITE_ONCE(), smp_load_acquire() and
-smp_store_release() respectively.
+smp_store_release() respectively. Therefore, if you find yourself only using
+the Non-RMW operations of atomic_t, you do not in fact need atomic_t at all
+and are doing it wrong.
 
-The one detail to this is that atomic_set{}() should be observable to the RMW
+A subtle detail of atomic_set{}() is that it should be observable to the RMW
 ops. That is:
 
   C atomic-set


[tip:locking/core] locking/lock_events: Use raw_cpu_{add,inc}() for stats

2019-06-03 Thread tip-bot for Peter Zijlstra
Commit-ID:  24811637dbfd07c69da7e9db586d35d17e6afca3
Gitweb: https://git.kernel.org/tip/24811637dbfd07c69da7e9db586d35d17e6afca3
Author: Peter Zijlstra 
AuthorDate: Mon, 27 May 2019 10:23:26 +0200
Committer:  Ingo Molnar 
CommitDate: Mon, 3 Jun 2019 12:32:56 +0200

locking/lock_events: Use raw_cpu_{add,inc}() for stats

Instead of playing silly games with CONFIG_DEBUG_PREEMPT toggling
between this_cpu_*() and __this_cpu_*() use raw_cpu_*(), which is
exactly what we want here.

Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Linus Torvalds 
Cc: Borislav Petkov 
Cc: Davidlohr Bueso 
Cc: H. Peter Anvin 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Tim Chen 
Cc: Waiman Long 
Cc: Will Deacon 
Cc: huang ying 
Link: 
https://lkml.kernel.org/r/20190527082326.gp2...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/locking/lock_events.h | 45 
 1 file changed, 4 insertions(+), 41 deletions(-)

diff --git a/kernel/locking/lock_events.h b/kernel/locking/lock_events.h
index 46b71af8eef2..8c7e7d25f09c 100644
--- a/kernel/locking/lock_events.h
+++ b/kernel/locking/lock_events.h
@@ -31,50 +31,13 @@ enum lock_events {
 DECLARE_PER_CPU(unsigned long, lockevents[lockevent_num]);
 
 /*
- * The purpose of the lock event counting subsystem is to provide a low
- * overhead way to record the number of specific locking events by using
- * percpu counters. It is the percpu sum that matters, not specifically
- * how many of them happens in each cpu.
- *
- * It is possible that the same percpu counter may be modified in both
- * the process and interrupt contexts. For architectures that perform
- * percpu operation with multiple instructions, it is possible to lose
- * count if a process context percpu update is interrupted in the middle
- * and the same counter is updated in the interrupt context. Therefore,
- * the generated percpu sum may not be precise. The error, if any, should
- * be small and insignificant.
- *
- * For those architectures that do multi-instruction percpu operation,
- * preemption in the middle and moving the task to another cpu may cause
- * a larger error in the count. Again, this will be few and far between.
- * Given the imprecise nature of the count and the possibility of resetting
- * the count and doing the measurement again, this is not really a big
- * problem.
- *
- * To get a better picture of what is happening under the hood, it is
- * suggested that a few measurements should be taken with the counts
- * reset in between to stamp out outliner because of these possible
- * error conditions.
- *
- * To minimize overhead, we use __this_cpu_*() in all cases except when
- * CONFIG_DEBUG_PREEMPT is defined. In this particular case, this_cpu_*()
- * will be used to avoid the appearance of unwanted BUG messages.
- */
-#ifdef CONFIG_DEBUG_PREEMPT
-#define lockevent_percpu_inc(x)this_cpu_inc(x)
-#define lockevent_percpu_add(x, v) this_cpu_add(x, v)
-#else
-#define lockevent_percpu_inc(x)__this_cpu_inc(x)
-#define lockevent_percpu_add(x, v) __this_cpu_add(x, v)
-#endif
-
-/*
- * Increment the PV qspinlock statistical counters
+ * Increment the statistical counters. use raw_cpu_inc() because of lower
+ * overhead and we don't care if we loose the occasional update.
  */
 static inline void __lockevent_inc(enum lock_events event, bool cond)
 {
if (cond)
-   lockevent_percpu_inc(lockevents[event]);
+   raw_cpu_inc(lockevents[event]);
 }
 
 #define lockevent_inc(ev)__lockevent_inc(LOCKEVENT_ ##ev, true)
@@ -82,7 +45,7 @@ static inline void __lockevent_inc(enum lock_events event, 
bool cond)
 
 static inline void __lockevent_add(enum lock_events event, int inc)
 {
-   lockevent_percpu_add(lockevents[event], inc);
+   raw_cpu_add(lockevents[event], inc);
 }
 
 #define lockevent_add(ev, c)   __lockevent_add(LOCKEVENT_ ##ev, c)


[tip:perf/urgent] perf/ring_buffer: Add ordering to rb->nest increment

2019-05-24 Thread tip-bot for Peter Zijlstra
Commit-ID:  3f9fbe9bd86c534eba2faf5d840fd44c6049f50e
Gitweb: https://git.kernel.org/tip/3f9fbe9bd86c534eba2faf5d840fd44c6049f50e
Author: Peter Zijlstra 
AuthorDate: Fri, 17 May 2019 13:52:32 +0200
Committer:  Ingo Molnar 
CommitDate: Fri, 24 May 2019 09:00:10 +0200

perf/ring_buffer: Add ordering to rb->nest increment

Similar to how decrementing rb->nest too early can cause data_head to
(temporarily) be observed to go backward, so too can this happen when
we increment too late.

This barrier() ensures the rb->head load happens after the increment,
both the one in the 'goto again' path and the one from
perf_output_get_handle() -- albeit very unlikely to matter for the
latter.
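
A compile-and-run sketch of the required ordering (simplified struct,
not the kernel's ring buffer; barrier() written out as the usual
compiler barrier):

  #define barrier() __asm__ __volatile__("" ::: "memory")

  struct rb_lite {
          unsigned long head;
          unsigned int  nest;
  };

  static unsigned long begin_publish(struct rb_lite *rb)
  {
          rb->nest++;             /* writer is now 'inside'             */
          barrier();              /* forbid hoisting the head load ...  */
          return rb->head;        /* ... above the increment            */
  }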

Suggested-by: Yabin Cui 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Cc: a...@kernel.org
Cc: mark.rutl...@arm.com
Cc: namhy...@kernel.org
Fixes: ef60777c9abd ("perf: Optimize the perf_output() path by removing 
IRQ-disables")
Link: http://lkml.kernel.org/r/20190517115418.309516...@infradead.org
Signed-off-by: Ingo Molnar 
---
 kernel/events/ring_buffer.c | 9 +
 1 file changed, 9 insertions(+)

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 009467a60578..4b5f8d932400 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -48,6 +48,15 @@ static void perf_output_put_handle(struct perf_output_handle 
*handle)
unsigned long head;
 
 again:
+   /*
+* In order to avoid publishing a head value that goes backwards,
+* we must ensure the load of @rb->head happens after we've
+* incremented @rb->nest.
+*
+* Otherwise we can observe a @rb->head value before one published
+* by an IRQ/NMI happening between the load and the increment.
+*/
+   barrier();
head = local_read(>head);
 
/*


[tip:perf/urgent] perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data

2019-05-24 Thread tip-bot for Peter Zijlstra
Commit-ID:  4d839dd9e4356bbacf3eb0ab13a549b83b008c21
Gitweb: https://git.kernel.org/tip/4d839dd9e4356bbacf3eb0ab13a549b83b008c21
Author: Peter Zijlstra 
AuthorDate: Fri, 17 May 2019 13:52:33 +0200
Committer:  Ingo Molnar 
CommitDate: Fri, 24 May 2019 09:00:11 +0200

perf/ring-buffer: Always use {READ,WRITE}_ONCE() for rb->user_page data

We must use {READ,WRITE}_ONCE() on rb->user_page data such that
concurrent usage will see whole values. A few key sites were missing
this.
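
Roughly, the pattern being enforced (a userspace sketch; the macros
below are simplified stand-ins for the kernel's {READ,WRITE}_ONCE(),
and the struct is not the real user page):

  #include <stdint.h>

  #define WRITE_ONCE(x, val)  (*(volatile __typeof__(x) *)&(x) = (val))
  #define READ_ONCE(x)        (*(volatile __typeof__(x) *)&(x))

  struct user_page_lite {
          uint64_t data_head;     /* read concurrently by the mmap() consumer */
  };

  static void publish_head(struct user_page_lite *up, uint64_t head)
  {
          WRITE_ONCE(up->data_head, head);  /* compiler may not tear or elide */
  }

  static uint64_t read_head(struct user_page_lite *up)
  {
          return READ_ONCE(up->data_head);  /* compiler may not tear or refetch */
  }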

Suggested-by: Yabin Cui 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Cc: a...@kernel.org
Cc: mark.rutl...@arm.com
Cc: namhy...@kernel.org
Fixes: 7b732a750477 ("perf_counter: new output ABI - part 1")
Link: http://lkml.kernel.org/r/20190517115418.394192...@infradead.org
Signed-off-by: Ingo Molnar 
---
 kernel/events/ring_buffer.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 4b5f8d932400..7a0c73e4b3eb 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -100,7 +100,7 @@ again:
 * See perf_output_begin().
 */
smp_wmb(); /* B, matches C */
-   rb->user_page->data_head = head;
+   WRITE_ONCE(rb->user_page->data_head, head);
 
/*
 * We must publish the head before decrementing the nest count,
@@ -496,7 +496,7 @@ void perf_aux_output_end(struct perf_output_handle *handle, 
unsigned long size)
perf_event_aux_event(handle->event, aux_head, size,
 handle->aux_flags);
 
-   rb->user_page->aux_head = rb->aux_head;
+   WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb))
wakeup = true;
 
@@ -528,7 +528,7 @@ int perf_aux_output_skip(struct perf_output_handle *handle, 
unsigned long size)
 
rb->aux_head += size;
 
-   rb->user_page->aux_head = rb->aux_head;
+   WRITE_ONCE(rb->user_page->aux_head, rb->aux_head);
if (rb_need_aux_wakeup(rb)) {
perf_output_wakeup(handle);
handle->wakeup = rb->aux_wakeup + rb->aux_watermark;


[tip:perf/urgent] perf/ring-buffer: Use regular variables for nesting

2019-05-24 Thread tip-bot for Peter Zijlstra
Commit-ID:  5322ea58a06da2e69c5ef36a9b4d4b9255edd423
Gitweb: https://git.kernel.org/tip/5322ea58a06da2e69c5ef36a9b4d4b9255edd423
Author: Peter Zijlstra 
AuthorDate: Fri, 17 May 2019 13:52:34 +0200
Committer:  Ingo Molnar 
CommitDate: Fri, 24 May 2019 09:00:11 +0200

perf/ring-buffer: Use regular variables for nesting

While the IRQ/NMI will nest, the nest-count will be invariant over the
actual exception, since the decrements match the increments.

This means we can -- carefully -- use a regular variable since the
typical LOAD-STORE race doesn't exist (similar to preempt_count).

This optimizes the ring-buffer for all LOAD-STORE architectures, since
they need to use atomic ops to implement local_t.
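
The gist, as a sketch (simplified struct, not the patched functions
themselves): because every interrupting writer increments and then
decrements the count before returning, a plain variable with a volatile
access is sufficient, and memop architectures can use a single add.

  struct rb_nest_lite {
          unsigned int nest;
  };

  static void writer_enter(struct rb_nest_lite *rb)
  {
          /*
           * Any IRQ/NMI hitting between the load and the store of this
           * increment does a matching ++/-- itself, so the value the
           * interrupted RMW read is valid again by the time it stores.
           */
          (*(volatile unsigned int *)&rb->nest)++;
  }

  static void writer_exit(struct rb_nest_lite *rb)
  {
          (*(volatile unsigned int *)&rb->nest)--;
  }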

Suggested-by: Alexander Shishkin 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Cc: a...@kernel.org
Cc: mark.rutl...@arm.com
Cc: namhy...@kernel.org
Cc: yab...@google.com
Link: http://lkml.kernel.org/r/20190517115418.481392...@infradead.org
Signed-off-by: Ingo Molnar 
---
 kernel/events/internal.h|  4 ++--
 kernel/events/ring_buffer.c | 41 ++---
 2 files changed, 28 insertions(+), 17 deletions(-)

diff --git a/kernel/events/internal.h b/kernel/events/internal.h
index 79c47076700a..3aef4191798c 100644
--- a/kernel/events/internal.h
+++ b/kernel/events/internal.h
@@ -24,7 +24,7 @@ struct ring_buffer {
atomic_tpoll;   /* POLL_ for wakeups */
 
local_t head;   /* write position*/
-   local_t nest;   /* nested writers*/
+   unsigned intnest;   /* nested writers*/
local_t events; /* event limit   */
local_t wakeup; /* wakeup stamp  */
local_t lost;   /* nr records lost   */
@@ -41,7 +41,7 @@ struct ring_buffer {
 
/* AUX area */
longaux_head;
-   local_t aux_nest;
+   unsigned intaux_nest;
longaux_wakeup; /* last aux_watermark 
boundary crossed by aux_head */
unsigned long   aux_pgoff;
int aux_nr_pages;
diff --git a/kernel/events/ring_buffer.c b/kernel/events/ring_buffer.c
index 7a0c73e4b3eb..ffb59a4ef4ff 100644
--- a/kernel/events/ring_buffer.c
+++ b/kernel/events/ring_buffer.c
@@ -38,7 +38,12 @@ static void perf_output_get_handle(struct perf_output_handle 
*handle)
struct ring_buffer *rb = handle->rb;
 
preempt_disable();
-   local_inc(>nest);
+
+   /*
+* Avoid an explicit LOAD/STORE such that architectures with memops
+* can use them.
+*/
+   (*(volatile unsigned int *)>nest)++;
handle->wakeup = local_read(>wakeup);
 }
 
@@ -46,6 +51,17 @@ static void perf_output_put_handle(struct perf_output_handle 
*handle)
 {
struct ring_buffer *rb = handle->rb;
unsigned long head;
+   unsigned int nest;
+
+   /*
+* If this isn't the outermost nesting, we don't have to update
+* @rb->user_page->data_head.
+*/
+   nest = READ_ONCE(rb->nest);
+   if (nest > 1) {
+   WRITE_ONCE(rb->nest, nest - 1);
+   goto out;
+   }
 
 again:
/*
@@ -64,15 +80,6 @@ again:
 * load above to be stale.
 */
 
-   /*
-* If this isn't the outermost nesting, we don't have to update
-* @rb->user_page->data_head.
-*/
-   if (local_read(>nest) > 1) {
-   local_dec(>nest);
-   goto out;
-   }
-
/*
 * Since the mmap() consumer (userspace) can run on a different CPU:
 *
@@ -108,7 +115,7 @@ again:
 * write will (temporarily) publish a stale value.
 */
barrier();
-   local_set(>nest, 0);
+   WRITE_ONCE(rb->nest, 0);
 
/*
 * Ensure we decrement @rb->nest before we validate the @rb->head.
@@ -116,7 +123,7 @@ again:
 */
barrier();
if (unlikely(head != local_read(>head))) {
-   local_inc(>nest);
+   WRITE_ONCE(rb->nest, 1);
goto again;
}
 
@@ -355,6 +362,7 @@ void *perf_aux_output_begin(struct perf_output_handle 
*handle,
struct perf_event *output_event = event;
unsigned long aux_head, aux_tail;
struct ring_buffer *rb;
+   unsigned int nest;
 
if (output_event->parent)
output_event = output_event->parent;
@@ -385,13 +393,16 @@ void *perf_aux_output_begin(struct perf_output_handle 
*handle,
if (!refcount_inc_not_zero(>aux_refcount))
goto err;
 
+   nest = 

[tip:sched/core] trace: Fix preempt_enable_no_resched() abuse

2019-04-29 Thread tip-bot for Peter Zijlstra
Commit-ID:  e8bd5814989b994cf1b0cb179e1c777e40c0f02c
Gitweb: https://git.kernel.org/tip/e8bd5814989b994cf1b0cb179e1c777e40c0f02c
Author: Peter Zijlstra 
AuthorDate: Tue, 23 Apr 2019 22:03:18 +0200
Committer:  Ingo Molnar 
CommitDate: Mon, 29 Apr 2019 08:27:09 +0200

trace: Fix preempt_enable_no_resched() abuse

Unless there is a call into schedule() in the immediate
(deterministic) future, one must not use preempt_enable_no_resched().
It can cause a preemption to go missing and thereby cause arbitrary
delays, breaking the PREEMPT=y invariant.
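
A kernel-style sketch of the rule (do_work() is a placeholder): the
_no_resched() form is only legitimate when schedule() is the very next
thing that happens; anywhere else a pending reschedule is silently
dropped.

  static void ok_pattern(void)
  {
          preempt_disable();
          do_work();
          preempt_enable_no_resched();    /* fine only because ...        */
          schedule();                     /* ... schedule() follows now   */
  }

  static void broken_pattern(void)
  {
          preempt_disable();
          do_work();
          preempt_enable_no_resched();    /* BAD: no schedule() follows,  */
                                          /* the preemption can be lost   */
  }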

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alexei Starovoitov 
Cc: Daniel Borkmann 
Cc: Davidlohr Bueso 
Cc: Linus Torvalds 
Cc: Roman Gushchin 
Cc: Steven Rostedt 
Cc: Thomas Gleixner 
Cc: Tim Chen 
Cc: Waiman Long 
Cc: Will Deacon 
Cc: huang ying 
Fixes: 2c2d7329d8af ("tracing/ftrace: use preempt_enable_no_resched_notrace in 
ring_buffer_time_stamp()")
Link: 
https://lkml.kernel.org/r/20190423200318.gy14...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/trace/ring_buffer.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/trace/ring_buffer.c b/kernel/trace/ring_buffer.c
index 41b6f96e5366..4ee8d8aa3d0f 100644
--- a/kernel/trace/ring_buffer.c
+++ b/kernel/trace/ring_buffer.c
@@ -762,7 +762,7 @@ u64 ring_buffer_time_stamp(struct ring_buffer *buffer, int 
cpu)
 
preempt_disable_notrace();
time = rb_time_stamp(buffer);
-   preempt_enable_no_resched_notrace();
+   preempt_enable_notrace();
 
return time;
 }


[tip:core/objtool] x86/uaccess: Don't leak the AC flag into __put_user() argument evaluation

2019-04-24 Thread tip-bot for Peter Zijlstra
Commit-ID:  6ae865615fc43d014da2fd1f1bba7e81ee622d1b
Gitweb: https://git.kernel.org/tip/6ae865615fc43d014da2fd1f1bba7e81ee622d1b
Author: Peter Zijlstra 
AuthorDate: Wed, 24 Apr 2019 09:19:24 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Apr 2019 12:19:45 +0200

x86/uaccess: Don't leak the AC flag into __put_user() argument evaluation

The __put_user() macro evaluates its @ptr argument inside the
__uaccess_begin() / __uaccess_end() region. While this would normally
not be expected to be an issue, a UBSAN bug (it ignored -fwrapv,
fixed in GCC 8+) would transform the @ptr evaluation for:

  drivers/gpu/drm/i915/i915_gem_execbuffer.c: if (unlikely(__put_user(offset, 
[r-stack].presumed_offset))) {

into a signed-overflow-UB check and trigger the objtool AC validation.

Finish this commit:

  2a418cf3f5f1 ("x86/uaccess: Don't leak the AC flag into __put_user() value 
evaluation")

and explicitly evaluate all 3 arguments early.
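
The shape of the fix, as a generic sketch (begin/end_special_region()
are hypothetical placeholders, not kernel API; this is not the real
__put_user() macro): capture every argument in a local first, so
whatever code the argument expressions expand to runs outside the
region.

  #define put_user_sketch(x, ptr)                                       \
  ({                                                                    \
          __typeof__(*(ptr)) __val = (x);         /* evaluated early */ \
          __typeof__(ptr)    __ptr = (ptr);       /* evaluated early */ \
          begin_special_region();                                       \
          *__ptr = __val;                 /* only trivial code here */  \
          end_special_region();                                         \
          0;                                                            \
  })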

Reported-by: Randy Dunlap 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Randy Dunlap  # build-tested
Acked-by: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: l...@kernel.org
Fixes: 2a418cf3f5f1 ("x86/uaccess: Don't leak the AC flag into __put_user() 
value evaluation")
Link: http://lkml.kernel.org/r/20190424072208.695962...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/uaccess.h | 7 ---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/uaccess.h b/arch/x86/include/asm/uaccess.h
index 5ca7b91faf67..bb21913885a3 100644
--- a/arch/x86/include/asm/uaccess.h
+++ b/arch/x86/include/asm/uaccess.h
@@ -427,10 +427,11 @@ do {  
\
 ({ \
__label__ __pu_label;   \
int __pu_err = -EFAULT; \
-   __typeof__(*(ptr)) __pu_val;\
-   __pu_val = x;   \
+   __typeof__(*(ptr)) __pu_val = (x);  \
+   __typeof__(ptr) __pu_ptr = (ptr);   \
+   __typeof__(size) __pu_size = (size);\
__uaccess_begin();  \
-   __put_user_size(__pu_val, (ptr), (size), __pu_label);   \
+   __put_user_size(__pu_val, __pu_ptr, __pu_size, __pu_label); \
__pu_err = 0;   \
 __pu_label:\
__uaccess_end();\


[tip:core/objtool] mm/uaccess: Use 'unsigned long' to placate UBSAN warnings on older GCC versions

2019-04-24 Thread tip-bot for Peter Zijlstra
Commit-ID:  29da93fea3ea39ab9b12270cc6be1b70ef201c9e
Gitweb: https://git.kernel.org/tip/29da93fea3ea39ab9b12270cc6be1b70ef201c9e
Author: Peter Zijlstra 
AuthorDate: Wed, 24 Apr 2019 09:19:25 +0200
Committer:  Ingo Molnar 
CommitDate: Wed, 24 Apr 2019 12:19:45 +0200

mm/uaccess: Use 'unsigned long' to placate UBSAN warnings on older GCC versions

Randy reported objtool triggered on his (GCC-7.4) build:

  lib/strncpy_from_user.o: warning: objtool: strncpy_from_user()+0x315: call to 
__ubsan_handle_add_overflow() with UACCESS enabled
  lib/strnlen_user.o: warning: objtool: strnlen_user()+0x337: call to 
__ubsan_handle_sub_overflow() with UACCESS enabled

This is due to UBSAN generating signed-overflow-UB warnings where it
should not. Prior to GCC-8 UBSAN ignored -fwrapv (which the kernel
uses through -fno-strict-overflow).

Make the functions use 'unsigned long' throughout.
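
Concretely (a two-function illustration, not kernel code): the same
arithmetic compiled with a pre-GCC-8 UBSAN gets an overflow check only
in the signed variant, because those UBSAN versions did not honour
-fwrapv.

  long sum_signed(long a, long b)
  {
          return a + b;   /* instrumented: __ubsan_handle_add_overflow() */
  }

  unsigned long sum_unsigned(unsigned long a, unsigned long b)
  {
          return a + b;   /* unsigned wrap is defined; never instrumented */
  }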

Reported-by: Randy Dunlap 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Randy Dunlap  # build-tested
Acked-by: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: l...@kernel.org
Link: http://lkml.kernel.org/r/20190424072208.754094...@infradead.org
Signed-off-by: Ingo Molnar 
---
 lib/strncpy_from_user.c | 5 +++--
 lib/strnlen_user.c  | 4 ++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/lib/strncpy_from_user.c b/lib/strncpy_from_user.c
index 58eacd41526c..023ba9f3b99f 100644
--- a/lib/strncpy_from_user.c
+++ b/lib/strncpy_from_user.c
@@ -23,10 +23,11 @@
  * hit it), 'max' is the address space maximum (and we return
  * -EFAULT if we hit it).
  */
-static inline long do_strncpy_from_user(char *dst, const char __user *src, 
long count, unsigned long max)
+static inline long do_strncpy_from_user(char *dst, const char __user *src,
+   unsigned long count, unsigned long max)
 {
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
-   long res = 0;
+   unsigned long res = 0;
 
/*
 * Truncate 'max' to the user-specified limit, so that
diff --git a/lib/strnlen_user.c b/lib/strnlen_user.c
index 1c1a1b0e38a5..7f2db3fe311f 100644
--- a/lib/strnlen_user.c
+++ b/lib/strnlen_user.c
@@ -28,7 +28,7 @@
 static inline long do_strnlen_user(const char __user *src, unsigned long 
count, unsigned long max)
 {
const struct word_at_a_time constants = WORD_AT_A_TIME_CONSTANTS;
-   long align, res = 0;
+   unsigned long align, res = 0;
unsigned long c;
 
/*
@@ -42,7 +42,7 @@ static inline long do_strnlen_user(const char __user *src, 
unsigned long count,
 * Do everything aligned. But that means that we
 * need to also expand the maximum..
 */
-   align = (sizeof(long) - 1) & (unsigned long)src;
+   align = (sizeof(unsigned long) - 1) & (unsigned long)src;
src -= align;
max += align;
 


[tip:perf/core] perf/x86: Support constraint ranges

2019-04-16 Thread tip-bot for Peter Zijlstra
Commit-ID:  63b79f6ebc464afb730bc45762c820795e276da1
Gitweb: https://git.kernel.org/tip/63b79f6ebc464afb730bc45762c820795e276da1
Author: Peter Zijlstra 
AuthorDate: Tue, 2 Apr 2019 12:45:04 -0700
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Apr 2019 12:26:17 +0200

perf/x86: Support constraint ranges

Icelake extended the general counters to 8, even when SMT is enabled.
However only a (large) subset of the events can be used on all 8
counters.

The events that can or cannot be used on all counters are organized
in ranges.

A lot of scheduler constraints are required to handle all this.

To avoid blowing up the tables add event code ranges to the constraint
tables, and a new inline function to match them.
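
The range test itself is a single unsigned compare; a standalone sketch
of the same trick used by the new constraint_match() helper (simplified
struct, userspace types):

  #include <stdbool.h>
  #include <stdint.h>

  struct range_lite {
          uint64_t code;          /* first event code of the range      */
          uint64_t cmask;         /* which config bits are the code     */
          uint64_t size;          /* last code - first code (0 = exact) */
  };

  static bool range_match(const struct range_lite *c, uint64_t ecode)
  {
          /*
           * If (ecode & cmask) is below 'code' the subtraction wraps to
           * a huge value and the compare fails, so one test covers both
           * ends of [code, code + size].
           */
          return ((ecode & c->cmask) - c->code) <= c->size;
  }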

Originally-by: Andi Kleen 
Signed-off-by: Peter Zijlstra (Intel)  # developer hat on
Signed-off-by: Kan Liang 
Signed-off-by: Peter Zijlstra (Intel)  # maintainer hat on
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Cc: a...@kernel.org
Cc: jo...@kernel.org
Link: https://lkml.kernel.org/r/20190402194509.2832-8-kan.li...@linux.intel.com
Signed-off-by: Ingo Molnar 
---
 arch/x86/events/intel/core.c |  2 +-
 arch/x86/events/intel/ds.c   |  2 +-
 arch/x86/events/perf_event.h | 43 +--
 3 files changed, 39 insertions(+), 8 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index bdc366d709aa..d4b52896f173 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2693,7 +2693,7 @@ x86_get_event_constraints(struct cpu_hw_events *cpuc, int 
idx,
 
if (x86_pmu.event_constraints) {
for_each_event_constraint(c, x86_pmu.event_constraints) {
-   if ((event->hw.config & c->cmask) == c->code) {
+   if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
}
diff --git a/arch/x86/events/intel/ds.c b/arch/x86/events/intel/ds.c
index 6436452d6342..4429bfa92fbc 100644
--- a/arch/x86/events/intel/ds.c
+++ b/arch/x86/events/intel/ds.c
@@ -858,7 +858,7 @@ struct event_constraint *intel_pebs_constraints(struct 
perf_event *event)
 
if (x86_pmu.pebs_constraints) {
for_each_event_constraint(c, x86_pmu.pebs_constraints) {
-   if ((event->hw.config & c->cmask) == c->code) {
+   if (constraint_match(c, event->hw.config)) {
event->hw.flags |= c->flags;
return c;
}
diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index dced91582147..0ff0c5ae8c29 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -49,13 +49,19 @@ struct event_constraint {
unsigned long   idxmsk[BITS_TO_LONGS(X86_PMC_IDX_MAX)];
u64 idxmsk64;
};
-   u64 code;
-   u64 cmask;
-   int weight;
-   int overlap;
-   int flags;
+   u64 code;
+   u64 cmask;
+   int weight;
+   int overlap;
+   int flags;
+   unsigned intsize;
 };
 
+static inline bool constraint_match(struct event_constraint *c, u64 ecode)
+{
+   return ((ecode & c->cmask) - c->code) <= (u64)c->size;
+}
+
 /*
  * struct hw_perf_event.flags flags
  */
@@ -280,18 +286,29 @@ struct cpu_hw_events {
void*kfree_on_online[X86_PERF_KFREE_MAX];
 };
 
-#define __EVENT_CONSTRAINT(c, n, m, w, o, f) {\
+#define __EVENT_CONSTRAINT_RANGE(c, e, n, m, w, o, f) {\
{ .idxmsk64 = (n) },\
.code = (c),\
+   .size = (e) - (c),  \
.cmask = (m),   \
.weight = (w),  \
.overlap = (o), \
.flags = f, \
 }
 
+#define __EVENT_CONSTRAINT(c, n, m, w, o, f) \
+   __EVENT_CONSTRAINT_RANGE(c, c, n, m, w, o, f)
+
 #define EVENT_CONSTRAINT(c, n, m)  \
__EVENT_CONSTRAINT(c, n, m, HWEIGHT(n), 0, 0)
 
+/*
+ * The constraint_match() function only works for 'simple' event codes
+ * and not for extended (AMD64_EVENTSEL_EVENT) events codes.
+ */
+#define EVENT_CONSTRAINT_RANGE(c, e, n, m) \
+   __EVENT_CONSTRAINT_RANGE(c, e, n, m, HWEIGHT(n), 0, 0)
+
 #define INTEL_EXCLEVT_CONSTRAINT(c, n) \
__EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT, HWEIGHT(n),\
   0, PERF_X86_EVENT_EXCL)
@@ -326,6 +343,12 @@ struct cpu_hw_events {
 #define INTEL_EVENT_CONSTRAINT(c, n)   \
EVENT_CONSTRAINT(c, n, ARCH_PERFMON_EVENTSEL_EVENT)
 
+/*
+ * Constraint on a range of Event codes
+ */
+#define INTEL_EVENT_CONSTRAINT_RANGE(c, 

[tip:x86/urgent] x86/mm/tlb: Revert "x86/mm: Align TLB invalidation info"

2019-04-16 Thread tip-bot for Peter Zijlstra
Commit-ID:  780e0106d468a2962b16b52fdf42898f2639e0a0
Gitweb: https://git.kernel.org/tip/780e0106d468a2962b16b52fdf42898f2639e0a0
Author: Peter Zijlstra 
AuthorDate: Tue, 16 Apr 2019 10:03:35 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Apr 2019 10:10:13 +0200

x86/mm/tlb: Revert "x86/mm: Align TLB invalidation info"

Revert the following commit:

  515ab7c41306: ("x86/mm: Align TLB invalidation info")

I found out (the hard way) that under some .config options (notably 
L1_CACHE_SHIFT=7)
and compiler combinations this on-stack alignment leads to a 320 byte
stack usage, which then triggers a KASAN stack warning elsewhere.

Using 320 bytes of stack space for a 40 byte structure is ludicrous and
clearly not right.

Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Linus Torvalds 
Acked-by: Nadav Amit 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Fixes: 515ab7c41306 ("x86/mm: Align TLB invalidation info")
Link: 
http://lkml.kernel.org/r/20190416080335.gm7...@worktop.programming.kicks-ass.net
[ Minor changelog edits. ]
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/tlb.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index bc4bc7b2f075..487b8474c01c 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -728,7 +728,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long 
start,
 {
int cpu;
 
-   struct flush_tlb_info info __aligned(SMP_CACHE_BYTES) = {
+   struct flush_tlb_info info = {
.mm = mm,
.stride_shift = stride_shift,
.freed_tables = freed_tables,


[tip:perf/urgent] perf/x86/intel: Initialize TFA MSR

2019-04-03 Thread tip-bot for Peter Zijlstra
Commit-ID:  d7262457e35dbe239659e62654e56f8ddb814bed
Gitweb: https://git.kernel.org/tip/d7262457e35dbe239659e62654e56f8ddb814bed
Author: Peter Zijlstra 
AuthorDate: Thu, 21 Mar 2019 13:38:49 +0100
Committer:  Ingo Molnar 
CommitDate: Wed, 3 Apr 2019 11:40:32 +0200

perf/x86/intel: Initialize TFA MSR

Stephane reported that the TFA MSR is not initialized by the kernel,
but the TFA bit could be set by firmware or be left over from a kexec,
which makes the state inconsistent.

Reported-by: Stephane Eranian 
Tested-by: Nelson DSouza 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Cc: to...@suse.com
Link: 
https://lkml.kernel.org/r/20190321123849.gn6...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 arch/x86/events/intel/core.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 1539647ea39d..f61dcbef20ff 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3575,6 +3575,12 @@ static void intel_pmu_cpu_starting(int cpu)
 
cpuc->lbr_sel = NULL;
 
+   if (x86_pmu.flags & PMU_FL_TFA) {
+   WARN_ON_ONCE(cpuc->tfa_shadow);
+   cpuc->tfa_shadow = ~0ULL;
+   intel_set_tfa(cpuc, false);
+   }
+
if (x86_pmu.version > 1)
flip_smm_bit(_pmu.attr_freeze_on_smi);
 


[tip:sched/urgent] sched/cpufreq: Fix 32-bit math overflow

2019-03-19 Thread tip-bot for Peter Zijlstra
Commit-ID:  a23314e9d88d89d49e69db08f60b7caa470f04e1
Gitweb: https://git.kernel.org/tip/a23314e9d88d89d49e69db08f60b7caa470f04e1
Author: Peter Zijlstra 
AuthorDate: Tue, 5 Mar 2019 09:32:02 +0100
Committer:  Ingo Molnar 
CommitDate: Tue, 19 Mar 2019 12:06:11 +0100

sched/cpufreq: Fix 32-bit math overflow

Vincent Wang reported that get_next_freq() has a mult overflow bug on
32-bit platforms in the IOWAIT boost case, since in that case {util,max}
are in freq units instead of capacity units.

Solve this by moving the IOWAIT boost to capacity units. And since this
means @max is constant, simplify the code.
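
A small userspace illustration of the failure mode (made-up numbers,
not the schedutil code itself): with a 32-bit unsigned long, a boost in
frequency units makes the freq * util product wrap, while a boost in
capacity units (<= SCHED_CAPACITY_SCALE, i.e. 1024) keeps it in range.

  #include <stdint.h>
  #include <stdio.h>

  int main(void)
  {
          uint32_t freq      = 2800000;   /* kHz, as on a 2.8 GHz policy       */
          uint32_t util_freq = 2800000;   /* boost expressed in freq units     */
          uint32_t util_cap  = 1024;      /* boost expressed in capacity units */

          /* wraps: 2.8e6 * 2.8e6 is far beyond 2^32 */
          printf("freq units:     %u\n", freq * util_freq);
          /* fits: 2.8e6 * 1024 ~= 2.87e9 < 2^32 */
          printf("capacity units: %u\n", freq * util_cap);
          return 0;
  }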

Reported-by: Vincent Wang 
Tested-by: Vincent Wang 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Rafael J. Wysocki 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Chunyan Zhang 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Quentin Perret 
Cc: Rafael J. Wysocki 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/20190305083202.gu32...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/cpufreq_schedutil.c | 59 +---
 1 file changed, 25 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 033ec7c45f13..1ccf77f6d346 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -48,10 +48,10 @@ struct sugov_cpu {
 
booliowait_boost_pending;
unsigned intiowait_boost;
-   unsigned intiowait_boost_max;
u64 last_update;
 
unsigned long   bw_dl;
+   unsigned long   min;
unsigned long   max;
 
/* The field below is for single-CPU policies only: */
@@ -303,8 +303,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, 
u64 time,
if (delta_ns <= TICK_NSEC)
return false;
 
-   sg_cpu->iowait_boost = set_iowait_boost
-   ? sg_cpu->sg_policy->policy->min : 0;
+   sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0;
sg_cpu->iowait_boost_pending = set_iowait_boost;
 
return true;
@@ -344,14 +343,13 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, 
u64 time,
 
/* Double the boost at each request */
if (sg_cpu->iowait_boost) {
-   sg_cpu->iowait_boost <<= 1;
-   if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
-   sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+   sg_cpu->iowait_boost =
+   min_t(unsigned int, sg_cpu->iowait_boost << 1, 
SCHED_CAPACITY_SCALE);
return;
}
 
/* First wakeup after IO: start with minimum boost */
-   sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+   sg_cpu->iowait_boost = sg_cpu->min;
 }
 
 /**
@@ -373,47 +371,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, 
u64 time,
  * This mechanism is designed to boost high frequently IO waiting tasks, while
  * being more conservative on tasks which does sporadic IO operations.
  */
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
-  unsigned long *util, unsigned long *max)
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+   unsigned long util, unsigned long max)
 {
-   unsigned int boost_util, boost_max;
+   unsigned long boost;
 
/* No boost currently required */
if (!sg_cpu->iowait_boost)
-   return;
+   return util;
 
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
-   return;
+   return util;
 
-   /*
-* An IO waiting task has just woken up:
-* allow to further double the boost value
-*/
-   if (sg_cpu->iowait_boost_pending) {
-   sg_cpu->iowait_boost_pending = false;
-   } else {
+   if (!sg_cpu->iowait_boost_pending) {
/*
-* Otherwise: reduce the boost value and disable it when we
-* reach the minimum.
+* No boost pending; reduce the boost value.
 */
sg_cpu->iowait_boost >>= 1;
-   if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+   if (sg_cpu->iowait_boost < sg_cpu->min) {
sg_cpu->iowait_boost = 0;
-   return;
+   return util;
}
}
 
+   sg_cpu->iowait_boost_pending = false;
+
/*
-* Apply the current boost value: a CPU is boosted only if its current
-* utilization is smaller then the current IO boost level.
+* @util is already in capacity scale; convert iowait_boost
+* into the same 

[tip:perf/urgent] perf/x86: Fixup typo in stub functions

2019-03-15 Thread tip-bot for Peter Zijlstra
Commit-ID:  f764c58b7faa26f5714e6907f892abc2bc0de4f8
Gitweb: https://git.kernel.org/tip/f764c58b7faa26f5714e6907f892abc2bc0de4f8
Author: Peter Zijlstra 
AuthorDate: Fri, 15 Mar 2019 09:14:10 +0100
Committer:  Ingo Molnar 
CommitDate: Fri, 15 Mar 2019 13:12:42 +0100

perf/x86: Fixup typo in stub functions

Guenter reported a build warning for CONFIG_CPU_SUP_INTEL=n:

  > With allmodconfig-CONFIG_CPU_SUP_INTEL, this patch results in:
  >
  > In file included from arch/x86/events/amd/core.c:8:0:
  > arch/x86/events/amd/../perf_event.h:1036:45: warning: ‘struct cpu_hw_event’ 
declared inside parameter list will not be visible outside of this definition 
or declaration
  >  static inline int intel_cpuc_prepare(struct cpu_hw_event *cpuc, int cpu)

While harmless (an unused pointer is an unused pointer, no matter the type)
it needs fixing.

Reported-by: Guenter Roeck 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Greg Kroah-Hartman 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Fixes: d01b1f96a82e ("perf/x86/intel: Make cpuc allocations consistent")
Link: 
http://lkml.kernel.org/r/20190315081410.gr5...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 arch/x86/events/perf_event.h | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/x86/events/perf_event.h b/arch/x86/events/perf_event.h
index b04ae6c8775e..a75955741c50 100644
--- a/arch/x86/events/perf_event.h
+++ b/arch/x86/events/perf_event.h
@@ -1033,12 +1033,12 @@ static inline int intel_pmu_init(void)
return 0;
 }
 
-static inline int intel_cpuc_prepare(struct cpu_hw_event *cpuc, int cpu)
+static inline int intel_cpuc_prepare(struct cpu_hw_events *cpuc, int cpu)
 {
return 0;
 }
 
-static inline void intel_cpuc_finish(struct cpu_hw_event *cpuc)
+static inline void intel_cpuc_finish(struct cpu_hw_events *cpuc)
 {
 }
 


[tip:perf/urgent] perf/x86/intel: Fix memory corruption

2019-03-15 Thread tip-bot for Peter Zijlstra
Commit-ID:  ede271b059463731cbd6dffe55ffd70d7dbe8392
Gitweb: https://git.kernel.org/tip/ede271b059463731cbd6dffe55ffd70d7dbe8392
Author: Peter Zijlstra 
AuthorDate: Thu, 14 Mar 2019 14:01:14 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 15 Mar 2019 12:22:51 +0100

perf/x86/intel: Fix memory corruption

Through:

  validate_event()
x86_pmu.get_event_constraints(.idx=-1)
  tfa_get_event_constraints()
dyn_constraint()

cpuc->constraint_list[-1] is used, which is an obvious out-of-bounds access.

In this case, simply skip the TFA constraint code, there is no event
constraint with just PMC3, therefore the code will never result in the
empty set.

Fixes: 400816f60c54 ("perf/x86/intel: Implement support for TSX Force Abort")
Reported-by: Tony Jones 
Reported-by: "DSouza, Nelson" 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Tested-by: Tony Jones 
Tested-by: "DSouza, Nelson" 
Cc: eran...@google.com
Cc: jo...@redhat.com
Cc: sta...@kernel.org
Link: https://lkml.kernel.org/r/20190314130705.441549...@infradead.org

---
 arch/x86/events/intel/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 35102ecdfc8d..92dfeb343a6a 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3410,7 +3410,7 @@ tfa_get_event_constraints(struct cpu_hw_events *cpuc, int 
idx,
/*
 * Without TFA we must not use PMC3.
 */
-   if (!allow_tsx_force_abort && test_bit(3, c->idxmsk)) {
+   if (!allow_tsx_force_abort && test_bit(3, c->idxmsk) && idx >= 0) {
c = dyn_constraint(cpuc, c, idx);
c->idxmsk64 &= ~(1ULL << 3);
c->weight--;


[tip:sched/urgent] sched/cpufreq: Fix 32-bit math overflow

2019-03-09 Thread tip-bot for Peter Zijlstra
Commit-ID:  f1212844e9dc3a31d41f99713c5522acf92ff291
Gitweb: https://git.kernel.org/tip/f1212844e9dc3a31d41f99713c5522acf92ff291
Author: Peter Zijlstra 
AuthorDate: Tue, 5 Mar 2019 09:32:02 +0100
Committer:  Ingo Molnar 
CommitDate: Sat, 9 Mar 2019 14:03:51 +0100

sched/cpufreq: Fix 32-bit math overflow

Vincent Wang reported that get_next_freq() has a mult overflow bug on
32-bit platforms in the IOWAIT boost case, since in that case {util,max}
are in freq units instead of capacity units.

Solve this by moving the IOWAIT boost to capacity units. And since this
means @max is constant, simplify the code.

Reported-by: Vincent Wang 
Tested-by: Vincent Wang 
Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Rafael J. Wysocki 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Chunyan Zhang 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Quentin Perret 
Cc: Rafael J. Wysocki 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/20190305083202.gu32...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/cpufreq_schedutil.c | 58 +---
 1 file changed, 24 insertions(+), 34 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index 033ec7c45f13..5a8932ee5112 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -48,10 +48,10 @@ struct sugov_cpu {
 
booliowait_boost_pending;
unsigned intiowait_boost;
-   unsigned intiowait_boost_max;
u64 last_update;
 
unsigned long   bw_dl;
+   unsigned long   min;
unsigned long   max;
 
/* The field below is for single-CPU policies only: */
@@ -303,8 +303,7 @@ static bool sugov_iowait_reset(struct sugov_cpu *sg_cpu, 
u64 time,
if (delta_ns <= TICK_NSEC)
return false;
 
-   sg_cpu->iowait_boost = set_iowait_boost
-   ? sg_cpu->sg_policy->policy->min : 0;
+   sg_cpu->iowait_boost = set_iowait_boost ? sg_cpu->min : 0;
sg_cpu->iowait_boost_pending = set_iowait_boost;
 
return true;
@@ -344,14 +343,12 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, 
u64 time,
 
/* Double the boost at each request */
if (sg_cpu->iowait_boost) {
-   sg_cpu->iowait_boost <<= 1;
-   if (sg_cpu->iowait_boost > sg_cpu->iowait_boost_max)
-   sg_cpu->iowait_boost = sg_cpu->iowait_boost_max;
+   sg_cpu->iowait_boost = min(sg_cpu->iowait_boost << 1, 
SCHED_CAPACITY_SCALE);
return;
}
 
/* First wakeup after IO: start with minimum boost */
-   sg_cpu->iowait_boost = sg_cpu->sg_policy->policy->min;
+   sg_cpu->iowait_boost = sg_cpu->min;
 }
 
 /**
@@ -373,47 +370,38 @@ static void sugov_iowait_boost(struct sugov_cpu *sg_cpu, 
u64 time,
  * This mechanism is designed to boost high frequently IO waiting tasks, while
  * being more conservative on tasks which does sporadic IO operations.
  */
-static void sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
-  unsigned long *util, unsigned long *max)
+static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time,
+   unsigned long util, unsigned long max)
 {
-   unsigned int boost_util, boost_max;
+   unsigned long boost;
 
/* No boost currently required */
if (!sg_cpu->iowait_boost)
-   return;
+   return util;
 
/* Reset boost if the CPU appears to have been idle enough */
if (sugov_iowait_reset(sg_cpu, time, false))
-   return;
+   return util;
 
-   /*
-* An IO waiting task has just woken up:
-* allow to further double the boost value
-*/
-   if (sg_cpu->iowait_boost_pending) {
-   sg_cpu->iowait_boost_pending = false;
-   } else {
+   if (!sg_cpu->iowait_boost_pending) {
/*
-* Otherwise: reduce the boost value and disable it when we
-* reach the minimum.
+* No boost pending; reduce the boost value.
 */
sg_cpu->iowait_boost >>= 1;
-   if (sg_cpu->iowait_boost < sg_cpu->sg_policy->policy->min) {
+   if (sg_cpu->iowait_boost < sg_cpu->min) {
sg_cpu->iowait_boost = 0;
-   return;
+   return util;
}
}
 
+   sg_cpu->iowait_boost_pending = false;
+
/*
-* Apply the current boost value: a CPU is boosted only if its current
-* utilization is smaller then the current IO boost level.
+* @util is already in capacity scale; convert iowait_boost
+* into the same scale so we can compare.
 */
-   

[tip:x86/urgent] x86/mm/cpa: Fix set_mce_nospec()

2019-02-08 Thread tip-bot for Peter Zijlstra
Commit-ID:  0521e8be211cd20d547bff9da2534b7ed6f2c1b9
Gitweb: https://git.kernel.org/tip/0521e8be211cd20d547bff9da2534b7ed6f2c1b9
Author: Peter Zijlstra 
AuthorDate: Fri, 8 Feb 2019 13:08:59 +0100
Committer:  Thomas Gleixner 
CommitDate: Fri, 8 Feb 2019 14:31:56 +0100

x86/mm/cpa: Fix set_mce_nospec()

The recent commit fe0937b24ff5 ("x86/mm/cpa: Fold cpa_flush_range() and
cpa_flush_array() into a single cpa_flush() function") accidentally made
the call to make_addr_canonical_again() go away, which breaks
set_mce_nospec().

Re-instate the call to convert the address back into canonical form right
before invoking either CLFLUSH or INVLPG. Rename the function while at it
to be shorter (and less MAGA).

Fixes: fe0937b24ff5 ("x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() 
into a single cpa_flush() function")
Reported-by: Tony Luck 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Tested-by: Tony Luck 
Cc: Linus Torvalds 
Cc: Dan Williams 
Cc: Dave Hansen 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Rik van Riel 
Link: 
https://lkml.kernel.org/r/20190208120859.gh32...@hirez.programming.kicks-ass.net
---
 arch/x86/mm/pageattr.c | 50 +-
 1 file changed, 25 insertions(+), 25 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4f8972311a77..14e6119838a6 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -230,6 +230,29 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
 
 #endif
 
+/*
+ * See set_mce_nospec().
+ *
+ * Machine check recovery code needs to change cache mode of poisoned pages to
+ * UC to avoid speculative access logging another error. But passing the
+ * address of the 1:1 mapping to set_memory_uc() is a fine way to encourage a
+ * speculative access. So we cheat and flip the top bit of the address. This
+ * works fine for the code that updates the page tables. But at the end of the
+ * process we need to flush the TLB and cache and the non-canonical address
+ * causes a #GP fault when used by the INVLPG and CLFLUSH instructions.
+ *
+ * But in the common case we already have a canonical address. This code
+ * will fix the top bit if needed and is a no-op otherwise.
+ */
+static inline unsigned long fix_addr(unsigned long addr)
+{
+#ifdef CONFIG_X86_64
+   return (long)(addr << 1) >> 1;
+#else
+   return addr;
+#endif
+}
+
 static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
 {
if (cpa->flags & CPA_PAGES_ARRAY) {
@@ -313,7 +336,7 @@ void __cpa_flush_tlb(void *data)
unsigned int i;
 
for (i = 0; i < cpa->numpages; i++)
-   __flush_tlb_one_kernel(__cpa_addr(cpa, i));
+   __flush_tlb_one_kernel(fix_addr(__cpa_addr(cpa, i)));
 }
 
 static void cpa_flush(struct cpa_data *data, int cache)
@@ -347,7 +370,7 @@ static void cpa_flush(struct cpa_data *data, int cache)
 * Only flush present addresses:
 */
if (pte && (pte_val(*pte) & _PAGE_PRESENT))
-   clflush_cache_range_opt((void *)addr, PAGE_SIZE);
+   clflush_cache_range_opt((void *)fix_addr(addr), 
PAGE_SIZE);
}
mb();
 }
@@ -1627,29 +1650,6 @@ out:
return ret;
 }
 
-/*
- * Machine check recovery code needs to change cache mode of poisoned
- * pages to UC to avoid speculative access logging another error. But
- * passing the address of the 1:1 mapping to set_memory_uc() is a fine
- * way to encourage a speculative access. So we cheat and flip the top
- * bit of the address. This works fine for the code that updates the
- * page tables. But at the end of the process we need to flush the cache
- * and the non-canonical address causes a #GP fault when used by the
- * CLFLUSH instruction.
- *
- * But in the common case we already have a canonical address. This code
- * will fix the top bit if needed and is a no-op otherwise.
- */
-static inline unsigned long make_addr_canonical_again(unsigned long addr)
-{
-#ifdef CONFIG_X86_64
-   return (long)(addr << 1) >> 1;
-#else
-   return addr;
-#endif
-}
-
-
 static int change_page_attr_set_clr(unsigned long *addr, int numpages,
pgprot_t mask_set, pgprot_t mask_clr,
int force_split, int in_flag,


[tip:perf/core] perf/x86/intel: Delay memory deallocation until x86_pmu_dead_cpu()

2019-02-04 Thread tip-bot for Peter Zijlstra
Commit-ID:  602cae04c4864bb3487dfe4c2126c8d9e7e1614a
Gitweb: https://git.kernel.org/tip/602cae04c4864bb3487dfe4c2126c8d9e7e1614a
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Dec 2018 17:53:50 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 4 Feb 2019 08:44:51 +0100

perf/x86/intel: Delay memory deallocation until x86_pmu_dead_cpu()

intel_pmu_cpu_prepare() allocated memory for ->shared_regs among other
members of struct cpu_hw_events. This memory is released in
intel_pmu_cpu_dying() which is wrong. The counterpart of the
intel_pmu_cpu_prepare() callback is x86_pmu_dead_cpu().

Otherwise if the CPU fails on the UP path between CPUHP_PERF_X86_PREPARE
and CPUHP_AP_PERF_X86_STARTING then it won't release the memory but
allocate new memory on the next attempt to online the CPU (leaking the
old memory).
Also, if the CPU down path fails between CPUHP_AP_PERF_X86_STARTING and
CPUHP_PERF_X86_PREPARE then the CPU will go back online but never
allocate the memory that was released in x86_pmu_dying_cpu().

Make the memory allocation/free symmetrical in regard to the CPU hotplug
notifier by moving the deallocation to intel_pmu_cpu_dead().

This started in commit:

   a7e3ed1e47011 ("perf: Add support for supplementary event registers").

In principle the bug was introduced in v2.6.39 (!), but it will almost
certainly not backport cleanly across the big CPU hotplug rewrite between 
v4.7-v4.15...

[ bigeasy: Added patch description. ]
[ mingo: Added backporting guidance. ]

Reported-by: He Zhe 
Signed-off-by: Peter Zijlstra (Intel)  # With developer 
hat on
Signed-off-by: Sebastian Andrzej Siewior 
Signed-off-by: Peter Zijlstra (Intel)  # With maintainer 
hat on
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: a...@kernel.org
Cc: b...@alien8.de
Cc: h...@zytor.com
Cc: jo...@kernel.org
Cc: kan.li...@linux.intel.com
Cc: namhy...@kernel.org
Cc: 
Fixes: a7e3ed1e47011 ("perf: Add support for supplementary event registers").
Link: https://lkml.kernel.org/r/20181219165350.6s3jvyxbibpvl...@linutronix.de
Signed-off-by: Ingo Molnar 
---
 arch/x86/events/intel/core.c | 16 +++-
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 40e12cfc87f6..daafb893449b 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -3558,6 +3558,14 @@ static void free_excl_cntrs(int cpu)
 }
 
 static void intel_pmu_cpu_dying(int cpu)
+{
+   fini_debug_store_on_cpu(cpu);
+
+   if (x86_pmu.counter_freezing)
+   disable_counter_freeze();
+}
+
+static void intel_pmu_cpu_dead(int cpu)
 {
struct cpu_hw_events *cpuc = _cpu(cpu_hw_events, cpu);
struct intel_shared_regs *pc;
@@ -3570,11 +3578,6 @@ static void intel_pmu_cpu_dying(int cpu)
}
 
free_excl_cntrs(cpu);
-
-   fini_debug_store_on_cpu(cpu);
-
-   if (x86_pmu.counter_freezing)
-   disable_counter_freeze();
 }
 
 static void intel_pmu_sched_task(struct perf_event_context *ctx,
@@ -3663,6 +3666,7 @@ static __initconst const struct x86_pmu core_pmu = {
.cpu_prepare= intel_pmu_cpu_prepare,
.cpu_starting   = intel_pmu_cpu_starting,
.cpu_dying  = intel_pmu_cpu_dying,
+   .cpu_dead   = intel_pmu_cpu_dead,
 };
 
 static struct attribute *intel_pmu_attrs[];
@@ -3703,6 +3707,8 @@ static __initconst const struct x86_pmu intel_pmu = {
.cpu_prepare= intel_pmu_cpu_prepare,
.cpu_starting   = intel_pmu_cpu_starting,
.cpu_dying  = intel_pmu_cpu_dying,
+   .cpu_dead   = intel_pmu_cpu_dead,
+
.guest_get_msrs = intel_guest_get_msrs,
.sched_task = intel_pmu_sched_task,
 };


[tip:sched/core] sched/fair: Robustify CFS-bandwidth timer locking

2019-01-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  c0ad4aa4d8416a39ad262a2bd68b30acd951bf0e
Gitweb: https://git.kernel.org/tip/c0ad4aa4d8416a39ad262a2bd68b30acd951bf0e
Author: Peter Zijlstra 
AuthorDate: Mon, 7 Jan 2019 13:52:31 +0100
Committer:  Ingo Molnar 
CommitDate: Sun, 27 Jan 2019 12:29:37 +0100

sched/fair: Robustify CFS-bandwidth timer locking

Traditionally hrtimer callbacks were run with IRQs disabled, but with
the introduction of HRTIMER_MODE_SOFT it is possible they run from
SoftIRQ context, which does _NOT_ have IRQs disabled.

Allow for the CFS bandwidth timers (period_timer and slack_timer) to
be run from SoftIRQ context; this entails removing the assumption that
IRQs are already disabled from the locking.

While mainline doesn't strictly need this, -RT forces all timers not
explicitly marked with MODE_HARD into MODE_SOFT and trips over this.
And marking these timers as MODE_HARD doesn't make sense as they're
not required for RT operation and can potentially be quite expensive.

Reported-by: Tom Putzeys 
Tested-by: Mike Galbraith 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Sebastian Andrzej Siewior 
Cc: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/20190107125231.ge14...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b1374fbddd0d..3b61e19b504a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4565,7 +4565,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth 
*cfs_b,
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
 
-   rq_lock(rq, );
+   rq_lock_irqsave(rq, );
if (!cfs_rq_throttled(cfs_rq))
goto next;
 
@@ -4582,7 +4582,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth 
*cfs_b,
unthrottle_cfs_rq(cfs_rq);
 
 next:
-   rq_unlock(rq, );
+   rq_unlock_irqrestore(rq, );
 
if (!remaining)
break;
@@ -4598,7 +4598,7 @@ next:
  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
  * used to track this state.
  */
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, 
unsigned long flags)
 {
u64 runtime, runtime_expires;
int throttled;
@@ -4640,11 +4640,11 @@ static int do_sched_cfs_period_timer(struct 
cfs_bandwidth *cfs_b, int overrun)
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
 runtime_expires);
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
 
cfs_b->distribute_running = 0;
throttled = !list_empty(_b->throttled_cfs_rq);
@@ -4753,17 +4753,18 @@ static __always_inline void 
return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+   unsigned long flags;
u64 expires;
 
/* confirm we're still not at a refresh boundary */
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
if (cfs_b->distribute_running) {
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
return;
}
 
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
return;
}
 
@@ -4774,18 +4775,18 @@ static void do_sched_cfs_slack_timer(struct 
cfs_bandwidth *cfs_b)
if (runtime)
cfs_b->distribute_running = 1;
 
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
 
if (!runtime)
return;
 
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
 
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
if (expires == cfs_b->runtime_expires)
lsub_positive(_b->runtime, runtime);
cfs_b->distribute_running = 0;
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
 }
 
 /*
@@ -4863,20 +4864,21 @@ static enum hrtimer_restart 
sched_cfs_period_timer(struct hrtimer *timer)
 {
struct cfs_bandwidth *cfs_b =

[tip:sched/core] sched/fair: Robustify CFS-bandwidth timer locking

2019-01-21 Thread tip-bot for Peter Zijlstra
Commit-ID:  3cd126af79ed5a4d6b06eba63d3349e143a3bd3b
Gitweb: https://git.kernel.org/tip/3cd126af79ed5a4d6b06eba63d3349e143a3bd3b
Author: Peter Zijlstra 
AuthorDate: Mon, 7 Jan 2019 13:52:31 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 21 Jan 2019 14:40:28 +0100

sched/fair: Robustify CFS-bandwidth timer locking

Traditionally hrtimer callbacks were run with IRQs disabled, but with
the introduction of HRTIMER_MODE_SOFT it is possible they run from
SoftIRQ context, which does _NOT_ have IRQs disabled.

Allow for the CFS bandwidth timers (period_timer and slack_timer) to
be run from SoftIRQ context; this entails removing the assumption that
IRQs are already disabled from the locking.

While mainline doesn't strictly need this, -RT forces all timers not
explicitly marked with MODE_HARD into MODE_SOFT and trips over this.
And marking these timers as MODE_HARD doesn't make sense as they're
not required for RT operation and can potentially be quite expensive.

Reported-by: Tom Putzeys 
Tested-by: Mike Galbraith 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Sebastian Andrzej Siewior 
Cc: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/20190107125231.ge14...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index b1374fbddd0d..3b61e19b504a 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4565,7 +4565,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth 
*cfs_b,
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
 
-   rq_lock(rq, );
+   rq_lock_irqsave(rq, );
if (!cfs_rq_throttled(cfs_rq))
goto next;
 
@@ -4582,7 +4582,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth 
*cfs_b,
unthrottle_cfs_rq(cfs_rq);
 
 next:
-   rq_unlock(rq, );
+   rq_unlock_irqrestore(rq, );
 
if (!remaining)
break;
@@ -4598,7 +4598,7 @@ next:
  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
  * used to track this state.
  */
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, 
unsigned long flags)
 {
u64 runtime, runtime_expires;
int throttled;
@@ -4640,11 +4640,11 @@ static int do_sched_cfs_period_timer(struct 
cfs_bandwidth *cfs_b, int overrun)
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
 runtime_expires);
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
 
cfs_b->distribute_running = 0;
throttled = !list_empty(_b->throttled_cfs_rq);
@@ -4753,17 +4753,18 @@ static __always_inline void 
return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+   unsigned long flags;
u64 expires;
 
/* confirm we're still not at a refresh boundary */
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
if (cfs_b->distribute_running) {
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
return;
}
 
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
return;
}
 
@@ -4774,18 +4775,18 @@ static void do_sched_cfs_slack_timer(struct 
cfs_bandwidth *cfs_b)
if (runtime)
cfs_b->distribute_running = 1;
 
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
 
if (!runtime)
return;
 
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
 
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
if (expires == cfs_b->runtime_expires)
lsub_positive(_b->runtime, runtime);
cfs_b->distribute_running = 0;
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
 }
 
 /*
@@ -4863,20 +4864,21 @@ static enum hrtimer_restart 
sched_cfs_period_timer(struct hrtimer *timer)
 {
struct cfs_bandwidth *cfs_b =

[tip:sched/core] sched/fair: Robustify CFS-bandwidth timer locking

2019-01-21 Thread tip-bot for Peter Zijlstra
Commit-ID:  b733c2d2f2810ec8556d2d711d1b95f491bd7697
Gitweb: https://git.kernel.org/tip/b733c2d2f2810ec8556d2d711d1b95f491bd7697
Author: Peter Zijlstra 
AuthorDate: Mon, 7 Jan 2019 13:52:31 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 21 Jan 2019 11:27:55 +0100

sched/fair: Robustify CFS-bandwidth timer locking

Traditionally hrtimer callbacks were run with IRQs disabled, but with
the introduction of HRTIMER_MODE_SOFT it is possible they run from
SoftIRQ context, which does _NOT_ have IRQs disabled.

Allow the CFS bandwidth timers (period_timer and slack_timer) to be
run from SoftIRQ context; this entails removing the locking's
assumption that IRQs are already disabled.

While mainline doesn't strictly need this, -RT forces all timers not
explicitly marked with MODE_HARD into MODE_SOFT and trips over this.
And marking these timers as MODE_HARD doesn't make sense as they're
not required for RT operation and can potentially be quite expensive.

Reported-by: Tom Putzeys 
Tested-by: Mike Galbraith 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Sebastian Andrzej Siewior 
Cc: Thomas Gleixner 
Link: 
https://lkml.kernel.org/r/20190107125231.ge14...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/fair.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 385725eb3bd6..90c7a7bf45d3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -4565,7 +4565,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth 
*cfs_b,
struct rq *rq = rq_of(cfs_rq);
struct rq_flags rf;
 
-   rq_lock(rq, );
+   rq_lock_irqsave(rq, );
if (!cfs_rq_throttled(cfs_rq))
goto next;
 
@@ -4582,7 +4582,7 @@ static u64 distribute_cfs_runtime(struct cfs_bandwidth 
*cfs_b,
unthrottle_cfs_rq(cfs_rq);
 
 next:
-   rq_unlock(rq, );
+   rq_unlock_irqrestore(rq, );
 
if (!remaining)
break;
@@ -4598,7 +4598,7 @@ next:
  * period the timer is deactivated until scheduling resumes; cfs_b->idle is
  * used to track this state.
  */
-static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun)
+static int do_sched_cfs_period_timer(struct cfs_bandwidth *cfs_b, int overrun, 
unsigned long flags)
 {
u64 runtime, runtime_expires;
int throttled;
@@ -4640,11 +4640,11 @@ static int do_sched_cfs_period_timer(struct 
cfs_bandwidth *cfs_b, int overrun)
while (throttled && cfs_b->runtime > 0 && !cfs_b->distribute_running) {
runtime = cfs_b->runtime;
cfs_b->distribute_running = 1;
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
/* we can't nest cfs_b->lock while distributing bandwidth */
runtime = distribute_cfs_runtime(cfs_b, runtime,
 runtime_expires);
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
 
cfs_b->distribute_running = 0;
throttled = !list_empty(_b->throttled_cfs_rq);
@@ -4753,17 +4753,18 @@ static __always_inline void 
return_cfs_rq_runtime(struct cfs_rq *cfs_rq)
 static void do_sched_cfs_slack_timer(struct cfs_bandwidth *cfs_b)
 {
u64 runtime = 0, slice = sched_cfs_bandwidth_slice();
+   unsigned long flags;
u64 expires;
 
/* confirm we're still not at a refresh boundary */
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
if (cfs_b->distribute_running) {
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
return;
}
 
if (runtime_refresh_within(cfs_b, min_bandwidth_expiration)) {
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
return;
}
 
@@ -4774,18 +4775,18 @@ static void do_sched_cfs_slack_timer(struct 
cfs_bandwidth *cfs_b)
if (runtime)
cfs_b->distribute_running = 1;
 
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
 
if (!runtime)
return;
 
runtime = distribute_cfs_runtime(cfs_b, runtime, expires);
 
-   raw_spin_lock(_b->lock);
+   raw_spin_lock_irqsave(_b->lock, flags);
if (expires == cfs_b->runtime_expires)
lsub_positive(_b->runtime, runtime);
cfs_b->distribute_running = 0;
-   raw_spin_unlock(_b->lock);
+   raw_spin_unlock_irqrestore(_b->lock, flags);
 }
 
 /*
@@ -4863,20 +4864,21 @@ static enum hrtimer_restart 
sched_cfs_period_timer(struct hrtimer *timer)
 {
struct cfs_bandwidth *cfs_b =

[tip:x86/mm] x86/mm/cpa: Rename @addrinarray to @numpages

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  3c567356dbe0da4fc310cfcffafc39526e1ca43a
Gitweb: https://git.kernel.org/tip/3c567356dbe0da4fc310cfcffafc39526e1ca43a
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:53 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:30 +0100

x86/mm/cpa: Rename @addrinarray to @numpages

The CPA_ARRAY interface works in single pages, and everywhere except
in these 'few' locations the variable is called 'numpages'.

Remove the 'addrinarray' aberration and use 'numpages' consistently.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.695039...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr.c | 52 +-
 1 file changed, 26 insertions(+), 26 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 7d05149995dc..df4340c8e293 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1808,14 +1808,14 @@ out_err:
 }
 EXPORT_SYMBOL(set_memory_uc);
 
-static int _set_memory_array(unsigned long *addr, int addrinarray,
+static int _set_memory_array(unsigned long *addr, int numpages,
enum page_cache_mode new_type)
 {
enum page_cache_mode set_type;
int i, j;
int ret;
 
-   for (i = 0; i < addrinarray; i++) {
+   for (i = 0; i < numpages; i++) {
ret = reserve_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE,
new_type, NULL);
if (ret)
@@ -1826,11 +1826,11 @@ static int _set_memory_array(unsigned long *addr, int 
addrinarray,
set_type = (new_type == _PAGE_CACHE_MODE_WC) ?
_PAGE_CACHE_MODE_UC_MINUS : new_type;
 
-   ret = change_page_attr_set(addr, addrinarray,
+   ret = change_page_attr_set(addr, numpages,
   cachemode2pgprot(set_type), 1);
 
if (!ret && new_type == _PAGE_CACHE_MODE_WC)
-   ret = change_page_attr_set_clr(addr, addrinarray,
+   ret = change_page_attr_set_clr(addr, numpages,
   cachemode2pgprot(
_PAGE_CACHE_MODE_WC),
   __pgprot(_PAGE_CACHE_MASK),
@@ -1847,21 +1847,21 @@ out_free:
return ret;
 }
 
-int set_memory_array_uc(unsigned long *addr, int addrinarray)
+int set_memory_array_uc(unsigned long *addr, int numpages)
 {
-   return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_UC_MINUS);
+   return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_UC_MINUS);
 }
 EXPORT_SYMBOL(set_memory_array_uc);
 
-int set_memory_array_wc(unsigned long *addr, int addrinarray)
+int set_memory_array_wc(unsigned long *addr, int numpages)
 {
-   return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WC);
+   return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WC);
 }
 EXPORT_SYMBOL(set_memory_array_wc);
 
-int set_memory_array_wt(unsigned long *addr, int addrinarray)
+int set_memory_array_wt(unsigned long *addr, int numpages)
 {
-   return _set_memory_array(addr, addrinarray, _PAGE_CACHE_MODE_WT);
+   return _set_memory_array(addr, numpages, _PAGE_CACHE_MODE_WT);
 }
 EXPORT_SYMBOL_GPL(set_memory_array_wt);
 
@@ -1941,18 +1941,18 @@ int set_memory_wb(unsigned long addr, int numpages)
 }
 EXPORT_SYMBOL(set_memory_wb);
 
-int set_memory_array_wb(unsigned long *addr, int addrinarray)
+int set_memory_array_wb(unsigned long *addr, int numpages)
 {
int i;
int ret;
 
/* WB cache mode is hard wired to all cache attribute bits being 0 */
-   ret = change_page_attr_clear(addr, addrinarray,
+   ret = change_page_attr_clear(addr, numpages,
  __pgprot(_PAGE_CACHE_MASK), 1);
if (ret)
return ret;
 
-   for (i = 0; i < addrinarray; i++)
+   for (i = 0; i < numpages; i++)
free_memtype(__pa(addr[i]), __pa(addr[i]) + PAGE_SIZE);
 
return 0;
@@ -2082,7 +2082,7 @@ int set_pages_uc(struct page *page, int numpages)
 }
 EXPORT_SYMBOL(set_pages_uc);
 
-static int _set_pages_array(struct page **pages, int addrinarray,
+static int _set_pages_array(struct page **pages, int numpages,
enum page_cache_mode new_type)
 {
unsigned long start;
@@ -2092,7 +2092,7 @@ static int _set_pages_array(struct page **pages, int 
addrinarray,
int free_idx;
int ret;
 
-   for (i = 0; i < addrinarray; i++) {
+   for (i = 0; i < numpages; i++) {
if (PageHighMem(pages[i]))
continue;
start = page_to_pfn(pages[i]) << PAGE_SHIFT;
@@ -2105,10 +2105,10 @@ static int 

[tip:x86/mm] x86/mm/cpa: Better use CLFLUSHOPT

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  c38116bb940ae37f51fccd315b420ee5961dcb76
Gitweb: https://git.kernel.org/tip/c38116bb940ae37f51fccd315b420ee5961dcb76
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:52 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:29 +0100

x86/mm/cpa: Better use CLFLUSHOPT

Currently we issue an MFENCE before and after flushing a range. This
means that if we flush a bunch of single-page ranges -- like with the
CPA array -- we issue a whole bunch of superfluous MFENCEs.

Reorganize the code a little to avoid this.

[ mingo: capitalize instructions, tweak changelog and comments. ]
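
For illustration only, a userspace sketch of the reorganization: an
unfenced per-line flush helper plus a fenced wrapper, so that a batch
of ranges can share a single pair of fences.  clflushopt_line() and
fence() are stand-ins for the CLFLUSHOPT and MFENCE instructions, not
the kernel code:

#include <stdint.h>
#include <stdio.h>

#define CACHELINE 64UL

static void clflushopt_line(void *p) { printf("clflushopt %p\n", p); }
static void fence(void)              { puts("mfence"); }

/* unfenced helper: flush one byte range, cache line by cache line */
static void flush_range_opt(void *vaddr, size_t size)
{
        uintptr_t p   = (uintptr_t)vaddr & ~(CACHELINE - 1);
        uintptr_t end = (uintptr_t)vaddr + size;

        for (; p < end; p += CACHELINE)
                clflushopt_line((void *)p);
}

/* fenced wrapper, analogous to clflush_cache_range() */
static void flush_range(void *vaddr, size_t size)
{
        fence();
        flush_range_opt(vaddr, size);
        fence();
}

int main(void)
{
        static char page[4][256];
        int i;

        /* batch: one fence pair around all ranges, not one pair per range */
        fence();
        for (i = 0; i < 4; i++)
                flush_range_opt(page[i], sizeof(page[i]));
        fence();

        flush_range(page[0], sizeof(page[0]));  /* single range stays fenced */
        return 0;
}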

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.626999...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr.c | 29 +
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 85ef53b86fa0..7d05149995dc 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -251,15 +251,7 @@ static unsigned long __cpa_addr(struct cpa_data *cpa, 
unsigned long idx)
  * Flushing functions
  */
 
-/**
- * clflush_cache_range - flush a cache range with clflush
- * @vaddr: virtual start address
- * @size:  number of bytes to flush
- *
- * clflushopt is an unordered instruction which needs fencing with mfence or
- * sfence to avoid ordering issues.
- */
-void clflush_cache_range(void *vaddr, unsigned int size)
+static void clflush_cache_range_opt(void *vaddr, unsigned int size)
 {
const unsigned long clflush_size = boot_cpu_data.x86_clflush_size;
void *p = (void *)((unsigned long)vaddr & ~(clflush_size - 1));
@@ -268,11 +260,22 @@ void clflush_cache_range(void *vaddr, unsigned int size)
if (p >= vend)
return;
 
-   mb();
-
for (; p < vend; p += clflush_size)
clflushopt(p);
+}
 
+/**
+ * clflush_cache_range - flush a cache range with clflush
+ * @vaddr: virtual start address
+ * @size:  number of bytes to flush
+ *
+ * CLFLUSHOPT is an unordered instruction which needs fencing with MFENCE or
+ * SFENCE to avoid ordering issues.
+ */
+void clflush_cache_range(void *vaddr, unsigned int size)
+{
+   mb();
+   clflush_cache_range_opt(vaddr, size);
mb();
 }
 EXPORT_SYMBOL_GPL(clflush_cache_range);
@@ -333,6 +336,7 @@ static void cpa_flush(struct cpa_data *data, int cache)
if (!cache)
return;
 
+   mb();
for (i = 0; i < cpa->numpages; i++) {
unsigned long addr = __cpa_addr(cpa, i);
unsigned int level;
@@ -343,8 +347,9 @@ static void cpa_flush(struct cpa_data *data, int cache)
 * Only flush present addresses:
 */
if (pte && (pte_val(*pte) & _PAGE_PRESENT))
-   clflush_cache_range((void *)addr, PAGE_SIZE);
+   clflush_cache_range_opt((void *)addr, PAGE_SIZE);
}
+   mb();
 }
 
 static bool overlaps(unsigned long r1_start, unsigned long r1_end,


[tip:x86/mm] x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single cpa_flush() function

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  fe0937b24ff5d7b343b9922201e469f9a6009d9d
Gitweb: https://git.kernel.org/tip/fe0937b24ff5d7b343b9922201e469f9a6009d9d
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:51 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:28 +0100

x86/mm/cpa: Fold cpa_flush_range() and cpa_flush_array() into a single 
cpa_flush() function

Note that the cache flush loop in cpa_flush_*() is identical when we
use __cpa_addr(); further observe that flush_tlb_kernel_range() is a
special case of the cpa_flush_array() TLB invalidation code.

This means the two functions are virtually identical. Fold them into a
single cpa_flush() call.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.559855...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr.c | 92 ++
 1 file changed, 18 insertions(+), 74 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 12b69263e501..85ef53b86fa0 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -304,51 +304,7 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 }
 
-static bool __inv_flush_all(int cache)
-{
-   BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
-
-   if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
-   cpa_flush_all(cache);
-   return true;
-   }
-
-   return false;
-}
-
-static void cpa_flush_range(unsigned long start, int numpages, int cache)
-{
-   unsigned int i, level;
-   unsigned long addr;
-
-   WARN_ON(PAGE_ALIGN(start) != start);
-
-   if (__inv_flush_all(cache))
-   return;
-
-   flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
-
-   if (!cache)
-   return;
-
-   /*
-* We only need to flush on one CPU,
-* clflush is a MESI-coherent instruction that
-* will cause all other CPUs to flush the same
-* cachelines:
-*/
-   for (i = 0, addr = start; i < numpages; i++, addr += PAGE_SIZE) {
-   pte_t *pte = lookup_address(addr, );
-
-   /*
-* Only flush present addresses:
-*/
-   if (pte && (pte_val(*pte) & _PAGE_PRESENT))
-   clflush_cache_range((void *) addr, PAGE_SIZE);
-   }
-}
-
-void __cpa_flush_array(void *data)
+void __cpa_flush_tlb(void *data)
 {
struct cpa_data *cpa = data;
unsigned int i;
@@ -357,33 +313,31 @@ void __cpa_flush_array(void *data)
__flush_tlb_one_kernel(__cpa_addr(cpa, i));
 }
 
-static void cpa_flush_array(struct cpa_data *cpa, int cache)
+static void cpa_flush(struct cpa_data *data, int cache)
 {
+   struct cpa_data *cpa = data;
unsigned int i;
 
-   if (cpa_check_flush_all(cache))
+   BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+
+   if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
+   cpa_flush_all(cache);
return;
+   }
 
if (cpa->numpages <= tlb_single_page_flush_ceiling)
-   on_each_cpu(__cpa_flush_array, cpa, 1);
+   on_each_cpu(__cpa_flush_tlb, cpa, 1);
else
flush_tlb_all();
 
if (!cache)
return;
 
-   /*
-* We only need to flush on one CPU,
-* clflush is a MESI-coherent instruction that
-* will cause all other CPUs to flush the same
-* cachelines:
-*/
for (i = 0; i < cpa->numpages; i++) {
unsigned long addr = __cpa_addr(cpa, i);
unsigned int level;
-   pte_t *pte;
 
-   pte = lookup_address(addr, );
+   pte_t *pte = lookup_address(addr, );
 
/*
 * Only flush present addresses:
@@ -1698,7 +1652,6 @@ static int change_page_attr_set_clr(unsigned long *addr, 
int numpages,
 {
struct cpa_data cpa;
int ret, cache, checkalias;
-   unsigned long baddr = 0;
 
memset(, 0, sizeof(cpa));
 
@@ -1732,11 +1685,6 @@ static int change_page_attr_set_clr(unsigned long *addr, 
int numpages,
 */
WARN_ON_ONCE(1);
}
-   /*
-* Save address for cache flush. *addr is modified in the call
-* to __change_page_attr_set_clr() below.
-*/
-   baddr = make_addr_canonical_again(*addr);
}
 
/* Must avoid aliasing mappings in the highmem code */
@@ -1784,11 +1732,7 @@ static int change_page_attr_set_clr(unsigned long *addr, 
int numpages,
goto out;
}

[tip:x86/mm] x86/mm/cpa: Make cpa_data::numpages invariant

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  83b4e39146aa70913580966e0f2b78b7c3492760
Gitweb: https://git.kernel.org/tip/83b4e39146aa70913580966e0f2b78b7c3492760
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:50 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:27 +0100

x86/mm/cpa: Make cpa_data::numpages invariant

Make sure __change_page_attr_set_clr() doesn't modify cpa->numpages.
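
A rough sketch of the pattern (a simplified stand-in structure, not
the kernel's cpa_data): work off a local copy of the request size and
restore the original on exit, so the field is invariant as far as the
caller can tell:

#include <stdio.h>

struct req {
        unsigned long numpages;  /* request size; must stay invariant */
        unsigned long curpage;   /* progress through the request */
};

/* pretend the low-level code handled up to 4 pages at a time */
static unsigned long process_chunk(unsigned long want)
{
        return want > 4 ? 4 : want;
}

static int change_pages(struct req *r)
{
        unsigned long numpages = r->numpages;   /* remember request size */
        unsigned long rempages = numpages;

        while (rempages) {
                unsigned long done = process_chunk(rempages);

                rempages   -= done;
                r->curpage += done;
        }

        r->numpages = numpages;                 /* restore the invariant */
        return 0;
}

int main(void)
{
        struct req r = { .numpages = 10, .curpage = 0 };

        change_pages(&r);
        printf("numpages=%lu curpage=%lu\n", r.numpages, r.curpage);
        return 0;
}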

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.493000...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr.c | 21 +
 1 file changed, 13 insertions(+), 8 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 351874259a71..12b69263e501 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1625,14 +1625,15 @@ static int cpa_process_alias(struct cpa_data *cpa)
 static int __change_page_attr_set_clr(struct cpa_data *cpa, int checkalias)
 {
unsigned long numpages = cpa->numpages;
-   int ret;
+   unsigned long rempages = numpages;
+   int ret = 0;
 
-   while (numpages) {
+   while (rempages) {
/*
 * Store the remaining nr of pages for the large page
 * preservation check.
 */
-   cpa->numpages = numpages;
+   cpa->numpages = rempages;
/* for array changes, we can't use large page */
if (cpa->flags & (CPA_ARRAY | CPA_PAGES_ARRAY))
cpa->numpages = 1;
@@ -1643,12 +1644,12 @@ static int __change_page_attr_set_clr(struct cpa_data 
*cpa, int checkalias)
if (!debug_pagealloc_enabled())
spin_unlock(_lock);
if (ret)
-   return ret;
+   goto out;
 
if (checkalias) {
ret = cpa_process_alias(cpa);
if (ret)
-   return ret;
+   goto out;
}
 
/*
@@ -1656,11 +1657,15 @@ static int __change_page_attr_set_clr(struct cpa_data 
*cpa, int checkalias)
 * CPA operation. Either a large page has been
 * preserved or a single page update happened.
 */
-   BUG_ON(cpa->numpages > numpages || !cpa->numpages);
-   numpages -= cpa->numpages;
+   BUG_ON(cpa->numpages > rempages || !cpa->numpages);
+   rempages -= cpa->numpages;
cpa->curpage += cpa->numpages;
}
-   return 0;
+
+out:
+   /* Restore the original numpages */
+   cpa->numpages = numpages;
+   return ret;
 }
 
 /*


[tip:x86/mm] x86/mm/cpa: Optimize cpa_flush_array() TLB invalidation

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  935f5839827ef54b53406e80906f7c355eb73c1b
Gitweb: https://git.kernel.org/tip/935f5839827ef54b53406e80906f7c355eb73c1b
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:49 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:26 +0100

x86/mm/cpa: Optimize cpa_flush_array() TLB invalidation

Instead of punting and doing flush_tlb_all(), do the same as
flush_tlb_kernel_range() does and use single-page invalidations.
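
A sketch of the resulting heuristic; flush_one() and flush_all() stand
in for __flush_tlb_one_kernel() and flush_tlb_all(), and the ceiling
of 33 mirrors the value the kernel uses (treat it as an assumption
here):

#include <stdio.h>

static unsigned long tlb_single_page_flush_ceiling = 33;

static void flush_one(unsigned long addr) { printf("invlpg %#lx\n", addr); }
static void flush_all(void)               { puts("full TLB flush"); }

static void flush_pages(const unsigned long *addr, unsigned long n)
{
        unsigned long i;

        if (n > tlb_single_page_flush_ceiling) {
                flush_all();            /* too many pages: punt once */
                return;
        }

        for (i = 0; i < n; i++)         /* few pages: invalidate each one */
                flush_one(addr[i]);
}

int main(void)
{
        unsigned long a[3] = {
                0xffff888000000000UL,
                0xffff888000001000UL,
                0xffff888000002000UL,
        };

        flush_pages(a, 3);
        return 0;
}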

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.430001...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/mm_internal.h |  2 ++
 arch/x86/mm/pageattr.c| 42 --
 arch/x86/mm/tlb.c |  4 +++-
 3 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/mm_internal.h b/arch/x86/mm/mm_internal.h
index 4e1f6e1b8159..319bde386d5f 100644
--- a/arch/x86/mm/mm_internal.h
+++ b/arch/x86/mm/mm_internal.h
@@ -19,4 +19,6 @@ extern int after_bootmem;
 
 void update_cache_mode_entry(unsigned entry, enum page_cache_mode cache);
 
+extern unsigned long tlb_single_page_flush_ceiling;
+
 #endif /* __X86_MM_INTERNAL_H */
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index afa98b7b6050..351874259a71 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -26,6 +26,8 @@
 #include 
 #include 
 
+#include "mm_internal.h"
+
 /*
  * The current flushing context - we pass it instead of 5 arguments:
  */
@@ -346,16 +348,26 @@ static void cpa_flush_range(unsigned long start, int 
numpages, int cache)
}
 }
 
-static void cpa_flush_array(unsigned long baddr, unsigned long *start,
-   int numpages, int cache,
-   int in_flags, struct page **pages)
+void __cpa_flush_array(void *data)
 {
-   unsigned int i, level;
+   struct cpa_data *cpa = data;
+   unsigned int i;
 
-   if (__inv_flush_all(cache))
+   for (i = 0; i < cpa->numpages; i++)
+   __flush_tlb_one_kernel(__cpa_addr(cpa, i));
+}
+
+static void cpa_flush_array(struct cpa_data *cpa, int cache)
+{
+   unsigned int i;
+
+   if (cpa_check_flush_all(cache))
return;
 
-   flush_tlb_all();
+   if (cpa->numpages <= tlb_single_page_flush_ceiling)
+   on_each_cpu(__cpa_flush_array, cpa, 1);
+   else
+   flush_tlb_all();
 
if (!cache)
return;
@@ -366,15 +378,11 @@ static void cpa_flush_array(unsigned long baddr, unsigned 
long *start,
 * will cause all other CPUs to flush the same
 * cachelines:
 */
-   for (i = 0; i < numpages; i++) {
-   unsigned long addr;
+   for (i = 0; i < cpa->numpages; i++) {
+   unsigned long addr = __cpa_addr(cpa, i);
+   unsigned int level;
pte_t *pte;
 
-   if (in_flags & CPA_PAGES_ARRAY)
-   addr = (unsigned long)page_address(pages[i]);
-   else
-   addr = start[i];
-
pte = lookup_address(addr, );
 
/*
@@ -1771,12 +1779,10 @@ static int change_page_attr_set_clr(unsigned long 
*addr, int numpages,
goto out;
}
 
-   if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
-   cpa_flush_array(baddr, addr, numpages, cache,
-   cpa.flags, pages);
-   } else {
+   if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
+   cpa_flush_array(, cache);
+   else
cpa_flush_range(baddr, numpages, cache);
-   }
 
 out:
return ret;
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 03b6b4c2238d..999d6d8f0bef 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -15,6 +15,8 @@
 #include 
 #include 
 
+#include "mm_internal.h"
+
 /*
  * TLB flushing, formerly SMP-only
  * c/o Linus Torvalds.
@@ -721,7 +723,7 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
  *
  * This is in units of pages.
  */
-static unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
+unsigned long tlb_single_page_flush_ceiling __read_mostly = 33;
 
 void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
unsigned long end, unsigned int stride_shift,


[tip:x86/mm] x86/mm/cpa: Simplify the code after making cpa->vaddr invariant

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  5fe26b7a8f4693d532c7a3c3632e47e7d7016238
Gitweb: https://git.kernel.org/tip/5fe26b7a8f4693d532c7a3c3632e47e7d7016238
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:48 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:25 +0100

x86/mm/cpa: Simplify the code after making cpa->vaddr invariant

Since cpa->vaddr is now invariant, we can remove all the workarounds
that dealt with it changing.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.366619...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr-test.c |  7 ++-
 arch/x86/mm/pageattr.c  | 13 -
 2 files changed, 6 insertions(+), 14 deletions(-)

diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index b6b6468530f1..facce271e8b9 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -124,7 +124,6 @@ static int pageattr_test(void)
unsigned int level;
int i, k;
int err;
-   unsigned long test_addr;
 
if (print)
printk(KERN_INFO "CPA self-test:\n");
@@ -181,8 +180,7 @@ static int pageattr_test(void)
 
switch (i % 3) {
case 0:
-   test_addr = addr[i];
-   err = change_page_attr_set(_addr, len[i], 
PAGE_CPA_TEST, 0);
+   err = change_page_attr_set([i], len[i], 
PAGE_CPA_TEST, 0);
break;
 
case 1:
@@ -226,8 +224,7 @@ static int pageattr_test(void)
failed++;
continue;
}
-   test_addr = addr[i];
-   err = change_page_attr_clear(_addr, len[i], PAGE_CPA_TEST, 
0);
+   err = change_page_attr_clear([i], len[i], PAGE_CPA_TEST, 
0);
if (err < 0) {
printk(KERN_ERR "CPA reverting failed: %d\n", err);
failed++;
diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index ce8af3f08628..afa98b7b6050 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -1908,15 +1908,13 @@ EXPORT_SYMBOL_GPL(set_memory_array_wt);
 int _set_memory_wc(unsigned long addr, int numpages)
 {
int ret;
-   unsigned long addr_copy = addr;
 
ret = change_page_attr_set(, numpages,
   cachemode2pgprot(_PAGE_CACHE_MODE_UC_MINUS),
   0);
if (!ret) {
-   ret = change_page_attr_set_clr(_copy, numpages,
-  cachemode2pgprot(
-   _PAGE_CACHE_MODE_WC),
+   ret = change_page_attr_set_clr(, numpages,
+  
cachemode2pgprot(_PAGE_CACHE_MODE_WC),
   __pgprot(_PAGE_CACHE_MASK),
   0, 0, NULL);
}
@@ -2064,7 +2062,6 @@ int set_memory_global(unsigned long addr, int numpages)
 static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 {
struct cpa_data cpa;
-   unsigned long start;
int ret;
 
/* Nothing to do if memory encryption is not active */
@@ -2075,8 +2072,6 @@ static int __set_memory_enc_dec(unsigned long addr, int 
numpages, bool enc)
if (WARN_ONCE(addr & ~PAGE_MASK, "misaligned address: %#lx\n", addr))
addr &= PAGE_MASK;
 
-   start = addr;
-
memset(, 0, sizeof(cpa));
cpa.vaddr = 
cpa.numpages = numpages;
@@ -2091,7 +2086,7 @@ static int __set_memory_enc_dec(unsigned long addr, int 
numpages, bool enc)
/*
 * Before changing the encryption attribute, we need to flush caches.
 */
-   cpa_flush_range(start, numpages, 1);
+   cpa_flush_range(addr, numpages, 1);
 
ret = __change_page_attr_set_clr(, 1);
 
@@ -2102,7 +2097,7 @@ static int __set_memory_enc_dec(unsigned long addr, int 
numpages, bool enc)
 * in case TLB flushing gets optimized in the cpa_flush_range()
 * path use the same logic as above.
 */
-   cpa_flush_range(start, numpages, 0);
+   cpa_flush_range(addr, numpages, 0);
 
return ret;
 }


[tip:x86/mm] x86/mm/cpa: Make cpa_data::vaddr invariant

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  98bfc9b038cde1ce108f69a50720e394fe774cb7
Gitweb: https://git.kernel.org/tip/98bfc9b038cde1ce108f69a50720e394fe774cb7
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:47 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:24 +0100

x86/mm/cpa: Make cpa_data::vaddr invariant

Currently __change_page_attr_set_clr() will modify cpa->vaddr when
!(CPA_ARRAY | CPA_PAGES_ARRAY), whereas in the array cases it will
increment cpa->curpage.

Change __cpa_addr() such that its @idx argument also works in the
!array case and use cpa->curpage increments for all cases.

NOTE: since cpa_data::numpages is 'unsigned long', cpa_data::curpage
should be as well.
NOTE: after this, only cpa->numpages is still modified.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.295174...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr.c | 18 --
 1 file changed, 8 insertions(+), 10 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 6e6900ebea30..ce8af3f08628 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -35,11 +35,11 @@ struct cpa_data {
pgprot_tmask_set;
pgprot_tmask_clr;
unsigned long   numpages;
-   int flags;
+   unsigned long   curpage;
unsigned long   pfn;
-   unsignedforce_split : 1,
+   unsigned intflags;
+   unsigned intforce_split : 1,
force_static_prot   : 1;
-   int curpage;
struct page **pages;
 };
 
@@ -228,7 +228,7 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
 
 #endif
 
-static unsigned long __cpa_addr(struct cpa_data *cpa, int idx)
+static unsigned long __cpa_addr(struct cpa_data *cpa, unsigned long idx)
 {
if (cpa->flags & CPA_PAGES_ARRAY) {
struct page *page = cpa->pages[idx];
@@ -242,7 +242,7 @@ static unsigned long __cpa_addr(struct cpa_data *cpa, int 
idx)
if (cpa->flags & CPA_ARRAY)
return cpa->vaddr[idx];
 
-   return *cpa->vaddr;
+   return *cpa->vaddr + idx * PAGE_SIZE;
 }
 
 /*
@@ -1581,6 +1581,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa = *cpa;
alias_cpa.vaddr = 
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+   alias_cpa.curpage = 0;
 
ret = __change_page_attr_set_clr(_cpa, 0);
if (ret)
@@ -1600,6 +1601,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
alias_cpa = *cpa;
alias_cpa.vaddr = _cpa_vaddr;
alias_cpa.flags &= ~(CPA_PAGES_ARRAY | CPA_ARRAY);
+   alias_cpa.curpage = 0;
 
/*
 * The high mapping range is imprecise, so ignore the
@@ -1648,11 +1650,7 @@ static int __change_page_attr_set_clr(struct cpa_data 
*cpa, int checkalias)
 */
BUG_ON(cpa->numpages > numpages || !cpa->numpages);
numpages -= cpa->numpages;
-   if (cpa->flags & (CPA_PAGES_ARRAY | CPA_ARRAY))
-   cpa->curpage++;
-   else
-   *cpa->vaddr += cpa->numpages * PAGE_SIZE;
-
+   cpa->curpage += cpa->numpages;
}
return 0;
 }


[tip:x86/mm] x86/mm/cpa: Add ARRAY and PAGES_ARRAY selftests

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  ecc729f1f47142ad31741549f400b611435c1af7
Gitweb: https://git.kernel.org/tip/ecc729f1f47142ad31741549f400b611435c1af7
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:45 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:22 +0100

x86/mm/cpa: Add ARRAY and PAGES_ARRAY selftests

The current pageattr-test code only exercises the regular range
interface; add code that also tests the array and pages interfaces.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.162771...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr-test.c | 28 
 1 file changed, 24 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/pageattr-test.c b/arch/x86/mm/pageattr-test.c
index 08f8f76a4852..b6b6468530f1 100644
--- a/arch/x86/mm/pageattr-test.c
+++ b/arch/x86/mm/pageattr-test.c
@@ -23,7 +23,8 @@
 static __read_mostly int print = 1;
 
 enum {
-   NTEST   = 400,
+   NTEST   = 3 * 100,
+   NPAGES  = 100,
 #ifdef CONFIG_X86_64
LPS = (1 << PMD_SHIFT),
 #elif defined(CONFIG_X86_PAE)
@@ -110,6 +111,9 @@ static int print_split(struct split_state *s)
 static unsigned long addr[NTEST];
 static unsigned int len[NTEST];
 
+static struct page *pages[NPAGES];
+static unsigned long addrs[NPAGES];
+
 /* Change the global bit on random pages in the direct mapping */
 static int pageattr_test(void)
 {
@@ -137,7 +141,7 @@ static int pageattr_test(void)
unsigned long pfn = prandom_u32() % max_pfn_mapped;
 
addr[i] = (unsigned long)__va(pfn << PAGE_SHIFT);
-   len[i] = prandom_u32() % 100;
+   len[i] = prandom_u32() % NPAGES;
len[i] = min_t(unsigned long, len[i], max_pfn_mapped - pfn - 1);
 
if (len[i] == 0)
@@ -167,14 +171,30 @@ static int pageattr_test(void)
break;
}
__set_bit(pfn + k, bm);
+   addrs[k] = addr[i] + k*PAGE_SIZE;
+   pages[k] = pfn_to_page(pfn + k);
}
if (!addr[i] || !pte || !k) {
addr[i] = 0;
continue;
}
 
-   test_addr = addr[i];
-   err = change_page_attr_set(_addr, len[i], PAGE_CPA_TEST, 
0);
+   switch (i % 3) {
+   case 0:
+   test_addr = addr[i];
+   err = change_page_attr_set(_addr, len[i], 
PAGE_CPA_TEST, 0);
+   break;
+
+   case 1:
+   err = change_page_attr_set(addrs, len[1], 
PAGE_CPA_TEST, 1);
+   break;
+
+   case 2:
+   err = cpa_set_pages_array(pages, len[i], PAGE_CPA_TEST);
+   break;
+   }
+
+
if (err < 0) {
printk(KERN_ERR "CPA %d failed %d\n", i, err);
failed++;


[tip:x86/mm] x86/mm/cpa: Add __cpa_addr() helper

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  16ebf031e8ab73779a382c9f2b097891da6af923
Gitweb: https://git.kernel.org/tip/16ebf031e8ab73779a382c9f2b097891da6af923
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:46 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:54:23 +0100

x86/mm/cpa: Add __cpa_addr() helper

The code to compute the virtual address of a cpa_data is duplicated;
introduce a helper before more copies happen.

Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: tom.stde...@amd.com
Cc: dave.han...@intel.com
Link: http://lkml.kernel.org/r/20181203171043.229119...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr.c | 38 +++---
 1 file changed, 19 insertions(+), 19 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a1bcde35db4c..6e6900ebea30 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -228,6 +228,23 @@ static bool __cpa_pfn_in_highmap(unsigned long pfn)
 
 #endif
 
+static unsigned long __cpa_addr(struct cpa_data *cpa, int idx)
+{
+   if (cpa->flags & CPA_PAGES_ARRAY) {
+   struct page *page = cpa->pages[idx];
+
+   if (unlikely(PageHighMem(page)))
+   return 0;
+
+   return (unsigned long)page_address(page);
+   }
+
+   if (cpa->flags & CPA_ARRAY)
+   return cpa->vaddr[idx];
+
+   return *cpa->vaddr;
+}
+
 /*
  * Flushing functions
  */
@@ -1476,15 +1493,7 @@ static int __change_page_attr(struct cpa_data *cpa, int 
primary)
unsigned int level;
pte_t *kpte, old_pte;
 
-   if (cpa->flags & CPA_PAGES_ARRAY) {
-   struct page *page = cpa->pages[cpa->curpage];
-   if (unlikely(PageHighMem(page)))
-   return 0;
-   address = (unsigned long)page_address(page);
-   } else if (cpa->flags & CPA_ARRAY)
-   address = cpa->vaddr[cpa->curpage];
-   else
-   address = *cpa->vaddr;
+   address = __cpa_addr(cpa, cpa->curpage);
 repeat:
kpte = _lookup_address_cpa(cpa, address, );
if (!kpte)
@@ -1565,16 +1574,7 @@ static int cpa_process_alias(struct cpa_data *cpa)
 * No need to redo, when the primary call touched the direct
 * mapping already:
 */
-   if (cpa->flags & CPA_PAGES_ARRAY) {
-   struct page *page = cpa->pages[cpa->curpage];
-   if (unlikely(PageHighMem(page)))
-   return 0;
-   vaddr = (unsigned long)page_address(page);
-   } else if (cpa->flags & CPA_ARRAY)
-   vaddr = cpa->vaddr[cpa->curpage];
-   else
-   vaddr = *cpa->vaddr;
-
+   vaddr = __cpa_addr(cpa, cpa->curpage);
if (!(within(vaddr, PAGE_OFFSET,
PAGE_OFFSET + (max_pfn_mapped << PAGE_SHIFT {
 


[tip:x86/mm] x86/mm/cpa: Fix cpa_flush_array() TLB invalidation

2018-12-17 Thread tip-bot for Peter Zijlstra
Commit-ID:  721066dfd4d5c0fee5772c777d6930d0f423b4eb
Gitweb: https://git.kernel.org/tip/721066dfd4d5c0fee5772c777d6930d0f423b4eb
Author: Peter Zijlstra 
AuthorDate: Mon, 3 Dec 2018 18:03:44 +0100
Committer:  Ingo Molnar 
CommitDate: Mon, 17 Dec 2018 18:48:09 +0100

x86/mm/cpa: Fix cpa_flush_array() TLB invalidation

In commit:

  a7295fd53c39 ("x86/mm/cpa: Use flush_tlb_kernel_range()")

I misread the CPA array code and incorrectly used
flush_tlb_kernel_range(), resulting in missing TLB flushes and
consequent failures.

Instead do a full invalidate in this case -- for now.
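
A small standalone check of why the original call was wrong: flushing
the contiguous range [addr[0], addr[0] + n*PAGE_SIZE) only covers an
address array when its entries happen to be consecutive pages.  Plain
arithmetic, no kernel API involved:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

static bool range_flush_covers(const unsigned long *addr, unsigned long n)
{
        unsigned long start = addr[0];
        unsigned long end   = addr[0] + n * PAGE_SIZE;
        unsigned long i;

        for (i = 0; i < n; i++)
                if (addr[i] < start || addr[i] >= end)
                        return false;   /* this page was never flushed */
        return true;
}

int main(void)
{
        /* scattered pages, as CPA_ARRAY callers typically pass */
        unsigned long scattered[3]  = { 0x100000UL, 0x900000UL, 0x200000UL };
        /* consecutive pages: the only case the range flush handles */
        unsigned long contiguous[3] = { 0x100000UL, 0x101000UL, 0x102000UL };

        printf("scattered covered:  %d\n", range_flush_covers(scattered, 3));
        printf("contiguous covered: %d\n", range_flush_covers(contiguous, 3));
        return 0;
}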

Reported-by: StDenis, Tom 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Dave Hansen 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: dave.han...@intel.com
Fixes: a7295fd53c39 ("x86/mm/cpa: Use flush_tlb_kernel_range()")
Link: http://lkml.kernel.org/r/20181203171043.089868...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/mm/pageattr.c | 24 
 1 file changed, 16 insertions(+), 8 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index db7a10082238..a1bcde35db4c 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -285,20 +285,16 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 }
 
-static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
+static bool __inv_flush_all(int cache)
 {
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
-   WARN_ON(PAGE_ALIGN(start) != start);
-
if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
return true;
}
 
-   flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
-
-   return !cache;
+   return false;
 }
 
 static void cpa_flush_range(unsigned long start, int numpages, int cache)
@@ -306,7 +302,14 @@ static void cpa_flush_range(unsigned long start, int 
numpages, int cache)
unsigned int i, level;
unsigned long addr;
 
-   if (__cpa_flush_range(start, numpages, cache))
+   WARN_ON(PAGE_ALIGN(start) != start);
+
+   if (__inv_flush_all(cache))
+   return;
+
+   flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
+
+   if (!cache)
return;
 
/*
@@ -332,7 +335,12 @@ static void cpa_flush_array(unsigned long baddr, unsigned 
long *start,
 {
unsigned int i, level;
 
-   if (__cpa_flush_range(baddr, numpages, cache))
+   if (__inv_flush_all(cache))
+   return;
+
+   flush_tlb_all();
+
+   if (!cache)
return;
 
/*


[tip:x86/pti] sched/smt: Make sched_smt_present track topology

2018-11-28 Thread tip-bot for Peter Zijlstra (Intel)
Commit-ID:  c5511d03ec090980732e929c318a7a6374b5550e
Gitweb: https://git.kernel.org/tip/c5511d03ec090980732e929c318a7a6374b5550e
Author: Peter Zijlstra (Intel) 
AuthorDate: Sun, 25 Nov 2018 19:33:36 +0100
Committer:  Thomas Gleixner 
CommitDate: Wed, 28 Nov 2018 11:57:06 +0100

sched/smt: Make sched_smt_present track topology

Currently the 'sched_smt_present' static key is enabled when SMT
topology is observed at CPU bringup, but it is never disabled. However,
there is demand to also disable the key when the topology changes such
that there is no SMT present anymore.

Implement this by making the key count the number of cores that have
SMT enabled.

In particular, the SMT topology bits are set before interrupts are
enabled and, similarly, are cleared after interrupts are disabled for
the last time and the CPU dies.
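
A userspace model of the counting scheme (an atomic counter stands in
for the static key; none of this is the scheduler API): SMT is
reported present while at least one core has a second online sibling:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int smt_cores;    /* number of cores with SMT active */

static bool sched_smt_active(void)
{
        return atomic_load(&smt_cores) > 0;
}

/* CPU comes up and its core now has exactly two online siblings */
static void cpu_up(int core_siblings)
{
        if (core_siblings == 2)
                atomic_fetch_add(&smt_cores, 1);
}

/* CPU goes down and its core drops back to a single online sibling */
static void cpu_down(int core_siblings)
{
        if (core_siblings == 2)
                atomic_fetch_sub(&smt_cores, 1);
}

int main(void)
{
        cpu_up(1);                      /* first thread of the core */
        cpu_up(2);                      /* sibling appears -> SMT on */
        printf("SMT present: %d\n", sched_smt_active());

        cpu_down(2);                    /* sibling goes away -> SMT off */
        printf("SMT present: %d\n", sched_smt_active());
        return 0;
}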

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Ingo Molnar 
Cc: Andy Lutomirski 
Cc: Linus Torvalds 
Cc: Jiri Kosina 
Cc: Tom Lendacky 
Cc: Josh Poimboeuf 
Cc: Andrea Arcangeli 
Cc: David Woodhouse 
Cc: Tim Chen 
Cc: Andi Kleen 
Cc: Dave Hansen 
Cc: Casey Schaufler 
Cc: Asit Mallick 
Cc: Arjan van de Ven 
Cc: Jon Masters 
Cc: Waiman Long 
Cc: Greg KH 
Cc: Dave Stewart 
Cc: Kees Cook 
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20181125185004.246110...@linutronix.de


---
 kernel/sched/core.c | 19 +++
 1 file changed, 11 insertions(+), 8 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 091e089063be..6fedf3a98581 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -5738,15 +5738,10 @@ int sched_cpu_activate(unsigned int cpu)
 
 #ifdef CONFIG_SCHED_SMT
/*
-* The sched_smt_present static key needs to be evaluated on every
-* hotplug event because at boot time SMT might be disabled when
-* the number of booted CPUs is limited.
-*
-* If then later a sibling gets hotplugged, then the key would stay
-* off and SMT scheduling would never be functional.
+* When going up, increment the number of cores with SMT present.
 */
-   if (cpumask_weight(cpu_smt_mask(cpu)) > 1)
-   static_branch_enable_cpuslocked(_smt_present);
+   if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+   static_branch_inc_cpuslocked(_smt_present);
 #endif
set_cpu_active(cpu, true);
 
@@ -5790,6 +5785,14 @@ int sched_cpu_deactivate(unsigned int cpu)
 */
synchronize_rcu_mult(call_rcu, call_rcu_sched);
 
+#ifdef CONFIG_SCHED_SMT
+   /*
+* When going down, decrement the number of cores with SMT present.
+*/
+   if (cpumask_weight(cpu_smt_mask(cpu)) == 2)
+   static_branch_dec_cpuslocked(_smt_present);
+#endif
+
if (!sched_smp_initialized)
return 0;
 



[tip:perf/urgent] perf/x86/intel: Fix regression by default disabling perfmon v4 interrupt handling

2018-11-20 Thread tip-bot for Peter Zijlstra
Commit-ID:  2a5bf23d5b795d5df33dc284e8f5cf8b6a5b4042
Gitweb: https://git.kernel.org/tip/2a5bf23d5b795d5df33dc284e8f5cf8b6a5b4042
Author: Peter Zijlstra 
AuthorDate: Tue, 20 Nov 2018 18:08:42 +0100
Committer:  Ingo Molnar 
CommitDate: Tue, 20 Nov 2018 18:57:48 +0100

perf/x86/intel: Fix regression by default disabling perfmon v4 interrupt 
handling

Kyle Huey reported that 'rr', a replay debugger, broke due to the following 
commit:

  af3bdb991a5c ("perf/x86/intel: Add a separate Arch Perfmon v4 PMI handler")

Rework the 'disable_counter_freezing' __setup() parameter such that we
can explicitly enable/disable it and switch the default to disabled.

To this end, rename the parameter to "perf_v4_pmi=", which is a much
better description and allows requiring a bool argument.

[ mingo: Improved the changelog some more. ]
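
A sketch of the intended semantics, with parse_bool() standing in for
kstrtobool() (accepted spellings reduced for brevity): counter
freezing stays disabled unless an explicit perf_v4_pmi=1 is passed:

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

static bool disable_counter_freezing = true;    /* default: feature off */

static int parse_bool(const char *s, bool *res)
{
        if (!strcmp(s, "1") || !strcmp(s, "y") || !strcmp(s, "on")) {
                *res = true;
                return 0;
        }
        if (!strcmp(s, "0") || !strcmp(s, "n") || !strcmp(s, "off")) {
                *res = false;
                return 0;
        }
        return -1;                      /* reject anything but a bool */
}

static int perf_v4_pmi_setup(const char *arg)
{
        bool res;

        if (parse_bool(arg, &res))
                return -1;

        disable_counter_freezing = !res;        /* =1 enables freezing */
        return 0;
}

int main(void)
{
        perf_v4_pmi_setup("1");
        printf("counter freezing %s\n",
               disable_counter_freezing ? "disabled" : "enabled");
        return 0;
}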

Reported-by: Kyle Huey 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Alexander Shishkin 
Cc: Arnaldo Carvalho de Melo 
Cc: Jiri Olsa 
Cc: Kan Liang 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Robert O'Callahan 
Cc: Stephane Eranian 
Cc: Thomas Gleixner 
Cc: Vince Weaver 
Cc: a...@kernel.org
Link: 
http://lkml.kernel.org/r/20181120170842.gz2...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 Documentation/admin-guide/kernel-parameters.txt |  3 ++-
 arch/x86/events/intel/core.c| 12 
 2 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 81d1d5a74728..5463d5a4d85c 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -856,7 +856,8 @@
causing system reset or hang due to sending
INIT from AP to BSP.
 
-   disable_counter_freezing [HW]
+   perf_v4_pmi=[X86,INTEL]
+   Format: 
Disable Intel PMU counter freezing feature.
The feature only exists starting from
Arch Perfmon v4 (Skylake and newer).
diff --git a/arch/x86/events/intel/core.c b/arch/x86/events/intel/core.c
index 273c62e81546..af8bea9d4006 100644
--- a/arch/x86/events/intel/core.c
+++ b/arch/x86/events/intel/core.c
@@ -2306,14 +2306,18 @@ static int handle_pmi_common(struct pt_regs *regs, u64 
status)
return handled;
 }
 
-static bool disable_counter_freezing;
+static bool disable_counter_freezing = true;
 static int __init intel_perf_counter_freezing_setup(char *s)
 {
-   disable_counter_freezing = true;
-   pr_info("Intel PMU Counter freezing feature disabled\n");
+   bool res;
+
+   if (kstrtobool(s, ))
+   return -EINVAL;
+
+   disable_counter_freezing = !res;
return 1;
 }
-__setup("disable_counter_freezing", intel_perf_counter_freezing_setup);
+__setup("perf_v4_pmi=", intel_perf_counter_freezing_setup);
 
 /*
  * Simplified handler for Arch Perfmon v4:


[tip:locking/core] x86/asm: 'Simplify' GEN_*_RMWcc() macros

2018-10-16 Thread tip-bot for Peter Zijlstra
Commit-ID:  288e4521f0f6717909933116563e66bb894ae2af
Gitweb: https://git.kernel.org/tip/288e4521f0f6717909933116563e66bb894ae2af
Author: Peter Zijlstra 
AuthorDate: Wed, 3 Oct 2018 12:34:10 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Oct 2018 17:33:54 +0200

x86/asm: 'Simplify' GEN_*_RMWcc() macros

Currently the GEN_*_RMWcc() macros include a return statement, which
pretty much mandates we directly wrap them in an (inline) function.

Macros with return statements are tricky and, as per the above, limit
use, so remove the return statement and make them
statement-expressions. This allows them to be used more widely.

Also, shuffle the arguments a bit. Place the @cc argument third; this
makes it consistent between UNARY and BINARY, but more importantly, it
makes the @arg0 argument last.

Since the @arg0 argument is now last, we can do CPP trickery and make
it an optional argument, simplifying the users; 17 out of 18
occurrences do not need this argument.

Finally, change to asm symbolic names, instead of the numeric ordering
of operands, which allows us to get rid of __BINARY_RMWcc_ARG and get
cleaner code overall.
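
For illustration, a standalone sketch of the resulting shape follows: a
statement-expression macro that takes the condition code as a plain argument
and uses symbolic asm operand names, so callers can use it directly in a
return expression. This is only a simplified stand-in, not the kernel's
<asm/rmwcc.h> (which also handles asm-goto, clobbers and the LOCK prefix);
the names and constraints below are illustrative.

#include <stdbool.h>
#include <stdio.h>

/* Sketch of an RMWcc-style statement expression; @cc is pasted into the
 * setCC instruction, @val is a symbolic memory operand. */
#define UNARY_RMWcc(op, var, cc)                                        \
({                                                                      \
        bool ret__;                                                     \
        asm volatile (op " %[val]; set" #cc " %[ret]"                   \
                      : [val] "+m" (var), [ret] "=qm" (ret__)           \
                      : : "memory");                                    \
        ret__;                                                          \
})

static inline bool dec_and_test(int *v)
{
        /* usable directly in a return expression, unlike a macro with "return" */
        return UNARY_RMWcc("decl", *v, e);
}

int main(void)
{
        int x = 2;

        printf("%d\n", dec_and_test(&x));       /* 0: 2 -> 1 */
        printf("%d\n", dec_and_test(&x));       /* 1: 1 -> 0 */
        return 0;
}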

Signed-off-by: Peter Zijlstra (Intel) 
Cc: jbeul...@suse.com
Cc: Linus Torvalds 
Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Brian Gerst 
Cc: Denys Vlasenko 
Cc: H. Peter Anvin 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: b...@alien8.de
Cc: h...@linux.intel.com
Link: https://lkml.kernel.org/r/20181003130957.108960...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/atomic.h  |  8 ++---
 arch/x86/include/asm/atomic64_64.h |  8 ++---
 arch/x86/include/asm/bitops.h  |  9 ++---
 arch/x86/include/asm/local.h   |  8 ++---
 arch/x86/include/asm/preempt.h |  2 +-
 arch/x86/include/asm/refcount.h| 13 +++
 arch/x86/include/asm/rmwcc.h   | 69 ++
 7 files changed, 64 insertions(+), 53 deletions(-)

diff --git a/arch/x86/include/asm/atomic.h b/arch/x86/include/asm/atomic.h
index ce84388e540c..ea3d95275b43 100644
--- a/arch/x86/include/asm/atomic.h
+++ b/arch/x86/include/asm/atomic.h
@@ -82,7 +82,7 @@ static __always_inline void arch_atomic_sub(int i, atomic_t 
*v)
  */
 static __always_inline bool arch_atomic_sub_and_test(int i, atomic_t *v)
 {
-   GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, "er", i, "%0", e);
+   return GEN_BINARY_RMWcc(LOCK_PREFIX "subl", v->counter, e, "er", i);
 }
 #define arch_atomic_sub_and_test arch_atomic_sub_and_test
 
@@ -122,7 +122,7 @@ static __always_inline void arch_atomic_dec(atomic_t *v)
  */
 static __always_inline bool arch_atomic_dec_and_test(atomic_t *v)
 {
-   GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, "%0", e);
+   return GEN_UNARY_RMWcc(LOCK_PREFIX "decl", v->counter, e);
 }
 #define arch_atomic_dec_and_test arch_atomic_dec_and_test
 
@@ -136,7 +136,7 @@ static __always_inline bool 
arch_atomic_dec_and_test(atomic_t *v)
  */
 static __always_inline bool arch_atomic_inc_and_test(atomic_t *v)
 {
-   GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, "%0", e);
+   return GEN_UNARY_RMWcc(LOCK_PREFIX "incl", v->counter, e);
 }
 #define arch_atomic_inc_and_test arch_atomic_inc_and_test
 
@@ -151,7 +151,7 @@ static __always_inline bool 
arch_atomic_inc_and_test(atomic_t *v)
  */
 static __always_inline bool arch_atomic_add_negative(int i, atomic_t *v)
 {
-   GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, "er", i, "%0", s);
+   return GEN_BINARY_RMWcc(LOCK_PREFIX "addl", v->counter, s, "er", i);
 }
 #define arch_atomic_add_negative arch_atomic_add_negative
 
diff --git a/arch/x86/include/asm/atomic64_64.h 
b/arch/x86/include/asm/atomic64_64.h
index 5f851d92eecd..dadc20adba21 100644
--- a/arch/x86/include/asm/atomic64_64.h
+++ b/arch/x86/include/asm/atomic64_64.h
@@ -73,7 +73,7 @@ static inline void arch_atomic64_sub(long i, atomic64_t *v)
  */
 static inline bool arch_atomic64_sub_and_test(long i, atomic64_t *v)
 {
-   GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, "er", i, "%0", e);
+   return GEN_BINARY_RMWcc(LOCK_PREFIX "subq", v->counter, e, "er", i);
 }
 #define arch_atomic64_sub_and_test arch_atomic64_sub_and_test
 
@@ -115,7 +115,7 @@ static __always_inline void arch_atomic64_dec(atomic64_t *v)
  */
 static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
 {
-   GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, "%0", e);
+   return GEN_UNARY_RMWcc(LOCK_PREFIX "decq", v->counter, e);
 }
 #define arch_atomic64_dec_and_test arch_atomic64_dec_and_test
 
@@ -129,7 +129,7 @@ static inline bool arch_atomic64_dec_and_test(atomic64_t *v)
  */
 static inline bool arch_atomic64_inc_and_test(atomic64_t *v)
 {
-   GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, "%0", e);
+   return GEN_UNARY_RMWcc(LOCK_PREFIX "incq", v->counter, e);
 }
 #define arch_atomic64_inc_and_test arch_atomic64_inc_and_test
 
@@ -144,7 +144,7 @@ static inline bool 

[tip:locking/core] locking/qspinlock, x86: Provide liveness guarantee

2018-10-16 Thread tip-bot for Peter Zijlstra
Commit-ID:  7aa54be2976550f17c11a1c3e3630002dea39303
Gitweb: https://git.kernel.org/tip/7aa54be2976550f17c11a1c3e3630002dea39303
Author: Peter Zijlstra 
AuthorDate: Wed, 26 Sep 2018 13:01:20 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Oct 2018 17:33:54 +0200

locking/qspinlock, x86: Provide liveness guarantee

On x86 we cannot do fetch_or() with a single instruction and thus end up
using a cmpxchg loop, which reduces determinism. Replace the fetch_or()
with a composite operation: tas-pending + load.

Using two instructions of course opens a window we previously did not
have. Consider the scenario:

CPU0CPU1CPU2

 1) lock
  trylock -> (0,0,1)

 2) lock
  trylock /* fail */

 3) unlock -> (0,0,0)

 4) lock
  trylock -> (0,0,1)

 5)   tas-pending -> (0,1,1)
  load-val <- (0,1,0) from 3

 6)   clear-pending-set-locked -> (0,0,1)

  FAIL: _2_ owners

where 5) is our new composite operation. When we consider each part of
the qspinlock state as a separate variable (as we can when
_Q_PENDING_BITS == 8) then the above is entirely possible, because
tas-pending will only RmW the pending byte, so the later load is able
to observe prior tail and lock state (but not earlier than its own
trylock, which operates on the whole word, due to coherence).

To avoid this we need 2 things:

 - the load must come after the tas-pending (obviously, otherwise it
   can trivially observe prior state).

 - the tas-pending must be a full word RmW instruction, it cannot be an XCHGB 
for
   example, such that we cannot observe other state prior to setting
   pending.

On x86 we can realize this by using "LOCK BTS m32, r32" for
tas-pending followed by a regular load.

Note that observing later state is not a problem:

 - if we fail to observe a later unlock, we'll simply spin-wait for
   that store to become visible.

 - if we observe a later xchg_tail(), there is no difference from that
   xchg_tail() having taken place before the tas-pending.
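
For illustration only, a standalone userspace sketch of the composite
operation follows. It is not the kernel implementation: the _Q_* constants
are redefined locally to mirror the _Q_PENDING_BITS == 8 layout, and
GEN_BINARY_RMWcc()/atomic_read() are replaced by open-coded GCC inline asm
and __atomic_load_n() (x86-64, GCC assumed).

#include <stdio.h>

#define _Q_PENDING_OFFSET       8
#define _Q_PENDING_VAL          (1U << _Q_PENDING_OFFSET)
#define _Q_PENDING_MASK         (0xffU << _Q_PENDING_OFFSET)

/* full-word "lock bts" to set pending, then a plain load for the rest */
static unsigned int fetch_set_pending_acquire_sketch(unsigned int *lockword)
{
        unsigned int val = 0;
        unsigned char was_set;

        /* full-word RmW: cannot observe state older than our own trylock */
        asm volatile ("lock btsl %[bit], %[word]\n\t"
                      "setc %[old]"
                      : [word] "+m" (*lockword), [old] "=q" (was_set)
                      : [bit] "I" (_Q_PENDING_OFFSET)
                      : "memory", "cc");
        if (was_set)
                val |= _Q_PENDING_VAL;

        /* ordered after the locked RmW: can only see this or later state */
        val |= __atomic_load_n(lockword, __ATOMIC_RELAXED) & ~_Q_PENDING_MASK;

        return val;
}

int main(void)
{
        unsigned int lockword = 1;      /* locked, no pending, no tail */
        unsigned int old = fetch_set_pending_acquire_sketch(&lockword);

        printf("old=%#x now=%#x\n", old, lockword);
        return 0;
}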

Suggested-by: Will Deacon 
Reported-by: Thomas Gleixner 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Will Deacon 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: andrea.pa...@amarulasolutions.com
Cc: long...@redhat.com
Fixes: 59fb586b4a07 ("locking/qspinlock: Remove unbounded cmpxchg() loop from 
locking slowpath")
Link: https://lkml.kernel.org/r/20181003130957.183726...@infradead.org
Signed-off-by: Ingo Molnar 
---
 arch/x86/include/asm/qspinlock.h | 15 +++
 kernel/locking/qspinlock.c   | 16 +++-
 2 files changed, 30 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/qspinlock.h b/arch/x86/include/asm/qspinlock.h
index 3e70bed8a978..87623c6b13db 100644
--- a/arch/x86/include/asm/qspinlock.h
+++ b/arch/x86/include/asm/qspinlock.h
@@ -6,9 +6,24 @@
 #include 
 #include 
 #include 
+#include 
 
 #define _Q_PENDING_LOOPS   (1 << 9)
 
+#define queued_fetch_set_pending_acquire queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock 
*lock)
+{
+   u32 val = 0;
+
+   if (GEN_BINARY_RMWcc(LOCK_PREFIX "btsl", lock->val.counter, c,
+"I", _Q_PENDING_OFFSET))
+   val |= _Q_PENDING_VAL;
+
+   val |= atomic_read(&lock->val) & ~_Q_PENDING_MASK;
+
+   return val;
+}
+
 #ifdef CONFIG_PARAVIRT_SPINLOCKS
 extern void native_queued_spin_lock_slowpath(struct qspinlock *lock, u32 val);
 extern void __pv_init_lock_hash(void);
diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index 47cb99787e4d..341ca666bc60 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -231,6 +231,20 @@ static __always_inline u32 xchg_tail(struct qspinlock 
*lock, u32 tail)
 }
 #endif /* _Q_PENDING_BITS == 8 */
 
+/**
+ * queued_fetch_set_pending_acquire - fetch the whole lock value and set 
pending
+ * @lock : Pointer to queued spinlock structure
+ * Return: The previous lock value
+ *
+ * *,*,* -> *,1,*
+ */
+#ifndef queued_fetch_set_pending_acquire
+static __always_inline u32 queued_fetch_set_pending_acquire(struct qspinlock 
*lock)
+{
+   return atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+}
+#endif
+
 /**
  * set_locked - Set the lock bit and own the lock
  * @lock: Pointer to queued spinlock structure
@@ -328,7 +342,7 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, u32 
val)
 *
 * 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
 */
-   val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+   val = queued_fetch_set_pending_acquire(lock);
 
/*
 * If we observe contention, there is a concurrent locker.


[tip:locking/core] locking/qspinlock: Rework some comments

2018-10-16 Thread tip-bot for Peter Zijlstra
Commit-ID:  756b1df4c2c82a1cdffeafa9d2aa76c92e7fb405
Gitweb: https://git.kernel.org/tip/756b1df4c2c82a1cdffeafa9d2aa76c92e7fb405
Author: Peter Zijlstra 
AuthorDate: Wed, 26 Sep 2018 13:01:19 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Oct 2018 17:33:54 +0200

locking/qspinlock: Rework some comments

While working my way through the code again, I felt the comments could
use help.

Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Will Deacon 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: andrea.pa...@amarulasolutions.com
Cc: long...@redhat.com
Link: https://lkml.kernel.org/r/20181003130257.156322...@infradead.org
Signed-off-by: Ingo Molnar 
---
 kernel/locking/qspinlock.c | 36 ++--
 1 file changed, 26 insertions(+), 10 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index ec343276f975..47cb99787e4d 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -326,16 +326,23 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, 
u32 val)
/*
 * trylock || pending
 *
-* 0,0,0 -> 0,0,1 ; trylock
-* 0,0,1 -> 0,1,1 ; pending
+* 0,0,* -> 0,1,* -> 0,0,1 pending, trylock
 */
val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
+
/*
-* If we observe any contention; undo and queue.
+* If we observe contention, there is a concurrent locker.
+*
+* Undo and queue; our setting of PENDING might have made the
+* n,0,0 -> 0,0,0 transition fail and it will now be waiting
+* on @next to become !NULL.
 */
if (unlikely(val & ~_Q_LOCKED_MASK)) {
+
+   /* Undo PENDING if we set it. */
if (!(val & _Q_PENDING_MASK))
clear_pending(lock);
+
goto queue;
}
 
@@ -474,16 +481,25 @@ locked:
 */
 
/*
-* In the PV case we might already have _Q_LOCKED_VAL set.
+* In the PV case we might already have _Q_LOCKED_VAL set, because
+* of lock stealing; therefore we must also allow:
 *
-* The atomic_cond_read_acquire() call above has provided the
-* necessary acquire semantics required for locking.
+* n,0,1 -> 0,0,1
+*
+* Note: at this point: (val & _Q_PENDING_MASK) == 0, because of the
+*   above wait condition, therefore any concurrent setting of
+*   PENDING will make the uncontended transition fail.
 */
-   if (((val & _Q_TAIL_MASK) == tail) &&
-   atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
-   goto release; /* No contention */
+   if ((val & _Q_TAIL_MASK) == tail) {
+   if (atomic_try_cmpxchg_relaxed(&lock->val, &val, _Q_LOCKED_VAL))
+   goto release; /* No contention */
+   }
 
-   /* Either somebody is queued behind us or _Q_PENDING_VAL is set */
+   /*
+* Either somebody is queued behind us or _Q_PENDING_VAL got set
+* which will then detect the remaining tail and queue behind us
+* ensuring we'll see a @next.
+*/
set_locked(lock);
 
/*


[tip:locking/core] locking/qspinlock: Re-order code

2018-10-16 Thread tip-bot for Peter Zijlstra
Commit-ID:  53bf57fab7321fb42b703056a4c80fc9d986d170
Gitweb: https://git.kernel.org/tip/53bf57fab7321fb42b703056a4c80fc9d986d170
Author: Peter Zijlstra 
AuthorDate: Wed, 26 Sep 2018 13:01:18 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 16 Oct 2018 17:33:53 +0200

locking/qspinlock: Re-order code

Flip the branch condition after atomic_fetch_or_acquire(_Q_PENDING_VAL)
such that we lose the indent. This also results in a more natural code
flow IMO.

Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Will Deacon 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: andrea.pa...@amarulasolutions.com
Cc: long...@redhat.com
Link: https://lkml.kernel.org/r/20181003130257.156322...@infradead.org
Signed-off-by: Ingo Molnar 
---
 kernel/locking/qspinlock.c | 56 ++
 1 file changed, 27 insertions(+), 29 deletions(-)

diff --git a/kernel/locking/qspinlock.c b/kernel/locking/qspinlock.c
index bfaeb05123ff..ec343276f975 100644
--- a/kernel/locking/qspinlock.c
+++ b/kernel/locking/qspinlock.c
@@ -330,39 +330,37 @@ void queued_spin_lock_slowpath(struct qspinlock *lock, 
u32 val)
 * 0,0,1 -> 0,1,1 ; pending
 */
val = atomic_fetch_or_acquire(_Q_PENDING_VAL, &lock->val);
-   if (!(val & ~_Q_LOCKED_MASK)) {
-   /*
-* We're pending, wait for the owner to go away.
-*
-* *,1,1 -> *,1,0
-*
-* this wait loop must be a load-acquire such that we match the
-* store-release that clears the locked bit and create lock
-* sequentiality; this is because not all
-* clear_pending_set_locked() implementations imply full
-* barriers.
-*/
-   if (val & _Q_LOCKED_MASK) {
-   atomic_cond_read_acquire(&lock->val,
-!(VAL & _Q_LOCKED_MASK));
-   }
-
-   /*
-* take ownership and clear the pending bit.
-*
-* *,1,0 -> *,0,1
-*/
-   clear_pending_set_locked(lock);
-   qstat_inc(qstat_lock_pending, true);
-   return;
+   /*
+* If we observe any contention; undo and queue.
+*/
+   if (unlikely(val & ~_Q_LOCKED_MASK)) {
+   if (!(val & _Q_PENDING_MASK))
+   clear_pending(lock);
+   goto queue;
}
 
/*
-* If pending was clear but there are waiters in the queue, then
-* we need to undo our setting of pending before we queue ourselves.
+* We're pending, wait for the owner to go away.
+*
+* 0,1,1 -> 0,1,0
+*
+* this wait loop must be a load-acquire such that we match the
+* store-release that clears the locked bit and create lock
+* sequentiality; this is because not all
+* clear_pending_set_locked() implementations imply full
+* barriers.
+*/
+   if (val & _Q_LOCKED_MASK)
+   atomic_cond_read_acquire(&lock->val, !(VAL & _Q_LOCKED_MASK));
+
+   /*
+* take ownership and clear the pending bit.
+*
+* 0,1,0 -> 0,0,1
 */
-   if (!(val & _Q_PENDING_MASK))
-   clear_pending(lock);
+   clear_pending_set_locked(lock);
+   qstat_inc(qstat_lock_pending, true);
+   return;
 
/*
 * End of pending bit optimistic spinning and beginning of MCS


[tip:x86/urgent] x86/tsc: Force inlining of cyc2ns bits

2018-10-14 Thread tip-bot for Peter Zijlstra
Commit-ID:  4907c68abd3f60f650f98d5a69d4ec77c0bde44f
Gitweb: https://git.kernel.org/tip/4907c68abd3f60f650f98d5a69d4ec77c0bde44f
Author: Peter Zijlstra 
AuthorDate: Thu, 11 Oct 2018 12:38:26 +0200
Committer:  Thomas Gleixner 
CommitDate: Sun, 14 Oct 2018 11:11:22 +0200

x86/tsc: Force inlining of cyc2ns bits

Looking at the asm for native_sched_clock() I noticed we don't inline
enough. Mostly caused by sharing code with cyc2ns_read_begin(), which
we didn't use to do. So mark all that __force_inline to make it DTRT.

Fixes: 59eaef78bfea ("x86/tsc: Remodel cyc2ns to use seqcount_latch()")
Reported-by: Eric Dumazet 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Cc: h...@zytor.com
Cc: eric.duma...@gmail.com
Cc: b...@alien8.de
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20181011104019.695196...@infradead.org

---
 arch/x86/kernel/tsc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/kernel/tsc.c b/arch/x86/kernel/tsc.c
index b52bd2b6cdb4..6d5dc5dabfd7 100644
--- a/arch/x86/kernel/tsc.c
+++ b/arch/x86/kernel/tsc.c
@@ -58,7 +58,7 @@ struct cyc2ns {
 
 static DEFINE_PER_CPU_ALIGNED(struct cyc2ns, cyc2ns);
 
-void cyc2ns_read_begin(struct cyc2ns_data *data)
+void __always_inline cyc2ns_read_begin(struct cyc2ns_data *data)
 {
int seq, idx;
 
@@ -75,7 +75,7 @@ void cyc2ns_read_begin(struct cyc2ns_data *data)
} while (unlikely(seq != this_cpu_read(cyc2ns.seq.sequence)));
 }
 
-void cyc2ns_read_end(void)
+void __always_inline cyc2ns_read_end(void)
 {
preempt_enable_notrace();
 }
@@ -104,7 +104,7 @@ void cyc2ns_read_end(void)
  *  -johns...@us.ibm.com "math is hard, lets go shopping!"
  */
 
-static inline unsigned long long cycles_2_ns(unsigned long long cyc)
+static __always_inline unsigned long long cycles_2_ns(unsigned long long cyc)
 {
struct cyc2ns_data data;
unsigned long long ns;


[tip:x86/urgent] x86/percpu: Fix this_cpu_read()

2018-10-14 Thread tip-bot for Peter Zijlstra
Commit-ID:  b59167ac7bafd804c91e49ad53c6d33a7394d4c8
Gitweb: https://git.kernel.org/tip/b59167ac7bafd804c91e49ad53c6d33a7394d4c8
Author: Peter Zijlstra 
AuthorDate: Thu, 11 Oct 2018 12:38:27 +0200
Committer:  Thomas Gleixner 
CommitDate: Sun, 14 Oct 2018 11:11:22 +0200

x86/percpu: Fix this_cpu_read()

Eric reported that a sequence count loop using this_cpu_read() got
optimized out. This is wrong: this_cpu_read() must imply READ_ONCE()
because the interface is IRQ-safe, therefore an interrupt can have
changed the per-cpu value.
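
A minimal userspace analogue of the failure mode (hypothetical names; GCC on
x86 assumed): an asm with an output operand is not implicitly volatile, so
identical instances may be CSE'd or hoisted and a sequence-retry loop can
collapse into a single read. Making the asm volatile, as this patch does for
the percpu accessors, forces a fresh load per use, much like READ_ONCE().

#include <stdio.h>

static unsigned int seq;        /* stand-in for a per-cpu sequence counter */

static inline unsigned int read_seq(void)
{
        unsigned int ret;

        /* non-volatile asm with an output: GCC may CSE/hoist it */
        asm("movl %1, %0" : "=r" (ret) : "m" (seq));    /* add "volatile" to fix */
        return ret;
}

int main(void)
{
        unsigned int s1, s2;

        do {
                s1 = read_seq();
                /* ... read data protected by the sequence counter ... */
                s2 = read_seq();
        } while (s1 != s2);     /* may be folded into a single read, ending the loop early */

        printf("seq=%u\n", s1);
        return 0;
}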

Fixes: 7c3576d261ce ("[PATCH] i386: Convert PDA into the percpu section")
Reported-by: Eric Dumazet 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Acked-by: Eric Dumazet 
Cc: h...@zytor.com
Cc: eric.duma...@gmail.com
Cc: b...@alien8.de
Cc: sta...@vger.kernel.org
Link: https://lkml.kernel.org/r/20181011104019.748208...@infradead.org

---
 arch/x86/include/asm/percpu.h | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index e9202a0de8f0..1a19d11cfbbd 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -185,22 +185,22 @@ do {  
\
typeof(var) pfo_ret__;  \
switch (sizeof(var)) {  \
case 1: \
-   asm(op "b "__percpu_arg(1)",%0" \
+   asm volatile(op "b "__percpu_arg(1)",%0"\
: "=q" (pfo_ret__)  \
: "m" (var));   \
break;  \
case 2: \
-   asm(op "w "__percpu_arg(1)",%0" \
+   asm volatile(op "w "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__)  \
: "m" (var));   \
break;  \
case 4: \
-   asm(op "l "__percpu_arg(1)",%0" \
+   asm volatile(op "l "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__)  \
: "m" (var));   \
break;  \
case 8: \
-   asm(op "q "__percpu_arg(1)",%0" \
+   asm volatile(op "q "__percpu_arg(1)",%0"\
: "=r" (pfo_ret__)  \
: "m" (var));   \
break;  \


[tip:x86/boot] x86/kaslr, ACPI/NUMA: Fix KASLR build error

2018-10-09 Thread tip-bot for Peter Zijlstra (Intel)
Commit-ID:  9d94e8b1d4f94a3c4cee5ad11a1be460cd070839
Gitweb: https://git.kernel.org/tip/9d94e8b1d4f94a3c4cee5ad11a1be460cd070839
Author: Peter Zijlstra (Intel) 
AuthorDate: Wed, 3 Oct 2018 14:41:27 +0200
Committer:  Borislav Petkov 
CommitDate: Tue, 9 Oct 2018 12:30:25 +0200

x86/kaslr, ACPI/NUMA: Fix KASLR build error

There is no point in trying to compile KASLR-specific code when there is
no KASLR.

 [ bp: Move the whole crap into kaslr.c and make
   rand_mem_physical_padding static. Make kaslr_check_padding()
   weak to avoid build breakage on other architectures. ]
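
For reference, a minimal sketch of that weak-default pattern (hypothetical
names, written with the plain GCC attribute rather than the kernel's __weak
macro):

/* Weak no-op default: generic code can call it unconditionally; an
 * architecture overrides it simply by linking in a strong definition
 * with the same name, no #ifdef needed at the call site. */
void __attribute__((weak)) check_padding_sketch(void)
{
        /* default: nothing to check */
}

int numa_init_sketch(void)
{
        check_padding_sketch(); /* resolves to the arch override when present */
        return 0;
}

int main(void)
{
        return numa_init_sketch();
}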

Reported-by: Naresh Kamboju 
Reported-by: Mark Brown 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Borislav Petkov 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Link: 
http://lkml.kernel.org/r/20181003123402.ga15...@hirez.programming.kicks-ass.net
---
 arch/x86/include/asm/setup.h |  2 --
 arch/x86/mm/kaslr.c  | 19 ++-
 drivers/acpi/numa.c  | 17 +
 3 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 65a5bf8f6aba..ae13bc974416 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -80,8 +80,6 @@ static inline unsigned long kaslr_offset(void)
return (unsigned long)&_text - __START_KERNEL;
 }
 
-extern int rand_mem_physical_padding;
-
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 00cf4cae38f5..b3471388288d 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -40,7 +41,7 @@
  */
 static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
 
-int __initdata rand_mem_physical_padding = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+static int __initdata rand_mem_physical_padding = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
  * earlier during boot). The list is ordered based on virtual addresses. This
@@ -70,6 +71,22 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }
 
+/*
+ * Check the padding size for KASLR is enough.
+ */
+void __init kaslr_check_padding(void)
+{
+   u64 max_possible_phys, max_actual_phys, threshold;
+
+   max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40);
+   max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40);
+   threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40);
+
+   if (max_possible_phys > threshold)
+   pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory 
hotadd failure.\n",
+   (max_possible_phys - max_actual_phys) >> 40);
+}
+
 static int __init rand_mem_physical_padding_setup(char *str)
 {
int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1;
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 3d69834c692f..ba62004f4d86 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -32,7 +32,6 @@
 #include 
 #include 
 #include 
-#include 
 
 static nodemask_t nodes_found_map = NODE_MASK_NONE;
 
@@ -433,10 +432,12 @@ acpi_table_parse_srat(enum acpi_srat_type id,
handler, max_entries);
 }
 
+/* To be overridden by architectures */
+void __init __weak kaslr_check_padding(void) { }
+
 int __init acpi_numa_init(void)
 {
int cnt = 0;
-   u64 max_possible_phys, max_actual_phys, threshold;
 
if (acpi_disabled)
return -EINVAL;
@@ -466,17 +467,9 @@ int __init acpi_numa_init(void)
cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
acpi_parse_memory_affinity, 0);
 
-   /* check the padding size for KASLR is enough. */
-   if (parsed_numa_memblks && kaslr_enabled()) {
-   max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 
40);
-   max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 
1ULL << 40);
-   threshold = max_actual_phys + 
((u64)rand_mem_physical_padding << 40);
+   if (parsed_numa_memblks)
+   kaslr_check_padding();
 
-   if (max_possible_phys > threshold) {
-   pr_warn("Set 'rand_mem_physical_padding=%llu' 
to avoid memory hotadd failure.\n",
- (max_possible_phys - max_actual_phys) >> 40);
-   }
-   }
}
 
/* SLIT: System Locality Information Table */


[tip:x86/boot] x86/kaslr, ACPI/NUMA: Fix KASLR build error

2018-10-03 Thread tip-bot for Peter Zijlstra (Intel)
Commit-ID:  3a387c6d96e69f1710a3804eb68e1253263298f2
Gitweb: https://git.kernel.org/tip/3a387c6d96e69f1710a3804eb68e1253263298f2
Author: Peter Zijlstra (Intel) 
AuthorDate: Wed, 3 Oct 2018 14:41:27 +0200
Committer:  Borislav Petkov 
CommitDate: Wed, 3 Oct 2018 16:15:49 +0200

x86/kaslr, ACPI/NUMA: Fix KASLR build error

There is no point in trying to compile KASLR-specific code when there is
no KASLR.

 [ bp: Move the whole crap into kaslr.c and make
   rand_mem_physical_padding static. ]

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Borislav Petkov 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Cc: 
Link: 
http://lkml.kernel.org/r/20181003123402.ga15...@hirez.programming.kicks-ass.net
---
 arch/x86/include/asm/kaslr.h |  2 ++
 arch/x86/include/asm/setup.h |  2 --
 arch/x86/mm/kaslr.c  | 19 ++-
 drivers/acpi/numa.c  | 15 +++
 4 files changed, 23 insertions(+), 15 deletions(-)

diff --git a/arch/x86/include/asm/kaslr.h b/arch/x86/include/asm/kaslr.h
index db7ba2feb947..95ef3fc01d12 100644
--- a/arch/x86/include/asm/kaslr.h
+++ b/arch/x86/include/asm/kaslr.h
@@ -6,8 +6,10 @@ unsigned long kaslr_get_random_long(const char *purpose);
 
 #ifdef CONFIG_RANDOMIZE_MEMORY
 void kernel_randomize_memory(void);
+void kaslr_check_padding(void);
 #else
 static inline void kernel_randomize_memory(void) { }
+static inline void kaslr_check_padding(void) { }
 #endif /* CONFIG_RANDOMIZE_MEMORY */
 
 #endif
diff --git a/arch/x86/include/asm/setup.h b/arch/x86/include/asm/setup.h
index 65a5bf8f6aba..ae13bc974416 100644
--- a/arch/x86/include/asm/setup.h
+++ b/arch/x86/include/asm/setup.h
@@ -80,8 +80,6 @@ static inline unsigned long kaslr_offset(void)
return (unsigned long)&_text - __START_KERNEL;
 }
 
-extern int rand_mem_physical_padding;
-
 /*
  * Do NOT EVER look at the BIOS memory size location.
  * It does not work on many machines.
diff --git a/arch/x86/mm/kaslr.c b/arch/x86/mm/kaslr.c
index 00cf4cae38f5..b3471388288d 100644
--- a/arch/x86/mm/kaslr.c
+++ b/arch/x86/mm/kaslr.c
@@ -23,6 +23,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -40,7 +41,7 @@
  */
 static const unsigned long vaddr_end = CPU_ENTRY_AREA_BASE;
 
-int __initdata rand_mem_physical_padding = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
+static int __initdata rand_mem_physical_padding = 
CONFIG_RANDOMIZE_MEMORY_PHYSICAL_PADDING;
 /*
  * Memory regions randomized by KASLR (except modules that use a separate logic
  * earlier during boot). The list is ordered based on virtual addresses. This
@@ -70,6 +71,22 @@ static inline bool kaslr_memory_enabled(void)
return kaslr_enabled() && !IS_ENABLED(CONFIG_KASAN);
 }
 
+/*
+ * Check the padding size for KASLR is enough.
+ */
+void __init kaslr_check_padding(void)
+{
+   u64 max_possible_phys, max_actual_phys, threshold;
+
+   max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 40);
+   max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 1ULL << 40);
+   threshold = max_actual_phys + ((u64)rand_mem_physical_padding << 40);
+
+   if (max_possible_phys > threshold)
+   pr_warn("Set 'rand_mem_physical_padding=%llu' to avoid memory 
hotadd failure.\n",
+   (max_possible_phys - max_actual_phys) >> 40);
+}
+
 static int __init rand_mem_physical_padding_setup(char *str)
 {
int max_padding = (1 << (MAX_PHYSMEM_BITS - TB_SHIFT)) - 1;
diff --git a/drivers/acpi/numa.c b/drivers/acpi/numa.c
index 3d69834c692f..4408e37600ef 100644
--- a/drivers/acpi/numa.c
+++ b/drivers/acpi/numa.c
@@ -32,7 +32,7 @@
 #include 
 #include 
 #include 
-#include 
+#include 
 
 static nodemask_t nodes_found_map = NODE_MASK_NONE;
 
@@ -436,7 +436,6 @@ acpi_table_parse_srat(enum acpi_srat_type id,
 int __init acpi_numa_init(void)
 {
int cnt = 0;
-   u64 max_possible_phys, max_actual_phys, threshold;
 
if (acpi_disabled)
return -EINVAL;
@@ -466,17 +465,9 @@ int __init acpi_numa_init(void)
cnt = acpi_table_parse_srat(ACPI_SRAT_TYPE_MEMORY_AFFINITY,
acpi_parse_memory_affinity, 0);
 
-   /* check the padding size for KASLR is enough. */
-   if (parsed_numa_memblks && kaslr_enabled()) {
-   max_actual_phys = roundup(PFN_PHYS(max_pfn), 1ULL << 
40);
-   max_possible_phys = roundup(PFN_PHYS(max_possible_pfn), 
1ULL << 40);
-   threshold = max_actual_phys + 
((u64)rand_mem_physical_padding << 40);
+   if (parsed_numa_memblks)
+   kaslr_check_padding();
 
-   if (max_possible_phys > threshold) {
-   pr_warn("Set 'rand_mem_physical_padding=%llu' 
to avoid memory hotadd failure.\n",
- (max_possible_phys - max_actual_phys) >> 40);
-   }
-   }
}
 
/* SLIT: System Locality 

[tip:x86/mm] x86/mm/cpa: Optimize __cpa_flush_range()

2018-09-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  7904ba8a66f400182a204893c92098994e22a88d
Gitweb: https://git.kernel.org/tip/7904ba8a66f400182a204893c92098994e22a88d
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Sep 2018 10:50:24 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 27 Sep 2018 20:39:42 +0200

x86/mm/cpa: Optimize __cpa_flush_range()

If we IPI for WBINVD, then we might as well kill the entire TLB too.
But if we don't have to invalidate cache, there is no reason not to
use a range TLB flush.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Dave Hansen 
Cc: Bin Yang 
Cc: Mark Gross 
Link: https://lkml.kernel.org/r/20180919085948.195633...@infradead.org

---
 arch/x86/mm/pageattr.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index dc552824e86a..62bb30b4bd2a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -291,7 +291,7 @@ static bool __cpa_flush_range(unsigned long start, int 
numpages, int cache)
 
WARN_ON(PAGE_ALIGN(start) != start);
 
-   if (!static_cpu_has(X86_FEATURE_CLFLUSH)) {
+   if (cache && !static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
return true;
}


[tip:x86/mm] x86/mm/cpa: Factor common code between cpa_flush_*()

2018-09-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  47e262ac5b84015c4a101ff51767c464fb7497a6
Gitweb: https://git.kernel.org/tip/47e262ac5b84015c4a101ff51767c464fb7497a6
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Sep 2018 10:50:23 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 27 Sep 2018 20:39:42 +0200

x86/mm/cpa: Factor common code between cpa_flush_*()

The start of cpa_flush_range() and cpa_flush_array() is the same, use
a common function.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Dave Hansen 
Cc: Bin Yang 
Cc: Mark Gross 
Link: https://lkml.kernel.org/r/20180919085948.138859...@infradead.org

---
 arch/x86/mm/pageattr.c | 29 +
 1 file changed, 13 insertions(+), 16 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 33d89d505f93..dc552824e86a 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -285,22 +285,28 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 }
 
-static void cpa_flush_range(unsigned long start, int numpages, int cache)
+static bool __cpa_flush_range(unsigned long start, int numpages, int cache)
 {
-   unsigned int i, level;
-   unsigned long addr;
-
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
+
WARN_ON(PAGE_ALIGN(start) != start);
 
if (!static_cpu_has(X86_FEATURE_CLFLUSH)) {
cpa_flush_all(cache);
-   return;
+   return true;
}
 
flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
 
-   if (!cache)
+   return !cache;
+}
+
+static void cpa_flush_range(unsigned long start, int numpages, int cache)
+{
+   unsigned int i, level;
+   unsigned long addr;
+
+   if (__cpa_flush_range(start, numpages, cache))
return;
 
/*
@@ -326,16 +332,7 @@ static void cpa_flush_array(unsigned long baddr, unsigned 
long *start,
 {
unsigned int i, level;
 
-   BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
-
-   if (!static_cpu_has(X86_FEATURE_CLFLUSH)) {
-   cpa_flush_all(cache);
-   return;
-   }
-
-   flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages);
-
-   if (!cache)
+   if (__cpa_flush_range(baddr, numpages, cache))
return;
 
/*


[tip:x86/mm] x86/mm/cpa: Move CLFLUSH test into cpa_flush_array()

2018-09-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  fce2ce9544e9f098ba828442221ce99c2a5ecb0f
Gitweb: https://git.kernel.org/tip/fce2ce9544e9f098ba828442221ce99c2a5ecb0f
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Sep 2018 10:50:22 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 27 Sep 2018 20:39:42 +0200

x86/mm/cpa: Move CLFLUSH test into cpa_flush_array()

Rather than guarding cpa_flush_array() users with a CLFLUSH test, put
it inside.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Dave Hansen 
Cc: Bin Yang 
Cc: Mark Gross 
Link: https://lkml.kernel.org/r/20180919085948.087848...@infradead.org

---
 arch/x86/mm/pageattr.c | 27 ---
 1 file changed, 16 insertions(+), 11 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 3cc4a2ae4dbb..33d89d505f93 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -328,6 +328,11 @@ static void cpa_flush_array(unsigned long baddr, unsigned long *start,
 
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
+   if (!static_cpu_has(X86_FEATURE_CLFLUSH)) {
+   cpa_flush_all(cache);
+   return;
+   }
+
flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages);
 
if (!cache)
@@ -1756,19 +1761,19 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
cache = !!pgprot2cachemode(mask_set);
 
/*
-* On success we use CLFLUSH, when the CPU supports it to
-* avoid the WBINVD. If the CPU does not support it and in the
-* error case we fall back to cpa_flush_all (which uses
-* WBINVD):
+* On error; flush everything to be sure.
 */
-   if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
-   if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
-   cpa_flush_array(baddr, addr, numpages, cache,
-   cpa.flags, pages);
-   } else
-   cpa_flush_range(baddr, numpages, cache);
-   } else
+   if (ret) {
cpa_flush_all(cache);
+   goto out;
+   }
+
+   if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
+   cpa_flush_array(baddr, addr, numpages, cache,
+   cpa.flags, pages);
+   } else {
+   cpa_flush_range(baddr, numpages, cache);
+   }
 
 out:
return ret;
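
The restructuring of the flush decision in change_page_attr_set_clr() boils
down to handling the error case first and falling through to the common path.
A standalone sketch of that control-flow change (invented names, not kernel
code):

#include <stdbool.h>
#include <stdio.h>

/* Bail out to the "flush everything" path on error first, then pick
 * between the two range-based flushes. */
static int set_attrs(bool array, int err)
{
	if (err) {
		puts("flush everything to be sure (cpa_flush_all equivalent)");
		goto out;
	}

	if (array)
		puts("flush the page array");
	else
		puts("flush the contiguous range");
out:
	return err;
}

int main(void)
{
	set_attrs(true, 0);
	set_attrs(false, -1);
	return 0;
}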


[tip:x86/mm] x86/mm/cpa: Move CLFLUSH test into cpa_flush_range()

2018-09-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  5f464b33b17219a233af1267c621632225bc7acc
Gitweb: https://git.kernel.org/tip/5f464b33b17219a233af1267c621632225bc7acc
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Sep 2018 10:50:21 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 27 Sep 2018 20:39:41 +0200

x86/mm/cpa: Move CLFLUSH test into cpa_flush_range()

Rather than guarding all cpa_flush_range() uses with a CLFLUSH test,
put it inside.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Dave Hansen 
Cc: Bin Yang 
Cc: Mark Gross 
Link: https://lkml.kernel.org/r/20180919085948.036195...@infradead.org

---
 arch/x86/mm/pageattr.c | 15 +++
 1 file changed, 7 insertions(+), 8 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 02eb18403594..3cc4a2ae4dbb 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -293,6 +293,11 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
 
+   if (!static_cpu_has(X86_FEATURE_CLFLUSH)) {
+   cpa_flush_all(cache);
+   return;
+   }
+
flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
 
if (!cache)
@@ -2078,10 +2083,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
/*
 * Before changing the encryption attribute, we need to flush caches.
 */
-   if (static_cpu_has(X86_FEATURE_CLFLUSH))
-   cpa_flush_range(start, numpages, 1);
-   else
-   cpa_flush_all(1);
+   cpa_flush_range(start, numpages, 1);
 
ret = __change_page_attr_set_clr(, 1);
 
@@ -2092,10 +2094,7 @@ static int __set_memory_enc_dec(unsigned long addr, int numpages, bool enc)
 * in case TLB flushing gets optimized in the cpa_flush_range()
 * path use the same logic as above.
 */
-   if (static_cpu_has(X86_FEATURE_CLFLUSH))
-   cpa_flush_range(start, numpages, 0);
-   else
-   cpa_flush_all(0);
+   cpa_flush_range(start, numpages, 0);
 
return ret;
 }


[tip:x86/mm] x86/mm/cpa: Use flush_tlb_kernel_range()

2018-09-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  a7295fd53c39ce781a9792c9dd2c8747bf274160
Gitweb: https://git.kernel.org/tip/a7295fd53c39ce781a9792c9dd2c8747bf274160
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Sep 2018 10:50:20 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 27 Sep 2018 20:39:41 +0200

x86/mm/cpa: Use flush_tlb_kernel_range()

Both cpa_flush_range() and cpa_flush_array() have a well-specified
range; use that to do a range-based TLB invalidate.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Dave Hansen 
Cc: Bin Yang 
Cc: Mark Gross 
Link: https://lkml.kernel.org/r/20180919085947.985193...@infradead.org

---
 arch/x86/mm/pageattr.c | 9 +
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index bd9b0ac07352..02eb18403594 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -293,7 +293,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
 
-   flush_tlb_all();
+   flush_tlb_kernel_range(start, start + PAGE_SIZE * numpages);
 
if (!cache)
return;
@@ -315,14 +315,15 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
}
 }
 
-static void cpa_flush_array(unsigned long *start, int numpages, int cache,
+static void cpa_flush_array(unsigned long baddr, unsigned long *start,
+   int numpages, int cache,
int in_flags, struct page **pages)
 {
unsigned int i, level;
 
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
-   flush_tlb_all();
+   flush_tlb_kernel_range(baddr, baddr + PAGE_SIZE * numpages);
 
if (!cache)
return;
@@ -1757,7 +1758,7 @@ static int change_page_attr_set_clr(unsigned long *addr, int numpages,
 */
if (!ret && boot_cpu_has(X86_FEATURE_CLFLUSH)) {
if (cpa.flags & (CPA_PAGES_ARRAY | CPA_ARRAY)) {
-   cpa_flush_array(addr, numpages, cache,
+   cpa_flush_array(baddr, addr, numpages, cache,
cpa.flags, pages);
} else
cpa_flush_range(baddr, numpages, cache);
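
The range handed to flush_tlb_kernel_range() is simply the base address plus
numpages pages. A minimal userspace sketch of that arithmetic (the 4 KiB
PAGE_SIZE and the start address are assumptions for illustration only):

#include <stdio.h>

#define PAGE_SIZE 4096ULL		/* assumed 4 KiB pages */

int main(void)
{
	unsigned long long start = 0xffff888000000000ULL;	/* made-up address */
	int numpages = 16;
	unsigned long long end = start + PAGE_SIZE * numpages;

	/* 16 pages of 4 KiB -> a 64 KiB invalidation window */
	printf("flush_tlb_kernel_range(%#llx, %#llx) covers %llu KiB\n",
	       start, end, (end - start) >> 10);
	return 0;
}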


[tip:x86/mm] x86/mm/cpa: Unconditionally avoid WBINVD when we can

2018-09-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  ddd07b750382adc2b78fdfbec47af8a6e0d8ef37
Gitweb: https://git.kernel.org/tip/ddd07b750382adc2b78fdfbec47af8a6e0d8ef37
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Sep 2018 10:50:19 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 27 Sep 2018 20:39:41 +0200

x86/mm/cpa: Unconditionally avoid WBINVD when we can

CAT has happened and WBINVD is bad (even before CAT, blowing away the
entire cache on a multi-core platform wasn't nice); try not to use it
ever.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Dave Hansen 
Cc: Bin Yang 
Cc: Mark Gross 
Link: https://lkml.kernel.org/r/20180919085947.933674...@infradead.org

---
 arch/x86/mm/pageattr.c | 18 ++
 1 file changed, 2 insertions(+), 16 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index b6a4c638f086..bd9b0ac07352 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -319,26 +319,12 @@ static void cpa_flush_array(unsigned long *start, int numpages, int cache,
int in_flags, struct page **pages)
 {
unsigned int i, level;
-#ifdef CONFIG_PREEMPT
-   /*
-* Avoid wbinvd() because it causes latencies on all CPUs,
-* regardless of any CPU isolation that may be in effect.
-*
-* This should be extended for CAT enabled systems independent of
-* PREEMPT because wbinvd() does not respect the CAT partitions and
-* this is exposed to unpriviledged users through the graphics
-* subsystem.
-*/
-   unsigned long do_wbinvd = 0;
-#else
-   unsigned long do_wbinvd = cache && numpages >= 1024; /* 4M threshold */
-#endif
 
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
 
-   on_each_cpu(__cpa_flush_all, (void *) do_wbinvd, 1);
+   flush_tlb_all();
 
-   if (!cache || do_wbinvd)
+   if (!cache)
return;
 
/*

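The alternative this series keeps relying on is flushing only the affected
lines with CLFLUSH instead of wiping the whole cache. A standalone, x86-only
userspace sketch of that idea (the 64-byte line size is an assumption; real
code derives it from CPUID, and this is not the kernel's clflush_cache_range()):

#include <immintrin.h>
#include <stdint.h>
#include <stdlib.h>

#define CACHELINE 64			/* assumed cache-line size */

static void clflush_range(void *addr, size_t size)
{
	char *p = (char *)((uintptr_t)addr & ~(uintptr_t)(CACHELINE - 1));
	char *end = (char *)addr + size;

	for (; p < end; p += CACHELINE)
		_mm_clflush(p);		/* flush one line, not the whole cache */
	_mm_mfence();			/* order the flushes */
}

int main(void)
{
	char *buf = malloc(4096);

	if (!buf)
		return 1;
	clflush_range(buf, 4096);
	free(buf);
	return 0;
}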

[tip:x86/mm] x86/mm/cpa: Move flush_tlb_all()

2018-09-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  c0a759abf5a686a37b9204c13b7e281fe516c8f0
Gitweb: https://git.kernel.org/tip/c0a759abf5a686a37b9204c13b7e281fe516c8f0
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Sep 2018 10:50:18 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 27 Sep 2018 20:39:40 +0200

x86/mm/cpa: Move flush_tlb_all()

There is an Atom erratum where we do a local TLB invalidate right
before we return, and then do a global TLB invalidate.

Move the global invalidate up a little bit and avoid the local
invalidate entirely.

This does put the global invalidate under pgd_lock, but that shouldn't
matter.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Dave Hansen 
Cc: Bin Yang 
Cc: Mark Gross 
Link: https://lkml.kernel.org/r/20180919085947.882287...@infradead.org

---
 arch/x86/mm/pageattr.c | 44 +---
 1 file changed, 17 insertions(+), 27 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index a22f6b71a308..b6a4c638f086 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -999,14 +999,24 @@ __split_large_page(struct cpa_data *cpa, pte_t *kpte, unsigned long address,
__set_pmd_pte(kpte, address, mk_pte(base, __pgprot(_KERNPG_TABLE)));
 
/*
-* Intel Atom errata AAH41 workaround.
+* Do a global flush tlb after splitting the large page
+* and before we do the actual change page attribute in the PTE.
 *
-* The real fix should be in hw or in a microcode update, but
-* we also probabilistically try to reduce the window of having
-* a large TLB mixed with 4K TLBs while instruction fetches are
-* going on.
+* Without this, we violate the TLB application note, that says:
+* "The TLBs may contain both ordinary and large-page
+*  translations for a 4-KByte range of linear addresses. This
+*  may occur if software modifies the paging structures so that
+*  the page size used for the address range changes. If the two
+*  translations differ with respect to page frame or attributes
+*  (e.g., permissions), processor behavior is undefined and may
+*  be implementation-specific."
+*
+* We do this global tlb flush inside the cpa_lock, so that we
+* don't allow any other cpu, with stale tlb entries change the
+* page attribute in parallel, that also falls into the
+* just split large page entry.
 */
-   __flush_tlb_all();
+   flush_tlb_all();
spin_unlock(_lock);
 
return 0;
@@ -1531,28 +1541,8 @@ repeat:
 * We have to split the large page:
 */
err = split_large_page(cpa, kpte, address);
-   if (!err) {
-   /*
-* Do a global flush tlb after splitting the large page
-* and before we do the actual change page attribute in the PTE.
-*
-* With out this, we violate the TLB application note, that says
-* "The TLBs may contain both ordinary and large-page
-*  translations for a 4-KByte range of linear addresses. This
-*  may occur if software modifies the paging structures so that
-*  the page size used for the address range changes. If the two
-*  translations differ with respect to page frame or attributes
-*  (e.g., permissions), processor behavior is undefined and may
-*  be implementation-specific."
-*
-* We do this global tlb flush inside the cpa_lock, so that we
-* don't allow any other cpu, with stale tlb entries change the
-* page attribute in parallel, that also falls into the
-* just split large page entry.
-*/
-   flush_tlb_all();
+   if (!err)
goto repeat;
-   }
 
return err;
 }


[tip:x86/mm] x86/mm/cpa: Use flush_tlb_all()

2018-09-27 Thread tip-bot for Peter Zijlstra
Commit-ID:  c6185b1f21a47af94617fde3af7e803817b522a9
Gitweb: https://git.kernel.org/tip/c6185b1f21a47af94617fde3af7e803817b522a9
Author: Peter Zijlstra 
AuthorDate: Wed, 19 Sep 2018 10:50:17 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 27 Sep 2018 20:39:40 +0200

x86/mm/cpa: Use flush_tlb_all()

Instead of open-coding it.

Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Reviewed-by: Dave Hansen 
Cc: Bin Yang 
Cc: Mark Gross 
Link: https://lkml.kernel.org/r/20180919085947.831102...@infradead.org

---
 arch/x86/mm/pageattr.c | 12 +---
 1 file changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/mm/pageattr.c b/arch/x86/mm/pageattr.c
index 4e55ded01be5..a22f6b71a308 100644
--- a/arch/x86/mm/pageattr.c
+++ b/arch/x86/mm/pageattr.c
@@ -285,16 +285,6 @@ static void cpa_flush_all(unsigned long cache)
on_each_cpu(__cpa_flush_all, (void *) cache, 1);
 }
 
-static void __cpa_flush_range(void *arg)
-{
-   /*
-* We could optimize that further and do individual per page
-* tlb invalidates for a low number of pages. Caveat: we must
-* flush the high aliases on 64bit as well.
-*/
-   __flush_tlb_all();
-}
-
 static void cpa_flush_range(unsigned long start, int numpages, int cache)
 {
unsigned int i, level;
@@ -303,7 +293,7 @@ static void cpa_flush_range(unsigned long start, int numpages, int cache)
BUG_ON(irqs_disabled() && !early_boot_irqs_disabled);
WARN_ON(PAGE_ALIGN(start) != start);
 
-   on_each_cpu(__cpa_flush_range, NULL, 1);
+   flush_tlb_all();
 
if (!cache)
return;


[tip:locking/core] locking/lockdep, cpu/hotplug: Annotate AP thread

2018-09-11 Thread tip-bot for Peter Zijlstra
Commit-ID:  cb92173d1f0474784c6171a9d3fdbbca0ee53554
Gitweb: https://git.kernel.org/tip/cb92173d1f0474784c6171a9d3fdbbca0ee53554
Author: Peter Zijlstra 
AuthorDate: Tue, 11 Sep 2018 11:51:27 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 11 Sep 2018 20:01:03 +0200

locking/lockdep, cpu/hotplug: Annotate AP thread

Anybody trying to assert the cpu_hotplug_lock is held (lockdep_assert_cpus_held())
from AP callbacks will fail, because the lock is held by the BP.

Stick in an explicit annotation in cpuhp_thread_fun() to make this work.

Reported-by: Ingo Molnar 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: linux-tip-comm...@vger.kernel.org
Fixes: cb538267ea1e ("jump_label/lockdep: Assert we hold the hotplug lock for _cpuslocked() operations")
Link: http://lkml.kernel.org/r/20180911095127.gt24...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/cpu.c | 28 
 1 file changed, 28 insertions(+)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0097acec1c71..be4859f07153 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -315,6 +315,16 @@ void lockdep_assert_cpus_held(void)
percpu_rwsem_assert_held(_hotplug_lock);
 }
 
+static void lockdep_acquire_cpus_lock(void)
+{
+   rwsem_acquire(_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
+}
+
+static void lockdep_release_cpus_lock(void)
+{
+   rwsem_release(_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
+}
+
 /*
  * Wait for currently running CPU hotplug operations to complete (if any) and
  * disable future CPU hotplug (from sysfs). The 'cpu_add_remove_lock' protects
@@ -344,6 +354,17 @@ void cpu_hotplug_enable(void)
cpu_maps_update_done();
 }
 EXPORT_SYMBOL_GPL(cpu_hotplug_enable);
+
+#else
+
+static void lockdep_acquire_cpus_lock(void)
+{
+}
+
+static void lockdep_release_cpus_lock(void)
+{
+}
+
 #endif /* CONFIG_HOTPLUG_CPU */
 
 #ifdef CONFIG_HOTPLUG_SMT
@@ -616,6 +637,12 @@ static void cpuhp_thread_fun(unsigned int cpu)
 */
smp_mb();
 
+   /*
+* The BP holds the hotplug lock, but we're now running on the AP,
+* ensure that anybody asserting the lock is held, will actually find
+* it so.
+*/
+   lockdep_acquire_cpus_lock();
cpuhp_lock_acquire(bringup);
 
if (st->single) {
@@ -661,6 +688,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
}
 
cpuhp_lock_release(bringup);
+   lockdep_release_cpus_lock();
 
if (!st->should_run)
complete_ap_thread(st, bringup);


[tip:locking/core] locking/lockdep, cpu/hotplug: Annotate AP thread

2018-09-11 Thread tip-bot for Peter Zijlstra
Commit-ID:  f1b2f6eccf99fc457221cc84c7550a8e3b17d4df
Gitweb: https://git.kernel.org/tip/f1b2f6eccf99fc457221cc84c7550a8e3b17d4df
Author: Peter Zijlstra 
AuthorDate: Tue, 11 Sep 2018 11:51:27 +0200
Committer:  Ingo Molnar 
CommitDate: Tue, 11 Sep 2018 12:37:00 +0200

locking/lockdep, cpu/hotplug: Annotate AP thread

Anybody trying to assert the cpu_hotplug_lock is held (lockdep_assert_cpus_held())
from AP callbacks will fail, because the lock is held by the BP.

Stick in an explicit annotation in cpuhp_thread_fun() to make this work.

Reported-by: Ingo Molnar 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Linus Torvalds 
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: linux-tip-comm...@vger.kernel.org
Fixes: cb538267ea1e ("jump_label/lockdep: Assert we hold the hotplug lock for _cpuslocked() operations")
Link: http://lkml.kernel.org/r/20180911095127.gt24...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/cpu.c | 7 +++
 1 file changed, 7 insertions(+)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 0097acec1c71..08c168b159da 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -616,6 +616,12 @@ static void cpuhp_thread_fun(unsigned int cpu)
 */
smp_mb();
 
+   /*
+* The BP holds the hotplug lock, but we're now running on the AP,
+* ensure that anybody asserting the lock is held, will actually find
+* it so.
+*/
+   rwsem_acquire(_hotplug_lock.rw_sem.dep_map, 0, 0, _THIS_IP_);
cpuhp_lock_acquire(bringup);
 
if (st->single) {
@@ -661,6 +667,7 @@ static void cpuhp_thread_fun(unsigned int cpu)
}
 
cpuhp_lock_release(bringup);
+   rwsem_release(_hotplug_lock.rw_sem.dep_map, 1, _THIS_IP_);
 
if (!st->should_run)
complete_ap_thread(st, bringup);


[tip:timers/urgent] clocksource: Revert "Remove kthread"

2018-09-06 Thread tip-bot for Peter Zijlstra
Commit-ID:  e2c631ba75a7e727e8db0a9d30a06bfd434adb3a
Gitweb: https://git.kernel.org/tip/e2c631ba75a7e727e8db0a9d30a06bfd434adb3a
Author: Peter Zijlstra 
AuthorDate: Wed, 5 Sep 2018 10:41:58 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 6 Sep 2018 23:38:35 +0200

clocksource: Revert "Remove kthread"

It turns out that the silly spawn-a-kthread-from-a-worker dance was actually needed.

clocksource_watchdog_kthread() cannot be called directly from
clocksource_watchdog_work(), because clocksource_select() calls
timekeeping_notify() which uses stop_machine(). One cannot use
stop_machine() from a workqueue() due to lock inversions wrt CPU hotplug.

Revert the patch but add a comment that explains why we jump through such
apparently silly hoops.

Fixes: 7197e77abcb6 ("clocksource: Remove kthread")
Reported-by: Siegfried Metz 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Tested-by: Niklas Cassel 
Tested-by: Kevin Shanahan 
Tested-by: viktor_jaegerskuep...@freenet.de
Tested-by: Siegfried Metz 
Cc: rafael.j.wyso...@intel.com
Cc: len.br...@intel.com
Cc: diego.vi...@gmail.com
Cc: rui.zh...@intel.com
Cc: bjorn.anders...@linaro.org
Link: https://lkml.kernel.org/r/20180905084158.gr24...@hirez.programming.kicks-ass.net
---
 kernel/time/clocksource.c | 40 ++--
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f74fb00d8064..0e6e97a01942 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags)
spin_unlock_irqrestore(_lock, *flags);
 }
 
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
 /*
  * Interval: 0.5sec Threshold: 0.0625s
  */
 #define WATCHDOG_INTERVAL (HZ >> 1)
 #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
 
+static void clocksource_watchdog_work(struct work_struct *work)
+{
+   /*
+* We cannot directly run clocksource_watchdog_kthread() here, because
+* clocksource_select() calls timekeeping_notify() which uses
+* stop_machine(). One cannot use stop_machine() from a workqueue() due
+* lock inversions wrt CPU hotplug.
+*
+* Also, we only ever run this work once or twice during the lifetime
+* of the kernel, so there is no point in creating a more permanent
+* kthread for this.
+*
+* If kthread_run fails the next watchdog scan over the
+* watchdog_list will find the unstable clock again.
+*/
+   kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+
 static void __clocksource_unstable(struct clocksource *cs)
 {
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
 
/*
-* If the clocksource is registered clocksource_watchdog_work() will
+* If the clocksource is registered clocksource_watchdog_kthread() will
 * re-rate and re-select.
 */
if (list_empty(>list)) {
@@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs)
if (cs->mark_unstable)
cs->mark_unstable(cs);
 
-   /* kick clocksource_watchdog_work() */
+   /* kick clocksource_watchdog_kthread() */
if (finished_booting)
schedule_work(_work);
 }
@@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs)
  * @cs:clocksource to be marked unstable
  *
  * This function is called by the x86 TSC code to mark clocksources as unstable;
- * it defers demotion and re-selection to a work.
+ * it defers demotion and re-selection to a kthread.
  */
 void clocksource_mark_unstable(struct clocksource *cs)
 {
@@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
}
 }
 
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
-
-static int __clocksource_watchdog_work(void)
+static int __clocksource_watchdog_kthread(void)
 {
struct clocksource *cs, *tmp;
unsigned long flags;
@@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void)
return select;
 }
 
-static void clocksource_watchdog_work(struct work_struct *work)
+static int clocksource_watchdog_kthread(void *data)
 {
mutex_lock(_mutex);
-   if (__clocksource_watchdog_work())
+   if (__clocksource_watchdog_kthread())
clocksource_select();
mutex_unlock(_mutex);
+   return 0;
 }
 
 static bool clocksource_is_watchdog(struct clocksource *cs)
@@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
 static void clocksource_select_watchdog(bool fallback) { }
 static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
-static 

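The pattern the revert restores is a callback that must not do heavyweight,
blocking work itself, so it hands off to a short-lived, fire-and-forget
thread. A userspace pthread analogy (names invented for illustration; this is
not the kernel code above):

#include <pthread.h>
#include <stdio.h>

static void *watchdog_helper(void *arg)
{
	/* Heavyweight part: in the kernel this is where clocksources get
	 * re-rated and re-selected. */
	puts("re-rate and re-select");
	return NULL;
}

/* Stands in for clocksource_watchdog_work(): it runs in a context that
 * must not do the heavy work directly, so it spawns a one-shot helper. */
static void watchdog_work(void)
{
	pthread_t tid;

	/* If creation fails, do nothing: the next scan simply retries. */
	if (pthread_create(&tid, NULL, watchdog_helper, NULL) == 0)
		pthread_detach(tid);
}

int main(void)
{
	watchdog_work();
	pthread_exit(NULL);	/* let the detached helper run to completion */
}
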
[tip:timers/urgent] clocksource: Revert "Remove kthread"

2018-09-06 Thread tip-bot for Peter Zijlstra
Commit-ID:  760902b24960679c2e8592de3a56359d2c205731
Gitweb: https://git.kernel.org/tip/760902b24960679c2e8592de3a56359d2c205731
Author: Peter Zijlstra 
AuthorDate: Wed, 5 Sep 2018 10:41:58 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 6 Sep 2018 12:42:28 +0200

clocksource: Revert "Remove kthread"

It turns out that the silly spawn-a-kthread-from-a-worker dance was actually needed.

clocksource_watchdog_kthread() cannot be called directly from
clocksource_watchdog_work(), because clocksource_select() calls
timekeeping_notify() which uses stop_machine(). One cannot use
stop_machine() from a workqueue() due to lock inversions wrt CPU hotplug.

Revert the patch but add a comment that explains why we jump through such
apparently silly hoops.

Fixes: 7197e77abcb6 ("clocksource: Remove kthread")
Reported-by: Siegfried Metz 
Signed-off-by: Peter Zijlstra (Intel) 
Tested-by: Niklas Cassel 
Tested-by: Kevin Shanahan 
Tested-by: viktor_jaegerskuep...@freenet.de
Tested-by: Siegfried Metz 
Cc: rafael.j.wyso...@intel.com
Cc: len.br...@intel.com
Cc: diego.vi...@gmail.com
Cc: rui.zh...@intel.com
Cc: bjorn.anders...@linaro.org
Link: https://lkml.kernel.org/r/20180905084158.gr24...@hirez.programming.kicks-ass.net
---
 kernel/time/clocksource.c | 40 ++--
 1 file changed, 30 insertions(+), 10 deletions(-)

diff --git a/kernel/time/clocksource.c b/kernel/time/clocksource.c
index f74fb00d8064..0e6e97a01942 100644
--- a/kernel/time/clocksource.c
+++ b/kernel/time/clocksource.c
@@ -133,19 +133,40 @@ static void inline clocksource_watchdog_unlock(unsigned long *flags)
spin_unlock_irqrestore(_lock, *flags);
 }
 
+static int clocksource_watchdog_kthread(void *data);
+static void __clocksource_change_rating(struct clocksource *cs, int rating);
+
 /*
  * Interval: 0.5sec Threshold: 0.0625s
  */
 #define WATCHDOG_INTERVAL (HZ >> 1)
 #define WATCHDOG_THRESHOLD (NSEC_PER_SEC >> 4)
 
+static void clocksource_watchdog_work(struct work_struct *work)
+{
+   /*
+* We cannot directly run clocksource_watchdog_kthread() here, because
+* clocksource_select() calls timekeeping_notify() which uses
+* stop_machine(). One cannot use stop_machine() from a workqueue() due
+* lock inversions wrt CPU hotplug.
+*
+* Also, we only ever run this work once or twice during the lifetime
+* of the kernel, so there is no point in creating a more permanent
+* kthread for this.
+*
+* If kthread_run fails the next watchdog scan over the
+* watchdog_list will find the unstable clock again.
+*/
+   kthread_run(clocksource_watchdog_kthread, NULL, "kwatchdog");
+}
+
 static void __clocksource_unstable(struct clocksource *cs)
 {
cs->flags &= ~(CLOCK_SOURCE_VALID_FOR_HRES | CLOCK_SOURCE_WATCHDOG);
cs->flags |= CLOCK_SOURCE_UNSTABLE;
 
/*
-* If the clocksource is registered clocksource_watchdog_work() will
+* If the clocksource is registered clocksource_watchdog_kthread() will
 * re-rate and re-select.
 */
if (list_empty(>list)) {
@@ -156,7 +177,7 @@ static void __clocksource_unstable(struct clocksource *cs)
if (cs->mark_unstable)
cs->mark_unstable(cs);
 
-   /* kick clocksource_watchdog_work() */
+   /* kick clocksource_watchdog_kthread() */
if (finished_booting)
schedule_work(_work);
 }
@@ -166,7 +187,7 @@ static void __clocksource_unstable(struct clocksource *cs)
  * @cs:clocksource to be marked unstable
  *
  * This function is called by the x86 TSC code to mark clocksources as unstable;
- * it defers demotion and re-selection to a work.
+ * it defers demotion and re-selection to a kthread.
  */
 void clocksource_mark_unstable(struct clocksource *cs)
 {
@@ -391,9 +412,7 @@ static void clocksource_dequeue_watchdog(struct clocksource *cs)
}
 }
 
-static void __clocksource_change_rating(struct clocksource *cs, int rating);
-
-static int __clocksource_watchdog_work(void)
+static int __clocksource_watchdog_kthread(void)
 {
struct clocksource *cs, *tmp;
unsigned long flags;
@@ -418,12 +437,13 @@ static int __clocksource_watchdog_work(void)
return select;
 }
 
-static void clocksource_watchdog_work(struct work_struct *work)
+static int clocksource_watchdog_kthread(void *data)
 {
mutex_lock(_mutex);
-   if (__clocksource_watchdog_work())
+   if (__clocksource_watchdog_kthread())
clocksource_select();
mutex_unlock(_mutex);
+   return 0;
 }
 
 static bool clocksource_is_watchdog(struct clocksource *cs)
@@ -442,7 +462,7 @@ static void clocksource_enqueue_watchdog(struct clocksource *cs)
 static void clocksource_select_watchdog(bool fallback) { }
 static inline void clocksource_dequeue_watchdog(struct clocksource *cs) { }
 static inline void clocksource_resume_watchdog(void) { }
-static inline int 

[tip:sched/core] stop_machine: Reflow cpu_stop_queue_two_works()

2018-08-02 Thread tip-bot for Peter Zijlstra
Commit-ID:  b80a2bfce85e1051056d98d04ecb2d0b55cbbc1c
Gitweb: https://git.kernel.org/tip/b80a2bfce85e1051056d98d04ecb2d0b55cbbc1c
Author: Peter Zijlstra 
AuthorDate: Mon, 30 Jul 2018 13:21:40 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 2 Aug 2018 15:25:20 +0200

stop_machine: Reflow cpu_stop_queue_two_works()

The code flow in cpu_stop_queue_two_works() is a little arcane; fix this by
lifting the preempt_disable() to the top to create more natural nesting wrt
the spinlocks and make the wake_up_q() and preempt_enable() unconditional
at the end.

Furthermore, enable preemption in the -EDEADLK case, such that we spin-wait
with preemption enabled.

Suggested-by: Thomas Gleixner 
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Cc: Sebastian Andrzej Siewior 
Cc: isa...@codeaurora.org
Cc: m...@codeblueprint.co.uk
Cc: psoda...@codeaurora.org
Cc: gre...@linuxfoundation.org
Cc: pkond...@codeaurora.org
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/20180730112140.gh2...@hirez.programming.kicks-ass.net
---
 kernel/stop_machine.c | 41 +++--
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e190d1ef3a23..34b6652e8677 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -236,13 +236,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct 
cpu_stop_work *work1,
struct cpu_stopper *stopper2 = per_cpu_ptr(_stopper, cpu2);
DEFINE_WAKE_Q(wakeq);
int err;
+
 retry:
+   /*
+* The waking up of stopper threads has to happen in the same
+* scheduling context as the queueing.  Otherwise, there is a
+* possibility of one of the above stoppers being woken up by another
+* CPU, and preempting us. This will cause us to not wake up the other
+* stopper forever.
+*/
+   preempt_disable();
raw_spin_lock_irq(>lock);
raw_spin_lock_nested(>lock, SINGLE_DEPTH_NESTING);
 
-   err = -ENOENT;
-   if (!stopper1->enabled || !stopper2->enabled)
+   if (!stopper1->enabled || !stopper2->enabled) {
+   err = -ENOENT;
goto unlock;
+   }
+
/*
 * Ensure that if we race with __stop_cpus() the stoppers won't get
 * queued up in reverse order leading to system deadlock.
@@ -253,36 +264,30 @@ retry:
 * It can be falsely true but it is safe to spin until it is cleared,
 * queue_stop_cpus_work() does everything under preempt_disable().
 */
-   err = -EDEADLK;
-   if (unlikely(stop_cpus_in_progress))
-   goto unlock;
+   if (unlikely(stop_cpus_in_progress)) {
+   err = -EDEADLK;
+   goto unlock;
+   }
 
err = 0;
__cpu_stop_queue_work(stopper1, work1, &wakeq);
__cpu_stop_queue_work(stopper2, work2, &wakeq);
-   /*
-* The waking up of stopper threads has to happen
-* in the same scheduling context as the queueing.
-* Otherwise, there is a possibility of one of the
-* above stoppers being woken up by another CPU,
-* and preempting us. This will cause us to n ot
-* wake up the other stopper forever.
-*/
-   preempt_disable();
+
 unlock:
raw_spin_unlock(&stopper2->lock);
raw_spin_unlock_irq(&stopper1->lock);
 
if (unlikely(err == -EDEADLK)) {
+   preempt_enable();
+
while (stop_cpus_in_progress)
cpu_relax();
+
goto retry;
}
 
-   if (!err) {
-   wake_up_q(&wakeq);
-   preempt_enable();
-   }
+   wake_up_q(&wakeq);
+   preempt_enable();
 
return err;
 }
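
The shape the patch ends up with -- disable preemption first, queue both works under the nested locks, drop the locks, and only then do the wake-ups and re-enable, unconditionally -- can be sketched outside the stopper code. Below is a small user-space approximation of that flow; fake_preempt_disable(), queue_one(), wake_all(), in_progress and the two pthread mutexes are invented stand-ins for the kernel primitives, not real APIs.

#include <errno.h>
#include <pthread.h>
#include <stdbool.h>

static pthread_mutex_t lock1 = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t lock2 = PTHREAD_MUTEX_INITIALIZER;
static volatile bool in_progress;          /* stand-in for stop_cpus_in_progress */

static void fake_preempt_disable(void) { } /* no user-space equivalent; placeholder */
static void fake_preempt_enable(void)  { }
static void queue_one(int which)       { (void)which; } /* pretend to queue one work */
static void wake_all(void)             { }              /* pretend to wake the workers */

static int queue_two(void)
{
	int err;
retry:
	/* Wake-ups must happen in the same context as the queueing. */
	fake_preempt_disable();
	pthread_mutex_lock(&lock1);
	pthread_mutex_lock(&lock2);

	if (in_progress) {              /* models the -EDEADLK race */
		err = -EDEADLK;
		goto unlock;
	}

	err = 0;
	queue_one(1);
	queue_one(2);

unlock:
	pthread_mutex_unlock(&lock2);
	pthread_mutex_unlock(&lock1);

	if (err == -EDEADLK) {
		/* Spin-wait with "preemption" enabled, then try again. */
		fake_preempt_enable();
		while (in_progress)
			;
		goto retry;
	}

	/* Unconditional tail: wake and re-enable exactly once per successful pass. */
	wake_all();
	fake_preempt_enable();
	return err;
}

int main(void)
{
	return queue_two();
}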


[tip:sched/core] stop_machine: Reflow cpu_stop_queue_two_works()

2018-08-02 Thread tip-bot for Peter Zijlstra
Commit-ID:  2171ce2d470d6e389ebbef3edd22c7643918a02f
Gitweb: https://git.kernel.org/tip/2171ce2d470d6e389ebbef3edd22c7643918a02f
Author: Peter Zijlstra 
AuthorDate: Mon, 30 Jul 2018 13:21:40 +0200
Committer:  Thomas Gleixner 
CommitDate: Thu, 2 Aug 2018 14:02:53 +0200

stop_machine: Reflow cpu_stop_queue_two_works()

The code flow in cpu_stop_queue_two_works() is a little arcane; fix this by
lifting the preempt_disable() to the top to create more natural nesting wrt
the spinlocks and make the wake_up_q() and preempt_enable() unconditional
at the end.

Furthermore, enable preemption in the -EDEADLK case, such that we spin-wait
with preemption enabled.

Suggested-by: Thomas Gleixner 
Signed-off-by: Peter Zijlstra (Intel) 
Cc: Sebastian Andrzej Siewior 
Cc: isa...@codeaurora.org
Cc: m...@codeblueprint.co.uk
Cc: psoda...@codeaurora.org
Cc: gre...@linuxfoundation.org
Cc: pkond...@codeaurora.org
Cc: sta...@vger.kernel.org
Link: 
https://lkml.kernel.org/r/20180730112140.gh2...@hirez.programming.kicks-ass.net
---
 kernel/stop_machine.c | 41 +++--
 1 file changed, 23 insertions(+), 18 deletions(-)

diff --git a/kernel/stop_machine.c b/kernel/stop_machine.c
index e190d1ef3a23..34b6652e8677 100644
--- a/kernel/stop_machine.c
+++ b/kernel/stop_machine.c
@@ -236,13 +236,24 @@ static int cpu_stop_queue_two_works(int cpu1, struct 
cpu_stop_work *work1,
struct cpu_stopper *stopper2 = per_cpu_ptr(&cpu_stopper, cpu2);
DEFINE_WAKE_Q(wakeq);
int err;
+
 retry:
+   /*
+* The waking up of stopper threads has to happen in the same
+* scheduling context as the queueing.  Otherwise, there is a
+* possibility of one of the above stoppers being woken up by another
+* CPU, and preempting us. This will cause us to not wake up the other
+* stopper forever.
+*/
+   preempt_disable();
raw_spin_lock_irq(&stopper1->lock);
raw_spin_lock_nested(&stopper2->lock, SINGLE_DEPTH_NESTING);
 
-   err = -ENOENT;
-   if (!stopper1->enabled || !stopper2->enabled)
+   if (!stopper1->enabled || !stopper2->enabled) {
+   err = -ENOENT;
goto unlock;
+   }
+
/*
 * Ensure that if we race with __stop_cpus() the stoppers won't get
 * queued up in reverse order leading to system deadlock.
@@ -253,36 +264,30 @@ retry:
 * It can be falsely true but it is safe to spin until it is cleared,
 * queue_stop_cpus_work() does everything under preempt_disable().
 */
-   err = -EDEADLK;
-   if (unlikely(stop_cpus_in_progress))
-   goto unlock;
+   if (unlikely(stop_cpus_in_progress)) {
+   err = -EDEADLK;
+   goto unlock;
+   }
 
err = 0;
__cpu_stop_queue_work(stopper1, work1, &wakeq);
__cpu_stop_queue_work(stopper2, work2, &wakeq);
-   /*
-* The waking up of stopper threads has to happen
-* in the same scheduling context as the queueing.
-* Otherwise, there is a possibility of one of the
-* above stoppers being woken up by another CPU,
-* and preempting us. This will cause us to n ot
-* wake up the other stopper forever.
-*/
-   preempt_disable();
+
 unlock:
raw_spin_unlock(&stopper2->lock);
raw_spin_unlock_irq(&stopper1->lock);
 
if (unlikely(err == -EDEADLK)) {
+   preempt_enable();
+
while (stop_cpus_in_progress)
cpu_relax();
+
goto retry;
}
 
-   if (!err) {
-   wake_up_q(&wakeq);
-   preempt_enable();
-   }
+   wake_up_q(&wakeq);
+   preempt_enable();
 
return err;
 }


[tip:x86/timers] sched/clock: Close a hole in sched_clock_init()

2018-07-20 Thread tip-bot for Peter Zijlstra
Commit-ID:  9407f5a7ee77c631d1e100436132437cf6237e45
Gitweb: https://git.kernel.org/tip/9407f5a7ee77c631d1e100436132437cf6237e45
Author: Peter Zijlstra 
AuthorDate: Fri, 20 Jul 2018 10:09:11 +0200
Committer:  Thomas Gleixner 
CommitDate: Fri, 20 Jul 2018 11:58:00 +0200

sched/clock: Close a hole in sched_clock_init()

All data required for the 'unstable' sched_clock must be set-up _before_
enabling it -- setting sched_clock_running. This includes the
__gtod_offset but also a recent scd stamp.

Make the gtod-offset update also set the scd stamp -- it requires the
same two clock reads _anyway_. This doesn't hurt in the
sched_clock_tick_stable() case and ensures sched_clock_init() gets
everything set-up before use.

Also switch to unconditional IRQ-disable/enable because the static key
stuff already requires that this is not run with IRQs disabled.

Fixes: 857baa87b642 ("sched/clock: Enable sched clock early")
Signed-off-by: Peter Zijlstra (Intel) 
Signed-off-by: Thomas Gleixner 
Cc: Pavel Tatashin 
Cc: steven.sist...@oracle.com
Cc: daniel.m.jor...@oracle.com
Cc: li...@armlinux.org.uk
Cc: schwidef...@de.ibm.com
Cc: heiko.carst...@de.ibm.com
Cc: john.stu...@linaro.org
Cc: sb...@codeaurora.org
Cc: h...@zytor.com
Cc: douly.f...@cn.fujitsu.com
Cc: pra...@redhat.com
Cc: feng.t...@intel.com
Cc: pmla...@suse.com
Cc: gno...@lxorguk.ukuu.org.uk
Cc: linux-s...@vger.kernel.org
Cc: boris.ostrov...@oracle.com
Cc: jgr...@suse.com
Cc: pbonz...@redhat.com
Link: 
https://lkml.kernel.org/r/20180720080911.gm2...@hirez.programming.kicks-ass.net
---
 kernel/sched/clock.c | 16 ++--
 1 file changed, 6 insertions(+), 10 deletions(-)

diff --git a/kernel/sched/clock.c b/kernel/sched/clock.c
index c5c47ad3f386..811a39aca1ce 100644
--- a/kernel/sched/clock.c
+++ b/kernel/sched/clock.c
@@ -197,13 +197,14 @@ void clear_sched_clock_stable(void)
 
 static void __sched_clock_gtod_offset(void)
 {
-   __gtod_offset = (sched_clock() + __sched_clock_offset) - ktime_get_ns();
+   struct sched_clock_data *scd = this_scd();
+
+   __scd_stamp(scd);
+   __gtod_offset = (scd->tick_raw + __sched_clock_offset) - scd->tick_gtod;
 }
 
 void __init sched_clock_init(void)
 {
-   unsigned long flags;
-
/*
 * Set __gtod_offset such that once we mark sched_clock_running,
 * sched_clock_tick() continues where sched_clock() left off.
@@ -211,16 +212,11 @@ void __init sched_clock_init(void)
 * Even if TSC is buggered, we're still UP at this point so it
 * can't really be out of sync.
 */
-   local_irq_save(flags);
+   local_irq_disable();
__sched_clock_gtod_offset();
-   local_irq_restore(flags);
+   local_irq_enable();
 
static_branch_inc(&sched_clock_running);
-
-   /* Now that sched_clock_running is set adjust scd */
-   local_irq_save(flags);
-   sched_clock_tick();
-   local_irq_restore(flags);
 }
 /*
  * We run this as late_initcall() such that it runs after all built-in drivers,
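
To see what the reworked __sched_clock_gtod_offset() guarantees: one scd stamp (two clock reads) is enough to pick __gtod_offset such that the raw and gtod views of the clock agree at that instant, so sched_clock_tick() continues where sched_clock() left off. The toy program below only replays that arithmetic with made-up numbers; the variable names mirror the kernel ones, but none of this is kernel code.

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Pretend readings taken back-to-back, in nanoseconds. */
	uint64_t tick_raw  = 1000;          /* sched_clock(), e.g. TSC-based   */
	uint64_t tick_gtod = 900;           /* ktime_get_ns(), the gtod clock  */
	uint64_t sched_clock_offset = 50;   /* early-boot continuity offset    */

	/* What __sched_clock_gtod_offset() now computes from one stamp: */
	uint64_t gtod_offset = (tick_raw + sched_clock_offset) - tick_gtod;

	/*
	 * Both views of the clock agree at the stamp instant, so once
	 * sched_clock_running is set the unstable path picks up exactly
	 * where the early clock left off.
	 */
	assert(tick_raw + sched_clock_offset == tick_gtod + gtod_offset);
	printf("__gtod_offset = %llu ns\n", (unsigned long long)gtod_offset);
	return 0;
}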


[tip:sched/core] sched/cpufreq: Clarify sugov_get_util()

2018-07-15 Thread tip-bot for Peter Zijlstra
Commit-ID:  45f5519ec55e75af3565dd737586d3b041834f71
Gitweb: https://git.kernel.org/tip/45f5519ec55e75af3565dd737586d3b041834f71
Author: Peter Zijlstra 
AuthorDate: Thu, 5 Jul 2018 14:36:17 +0200
Committer:  Ingo Molnar 
CommitDate: Mon, 16 Jul 2018 00:16:29 +0200

sched/cpufreq: Clarify sugov_get_util()

Add a few comments to (hopefully) clarify some of the magic in
sugov_get_util().

Signed-off-by: Peter Zijlstra (Intel) 
Acked-by: Viresh Kumar 
Cc: Linus Torvalds 
Cc: morten.rasmus...@arm.com
Cc: Peter Zijlstra 
Cc: Thomas Gleixner 
Cc: Vincent Guittot 
Cc: clau...@evidence.eu.com
Cc: daniel.lezc...@linaro.org
Cc: dietmar.eggem...@arm.com
Cc: j...@joelfernandes.org
Cc: juri.le...@redhat.com
Cc: luca.ab...@santannapisa.it
Cc: patrick.bell...@arm.com
Cc: quentin.per...@arm.com
Cc: r...@rjwysocki.net
Cc: valentin.schnei...@arm.com
Link: 
http://lkml.kernel.org/r/20180705123617.gm2...@hirez.programming.kicks-ass.net
Signed-off-by: Ingo Molnar 
---
 kernel/sched/cpufreq_schedutil.c | 75 +---
 1 file changed, 54 insertions(+), 21 deletions(-)

diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c
index c9622b3f183d..97dcd4472a0e 100644
--- a/kernel/sched/cpufreq_schedutil.c
+++ b/kernel/sched/cpufreq_schedutil.c
@@ -177,6 +177,26 @@ static unsigned int get_next_freq(struct sugov_policy 
*sg_policy,
return cpufreq_driver_resolve_freq(policy, freq);
 }
 
+/*
+ * This function computes an effective utilization for the given CPU, to be
+ * used for frequency selection given the linear relation: f = u * f_max.
+ *
+ * The scheduler tracks the following metrics:
+ *
+ *   cpu_util_{cfs,rt,dl,irq}()
+ *   cpu_bw_dl()
+ *
+ * Where the cfs,rt and dl util numbers are tracked with the same metric and
+ * synchronized windows and are thus directly comparable.
+ *
+ * The cfs,rt,dl utilization are the running times measured with rq->clock_task
+ * which excludes things like IRQ and steal-time. These latter are then accrued
+ * in the irq utilization.
+ *
+ * The DL bandwidth number otoh is not a measured metric but a value computed
+ * based on the task model parameters and gives the minimal utilization
+ * required to meet deadlines.
+ */
 static unsigned long sugov_get_util(struct sugov_cpu *sg_cpu)
 {
struct rq *rq = cpu_rq(sg_cpu->cpu);
@@ -188,47 +208,60 @@ static unsigned long sugov_get_util(struct sugov_cpu 
*sg_cpu)
if (rt_rq_is_runnable(&rq->rt))
return max;
 
+   /*
+* Early check to see if IRQ/steal time saturates the CPU, can be
+* because of inaccuracies in how we track these -- see
+* update_irq_load_avg().
+*/
irq = cpu_util_irq(rq);
-
if (unlikely(irq >= max))
return max;
 
-   /* Sum rq utilization */
+   /*
+* Because the time spend on RT/DL tasks is visible as 'lost' time to
+* CFS tasks and we use the same metric to track the effective
+* utilization (PELT windows are synchronized) we can directly add them
+* to obtain the CPU's actual utilization.
+*/
util = cpu_util_cfs(rq);
util += cpu_util_rt(rq);
 
/*
-* Interrupt time is not seen by RQS utilization so we can compare
-* them with the CPU capacity
+* We do not make cpu_util_dl() a permanent part of this sum because we
+* want to use cpu_bw_dl() later on, but we need to check if the
+* CFS+RT+DL sum is saturated (ie. no idle time) such that we select
+* f_max when there is no idle time.
+*
+* NOTE: numerical errors or stop class might cause us to not quite hit
+* saturation when we should -- something for later.
 */
if ((util + cpu_util_dl(rq)) >= max)
return max;
 
/*
-* As there is still idle time on the CPU, we need to compute the
-* utilization level of the CPU.
+* There is still idle time; further improve the number by using the
+* irq metric. Because IRQ/steal time is hidden from the task clock we
+* need to scale the task numbers:
 *
+ *              1 - irq
+ *   U' = irq + ------- * U
+ *                max
+ */
+   util *= (max - irq);
+   util /= max;
+   util += irq;
+
+   /*
 * Bandwidth required by DEADLINE must always be granted while, for
 * FAIR and RT, we use blocked utilization of IDLE CPUs as a mechanism
 * to gracefully reduce the frequency when no tasks show up for longer
 * periods of time.
 *
-* Ideally we would like to set util_dl as min/guaranteed freq and
-* util_cfs + util_dl as requested freq. However, cpufreq is not yet
-* ready for such an interface. So, we only do the latter for now.
+* Ideally we would like to set bw_dl as min/guaranteed freq and util +
+* bw_dl as requested freq. However, 
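
As a worked example of the composition described in the new comment, assuming a CPU capacity of 1024: effective_util() below is an invented stand-alone helper that mirrors the steps shown in the hunk (saturation checks, cfs+rt sum, IRQ scaling); it is not sugov_get_util() itself, and the DL-bandwidth step is omitted because the hunk is truncated above.

#include <stdio.h>

static unsigned long effective_util(unsigned long cfs, unsigned long rt,
				    unsigned long dl, unsigned long irq,
				    unsigned long max)
{
	unsigned long util;

	if (irq >= max)             /* IRQ/steal time alone saturates the CPU */
		return max;

	util = cfs + rt;            /* same PELT metric, directly additive */

	if (util + dl >= max)       /* no idle time left: ask for f_max */
		return max;

	/* Scale for the time hidden from the task clock: U' = irq + U*(max-irq)/max */
	util = util * (max - irq) / max;
	util += irq;

	return util;                /* DL bandwidth handling follows in the patch */
}

int main(void)
{
	/* Example: capacity 1024, cfs=300, rt=100, dl=50, irq=128. */
	printf("U' = %lu\n", effective_util(300, 100, 50, 128, 1024));
	/* (300+100) * (1024-128)/1024 + 128 = 400*896/1024 + 128 = 350 + 128 = 478 */
	return 0;
}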
