[PATCH RT 4/8] sched: migrate disable: Protect cpus_ptr with lock

2019-07-26 Thread Scott Wood
Various places assume that cpus_ptr is protected by rq/pi locks,
so don't change it before grabbing those locks.

Signed-off-by: Scott Wood 
---
 kernel/sched/core.c | 6 ++
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 99a3cfccf4d3..38a9a9df5638 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7283,9 +7283,8 @@ void dump_cpu_task(int cpu)
struct rq *rq;
struct rq_flags rf;
 
-   p->cpus_ptr = cpumask_of(smp_processor_id());
-
rq = task_rq_lock(p, &rf);
+   p->cpus_ptr = cpumask_of(smp_processor_id());
update_nr_migratory(p, -1);
p->nr_cpus_allowed = 1;
task_rq_unlock(rq, p, &rf);
@@ -7297,9 +7296,8 @@ void dump_cpu_task(int cpu)
struct rq *rq;
struct rq_flags rf;
 
-   p->cpus_ptr = &p->cpus_mask;
-
rq = task_rq_lock(p, &rf);
+   p->cpus_ptr = &p->cpus_mask;
p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask);
update_nr_migratory(p, 1);
task_rq_unlock(rq, p, &rf);
-- 
1.8.3.1



[PATCH RT 1/8] sched: migrate_enable: Use sleeping_lock to indicate involuntary sleep

2019-07-26 Thread Scott Wood
Without this, rcu_note_context_switch() will complain if an RCU read
lock is held when migrate_enable() calls stop_one_cpu().

Signed-off-by: Scott Wood 
---
 include/linux/sched.h| 4 ++--
 kernel/rcu/tree_plugin.h | 2 +-
 kernel/sched/core.c  | 2 ++
 3 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4e218f8d8048..ad23ab939b35 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -673,7 +673,7 @@ struct task_struct {
int migrate_disable_atomic;
 # endif
 #endif
-#ifdef CONFIG_PREEMPT_RT_FULL
+#ifdef CONFIG_PREEMPT_RT_BASE
int sleeping_lock;
 #endif
 
@@ -1873,7 +1873,7 @@ static __always_inline bool need_resched(void)
return unlikely(tif_need_resched());
 }
 
-#ifdef CONFIG_PREEMPT_RT_FULL
+#ifdef CONFIG_PREEMPT_RT_BASE
 static inline void sleeping_lock_inc(void)
 {
current->sleeping_lock++;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 23a54e4b649c..7a3aa085ce2c 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -292,7 +292,7 @@ void rcu_note_context_switch(bool preempt)
barrier(); /* Avoid RCU read-side critical sections leaking down. */
trace_rcu_utilization(TPS("Start context switch"));
lockdep_assert_irqs_disabled();
-#if defined(CONFIG_PREEMPT_RT_FULL)
+#if defined(CONFIG_PREEMPT_RT_BASE)
sleeping_l = t->sleeping_lock;
 #endif
WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l);
diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index d3c6542b306f..c3407707e367 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7405,7 +7405,9 @@ void migrate_enable(void)
unpin_current_cpu();
preempt_lazy_enable();
preempt_enable();
+   sleeping_lock_inc();
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
+   sleeping_lock_dec();
return;
}
}
-- 
1.8.3.1



[PATCH RT 6/8] sched: migrate_enable: Set state to TASK_RUNNING

2019-07-26 Thread Scott Wood
If migrate_enable() is called while a task is preparing to sleep
(state != TASK_RUNNING), that triggers a debug check in stop_one_cpu().
Explicitly reset state to acknowledge that we're accepting the spurious
wakeup.

Signed-off-by: Scott Wood 
---
 kernel/sched/core.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 38a9a9df5638..eb27a9bf70d7 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7396,6 +7396,14 @@ void migrate_enable(void)
unpin_current_cpu();
preempt_lazy_enable();
preempt_enable();
+
+   /*
+* Avoid sleeping with an existing non-running
+* state.  This will result in a spurious wakeup
+* for the calling context.
+*/
+   __set_current_state(TASK_RUNNING);
+
sleeping_lock_inc();
stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
sleeping_lock_dec();
-- 
1.8.3.1



[PATCH] raid1: factor out a common routine to handle the completion of sync write

2019-07-26 Thread Hou Tao
It's just code clean-up.

Signed-off-by: Hou Tao 
---
 drivers/md/raid1.c | 39 ++-
 1 file changed, 18 insertions(+), 21 deletions(-)

diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 1755d2233e4d..d73ed94764c1 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1904,6 +1904,22 @@ static void abort_sync_write(struct mddev *mddev, struct 
r1bio *r1_bio)
} while (sectors_to_go > 0);
 }
 
+static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate)
+{
+   if (atomic_dec_and_test(&r1_bio->remaining)) {
+   struct mddev *mddev = r1_bio->mddev;
+   int s = r1_bio->sectors;
+
+   if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
+   test_bit(R1BIO_WriteError, &r1_bio->state))
+   reschedule_retry(r1_bio);
+   else {
+   put_buf(r1_bio);
+   md_done_sync(mddev, s, uptodate);
+   }
+   }
+}
+
 static void end_sync_write(struct bio *bio)
 {
int uptodate = !bio->bi_status;
@@ -1930,16 +1946,7 @@ static void end_sync_write(struct bio *bio)
)
set_bit(R1BIO_MadeGood, _bio->state);
 
-   if (atomic_dec_and_test(&r1_bio->remaining)) {
-   int s = r1_bio->sectors;
-   if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-   test_bit(R1BIO_WriteError, &r1_bio->state))
-   reschedule_retry(r1_bio);
-   else {
-   put_buf(r1_bio);
-   md_done_sync(mddev, s, uptodate);
-   }
-   }
+   put_sync_write_buf(r1_bio, uptodate);
 }
 
 static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector,
@@ -,17 +2229,7 @@ static void sync_request_write(struct mddev *mddev, 
struct r1bio *r1_bio)
generic_make_request(wbio);
}
 
-   if (atomic_dec_and_test(&r1_bio->remaining)) {
-   /* if we're here, all write(s) have completed, so clean up */
-   int s = r1_bio->sectors;
-   if (test_bit(R1BIO_MadeGood, &r1_bio->state) ||
-   test_bit(R1BIO_WriteError, &r1_bio->state))
-   reschedule_retry(r1_bio);
-   else {
-   put_buf(r1_bio);
-   md_done_sync(mddev, s, 1);
-   }
-   }
+   put_sync_write_buf(r1_bio, 1);
 }
 
 /*
-- 
2.22.0



[PATCH RT 5/8] sched/deadline: Reclaim cpuset bandwidth in .migrate_task_rq()

2019-07-26 Thread Scott Wood
With the changes to migrate disabling, ->set_cpus_allowed() no longer
gets deferred until migrate_enable().  To avoid releasing the bandwidth
while the task may still be executing on the old CPU, move the subtraction
to ->migrate_task_rq().

Signed-off-by: Scott Wood 
---
 kernel/sched/deadline.c | 67 +++--
 1 file changed, 31 insertions(+), 36 deletions(-)

diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c
index c18be51f7608..2f18d0cf1b56 100644
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@@ -1606,14 +1606,42 @@ static void yield_task_dl(struct rq *rq)
return cpu;
 }
 
+static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct *p)
+{
+   struct root_domain *src_rd = rq->rd;
+
+   /*
+* Migrating a SCHED_DEADLINE task between exclusive
+* cpusets (different root_domains) entails a bandwidth
+* update. We already made space for us in the destination
+* domain (see cpuset_can_attach()).
+*/
+   if (!cpumask_intersects(src_rd->span, p->cpus_ptr)) {
+   struct dl_bw *src_dl_b;
+
+   src_dl_b = dl_bw_of(cpu_of(rq));
+   /*
+* We now free resources of the root_domain we are migrating
+* off. In the worst case, sched_setattr() may temporary fail
+* until we complete the update.
+*/
+   raw_spin_lock(&src_dl_b->lock);
+   __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
+   raw_spin_unlock(&src_dl_b->lock);
+   }
+}
+
 static void migrate_task_rq_dl(struct task_struct *p, int new_cpu 
__maybe_unused)
 {
struct rq *rq;
 
-   if (p->state != TASK_WAKING)
+   rq = task_rq(p);
+
+   if (p->state != TASK_WAKING) {
+   free_old_cpuset_bw_dl(rq, p);
return;
+   }
 
-   rq = task_rq(p);
/*
 * Since p->state == TASK_WAKING, set_task_cpu() has been called
 * from try_to_wake_up(). Hence, p->pi_lock is locked, but
@@ -2220,39 +2248,6 @@ static void task_woken_dl(struct rq *rq, struct 
task_struct *p)
}
 }
 
-static void set_cpus_allowed_dl(struct task_struct *p,
-   const struct cpumask *new_mask)
-{
-   struct root_domain *src_rd;
-   struct rq *rq;
-
-   BUG_ON(!dl_task(p));
-
-   rq = task_rq(p);
-   src_rd = rq->rd;
-   /*
-* Migrating a SCHED_DEADLINE task between exclusive
-* cpusets (different root_domains) entails a bandwidth
-* update. We already made space for us in the destination
-* domain (see cpuset_can_attach()).
-*/
-   if (!cpumask_intersects(src_rd->span, new_mask)) {
-   struct dl_bw *src_dl_b;
-
-   src_dl_b = dl_bw_of(cpu_of(rq));
-   /*
-* We now free resources of the root_domain we are migrating
-* off. In the worst case, sched_setattr() may temporary fail
-* until we complete the update.
-*/
-   raw_spin_lock(&src_dl_b->lock);
-   __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p)));
-   raw_spin_unlock(&src_dl_b->lock);
-   }
-
-   set_cpus_allowed_common(p, new_mask);
-}
-
 /* Assumes rq->lock is held */
 static void rq_online_dl(struct rq *rq)
 {
@@ -2407,7 +2402,7 @@ static void prio_changed_dl(struct rq *rq, struct 
task_struct *p,
 #ifdef CONFIG_SMP
.select_task_rq = select_task_rq_dl,
.migrate_task_rq= migrate_task_rq_dl,
-   .set_cpus_allowed   = set_cpus_allowed_dl,
+   .set_cpus_allowed   = set_cpus_allowed_common,
.rq_online  = rq_online_dl,
.rq_offline = rq_offline_dl,
.task_woken = task_woken_dl,
-- 
1.8.3.1



[PATCH RT 2/8] sched: __set_cpus_allowed_ptr: Check cpus_mask, not cpus_ptr

2019-07-26 Thread Scott Wood
This function is concerned with the long-term cpu mask, not the
transitory mask the task might have while migrate disabled.  Before
this patch, if a task was migrate disabled at the time
__set_cpus_allowed_ptr() was called, and the new mask happened to be
equal to the cpu that the task was running on, then the mask update
would be lost.

Signed-off-by: Scott Wood 
---
 kernel/sched/core.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index c3407707e367..6e643d656d71 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1218,7 +1218,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
goto out;
}
 
-   if (cpumask_equal(p->cpus_ptr, new_mask))
+   if (cpumask_equal(&p->cpus_mask, new_mask))
goto out;
 
if (!cpumask_intersects(new_mask, cpu_valid_mask)) {
-- 
1.8.3.1



[PATCH RT 7/8] sched: migrate_enable: Use select_fallback_rq()

2019-07-26 Thread Scott Wood
migrate_enable() currently open-codes a variant of select_fallback_rq().
However, it does not have the "No more Mr. Nice Guy" fallback and thus
it will pass an invalid CPU to the migration thread if cpus_mask only
contains a CPU that is !active.

Signed-off-by: Scott Wood 
---
This scenario will be more likely after the next patch, since
the migrate_disable_update check goes away.  However, it could happen
anyway if cpus_mask was updated to a CPU other than the one we were
pinned to, and that CPU subsequently became inactive.
---
 kernel/sched/core.c | 25 ++---
 1 file changed, 10 insertions(+), 15 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index eb27a9bf70d7..3a2d8251a30c 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -7368,6 +7368,7 @@ void migrate_enable(void)
if (p->migrate_disable_update) {
struct rq *rq;
struct rq_flags rf;
+   int cpu = task_cpu(p);
 
rq = task_rq_lock(p, &rf);
update_rq_clock(rq);
@@ -7377,21 +7378,15 @@ void migrate_enable(void)
 
p->migrate_disable_update = 0;
 
-   WARN_ON(smp_processor_id() != task_cpu(p));
-   if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) {
-   const struct cpumask *cpu_valid_mask = cpu_active_mask;
-   struct migration_arg arg;
-   unsigned int dest_cpu;
-
-   if (p->flags & PF_KTHREAD) {
-   /*
-* Kernel threads are allowed on online && 
!active CPUs
-*/
-   cpu_valid_mask = cpu_online_mask;
-   }
-   dest_cpu = cpumask_any_and(cpu_valid_mask, 
&p->cpus_mask);
-   arg.task = p;
-   arg.dest_cpu = dest_cpu;
+   WARN_ON(smp_processor_id() != cpu);
+   if (!cpumask_test_cpu(cpu, &p->cpus_mask)) {
+   struct migration_arg arg = { p };
+   struct rq_flags rf;
+
+   rq = task_rq_lock(p, &rf);
+   update_rq_clock(rq);
+   arg.dest_cpu = select_fallback_rq(cpu, p);
+   task_rq_unlock(rq, p, &rf);
 
unpin_current_cpu();
preempt_lazy_enable();
-- 
1.8.3.1



[PATCH RT 3/8] sched: Remove dead __migrate_disabled() check

2019-07-26 Thread Scott Wood
This code was unreachable given the __migrate_disabled() branch
to "out" immediately beforehand.

Signed-off-by: Scott Wood 
---
 kernel/sched/core.c | 7 ---
 1 file changed, 7 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 6e643d656d71..99a3cfccf4d3 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -1242,13 +1242,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p,
if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p))
goto out;
 
-#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
-   if (__migrate_disabled(p)) {
-   p->migrate_disable_update = 1;
-   goto out;
-   }
-#endif
-
dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask);
if (task_running(rq, p) || p->state == TASK_WAKING) {
struct migration_arg arg = { p, dest_cpu };
-- 
1.8.3.1



[PATCH RT 8/8] sched: Lazy migrate_disable processing

2019-07-26 Thread Scott Wood
Avoid overhead on the majority of migrate disable/enable sequences
by only manipulating scheduler data (and grabbing the relevant locks)
when the task actually schedules while migrate-disabled.  Very large
speedups were seen during a kernel build.

Instead of cpuhp_pin_lock, CPU hotplug is handled by keeping a per-CPU
count of the number of pinned tasks (including tasks which have not
scheduled in the migrate-disabled section); takedown_cpu() will
wait until that reaches zero (confirmed by take_cpu_down() in stop
machine context to deal with races) before migrating tasks off of the
cpu.

To simplify synchronization, updating cpus_mask is no longer deferred
until migrate_enable().  This lets us not have to worry about
migrate_enable() missing the update if it's on the fast path (didn't
schedule during the migrate disabled section).  It also makes the code
a bit simpler and reduces deviation from mainline.

While the main motivation for this is the performance benefit, lazy
migrate disable also eliminates the restriction on calling
migrate_disable() while atomic but leaving the atomic region prior to
calling migrate_enable() -- though this won't help with local_bh_disable()
(and thus rcutorture) unless something similar is done with the recently
added local_lock.

Signed-off-by: Scott Wood 
---
 include/linux/cpu.h|   4 --
 include/linux/sched.h  |  11 +--
 init/init_task.c   |   4 ++
 kernel/cpu.c   |  97 +
 kernel/sched/core.c| 192 -
 kernel/sched/sched.h   |   4 ++
 lib/smp_processor_id.c |   3 +
 7 files changed, 130 insertions(+), 185 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index f4a772c12d14..2df500fdcbc4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -113,8 +113,6 @@ static inline void cpu_maps_update_done(void)
 extern void cpu_hotplug_enable(void);
 void clear_tasks_mm_cpumask(int cpu);
 int cpu_down(unsigned int cpu);
-extern void pin_current_cpu(void);
-extern void unpin_current_cpu(void);
 
 #else /* CONFIG_HOTPLUG_CPU */
 
@@ -126,8 +124,6 @@ static inline void cpus_read_unlock(void) { }
 static inline void lockdep_assert_cpus_held(void) { }
 static inline void cpu_hotplug_disable(void) { }
 static inline void cpu_hotplug_enable(void) { }
-static inline void pin_current_cpu(void) { }
-static inline void unpin_current_cpu(void) { }
 
 #endif /* !CONFIG_HOTPLUG_CPU */
 
diff --git a/include/linux/sched.h b/include/linux/sched.h
index ad23ab939b35..069c46dde15b 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -229,6 +229,8 @@
 extern long io_schedule_timeout(long timeout);
 extern void io_schedule(void);
 
+int cpu_nr_pinned(int cpu);
+
 /**
  * struct prev_cputime - snapshot of system and user cputime
  * @utime: time spent in user mode
@@ -661,16 +663,13 @@ struct task_struct {
cpumask_t   cpus_mask;
 #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
int migrate_disable;
-   int migrate_disable_update;
-   int pinned_on_cpu;
+   boolmigrate_disable_scheduled;
 # ifdef CONFIG_SCHED_DEBUG
-   int migrate_disable_atomic;
+   int pinned_on_cpu;
 # endif
-
 #elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE)
 # ifdef CONFIG_SCHED_DEBUG
int migrate_disable;
-   int migrate_disable_atomic;
 # endif
 #endif
 #ifdef CONFIG_PREEMPT_RT_BASE
@@ -2066,4 +2065,6 @@ static inline void rseq_syscall(struct pt_regs *regs)
 
 #endif
 
+extern struct task_struct *takedown_cpu_task;
+
 #endif
diff --git a/init/init_task.c b/init/init_task.c
index e402413dc47d..c0c7618fd2fb 100644
--- a/init/init_task.c
+++ b/init/init_task.c
@@ -81,6 +81,10 @@ struct task_struct init_task
.cpus_ptr   = &init_task.cpus_mask,
.cpus_mask  = CPU_MASK_ALL,
.nr_cpus_allowed= NR_CPUS,
+#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) && \
+defined(CONFIG_SCHED_DEBUG)
+   .pinned_on_cpu  = -1,
+#endif
.mm = NULL,
.active_mm  = _mm,
.restart_block  = {
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 885a195dfbe0..0096acf1a692 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -76,11 +76,6 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = 
{
.fail = CPUHP_INVALID,
 };
 
-#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL)
-static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \
-   __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock);
-#endif
-
 #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP)
 static struct lockdep_map cpuhp_state_up_map =
STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", &cpuhp_state_up_map);
@@ -287,55 +282,6 @@ void cpu_maps_update_done(void)
 
 #ifdef 

[RT PATCH 0/8] migrate disable fixes and performance

2019-07-26 Thread Scott Wood
With these patches, a kernel build on a 104-cpu machine took around 75%
less wall time and 85% less system time.  Note that there is a difference
in v5.2-rt compared to v5.0-rt.  The performance with these patches is
similar in both cases, but without these patches v5.2-rt is substantially
slower.  In v5.0-rt with a previous version of these patches, lazy
migrate disable reduced kernel build time by around 15-20% wall and
70-75% system.

Scott Wood (8):
  sched: migrate_enable: Use sleeping_lock to indicate involuntary sleep
  sched: __set_cpus_allowed_ptr: Check cpus_mask, not cpus_ptr
  sched: Remove dead __migrate_disabled() check
  sched: migrate disable: Protect cpus_ptr with lock
  sched/deadline: Reclaim cpuset bandwidth in .migrate_task_rq()
  sched: migrate_enable: Set state to TASK_RUNNING
  sched: migrate_enable: Use select_fallback_rq()
  sched: Lazy migrate_disable processing

 include/linux/cpu.h  |   4 -
 include/linux/sched.h|  15 ++--
 init/init_task.c |   4 +
 kernel/cpu.c |  97 ---
 kernel/rcu/tree_plugin.h |   2 +-
 kernel/sched/core.c  | 200 +++
 kernel/sched/deadline.c  |  67 
 kernel/sched/sched.h |   4 +
 lib/smp_processor_id.c   |   3 +
 9 files changed, 166 insertions(+), 230 deletions(-)

-- 
1.8.3.1



[RFC PATCH 00/21] x86/sgx: KVM: Add SGX virtualization

2019-07-26 Thread Sean Christopherson
This is an "early" RFC series for adding SGX virtualization to KVM.  SGX
virtualization (more specifically, EPC virtualization) is dependent on the
not-yet-merged SGX enabling series and so cannot be considered for
inclusion any time soon.

The primary goal of this RFC is to get feedback on the overall approach,
e.g. code location, uAPI changes, functionality, etc...  My hope is to
sort out any major issues sooner rather than later, so that if/when the
base SGX enabling is merged, virtualization support can quickly follow
suit.

That being said, nitpicking and bikeshedding is more than welcome :-)

This code applies on top of a slightly modified version of v21 of the SGX
enabling series[1].  The modifications on top of the SGX series are a few
minor bug fixes that are not related to SGX virtualization, but affect
code that is moved/modified by this series.  The full source for the
modified version of v21 can be found at:

 https://github.com/sean-jc/linux.git

under the tag:

  sgx-v21-ish

A corresponding Qemu RFC will (hopefully) follow next week, the Qemu
patches need a bit more cleanup...

[1] 
https://lkml.kernel.org/r/20190713170804.2340-1-jarkko.sakki...@linux.intel.com

Sean Christopherson (21):
  x86/sgx: Add defines for SGX device minor numbers
  x86/sgx: Move bus registration and device init to common code
  x86/sgx: Move provisioning device to common code
  x86/sgx: Add /dev/sgx/virt_epc device to allocate "raw" EPC for VMs
  x86/sgx: Expose SGX architectural definitions to the kernel
  KVM: x86: Add SGX sub-features leaf to reverse CPUID table
  KVM: x86: Add WARN_ON_ONCE(index!=0) in __do_cpuid_ent
  KVM: x86: Add kvm_x86_ops hook to short circuit emulation
  KVM: VMX: Add basic handling of VM-Exit from SGX enclave
  KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for VMX/SGX
  KVM: x86: Export kvm_propagate_fault (as kvm_propagate_page_fault)
  KVM: x86: Define new #PF SGX error code bit
  x86/sgx: Move the intermediate EINIT helper into the driver
  x86/sgx: Add helpers to expose ECREATE and EINIT to KVM
  KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions
  KVM: VMX: Add emulation of SGX Launch Control LE hash MSRs
  KVM: VMX: Add handler for ENCLS[EINIT] to support SGX Launch Control
  KVM: x86: Invoke kvm_x86_ops->cpuid_update() after kvm_update_cpuid()
  KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC
  x86/sgx: Export sgx_set_attribute() for use by KVM
  KVM: x86: Add capability to grant VM access to privileged SGX
attribute

 Documentation/virtual/kvm/api.txt |  20 ++
 arch/x86/Kconfig  |  13 +
 arch/x86/include/asm/kvm_host.h   |   8 +-
 arch/x86/include/asm/sgx.h|  17 +
 .../cpu/sgx/arch.h => include/asm/sgx_arch.h} |   1 +
 arch/x86/include/asm/vmx.h|   1 +
 arch/x86/include/uapi/asm/vmx.h   |   1 +
 arch/x86/kernel/cpu/sgx/Makefile  |   1 +
 arch/x86/kernel/cpu/sgx/driver/driver.h   |   3 +-
 arch/x86/kernel/cpu/sgx/driver/ioctl.c|  40 ++-
 arch/x86/kernel/cpu/sgx/driver/main.c |  73 +
 arch/x86/kernel/cpu/sgx/encl.c|   2 +-
 arch/x86/kernel/cpu/sgx/encls.h   |   2 +-
 arch/x86/kernel/cpu/sgx/main.c| 141 ++--
 arch/x86/kernel/cpu/sgx/sgx.h |  16 +-
 arch/x86/kernel/cpu/sgx/virt.c| 308 ++
 arch/x86/kernel/cpu/sgx/virt.h|  14 +
 arch/x86/kvm/Makefile |   2 +
 arch/x86/kvm/cpuid.c  | 135 ++--
 arch/x86/kvm/cpuid.h  |  20 ++
 arch/x86/kvm/emulate.c|   1 +
 arch/x86/kvm/mmu.c|  12 -
 arch/x86/kvm/svm.c|  19 +-
 arch/x86/kvm/vmx/nested.c |  21 +-
 arch/x86/kvm/vmx/nested.h |   5 +
 arch/x86/kvm/vmx/sgx.c| 247 ++
 arch/x86/kvm/vmx/sgx.h|  11 +
 arch/x86/kvm/vmx/vmcs12.c |   1 +
 arch/x86/kvm/vmx/vmcs12.h |   4 +-
 arch/x86/kvm/vmx/vmx.c| 251 +-
 arch/x86/kvm/vmx/vmx.h|   6 +
 arch/x86/kvm/x86.c|  40 ++-
 arch/x86/kvm/x86.h|   5 -
 include/uapi/linux/kvm.h  |   1 +
 tools/testing/selftests/x86/sgx/defines.h |   2 +-
 35 files changed, 1234 insertions(+), 210 deletions(-)
 create mode 100644 arch/x86/include/asm/sgx.h
 rename arch/x86/{kernel/cpu/sgx/arch.h => include/asm/sgx_arch.h} (99%)
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.c
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.h
 create mode 100644 arch/x86/kvm/vmx/sgx.c
 create mode 100644 arch/x86/kvm/vmx/sgx.h

-- 
2.22.0



[RFC PATCH 02/21] x86/sgx: Move bus registration and device init to common code

2019-07-26 Thread Sean Christopherson
Move the SGX bus registration and initialization into common code in
preparation for adding a virtual EPC device, which will reside outside
of the native SGX userspace driver.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kernel/cpu/sgx/driver/main.c | 48 +
 arch/x86/kernel/cpu/sgx/main.c| 50 ++-
 arch/x86/kernel/cpu/sgx/sgx.h |  4 +++
 3 files changed, 54 insertions(+), 48 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/driver/main.c 
b/arch/x86/kernel/cpu/sgx/driver/main.c
index a2506a49c95a..d62bdc7ed4d9 100644
--- a/arch/x86/kernel/cpu/sgx/driver/main.c
+++ b/arch/x86/kernel/cpu/sgx/driver/main.c
@@ -158,42 +158,10 @@ const struct file_operations sgx_provision_fops = {
.owner  = THIS_MODULE,
 };
 
-static struct bus_type sgx_bus_type = {
-   .name   = "sgx",
-};
-
 static struct device sgx_encl_dev;
 static struct cdev sgx_encl_cdev;
 static struct device sgx_provision_dev;
 static struct cdev sgx_provision_cdev;
-static dev_t sgx_devt;
-
-static void sgx_dev_release(struct device *dev)
-{
-}
-
-static __init int sgx_dev_init(const char *name, struct device *dev,
-  struct cdev *cdev,
-  const struct file_operations *fops, int minor)
-{
-   int ret;
-
-   device_initialize(dev);
-
-   dev->bus = &sgx_bus_type;
-   dev->devt = MKDEV(MAJOR(sgx_devt), minor);
-   dev->release = sgx_dev_release;
-
-   ret = dev_set_name(dev, name);
-   if (ret) {
-   put_device(dev);
-   return ret;
-   }
-
-   cdev_init(cdev, fops);
-   cdev->owner = THIS_MODULE;
-   return 0;
-}
 
 int __init sgx_drv_init(void)
 {
@@ -207,14 +175,6 @@ int __init sgx_drv_init(void)
return -ENODEV;
}
 
-   ret = bus_register(&sgx_bus_type);
-   if (ret)
-   return ret;
-
-   ret = alloc_chrdev_region(&sgx_devt, 0, SGX_MAX_NR_DEVICES, "sgx");
-   if (ret < 0)
-   goto err_bus;
-
cpuid_count(SGX_CPUID, 0, &eax, &ebx, &ecx, &edx);
sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK;
sgx_encl_size_max_64 = 1ULL << ((edx >> 8) & 0xFF);
@@ -240,7 +200,7 @@ int __init sgx_drv_init(void)
ret = sgx_dev_init("sgx/enclave", &sgx_encl_dev, &sgx_encl_cdev,
   &sgx_encl_fops, SGX_ENCL_DEV_MINOR);
if (ret)
-   goto err_chrdev_region;
+   return ret;
 
ret = sgx_dev_init("sgx/provision", &sgx_provision_dev,
   &sgx_provision_cdev, &sgx_provision_fops,
@@ -277,11 +237,5 @@ int __init sgx_drv_init(void)
 err_encl_dev:
put_device(&sgx_encl_dev);
 
-err_chrdev_region:
-   unregister_chrdev_region(sgx_devt, SGX_MAX_NR_DEVICES);
-
-err_bus:
-   bus_unregister(&sgx_bus_type);
-
return ret;
 }
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index f790a03571c5..edbd465083c7 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -1,6 +1,7 @@
 // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause)
 // Copyright(c) 2016-17 Intel Corporation.
 
+#include 
 #include 
 #include 
 #include 
@@ -329,6 +330,39 @@ static __init int sgx_page_cache_init(void)
return 0;
 }
 
+static struct bus_type sgx_bus_type = {
+   .name   = "sgx",
+};
+static dev_t sgx_devt;
+
+static void sgx_dev_release(struct device *dev)
+{
+
+}
+
+__init int sgx_dev_init(const char *name, struct device *dev,
+   struct cdev *cdev, const struct file_operations *fops,
+   int minor)
+{
+   int ret;
+
+   device_initialize(dev);
+
+   dev->bus = &sgx_bus_type;
+   dev->devt = MKDEV(MAJOR(sgx_devt), minor);
+   dev->release = sgx_dev_release;
+
+   ret = dev_set_name(dev, name);
+   if (ret) {
+   put_device(dev);
+   return ret;
+   }
+
+   cdev_init(cdev, fops);
+   cdev->owner = THIS_MODULE;
+   return 0;
+}
+
 static __init int sgx_init(void)
 {
int ret;
@@ -344,12 +378,26 @@ static __init int sgx_init(void)
if (ret)
goto err_page_cache;
 
-   ret = sgx_drv_init();
+   ret = bus_register(&sgx_bus_type);
if (ret)
goto err_kthread;
 
+   ret = alloc_chrdev_region(&sgx_devt, 0, SGX_MAX_NR_DEVICES, "sgx");
+   if (ret < 0)
+   goto err_bus;
+
+   ret = sgx_drv_init();
+   if (ret)
+   goto err_chrdev_region;
+
return 0;
 
+err_chrdev_region:
+   unregister_chrdev_region(sgx_devt, SGX_MAX_NR_DEVICES);
+
+err_bus:
+   bus_unregister(&sgx_bus_type);
+
 err_kthread:
kthread_stop(ksgxswapd_tsk);
 
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 4e2c3ce94f63..85b3674e1d43 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -93,4 +93,8 @@ int sgx_einit(struct sgx_sigstruct *sigstruct, struct 

[RFC PATCH 11/21] KVM: x86: Export kvm_propagate_fault (as kvm_propagate_page_fault)

2019-07-26 Thread Sean Christopherson
Support for SGX Launch Control requires KVM to trap and execute
ENCLS[EINIT] on behalf of the guest.  Interception of ENCLS leafs
occurs immediately after CPL0 checks, i.e. before any processing
of the leaf-specific operands.  As such, it's possible that KVM
could intercept an EINIT from L2 and encounter an EPT fault while
walking L1's EPT tables.  Rather than force EINIT through the
generic emulator, export kvm_propagate_fault() so that the EINIT
handler can inject the proper page fault.  Rename the function to
kvm_propagate_page_fault() to clarify that it is only for page
faults, and WARN if it is invoked with an exception other than
PF_VECTOR.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h | 1 +
 arch/x86/kvm/x86.c  | 7 +--
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 1341d8390ebe..397d755bb353 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1357,6 +1357,7 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, 
unsigned nr, u32 error_code);
 void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr);
 void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 
error_code);
 void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault);
+bool kvm_propagate_page_fault(struct kvm_vcpu *vcpu, struct x86_exception 
*fault);
 int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu,
gfn_t gfn, void *data, int offset, int len,
u32 access);
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index 2b64bb854571..ec92c5534336 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -587,8 +587,10 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct 
x86_exception *fault)
 }
 EXPORT_SYMBOL_GPL(kvm_inject_page_fault);
 
-static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception 
*fault)
+bool kvm_propagate_page_fault(struct kvm_vcpu *vcpu, struct x86_exception 
*fault)
 {
+   WARN_ON(fault->vector != PF_VECTOR);
+
if (mmu_is_nested(vcpu) && !fault->nested_page_fault)
vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault);
else
@@ -596,6 +598,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, 
struct x86_exception *fau
 
return fault->nested_page_fault;
 }
+EXPORT_SYMBOL_GPL(kvm_propagate_page_fault);
 
 void kvm_inject_nmi(struct kvm_vcpu *vcpu)
 {
@@ -6089,7 +6092,7 @@ static bool inject_emulated_exception(struct kvm_vcpu 
*vcpu)
 {
struct x86_emulate_ctxt *ctxt = &vcpu->arch.emulate_ctxt;
if (ctxt->exception.vector == PF_VECTOR)
-   return kvm_propagate_fault(vcpu, &ctxt->exception);
+   return kvm_propagate_page_fault(vcpu, &ctxt->exception);
 
if (ctxt->exception.error_code_valid)
kvm_queue_exception_e(vcpu, ctxt->exception.vector,
-- 
2.22.0



[RFC PATCH 13/21] x86/sgx: Move the intermediate EINIT helper into the driver

2019-07-26 Thread Sean Christopherson
Providing sgx_einit() in the common SGX code was a bit premature.  The
thought was that the native SGX driver and KVM would be able to use a
common EINIT helper, but that may or may not hold true depending on
how KVM's implementation shakes out.  For example, KVM may want to pass
user pointers directly to EINIT in order to avoid copying large amounts
of data to in-kernel temp structures.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kernel/cpu/sgx/driver/ioctl.c | 21 +++--
 arch/x86/kernel/cpu/sgx/main.c | 43 ++
 arch/x86/kernel/cpu/sgx/sgx.h  |  4 +--
 3 files changed, 30 insertions(+), 38 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/driver/ioctl.c 
b/arch/x86/kernel/cpu/sgx/driver/ioctl.c
index b7aa06920d10..a1cb5f772363 100644
--- a/arch/x86/kernel/cpu/sgx/driver/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/driver/ioctl.c
@@ -658,6 +658,23 @@ static int sgx_get_key_hash(const void *modulus, void 
*hash)
return ret;
 }
 
+static int __sgx_einit(struct sgx_sigstruct *sigstruct,
+  struct sgx_einittoken *token, struct sgx_epc_page *secs,
+  u64 *lepubkeyhash)
+{
+   int ret;
+
+   preempt_disable();
+   sgx_update_lepubkeyhash_msrs(lepubkeyhash, false);
+   ret = __einit(sigstruct, token, sgx_epc_addr(secs));
+   if (ret == SGX_INVALID_EINITTOKEN) {
+   sgx_update_lepubkeyhash_msrs(lepubkeyhash, true);
+   ret = __einit(sigstruct, token, sgx_epc_addr(secs));
+   }
+   preempt_enable();
+   return ret;
+}
+
 static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct 
*sigstruct,
 struct sgx_einittoken *token)
 {
@@ -686,8 +703,8 @@ static int sgx_encl_init(struct sgx_encl *encl, struct 
sgx_sigstruct *sigstruct,
 
for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) {
for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) {
-   ret = sgx_einit(sigstruct, token, encl->secs.epc_page,
-   mrsigner);
+   ret = __sgx_einit(sigstruct, token,
+ encl->secs.epc_page, mrsigner);
if (ret == SGX_UNMASKED_EVENT)
continue;
else
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 532dd90e09e1..542427c6ae9c 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -166,7 +166,15 @@ void sgx_free_page(struct sgx_epc_page *page)
WARN(ret > 0, "sgx: EREMOVE returned %d (0x%x)", ret, ret);
 }
 
-static void sgx_update_lepubkeyhash_msrs(u64 *lepubkeyhash, bool enforce)
+/**
+ * sgx_update_lepubkeyhash_msrs - Write the IA32_SGXLEPUBKEYHASHx MSRs
+ * @lepubkeyhash:  array of desired MSRs values
+ * @enforce:   force WRMSR regardless of cache status
+ *
+ * Write the IA32_SGXLEPUBKEYHASHx MSRs according to @lepubkeyhash if the
+ * last cached value doesn't match the desired value, or if @enforce is %true.
+ */
+void sgx_update_lepubkeyhash_msrs(u64 *lepubkeyhash, bool enforce)
 {
u64 *cache;
int i;
@@ -180,39 +188,6 @@ static void sgx_update_lepubkeyhash_msrs(u64 
*lepubkeyhash, bool enforce)
}
 }
 
-/**
- * sgx_einit - initialize an enclave
- * @sigstruct: a pointer a SIGSTRUCT
- * @token: a pointer an EINITTOKEN (optional)
- * @secs:  a pointer a SECS
- * @lepubkeyhash:  the desired value for IA32_SGXLEPUBKEYHASHx MSRs
- *
- * Execute ENCLS[EINIT], writing the IA32_SGXLEPUBKEYHASHx MSRs according
- * to @lepubkeyhash (if possible and necessary).
- *
- * Return:
- *   0 on success,
- *   -errno or SGX error on failure
- */
-int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
- struct sgx_epc_page *secs, u64 *lepubkeyhash)
-{
-   int ret;
-
-   if (!boot_cpu_has(X86_FEATURE_SGX_LC))
-   return __einit(sigstruct, token, sgx_epc_addr(secs));
-
-   preempt_disable();
-   sgx_update_lepubkeyhash_msrs(lepubkeyhash, false);
-   ret = __einit(sigstruct, token, sgx_epc_addr(secs));
-   if (ret == SGX_INVALID_EINITTOKEN) {
-   sgx_update_lepubkeyhash_msrs(lepubkeyhash, true);
-   ret = __einit(sigstruct, token, sgx_epc_addr(secs));
-   }
-   preempt_enable();
-   return ret;
-}
-
 static __init void sgx_free_epc_section(struct sgx_epc_section *section)
 {
struct sgx_epc_page *page;
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 748b1633d770..3f3311024bd0 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -85,8 +85,8 @@ void sgx_reclaim_pages(void);
 struct sgx_epc_page *sgx_alloc_page(void *owner, bool reclaim);
 int __sgx_free_page(struct sgx_epc_page *page);
 void sgx_free_page(struct sgx_epc_page *page);
-int sgx_einit(struct sgx_sigstruct 

[RFC PATCH 12/21] KVM: x86: Define new #PF SGX error code bit

2019-07-26 Thread Sean Christopherson
Page faults that are signaled by the SGX Enclave Page Cache Map (EPCM),
as opposed to the traditional IA32/EPT page tables, set an SGX bit in
the error code to indicate that the #PF was induced by SGX.  KVM will
need to emulate this behavior as part of its trap-and-execute-EINIT
scheme needed to virtualize SGX Launch Control, e.g. if EINIT itself
faults due to the EPC being zapped by hardware after suspend-resume.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 397d755bb353..103df8cbdd24 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -201,6 +201,7 @@ enum {
 #define PFERR_RSVD_BIT 3
 #define PFERR_FETCH_BIT 4
 #define PFERR_PK_BIT 5
+#define PFERR_SGX_BIT 15
 #define PFERR_GUEST_FINAL_BIT 32
 #define PFERR_GUEST_PAGE_BIT 33
 
@@ -210,6 +211,7 @@ enum {
 #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT)
 #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT)
 #define PFERR_PK_MASK (1U << PFERR_PK_BIT)
+#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT)
 #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT)
 #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT)
 
-- 
2.22.0



[RFC PATCH 01/21] x86/sgx: Add defines for SGX device minor numbers

2019-07-26 Thread Sean Christopherson
Add defines to track the minor numbers for each SGX device in
preparation for moving the helper code and provisioning device to the
common subsystem, and in preparation for adding a third device, i.e. a
virtual EPC device.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kernel/cpu/sgx/driver/driver.h | 1 -
 arch/x86/kernel/cpu/sgx/driver/main.c   | 9 +
 arch/x86/kernel/cpu/sgx/sgx.h   | 4 
 3 files changed, 9 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/driver/driver.h 
b/arch/x86/kernel/cpu/sgx/driver/driver.h
index da60839b133a..6ce18c766a5a 100644
--- a/arch/x86/kernel/cpu/sgx/driver/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver/driver.h
@@ -15,7 +15,6 @@
 #include "../encls.h"
 #include "../sgx.h"
 
-#define SGX_DRV_NR_DEVICES 2
 #define SGX_EINIT_SPIN_COUNT   20
 #define SGX_EINIT_SLEEP_COUNT  50
 #define SGX_EINIT_SLEEP_TIME   20
diff --git a/arch/x86/kernel/cpu/sgx/driver/main.c 
b/arch/x86/kernel/cpu/sgx/driver/main.c
index bb7f1932529f..a2506a49c95a 100644
--- a/arch/x86/kernel/cpu/sgx/driver/main.c
+++ b/arch/x86/kernel/cpu/sgx/driver/main.c
@@ -211,7 +211,7 @@ int __init sgx_drv_init(void)
if (ret)
return ret;
 
-   ret = alloc_chrdev_region(_devt, 0, SGX_DRV_NR_DEVICES, "sgx");
+   ret = alloc_chrdev_region(_devt, 0, SGX_MAX_NR_DEVICES, "sgx");
if (ret < 0)
goto err_bus;
 
@@ -238,12 +238,13 @@ int __init sgx_drv_init(void)
}
 
ret = sgx_dev_init("sgx/enclave", _encl_dev, _encl_cdev,
-  _encl_fops, 0);
+  _encl_fops, SGX_ENCL_DEV_MINOR);
if (ret)
goto err_chrdev_region;
 
ret = sgx_dev_init("sgx/provision", _provision_dev,
-  _provision_cdev, _provision_fops, 1);
+  _provision_cdev, _provision_fops,
+  SGX_PROV_DEV_MINOR);
if (ret)
goto err_encl_dev;
 
@@ -277,7 +278,7 @@ int __init sgx_drv_init(void)
put_device(_encl_dev);
 
 err_chrdev_region:
-   unregister_chrdev_region(sgx_devt, SGX_DRV_NR_DEVICES);
+   unregister_chrdev_region(sgx_devt, SGX_MAX_NR_DEVICES);
 
 err_bus:
bus_unregister(_bus_type);
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index c9276d4b6ffe..4e2c3ce94f63 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -89,4 +89,8 @@ void sgx_free_page(struct sgx_epc_page *page);
 int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token,
  struct sgx_epc_page *secs, u64 *lepubkeyhash);
 
+#define SGX_ENCL_DEV_MINOR 0
+#define SGX_PROV_DEV_MINOR 1
+#define SGX_MAX_NR_DEVICES 2
+
 #endif /* _X86_SGX_H */
-- 
2.22.0



[RFC PATCH 09/21] KVM: VMX: Add basic handling of VM-Exit from SGX enclave

2019-07-26 Thread Sean Christopherson
Intel SGX adds a new CPL3-only execution environment referred to as an
"enclave".  To protect the secrets of an enclave, the CPU's state is
loaded with synthetic data when exiting the enclave (the enclave's state
is saved/restored via protected memory), and the RIP is set to a defined
exit value.  This behavior also applies to VM-Exits from the enclave,
e.g. GUEST_RIP may not necessarily reflect the actual RIP that triggered
the VM-Exit.

To help a VMM recognize and handle exits from enclaves, SGX adds bits to
existing VMCS fields, VM_EXIT_REASON.VMX_EXIT_REASON_FROM_ENCLAVE and
GUEST_INTERRUPTIBILITY_INFO.GUEST_INTR_STATE_ENCLAVE_INTR.  Define the
new architectural bits and add a boolean to struct vcpu_vmx to cache
VMX_EXIT_REASON_FROM_ENCLAVE and clear the bit in exit_reason so that
checks against exit_reason do not need to account for SGX, e.g.
exit_reason == EXIT_REASON_EXCEPTION_NMI continues to work.

As for new behavior for VM-Exits from enclaves, KVM is for the most
part a passive observer of both bits, e.g. it needs to account for
the bits when propagating information to a nested VMM, but otherwise
doesn't need to act differently for VM-Exits from enclaves.

The one scenario that is impacted is emulation, which becomes impossible
since KVM does not have access to the RIP or instruction stream that
triggered the VM-Exit[2].  This is largely a non-issue as most
instructions that might trigger VM-Exit unconditionally #UD inside an
enclave.  For the few instructions that may VM-Exit but do not #UD,
KVM either never sets the exiting control, e.g. PAUSE_EXITING[1], or
sets it if and only if the feature is not exposed to the guest in order
to inject a #UD, e.g. RDRAND_EXITING.

But, because it is still possible for a guest to trigger emulation,
e.g. MMIO, inject a #UD if KVM ever attempts emulation after a VM-Exit
from an enclave.  This is architecturally accurate for instruction
VM-Exits, and for MMIO it's the least bad choice, e.g. it's preferable
to killing the VM.  In practice, only broken or particularly stupid
guests should ever encounter this behavior.

Add a WARN in skip_emulated_instruction to detect any attempt to
modify the guest's RIP during an SGX enclave VM-Exit as all such flows
should either be unreachable or must handle exits from enclaves before
getting to skip_emulated_instruction.

[1] PAUSE_LOOP_EXITING only affects CPL0 and enclaves exist only at
CPL3, so we also don't need to worry about that interaction.

[2] Impossible for all practical purposes.  Not truly impossible
since KVM could implement some form of para-virtualization scheme.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/vmx.h  |  1 +
 arch/x86/include/uapi/asm/vmx.h |  1 +
 arch/x86/kvm/vmx/nested.c   |  2 ++
 arch/x86/kvm/vmx/vmx.c  | 42 ++---
 arch/x86/kvm/vmx/vmx.h  |  3 +++
 5 files changed, 46 insertions(+), 3 deletions(-)

diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h
index a39136b0d509..a62ac47d2006 100644
--- a/arch/x86/include/asm/vmx.h
+++ b/arch/x86/include/asm/vmx.h
@@ -364,6 +364,7 @@ enum vmcs_field {
 #define GUEST_INTR_STATE_MOV_SS0x0002
 #define GUEST_INTR_STATE_SMI   0x0004
 #define GUEST_INTR_STATE_NMI   0x0008
+#define GUEST_INTR_STATE_ENCLAVE_INTR  0x0010
 
 /* GUEST_ACTIVITY_STATE flags */
 #define GUEST_ACTIVITY_ACTIVE  0
diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h
index d213ec5c3766..501a35bd4cc7 100644
--- a/arch/x86/include/uapi/asm/vmx.h
+++ b/arch/x86/include/uapi/asm/vmx.h
@@ -27,6 +27,7 @@
 
 
 #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x8000
+#define VMX_EXIT_REASON_FROM_ENCLAVE   0x0800
 
 #define EXIT_REASON_EXCEPTION_NMI   0
 #define EXIT_REASON_EXTERNAL_INTERRUPT  1
diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c
index 46af3a5e9209..fef4fb3e1aaa 100644
--- a/arch/x86/kvm/vmx/nested.c
+++ b/arch/x86/kvm/vmx/nested.c
@@ -3523,6 +3523,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct 
vmcs12 *vmcs12,
/* update exit information fields: */
 
vmcs12->vm_exit_reason = exit_reason;
+   if (to_vmx(vcpu)->sgx_enclave_exit)
+   vmcs12->vm_exit_reason |= VMX_EXIT_REASON_FROM_ENCLAVE;
vmcs12->exit_qualification = exit_qualification;
vmcs12->vm_exit_intr_info = exit_intr_info;
 
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index f48fc990ca6d..abcd2f7a36f5 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1460,16 +1460,40 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, 
u64 data)
 
 static bool vmx_is_emulatable(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
+   if (unlikely(to_vmx(vcpu)->sgx_enclave_exit)) {
+   kvm_queue_exception(vcpu, UD_VECTOR);
+   return false;
+   }
return true;
 }
 
 static void skip_emulated_instruction(struct kvm_vcpu *vcpu)
 {
+  

[RFC PATCH 04/21] x86/sgx: Add /dev/sgx/virt_epc device to allocate "raw" EPC for VMs

2019-07-26 Thread Sean Christopherson
Add an SGX device to enable userspace to allocate EPC without an
associated enclave.  The intended and only known use case for direct EPC
allocation is to expose EPC to a KVM guest, hence the virt_epc moniker,
virt.{c,h} files and INTEL_SGX_VIRTUALIZATION Kconfig.

Although KVM is the end consumer of EPC, and will need hooks into the
virtual EPC management if oversubscription of EPC for guest is ever
supported (see below), implement direct access to EPC in the SGX
subsystem instead of in KVM.  Doing so has two major advantages:

  - Does not require changes to KVM's uAPI, e.g. EPC gets handled as
just another memory backend for guests.

  - EPC management is wholly contained in the SGX subsystem, e.g. SGX
does not have to export any symbols, changes to reclaim flows don't
need to be routed through KVM, SGX's dirty laundry doesn't have to
get aired out for the world to see, and so on and so forth.

Oversubscription of EPC for KVM guests is not currently supported.  Due
to the complications of handling reclaim conflicts between guest and
host, KVM EPC oversubscription is expected to be at least an order of
magnitude more complex than basic support for SGX virtualization.

Signed-off-by: Sean Christopherson 
---
 arch/x86/Kconfig |  10 ++
 arch/x86/kernel/cpu/sgx/Makefile |   1 +
 arch/x86/kernel/cpu/sgx/main.c   |   3 +
 arch/x86/kernel/cpu/sgx/sgx.h|   3 +-
 arch/x86/kernel/cpu/sgx/virt.c   | 253 +++
 arch/x86/kernel/cpu/sgx/virt.h   |  14 ++
 6 files changed, 283 insertions(+), 1 deletion(-)
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.c
 create mode 100644 arch/x86/kernel/cpu/sgx/virt.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index 74ccb1bdea16..c1bdb9f85928 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1961,6 +1961,16 @@ config INTEL_SGX_DRIVER
 
  If unsure, say N.
 
+config INTEL_SGX_VIRTUALIZATION
+   bool "Intel SGX Virtualization"
+   depends on INTEL_SGX && KVM_INTEL
+   help
+ Enabling support for SGX virtualization enables userspace to allocate
+ "raw" EPC for the purpose of exposing EPC to a KVM guest, i.e. a
+ virtual machine, via a device node (/dev/sgx/virt_epc by default).
+
+ If unsure, say N.
+
 config EFI
bool "EFI runtime service support"
depends on ACPI
diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile
index e5d1e862969c..559fd0f9be50 100644
--- a/arch/x86/kernel/cpu/sgx/Makefile
+++ b/arch/x86/kernel/cpu/sgx/Makefile
@@ -1,2 +1,3 @@
 obj-y += encl.o encls.o main.o reclaim.o
 obj-$(CONFIG_INTEL_SGX_DRIVER) += driver/
+obj-$(CONFIG_INTEL_SGX_VIRTUALIZATION) += virt.o
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 9f4473597620..ead827371139 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -14,6 +14,7 @@
 #include "arch.h"
 #include "encls.h"
 #include "sgx.h"
+#include "virt.h"
 
 struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS];
 int sgx_nr_epc_sections;
@@ -422,7 +423,9 @@ static __init int sgx_init(void)
if (ret)
goto err_provision_dev;
 
+   /* Success if the native *or* virtual driver initialized cleanly. */
ret = sgx_drv_init();
+   ret = sgx_virt_epc_init() ? ret : 0;
if (ret)
goto err_provision_cdev;
 
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index a0af8849c7c3..16cdb935aaa7 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -91,7 +91,8 @@ int sgx_einit(struct sgx_sigstruct *sigstruct, struct 
sgx_einittoken *token,
 
 #define SGX_ENCL_DEV_MINOR 0
 #define SGX_PROV_DEV_MINOR 1
-#define SGX_MAX_NR_DEVICES 2
+#define SGX_VIRT_DEV_MINOR 2
+#define SGX_MAX_NR_DEVICES 3
 
 __init int sgx_dev_init(const char *name, struct device *dev,
struct cdev *cdev, const struct file_operations *fops,
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
new file mode 100644
index ..79ee5917a4fc
--- /dev/null
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -0,0 +1,253 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "encls.h"
+#include "sgx.h"
+#include "virt.h"
+
+struct sgx_virt_epc_page {
+   struct sgx_epc_page *epc_page;
+};
+
+struct sgx_virt_epc {
+   struct radix_tree_root page_tree;
+   struct rw_semaphore lock;
+};
+
+static inline unsigned long sgx_virt_epc_calc_index(struct vm_area_struct *vma,
+   unsigned long addr)
+{
+   return vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start);
+}
+
+static struct sgx_virt_epc_page *__sgx_virt_epc_fault(struct sgx_virt_epc *epc,
+ struct vm_area_struct 
*vma,
+   

[RFC PATCH 08/21] KVM: x86: Add kvm_x86_ops hook to short circuit emulation

2019-07-26 Thread Sean Christopherson
Similar to the existing AMD #NPF case where emulation of the current
instruction is not possible due to lack of information, virtualization
of Intel SGX will introduce a scenario where emulation is not possible
due to the VMExit occurring in an SGX enclave.  And again similar to
the AMD case, emulation can be initiated by kvm_mmu_page_fault(), i.e.
outside of the control of the vendor-specific code.

While the cause and architecturally visible behavior of the two cases
is different, e.g. Intel SGX will inject a #UD whereas AMD #NPF is a
clean resume or complete shutdown, the impact on the common emulation
code is identical: KVM must stop emulation immediately and resume the
guest.

Replace the existing need_emulation_on_page_fault() with a more generic
is_emulatable() kvm_x86_ops callback, which is called unconditionally
by x86_emulate_instruction().

Query is_emulatable() in handle_ud() as well so that the
force_emulation_prefix code doesn't incorrectly modify RIP before
calling emulate_instruction() in the absurdly unlikely scenario that
we encounter forced emulation in conjunction with "do not emulate".
Do this for both Intel and AMD so that any future changes to AMD's
emulation logic take effect as expected for handle_ud().

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |  2 +-
 arch/x86/kvm/mmu.c  | 12 
 arch/x86/kvm/svm.c  | 19 +--
 arch/x86/kvm/vmx/vmx.c  | 11 +--
 arch/x86/kvm/x86.c  |  9 -
 5 files changed, 31 insertions(+), 22 deletions(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 26d1eb83f72a..1341d8390ebe 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -1198,7 +1198,7 @@ struct kvm_x86_ops {
   uint16_t *vmcs_version);
uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu);
 
-   bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu);
+   bool (*is_emulatable)(struct kvm_vcpu *vcpu, void *insn, int insn_len);
 };
 
 struct kvm_arch_async_pf {
diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
index 98f6e4f88b04..bf6952f8f330 100644
--- a/arch/x86/kvm/mmu.c
+++ b/arch/x86/kvm/mmu.c
@@ -5412,18 +5412,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, 
u64 error_code,
if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu))
emulation_type = EMULTYPE_ALLOW_RETRY;
 emulate:
-   /*
-* On AMD platforms, under certain conditions insn_len may be zero on 
#NPF.
-* This can happen if a guest gets a page-fault on data access but the 
HW
-* table walker is not able to read the instruction page (e.g 
instruction
-* page is not present in memory). In those cases we simply restart the
-* guest, with the exception of AMD Erratum 1096 which is unrecoverable.
-*/
-   if (unlikely(insn && !insn_len)) {
-   if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
-   return 1;
-   }
-
er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len);
 
switch (er) {
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 48c865a4e5dd..0fb8b60eb136 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -7115,10 +7115,25 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu,
return -ENODEV;
 }
 
-static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu)
+static bool svm_is_emulatable(struct kvm_vcpu *vcpu, void *insn, int insn_len)
 {
bool is_user, smap;
 
+   if (likely(!insn || insn_len))
+   return true;
+
+   /*
+* Under certain conditions insn_len may be zero on #NPF.  This can
+* happen if a guest gets a page-fault on data access but the HW table
+* walker is not able to read the instruction page (e.g instruction
+* page is not present in memory). In those cases we simply restart the
+* guest, with the exception of AMD Erratum 1096 which is unrecoverable.
+*/
+   if (unlikely(insn && !insn_len)) {
+   if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu))
+   return 1;
+   }
+
is_user = svm_get_cpl(vcpu) == 3;
smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP);
 
@@ -7279,7 +7294,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = {
.nested_enable_evmcs = nested_enable_evmcs,
.nested_get_evmcs_version = nested_get_evmcs_version,
 
-   .need_emulation_on_page_fault = svm_need_emulation_on_page_fault,
+   .is_emulatable = svm_is_emulatable,
 };
 
 static int __init svm_init(void)
diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index d98eac371c0a..f48fc990ca6d 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -1458,6 +1458,10 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 
data)

[RFC PATCH 10/21] KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for VMX/SGX

2019-07-26 Thread Sean Christopherson
Support for SGX Launch Control requires KVM to trap and execute
ENCLS[ECREATE] and ENCLS[EINIT] on behalf of the guest, which requires
obtaining the GPA of a Secure Enclave Control Structure (SECS)
in order to get its corresponding HVA.

Because the SECS must reside in the Enclave Page Cache (EPC), copying
the SECS's data to a host-controlled buffer via existing exported
helpers is not a viable option as the EPC is not readable or writable
by the kernel.

Translating GVA->HVA for non-EPC pages is also desirable, as passing
user pointers directly to ECREATE and EINIT avoids having to copy pages
worth of data into the kernel.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/x86.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index afcc01a59421..2b64bb854571 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5089,6 +5089,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, 
gva_t gva,
u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read);
 
  gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva,
struct x86_exception *exception)
@@ -5105,6 +5106,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, 
gva_t gva,
access |= PFERR_WRITE_MASK;
return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception);
 }
+EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write);
 
 /* uses this to access any guest's mapped memory without checking CPL */
 gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva,
-- 
2.22.0



[RFC PATCH 16/21] KVM: VMX: Add emulation of SGX Launch Control LE hash MSRs

2019-07-26 Thread Sean Christopherson
SGX Launch Control (LC) modifies the behavior of ENCLS[EINIT] to query
a set of user-controllable MSRs (Launch Enclave, a.k.a. LE, Hash MSRs)
when verifying the key used to sign an enclave.  On CPUs without LC
support, the public key hash of allowed LEs is hardwired into the CPU to
an Intel controlled key (the Intel key is also the reset value of the LE
hash MSRs).  Track the guest's desired hash and stuff it into hardware
when executing EINIT on behalf of the guest (in a future patch).

Note, KVM allows writes to the LE hash MSRs if IA32_FEATURE_CONTROL is
unlocked.  This is technically not arch behavior, but it's roughly
equivalent to the arch behavior of the MSRs being writable prior to
activating SGX[1].  Emulating SGX activation is feasible, but adds no
tangible benefits and would just create extra work for KVM and guest
firmware.

[1] SGX related bits in IA32_FEATURE_CONTROL cannot be set until SGX
is activated, e.g. by firmware.  SGX activation is triggered by
setting bit 0 in MSR 0x7a.  Until SGX is activated, the LE hash
MSRs are writable, e.g. to allow firmware to lock down the LE
root key with a non-Intel value.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/vmx.c | 42 ++
 arch/x86/kvm/vmx/vmx.h |  2 ++
 2 files changed, 44 insertions(+)

diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c
index abcd2f7a36f5..819c47fee157 100644
--- a/arch/x86/kvm/vmx/vmx.c
+++ b/arch/x86/kvm/vmx/vmx.c
@@ -390,6 +390,8 @@ static const struct kvm_vmx_segment_field {
 
 u64 host_efer;
 
+static u64 sgx_pubkey_hash[4] __ro_after_init;
+
 /*
  * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm
  * will emulate SYSCALL in legacy mode if the vendor string in guest
@@ -1740,6 +1742,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
case MSR_IA32_FEATURE_CONTROL:
msr_info->data = vmx->msr_ia32_feature_control;
break;
+   case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+   if (!msr_info->host_initiated &&
+   !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC))
+   return 1;
+   msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash
+   [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0];
+   break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!nested_vmx_allowed(vcpu))
return 1;
@@ -1953,6 +1962,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct 
msr_data *msr_info)
if (msr_info->host_initiated && data == 0)
vmx_leave_nested(vcpu);
break;
+   case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3:
+   if (!msr_info->host_initiated &&
+   (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) ||
+   ((vmx->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED) &&
+   !(vmx->msr_ia32_feature_control & 
FEATURE_CONTROL_SGX_LE_WR
+   return 1;
+   vmx->msr_ia32_sgxlepubkeyhash
+   [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data;
+   break;
case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC:
if (!msr_info->host_initiated)
return 1; /* they are read-only */
@@ -6698,6 +6716,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, 
unsigned int id)
else
memset(>nested.msrs, 0, sizeof(vmx->nested.msrs));
 
+   memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash,
+  sizeof(sgx_pubkey_hash));
+
vmx->nested.posted_intr_nv = -1;
vmx->nested.current_vmptr = -1ull;
 
@@ -7588,6 +7609,27 @@ static __init int hardware_setup(void)
if (!enable_ept || !cpu_has_vmx_intel_pt())
pt_mode = PT_MODE_SYSTEM;
 
+   /*
+* Use Intel's default value for Skylake hardware if Launch Control is
+* not supported, i.e. Intel's hash is hardcoded into silicon, or if
+* Launch Control is supported and enabled, i.e. mimic the reset value
+* and let the guest write the MSRs at will.  If Launch Control is
+* supported but disabled, then we have to use the current MSR values
+* as the hash MSRs exist but are locked and not writable.
+*/
+   if (boot_cpu_has(X86_FEATURE_SGX_LC) ||
+   rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, _pubkey_hash[0])) {
+   sgx_pubkey_hash[0] = 0xa6053e051270b7acULL;
+   sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL;
+   sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL;
+   sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL;
+   } else {
+   /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */
+   rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]);
+   

[RFC PATCH 17/21] KVM: VMX: Add handler for ENCLS[EINIT] to support SGX Launch Control

2019-07-26 Thread Sean Christopherson
SGX Launch Control (LC) modifies the behavior of ENCLS[EINIT] to query
a set of user-controllable MSRs (Launch Enclave, a.k.a. LE, Hash MSRs)
when verifying the key used to sign an enclave.  On CPUs without LC
support, the public key hash of allowed LEs is hardwired into the CPU to
an Intel controlled key (the Intel key is also the reset value of the LE
hash MSRs).

When LC is enabled in the host, EINIT must be intercepted and executed
in the host using the guest's LE hash MSR value, even if the guest's
values are fixed to hardware default values.  The MSRs are not switched
on VM-Enter/VM-Exit as writing the MSRs is extraordinarily expensive,
e.g. each WRMSR is 4x slower than a regular WRMSR and on par with a full
VM-Enter -> VM-Exit transition.  Furthermore, the MSRs aren't allowed
in the hardware-supported lists, i.e. they would need to be manually read and
written.  On the other hand, EINIT takes tens of thousands of cycles to
execute (it's so slow that it's interruptible), i.e. the ~1k cycles of
overhead to trap-and-execute EINIT is unlikely to be noticed by the
guest, let alone impact the overall performance of SGX.

Actual usage of the handler will be added in a future patch, i.e. when
SGX virtualization is fully enabled.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/vmx/sgx.c | 24 
 1 file changed, 24 insertions(+)

diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
index 5b08e7dcc3a3..2bcfa3b6c75e 100644
--- a/arch/x86/kvm/vmx/sgx.c
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -221,3 +221,27 @@ int handle_encls_ecreate(struct kvm_vcpu *vcpu)
 
return sgx_encls_postamble(vcpu, ret, trapnr, secs_gva);
 }
+
+int handle_encls_einit(struct kvm_vcpu *vcpu)
+{
+   unsigned long sig_hva, secs_hva, token_hva;
+   struct vcpu_vmx *vmx = to_vmx(vcpu);
+   gva_t sig_gva, secs_gva, token_gva;
+   int ret, trapnr;
+
+   if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, _gva) ||
+   sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, _gva) 
||
+   sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, _gva))
+   return 1;
+
+   if (sgx_gva_to_hva(vcpu, sig_gva, false, _hva) ||
+   sgx_gva_to_hva(vcpu, secs_gva, true, _hva) ||
+   sgx_gva_to_hva(vcpu, token_gva, false, _hva))
+   return 1;
+
+   ret = sgx_einit((void __user *)sig_hva, (void __user *)token_hva,
+   (void __user *)secs_hva, vmx->msr_ia32_sgxlepubkeyhash,
+   );
+
+   return sgx_encls_postamble(vcpu, ret, trapnr, secs_hva);
+}
-- 
2.22.0



[RFC PATCH 05/21] x86/sgx: Expose SGX architectural definitions to the kernel

2019-07-26 Thread Sean Christopherson
KVM will use many of the architectural constants and structs to
virtualize SGX.

Signed-off-by: Sean Christopherson 
---
 arch/x86/{kernel/cpu/sgx/arch.h => include/asm/sgx_arch.h} | 0
 arch/x86/kernel/cpu/sgx/driver/driver.h| 2 +-
 arch/x86/kernel/cpu/sgx/encl.c | 2 +-
 arch/x86/kernel/cpu/sgx/encls.h| 2 +-
 arch/x86/kernel/cpu/sgx/main.c | 2 +-
 arch/x86/kernel/cpu/sgx/sgx.h  | 3 +--
 tools/testing/selftests/x86/sgx/defines.h  | 2 +-
 7 files changed, 6 insertions(+), 7 deletions(-)
 rename arch/x86/{kernel/cpu/sgx/arch.h => include/asm/sgx_arch.h} (100%)

diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx_arch.h
similarity index 100%
rename from arch/x86/kernel/cpu/sgx/arch.h
rename to arch/x86/include/asm/sgx_arch.h
diff --git a/arch/x86/kernel/cpu/sgx/driver/driver.h 
b/arch/x86/kernel/cpu/sgx/driver/driver.h
index 6ce18c766a5a..4dc133f3c186 100644
--- a/arch/x86/kernel/cpu/sgx/driver/driver.h
+++ b/arch/x86/kernel/cpu/sgx/driver/driver.h
@@ -10,7 +10,7 @@
 #include 
 #include 
 #include 
-#include "../arch.h"
+#include 
 #include "../encl.h"
 #include "../encls.h"
 #include "../sgx.h"
diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c
index 836c55d4352d..8549fd95f02d 100644
--- a/arch/x86/kernel/cpu/sgx/encl.c
+++ b/arch/x86/kernel/cpu/sgx/encl.c
@@ -7,7 +7,7 @@
 #include 
 #include 
 #include 
-#include "arch.h"
+#include 
 #include "encl.h"
 #include "encls.h"
 #include "sgx.h"
diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h
index aea3b9d09936..1b49c7419767 100644
--- a/arch/x86/kernel/cpu/sgx/encls.h
+++ b/arch/x86/kernel/cpu/sgx/encls.h
@@ -8,7 +8,7 @@
 #include 
 #include 
 #include 
-#include "arch.h"
+#include 
 
 /**
  * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index ead827371139..532dd90e09e1 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -10,8 +10,8 @@
 #include 
 #include 
 #include 
+#include 
 #include "driver/driver.h"
-#include "arch.h"
 #include "encls.h"
 #include "sgx.h"
 #include "virt.h"
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 16cdb935aaa7..748b1633d770 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -8,10 +8,9 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
-#include "arch.h"
-
 struct sgx_epc_page {
unsigned long desc;
struct sgx_encl_page *owner;
diff --git a/tools/testing/selftests/x86/sgx/defines.h 
b/tools/testing/selftests/x86/sgx/defines.h
index 3ff73a9d9b93..ebc4c6cf57c4 100644
--- a/tools/testing/selftests/x86/sgx/defines.h
+++ b/tools/testing/selftests/x86/sgx/defines.h
@@ -33,7 +33,7 @@ typedef uint64_t u64;
(((~0ULL) - (1ULL << (l)) + 1) & \
 (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h
 
-#include "../../../../../arch/x86/kernel/cpu/sgx/arch.h"
+#include "../../../../../arch/x86/include/asm/sgx_arch.h"
 #include "../../../../../arch/x86/include/uapi/asm/sgx.h"
 
 #endif /* TYPES_H */
-- 
2.22.0



[RFC PATCH 21/21] KVM: x86: Add capability to grant VM access to privileged SGX attribute

2019-07-26 Thread Sean Christopherson
The SGX subsystem restricts access to a subset of enclave attributes to
provide additional security for an uncompromised kernel, e.g. to prevent
malware from using the PROVISIONKEY to ensure its nodes are running
inside a genuine SGX enclave and/or to obtain a stable fingerprint.

To prevent userspace from circumventing such restrictions by running an
enclave in a VM, KVM restricts guest access to privileged attributes by
default.  Add a capability, KVM_CAP_SGX_ATTRIBUTE, that can be used by
userspace to grant a VM access to a privileged attribute, with args[0]
holding a file handle to a valid SGX attribute file corresponding to
an attribute that is restricted by KVM (currently only PROVISIONKEY).

Cc: Andy Lutomirski 
Signed-off-by: Sean Christopherson 
---
 Documentation/virtual/kvm/api.txt | 20 
 arch/x86/kvm/cpuid.c  |  2 +-
 arch/x86/kvm/x86.c| 22 ++
 include/uapi/linux/kvm.h  |  1 +
 4 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index 383b292966fa..b1c0ff4e9224 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -5013,6 +5013,26 @@ it hard or impossible to use it correctly.  The 
availability of
 KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed.
 Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT.
 
+7.19 KVM_CAP_SGX_ATTRIBUTE
+
+Architectures: x86
+Parameters: args[0] is a file handle of a SGX attribute file in securityfs
+Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested
+attribute is not supported by KVM.
+
+The SGX subsystem restricts access to a subset of enclave attributes, e.g. the
+PROVISIONKEY, to provide additional security for an uncompromised kernel, e.g.
+to prevent malware from using the PROVISIONKEY to ensure its nodes are running
+inside a genuine SGX enclave and/or to obtain a stable system fingerprint.
+
+To prevent userspace from circumventing such restrictions by running an enclave
+in a VM, KVM prevents access to privileged attributes by default.  Userspace
+can use KVM_CAP_SGX_ATTRIBUTE to grant a VM access to a privileged attribute.
+args[0] must hold a file handle to a valid SGX attribute file corresponding to
+an attribute that is supported/restricted by KVM (currently only PROVISIONKEY).
+
+See Documentation/x86/sgx/2.Kernel-internals.rst for more details.
+
 8. Other capabilities.
 --
 
diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 73a0326a1968..73af09edb2fa 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -439,7 +439,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
 
/* cpuid 12.1.eax*/
const u32 kvm_cpuid_12_1_eax_sgx_features =
-   SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | 0 /* PROVISIONKEY */ |
+   SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_PROVISIONKEY |
SGX_ATTR_EINITTOKENKEY | SGX_ATTR_KSS;
 
/* cpuid 12.1.ebx*/
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index ec92c5534336..9144909d4a8e 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -67,6 +67,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #define CREATE_TRACE_POINTS
@@ -3090,6 +3092,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long 
ext)
case KVM_CAP_GET_MSR_FEATURES:
case KVM_CAP_MSR_PLATFORM_INFO:
case KVM_CAP_EXCEPTION_PAYLOAD:
+#ifdef CONFIG_INTEL_SGX_VIRTUALIZATION
+   case KVM_CAP_SGX_ATTRIBUTE:
+#endif
r = 1;
break;
case KVM_CAP_SYNC_REGS:
@@ -4626,6 +4631,23 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm,
kvm->arch.exception_payload_enabled = cap->args[0];
r = 0;
break;
+#ifdef CONFIG_INTEL_SGX_VIRTUALIZATION
+   case KVM_CAP_SGX_ATTRIBUTE: {
+   u64 allowed_attributes = 0;
+
+   r = sgx_set_attribute(_attributes, cap->args[0]);
+   if (r)
+   break;
+
+   /* KVM only supports the PROVISIONKEY privileged attribute. */
+   if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) &&
+   !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY))
+   kvm->arch.sgx_provisioning_allowed = true;
+   else
+   r = -EINVAL;
+   break;
+   }
+#endif
default:
r = -EINVAL;
break;
diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 2fe12b40d503..b16708c2b6c9 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -993,6 +993,7 @@ struct kvm_ppc_resize_hpt {
 #define KVM_CAP_ARM_SVE 170
 #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171
 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172
+#define KVM_CAP_SGX_ATTRIBUTE 200
 
 #ifdef 

[RFC PATCH 20/21] x86/sgx: Export sgx_set_attribute() for use by KVM

2019-07-26 Thread Sean Christopherson
To prevent userspace from circumventing access to the PROVISIONKEY by
running an enclave in a VM, KVM will deny access to the PROVISIONKEY
unless userspace proves to KVM that it is allowed to access the key.
Export sgx_set_attribute() so that it may be used by KVM to verify an
SGX attribute file.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/sgx.h | 2 ++
 arch/x86/kernel/cpu/sgx/driver/ioctl.c | 1 +
 arch/x86/kernel/cpu/sgx/main.c | 1 +
 arch/x86/kernel/cpu/sgx/sgx.h  | 1 -
 4 files changed, 4 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
index f0f0176b8e2f..65c9417d3a80 100644
--- a/arch/x86/include/asm/sgx.h
+++ b/arch/x86/include/asm/sgx.h
@@ -6,6 +6,8 @@
 
 struct sgx_pageinfo;
 
+int sgx_set_attribute(u64 *allowed_attributes, unsigned int attribute_fd);
+
 #if IS_ENABLED(CONFIG_KVM_INTEL)
 int sgx_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, int *trapnr);
 int sgx_einit(void __user *sigstruct, void __user *token,
diff --git a/arch/x86/kernel/cpu/sgx/driver/ioctl.c 
b/arch/x86/kernel/cpu/sgx/driver/ioctl.c
index a1cb5f772363..1b7a05cd9d02 100644
--- a/arch/x86/kernel/cpu/sgx/driver/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/driver/ioctl.c
@@ -2,6 +2,7 @@
 // Copyright(c) 2016-19 Intel Corporation.
 
 #include 
+#include 
 #include 
 #include 
 #include 
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index 542427c6ae9c..68e5c704378a 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -336,6 +336,7 @@ int sgx_set_attribute(u64 *allowed_attributes, unsigned int 
attribute_fd)
*allowed_attributes |= SGX_ATTR_PROVISIONKEY;
return 0;
 }
+EXPORT_SYMBOL_GPL(sgx_set_attribute);
 
 static void sgx_dev_release(struct device *dev)
 {
diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h
index 3f3311024bd0..fab12cc0e7c5 100644
--- a/arch/x86/kernel/cpu/sgx/sgx.h
+++ b/arch/x86/kernel/cpu/sgx/sgx.h
@@ -96,6 +96,5 @@ void sgx_update_lepubkeyhash_msrs(u64 *lepubkeyhash, bool 
enforce);
 __init int sgx_dev_init(const char *name, struct device *dev,
struct cdev *cdev, const struct file_operations *fops,
int minor);
-int sgx_set_attribute(u64 *allowed_attributes, unsigned int attribute_fd);
 
 #endif /* _X86_SGX_H */
-- 
2.22.0



[RFC PATCH 19/21] KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC

2019-07-26 Thread Sean Christopherson
SGX adds a basic support bit to CPUID(7, 0), and enumerates SGX
capabilities, e.g. EPC info, ENCLS leafs, etc..., in CPUID(0x12, *).
All SGX1 and SGX2 ENCLS leafs (supported in hardware) can be exposed
to the guest unconditionally.  All other ENCLS leafs (currently the
ENCLS_C leafs) and all ENCLV leafs currently cannot be exposed to the
guest.

Flexible Launch Control, a.k.a. SGX LC, allows software to control the
key that is used to verify the signer of an enclave.  Because SGX LC
impacts guest operation even if it's not exposed to the guest, i.e.
EINIT is affected by hardware's LE hash MSRs, SGX cannot be exposed to
the guest if the host supports LC without explicit LC support in KVM.
In other words, LC support is required to run on platforms with LC
enabled in the host, thus making exposure of SGX LC to the guest a
formality.

Access to the provision key is not supported in this patch.  Access to
the provision key is controlled via securityfs, a future patch will
plumb in the ability for the userspace hypervisor to grant a VM access
to the provision key by passing in an appropriate file descriptor.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/cpuid.c  |  72 +-
 arch/x86/kvm/vmx/nested.c |  19 -
 arch/x86/kvm/vmx/nested.h |   5 ++
 arch/x86/kvm/vmx/sgx.h|  11 +++
 arch/x86/kvm/vmx/vmcs12.c |   1 +
 arch/x86/kvm/vmx/vmcs12.h |   4 +-
 arch/x86/kvm/vmx/vmx.c| 156 --
 arch/x86/kvm/vmx/vmx.h|   1 +
 8 files changed, 254 insertions(+), 15 deletions(-)
 create mode 100644 arch/x86/kvm/vmx/sgx.h

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 4c235af5318c..73a0326a1968 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include "cpuid.h"
 #include "lapic.h"
 #include "mmu.h"
@@ -117,6 +118,21 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu)
if (best && (best->eax & (F(XSAVES) | F(XSAVEC
best->ebx = xstate_required_size(vcpu->arch.xcr0, true);
 
+   /*
+* Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate
+* the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's
+* requested XCR0 value.  The enclave's XFRM must be a subset of XCR0
+* at the time of EENTER, thus adjust the allowed XFRM by the guest's
+* supported XCR0.  Similar to XCR0 handling, FP and SSE are forced to
+* '1' even on CPUs that don't support XSAVE.
+*/
+   best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1);
+   if (best) {
+   best->ecx &= vcpu->arch.guest_supported_xcr0 & 0x;
+   best->edx &= vcpu->arch.guest_supported_xcr0 >> 32;
+   best->ecx |= XFEATURE_MASK_FPSSE;
+   }
+
/*
 * The existing code assumes virtual address is 48-bit or 57-bit in the
 * canonical address checks; exit if it is ever changed.
@@ -393,7 +409,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) |
F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) |
F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | 
F(AVX512DQ) |
-   F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt;
+   F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt | F(SGX);
 
/* cpuid 0xD.1.eax */
const u32 kvm_cpuid_D_1_eax_x86_features =
@@ -404,7 +420,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ |
F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) |
F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) |
-   F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B);
+   F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SGX_LC);
 
/* cpuid 7.0.edx*/
const u32 kvm_cpuid_7_0_edx_x86_features =
@@ -412,6 +428,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR);
 
+   /* cpuid 12.0.eax*/
+   const u32 kvm_cpuid_12_0_eax_x86_features =
+   F(SGX1) | F(SGX2) | 0 /* Reserved */ | 0 /* Reserved */ |
+   0 /* Reserved */ | 0 /* ENCLV */ | 0 /* ENCLS_C */;
+
+   /* cpuid 12.0.ebx*/
+   const u32 kvm_cpuid_12_0_ebx_sgx_features =
+   SGX_MISC_EXINFO;
+
+   /* cpuid 12.1.eax*/
+   const u32 kvm_cpuid_12_1_eax_sgx_features =
+   SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | 0 /* PROVISIONKEY */ |
+   SGX_ATTR_EINITTOKENKEY | SGX_ATTR_KSS;
+
+   /* cpuid 12.1.ebx*/
+   const u32 kvm_cpuid_12_1_ebx_sgx_features = 0;
+
/*
 * The code below assumes index == 0, which simplifies handling 

[RFC PATCH 15/21] KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions

2019-07-26 Thread Sean Christopherson
Userspace can restrict what bits can be set in MISCSELECT, ATTRIBUTES
and XFRM via CPUID.  Intercept ECREATE when any of the aforementioned
masks diverges from hardware in order to enforce the desired CPUID
model, i.e. inject #GP if the guest attempts to set a bit that hasn't
been enumerated as allowed-1 in CPUID.

Add the handler in a dedicated SGX file under the VMX sub-directory so
as to confine the ugliness of the SGX specific code (re-executing ENCLS
leafs is messy due to the need to follow pointers from structs, get EPC
pages, etc...) and to save compilation cycles when SGX functionality is
disabled in the kernel.  The ENCLS handlers will soon grow to ~300 lines
of code when Launch Control support is added, and in the distant future
could balloon significantly if/when EPC oversubscription is supported.

Actual usage of the handler will be added in a future patch, i.e. when
SGX virtualization is fully enabled.

Note, access to the PROVISIONKEY is not yet supported.

Signed-off-by: Sean Christopherson 
---
 arch/x86/include/asm/kvm_host.h |   3 +
 arch/x86/include/asm/sgx_arch.h |   1 +
 arch/x86/kvm/Makefile   |   2 +
 arch/x86/kvm/vmx/sgx.c  | 223 
 4 files changed, 229 insertions(+)
 create mode 100644 arch/x86/kvm/vmx/sgx.c

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index 103df8cbdd24..27841a5d7851 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -928,6 +928,9 @@ struct kvm_arch {
 
bool guest_can_read_msr_platform_info;
bool exception_payload_enabled;
+
+   /* Guest can access the SGX PROVISIONKEY. */
+   bool sgx_provisioning_allowed;
 };
 
 struct kvm_vm_stat {
diff --git a/arch/x86/include/asm/sgx_arch.h b/arch/x86/include/asm/sgx_arch.h
index 39f731580ea8..e06f3ff717b4 100644
--- a/arch/x86/include/asm/sgx_arch.h
+++ b/arch/x86/include/asm/sgx_arch.h
@@ -8,6 +8,7 @@
 #ifndef _ASM_X86_SGX_ARCH_H
 #define _ASM_X86_SGX_ARCH_H
 
+#include 
 #include 
 
 #define SGX_CPUID  0x12
diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile
index 31ecf7a76d5a..f919c3e6abd7 100644
--- a/arch/x86/kvm/Makefile
+++ b/arch/x86/kvm/Makefile
@@ -13,6 +13,8 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o 
lapic.o \
   hyperv.o page_track.o debugfs.o
 
 kvm-intel-y+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o 
vmx/evmcs.o vmx/nested.o
+kvm-intel-$(CONFIG_INTEL_SGX_VIRTUALIZATION) += vmx/sgx.o
+
 kvm-amd-y  += svm.o pmu_amd.o
 
 obj-$(CONFIG_KVM)  += kvm.o
diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c
new file mode 100644
index ..5b08e7dcc3a3
--- /dev/null
+++ b/arch/x86/kvm/vmx/sgx.c
@@ -0,0 +1,223 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+
+#include "cpuid.h"
+#include "kvm_cache_regs.h"
+#include "vmx.h"
+#include "x86.h"
+
+/*
+ * ENCLS's memory operands use a fixed segment (DS) and a fixed
+ * address size based on the mode.  Related prefixes are ignored.
+ */
+static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset,
+int size, int alignment, gva_t *gva)
+{
+   struct kvm_segment s;
+   bool fault;
+
+   vmx_get_segment(vcpu, , VCPU_SREG_DS);
+
+   *gva = s.base + offset;
+
+   if (!IS_ALIGNED(*gva, alignment)) {
+   fault = true;
+   } else if (is_long_mode(vcpu)) {
+   fault = is_noncanonical_address(*gva, vcpu);
+   } else {
+   *gva &= 0x;
+   fault = (s.unusable) ||
+   (s.type != 2 && s.type != 3) ||
+   (*gva > s.limit) ||
+   ((s.base != 0 || s.limit != 0x) &&
+   (((u64)*gva + size - 1) > s.limit + 1));
+   }
+   if (fault)
+   kvm_inject_gp(vcpu, 0);
+   return fault ? -EINVAL : 0;
+}
+
+static int sgx_read_gva(struct kvm_vcpu *vcpu, gva_t gva, void *data,
+unsigned int size)
+{
+   struct x86_exception ex;
+
+   if (kvm_read_guest_virt(vcpu, gva, data, size, )) {
+   kvm_propagate_page_fault(vcpu, );
+   return -EFAULT;
+   }
+   return 0;
+}
+
+static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data,
+   unsigned int size)
+{
+   if (__copy_from_user(data, (void __user *)hva, size)) {
+   vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
+   vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION;
+   vcpu->run->internal.ndata = 2;
+   vcpu->run->internal.data[0] = hva;
+   vcpu->run->internal.data[1] = size;
+   return -EFAULT;
+   }
+   return 0;
+}
+
+static int sgx_gva_to_hva(struct kvm_vcpu *vcpu, gva_t gva, bool write,
+ unsigned long *hva)
+{

[RFC PATCH 18/21] KVM: x86: Invoke kvm_x86_ops->cpuid_update() after kvm_update_cpuid()

2019-07-26 Thread Sean Christopherson
VMX's virtualization of SGX adds a lovely dependency on the guest's
supported xcr0, which is calculated in kvm_update_cpuid().  VMX must
toggle its interception of SGX instructions based on the supported
xcr0, i.e. kvm_x86_ops->cpuid_update() is certainly the correct
location for the dependent code.

kvm_update_cpuid() was originally added by commit 2acf923e38fb ("KVM:
VMX: Enable XSAVE/XRSTOR for guest").  There is no indication that its
placement after kvm_x86_ops->cpuid_update() was anything more than a
"new function at the end" decision.

Inspection of the current code reveals no dependency on kvm_x86_ops's
cpuid_update() in kvm_update_cpuid() or any of its helpers.

  - SVM's sole update is to conditionally clear X86_FEATURE_X2APIC.
X86_FEATURE_X2APIC is only consumed by kvm_apic_set_state(), which
is already called immediately prior to kvm_x86_ops->cpuid_update().

  - VMX updates only nested VMX MSRs, allowed FEATURE_CONTROL bits,
and VMCS fields, e.g. secondary execution controls, none of which
should bleed back into kvm_update_cpuid() barring an egregious
dependency bug somewhere else.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/cpuid.c | 6 --
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 70e488951f25..4c235af5318c 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -222,8 +222,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu,
vcpu->arch.cpuid_nent = cpuid->nent;
cpuid_fix_nx_cap(vcpu);
kvm_apic_set_version(vcpu);
-   kvm_x86_ops->cpuid_update(vcpu);
r = kvm_update_cpuid(vcpu);
+   if (!r)
+   kvm_x86_ops->cpuid_update(vcpu);
 
 out:
vfree(cpuid_entries);
@@ -245,8 +246,9 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu,
goto out;
vcpu->arch.cpuid_nent = cpuid->nent;
kvm_apic_set_version(vcpu);
-   kvm_x86_ops->cpuid_update(vcpu);
r = kvm_update_cpuid(vcpu);
+   if (!r)
+   kvm_x86_ops->cpuid_update(vcpu);
 out:
return r;
 }
-- 
2.22.0



[RFC PATCH 07/21] KVM: x86: Add WARN_ON_ONCE(index!=0) in __do_cpuid_ent

2019-07-26 Thread Sean Christopherson
Except for one outlier, function 7, all cases in __do_cpuid_ent and
its children assume that the index passed in is zero.  Furthermore,
the index is fully under KVM's control and all callers pass an index
of zero.  In other words, a non-zero index would indicate either a
bug in the caller or a new case that is expected to be handled.  WARN
and return an error on a non-zero index and remove the now unreachable
code in function 7 for handling a non-zero index.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/cpuid.c | 57 ++--
 1 file changed, 29 insertions(+), 28 deletions(-)

diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c
index 4992e7c99588..70e488951f25 100644
--- a/arch/x86/kvm/cpuid.c
+++ b/arch/x86/kvm/cpuid.c
@@ -410,6 +410,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) |
F(MD_CLEAR);
 
+   /*
+* The code below assumes index == 0, which simplifies handling leafs
+* with a dynamic number of sub-leafs.  The index is fully under KVM's
+* control, i.e. a non-zero value is a bug.
+*/
+   if (WARN_ON_ONCE(index != 0))
+   return -EINVAL;
+
/* all calls to cpuid_count() should be made on the same cpu */
get_cpu();
 
@@ -480,38 +488,31 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 
*entry, u32 function,
entry->ecx = 0;
entry->edx = 0;
break;
-   case 7: {
+   case 7:
entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX;
/* Mask ebx against host capability word 9 */
-   if (index == 0) {
-   entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
-   cpuid_mask(>ebx, CPUID_7_0_EBX);
-   // TSC_ADJUST is emulated
-   entry->ebx |= F(TSC_ADJUST);
-   entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
-   f_la57 = entry->ecx & F(LA57);
-   cpuid_mask(>ecx, CPUID_7_ECX);
-   /* Set LA57 based on hardware capability. */
-   entry->ecx |= f_la57;
-   entry->ecx |= f_umip;
-   /* PKU is not yet implemented for shadow paging. */
-   if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
-   entry->ecx &= ~F(PKU);
-   entry->edx &= kvm_cpuid_7_0_edx_x86_features;
-   cpuid_mask(>edx, CPUID_7_EDX);
-   /*
-* We emulate ARCH_CAPABILITIES in software even
-* if the host doesn't support it.
-*/
-   entry->edx |= F(ARCH_CAPABILITIES);
-   } else {
-   entry->ebx = 0;
-   entry->ecx = 0;
-   entry->edx = 0;
-   }
+   entry->ebx &= kvm_cpuid_7_0_ebx_x86_features;
+   cpuid_mask(>ebx, CPUID_7_0_EBX);
+   // TSC_ADJUST is emulated
+   entry->ebx |= F(TSC_ADJUST);
+   entry->ecx &= kvm_cpuid_7_0_ecx_x86_features;
+   f_la57 = entry->ecx & F(LA57);
+   cpuid_mask(>ecx, CPUID_7_ECX);
+   /* Set LA57 based on hardware capability. */
+   entry->ecx |= f_la57;
+   entry->ecx |= f_umip;
+   /* PKU is not yet implemented for shadow paging. */
+   if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE))
+   entry->ecx &= ~F(PKU);
+   entry->edx &= kvm_cpuid_7_0_edx_x86_features;
+   cpuid_mask(>edx, CPUID_7_EDX);
+   /*
+* We emulate ARCH_CAPABILITIES in software even
+* if the host doesn't support it.
+*/
+   entry->edx |= F(ARCH_CAPABILITIES);
entry->eax = 0;
break;
-   }
case 9:
break;
case 0xa: { /* Architectural Performance Monitoring */
-- 
2.22.0



[RFC PATCH 14/21] x86/sgx: Add helpers to expose ECREATE and EINIT to KVM

2019-07-26 Thread Sean Christopherson
Provide wrappers around __ecreate() and __einit() to export their
functionality for use by KVM without having to export a large amount of
SGX boilerplate code.  Intermediate helpers also shelter KVM from the
ugliness of overloading the ENCLS return value to encode multiple error
formats in a single int.

KVM will use the helpers to trap-and-execute ECREATE and EINIT as part
its SGX virtualization.

Signed-off-by: Sean Christopherson 
---
 arch/x86/Kconfig   |  3 ++
 arch/x86/include/asm/sgx.h | 15 ++
 arch/x86/kernel/cpu/sgx/virt.c | 55 ++
 3 files changed, 73 insertions(+)
 create mode 100644 arch/x86/include/asm/sgx.h

diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig
index c1bdb9f85928..8bbc6a30588d 100644
--- a/arch/x86/Kconfig
+++ b/arch/x86/Kconfig
@@ -1969,6 +1969,9 @@ config INTEL_SGX_VIRTUALIZATION
  "raw" EPC for the purpose of exposing EPC to a KVM guest, i.e. a
  virtual machine, via a device node (/dev/sgx/virt_epc by default).
 
+ SGX virtualization also adds helpers that are used by KVM to trap
+ and execute certain ENCLS instructions on behalf of a KVM guest.
+
  If unsure, say N.
 
 config EFI
diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h
new file mode 100644
index ..f0f0176b8e2f
--- /dev/null
+++ b/arch/x86/include/asm/sgx.h
@@ -0,0 +1,15 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#ifndef _ASM_X86_SGX_H
+#define _ASM_X86_SGX_H
+
+#include 
+
+struct sgx_pageinfo;
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+int sgx_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, int *trapnr);
+int sgx_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr);
+#endif
+
+#endif /* _ASM_X86_SGX_H */
diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c
index 79ee5917a4fc..9e5bf4450bf7 100644
--- a/arch/x86/kernel/cpu/sgx/virt.c
+++ b/arch/x86/kernel/cpu/sgx/virt.c
@@ -251,3 +251,58 @@ int __init sgx_virt_epc_init(void)
 
return ret;
 }
+
+#if IS_ENABLED(CONFIG_KVM_INTEL)
+int sgx_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, int *trapnr)
+{
+   int ret;
+
+   __uaccess_begin();
+   ret = __ecreate(pageinfo, (void *)secs);
+   __uaccess_end();
+
+   if (encls_faulted(ret)) {
+   *trapnr = ENCLS_TRAPNR(ret);
+   return -EFAULT;
+   }
+   return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_ecreate);
+
+static int __sgx_einit(void __user *sigstruct, void __user *token,
+  void __user *secs)
+{
+   int ret;
+
+   __uaccess_begin();
+   ret =  __einit((void *)sigstruct, (void *)token, (void *)secs);
+   __uaccess_end();
+   return ret;
+}
+
+int sgx_einit(void __user *sigstruct, void __user *token,
+ void __user *secs, u64 *lepubkeyhash, int *trapnr)
+{
+   int ret;
+
+   if (!boot_cpu_has(X86_FEATURE_SGX_LC)) {
+   ret = __sgx_einit(sigstruct, token, secs);
+   } else {
+   preempt_disable();
+   sgx_update_lepubkeyhash_msrs(lepubkeyhash, false);
+   ret = __sgx_einit(sigstruct, token, secs);
+   if (ret == SGX_INVALID_EINITTOKEN) {
+   sgx_update_lepubkeyhash_msrs(lepubkeyhash, true);
+   ret = __sgx_einit(sigstruct, token, secs);
+   }
+   preempt_enable();
+   }
+
+   if (encls_faulted(ret)) {
+   *trapnr = ENCLS_TRAPNR(ret);
+   return -EFAULT;
+   }
+   return ret;
+}
+EXPORT_SYMBOL_GPL(sgx_einit);
+#endif
-- 
2.22.0



[RFC PATCH 06/21] KVM: x86: Add SGX sub-features leaf to reverse CPUID table

2019-07-26 Thread Sean Christopherson
CPUID_12_EAX is an Intel-defined feature bits leaf dedicated for SGX
that enumerates the SGX instruction sets that are supported by the
CPU, e.g. SGX1, SGX2, etc...

Since Linux only cares about two bits at this time (SGX1 and SGX2), the
SGX bits were relocated to Linux-defined word 8, i.e. CPUID_LNX_3,
instead of adding a dedicated SGX word so as to conserve space.  But,
to make KVM's life simple, the bit numbers of the SGX features were
intentionally kept the same between the Intel-defined leaf and the
Linux-defined leaf.

Add build-time assertions to ensure X86_FEATURE_SGX{1,2} are at the
expected locations, and that KVM isn't trying to do a reverse CPUID
lookup on a non-SGX bit in CPUID_LNX_3.

Relocate bit() to cpuid.h where it belongs (it's NOT a generic bit
function) and add a beefy comment explaining what the hell it's doing.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kvm/cpuid.h   | 20 
 arch/x86/kvm/emulate.c |  1 +
 arch/x86/kvm/x86.h |  5 -
 3 files changed, 21 insertions(+), 5 deletions(-)

diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h
index d78a61408243..aed49d639c3b 100644
--- a/arch/x86/kvm/cpuid.h
+++ b/arch/x86/kvm/cpuid.h
@@ -53,6 +53,7 @@ static const struct cpuid_reg reverse_cpuid[] = {
[CPUID_7_ECX] = { 7, 0, CPUID_ECX},
[CPUID_8000_0007_EBX] = {0x8007, 0, CPUID_EBX},
[CPUID_7_EDX] = { 7, 0, CPUID_EDX},
+   [CPUID_LNX_3] = {  0x12, 0, CPUID_EAX},
 };
 
 static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature)
@@ -61,6 +62,7 @@ static __always_inline struct cpuid_reg 
x86_feature_cpuid(unsigned x86_feature)
 
BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid));
BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0);
+   BUILD_BUG_ON(x86_leaf == CPUID_LNX_3 && (x86_feature & 31) > 1);
 
return reverse_cpuid[x86_leaf];
 }
@@ -89,6 +91,24 @@ static __always_inline int *guest_cpuid_get_register(struct 
kvm_vcpu *vcpu, unsi
}
 }
 
+/*
+ * Retrieve the bit from an X86_FEATURE_* definition using a simple AND to
+ * isolate the bit number from the feature definition.  Note that this works
+ * only for features that are NOT scattered, i.e. the X86_FEATURE_* bit number
+ * must match the hardware-defined CPUID bit number.  The only exception to
+ * this rule is the SGX sub-features leaf, which is scattered but only in the
+ * sense that its bits are relocated from hardware-defined leaf 0x12.0.EAX to
+ * Linux defined word 8, but its bit numbers are maintained (KVM asserts this
+ * expectation at build time).
+ */
+static __always_inline u32 bit(unsigned x86_feature)
+{
+   BUILD_BUG_ON((X86_FEATURE_SGX1 & 31) != 0);
+   BUILD_BUG_ON((X86_FEATURE_SGX2 & 31) != 1);
+
+   return 1 << (x86_feature & 31);
+}
+
 static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned 
x86_feature)
 {
int *reg;
diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c
index 4a387a235424..6ffe23febcd7 100644
--- a/arch/x86/kvm/emulate.c
+++ b/arch/x86/kvm/emulate.c
@@ -29,6 +29,7 @@
 #include "tss.h"
 #include "mmu.h"
 #include "pmu.h"
+#include "cpuid.h"
 
 /*
  * Operand types
diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h
index a470ff0868c5..1e0c7b17effa 100644
--- a/arch/x86/kvm/x86.h
+++ b/arch/x86/kvm/x86.h
@@ -139,11 +139,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu)
return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG));
 }
 
-static inline u32 bit(int bitno)
-{
-   return 1 << (bitno & 31);
-}
-
 static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu)
 {
return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48;
-- 
2.22.0



[RFC PATCH 03/21] x86/sgx: Move provisioning device to common code

2019-07-26 Thread Sean Christopherson
Move the provisioning device to common code in preparation for adding
support for SGX virtualization.  The provisioning device will need to be
instantiated if the native SGX driver *or* the virtual EPC "driver" is
loaded.

Signed-off-by: Sean Christopherson 
---
 arch/x86/kernel/cpu/sgx/driver/ioctl.c | 18 ++-
 arch/x86/kernel/cpu/sgx/driver/main.c  | 24 +-
 arch/x86/kernel/cpu/sgx/main.c | 44 +-
 arch/x86/kernel/cpu/sgx/sgx.h  |  1 +
 4 files changed, 47 insertions(+), 40 deletions(-)

diff --git a/arch/x86/kernel/cpu/sgx/driver/ioctl.c 
b/arch/x86/kernel/cpu/sgx/driver/ioctl.c
index 89b3fb81c15b..b7aa06920d10 100644
--- a/arch/x86/kernel/cpu/sgx/driver/ioctl.c
+++ b/arch/x86/kernel/cpu/sgx/driver/ioctl.c
@@ -794,26 +794,12 @@ static long sgx_ioc_enclave_set_attribute(struct file 
*filep, void __user *arg)
 {
struct sgx_encl *encl = filep->private_data;
struct sgx_enclave_set_attribute params;
-   struct file *attribute_file;
-   int ret;
 
if (copy_from_user(, arg, sizeof(params)))
return -EFAULT;
 
-   attribute_file = fget(params.attribute_fd);
-   if (!attribute_file)
-   return -EINVAL;
-
-   if (attribute_file->f_op != _provision_fops) {
-   ret = -EINVAL;
-   goto out;
-   }
-
-   encl->allowed_attributes |= SGX_ATTR_PROVISIONKEY;
-
-out:
-   fput(attribute_file);
-   return ret;
+   return sgx_set_attribute(>allowed_attributes,
+params.attribute_fd);
 }
 
 long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg)
diff --git a/arch/x86/kernel/cpu/sgx/driver/main.c 
b/arch/x86/kernel/cpu/sgx/driver/main.c
index d62bdc7ed4d9..1e107dd0d909 100644
--- a/arch/x86/kernel/cpu/sgx/driver/main.c
+++ b/arch/x86/kernel/cpu/sgx/driver/main.c
@@ -154,14 +154,8 @@ static const struct file_operations sgx_encl_fops = {
.get_unmapped_area  = sgx_get_unmapped_area,
 };
 
-const struct file_operations sgx_provision_fops = {
-   .owner  = THIS_MODULE,
-};
-
 static struct device sgx_encl_dev;
 static struct cdev sgx_encl_cdev;
-static struct device sgx_provision_dev;
-static struct cdev sgx_provision_cdev;
 
 int __init sgx_drv_init(void)
 {
@@ -202,38 +196,22 @@ int __init sgx_drv_init(void)
if (ret)
return ret;
 
-   ret = sgx_dev_init("sgx/provision", _provision_dev,
-  _provision_cdev, _provision_fops,
-  SGX_PROV_DEV_MINOR);
-   if (ret)
-   goto err_encl_dev;
-
sgx_encl_wq = alloc_workqueue("sgx-encl-wq",
  WQ_UNBOUND | WQ_FREEZABLE, 1);
if (!sgx_encl_wq) {
ret = -ENOMEM;
-   goto err_provision_dev;
+   goto err_encl_dev;
}
 
ret = cdev_device_add(_encl_cdev, _encl_dev);
if (ret)
goto err_encl_wq;
 
-   ret = cdev_device_add(_provision_cdev, _provision_dev);
-   if (ret)
-   goto err_encl_cdev;
-
return 0;
 
-err_encl_cdev:
-   cdev_device_del(_encl_cdev, _encl_dev);
-
 err_encl_wq:
destroy_workqueue(sgx_encl_wq);
 
-err_provision_dev:
-   put_device(_provision_dev);
-
 err_encl_dev:
put_device(_encl_dev);
 
diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c
index edbd465083c7..9f4473597620 100644
--- a/arch/x86/kernel/cpu/sgx/main.c
+++ b/arch/x86/kernel/cpu/sgx/main.c
@@ -2,6 +2,7 @@
 // Copyright(c) 2016-17 Intel Corporation.
 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -335,6 +336,31 @@ static struct bus_type sgx_bus_type = {
 };
 static dev_t sgx_devt;
 
+const struct file_operations sgx_provision_fops = {
+   .owner  = THIS_MODULE,
+};
+
+static struct device sgx_provision_dev;
+static struct cdev sgx_provision_cdev;
+
+int sgx_set_attribute(u64 *allowed_attributes, unsigned int attribute_fd)
+{
+   struct file *attribute_file;
+
+   attribute_file = fget(attribute_fd);
+   if (!attribute_file)
+   return -EINVAL;
+
+   if (attribute_file->f_op != _provision_fops) {
+   fput(attribute_file);
+   return -EINVAL;
+   }
+   fput(attribute_file);
+
+   *allowed_attributes |= SGX_ATTR_PROVISIONKEY;
+   return 0;
+}
+
 static void sgx_dev_release(struct device *dev)
 {
 
@@ -386,12 +412,28 @@ static __init int sgx_init(void)
if (ret < 0)
goto err_bus;
 
-   ret = sgx_drv_init();
+   ret = sgx_dev_init("sgx/provision", _provision_dev,
+  _provision_cdev, _provision_fops,
+  SGX_PROV_DEV_MINOR);
if (ret)
goto err_chrdev_region;
 
+   ret = cdev_device_add(_provision_cdev, _provision_dev);
+   if (ret)
+   goto err_provision_dev;
+
+   ret = 

[PATCH V3 net-next 10/10] net: hns3: use dev_info() instead of pr_info()

2019-07-26 Thread Huazhong Tan
dev_info() is more appropriate for printing messages when driver
initialization is done, so switch to dev_info().

Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c   | 4 +++-
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 ++-
 2 files changed, 5 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 30a7074..4138780 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -8862,7 +8862,9 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
hclge_state_init(hdev);
hdev->last_reset_time = jiffies;
 
-   pr_info("%s driver initialization finished.\n", HCLGE_DRIVER_NAME);
+   dev_info(>pdev->dev, "%s driver initialization finished.\n",
+HCLGE_DRIVER_NAME);
+
return 0;
 
 err_mdiobus_unreg:
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
index a13a0e1..ae0e6a6 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c
@@ -2695,7 +2695,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev)
}
 
hdev->last_reset_time = jiffies;
-   pr_info("finished initializing %s driver\n", HCLGEVF_DRIVER_NAME);
+   dev_info(>pdev->dev, "finished initializing %s driver\n",
+HCLGEVF_DRIVER_NAME);
 
return 0;
 
-- 
2.7.4



[PATCH V3 net-next 07/10] net: hns3: make hclge_service use delayed workqueue

2019-07-26 Thread Huazhong Tan
From: Yunsheng Lin 

Use delayed work instead of using timers to trigger the
hclge_service.

Simplify the code with one less middle function and in order
to support misc irq affinity.

Signed-off-by: Yunsheng Lin 
Reviewed-by: Peng Li 
Signed-off-by: Huazhong Tan 
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 52 +-
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h|  3 +-
 2 files changed, 21 insertions(+), 34 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 14199c4..13c9697 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -2513,8 +2513,12 @@ static void hclge_task_schedule(struct hclge_dev *hdev)
 {
if (!test_bit(HCLGE_STATE_DOWN, >state) &&
!test_bit(HCLGE_STATE_REMOVING, >state) &&
-   !test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, >state))
-   (void)schedule_work(>service_task);
+   !test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, >state)) {
+   hdev->hw_stats.stats_timer++;
+   hdev->fd_arfs_expire_timer++;
+   mod_delayed_work(system_wq, >service_task,
+round_jiffies_relative(HZ));
+   }
 }
 
 static int hclge_get_mac_link_status(struct hclge_dev *hdev)
@@ -2729,25 +2733,6 @@ static int hclge_get_status(struct hnae3_handle *handle)
return hdev->hw.mac.link;
 }
 
-static void hclge_service_timer(struct timer_list *t)
-{
-   struct hclge_dev *hdev = from_timer(hdev, t, service_timer);
-
-   mod_timer(>service_timer, jiffies + HZ);
-   hdev->hw_stats.stats_timer++;
-   hdev->fd_arfs_expire_timer++;
-   hclge_task_schedule(hdev);
-}
-
-static void hclge_service_complete(struct hclge_dev *hdev)
-{
-   WARN_ON(!test_bit(HCLGE_STATE_SERVICE_SCHED, >state));
-
-   /* Flush memory before next watchdog */
-   smp_mb__before_atomic();
-   clear_bit(HCLGE_STATE_SERVICE_SCHED, >state);
-}
-
 static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval)
 {
u32 rst_src_reg, cmdq_src_reg, msix_src_reg;
@@ -3594,7 +3579,9 @@ static void hclge_update_vport_alive(struct hclge_dev 
*hdev)
 static void hclge_service_task(struct work_struct *work)
 {
struct hclge_dev *hdev =
-   container_of(work, struct hclge_dev, service_task);
+   container_of(work, struct hclge_dev, service_task.work);
+
+   clear_bit(HCLGE_STATE_SERVICE_SCHED, >state);
 
if (hdev->hw_stats.stats_timer >= HCLGE_STATS_TIMER_INTERVAL) {
hclge_update_stats_for_all(hdev);
@@ -3609,7 +3596,8 @@ static void hclge_service_task(struct work_struct *work)
hclge_rfs_filter_expire(hdev);
hdev->fd_arfs_expire_timer = 0;
}
-   hclge_service_complete(hdev);
+
+   hclge_task_schedule(hdev);
 }
 
 struct hclge_vport *hclge_get_vport(struct hnae3_handle *handle)
@@ -6148,10 +6136,13 @@ static void hclge_set_timer_task(struct hnae3_handle 
*handle, bool enable)
struct hclge_dev *hdev = vport->back;
 
if (enable) {
-   mod_timer(>service_timer, jiffies + HZ);
+   hclge_task_schedule(hdev);
} else {
-   del_timer_sync(>service_timer);
-   cancel_work_sync(>service_task);
+   /* Set the DOWN flag here to disable the service to be
+* scheduled again
+*/
+   set_bit(HCLGE_STATE_DOWN, >state);
+   cancel_delayed_work_sync(>service_task);
clear_bit(HCLGE_STATE_SERVICE_SCHED, >state);
}
 }
@@ -8590,12 +8581,10 @@ static void hclge_state_uninit(struct hclge_dev *hdev)
set_bit(HCLGE_STATE_DOWN, >state);
set_bit(HCLGE_STATE_REMOVING, >state);
 
-   if (hdev->service_timer.function)
-   del_timer_sync(>service_timer);
if (hdev->reset_timer.function)
del_timer_sync(>reset_timer);
-   if (hdev->service_task.func)
-   cancel_work_sync(>service_task);
+   if (hdev->service_task.work.func)
+   cancel_delayed_work_sync(>service_task);
if (hdev->rst_service_task.func)
cancel_work_sync(>rst_service_task);
if (hdev->mbx_service_task.func)
@@ -8800,9 +8789,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
 
hclge_dcb_ops_set(hdev);
 
-   timer_setup(>service_timer, hclge_service_timer, 0);
timer_setup(>reset_timer, hclge_reset_timer, 0);
-   INIT_WORK(>service_task, hclge_service_task);
+   INIT_DELAYED_WORK(>service_task, hclge_service_task);
INIT_WORK(>rst_service_task, hclge_reset_service_task);
INIT_WORK(>mbx_service_task, hclge_mailbox_service_task);
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h 

[PATCH V3 net-next 08/10] net: hns3: add interrupt affinity support for misc interrupt

2019-07-26 Thread Huazhong Tan
From: Yunsheng Lin 

The misc interrupt is used to schedule the reset and mailbox
subtask, and service_task delayed_work is used to do periodic
management work each second.

This patch sets the above three subtask's affinity using the
misc interrupt' affinity.

Also this patch setups a affinity notify for misc interrupt to
allow user to change the above three subtask's affinity.

Signed-off-by: Yunsheng Lin 
Signed-off-by: Peng Li 
Signed-off-by: Huazhong Tan 
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 53 --
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h|  4 ++
 2 files changed, 53 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 13c9697..30a7074 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -1270,6 +1270,12 @@ static int hclge_configure(struct hclge_dev *hdev)
 
hclge_init_kdump_kernel_config(hdev);
 
+   /* Set the init affinity based on pci func number */
+   i = cpumask_weight(cpumask_of_node(dev_to_node(>pdev->dev)));
+   i = i ? PCI_FUNC(hdev->pdev->devfn) % i : 0;
+   cpumask_set_cpu(cpumask_local_spread(i, dev_to_node(>pdev->dev)),
+   >affinity_mask);
+
return ret;
 }
 
@@ -2499,14 +2505,16 @@ static void hclge_mbx_task_schedule(struct hclge_dev 
*hdev)
 {
if (!test_bit(HCLGE_STATE_CMD_DISABLE, >state) &&
!test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, >state))
-   schedule_work(>mbx_service_task);
+   queue_work_on(cpumask_first(>affinity_mask), system_wq,
+ >mbx_service_task);
 }
 
 static void hclge_reset_task_schedule(struct hclge_dev *hdev)
 {
if (!test_bit(HCLGE_STATE_REMOVING, >state) &&
!test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, >state))
-   schedule_work(>rst_service_task);
+   queue_work_on(cpumask_first(>affinity_mask), system_wq,
+ >rst_service_task);
 }
 
 static void hclge_task_schedule(struct hclge_dev *hdev)
@@ -2516,8 +2524,9 @@ static void hclge_task_schedule(struct hclge_dev *hdev)
!test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, >state)) {
hdev->hw_stats.stats_timer++;
hdev->fd_arfs_expire_timer++;
-   mod_delayed_work(system_wq, >service_task,
-round_jiffies_relative(HZ));
+   mod_delayed_work_on(cpumask_first(>affinity_mask),
+   system_wq, >service_task,
+   round_jiffies_relative(HZ));
}
 }
 
@@ -2903,6 +2912,36 @@ static void hclge_get_misc_vector(struct hclge_dev *hdev)
hdev->num_msi_used += 1;
 }
 
+static void hclge_irq_affinity_notify(struct irq_affinity_notify *notify,
+ const cpumask_t *mask)
+{
+   struct hclge_dev *hdev = container_of(notify, struct hclge_dev,
+ affinity_notify);
+
+   cpumask_copy(>affinity_mask, mask);
+}
+
+static void hclge_irq_affinity_release(struct kref *ref)
+{
+}
+
+static void hclge_misc_affinity_setup(struct hclge_dev *hdev)
+{
+   irq_set_affinity_hint(hdev->misc_vector.vector_irq,
+ >affinity_mask);
+
+   hdev->affinity_notify.notify = hclge_irq_affinity_notify;
+   hdev->affinity_notify.release = hclge_irq_affinity_release;
+   irq_set_affinity_notifier(hdev->misc_vector.vector_irq,
+ >affinity_notify);
+}
+
+static void hclge_misc_affinity_teardown(struct hclge_dev *hdev)
+{
+   irq_set_affinity_notifier(hdev->misc_vector.vector_irq, NULL);
+   irq_set_affinity_hint(hdev->misc_vector.vector_irq, NULL);
+}
+
 static int hclge_misc_irq_init(struct hclge_dev *hdev)
 {
int ret;
@@ -8794,6 +8833,11 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev)
INIT_WORK(>rst_service_task, hclge_reset_service_task);
INIT_WORK(>mbx_service_task, hclge_mailbox_service_task);
 
+   /* Setup affinity after service timer setup because add_timer_on
+* is called in affinity notify.
+*/
+   hclge_misc_affinity_setup(hdev);
+
hclge_clear_all_event_cause(hdev);
hclge_clear_resetting_state(hdev);
 
@@ -8955,6 +8999,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev 
*ae_dev)
struct hclge_dev *hdev = ae_dev->priv;
struct hclge_mac *mac = >hw.mac;
 
+   hclge_misc_affinity_teardown(hdev);
hclge_state_uninit(hdev);
 
if (mac->phydev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
index dde8f22..688e425 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
+++ 

[PATCH V3 net-next 02/10] net: hns3: add a check for get_reset_level

2019-07-26 Thread Huazhong Tan
From: Guangbin Huang 

For some cases, ops->get_reset_level may not be implemented, so we
should check whether it is NULL before calling get_reset_level.

Signed-off-by: Guangbin Huang 
Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 08af782..4d58c53 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -1963,7 +1963,7 @@ static pci_ers_result_t hns3_slot_reset(struct pci_dev 
*pdev)
 
ops = ae_dev->ops;
/* request the reset */
-   if (ops->reset_event) {
+   if (ops->reset_event && ops->get_reset_level) {
if (ae_dev->hw_err_reset_req) {
reset_type = ops->get_reset_level(ae_dev,
_dev->hw_err_reset_req);
-- 
2.7.4



[PATCH V3 net-next 00/10] net: hns3: some code optimizations & bugfixes & features

2019-07-26 Thread Huazhong Tan
This patch-set includes code optimizations, bugfixes and features for
the HNS3 ethernet controller driver.

[patch 1/10] checks reset status before setting channel.

[patch 2/10] adds a NULL pointer checking.

[patch 3/10] removes reset level upgrading when current reset fails.

[patch 4/10] fixes a GFP flags errors when holding spin_lock.

[patch 5/10] modifies firmware version format.

[patch 6/10] adds some print information which is off by default.

[patch 7/10 - 8/10] adds two code optimizations about interrupt handler
and work task.

[patch 9/10] adds support for using order 1 pages with a 4K buffer.

[patch 10/10] modifies messages prints with dev_info() instead of
pr_info().

Change log:
V2->V3: fixes comments from Saeed Mahameed and Joe Perches.
V1->V2: fixes comments from Saeed Mahameed and
removes previous [patch 4/11] and [patch 11/11]
which needs further discussion, and adds a new
patch [11/11] suggested by Saeed Mahameed.


Guangbin Huang (1):
  net: hns3: add a check for get_reset_level

Huazhong Tan (2):
  net: hns3: remove upgrade reset level when reset fail
  net: hns3: use dev_info() instead of pr_info()

Jian Shen (1):
  net: hns3: add reset checking before set channels

Yonglong Liu (1):
  net: hns3: add debug messages to identify eth down cause

Yufeng Mo (2):
  net: hns3: change GFP flag during lock period
  net: hns3: modify firmware version display format

Yunsheng Lin (3):
  net: hns3: make hclge_service use delayed workqueue
  net: hns3: add interrupt affinity support for misc interrupt
  net: hns3: Add support for using order 1 pages with a 4K buffer

 drivers/net/ethernet/hisilicon/hns3/hnae3.h|   9 ++
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c|  33 -
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h|  15 ++-
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c |  34 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c |  10 +-
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c |  11 ++
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 135 -
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h|   7 +-
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c   |  10 +-
 .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c  |   3 +-
 10 files changed, 195 insertions(+), 72 deletions(-)

-- 
2.7.4



[PATCH V3 net-next 04/10] net: hns3: change GFP flag during lock period

2019-07-26 Thread Huazhong Tan
From: Yufeng Mo 

When allocating memory, the GFP_KERNEL cannot be used during the
spin_lock period. This is because it may cause scheduling when holding
spin_lock. This patch changes GFP flag to GFP_ATOMIC in this case.

Fixes: dd74f815dd41 ("net: hns3: Add support for rule add/delete for flow 
director")
Signed-off-by: Yufeng Mo 
Signed-off-by: lipeng 00277521 
Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 3c64d70..14199c4 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -5796,7 +5796,7 @@ static int hclge_add_fd_entry_by_arfs(struct hnae3_handle 
*handle, u16 queue_id,
return -ENOSPC;
}
 
-   rule = kzalloc(sizeof(*rule), GFP_KERNEL);
+   rule = kzalloc(sizeof(*rule), GFP_ATOMIC);
if (!rule) {
spin_unlock_bh(>fd_rule_lock);
 
-- 
2.7.4



[PATCH V3 net-next 06/10] net: hns3: add debug messages to identify eth down cause

2019-07-26 Thread Huazhong Tan
From: Yonglong Liu 

Sometimes we can only see that the eth interface has gone down/up via
dmesg, but cannot tell why the eth interface went down. So add some
debug messages to identify the cause of this.

Signed-off-by: Yonglong Liu 
Signed-off-by: Peng Li 
Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c   | 18 ++
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c| 19 +++
 .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c| 11 +++
 3 files changed, 48 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 4d58c53..973c57b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -459,6 +459,9 @@ static int hns3_nic_net_open(struct net_device *netdev)
h->ae_algo->ops->set_timer_task(priv->ae_handle, true);
 
hns3_config_xps(priv);
+
+   netif_info(h, drv, netdev, "net open\n");
+
return 0;
 }
 
@@ -519,6 +522,8 @@ static int hns3_nic_net_stop(struct net_device *netdev)
if (test_and_set_bit(HNS3_NIC_STATE_DOWN, >state))
return 0;
 
+   netif_info(h, drv, netdev, "net stop\n");
+
if (h->ae_algo->ops->set_timer_task)
h->ae_algo->ops->set_timer_task(priv->ae_handle, false);
 
@@ -1550,6 +1555,8 @@ static int hns3_setup_tc(struct net_device *netdev, void 
*type_data)
h = hns3_get_handle(netdev);
kinfo = >kinfo;
 
+   netif_info(h, drv, netdev, "setup tc: num_tc=%u\n", tc);
+
return (kinfo->dcb_ops && kinfo->dcb_ops->setup_tc) ?
kinfo->dcb_ops->setup_tc(h, tc, prio_tc) : -EOPNOTSUPP;
 }
@@ -1593,6 +1600,10 @@ static int hns3_ndo_set_vf_vlan(struct net_device 
*netdev, int vf, u16 vlan,
struct hnae3_handle *h = hns3_get_handle(netdev);
int ret = -EIO;
 
+   netif_info(h, drv, netdev,
+  "set vf vlan: vf=%d, vlan=%u, qos=%u, vlan_proto=%u\n",
+  vf, vlan, qos, vlan_proto);
+
if (h->ae_algo->ops->set_vf_vlan_filter)
ret = h->ae_algo->ops->set_vf_vlan_filter(h, vf, vlan,
  qos, vlan_proto);
@@ -1611,6 +1622,9 @@ static int hns3_nic_change_mtu(struct net_device *netdev, 
int new_mtu)
if (!h->ae_algo->ops->set_mtu)
return -EOPNOTSUPP;
 
+   netif_info(h, drv, netdev,
+  "change mtu from %u to %d\n", netdev->mtu, new_mtu);
+
ret = h->ae_algo->ops->set_mtu(h, new_mtu);
if (ret)
netdev_err(netdev, "failed to change MTU in hardware %d\n",
@@ -4395,6 +4409,10 @@ int hns3_set_channels(struct net_device *netdev,
if (kinfo->rss_size == new_tqp_num)
return 0;
 
+   netif_info(h, drv, netdev,
+  "set channels: tqp_num=%u, rxfh=%d\n",
+  new_tqp_num, rxfh_configured);
+
ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT);
if (ret)
return ret;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index e71c92b..8553200 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -311,6 +311,8 @@ static void hns3_self_test(struct net_device *ndev,
if (eth_test->flags != ETH_TEST_FL_OFFLINE)
return;
 
+   netif_info(h, drv, ndev, "self test start");
+
st_param[HNAE3_LOOP_APP][0] = HNAE3_LOOP_APP;
st_param[HNAE3_LOOP_APP][1] =
h->flags & HNAE3_SUPPORT_APP_LOOPBACK;
@@ -374,6 +376,8 @@ static void hns3_self_test(struct net_device *ndev,
 
if (if_running)
ndev->netdev_ops->ndo_open(ndev);
+
+   netif_info(h, drv, ndev, "self test end\n");
 }
 
 static int hns3_get_sset_count(struct net_device *netdev, int stringset)
@@ -604,6 +608,10 @@ static int hns3_set_pauseparam(struct net_device *netdev,
 {
struct hnae3_handle *h = hns3_get_handle(netdev);
 
+   netif_info(h, drv, netdev,
+  "set pauseparam: autoneg=%u, rx:%u, tx:%u\n",
+  param->autoneg, param->rx_pause, param->tx_pause);
+
if (h->ae_algo->ops->set_pauseparam)
return h->ae_algo->ops->set_pauseparam(h, param->autoneg,
   param->rx_pause,
@@ -743,6 +751,11 @@ static int hns3_set_link_ksettings(struct net_device 
*netdev,
if (cmd->base.speed == SPEED_1000 && cmd->base.duplex == DUPLEX_HALF)
return -EINVAL;
 
+   netif_info(handle, drv, netdev,
+  "set link(%s): autoneg=%u, speed=%u, duplex=%u\n",
+  netdev->phydev ? "phy" : "mac",
+  cmd->base.autoneg, cmd->base.speed, cmd->base.duplex);
+
/* Only support ksettings_set for netdev with phy 

[PATCH V3 net-next 01/10] net: hns3: add reset checking before set channels

2019-07-26 Thread Huazhong Tan
From: Jian Shen 

hns3_set_channels() should check the resetting status firstly,
since the device will reinitialize when resetting. If the
reset has not completed, the hns3_set_channels() may access
invalid memory.

Signed-off-by: Jian Shen 
Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 69f7ef8..08af782 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -4378,6 +4378,9 @@ int hns3_set_channels(struct net_device *netdev,
u16 org_tqp_num;
int ret;
 
+   if (hns3_nic_resetting(netdev))
+   return -EBUSY;
+
if (ch->rx_count || ch->tx_count)
return -EINVAL;
 
-- 
2.7.4



[PATCH V3 net-next 03/10] net: hns3: remove upgrade reset level when reset fail

2019-07-26 Thread Huazhong Tan
Currently, hclge_reset_err_handle() will assert a global reset
when the failing count is smaller than MAX_RESET_FAIL_CNT, which
will affect other running functions.

So this patch removes this upgrading, and uses re-scheduling reset
task to do it.

Signed-off-by: Huazhong Tan 
Reviewed-by: Yunsheng Lin 
---
 .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 28 +++---
 1 file changed, 8 insertions(+), 20 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
index 3fde5471..3c64d70 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c
@@ -3305,7 +3305,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev 
*hdev)
return ret;
 }
 
-static bool hclge_reset_err_handle(struct hclge_dev *hdev, bool is_timeout)
+static bool hclge_reset_err_handle(struct hclge_dev *hdev)
 {
 #define MAX_RESET_FAIL_CNT 5
 
@@ -3322,20 +3322,11 @@ static bool hclge_reset_err_handle(struct hclge_dev 
*hdev, bool is_timeout)
return false;
} else if (hdev->reset_fail_cnt < MAX_RESET_FAIL_CNT) {
hdev->reset_fail_cnt++;
-   if (is_timeout) {
-   set_bit(hdev->reset_type, >reset_pending);
-   dev_info(>pdev->dev,
-"re-schedule to wait for hw reset done\n");
-   return true;
-   }
-
-   dev_info(>pdev->dev, "Upgrade reset level\n");
-   hclge_clear_reset_cause(hdev);
-   set_bit(HNAE3_GLOBAL_RESET, >default_reset_request);
-   mod_timer(>reset_timer,
- jiffies + HCLGE_RESET_INTERVAL);
-
-   return false;
+   set_bit(hdev->reset_type, >reset_pending);
+   dev_info(>pdev->dev,
+"re-schedule reset task(%d)\n",
+hdev->reset_fail_cnt);
+   return true;
}
 
hclge_clear_reset_cause(hdev);
@@ -3382,7 +3373,6 @@ static int hclge_reset_stack(struct hclge_dev *hdev)
 static void hclge_reset(struct hclge_dev *hdev)
 {
struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev);
-   bool is_timeout = false;
int ret;
 
/* Initialize ae_dev reset status as well, in case enet layer wants to
@@ -3410,10 +3400,8 @@ static void hclge_reset(struct hclge_dev *hdev)
if (ret)
goto err_reset;
 
-   if (hclge_reset_wait(hdev)) {
-   is_timeout = true;
+   if (hclge_reset_wait(hdev))
goto err_reset;
-   }
 
hdev->rst_stats.hw_reset_done_cnt++;
 
@@ -3465,7 +3453,7 @@ static void hclge_reset(struct hclge_dev *hdev)
 err_reset_lock:
rtnl_unlock();
 err_reset:
-   if (hclge_reset_err_handle(hdev, is_timeout))
+   if (hclge_reset_err_handle(hdev))
hclge_reset_task_schedule(hdev);
 }
 
-- 
2.7.4



[PATCH V3 net-next 05/10] net: hns3: modify firmware version display format

2019-07-26 Thread Huazhong Tan
From: Yufeng Mo 

This patch modifies firmware version display format in
hclge(vf)_cmd_init() and hns3_get_drvinfo(). Also, adds
some optimizations for firmware version display format.

Signed-off-by: Yufeng Mo 
Signed-off-by: Peng Li 
Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hnae3.h  |  9 +
 drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c   | 15 +--
 drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c   | 10 +-
 drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c | 10 +-
 4 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h 
b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
index 48c7b70..a4624db 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h
@@ -179,6 +179,15 @@ struct hnae3_vector_info {
 #define HNAE3_RING_GL_RX 0
 #define HNAE3_RING_GL_TX 1
 
+#define HNAE3_FW_VERSION_BYTE3_SHIFT   24
+#define HNAE3_FW_VERSION_BYTE3_MASKGENMASK(31, 24)
+#define HNAE3_FW_VERSION_BYTE2_SHIFT   16
+#define HNAE3_FW_VERSION_BYTE2_MASKGENMASK(23, 16)
+#define HNAE3_FW_VERSION_BYTE1_SHIFT   8
+#define HNAE3_FW_VERSION_BYTE1_MASKGENMASK(15, 8)
+#define HNAE3_FW_VERSION_BYTE0_SHIFT   0
+#define HNAE3_FW_VERSION_BYTE0_MASKGENMASK(7, 0)
+
 struct hnae3_ring_chain_node {
struct hnae3_ring_chain_node *next;
u32 tqp_index;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
index 5bff98a..e71c92b 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c
@@ -527,6 +527,7 @@ static void hns3_get_drvinfo(struct net_device *netdev,
 {
struct hns3_nic_priv *priv = netdev_priv(netdev);
struct hnae3_handle *h = priv->ae_handle;
+   u32 fw_version;
 
if (!h->ae_algo->ops->get_fw_version) {
netdev_err(netdev, "could not get fw version!\n");
@@ -545,8 +546,18 @@ static void hns3_get_drvinfo(struct net_device *netdev,
sizeof(drvinfo->bus_info));
drvinfo->bus_info[ETHTOOL_BUSINFO_LEN - 1] = '\0';
 
-   snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "0x%08x",
-priv->ae_handle->ae_algo->ops->get_fw_version(h));
+   fw_version = priv->ae_handle->ae_algo->ops->get_fw_version(h);
+
+   snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version),
+"%lu.%lu.%lu.%lu",
+hnae3_get_field(fw_version, HNAE3_FW_VERSION_BYTE3_MASK,
+HNAE3_FW_VERSION_BYTE3_SHIFT),
+hnae3_get_field(fw_version, HNAE3_FW_VERSION_BYTE2_MASK,
+HNAE3_FW_VERSION_BYTE2_SHIFT),
+hnae3_get_field(fw_version, HNAE3_FW_VERSION_BYTE1_MASK,
+HNAE3_FW_VERSION_BYTE1_SHIFT),
+hnae3_get_field(fw_version, HNAE3_FW_VERSION_BYTE0_MASK,
+HNAE3_FW_VERSION_BYTE0_SHIFT));
 }
 
 static u32 hns3_get_link(struct net_device *netdev)
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
index 22f6acd..d9858f2 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c
@@ -419,7 +419,15 @@ int hclge_cmd_init(struct hclge_dev *hdev)
}
hdev->fw_version = version;
 
-   dev_info(>pdev->dev, "The firmware version is %08x\n", version);
+   dev_info(>pdev->dev, "The firmware version is %lu.%lu.%lu.%lu\n",
+hnae3_get_field(version, HNAE3_FW_VERSION_BYTE3_MASK,
+HNAE3_FW_VERSION_BYTE3_SHIFT),
+hnae3_get_field(version, HNAE3_FW_VERSION_BYTE2_MASK,
+HNAE3_FW_VERSION_BYTE2_SHIFT),
+hnae3_get_field(version, HNAE3_FW_VERSION_BYTE1_MASK,
+HNAE3_FW_VERSION_BYTE1_SHIFT),
+hnae3_get_field(version, HNAE3_FW_VERSION_BYTE0_MASK,
+HNAE3_FW_VERSION_BYTE0_SHIFT));
 
return 0;
 
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c
index 652b796..8f21eb3 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c
@@ -405,7 +405,15 @@ int hclgevf_cmd_init(struct hclgevf_dev *hdev)
}
hdev->fw_version = version;
 
-   dev_info(>pdev->dev, "The firmware version is %08x\n", version);
+   dev_info(>pdev->dev, "The firmware version is %lu.%lu.%lu.%lu\n",
+hnae3_get_field(version, HNAE3_FW_VERSION_BYTE3_MASK,
+HNAE3_FW_VERSION_BYTE3_SHIFT),
+hnae3_get_field(version, 

[PATCH V3 net-next 09/10] net: hns3: Add support for using order 1 pages with a 4K buffer

2019-07-26 Thread Huazhong Tan
From: Yunsheng Lin 

Hardware supports 0.5K, 1K, 2K, 4K RX buffer size, the
RX buffer can not be reused because the hns3_page_order
return 0 when page size and RX buffer size are both 4096.

So this patch changes the hns3_page_order to return 1 when
RX buffer is greater than half of the page size and page size
is less than 8192, and dev_alloc_pages has already been used
to allocate the compound page for RX buffer.

This patch also changes hnae3_* to hns3_* for page order
and RX buffer size calculation because they are used in
hns3 module.

Signed-off-by: Yunsheng Lin 
Reviewed-by: Peng Li 
Signed-off-by: Huazhong Tan 
---
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 10 +-
 drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 15 ---
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
index 973c57b..59a6076 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
@@ -2081,7 +2081,7 @@ static void hns3_set_default_feature(struct net_device 
*netdev)
 static int hns3_alloc_buffer(struct hns3_enet_ring *ring,
 struct hns3_desc_cb *cb)
 {
-   unsigned int order = hnae3_page_order(ring);
+   unsigned int order = hns3_page_order(ring);
struct page *p;
 
p = dev_alloc_pages(order);
@@ -2092,7 +2092,7 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring,
cb->page_offset = 0;
cb->reuse_flag = 0;
cb->buf  = page_address(p);
-   cb->length = hnae3_page_size(ring);
+   cb->length = hns3_page_size(ring);
cb->type = DESC_TYPE_PAGE;
 
return 0;
@@ -2395,7 +2395,7 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int 
i,
 {
struct hns3_desc *desc = >desc[ring->next_to_clean];
int size = le16_to_cpu(desc->rx.size);
-   u32 truesize = hnae3_buf_size(ring);
+   u32 truesize = hns3_buf_size(ring);
 
skb_add_rx_frag(skb, i, desc_cb->priv, desc_cb->page_offset + pull_len,
size - pull_len, truesize);
@@ -2410,7 +2410,7 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int 
i,
/* Move offset up to the next cache line */
desc_cb->page_offset += truesize;
 
-   if (desc_cb->page_offset + truesize <= hnae3_page_size(ring)) {
+   if (desc_cb->page_offset + truesize <= hns3_page_size(ring)) {
desc_cb->reuse_flag = 1;
/* Bump ref count on page before it is given */
get_page(desc_cb->priv);
@@ -2692,7 +2692,7 @@ static int hns3_add_frag(struct hns3_enet_ring *ring, 
struct hns3_desc *desc,
}
 
if (ring->tail_skb) {
-   head_skb->truesize += hnae3_buf_size(ring);
+   head_skb->truesize += hns3_buf_size(ring);
head_skb->data_len += le16_to_cpu(desc->rx.size);
head_skb->len += le16_to_cpu(desc->rx.size);
skb = ring->tail_skb;
diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h 
b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
index 848b866..1a17856 100644
--- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
+++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h
@@ -608,9 +608,18 @@ static inline bool hns3_nic_resetting(struct net_device 
*netdev)
 
 #define tx_ring_data(priv, idx) ((priv)->ring_data[idx])
 
-#define hnae3_buf_size(_ring) ((_ring)->buf_size)
-#define hnae3_page_order(_ring) (get_order(hnae3_buf_size(_ring)))
-#define hnae3_page_size(_ring) (PAGE_SIZE << (u32)hnae3_page_order(_ring))
+#define hns3_buf_size(_ring) ((_ring)->buf_size)
+
+static inline unsigned int hns3_page_order(struct hns3_enet_ring *ring)
+{
+#if (PAGE_SIZE < 8192)
+   if (ring->buf_size > (PAGE_SIZE / 2))
+   return 1;
+#endif
+   return 0;
+}
+
+#define hns3_page_size(_ring) (PAGE_SIZE << hns3_page_order(_ring))
 
 /* iterator for handling rings in ring group */
 #define hns3_for_each_ring(pos, head) \
-- 
2.7.4



Re: [PATCH 5.2 00/66] 5.2.4-stable review

2019-07-26 Thread Naresh Kamboju
On Fri, 26 Jul 2019 at 20:55, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 5.2.4 release.
> There are 66 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.4-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-5.2.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Summary


kernel: 5.2.4-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-5.2.y
git commit: d61e440a1852a64d8a2d0d358b9582b19157e039
git describe: v5.2.3-67-gd61e440a1852
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-5.2-oe/build/v5.2.3-67-gd61e440a1852

No regressions (compared to build v5.2.3)

No fixes (compared to build v5.2.3)

Ran 22512 total tests in the following environments and test suites.

Environments
--
- dragonboard-410c
- hi6220-hikey
- i386
- juno-r2
- qemu_arm
- qemu_arm64
- qemu_i386
- qemu_x86_64
- x15
- x86

Test Suites
---
* build
* install-android-platform-tools-r2600
* kselftest
* libgpiod
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* network-basic-tests
* perf
* spectre-meltdown-checker-test
* v4l2-compliance
* ltp-fs-tests
* ltp-open-posix-tests
* kselftest-vsyscall-mode-native
* kselftest-vsyscall-mode-none
* kvm-unit-tests

-- 
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH 5.1 00/62] 5.1.21-stable review

2019-07-26 Thread Naresh Kamboju
On Fri, 26 Jul 2019 at 20:59, Greg Kroah-Hartman
 wrote:
>
> Note, this will be the LAST 5.1.y kernel release.  Everyone should move
> to the 5.2.y series at this point in time.
>
> This is the start of the stable review cycle for the 5.1.21 release.
> There are 62 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.1.21-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-5.1.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Summary


kernel: 5.1.21-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-5.1.y
git commit: f878628d8f1efc883e9bd6f9f81173194b4a01dd
git describe: v5.1.20-63-gf878628d8f1e
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-5.1-oe/build/v5.1.20-63-gf878628d8f1e

No regressions (compared to build v5.1.20)

No fixes (compared to build v5.1.20)

Ran 21561 total tests in the following environments and test suites.

Environments
--
- dragonboard-410c
- hi6220-hikey
- i386
- juno-r2
- qemu_arm
- qemu_arm64
- qemu_i386
- qemu_x86_64
- x15
- x86

Test Suites
---
* build
* install-android-platform-tools-r2600
* kselftest
* libgpiod
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* network-basic-tests
* perf
* spectre-meltdown-checker-test
* v4l2-compliance
* ltp-open-posix-tests
* kvm-unit-tests
* kselftest-vsyscall-mode-native
* kselftest-vsyscall-mode-none

-- 
Linaro LKFT
https://lkft.linaro.org


Re: [PATCH] hv_sock: use HV_HYP_PAGE_SIZE instead of PAGE_SIZE_4K

2019-07-26 Thread kbuild test robot
Hi Himadri,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on linus/master]
[cannot apply to v5.3-rc1 next-20190726]
[if your patch is applied to the wrong git tree, please drop us a note to help 
improve the system]

url:
https://github.com/0day-ci/linux/commits/Himadri-Pandya/hv_sock-use-HV_HYP_PAGE_SIZE-instead-of-PAGE_SIZE_4K/20190726-085229
config: x86_64-allyesconfig (attached as .config)
compiler: gcc-7 (Debian 7.4.0-10) 7.4.0
reproduce:
# save the attached .config to linux build tree
make ARCH=x86_64 

If you fix the issue, kindly add following tag
Reported-by: kbuild test robot 

All error/warnings (new ones prefixed by >>):

>> net/vmw_vsock/hyperv_transport.c:58:28: error: 'HV_HYP_PAGE_SIZE' undeclared 
>> here (not in a function); did you mean 'HV_MESSAGE_SIZE'?
#define HVS_SEND_BUF_SIZE (HV_HYP_PAGE_SIZE - sizeof(struct 
vmpipe_proto_header))
   ^
>> net/vmw_vsock/hyperv_transport.c:65:10: note: in expansion of macro 
>> 'HVS_SEND_BUF_SIZE'
 u8 data[HVS_SEND_BUF_SIZE];
 ^
   In file included from include/linux/list.h:9:0,
from include/linux/module.h:9,
from net/vmw_vsock/hyperv_transport.c:11:
   net/vmw_vsock/hyperv_transport.c: In function 'hvs_open_connection':
>> include/linux/kernel.h:845:2: error: first argument to 
>> '__builtin_choose_expr' not a constant
 __builtin_choose_expr(__safe_cmp(x, y), \
 ^
   include/linux/kernel.h:921:27: note: in expansion of macro '__careful_cmp'
#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >)
  ^
>> net/vmw_vsock/hyperv_transport.c:390:12: note: in expansion of macro 'max_t'
  sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE);
   ^
>> include/linux/kernel.h:845:2: error: first argument to 
>> '__builtin_choose_expr' not a constant
 __builtin_choose_expr(__safe_cmp(x, y), \
 ^
   include/linux/kernel.h:913:27: note: in expansion of macro '__careful_cmp'
#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <)
  ^
>> net/vmw_vsock/hyperv_transport.c:391:12: note: in expansion of macro 'min_t'
  sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE);
   ^
>> include/linux/kernel.h:845:2: error: first argument to 
>> '__builtin_choose_expr' not a constant
 __builtin_choose_expr(__safe_cmp(x, y), \
 ^
   include/linux/kernel.h:921:27: note: in expansion of macro '__careful_cmp'
#define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >)
  ^
   net/vmw_vsock/hyperv_transport.c:393:12: note: in expansion of macro 'max_t'
  rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE);
   ^
>> include/linux/kernel.h:845:2: error: first argument to 
>> '__builtin_choose_expr' not a constant
 __builtin_choose_expr(__safe_cmp(x, y), \
 ^
   include/linux/kernel.h:913:27: note: in expansion of macro '__careful_cmp'
#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <)
  ^
   net/vmw_vsock/hyperv_transport.c:394:12: note: in expansion of macro 'min_t'
  rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE);
   ^
   net/vmw_vsock/hyperv_transport.c: In function 'hvs_stream_enqueue':
>> include/linux/kernel.h:845:2: error: first argument to 
>> '__builtin_choose_expr' not a constant
 __builtin_choose_expr(__safe_cmp(x, y), \
 ^
   include/linux/kernel.h:913:27: note: in expansion of macro '__careful_cmp'
#define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <)
  ^
   net/vmw_vsock/hyperv_transport.c:681:14: note: in expansion of macro 'min_t'
  to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE);
 ^

vim +58 net/vmw_vsock/hyperv_transport.c

---
0-DAY kernel test infrastructureOpen Source Technology Center
https://lists.01.org/pipermail/kbuild-all   Intel Corporation


.config.gz
Description: application/gzip


linux-next: Fixes tag needs some work in the usb tree

2019-07-26 Thread Stephen Rothwell
Hi all,

In commit

  6269e4c76eac ("usb: host: xhci-hub: fix extra endianness conversion")

Fixes tag

  Fixes: 395f540 "xhci: support new USB 3.1 hub request to get extended port 
status"

has these problem(s):

  - SHA1 should be at least 12 digits long
Can be fixed by setting core.abbrev to 12 (or more) or (for git v2.11
or later) just making sure it is not set (or set to "auto").

-- 
Cheers,
Stephen Rothwell


pgpocCJcVz2AL.pgp
Description: OpenPGP digital signature


[PATCH] clocksource/drivers: hyperv_timer: Fix CPU offlining by unbinding the timer

2019-07-26 Thread Dexuan Cui
The commit fd1fea6834d0 says "No behavior is changed", but actually it
removes the clockevents_unbind_device() call from hv_synic_cleanup().

In the discussion earlier this month, I thought the unbind call is
unnecessary (see https://www.spinics.net/lists/arm-kernel/msg739888.html),
however, after more investigation, when a VM runs on Hyper-V, it turns out
the unbind call must be kept, otherwise CPU offlining may not work, because
a per-cpu timer device is still needed, after hv_synic_cleanup() disables
the per-cpu Hyper-V timer device.

The issue is found in the hibernation test. These are the details:

1. CPU0 hangs in wait_for_ap_thread(), when trying to offline CPU1:

hibernation_snapshot
  create_image
suspend_disable_secondary_cpus
  freeze_secondary_cpus
_cpu_down(1, 1, CPUHP_OFFLINE)
  cpuhp_kick_ap_work
cpuhp_kick_ap
  __cpuhp_kick_ap
wait_for_ap_thread()

2. CPU0 hangs because CPU1 hangs this way: after CPU1 disables the per-cpu
Hyper-V timer device in hv_synic_cleanup(), CPU1 sets a timer... Please
read on to see how this can happen.

2.1 By "_cpu_down(1, 1, CPUHP_OFFLINE):", CPU0 first tries to move CPU1 to
the CPUHP_TEARDOWN_CPU state and this wakes up the cpuhp/1 thread on CPU1;
the thread is basically a loop of executing various callbacks defined in
the global array cpuhp_hp_states[]: see smpboot_thread_fn().

2.2 This is how a callback is called on CPU1:
  smpboot_thread_fn
ht->thread_fn(td->cpu), i.e. cpuhp_thread_fun
  cpuhp_invoke_callback
state = st->state
st->state--
cpuhp_get_step(state)->teardown.single()

2.3 At first, the state of CPU1 is CPUHP_ONLINE, which defines a
.teardown.single of NULL, so the execution of the code returns to the loop
in smpboot_thread_fn(), and then reruns cpuhp_invoke_callback() with a
smaller st->state.

2.4 The .teardown.single of every state between CPUHP_ONLINE and
CPUHP_TEARDOWN_CPU runs one by one.

2.5 When it comes to the CPUHP_AP_ONLINE_DYN range, hv_synic_cleanup()
runs: see vmbus_bus_init(). It calls hv_stimer_cleanup() ->
hv_ce_shutdown() to disable the per-cpu timer device, so timer interrupt
will no longer happen on CPU1.

2.6 Later, the .teardown.single of CPUHP_AP_SMPBOOT_THREADS, i.e.
smpboot_park_threads(), starts to run, trying to park all the other
hotplug_threads, e.g. ksoftirqd/1 and rcuc/1; here a timer can be set up
this way and the timer will never be fired since CPU1 doesn't have
an active timer device now, so CPU1 hangs and can not be offlined:
  smpboot_park_threads
smpboot_park_thread
  kthread_park
wait_task_inactive
  schedule_hrtimeout(, HRTIMER_MODE_REL)

With this patch, when the per-cpu Hyper-V timer device is disabled, the
system switches to the Local APIC timer, and the hang issue can not
happen.

Fixes: fd1fea6834d0 ("clocksource/drivers: Make Hyper-V clocksource ISA 
agnostic")
Signed-off-by: Dexuan Cui 
---
 drivers/clocksource/hyperv_timer.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/drivers/clocksource/hyperv_timer.c 
b/drivers/clocksource/hyperv_timer.c
index 41c31a7ac0e4..8f3422c66cbb 100644
--- a/drivers/clocksource/hyperv_timer.c
+++ b/drivers/clocksource/hyperv_timer.c
@@ -139,6 +139,7 @@ void hv_stimer_cleanup(unsigned int cpu)
/* Turn off clockevent device */
if (ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE) {
ce = per_cpu_ptr(hv_clock_event, cpu);
+   clockevents_unbind_device(ce, cpu);
hv_ce_shutdown(ce);
}
 }
-- 
2.19.1



linux-next: Signed-off-by missing for commit in the crypto tree

2019-07-26 Thread Stephen Rothwell
Hi all,

Commit

  53a5d5192803 ("crypto: ccp - Log an error message when ccp-crypto fails to 
load")

is missing a Signed-off-by from its author.

-- 
Cheers,
Stephen Rothwell


pgp8CGxhoQd5K.pgp
Description: OpenPGP digital signature


Re: [PATCH 4.19 00/50] 4.19.62-stable review

2019-07-26 Thread Naresh Kamboju
On Fri, 26 Jul 2019 at 21:03, Greg Kroah-Hartman
 wrote:
>
> This is the start of the stable review cycle for the 4.19.62 release.
> There are 50 patches in this series, all will be posted as a response
> to this one.  If anyone has any issues with these being applied, please
> let me know.
>
> Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC.
> Anything received after that time might be too late.
>
> The whole patch series can be found in one patch at:
> 
> https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.62-rc1.gz
> or in the git tree and branch at:
> 
> git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
> linux-4.19.y
> and the diffstat can be found below.
>
> thanks,
>
> greg k-h

Results from Linaro’s test farm.
No regressions on arm64, arm, x86_64, and i386.

Summary


kernel: 4.19.62-rc1
git repo: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
git branch: linux-4.19.y
git commit: 213a5f3ac1f5e2af0e25fd4b26497590ec290be0
git describe: v4.19.61-51-g213a5f3ac1f5
Test details: 
https://qa-reports.linaro.org/lkft/linux-stable-rc-4.19-oe/build/v4.19.61-51-g213a5f3ac1f5


No regressions (compared to build v4.19.61)

No fixes (compared to build v4.19.61)

Ran 23490 total tests in the following environments and test suites.

Environments
--
- dragonboard-410c - arm64
- hi6220-hikey - arm64
- i386
- juno-r2 - arm64
- qemu_arm
- qemu_arm64
- qemu_i386
- qemu_x86_64
- x15 - arm
- x86_64

Test Suites
---
* build
* install-android-platform-tools-r2600
* kselftest
* libgpiod
* libhugetlbfs
* ltp-cap_bounds-tests
* ltp-commands-tests
* ltp-containers-tests
* ltp-cpuhotplug-tests
* ltp-cve-tests
* ltp-dio-tests
* ltp-fcntl-locktests-tests
* ltp-filecaps-tests
* ltp-fs-tests
* ltp-fs_bind-tests
* ltp-fs_perms_simple-tests
* ltp-fsx-tests
* ltp-hugetlb-tests
* ltp-io-tests
* ltp-ipc-tests
* ltp-math-tests
* ltp-mm-tests
* ltp-nptl-tests
* ltp-pty-tests
* ltp-sched-tests
* ltp-securebits-tests
* ltp-syscalls-tests
* ltp-timers-tests
* network-basic-tests
* perf
* spectre-meltdown-checker-test
* v4l2-compliance
* ltp-open-posix-tests
* kvm-unit-tests
* kselftest-vsyscall-mode-native
* kselftest-vsyscall-mode-none

-- 
Linaro LKFT
https://lkft.linaro.org


Re: mmotm 2019-07-24-21-39 uploaded (mm/memcontrol)

2019-07-26 Thread Nathan Chancellor
On Fri, Jul 26, 2019 at 09:19:52PM -0700, Andrew Morton wrote:
> On Fri, 26 Jul 2019 20:42:05 -0700 Nathan Chancellor 
>  wrote:
> 
> > > @@ -2414,8 +2414,9 @@ void mem_cgroup_handle_over_high(void)
> > >*/
> > >   clamped_high = max(high, 1UL);
> > >  
> > > - overage = ((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT)
> > > - / clamped_high;
> > > + overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT;
> > > + do_div(overage, clamped_high);
> > > +
> > >   penalty_jiffies = ((u64)overage * overage * HZ)
> > >   >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
> > >  
> > > _
> > > 
> > 
> > This causes a build error on arm:
> > 
> 
> Ah.
> 
> It's rather unclear why that u64 cast is there anyway.  We're dealing
> with ulongs all over this code.  The below will suffice.

I was thinking the same thing.

> Chris, please take a look?
> 
> --- 
> a/mm/memcontrol.c~mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix-fix-fix
> +++ a/mm/memcontrol.c
> @@ -2415,7 +2415,7 @@ void mem_cgroup_handle_over_high(void)
>   clamped_high = max(high, 1UL);
>  
>   overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT;
> - do_div(overage, clamped_high);
> + overage /= clamped_high;
>  
>   penalty_jiffies = ((u64)overage * overage * HZ)
>   >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
> _
> 

I assume this will get folded in with the original patch but for
completeness (multi_v7_defconfig + CONFIG_MEMCG):

Tested-by: Nathan Chancellor 

Thanks for the quick fix!


Re: [PATCH 5.1 00/62] 5.1.21-stable review

2019-07-26 Thread kernelci.org bot
stable-rc/linux-5.1.y boot: 127 boots: 2 failed, 81 passed with 44 offline 
(v5.1.20-63-gf878628d8f1e)

Full Boot Summary: 
https://kernelci.org/boot/all/job/stable-rc/branch/linux-5.1.y/kernel/v5.1.20-63-gf878628d8f1e/
Full Build Summary: 
https://kernelci.org/build/stable-rc/branch/linux-5.1.y/kernel/v5.1.20-63-gf878628d8f1e/

Tree: stable-rc
Branch: linux-5.1.y
Git Describe: v5.1.20-63-gf878628d8f1e
Git Commit: f878628d8f1efc883e9bd6f9f81173194b4a01dd
Git URL: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Tested: 74 unique boards, 27 SoC families, 17 builds out of 209

Boot Failures Detected:

arm64:
defconfig:
gcc-8:
meson-g12a-x96-max: 1 failed lab

arm:
multi_v7_defconfig:
gcc-8:
bcm4708-smartrg-sr400ac: 1 failed lab

Offline Platforms:

arm64:

defconfig:
gcc-8
meson-axg-s400: 1 offline lab
meson-g12a-u200: 1 offline lab
meson-g12a-x96-max: 1 offline lab
meson-gxbb-odroidc2: 1 offline lab
meson-gxl-s905d-p230: 1 offline lab
meson-gxl-s905x-libretech-cc: 1 offline lab
meson-gxl-s905x-nexbox-a95x: 1 offline lab
meson-gxl-s905x-p212: 1 offline lab
meson-gxm-nexbox-a1: 1 offline lab
rk3399-firefly: 1 offline lab
sun50i-a64-pine64-plus: 1 offline lab

mips:

pistachio_defconfig:
gcc-8
pistachio_marduk: 1 offline lab

arm:

exynos_defconfig:
gcc-8
exynos5250-arndale: 1 offline lab
exynos5420-arndale-octa: 1 offline lab
exynos5800-peach-pi: 1 offline lab

multi_v7_defconfig:
gcc-8
bcm72521-bcm97252sffe: 1 offline lab
bcm7445-bcm97445c: 1 offline lab
exynos5250-arndale: 1 offline lab
exynos5420-arndale-octa: 1 offline lab
exynos5800-peach-pi: 1 offline lab
imx6dl-wandboard_dual: 1 offline lab
imx6dl-wandboard_solo: 1 offline lab
imx6q-wandboard: 1 offline lab
imx7s-warp: 1 offline lab
meson8b-odroidc1: 1 offline lab
omap3-beagle: 1 offline lab
omap4-panda: 1 offline lab
qcom-apq8064-ifc6410: 1 offline lab
stih410-b2120: 1 offline lab
sun4i-a10-cubieboard: 1 offline lab
sun7i-a20-bananapi: 1 offline lab
vf610-colibri-eval-v3: 1 offline lab

omap2plus_defconfig:
gcc-8
omap3-beagle: 1 offline lab
omap4-panda: 1 offline lab

qcom_defconfig:
gcc-8
qcom-apq8064-ifc6410: 1 offline lab

davinci_all_defconfig:
gcc-8
da850-evm: 1 offline lab
dm365evm,legacy: 1 offline lab

imx_v6_v7_defconfig:
gcc-8
imx6dl-wandboard_dual: 1 offline lab
imx6dl-wandboard_solo: 1 offline lab
imx6q-wandboard: 1 offline lab
imx7s-warp: 1 offline lab
vf610-colibri-eval-v3: 1 offline lab

sunxi_defconfig:
gcc-8
sun4i-a10-cubieboard: 1 offline lab
sun7i-a20-bananapi: 1 offline lab

---
For more info write to 


Re: mmotm 2019-07-24-21-39 uploaded (mm/memcontrol)

2019-07-26 Thread Andrew Morton
On Fri, 26 Jul 2019 20:42:05 -0700 Nathan Chancellor  
wrote:

> > @@ -2414,8 +2414,9 @@ void mem_cgroup_handle_over_high(void)
> >  */
> > clamped_high = max(high, 1UL);
> >  
> > -   overage = ((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT)
> > -   / clamped_high;
> > +   overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT;
> > +   do_div(overage, clamped_high);
> > +
> > penalty_jiffies = ((u64)overage * overage * HZ)
> > >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
> >  
> > _
> > 
> 
> This causes a build error on arm:
> 

Ah.

It's rather unclear why that u64 cast is there anyway.  We're dealing
with ulongs all over this code.  The below will suffice.

Chris, please take a look?

--- 
a/mm/memcontrol.c~mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix-fix-fix
+++ a/mm/memcontrol.c
@@ -2415,7 +2415,7 @@ void mem_cgroup_handle_over_high(void)
clamped_high = max(high, 1UL);
 
overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT;
-   do_div(overage, clamped_high);
+   overage /= clamped_high;
 
penalty_jiffies = ((u64)overage * overage * HZ)
>> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
_



Re: mmotm 2019-07-24-21-39 uploaded (mm/memcontrol)

2019-07-26 Thread Nathan Chancellor
On Thu, Jul 25, 2019 at 04:39:59PM -0700, Andrew Morton wrote:
> On Thu, 25 Jul 2019 15:02:59 -0700 Randy Dunlap  wrote:
> 
> > On 7/24/19 9:40 PM, a...@linux-foundation.org wrote:
> > > The mm-of-the-moment snapshot 2019-07-24-21-39 has been uploaded to
> > > 
> > >http://www.ozlabs.org/~akpm/mmotm/
> > > 
> > > mmotm-readme.txt says
> > > 
> > > README for mm-of-the-moment:
> > > 
> > > http://www.ozlabs.org/~akpm/mmotm/
> > > 
> > > This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
> > > more than once a week.
> > > 
> > > You will need quilt to apply these patches to the latest Linus release 
> > > (5.x
> > > or 5.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
> > > http://ozlabs.org/~akpm/mmotm/series
> > > 
> > 
> > on i386:
> > 
> > ld: mm/memcontrol.o: in function `mem_cgroup_handle_over_high':
> > memcontrol.c:(.text+0x6235): undefined reference to `__udivdi3'
> 
> Thanks.  This?
> 
> --- 
> a/mm/memcontrol.c~mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix-fix
> +++ a/mm/memcontrol.c
> @@ -2414,8 +2414,9 @@ void mem_cgroup_handle_over_high(void)
>*/
>   clamped_high = max(high, 1UL);
>  
> - overage = ((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT)
> - / clamped_high;
> + overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT;
> + do_div(overage, clamped_high);
> +
>   penalty_jiffies = ((u64)overage * overage * HZ)
>   >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
>  
> _
> 

This causes a build error on arm:


In file included from ../arch/arm/include/asm/div64.h:127,
 from ../include/linux/kernel.h:18,
 from ../include/linux/page_counter.h:6,
 from ../mm/memcontrol.c:25:
../mm/memcontrol.c: In function 'mem_cgroup_handle_over_high':
../include/asm-generic/div64.h:222:28: warning: comparison of distinct pointer 
types lacks a cast
  222 |  (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \
  |^~
../mm/memcontrol.c:2423:2: note: in expansion of macro 'do_div'
 2423 |  do_div(overage, clamped_high);
  |  ^~
In file included from ../arch/arm/include/asm/atomic.h:11,
 from ../include/linux/atomic.h:7,
 from ../include/linux/page_counter.h:5,
 from ../mm/memcontrol.c:25:
../include/asm-generic/div64.h:235:25: warning: right shift count >= width of 
type [-Wshift-count-overflow]
  235 |  } else if (likely(((n) >> 32) == 0)) {  \
  | ^~
../include/linux/compiler.h:77:40: note: in definition of macro 'likely'
   77 | # define likely(x) __builtin_expect(!!(x), 1)
  |^
../mm/memcontrol.c:2423:2: note: in expansion of macro 'do_div'
 2423 |  do_div(overage, clamped_high);
  |  ^~
In file included from ../arch/arm/include/asm/div64.h:127,
 from ../include/linux/kernel.h:18,
 from ../include/linux/page_counter.h:6,
 from ../mm/memcontrol.c:25:
../include/asm-generic/div64.h:239:22: error: passing argument 1 of 
'__div64_32' from incompatible pointer type [-Werror=incompatible-pointer-types]
  239 |   __rem = __div64_32(&(n), __base); \
  |  ^~~~
  |  |
  |  long unsigned int *
../mm/memcontrol.c:2423:2: note: in expansion of macro 'do_div'
 2423 |  do_div(overage, clamped_high);
  |  ^~
In file included from ../include/linux/kernel.h:18,
 from ../include/linux/page_counter.h:6,
 from ../mm/memcontrol.c:25:
../arch/arm/include/asm/div64.h:33:45: note: expected 'uint64_t *' {aka 'long 
long unsigned int *'} but argument is of type 'long unsigned int *'
   33 | static inline uint32_t __div64_32(uint64_t *n, uint32_t base)
  |   ~~^
cc1: some warnings being treated as errors
make[3]: *** [../scripts/Makefile.build:274: mm/memcontrol.o] Error 1
make[2]: *** [../Makefile:1768: mm/memcontrol.o] Error 2
make[1]: *** [/home/nathan/cbl/linux-next/Makefile:330: __build_one_by_one] 
Error 2
make: *** [Makefile:179: sub-make] Error 2


I fixed it up like so but no idea if that is the ideal function to use.


diff --git a/mm/memcontrol.c b/mm/memcontrol.c
index 5c7b9facb0eb..04b621f1cb6b 100644
--- a/mm/memcontrol.c
+++ b/mm/memcontrol.c
@@ -2419,8 +2419,8 @@ void mem_cgroup_handle_over_high(void)
 */
clamped_high = max(high, 1UL);
 
-   overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT;
-   do_div(overage, clamped_high);
+   overage = div64_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT,
+   clamped_high);
 
penalty_jiffies = ((u64)overage * overage * HZ)
>> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);


[PATCH v7] driver core: Fix use-after-free and double free on glue directory

2019-07-26 Thread Muchun Song
There is a race condition between removing glue directory and adding a new
device under the glue dir. It can be reproduced in following test:

CPU1: CPU2:

device_add()
  get_device_parent()
class_dir_create_and_add()
  kobject_add_internal()
create_dir()// create glue_dir

  device_add()
get_device_parent()
  kobject_get() // get glue_dir

device_del()
  cleanup_glue_dir()
kobject_del(glue_dir)

kobject_add()
  kobject_add_internal()
create_dir() // in glue_dir
  sysfs_create_dir_ns()
kernfs_create_dir_ns(sd)

  sysfs_remove_dir() // glue_dir->sd=NULL
  sysfs_put()// free glue_dir->sd

  // sd is freed
  kernfs_new_node(sd)
kernfs_get(glue_dir)
kernfs_add_one()
kernfs_put()

Before CPU1 removes the last child device under the glue dir, if CPU2 adds
a new device under the glue dir, the glue_dir kobject reference count will
be increased to 2 via kobject_get() in get_device_parent(). At this point
CPU2 has called kernfs_create_dir_ns(), but has not yet called
kernfs_new_node(). Meanwhile, CPU1 calls sysfs_remove_dir() and sysfs_put().
This results in glue_dir->sd being freed and its reference count dropping
to 0. Then CPU2's call to kernfs_get(glue_dir) will trigger a warning in
kernfs_get() and increase its reference count to 1. Because glue_dir->sd
was freed by CPU1, the subsequent call to kernfs_add_one() by CPU2 will
fail (this is also a use-after-free) and call kernfs_put() to decrease the
reference count. Because the reference count is decremented to 0, it will
also call kmem_cache_free() to free the glue_dir->sd again. This results
in a double free.

In order to avoid this happening, we also should make sure that kernfs_node
for glue_dir is released in CPU1 only when refcount for glue_dir kobj is
1 to fix this race.

The following calltrace is captured in kernel 4.14 with the following patch
applied:

commit 726e41097920 ("drivers: core: Remove glue dirs from sysfs earlier")

--
[3.633703] WARNING: CPU: 4 PID: 513 at .../fs/kernfs/dir.c:494
Here is WARN_ON(!atomic_read(>count) in kernfs_get().

[3.633986] Call trace:
[3.633991]  kernfs_create_dir_ns+0xa8/0xb0
[3.633994]  sysfs_create_dir_ns+0x54/0xe8
[3.634001]  kobject_add_internal+0x22c/0x3f0
[3.634005]  kobject_add+0xe4/0x118
[3.634011]  device_add+0x200/0x870
[3.634017]  _request_firmware+0x958/0xc38
[3.634020]  request_firmware_into_buf+0x4c/0x70

[3.634064] kernel BUG at .../mm/slub.c:294!
Here is BUG_ON(object == fp) in set_freepointer().

[3.634346] Call trace:
[3.634351]  kmem_cache_free+0x504/0x6b8
[3.634355]  kernfs_put+0x14c/0x1d8
[3.634359]  kernfs_create_dir_ns+0x88/0xb0
[3.634362]  sysfs_create_dir_ns+0x54/0xe8
[3.634366]  kobject_add_internal+0x22c/0x3f0
[3.634370]  kobject_add+0xe4/0x118
[3.634374]  device_add+0x200/0x870
[3.634378]  _request_firmware+0x958/0xc38
[3.634381]  request_firmware_into_buf+0x4c/0x70
--

Fixes: 726e41097920 ("drivers: core: Remove glue dirs from sysfs earlier")

Signed-off-by: Muchun Song 
Reviewed-by: Mukesh Ojha 
---

Change in v7:
   1. Update commit message.
Change in v6:
   1. Remove hardcoding "1 "
Change in v5:
   1. Revert to the v1 fix.
   2. Add some comment to explain why we need do this in
  cleanup_glue_dir().
Change in v4:
   1. Add some kerneldoc comment.
   2. Remove unlock_if_glue_dir().
   3. Rename get_device_parent_locked_if_glue_dir() to
  get_device_parent_locked.
   4. Update commit message.
Change in v3:
   Add change log.
Change in v2:
   Fix device_move() also.

 drivers/base/core.c | 53 -
 1 file changed, 52 insertions(+), 1 deletion(-)

diff --git a/drivers/base/core.c b/drivers/base/core.c
index 4aeaa0c92bda..edc55160c5f0 100644
--- a/drivers/base/core.c
+++ b/drivers/base/core.c
@@ -1820,12 +1820,63 @@ static inline struct kobject *get_glue_dir(struct 
device *dev)
  */
 static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir)
 {
+   unsigned int ref;
+
/* see if we live in a "glue" 

Re: [PATCH V2 net-next 07/11] net: hns3: adds debug messages to identify eth down cause

2019-07-26 Thread Joe Perches
On Sat, 2019-07-27 at 10:28 +0800, liuyonglong wrote:
> On 2019/7/27 6:18, Joe Perches wrote:
> > On Fri, 2019-07-26 at 22:00 +, Saeed Mahameed wrote:
> > > On Fri, 2019-07-26 at 11:24 +0800, Huazhong Tan wrote:
> > > > From: Yonglong Liu 
> > > > 
> > > > Some times just see the eth interface have been down/up via
> > > > dmesg, but can not know why the eth down. So adds some debug
> > > > messages to identify the cause for this.
> > []
> > > > diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
> > > []
> > > > @@ -459,6 +459,10 @@ static int hns3_nic_net_open(struct net_device
> > > > *netdev)
> > > > h->ae_algo->ops->set_timer_task(priv->ae_handle, true);
> > > >  
> > > > hns3_config_xps(priv);
> > > > +
> > > > +   if (netif_msg_drv(h))
> > > > +   netdev_info(netdev, "net open\n");
> > > > +
> > > 
> > > to make sure this is only intended for debug, and to avoid repetition.
> > > #define hns3_dbg(__dev, format, args...)  \
> > > ({\
> > >   if (netif_msg_drv(h))   \
> > >   netdev_info(h->netdev, format, ##args); \
> > > })
> > 
> > netif_dbg(h, drv, h->netdev, "net open\n")
> > 
> 
> Hi, Saeed && Joe:
> For our cases, maybe netif_info() can be use for HNS3 drivers?
> netif_dbg need to open dynamic debug options additional.

Your code, your choice.

I do think littering dmesg with "net open" style messages
and such may be unnecessary.  KERN_DEBUG seems a more
appropriate log level.




Re: [GIT PULL] SELinux fixes for v5.3 (#1)

2019-07-26 Thread pr-tracker-bot
The pull request you sent on Fri, 26 Jul 2019 18:13:53 -0400:

> git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git 
> tags/selinux-pr-20190726

has been merged into torvalds/linux.git:
https://git.kernel.org/torvalds/c/40233e7c447367ffc615b524187970732848d5e3

Thank you!

-- 
Deet-doot-dot, I am a bot.
https://korg.wiki.kernel.org/userdoc/prtracker


Re: [PATCH v2] .gitignore: Add compilation database file

2019-07-26 Thread Masahiro Yamada
On Wed, Jul 24, 2019 at 9:22 AM Toru Komatsu  wrote:
>
> This file is used by clangd to use language server protocol.
> It can be generated at each compile using scripts/gen_compile_commands.py.
> Therefore it is different depending on the environment and should be
> ignored.
>
> Signed-off-by: Toru Komatsu 
> ---
>  .gitignore | 3 +++
>  1 file changed, 3 insertions(+)

Applied to linux-kbuild/fixes. Thanks.

-- 
Best Regards
Masahiro Yamada


[PATCH] gen_compile_commands: lower the entry count threshold

2019-07-26 Thread Masahiro Yamada
Running gen_compile_commands.py after building with allnoconfig
gave this:

$ ./scripts/gen_compile_commands.py
WARNING: Found 449 entries. Have you compiled the kernel?

Signed-off-by: Masahiro Yamada 
---

 scripts/gen_compile_commands.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/scripts/gen_compile_commands.py b/scripts/gen_compile_commands.py
index 7915823b92a5..c458696ef3a7 100755
--- a/scripts/gen_compile_commands.py
+++ b/scripts/gen_compile_commands.py
@@ -21,9 +21,9 @@ _LINE_PATTERN = r'^cmd_[^ ]*\.o := (.* )([^ ]*\.c)$'
 _VALID_LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL']
 
 # A kernel build generally has over 2000 entries in its compile_commands.json
-# database. If this code finds 500 or fewer, then warn the user that they might
+# database. If this code finds 300 or fewer, then warn the user that they might
 # not have all the .cmd files, and they might need to compile the kernel.
-_LOW_COUNT_THRESHOLD = 500
+_LOW_COUNT_THRESHOLD = 300
 
 
 def parse_arguments():
-- 
2.17.1



Re: [PATCH] ext4: Fix deadlock on page reclaim

2019-07-26 Thread Damien Le Moal
On 2019/07/27 7:55, Theodore Y. Ts'o wrote:
> On Sat, Jul 27, 2019 at 08:44:23AM +1000, Dave Chinner wrote:
>>>
>>> This looks like something that could hit every file systems, so
>>> shouldn't we fix this in common code?  We could also look into
>>> just using memalloc_nofs_save for the page cache allocation path
>>> instead of the per-mapping gfp_mask.
>>
>> I think it has to be the entire IO path - any allocation from the
>> underlying filesystem could recurse into the top level filesystem
>> and then deadlock if the memory reclaim submits IO or blocks on
>> IO completion from the upper filesystem. That's a bloody big hammer
>> for something that is only necessary when there are stacked
>> filesystems like this
> 
> Yeah that's why using memalloc_nofs_save() probably makes the most
> sense, and dm_zoned should use that before it calls into ext4.

Unfortunately, with this particular setup, that will not solve the problem.
dm-zoned submit BIOs to its backend drive in response to XFS activity. The
requests for these BIOs are passed along to the kernel tcmu HBA and end up in
that HBA command ring. The commands themselves are read from the ring and
executed by the tcmu-runner user process which executes them doing
pread()/pwrite() to the ext4 file. The tcmu-runner process being a different
context than the dm-zoned worker thread issuing the BIO,
memalloc_nofs_save/restore() calls in dm-zoned will have no effect.

We tried a simpler setup using loopback mount (XFS used directly in an ext4
file) and running the same workload. We failed to recreate a similar deadlock in
this case, but I am strongly suspecting that it can happen too. It is simply
much harder to hit because the IO path from XFS to ext4 is all in-kernel and
asynchronous, whereas tcmu-runner ZBC handler is a synchronous QD=1 path for IOs
which makes it relatively easy to get inter-dependent writes or read+write
queued back-to-back and create the deadlock.

So back to Dave's point, we may be needing the big-hammer solution in the case
of stacked file systems, while a non-stack setups do not necessarily need it
(that is for the FS to decide). But I do not see how to implement this big
hammer conditionally. How can a file system tell if it is at the top of the
stack (big hammer not needed) or lower than the top level (big hammer needed) ?

One simple hack would be an fcntl() or mount option to tell the FS to use
GFP_NOFS unconditionally, but avoiding the bug would mean making sure that the
applications or system setup is correct. So not so safe.

-- 
Damien Le Moal
Western Digital Research


Re: memory leak in kobject_set_name_vargs (2)

2019-07-26 Thread Qian Cai



> On Jul 26, 2019, at 10:29 PM, Linus Torvalds  
> wrote:
> 
> On Fri, Jul 26, 2019 at 4:26 PM syzbot
>  wrote:
>> 
>> syzbot has bisected this bug to:
>> 
>> commit 0e034f5c4bc408c943f9c4a06244415d75d7108c
>> Author: Linus Torvalds 
>> Date:   Wed May 18 18:51:25 2016 +
>> 
>> iwlwifi: fix mis-merge that breaks the driver
> 
> While this bisection looks more likely than the other syzbot entry
> that bisected to a version change, I don't think it is correct either.
> 
> The bisection ended up doing a lot of "git bisect skip" because of the
> 
>undefined reference to `nf_nat_icmp_reply_translation'
> 
> issue. Also, the memory leak doesn't seem to be entirely reliable:
> when the bisect does 10 runs to verify that some test kernel is bad,
> there are a couple of cases where only one or two of the ten run
> failed.
> 
> Which makes me wonder if one or two of the "everything OK" runs were
> actually buggy, but just happened to have all ten pass…

Real bisection should point to,

8ed633b9baf9e (“Revert "net-sysfs: Fix memory leak in netdev_register_kobject”")

I did encounter those memory leak and comes up with a similar fix in,

6b70fc94afd1 ("net-sysfs: Fix memory leak in netdev_register_kobject”)

but those error handling paths are tricky that seems nobody did much testing 
there, so it will
keep hitting other bugs in upper functions.

Re: WARNING in ovl_real_fdget_meta

2019-07-26 Thread syzbot

Hello,

syzbot tried to test the proposed patch but build/boot failed:

vmalloc)
[6.623186][T1] TCP established hash table entries: 65536 (order: 7,  
524288 bytes, vmalloc)
[6.629001][T1] TCP bind hash table entries: 65536 (order: 10,  
4194304 bytes, vmalloc)
[6.633571][T1] TCP: Hash tables configured (established 65536 bind  
65536)
[6.635510][T1] UDP hash table entries: 4096 (order: 7, 655360  
bytes, vmalloc)
[6.637367][T1] UDP-Lite hash table entries: 4096 (order: 7, 655360  
bytes, vmalloc)

[6.639861][T1] NET: Registered protocol family 1
[6.642372][T1] RPC: Registered named UNIX socket transport module.
[6.643458][T1] RPC: Registered udp transport module.
[6.644319][T1] RPC: Registered tcp transport module.
[6.645199][T1] RPC: Registered tcp NFSv4.1 backchannel transport  
module.

[6.647753][T1] NET: Registered protocol family 44
[6.648732][T1] pci :00:00.0: Limiting direct PCI/PCI transfers
[6.649837][T1] PCI: CLS 0 bytes, default 64
[6.654238][T1] PCI-DMA: Using software bounce buffering for IO  
(SWIOTLB)
[6.655433][T1] software IO TLB: mapped [mem 0xaa80-0xae80]  
(64MB)
[6.660080][T1] RAPL PMU: API unit is 2^-32 Joules, 0 fixed  
counters, 10737418240 ms ovfl timer

[6.663698][T1] kvm: already loaded the other module
[6.664750][T1] clocksource: tsc: mask: 0x  
max_cycles: 0x212735223b2, max_idle_ns: 440795277976 ns

[6.666833][T1] clocksource: Switched to clocksource tsc
[6.667884][T1] mce: Machine check injector initialized
[6.672842][T1] check: Scanning for low memory corruption every 60  
seconds

[6.784695][T1] Initialise system trusted keyrings
[6.786453][T1] workingset: timestamp_bits=40 max_order=21  
bucket_order=0

[6.788062][T1] zbud: loaded
[6.793680][T1] DLM installed
[6.795747][T1] squashfs: version 4.0 (2009/01/31) Phillip Lougher
[6.799822][T1] FS-Cache: Netfs 'nfs' registered for caching
[6.802062][T1] NFS: Registering the id_resolver key type
[6.803162][T1] Key type id_resolver registered
[6.804299][T1] Key type id_legacy registered
[6.805300][T1] nfs4filelayout_init: NFSv4 File Layout Driver  
Registering...
[6.806905][T1] Installing knfsd (copyright (C) 1996  
o...@monad.swb.de).

[6.811461][T1] ntfs: driver 2.1.32 [Flags: R/W].
[6.813297][T1] fuse: init (API version 7.31)
[6.816259][T1] JFS: nTxBlock = 8192, nTxLock = 65536
[6.826202][T1] SGI XFS with ACLs, security attributes, realtime, no  
debug enabled

[6.832172][T1] 9p: Installing v9fs 9p2000 file system support
[6.833515][T1] FS-Cache: Netfs '9p' registered for caching
[6.838070][T1] gfs2: GFS2 installed
[6.841163][T1] FS-Cache: Netfs 'ceph' registered for caching
[6.842969][T1] ceph: loaded (mds proto 32)
[6.850819][T1] NET: Registered protocol family 38
[6.852584][T1] async_tx: api initialized (async)
[6.853585][T1] Key type asymmetric registered
[6.854272][T1] Asymmetric key parser 'x509' registered
[6.855126][T1] Asymmetric key parser 'pkcs8' registered
[6.855903][T1] Key type pkcs7_test registered
[6.856598][T1] Asymmetric key parser 'tpm_parser' registered
[6.857618][T1] Block layer SCSI generic (bsg) driver version 0.4  
loaded (major 246)

[6.859381][T1] io scheduler mq-deadline registered
[6.860444][T1] io scheduler kyber registered
[6.861501][T1] io scheduler bfq registered
[6.866618][T1] input: Power Button as  
/devices/LNXSYSTM:00/LNXPWRBN:00/input/input0

[6.869055][T1] ACPI: Power Button [PWRF]
[6.870629][T1] input: Sleep Button as  
/devices/LNXSYSTM:00/LNXSLPBN:00/input/input1

[6.872202][T1] ACPI: Sleep Button [SLPF]
[6.877520][T1] ioatdma: Intel(R) QuickData Technology Driver 5.00
[6.889497][T1] PCI Interrupt Link [LNKC] enabled at IRQ 11
[6.890599][T1] virtio-pci :00:03.0: virtio_pci: leaving for  
legacy driver

[6.903444][T1] PCI Interrupt Link [LNKD] enabled at IRQ 10
[6.904470][T1] virtio-pci :00:04.0: virtio_pci: leaving for  
legacy driver

[7.39][T1] HDLC line discipline maxframe=4096
[7.223063][T1] N_HDLC line discipline registered.
[7.223876][T1] Serial: 8250/16550 driver, 4 ports, IRQ sharing  
enabled
[7.247483][T1] 00:03: ttyS0 at I/O 0x3f8 (irq = 4, base_baud =  
115200) is a 16550A
[7.273815][T1] 00:04: ttyS1 at I/O 0x2f8 (irq = 3, base_baud =  
115200) is a 16550A
[7.299513][T1] 00:05: ttyS2 at I/O 0x3e8 (irq = 6, base_baud =  
115200) is a 16550A
[7.325004][T1] 00:06: ttyS3 at I/O 0x2e8 (irq = 7, base_baud =  
115200) is a 16550A

[7.335983][T1] Non-volatile memory driver v1.3
[7.337472][

Re: [PATCH 4.19 00/50] 4.19.62-stable review

2019-07-26 Thread shuah

On 7/26/19 9:24 AM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 4.19.62 release.
There are 50 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC.
Anything received after that time might be too late.

The whole patch series can be found in one patch at:

https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.62-rc1.gz
or in the git tree and branch at:

git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
linux-4.19.y
and the diffstat can be found below.

thanks,

greg k-h



Compiled and booted on my test system. No dmesg regressions.

thanks,
-- Shuah



Re: [PATCH 02/10] mm/page_alloc: use unsigned int for "order" in __rmqueue_fallback()

2019-07-26 Thread Pengfei Li
On Fri, Jul 26, 2019 at 5:36 PM Rasmus Villemoes
 wrote:
>
> On 25/07/2019 20.42, Pengfei Li wrote:
> > Because "order" will never be negative in __rmqueue_fallback(),
> > so just make "order" unsigned int.
> > And modify trace_mm_page_alloc_extfrag() accordingly.
> >
>
> > diff --git a/mm/page_alloc.c b/mm/page_alloc.c
> > index 75c18f4fd66a..1432cbcd87cd 100644
> > --- a/mm/page_alloc.c
> > +++ b/mm/page_alloc.c
> > @@ -2631,8 +2631,8 @@ static bool unreserve_highatomic_pageblock(const 
> > struct alloc_context *ac,
> >   * condition simpler.
> >   */
> >  static __always_inline bool
> > -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype,
> > - unsigned int alloc_flags)
> > +__rmqueue_fallback(struct zone *zone, unsigned int order,
> > + int start_migratetype, unsigned int alloc_flags)
> >  {
>
> Please read the last paragraph of the comment above this function, run
> git blame to figure out when that was introduced, and then read the full
> commit description.

Thanks for your comments.

I have read the commit info of commit b002529d2563 ("mm/page_alloc.c:
eliminate unsigned confusion in __rmqueue_fallback").

And I looked at the discussion at https://lkml.org/lkml/2017/6/21/684 in detail.

> Here be dragons. At the very least, this patch is
> wrong in that it makes that comment inaccurate.

I wonder if you noticed the commit 6bb154504f8b ("mm, page_alloc: spread
allocations across zones before introducing fragmentation").

Commit 6bb154504f8b introduces a local variable min_order in
__rmqueue_fallback().

And you can see

for (current_order = MAX_ORDER - 1; current_order >= min_order;
--current_order) {

The “current_order” and "min_order"  are int, so here is ok.

Since __rmqueue_fallback() is only called by __rmqueue() and "order" is unsigned
int in __rmqueue(), then I think that making "order" is also unsigned
int is good.

Maybe I should also modify the comments here?

>
> Rasmus

Thank you again for your review.

--
Pengfei


Re: [PATCH 5.1 00/62] 5.1.21-stable review

2019-07-26 Thread shuah

On 7/26/19 9:24 AM, Greg Kroah-Hartman wrote:

Note, this will be the LAST 5.1.y kernel release.  Everyone should move
to the 5.2.y series at this point in time.

This is the start of the stable review cycle for the 5.1.21 release.
There are 62 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC.
Anything received after that time might be too late.

The whole patch series can be found in one patch at:

https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.1.21-rc1.gz
or in the git tree and branch at:

git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
linux-5.1.y
and the diffstat can be found below.

thanks,

greg k-h



Compiled and booted on my test system. No dmesg regressions.

thanks,
-- Shuah



Re: [PATCH 5.2 00/66] 5.2.4-stable review

2019-07-26 Thread shuah

On 7/26/19 9:23 AM, Greg Kroah-Hartman wrote:

This is the start of the stable review cycle for the 5.2.4 release.
There are 66 patches in this series, all will be posted as a response
to this one.  If anyone has any issues with these being applied, please
let me know.

Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC.
Anything received after that time might be too late.

The whole patch series can be found in one patch at:

https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.4-rc1.gz
or in the git tree and branch at:

git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git 
linux-5.2.y
and the diffstat can be found below.

thanks,

greg k-h



Compiled and booted on my test system. No dmesg regressions,

thanks,
-- Shuah



Re: memory leak in kobject_set_name_vargs (2)

2019-07-26 Thread Linus Torvalds
On Fri, Jul 26, 2019 at 4:26 PM syzbot
 wrote:
>
> syzbot has bisected this bug to:
>
> commit 0e034f5c4bc408c943f9c4a06244415d75d7108c
> Author: Linus Torvalds 
> Date:   Wed May 18 18:51:25 2016 +
>
>  iwlwifi: fix mis-merge that breaks the driver

While this bisection looks more likely than the other syzbot entry
that bisected to a version change, I don't think it is correct either.

The bisection ended up doing a lot of "git bisect skip" because of the

undefined reference to `nf_nat_icmp_reply_translation'

issue. Also, the memory leak doesn't seem to be entirely reliable:
when the bisect does 10 runs to verify that some test kernel is bad,
there are a couple of cases where only one or two of the ten run
failed.

Which makes me wonder if one or two of the "everything OK" runs were
actually buggy, but just happened to have all ten pass...

   Linus


Re: Regression in 5.3 for some FS_USERNS_MOUNT (aka user-namespace-mountable) filesystems

2019-07-26 Thread Al Viro
On Fri, Jul 26, 2019 at 07:46:18PM -0500, Eric W. Biederman wrote:

> If someone had bothered to actually look at how I was proposing to clean
> things up before the new mount api we would already have that.  Sigh.
> 
> You should be able to get away with something like this which moves the
> checks earlier and makes things clearer.  My old patch against the pre
> new mount api code.

Check your instances of ->permission(); AFAICS in all cases it's (in
current terms)
return ns_capable(fc->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;

In principle I like killing FS_USERNS_MOUNT flag, but when a method
is always either NULL or exact same function...


Re: [PATCH V2 net-next 07/11] net: hns3: adds debug messages to identify eth down cause

2019-07-26 Thread liuyonglong



On 2019/7/27 6:18, Joe Perches wrote:
> On Fri, 2019-07-26 at 22:00 +, Saeed Mahameed wrote:
>> On Fri, 2019-07-26 at 11:24 +0800, Huazhong Tan wrote:
>>> From: Yonglong Liu 
>>>
>>> Some times just see the eth interface have been down/up via
>>> dmesg, but can not know why the eth down. So adds some debug
>>> messages to identify the cause for this.
> []
>>> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c
>> []
>>> @@ -459,6 +459,10 @@ static int hns3_nic_net_open(struct net_device
>>> *netdev)
>>> h->ae_algo->ops->set_timer_task(priv->ae_handle, true);
>>>  
>>> hns3_config_xps(priv);
>>> +
>>> +   if (netif_msg_drv(h))
>>> +   netdev_info(netdev, "net open\n");
>>> +
>>
>> to make sure this is only intended for debug, and to avoid repetition.
>> #define hns3_dbg(__dev, format, args...) \
>> ({   \
>>  if (netif_msg_drv(h))   \
>>  netdev_info(h->netdev, format, ##args); \
>> })
> 
>   netif_dbg(h, drv, h->netdev, "net open\n")
> 

Hi, Saeed && Joe:
For our cases, maybe netif_info() can be use for HNS3 drivers?
netif_dbg need to open dynamic debug options additional.



Re: Regression in 5.3 for some FS_USERNS_MOUNT (aka user-namespace-mountable) filesystems

2019-07-26 Thread Al Viro
On Sat, Jul 27, 2019 at 12:22:20AM +0100, Al Viro wrote:
> On Fri, Jul 26, 2019 at 03:47:02PM -0700, Linus Torvalds wrote:
> 
> > Of course, then later on, commit 20284ab7427f ("switch mount_capable()
> > to fs_context") drops that argument entirely, and hardcodes the
> > decision to look at fc->global.
> > 
> > But that fc->global decision wasn't there originally, and is incorrect
> > since it breaks existing users.
> > 
> > What gets much more confusing about this is that the two different
> > users then moved around. The sget_userns() case got moved to
> > legacy_get_tree(), and then joined together in vfs_get_tree(), and
> > then split and moved out to do_new_mount() and vfs_fsconfig_locked().
> > 
> > And that "joined together into vfs_get_tree()" must be wrong, because
> > the two cases used two different namespace rules. The sget_userns()
> > case *did* have that "global" flag check, while the sget_fc() did not.
> > 
> > Messy. Al?
> 
> Digging through that mess...  It's my fuckup, and we obviously need to
> restore the old behaviour, but I really hope to manage that with
> checks _not_ in superblock allocator ;-/

It shouldn't have looked at fc->global for those checks.  In any cases.
sget_fc() should indeed have been passing fc->user_ns, not userns.
And as for sget_userns(), by the time of 20284ab7427f
its checks had been moved to legacy_get_tree().  In form of
if (!mount_capable(fc->fs_type, fc->user_ns))
as it bloody well ought to.

So the first mistake (wrong argument passed to mount_capable() by sget_fc()
in 0ce0cf12fc4c) has been completed by 20284ab7427f - that conversion was,
actually, an equivalent transformation (callers of legacy_get_tree() never
have fc->global set, so it's all the same).  However, the bug introduced in
the earlier commit was now spelled out in mount_capable() itself.

IOW, the minimal fix should be as below.  In principle, I'm not against
Eric's "add a method instead of setting FS_USERNS_MOUNT", but note that
in *all* cases the instances of his method end up being equivalent to
return ns_capable(fc->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM;

Anyway, AFAICS the regression fix should be simply this:

Unbreak mount_capable()

In "consolidate the capability checks in sget_{fc,userns}())" the
wrong argument had been passed to mount_capable() by sget_fc().
That mistake had been further obscured later, when switching
mount_capable() to fs_context has moved the calculation of
bogus argument from sget_fc() to mount_capable() itself.  It
should've been fc->user_ns all along.

Screwed-up-by: Al Viro 
Reported-by: Christian Brauner 
Signed-off-by: Al Viro 
---
diff --git a/fs/super.c b/fs/super.c
index 113c58f19425..5960578a4076 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -478,13 +478,10 @@ EXPORT_SYMBOL(generic_shutdown_super);
 
 bool mount_capable(struct fs_context *fc)
 {
-   struct user_namespace *user_ns = fc->global ? &init_user_ns
-   : fc->user_ns;
-
if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT))
return capable(CAP_SYS_ADMIN);
else
-   return ns_capable(user_ns, CAP_SYS_ADMIN);
+   return ns_capable(fc->user_ns, CAP_SYS_ADMIN);
 }
 
 /**



RE: [PATCH 3/4] RISC-V: Support case insensitive ISA string parsing.

2019-07-26 Thread Anup Patel



> -Original Message-
> From: Paul Walmsley 
> Sent: Saturday, July 27, 2019 5:00 AM
> To: Atish Patra 
> Cc: linux-kernel@vger.kernel.org; Alan Kao ;
> Albert Ou ; Allison Randal ;
> Anup Patel ; Daniel Lezcano
> ; Greg Kroah-Hartman
> ; Johan Hovold ; linux-
> ri...@lists.infradead.org; Palmer Dabbelt ; Thomas
> Gleixner 
> Subject: Re: [PATCH 3/4] RISC-V: Support case insensitive ISA string parsing.
> 
> On Fri, 26 Jul 2019, Atish Patra wrote:
> 
> > On 7/26/19 1:47 PM, Paul Walmsley wrote:
> > > On Fri, 26 Jul 2019, Atish Patra wrote:
> > >
> > > > As per riscv specification, ISA naming strings are case
> > > > insensitive. However, currently only lower case strings are parsed
> > > > during cpu procfs.
> > > >
> > > > Support parsing of upper case letters as well.
> > > >
> > > > Signed-off-by: Atish Patra 
> > >
> > > Is there a use case that's driving this, or
> >
> > Currently, we use all lower case isa string in kvmtool. But somebody
> > can have uppercase letters in future as spec allows it.
> >
> >
> > can we just say, "use
> > > lowercase letters" and leave it at that?
> > >
> >
> > In that case, it will not comply with RISC-V spec. Is that okay ?
> 
> I think that section of the specification is mostly concerned with someone
> trying to define "f" as a different extension than "F", or something like 
> that.
> I'm not sure that it imposes any constraint that software must accept both
> upper and lower case ISA strings.
> 
> What gives me pause here is that this winds up impacting DT schema
> validation:
> 
> https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Docu
> mentation/devicetree/bindings/riscv/cpus.yaml#n41

If 'f' and 'F' mean same extension as-per RISC-V spec then software should also
interpret it that way hence this patch.

Regards,
Anup


Re: [PATCH v3 2/3] augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro

2019-07-26 Thread Michel Lespinasse
On Fri, Jul 26, 2019 at 06:44:19PM -0700, Andrew Morton wrote:
> On Mon, 8 Jul 2019 05:24:09 -0700 Michel Lespinasse  wrote:
> 
> > Syncing up with v5.2, I see that there is a new use for augmented
> > rbtrees in mm/vmalloc.c which does not compile after applying my
> > patchset.
> > 
> > It's an easy fix though:
> 
> It still doesn't build.
> 
> lib/rbtree_test.c: In function check_augmented:
> lib/rbtree_test.c:225:35: error: implicit declaration of function 
> augment_recompute [-Werror=implicit-function-declaration]
>WARN_ON_ONCE(node->augmented != augment_recompute(node));

grumpf, sorry about that. I thought I had rbtree_test enabled in my
build, but turned out I only had interval_tree_test :/

I would suggest the following fix, which reintroduces the code to compute
node->augmented as was previously done in augment_recompute():

--- 8< 

After introducing RB_DECLARE_CALLBACKS_MAX, we do not need the
augment_recompute function to recompute node->augmented during
rbtree rebalancing callbacks. However, this function was also
used in check_augmented() to verify that node->augmented was
correctly set, so we need to reintroduce the code for that check.

Signed-off-by: Michel Lespinasse 
---
 lib/rbtree_test.c | 15 ++-
 1 file changed, 14 insertions(+), 1 deletion(-)

diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c
index 1939419ba869..41ae3c7570d3 100644
--- a/lib/rbtree_test.c
+++ b/lib/rbtree_test.c
@@ -222,7 +222,20 @@ static void check_augmented(int nr_nodes)
check(nr_nodes);
for (rb = rb_first(_root); rb; rb = rb_next(rb)) {
struct test_node *node = rb_entry(rb, struct test_node, rb);
-   WARN_ON_ONCE(node->augmented != augment_recompute(node));
+   u32 subtree, max = node->val;
+   if (node->rb.rb_left) {
+   subtree = rb_entry(node->rb.rb_left, struct test_node,
+  rb)->augmented;
+   if (max < subtree)
+   max = subtree;
+   }
+   if (node->rb.rb_right) {
+   subtree = rb_entry(node->rb.rb_right, struct test_node,
+  rb)->augmented;
+   if (max < subtree)
+   max = subtree;
+   }
+   WARN_ON_ONCE(node->augmented != max);
}
 }
 
-- 
Michel "Walken" Lespinasse
A program is never fully debugged until the last user dies.


Re: [PATCH 4.19 00/50] 4.19.62-stable review

2019-07-26 Thread kernelci.org bot
stable-rc/linux-4.19.y boot: 118 boots: 1 failed, 77 passed with 40 offline 
(v4.19.61-51-g213a5f3ac1f5)

Full Boot Summary: 
https://kernelci.org/boot/all/job/stable-rc/branch/linux-4.19.y/kernel/v4.19.61-51-g213a5f3ac1f5/
Full Build Summary: 
https://kernelci.org/build/stable-rc/branch/linux-4.19.y/kernel/v4.19.61-51-g213a5f3ac1f5/

Tree: stable-rc
Branch: linux-4.19.y
Git Describe: v4.19.61-51-g213a5f3ac1f5
Git Commit: 213a5f3ac1f5e2af0e25fd4b26497590ec290be0
Git URL: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Tested: 68 unique boards, 27 SoC families, 17 builds out of 206

Boot Failure Detected:

arc:
hsdk_defconfig:
gcc-8:
hsdk: 1 failed lab

Offline Platforms:

arm64:

defconfig:
gcc-8
meson-axg-s400: 1 offline lab
meson-gxbb-odroidc2: 1 offline lab
meson-gxl-s905d-p230: 1 offline lab
meson-gxl-s905x-libretech-cc: 1 offline lab
meson-gxl-s905x-nexbox-a95x: 1 offline lab
meson-gxl-s905x-p212: 1 offline lab
meson-gxm-nexbox-a1: 1 offline lab
rk3399-firefly: 1 offline lab
sun50i-a64-pine64-plus: 1 offline lab

mips:

pistachio_defconfig:
gcc-8
pistachio_marduk: 1 offline lab

arm:

exynos_defconfig:
gcc-8
exynos5250-arndale: 1 offline lab
exynos5420-arndale-octa: 1 offline lab
exynos5800-peach-pi: 1 offline lab

multi_v7_defconfig:
gcc-8
exynos5250-arndale: 1 offline lab
exynos5420-arndale-octa: 1 offline lab
exynos5800-peach-pi: 1 offline lab
imx6dl-wandboard_dual: 1 offline lab
imx6dl-wandboard_solo: 1 offline lab
imx6q-wandboard: 1 offline lab
imx7s-warp: 1 offline lab
meson8b-odroidc1: 1 offline lab
omap3-beagle: 1 offline lab
omap4-panda: 1 offline lab
qcom-apq8064-ifc6410: 1 offline lab
stih410-b2120: 1 offline lab
sun4i-a10-cubieboard: 1 offline lab
sun7i-a20-bananapi: 1 offline lab
vf610-colibri-eval-v3: 1 offline lab

omap2plus_defconfig:
gcc-8
omap3-beagle: 1 offline lab
omap4-panda: 1 offline lab

qcom_defconfig:
gcc-8
qcom-apq8064-ifc6410: 1 offline lab

davinci_all_defconfig:
gcc-8
da850-evm: 1 offline lab
dm365evm,legacy: 1 offline lab

imx_v6_v7_defconfig:
gcc-8
imx6dl-wandboard_dual: 1 offline lab
imx6dl-wandboard_solo: 1 offline lab
imx6q-wandboard: 1 offline lab
imx7s-warp: 1 offline lab
vf610-colibri-eval-v3: 1 offline lab

sunxi_defconfig:
gcc-8
sun4i-a10-cubieboard: 1 offline lab
sun7i-a20-bananapi: 1 offline lab

---
For more info write to 


Re: [PATCH 5.2 00/66] 5.2.4-stable review

2019-07-26 Thread kernelci.org bot
stable-rc/linux-5.2.y boot: 129 boots: 1 failed, 83 passed with 45 offline 
(v5.2.3-67-gd61e440a1852)

Full Boot Summary: 
https://kernelci.org/boot/all/job/stable-rc/branch/linux-5.2.y/kernel/v5.2.3-67-gd61e440a1852/
Full Build Summary: 
https://kernelci.org/build/stable-rc/branch/linux-5.2.y/kernel/v5.2.3-67-gd61e440a1852/

Tree: stable-rc
Branch: linux-5.2.y
Git Describe: v5.2.3-67-gd61e440a1852
Git Commit: d61e440a1852a64d8a2d0d358b9582b19157e039
Git URL: 
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git
Tested: 76 unique boards, 28 SoC families, 17 builds out of 209

Boot Failure Detected:

arm:
omap2plus_defconfig:
gcc-8:
omap4-panda: 1 failed lab

Offline Platforms:

riscv:

defconfig:
gcc-8
sifive_fu540: 1 offline lab

arm64:

defconfig:
gcc-8
meson-axg-s400: 1 offline lab
meson-g12a-u200: 1 offline lab
meson-g12a-x96-max: 1 offline lab
meson-gxbb-odroidc2: 1 offline lab
meson-gxl-s905d-p230: 1 offline lab
meson-gxl-s905x-libretech-cc: 1 offline lab
meson-gxl-s905x-nexbox-a95x: 1 offline lab
meson-gxl-s905x-p212: 1 offline lab
meson-gxm-nexbox-a1: 1 offline lab
rk3399-firefly: 1 offline lab
sun50i-a64-pine64-plus: 1 offline lab

mips:

pistachio_defconfig:
gcc-8
pistachio_marduk: 1 offline lab

arm:

exynos_defconfig:
gcc-8
exynos5250-arndale: 1 offline lab
exynos5420-arndale-octa: 1 offline lab
exynos5800-peach-pi: 1 offline lab

multi_v7_defconfig:
gcc-8
bcm72521-bcm97252sffe: 1 offline lab
bcm7445-bcm97445c: 1 offline lab
exynos5250-arndale: 1 offline lab
exynos5420-arndale-octa: 1 offline lab
exynos5800-peach-pi: 1 offline lab
imx6dl-wandboard_dual: 1 offline lab
imx6dl-wandboard_solo: 1 offline lab
imx6q-wandboard: 1 offline lab
imx7s-warp: 1 offline lab
meson8b-odroidc1: 1 offline lab
omap3-beagle: 1 offline lab
omap4-panda: 1 offline lab
qcom-apq8064-ifc6410: 1 offline lab
stih410-b2120: 1 offline lab
sun4i-a10-cubieboard: 1 offline lab
sun7i-a20-bananapi: 1 offline lab
vf610-colibri-eval-v3: 1 offline lab

omap2plus_defconfig:
gcc-8
omap3-beagle: 1 offline lab
omap4-panda: 1 offline lab

qcom_defconfig:
gcc-8
qcom-apq8064-ifc6410: 1 offline lab

davinci_all_defconfig:
gcc-8
da850-evm: 1 offline lab
dm365evm,legacy: 1 offline lab

imx_v6_v7_defconfig:
gcc-8
imx6dl-wandboard_dual: 1 offline lab
imx6dl-wandboard_solo: 1 offline lab
imx6q-wandboard: 1 offline lab
imx7s-warp: 1 offline lab
vf610-colibri-eval-v3: 1 offline lab

sunxi_defconfig:
gcc-8
sun4i-a10-cubieboard: 1 offline lab
sun7i-a20-bananapi: 1 offline lab

---
For more info write to 


mmotm 2019-07-26-19-00 uploaded

2019-07-26 Thread akpm
The mm-of-the-moment snapshot 2019-07-26-19-00 has been uploaded to

   http://www.ozlabs.org/~akpm/mmotm/

mmotm-readme.txt says

README for mm-of-the-moment:

http://www.ozlabs.org/~akpm/mmotm/

This is a snapshot of my -mm patch queue.  Uploaded at random hopefully
more than once a week.

You will need quilt to apply these patches to the latest Linus release (5.x
or 5.x-rcY).  The series file is in broken-out.tar.gz and is duplicated in
http://ozlabs.org/~akpm/mmotm/series

The file broken-out.tar.gz contains two datestamp files: .DATE and
.DATE--mm-dd-hh-mm-ss.  Both contain the string -mm-dd-hh-mm-ss,
followed by the base kernel version against which this patch series is to
be applied.

This tree is partially included in linux-next.  To see which patches are
included in linux-next, consult the `series' file.  Only the patches
within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in
linux-next.


A full copy of the full kernel tree with the linux-next and mmotm patches
already applied is available through git within an hour of the mmotm
release.  Individual mmotm releases are tagged.  The master branch always
points to the latest release, so it's constantly rebasing.

http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/



The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second)
contains daily snapshots of the -mm tree.  It is updated more frequently
than mmotm, and is untested.

A git copy of this tree is available at

http://git.cmpxchg.org/cgit.cgi/linux-mmots.git/

and use of this tree is similar to
http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/, described above.


This mmotm tree contains the following patches against 5.3-rc1:
(patches marked "*" will be included in linux-next)

  origin.patch
* docs-signal-fix-a-kernel-doc-markup.patch
* revert-kmemleak-allow-to-coexist-with-fault-injection.patch
* ocfs2-remove-set-but-not-used-variable-last_hash.patch
* 
mm-vmscan-check-if-mem-cgroup-is-disabled-or-not-before-calling-memcg-slab-shrinker.patch
* 
mm-migrate-fix-reference-check-race-between-__find_get_block-and-migration.patch
* 
mm-compaction-avoid-100%-cpu-usage-during-compaction-when-a-task-is-killed.patch
* kasan-remove-clang-version-check-for-kasan_stack.patch
* ubsan-build-ubsanc-more-conservatively.patch
* page-flags-prioritize-kasan-bits-over-last-cpuid.patch
* page-flags-prioritize-kasan-bits-over-last-cpuid-fix.patch
* coredump-split-pipe-command-whitespace-before-expanding-template.patch
* mm-migrate-initialize-pud_entry-in-migrate_vma.patch
* mm-hotplug-remove-unneeded-return-for-void-function.patch
* cgroup-kselftest-relax-fs_spec-checks.patch
* asm-generic-fix-wtype-limits-compiler-warnings.patch
* asm-generic-fix-wtype-limits-compiler-warnings-fix.patch
* asm-generic-fix-wtype-limits-compiler-warnings-v2.patch
* test_meminit-use-gfp_atomic-in-rcu-critical-section.patch
* proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch
* proc-kpageflags-do-not-use-uninitialized-struct-pages.patch
* mm-document-zone-device-struct-page-field-usage.patch
* mm-hmm-fix-zone_device-anon-page-mapping-reuse.patch
* mm-hmm-fix-bad-subpage-pointer-in-try_to_unmap_one.patch
* mm-hmm-fix-bad-subpage-pointer-in-try_to_unmap_one-v3.patch
* acpi-scan-acquire-device_hotplug_lock-in-acpi_scan_init.patch
* 
mm-mempolicy-make-the-behavior-consistent-when-mpol_mf_move-and-mpol_mf_strict-were-specified.patch
* 
mm-mempolicy-make-the-behavior-consistent-when-mpol_mf_move-and-mpol_mf_strict-were-specified-v4.patch
* mm-mempolicy-handle-vma-with-unmovable-pages-mapped-correctly-in-mbind.patch
* 
mm-mempolicy-handle-vma-with-unmovable-pages-mapped-correctly-in-mbind-v4.patch
* mm-z3foldc-fix-z3fold_destroy_pool-ordering.patch
* mm-z3foldc-fix-z3fold_destroy_pool-race-condition.patch
* kbuild-clean-compressed-initramfs-image.patch
* ocfs2-use-jbd2_inode-dirty-range-scoping.patch
* jbd2-remove-jbd2_journal_inode_add_.patch
* ocfs2-clear-zero-in-unaligned-direct-io.patch
* ocfs2-clear-zero-in-unaligned-direct-io-checkpatch-fixes.patch
* ocfs2-wait-for-recovering-done-after-direct-unlock-request.patch
* ocfs2-checkpoint-appending-truncate-log-transaction-before-flushing.patch
* ramfs-support-o_tmpfile.patch
  mm.patch
* mm-slab-extend-slab-shrink-to-shrink-all-memcg-caches.patch
* mm-slab-move-memcg_cache_params-structure-to-mm-slabh.patch
* memremap-move-from-kernel-to-mm.patch
* mm-page_poison-fix-a-typo-in-a-comment.patch
* mm-rmapc-remove-set-but-not-used-variable-cstart.patch
* mm-introduce-page_size.patch
* mm-introduce-page_shift.patch
* mm-introduce-page_shift-fix.patch
* mm-introduce-compound_nr.patch
* mm-replace-list_move_tail-with-add_page_to_lru_list_tail.patch
* mm-filemap-rewrite-mapping_needs_writeback-in-less-fancy-manner.patch
* mm-throttle-allocators-when-failing-reclaim-over-memoryhigh.patch
* mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix.patch
* mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix-fix.patch
* 

Re: [PATCH] sched/core: Don't use dying mm as active_mm for kernel threads

2019-07-26 Thread Waiman Long
On 7/26/19 7:45 PM, Waiman Long wrote:
> It was found that a dying mm_struct where the owning task has exited can
> stay on as active_mm of kernel threads as long as no other user tasks
> run on those CPUs that use it as active_mm. This prolongs the lifetime
> of dying mm holding up memory and other resources that cannot be freed.
>
> Fix that by forcing the kernel threads to use init_mm as the active_mm
> if the previous active_mm is dying.
>
> Signed-off-by: Waiman Long 
> ---
>  kernel/sched/core.c | 13 +++--
>  mm/init-mm.c|  2 ++
>  2 files changed, 13 insertions(+), 2 deletions(-)


Sorry, I didn't realize that mm->owner depends on CONFIG_MEMCG. I will
need to refresh the patch and send out v2 when I am done testing.

Cheers,
Longman



Re: [PATCH] isdn/gigaset: check endpoint null in gigaset_probe

2019-07-26 Thread Phong Tran

On 7/26/19 9:22 PM, Paul Bolle wrote:

Phong Tran schreef op vr 26-07-2019 om 20:35 [+0700]:

This fixes a potential NULL pointer dereference when using the variable
endpoint.

Reported-by: syzbot+35b1c403a14f5c89e...@syzkaller.appspotmail.com
Tested by syzbot:
https://groups.google.com/d/msg/syzkaller-bugs/wnHG8eRNWEA/Qn2HhjNdBgAJ

Signed-off-by: Phong Tran 
---
  drivers/isdn/gigaset/usb-gigaset.c | 9 +


This is now drivers/staging/isdn/gigaset/usb-gigaset.c.


this patch was created base on branch 
kasan/usb-fuzzer-usb-testing-2019.07.11 [1]

I did not notice about the driver was moved to staging.




  1 file changed, 9 insertions(+)

diff --git a/drivers/isdn/gigaset/usb-gigaset.c 
b/drivers/isdn/gigaset/usb-gigaset.c
index 1b9b43659bdf..2e011f3db59e 100644
--- a/drivers/isdn/gigaset/usb-gigaset.c
+++ b/drivers/isdn/gigaset/usb-gigaset.c
@@ -703,6 +703,10 @@ static int gigaset_probe(struct usb_interface *interface,
usb_set_intfdata(interface, cs);
  
  	endpoint = >endpoint[0].desc;

+if (!endpoint) {
+   dev_err(cs->dev, "Couldn't get control endpoint\n");
+   return -ENODEV;
+   }


When can this happen? Is this one of those bugs that one can only trigger with
a specially crafted (evil) usb device?



Yes, in my understanding, this only happens with random test of syzbot.


buffer_size = le16_to_cpu(endpoint->wMaxPacketSize);
ucs->bulk_out_size = buffer_size;
@@ -722,6 +726,11 @@ static int gigaset_probe(struct usb_interface *interface,
}
  
  	endpoint = >endpoint[1].desc;

+if (!endpoint) {
+   dev_err(cs->dev, "Endpoint not available\n");
+   retval = -ENODEV;
+   goto error;
+   }
  
  	ucs->busy = 0;
  


Please note that I'm very close to getting cut off from the ISDN network, so
the chances of being able to test this on a live system are getting small.



This bug can be invalid now. Do you agree?
There is an instruction to report invalid bug to syzbot [2].


Thanks,


Paul Bolle




[1] 
https://github.com/google/kasan/commits/usb-fuzzer-usb-testing-2019.07.11
[2] 
https://github.com/google/syzkaller/blob/master/docs/syzbot.md#communication-with-syzbot


Thanks,
Phong


Re: [PATCH v3 2/3] augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro

2019-07-26 Thread Andrew Morton
On Mon, 8 Jul 2019 05:24:09 -0700 Michel Lespinasse  wrote:

> Syncing up with v5.2, I see that there is a new use for augmented
> rbtrees in mm/vmalloc.c which does not compile after applying my
> patchset.
> 
> It's an easy fix though:

It still doesn't build.

lib/rbtree_test.c: In function check_augmented:
lib/rbtree_test.c:225:35: error: implicit declaration of function 
augment_recompute [-Werror=implicit-function-declaration]
   WARN_ON_ONCE(node->augmented != augment_recompute(node));

I think I'll just do this:

--- 
a/lib/rbtree_test.c~augmented-rbtree-add-new-rb_declare_callbacks_max-macro-fix-2
+++ a/lib/rbtree_test.c
@@ -220,10 +220,6 @@ static void check_augmented(int nr_nodes
struct rb_node *rb;
 
check(nr_nodes);
-   for (rb = rb_first(_root); rb; rb = rb_next(rb)) {
-   struct test_node *node = rb_entry(rb, struct test_node, rb);
-   WARN_ON_ONCE(node->augmented != augment_recompute(node));
-   }
 }
 
 static int __init rbtree_test_init(void)

although there may be something we can do here to restore the lost
coverage?



Re: [PATCH bpf-next v10 06/10] bpf,landlock: Add a new map type: inode

2019-07-26 Thread Alexei Starovoitov
On Sun, Jul 21, 2019 at 11:31:12PM +0200, Mickaël Salaün wrote:
> FIXME: 64-bits in the doc
> 
> This new map store arbitrary values referenced by inode keys.  The map
> can be updated from user space with file descriptor pointing to inodes
> tied to a file system.  From an eBPF (Landlock) program point of view,
> such a map is read-only and can only be used to retrieve a value tied
> to a given inode.  This is useful to recognize an inode tagged by user
> space, without access right to this inode (i.e. no need to have a write
> access to this inode).
> 
> Add dedicated BPF functions to handle this type of map:
> * bpf_inode_htab_map_update_elem()
> * bpf_inode_htab_map_lookup_elem()
> * bpf_inode_htab_map_delete_elem()
> 
> This new map require a dedicated helper inode_map_lookup_elem() because
> of the key which is a pointer to an opaque data (only provided by the
> kernel).  This act like a (physical or cryptographic) key, which is why
> it is also not allowed to get the next key.
> 
> Signed-off-by: Mickaël Salaün 

there are too many things to comment on.
Let's do this patch.

imo inode_map concept is interesting, but see below...

> +
> + /*
> +  * Limit number of entries in an inode map to the maximum number of
> +  * open files for the current process. The maximum number of file
> +  * references (including all inode maps) for a process is then
> +  * (RLIMIT_NOFILE - 1) * RLIMIT_NOFILE. If the process' RLIMIT_NOFILE
> +  * is 0, then any entry update is forbidden.
> +  *
> +  * An eBPF program can inherit all the inode map FD. The worse case is
> +  * to fill a bunch of arraymaps, create an eBPF program, close the
> +  * inode map FDs, and start again. The maximum number of inode map
> +  * entries can then be close to RLIMIT_NOFILE^3.
> +  */
> + if (attr->max_entries > rlimit(RLIMIT_NOFILE))
> + return -EMFILE;

rlimit is checked, but no fd are consumed.
Once created such inode map_fd can be passed to a different process.
map_fd can be pinned into bpffs.
etc.
what the value of the check?

> +
> + /* decorelate UAPI from kernel API */
> + attr->key_size = sizeof(struct inode *);
> +
> + return htab_map_alloc_check(attr);
> +}
> +
> +static void inode_htab_put_key(void *key)
> +{
> + struct inode **inode = key;
> +
> + if ((*inode)->i_state & I_FREEING)
> + return;

checking the state without taking a lock? isn't it racy?

> + iput(*inode);
> +}
> +
> +/* called from syscall or (never) from eBPF program */
> +static int map_get_next_no_key(struct bpf_map *map, void *key, void 
> *next_key)
> +{
> + /* do not leak a file descriptor */

what is this comment supposed to mean?

> + return -ENOTSUPP;
> +}
> +
> +/* must call iput(inode) after this call */
> +static struct inode *inode_from_fd(int ufd, bool check_access)
> +{
> + struct inode *ret;
> + struct fd f;
> + int deny;
> +
> + f = fdget(ufd);
> + if (unlikely(!f.file))
> + return ERR_PTR(-EBADF);
> + /* TODO?: add this check when called from an eBPF program too (already
> + * checked by the LSM parent hooks anyway) */
> + if (unlikely(IS_PRIVATE(file_inode(f.file {
> + ret = ERR_PTR(-EINVAL);
> + goto put_fd;
> + }
> + /* check if the FD is tied to a mount point */
> + /* TODO?: add this check when called from an eBPF program too */
> + if (unlikely(f.file->f_path.mnt->mnt_flags & MNT_INTERNAL)) {
> + ret = ERR_PTR(-EINVAL);
> + goto put_fd;
> + }

a bunch of TODOs do not inspire confidence.

> + if (check_access) {
> + /*
> + * must be allowed to access attributes from this file to then
> + * be able to compare an inode to its map entry
> + */
> + deny = security_inode_getattr(>f_path);
> + if (deny) {
> + ret = ERR_PTR(deny);
> + goto put_fd;
> + }
> + }
> + ret = file_inode(f.file);
> + ihold(ret);
> +
> +put_fd:
> + fdput(f);
> + return ret;
> +}
> +
> +/*
> + * The key is a FD when called from a syscall, but an inode address when 
> called
> + * from an eBPF program.
> + */
> +
> +/* called from syscall */
> +int bpf_inode_fd_htab_map_lookup_elem(struct bpf_map *map, int *key, void 
> *value)
> +{
> + void *ptr;
> + struct inode *inode;
> + int ret;
> +
> + /* check inode access */
> + inode = inode_from_fd(*key, true);
> + if (IS_ERR(inode))
> + return PTR_ERR(inode);
> +
> + rcu_read_lock();
> + ptr = htab_map_lookup_elem(map, );
> + iput(inode);
> + if (IS_ERR(ptr)) {
> + ret = PTR_ERR(ptr);
> + } else if (!ptr) {
> + ret = -ENOENT;
> + } else {
> + ret = 0;
> + copy_map_value(map, value, ptr);
> + }
> + rcu_read_unlock();
> + return ret;
> +}
> +
> +/* 

[PATCH v3 -next] staging: vc04_services: fix unused-but-set-variable warning

2019-07-26 Thread YueHaibing
Fix gcc used-but-set-variable warning:

drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c: In function 
vchiq_release_internal:
drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:2827:16: warning:
 variable local_entity_uc set but not used [-Wunused-but-set-variable]
drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:2827:6: warning:
 variable local_uc set but not used [-Wunused-but-set-variable]

Remove the unused variables 'local_entity_uc' and 'local_uc'

Reported-by: Hulk Robot 
Signed-off-by: YueHaibing 
Acked-by: Stefan Wahren 
---
v3: fix patch title
v2: remove the unused variable
---
 drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c 
b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
index cc4383d..b1595b1 100644
--- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
+++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c
@@ -2824,7 +2824,6 @@ vchiq_release_internal(struct vchiq_state *state, struct 
vchiq_service *service)
VCHIQ_STATUS_T ret = VCHIQ_SUCCESS;
char entity[16];
int *entity_uc;
-   int local_uc, local_entity_uc;
 
if (!arm_state)
goto out;
@@ -2849,8 +2848,8 @@ vchiq_release_internal(struct vchiq_state *state, struct 
vchiq_service *service)
ret = VCHIQ_ERROR;
goto unlock;
}
-   local_uc = --arm_state->videocore_use_count;
-   local_entity_uc = --(*entity_uc);
+   --arm_state->videocore_use_count;
+   --(*entity_uc);
 
if (!vchiq_videocore_wanted(state)) {
if (vchiq_platform_use_suspend_timer() &&
-- 
2.7.4




[RFC PATCH v3 2/2] printk-rb: add test module

2019-07-26 Thread John Ogness
This module does some heavy write stress testing on the ringbuffer
with a reader that is checking for integrity.

Signed-off-by: John Ogness 
---
 kernel/printk/Makefile   |   2 +
 kernel/printk/test_prb.c | 256 +++
 2 files changed, 258 insertions(+)
 create mode 100644 kernel/printk/test_prb.c

diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 567999aa93af..24365ecee348 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -5,3 +5,5 @@ obj-$(CONFIG_PRINTK)+= ringbuffer.o
 obj-$(CONFIG_PRINTK)   += numlist.o
 obj-$(CONFIG_PRINTK)   += dataring.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
+
+obj-m  += test_prb.o
diff --git a/kernel/printk/test_prb.c b/kernel/printk/test_prb.c
new file mode 100644
index ..1ecb4fcbf823
--- /dev/null
+++ b/kernel/printk/test_prb.c
@@ -0,0 +1,256 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "ringbuffer.h"
+
+/*
+ * This is a test module that starts "num_online_cpus() - 1" writer threads
+ * and 1 reader thread. The writer threads each write strings of varying
+ * length. They do this as fast as they can.
+ *
+ * The reader thread reads as fast as it can and performs sanity checks on
+ * the data.
+ *
+ * Because the threads are running in such tight loops, they will call
+ * schedule() from time to time so the system stays alive.
+ *
+ * If either the writers or the reader encounter an error, the test is
+ * aborted. Test results are recorded to the ftrace buffers, with some
+ * additional information also provided via printk. The test can be aborted
+ * manually by removing the module. (Ideally the test should never abort on
+ * its own.)
+ */
+
+struct rbdata {
+   int len;
+   char text[0];
+};
+
+static char *test_running;
+static int halt_test;
+
+static void dump_rb(struct printk_ringbuffer *rb)
+{
+   DECLARE_PRINTKRB_ENTRY(entry, 160);
+   DECLARE_PRINTKRB_ITER(iter, rb, );
+   unsigned long last_seq = 0;
+   struct rbdata *dat;
+   char buf[160];
+   int len;
+
+   trace_printk("BEGIN full dump\n");
+
+   prb_for_each_entry_continue(, len) {
+   if (entry.seq - last_seq != 1) {
+   trace_printk("LOST %lu\n",
+entry.seq - (last_seq + 1));
+   }
+   last_seq = entry.seq;
+
+   dat = (struct rbdata *)[0];
+
+   snprintf(buf, sizeof(buf), "%s", dat->text);
+   buf[sizeof(buf) - 1] = 0;
+   trace_printk("seq=%lu len=%d textlen=%d dataval=%s\n",
+entry.seq, len, dat->len, buf);
+   }
+
+   trace_printk("END full dump\n");
+}
+
+DECLARE_PRINTKRB(test_rb, 7, 5);
+
+static int prbtest_writer(void *data)
+{
+   unsigned long num = (unsigned long)data;
+   struct prb_reserved_entry e;
+   char id = 'A' + num;
+   struct rbdata *dat;
+   int count = 0;
+   int len;
+
+   pr_err("prbtest: start thread %lu (writer)\n", num);
+
+   for (;;) {
+   len = sizeof(struct rbdata) + (prandom_u32() & 0x7f) + 2;
+
+   dat = (struct rbdata *)prb_reserve(, _rb, len);
+   if (!IS_ERR(dat)) {
+   len -= sizeof(struct rbdata) + 1;
+   memset(>text[0], id, len);
+   dat->text[len] = 0;
+   dat->len = len;
+   prb_commit();
+   } else {
+   WRITE_ONCE(halt_test, 1);
+   trace_printk("writer%lu (%c) reserve failed (%ld)\n",
+num, id, PTR_ERR(dat));
+   }
+
+   if ((count++ & 0x3fff) == 0)
+   schedule();
+
+   if (READ_ONCE(halt_test) == 1)
+   break;
+   }
+
+   pr_err("prbtest: end thread %lu (writer)\n", num);
+
+   test_running[num] = 0;
+
+   return 0;
+}
+
+static int prbtest_reader(void *data)
+{
+   unsigned long num = (unsigned long)data;
+   DECLARE_PRINTKRB_ENTRY(entry, 160);
+   DECLARE_PRINTKRB_ITER(iter, _rb, );
+   unsigned long total_lost = 0;
+   unsigned long last_seq = 0;
+   unsigned long max_lost = 0;
+   unsigned long count = 0;
+   struct rbdata *dat;
+   int did_sched = 1;
+   int len;
+
+   pr_err("prbtest: start thread %lu (reader)\n", num);
+
+   for (;;) {
+   prb_for_each_entry_continue(, len) {
+   if (entry.seq < last_seq) {
+   WRITE_ONCE(halt_test, 1);
+   trace_printk(
+   "reader%lu invalid seq %lu -> %lu\n",
+   num, last_seq, entry.seq);
+   goto out;
+   }
+
+

[RFC PATCH v3 0/2] printk: new ringbuffer implementation

2019-07-26 Thread John Ogness
Hello,

This is a follow-up RFC on the work to re-implement much of
the core of printk. The threads for the previous RFC versions
are here: v1[0], v2[1].

As was planned[2], this is only the first piece: a new
lockless ringbuffer.

Changes from v2:

- Moved all code into kernel/printk/. Let's keep it private
  for now.

- Split the ringbuffer into 3 components:

  * a data ringbuffer (dataring) to manage the raw data and
data descriptors

  * a numbered list (numlist) to manage committed entries and
their sequence numbers

  * the printk_ringbuffer, which is the high-level structure
providing the reader/writer API and glue for the other
structures

  Splitting the components apart helped to document their
  roles and their related memory barriers (and will hopefully
  also simplify the review process).

- Renamed most functions, structures, and variables based on
  v2 feedback.

- Rewrote and reformatted nearly all comments (particularly
  the memory barrier comments) based on v2 feedback.

- Addressed implementation issues with v2:

  * invalid data blocks potentially becoming valid because of
overflows

  * weak associations between data blocks and descriptors

  * excessive freeing of data blocks due to unavailable
descriptors

- Improved error handling and data integrity checks in the test
  module.

For the memory barrier work I wrote a litmus test for nearly
every memory barrier. I did not include these in the series.
Should I? If yes, where should they be placed?

I would like to point out that Petr Mladek posted a
proof-of-concept[3] alternate implementation. I wanted to base my
v3 on his work, but ran into too many problems getting it to
run acceptably. I will address those issues in that thread. This
is why my v3 is based directly on my v2.

John Ogness

[0] https://lkml.kernel.org/r/20190212143003.48446-1-john.ogn...@linutronix.de
[1] https://lkml.kernel.org/r/20190607162349.18199-1-john.ogn...@linutronix.de
[2] https://lkml.kernel.org/r/87y35hn6ih@linutronix.de
[3] https://lkml.kernel.org/r/20190704103321.10022-1-pmla...@suse.com

John Ogness (2):
  printk-rb: add a new printk ringbuffer implementation
  printk-rb: add test module

 kernel/printk/Makefile |   5 +
 kernel/printk/dataring.c   | 761 ++
 kernel/printk/dataring.h   |  95 ++
 kernel/printk/numlist.c| 375 +
 kernel/printk/numlist.h|  72 
 kernel/printk/ringbuffer.c | 800 +
 kernel/printk/ringbuffer.h | 288 
 kernel/printk/test_prb.c   | 256 +++
 8 files changed, 2652 insertions(+)
 create mode 100644 kernel/printk/dataring.c
 create mode 100644 kernel/printk/dataring.h
 create mode 100644 kernel/printk/numlist.c
 create mode 100644 kernel/printk/numlist.h
 create mode 100644 kernel/printk/ringbuffer.c
 create mode 100644 kernel/printk/ringbuffer.h
 create mode 100644 kernel/printk/test_prb.c

-- 
2.11.0



[RFC PATCH v3 1/2] printk-rb: add a new printk ringbuffer implementation

2019-07-26 Thread John Ogness
See documentation for details.

For the real patch the "prb overview" documentation section in
kernel/printk/ringbuffer.c will be included in the commit message.

Signed-off-by: John Ogness 
---
 kernel/printk/Makefile |   3 +
 kernel/printk/dataring.c   | 761 ++
 kernel/printk/dataring.h   |  95 ++
 kernel/printk/numlist.c| 375 +
 kernel/printk/numlist.h|  72 
 kernel/printk/ringbuffer.c | 800 +
 kernel/printk/ringbuffer.h | 288 
 7 files changed, 2394 insertions(+)
 create mode 100644 kernel/printk/dataring.c
 create mode 100644 kernel/printk/dataring.h
 create mode 100644 kernel/printk/numlist.c
 create mode 100644 kernel/printk/numlist.h
 create mode 100644 kernel/printk/ringbuffer.c
 create mode 100644 kernel/printk/ringbuffer.h

diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile
index 4d052fc6bcde..567999aa93af 100644
--- a/kernel/printk/Makefile
+++ b/kernel/printk/Makefile
@@ -1,4 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0-only
 obj-y  = printk.o
 obj-$(CONFIG_PRINTK)   += printk_safe.o
+obj-$(CONFIG_PRINTK)   += ringbuffer.o
+obj-$(CONFIG_PRINTK)   += numlist.o
+obj-$(CONFIG_PRINTK)   += dataring.o
 obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o
diff --git a/kernel/printk/dataring.c b/kernel/printk/dataring.c
new file mode 100644
index ..911bac593ec1
--- /dev/null
+++ b/kernel/printk/dataring.c
@@ -0,0 +1,761 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include "dataring.h"
+
+/**
+ * DOC: dataring overview
+ *
+ * A dataring is a lockless ringbuffer consisting of variable length data
+ * blocks, each of which are assigned an ID. The IDs map to descriptors, which
+ * contain metadata about the data block. The lookup function mapping IDs to
+ * descriptors is implemented by the user.
+ *
+ * Data Blocks
+ * ---
+ * All ringbuffer data is stored within a single static byte array. This is
+ * to ensure that any pointers to the data (past and present) will always
+ * point to valid memory. This is important because the lockless readers
+ * and writers may preempt for long periods of time and when they resume may
+ * be working with expired pointers.
+ *
+ * Data blocks are specified by begin and end logical positions (lpos) that
+ * map directly to byte array offsets. Using logical positions indirectly
+ * provides tagged state references for the data blocks to avoid ABA issues
+ * when the ringbuffer wraps. The number of tagged states per index is::
+ *
+ * sizeof(long) / size of byte array
+ *
+ * If a data block starts near the end of the byte array but would extend
+ * beyond it, that data block is handled differently: a special "wrapping data
+ * block" is inserted in the space available at the end of the byte array and
+ * a "content data block" is placed at the beginning of the byte array. This
+ * can waste space at the end of the byte array, but simplifies the
+ * implementation by allowing writers to always work with contiguous buffers.
+ * For example, for a 1000 byte array, a descriptor may show a start lpos of
+ * 1950 and an end lpos of 2100. The data block associated with this
+ * descriptor is 100 bytes in size. Its ID is located in the "wrapping" data
+ * block (located at offset 950 of the byte array) and its data is found in
+ * the "content" data block (located at offset 0 of the byte array).
+ *
+ * Descriptors
+ * ---
+ * A descriptor is a handle to a data block. How descriptors are structured
+ * and mapped to IDs is implemented by the user.
+ *
+ * Descriptors contain the begin (begin_lpos) and end (next_lpos) logical
+ * positions of the data block they represent. The end logical position
+ * matches the begin logical position of the adjacent data block.
+ *
+ * Why Descriptors?
+ * 
+ * The data ringbuffer supports variable length entities, which means that
+ * data blocks will not always begin at a predictable offset of the byte
+ * array. This is a major problem for lockless writers that, for example, will
+ * compete to expire and reuse old data blocks when the ringbuffer is full.
+ * Without a predictable begin for the data blocks, a writer has no reliable
+ * information about the status of the "free" area. Are any flags or state
+ * variables already set or is it just garbage left over from previous usage?
+ *
+ * Descriptors allow safe and controlled access to data block metadata by
+ * providing predictable offsets for such metadata. This is key to supporting
+ * multiple concurrent lockless writers.
+ *
+ * Behavior
+ * 
+ * The data ringbuffer allows writers to commit data without regard for
+ * readers. Readers must pre- and post-validate the data blocks they are
+ * processing to be sure the processed data is consistent. A function
+ * dataring_datablock_isvalid() is available for that. Readers can only
+ * iterate data blocks by utilizing an external 

Re: [PATCH v2 -next] staging: vc04_services: fix used-but-set-variable warning

2019-07-26 Thread Yuehaibing


On 2019/7/26 23:57, Stefan Wahren wrote:
> Hi Yue,
> 
> Am 26.07.19 um 11:26 schrieb YueHaibing:
>> Fix gcc used-but-set-variable warning:
> 
> just a nit. It is called "unused-but-set-variable"

Oh, yes, thanks!

> 
> Acked-by: Stefan Wahren 
> 
>>
>> drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c: In function 
>> vchiq_release_internal:
>> drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:2827:16: 
>> warning:
>>  variable local_entity_uc set but not used [-Wunused-but-set-variable]
>> drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:2827:6: 
>> warning:
>>  variable local_uc set but not used [-Wunused-but-set-variable]
>>
>> Remove the unused variables 'local_entity_uc' and 'local_uc'
>>
>> Reported-by: Hulk Robot 
>> Signed-off-by: YueHaibing 
>> ---
> 
> .
> 



Re: [patch 0/8] core, x86: Preparatory steps for RT

2019-07-26 Thread Steven Rostedt
On Fri, 26 Jul 2019 23:19:36 +0200
Thomas Gleixner  wrote:

> CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by
> CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same
> functionality which today depends on CONFIG_PREEMPT.
> 
> The following series adjusts the core and x86 code to use
> CONFIG_PREEMPTION where appropriate and extends the x86 dumpstack
> implementation to display PREEMPT_RT instead of PREEMPT on a RT
> enabled kernel.
>

Hmm, I'm looking at v5.3-rc1 and I don't see a CONFIG_PREEMPTION
defined. And the first patch doesn't define it. Did I miss a patch
series that adds it?

-- Steve


Re: [PATCH v3 1/2] dt-bindings: i3c: Document MediaTek I3C master bindings

2019-07-26 Thread Qii Wang
On Wed, 2019-07-24 at 14:21 -0600, Rob Herring wrote:
> On Tue, Jul 09, 2019 at 09:09:21PM +0800, Qii Wang wrote:
> > Document MediaTek I3C master DT bindings.
> > 
> > Signed-off-by: Qii Wang 
> > ---
> >  .../devicetree/bindings/i3c/mtk,i3c-master.txt |   48 
> > 
> >  1 file changed, 48 insertions(+)
> >  create mode 100644 Documentation/devicetree/bindings/i3c/mtk,i3c-master.txt
> > 
> > diff --git a/Documentation/devicetree/bindings/i3c/mtk,i3c-master.txt 
> > b/Documentation/devicetree/bindings/i3c/mtk,i3c-master.txt
> > new file mode 100644
> > index 000..d32eda6
> > --- /dev/null
> > +++ b/Documentation/devicetree/bindings/i3c/mtk,i3c-master.txt
> > @@ -0,0 +1,48 @@
> > +Bindings for MediaTek I3C master block
> > +=
> > +
> > +Required properties:
> > +
> > +- compatible: shall be "mediatek,i3c-master"
> 
> Needs to be SoC specific.
> 

We hope that the SOCs will use the same driver and try to avoid big
changes. If there are inevitable changes in the future, then we will
modify the compatible to be SoC specific. cdns,i3c-master.txt is not SoC
specific either.

> > +- reg: physical base address of the controller and apdma base, length of
> > +  memory mapped region.
> > +- reg-names: shall be "main" for master controller and "dma" for apdma.
> > +- interrupts: the interrupt line connected to this I3C master.
> > +- clocks: shall reference the i3c and apdma clocks.
> > +- clock-names: shall include "main" and "dma".
> > +
> > +Mandatory properties defined by the generic binding (see
> > +Documentation/devicetree/bindings/i3c/i3c.txt for more details):
> > +
> > +- #address-cells: shall be set to 3
> > +- #size-cells: shall be set to 0
> > +
> > +Optional properties defined by the generic binding (see
> > +Documentation/devicetree/bindings/i3c/i3c.txt for more details):
> > +
> > +- i2c-scl-hz
> > +- i3c-scl-hz
> > +
> > +I3C device connected on the bus follow the generic description (see
> > +Documentation/devicetree/bindings/i3c/i3c.txt for more details).
> > +
> > +Example:
> > +
> > +   i3c0: i3c@1100d000 {
> > +   compatible = "mediatek,i3c-master";
> > +   reg = <0x1100d000 0x1000>,
> > + <0x11000300 0x80>;
> > +   reg-names = "main", "dma";
> > +   interrupts = ;
> > +   clocks = < CLK_INFRA_I3C0>,
> > +< CLK_INFRA_AP_DMA>;
> > +   clock-names = "main", "dma";
> > +   #address-cells = <3>;
> > +   #size-cells = <0>;
> > +   i2c-scl-hz = <10>;
> > +
> > +   nunchuk: nunchuk@52 {
> > +   compatible = "nintendo,nunchuk";
> > +   reg = <0x52 0x0 0x10>;
> > +   };
> > +   };
> > -- 
> > 1.7.9.5
> > 




[PATCH] PM / wakeup: Avoid dev_name collisions in wakeup class

2019-07-26 Thread Stephen Boyd
If a device is wakeup capable and the driver calls device_wakeup_init()
on it during probe and then userspace writes 'enabled' to that device's
power/wakeup file in sysfs we'll try to create the same named wakeup
device in sysfs. The kernel will complain about duplicate file names.

sysfs: cannot create duplicate filename '/devices/virtual/wakeup/1-1.1'
kobject_add_internal failed for 1-1.1 with -EEXIST, don't try to register 
things with the same name in the same directory.

It may be advantageous to not write 'enabled' to the wakeup file (see
wakeup_store()) from userspace for these devices because we allocate
devices and register them and then throw them all away later on if the
device driver has already initialized the wakeup attribute. The
implementation currently tries to avoid taking locks here so it seems
best to optimize that path in a separate patch.

Let's rename the wakeup class devices as 'wakeupN' with an IDA that's
simple enough to just return some sort of number. In addition, let's
make the device registering the wakeup the parent and include a 'name'
attribute in case userspace wants to figure out the type of wakeup it is
(in the case of virtual wakeups) or the device associated with the
wakeup. This makes it easier for userspace to go from /sys/class/wakeup
to a place in the device hierarchy where the wakeup is generated from
like an input device.

Cc: Tri Vo 
Cc: Kalesh Singh 
Cc: Greg Kroah-Hartman 
Cc: Ravi Chandra Sadineni 
Signed-off-by: Stephen Boyd 
---
 drivers/acpi/device_pm.c  |  2 +-
 drivers/base/power/wakeup.c   |  8 +---
 drivers/base/power/wakeup_stats.c | 31 ++-
 fs/eventpoll.c|  4 ++--
 include/linux/pm_wakeup.h | 12 
 kernel/power/autosleep.c  |  2 +-
 kernel/power/wakelock.c   |  2 +-
 kernel/time/alarmtimer.c  |  2 +-
 8 files changed, 45 insertions(+), 18 deletions(-)

diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c
index 28cffaaf9d82..0863be1e42d6 100644
--- a/drivers/acpi/device_pm.c
+++ b/drivers/acpi/device_pm.c
@@ -495,7 +495,7 @@ acpi_status acpi_add_pm_notifier(struct acpi_device *adev, 
struct device *dev,
goto out;
 
mutex_lock(_pm_notifier_lock);
-   adev->wakeup.ws = wakeup_source_register(dev_name(>dev));
+   adev->wakeup.ws = wakeup_source_register(>dev, 
dev_name(>dev));
adev->wakeup.context.dev = dev;
adev->wakeup.context.func = func;
adev->wakeup.flags.notifier_present = true;
diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c
index 2b8def0ea59f..7ba242b49831 100644
--- a/drivers/base/power/wakeup.c
+++ b/drivers/base/power/wakeup.c
@@ -201,15 +201,17 @@ EXPORT_SYMBOL_GPL(wakeup_source_remove);
 /**
  * wakeup_source_register - Create wakeup source and add it to the list.
  * @name: Name of the wakeup source to register.
+ * @dev: Device wakeup source is associated with (or NULL if virtual)
  */
-struct wakeup_source *wakeup_source_register(const char *name)
+struct wakeup_source *wakeup_source_register(struct device *dev,
+const char *name)
 {
struct wakeup_source *ws;
int ret;
 
ws = wakeup_source_create(name);
if (ws) {
-   ret = wakeup_source_sysfs_add(ws);
+   ret = wakeup_source_sysfs_add(dev, ws);
if (ret) {
kfree_const(ws->name);
kfree(ws);
@@ -273,7 +275,7 @@ int device_wakeup_enable(struct device *dev)
if (pm_suspend_target_state != PM_SUSPEND_ON)
dev_dbg(dev, "Suspicious %s() during system transition!\n", 
__func__);
 
-   ws = wakeup_source_register(dev_name(dev));
+   ws = wakeup_source_register(dev, dev_name(dev));
if (!ws)
return -ENOMEM;
 
diff --git a/drivers/base/power/wakeup_stats.c 
b/drivers/base/power/wakeup_stats.c
index 9c01150f1213..927cc84d3392 100644
--- a/drivers/base/power/wakeup_stats.c
+++ b/drivers/base/power/wakeup_stats.c
@@ -7,8 +7,9 @@
  * Copyright (c) 2019 Google Inc.
  */
 
-#include 
+#include 
 #include 
+#include 
 
 #include "power.h"
 
@@ -80,6 +81,15 @@ static ssize_t last_change_ms_show(struct device *dev,
 }
 static DEVICE_ATTR_RO(last_change_ms);
 
+static ssize_t name_show(struct device *dev,
+  struct device_attribute *attr, char *buf)
+{
+   struct wakeup_source *ws = dev_get_drvdata(dev);
+
+   return sprintf(buf, "%s\n", ws->name);
+}
+static DEVICE_ATTR_RO(name);
+
 static ssize_t prevent_suspend_time_ms_show(struct device *dev,
struct device_attribute *attr,
char *buf)
@@ -96,6 +106,7 @@ static ssize_t prevent_suspend_time_ms_show(struct device 
*dev,
 static DEVICE_ATTR_RO(prevent_suspend_time_ms);
 
 static struct attribute *wakeup_source_attrs[] = {
+  

Re: [PATCH 1/3 v2] fs: ocfs2: Fix possible null-pointer dereferences in ocfs2_xa_prepare_entry()

2019-07-26 Thread Joseph Qi



On 19/7/26 18:14, Jia-Ju Bai wrote:
> In ocfs2_xa_prepare_entry(), there is an if statement on line 2136 to
> check whether loc->xl_entry is NULL:
> if (loc->xl_entry)
> 
> When loc->xl_entry is NULL, it is used on line 2158:
> ocfs2_xa_add_entry(loc, name_hash);
> loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash);
> loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size);
> and line 2164:
> ocfs2_xa_add_namevalue(loc, xi);
> loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len);
> loc->xl_entry->xe_name_len = xi->xi_name_len;
> 
> Thus, possible null-pointer dereferences may occur.
> 
> To fix these bugs, if loc->xl_entry is NULL, ocfs2_xa_prepare_entry()
> abnormally returns with -EINVAL.
> 
> These bugs are found by a static analysis tool STCheck written by us.
> 
> Signed-off-by: Jia-Ju Bai 

Reviewed-by: Joseph Qi 
> ---
> v2:
> * Directly return -EINVAL if loc->xl_entry is NULL.
>   Thank Joseph for helpful advice.
> 
> ---
>  fs/ocfs2/xattr.c | 44 +++-
>  1 file changed, 23 insertions(+), 21 deletions(-)
> 
> diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c
> index 385f3aaa2448..4b876c82a35c 100644
> --- a/fs/ocfs2/xattr.c
> +++ b/fs/ocfs2/xattr.c
> @@ -2133,29 +2133,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc 
> *loc,
>   if (rc)
>   goto out;
>  
> - if (loc->xl_entry) {
> - if (ocfs2_xa_can_reuse_entry(loc, xi)) {
> - orig_value_size = loc->xl_entry->xe_value_size;
> - rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
> - if (rc)
> - goto out;
> - goto alloc_value;
> - }
> + if (!loc->xl_entry) {
> + rc = -EINVAL;
> + goto out;
> + }
>  
> - if (!ocfs2_xattr_is_local(loc->xl_entry)) {
> - orig_clusters = ocfs2_xa_value_clusters(loc);
> - rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
> - if (rc) {
> - mlog_errno(rc);
> - ocfs2_xa_cleanup_value_truncate(loc,
> - "overwriting",
> - orig_clusters);
> - goto out;
> - }
> + if (ocfs2_xa_can_reuse_entry(loc, xi)) {
> + orig_value_size = loc->xl_entry->xe_value_size;
> + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt);
> + if (rc)
> + goto out;
> + goto alloc_value;
> + }
> +
> + if (!ocfs2_xattr_is_local(loc->xl_entry)) {
> + orig_clusters = ocfs2_xa_value_clusters(loc);
> + rc = ocfs2_xa_value_truncate(loc, 0, ctxt);
> + if (rc) {
> + mlog_errno(rc);
> + ocfs2_xa_cleanup_value_truncate(loc,
> + "overwriting",
> + orig_clusters);
> + goto out;
>   }
> - ocfs2_xa_wipe_namevalue(loc);
> - } else
> - ocfs2_xa_add_entry(loc, name_hash);
> + }
> + ocfs2_xa_wipe_namevalue(loc);
>  
>   /*
>* If we get here, we have a blank entry.  Fill it.  We grow our
> 


Re: Regression in 5.3 for some FS_USERNS_MOUNT (aka user-namespace-mountable) filesystems

2019-07-26 Thread Eric W. Biederman
Al Viro  writes:

> On Fri, Jul 26, 2019 at 03:47:02PM -0700, Linus Torvalds wrote:
>
>> Of course, then later on, commit 20284ab7427f ("switch mount_capable()
>> to fs_context") drops that argument entirely, and hardcodes the
>> decision to look at fc->global.
>> 
>> But that fc->global decision wasn't there originally, and is incorrect
>> since it breaks existing users.
>> 
>> What gets much more confusing about this is that the two different
>> users then moved around. The sget_userns() case got moved to
>> legacy_get_tree(), and then joined together in vfs_get_tree(), and
>> then split and moved out to do_new_mount() and vfs_fsconfig_locked().
>> 
>> And that "joined together into vfs_get_tree()" must be wrong, because
>> the two cases used two different namespace rules. The sget_userns()
>> case *did* have that "global" flag check, while the sget_fc() did not.
>> 
>> Messy. Al?
>
> Digging through that mess...  It's my fuckup, and we obviously need to
> restore the old behaviour, but I really hope to manage that with
> checks _not_ in superblock allocator ;-/

If someone had bothered to actually look at how I was proposing to clean
things up before the new mount api we would already have that.  Sigh.

You should be able to get away with something like this which moves the
checks earlier and makes things clearer.  My old patch against the pre
new mount api code.

I am running at undependable speed due to the new baby so it is probably
better for someone else to forward port this, but I will attempt it
otherwise.

Eric

From: "Eric W. Biederman" 
Date: Wed, 21 Nov 2018 11:17:01 -0600
Subject: [PATCH] vfs: Replace FS_USERNS_MOUNT with file_system_type->permission

Permission checking of the user to see if they can mount an individual
filesystem using FS_USERNS_MOUNT and checks in sget is not very
comprehensible.  Further, by pushing the logic down into sget, the
attack surface on filesystems that don't support unprivileged mounts is
much larger than it should be.

Now that it is understood what the permission checks need to be, refactor the
checks into a simple per-filesystem permission check.  If no permission check is
implemented, the default check becomes a simple capable(CAP_SYS_ADMIN).

The result is code that is much simpler to understand and much easier to 
maintain.

Signed-off-by: "Eric W. Biederman" 
---
 fs/devpts/inode.c  |  2 +-
 fs/fuse/inode.c|  3 ++-
 fs/namespace.c | 15 +++
 fs/proc/root.c |  8 +++-
 fs/ramfs/inode.c   |  2 +-
 fs/super.c | 20 ++--
 fs/sysfs/mount.c   | 13 +++--
 include/linux/fs.h |  4 +++-
 ipc/mqueue.c   |  8 +++-
 kernel/cgroup/cgroup.c | 16 
 mm/shmem.c |  4 ++--
 11 files changed, 59 insertions(+), 36 deletions(-)

diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c
index c53814539070..1418912efc7d 100644
--- a/fs/devpts/inode.c
+++ b/fs/devpts/inode.c
@@ -519,9 +519,9 @@ static void devpts_kill_sb(struct super_block *sb)
 
 static struct file_system_type devpts_fs_type = {
.name   = "devpts",
+   .permission = userns_mount_permission,
.mount  = devpts_mount,
.kill_sb= devpts_kill_sb,
-   .fs_flags   = FS_USERNS_MOUNT,
 };
 
 /*
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 0b94b23b02d4..e9f6aa9974f8 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -1259,7 +1259,8 @@ static void fuse_kill_sb_anon(struct super_block *sb)
 static struct file_system_type fuse_fs_type = {
.owner  = THIS_MODULE,
.name   = "fuse",
-   .fs_flags   = FS_HAS_SUBTYPE | FS_USERNS_MOUNT,
+   .fs_flags   = FS_HAS_SUBTYPE,
+   .permission = userns_mount_permission,
.mount  = fuse_mount,
.kill_sb= fuse_kill_sb_anon,
 };
diff --git a/fs/namespace.c b/fs/namespace.c
index 74f64294a410..44935dbdb162 100644
--- a/fs/namespace.c
+++ b/fs/namespace.c
@@ -2448,6 +2448,16 @@ static int do_add_mount(struct mount *newmnt, struct 
path *path, int mnt_flags)
 
 static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags);
 
+static int new_mount_permission(struct file_system_type *type)
+{
+   int err = 0;
+   if (type->permission)
+   err = type->permission();
+   else if (!capable(CAP_SYS_ADMIN))
+   err = -EPERM;
+   return err;
+}
+
 /*
  * create a new mount for userspace and request it to be added into the
  * namespace's tree
@@ -2466,6 +2476,11 @@ static int do_new_mount(struct path *path, const char 
*fstype, int sb_flags,
if (!type)
return -ENODEV;
 
+   /* Verify the mounter has permission to mount the filesystem */
+   err = new_mount_permission(type);
+   if (err)
+   return err;
+
mnt = vfs_kern_mount(type, sb_flags, name, data);
if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
  

Re: [PATCH v6 0/9] Add Error Disconnect Recover (EDR) support

2019-07-26 Thread Austin.Bolen
On 7/26/2019 6:33 PM, sathyanarayanan kuppuswamy wrote:
> +Austin , Huong
>
> On 7/26/19 2:53 PM, Keith Busch wrote:
>> On Fri, Jul 26, 2019 at 02:43:10PM -0700, 
>> sathyanarayanan.kuppusw...@linux.intel.com wrote:
>>> From: Kuppuswamy Sathyanarayanan 
>>> 
>>>
>>> This patchset adds support for following features:
>>>
>>> 1. Error Disconnect Recover (EDR) support.
>>> 2. _OSC based negotiation support for DPC.
>>>
>>> You can find EDR spec in the following link.
>>>
>>> https://members.pcisig.com/wg/PCI-SIG/document/12614
>> Thank you for sticking with this. I've reviewed the series and I think
>> this looks good for the next merge window.
>>
>> Acked-by: Keith Busch 

Tested on a DPC-enabled PCIe switch (Broadcom PEX9733) in a Dell
PowerEdge R740xd.  Injected fatal and non-fatal errors on an NVMe
endpoint below the switch and on the switch downstream port itself and
verified errors were contained and then recovered at the PCIe level.

Tested-by: Austin Bolen 

>>



Re: [PATCH] mm: Make kvfree safe to call

2019-07-26 Thread Matthew Wilcox
On Fri, Jul 26, 2019 at 05:25:03PM -0400, Jeff Layton wrote:
> On Fri, 2019-07-26 at 14:10 -0700, Alexander Duyck wrote:
> > On Fri, Jul 26, 2019 at 2:01 PM Matthew Wilcox  wrote:
> > > From: "Matthew Wilcox (Oracle)" 
> > > 
> > > Since vfree() can sleep, calling kvfree() from contexts where sleeping
> > > is not permitted (eg holding a spinlock) is a bit of a lottery whether
> > > it'll work.  Introduce kvfree_safe() for situations where we know we can
> > > sleep, but make kvfree() safe by default.
> > > 
> > > Reported-by: Jeff Layton 
> > > Cc: Alexander Viro 
> > > Cc: Luis Henriques 
> > > Cc: Christoph Hellwig 
> > > Cc: Carlos Maiolino 
> > > Signed-off-by: Matthew Wilcox (Oracle) 
> > 
> > So you say you are adding kvfree_safe() in the patch description, but
> > it looks like you are introducing kvfree_fast() below. Did something
> > change and the patch description wasn't updated, or is this just the
> > wrong description for this patch?

Oops, bad description.  Thanks, I'll fix it for v2.

> > > +/**
> > > + * kvfree_fast() - Free memory.
> > > + * @addr: Pointer to allocated memory.
> > > + *
> > > + * kvfree_fast frees memory allocated by any of vmalloc(), kmalloc() or
> > > + * kvmalloc().  It is slightly more efficient to use kfree() or vfree() 
> > > if
> > > + * you are certain that you know which one to use.
> > > + *
> > > + * Context: Either preemptible task context or not-NMI interrupt.  Must 
> > > not
> > > + * hold a spinlock as it can sleep.
> > > + */
> > > +void kvfree_fast(const void *addr)
> > > +{
> > > +   might_sleep();
> > > +
> 
> might_sleep_if(!in_interrupt());
> 
> That's what vfree does anyway, so we might as well exempt the case where
> you are.

True, but if we are in interrupt, then we may as well call kvfree() since
it'll do the same thing, and this way the rules are clearer.

> > > +   if (is_vmalloc_addr(addr))
> > > +   vfree(addr);
> > > +   else
> > > +   kfree(addr);
> > > +}
> > > +EXPORT_SYMBOL(kvfree_fast);
> > > +
> 
> That said -- is this really useful?
> 
> The only way to know that this is safe is to know what sort of
> allocation it is, and in that case you can just call kfree or vfree as
> appropriate.

It's safe if you know you're not holding any spinlocks, for example ...



Re: [PATCH v9 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT

2019-07-26 Thread Song Liu



> On Jul 26, 2019, at 4:52 PM, Andrew Morton  wrote:
> 
> On Fri, 26 Jul 2019 23:44:34 + Song Liu  wrote:
> 
>> 
>> 
>>> On Jul 26, 2019, at 4:02 PM, Andrew Morton  
>>> wrote:
>>> 
>>> On Thu, 25 Jul 2019 22:46:54 -0700 Song Liu  wrote:
>>> 
 This patches uses newly added FOLL_SPLIT_PMD in uprobe. This enables easy
 regroup of huge pmd after the uprobe is disabled (in next patch).
>>> 
>>> Confused.  There is no "next patch".
>> 
>> That was the patch 5, which was in earlier versions. I am working on 
>> addressing Kirill's feedback for it. 
>> 
>> Do I need to resubmit 4/4 with modified change log? 
> 
> Please just send new changelog text now.  I assume this [4/4] patch is
> useful without patch #5, but a description of why it is useful is
> appropriate.

Yes, 4/4 is useful with #5. Please find the updated change log. 

= 8< 

This patch uses newly added FOLL_SPLIT_PMD in uprobe. This preserves the 
huge page when the uprobe is enabled. When the uprobe is disabled, newer 
instances of the same application could still benefit from huge page. 

For the next step, we will enable khugepaged to regroup the pmd, so that 
existing instances of the application could also benefit from huge page 
after the uprobe is disabled. 

Acked-by: Kirill A. Shutemov 
Reviewed-by: Srikar Dronamraju 
Signed-off-by: Song Liu 

= 8< 

> 
> I trust the fifth patch is to be sent soon?

Yes, I am working on it. 

Thanks,
Song

[GIT PULL] Devicetree fixes for 5.3-rc, take 2

2019-07-26 Thread Rob Herring
Hi Linus,

Please pull some more DT fixes for 5.3. The nvmem changes would
typically go thru Greg's tree, but they were missed in the merge
window and I've been unable to get a response (partly because Srinivas
is out on vacation it appears).

Rob


The following changes since commit e2297f7c3ab3b68dda2ac732b1767212019d3bdf:

  dt-bindings: pinctrl: stm32: Fix missing 'clocks' property in
examples (2019-07-20 20:28:53 -0600)

are available in the Git repository at:

  git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git
tags/devicetree-fixes-for-5.3-2

for you to fetch changes up to e1ff7390f58e609aa113a2452a953f669abce6cc:

  dt-bindings: Fix more $id value mismatches filenames (2019-07-26
17:41:41 -0600)


Devicetree fixes for 5.3-rc:

- Fix mismatches in $id values and actual filenames. Now checked by
  tools.

- Convert nvmem binding to DT schema

- Fix a typo in of_property_read_bool() kerneldoc

- Remove some redundant description in al-fic interrupt-controller


Maxime Ripard (2):
  dt-bindings: nvmem: Add YAML schemas for the generic NVMEM bindings
  dt-bindings: nvmem: SID: Fix the examples node names

Rob Herring (2):
  dt-bindings: clk: allwinner,sun4i-a10-ccu: Correct path in $id
  dt-bindings: Fix more $id value mismatches filenames

Talel Shenhar (1):
  dt-bindings: interrupt-controller: al-fic: remove redundant binding

Thierry Reding (1):
  of: Fix typo in kerneldoc

 Documentation/devicetree/bindings/arm/renesas.yaml |  2 +-
 .../bindings/arm/socionext/milbeaut.yaml   |  2 +-
 .../devicetree/bindings/arm/ti/ti,davinci.yaml |  2 +-
 .../bindings/clock/allwinner,sun4i-a10-ccu.yaml|  2 +-
 .../intel,ixp4xx-network-processing-engine.yaml|  2 +-
 .../devicetree/bindings/iio/accel/adi,adxl345.yaml |  2 +-
 .../devicetree/bindings/iio/accel/adi,adxl372.yaml |  2 +-
 .../interrupt-controller/amazon,al-fic.txt | 16 ++--
 .../intel,ixp4xx-interrupt.yaml|  2 +-
 ...er.yaml => intel,ixp4xx-ahb-queue-manager.yaml} |  2 +-
 .../bindings/net/allwinner,sun8i-a83t-emac.yaml|  2 +-
 .../bindings/nvmem/allwinner,sun4i-a10-sid.yaml|  4 +-
 .../devicetree/bindings/nvmem/nvmem-consumer.yaml  | 45 +++
 Documentation/devicetree/bindings/nvmem/nvmem.txt  | 81 +--
 Documentation/devicetree/bindings/nvmem/nvmem.yaml | 93 ++
 .../phy/allwinner,sun6i-a31-mipi-dphy.yaml |  2 +-
 .../bindings/timer/intel,ixp4xx-timer.yaml |  2 +-
 include/linux/of.h |  2 +-
 18 files changed, 161 insertions(+), 104 deletions(-)
 rename Documentation/devicetree/bindings/misc/{intel,ixp4xx-queue-manager.yaml
=> intel,ixp4xx-ahb-queue-manager.yaml} (95%)
 create mode 100644 Documentation/devicetree/bindings/nvmem/nvmem-consumer.yaml
 create mode 100644 Documentation/devicetree/bindings/nvmem/nvmem.yaml


[PATCH v3] mm: memcontrol: fix use after free in mem_cgroup_iter()

2019-07-26 Thread Miles Chen
This patch is sent to report an use after free in mem_cgroup_iter()
after merging commit: be2657752e9e "mm: memcg: fix use after free in
mem_cgroup_iter()".

I work with android kernel tree (4.9 & 4.14), and the commit:
be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
been merged to the trees. However, I can still observe use after free
issues addressed in the commit be2657752e9e.
(on low-end devices, a few times this month)

backtrace:
css_tryget <- crash here
mem_cgroup_iter
shrink_node
shrink_zones
do_try_to_free_pages
try_to_free_pages
__perform_reclaim
__alloc_pages_direct_reclaim
__alloc_pages_slowpath
__alloc_pages_nodemask

To debug, I poisoned mem_cgroup before freeing it:

static void __mem_cgroup_free(struct mem_cgroup *memcg)
for_each_node(node)
free_mem_cgroup_per_node_info(memcg, node);
free_percpu(memcg->stat);
+   /* poison memcg before freeing it */
+   memset(memcg, 0x78, sizeof(struct mem_cgroup));
kfree(memcg);
}

The coredump shows the position=0xdbbc2a00 is freed.

(gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
$13 = {position = 0xdbbc2a00, generation = 0x2efd}

0xdbbc2a00: 0xdbbc2e00  0x  0xdbbc2800  0x0100
0xdbbc2a10: 0x0200  0x78787878  0x00026218  0x
0xdbbc2a20: 0xdcad6000  0x0001  0x78787800  0x
0xdbbc2a30: 0x7878  0x  0x0068fb84  0x78787878
0xdbbc2a40: 0x78787878  0x78787878  0x78787878  0xe3fa5cc0
0xdbbc2a50: 0x78787878  0x78787878  0x  0x
0xdbbc2a60: 0x  0x  0x  0x
0xdbbc2a70: 0x  0x  0x  0x
0xdbbc2a80: 0x  0x  0x  0x
0xdbbc2a90: 0x0001  0x  0x  0x0010
0xdbbc2aa0: 0x0001  0xdbbc2ac8  0x  0x
0xdbbc2ab0: 0x  0x  0x  0x
0xdbbc2ac0: 0x  0x  0xe5b02618  0x1000
0xdbbc2ad0: 0x  0x78787878  0x78787878  0x78787878
0xdbbc2ae0: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2af0: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b00: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b10: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b20: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b30: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b40: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b50: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b60: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b70: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2b80: 0x78787878  0x78787878  0x  0x78787878
0xdbbc2b90: 0x78787878  0x78787878  0x78787878  0x78787878
0xdbbc2ba0: 0x78787878  0x78787878  0x78787878  0x78787878

In the reclaim path, try_to_free_pages() does not setup
sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
shrink_node().

In mem_cgroup_iter(), root is set to root_mem_cgroup because
sc->target_mem_cgroup is NULL.
It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
mem_cgroup_iter().

try_to_free_pages
struct scan_control sc = {...}, target_mem_cgroup is 0x0;
do_try_to_free_pages
shrink_zones
shrink_node
 mem_cgroup *root = sc->target_mem_cgroup;
 memcg = mem_cgroup_iter(root, NULL, );
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...

css = css_next_descendant_pre(css, >css);
memcg = mem_cgroup_from_css(css);
cmpxchg(>position, pos, memcg);

My device uses memcg non-hierarchical mode.
When we release a memcg: invalidate_reclaim_iterators() reaches only
dead_memcg and its parents. If non-hierarchical mode is used,
invalidate_reclaim_iterators() never reaches root_mem_cgroup.

static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg)
{
struct mem_cgroup *memcg = dead_memcg;

for (; memcg; memcg = parent_mem_cgroup(memcg)
...
}

So the use after free scenario looks like:

CPU1CPU2

try_to_free_pages
do_try_to_free_pages
shrink_zones
shrink_node
mem_cgroup_iter()
if (!root)
root = root_mem_cgroup;
...
css = css_next_descendant_pre(css, >css);
memcg = mem_cgroup_from_css(css);
cmpxchg(>position, pos, memcg);


Re: [PATCH 3/3] Fix sched-messaging.c use of uninitialized value errors

2019-07-26 Thread Ian Rogers
On Fri, Jul 26, 2019 at 12:32 PM Arnaldo Carvalho de Melo
 wrote:
>
> Em Wed, Jul 24, 2019 at 04:45:00PM -0700, Numfor Mbiziwo-Tiapo escreveu:
> > Our local MSAN (Memory Sanitizer) build of perf throws use of
> > uninitialized value warnings in "tools/perf/bench/sched-messaging.c"
> > when running perf bench.
> >
> > The first warning comes from the "ready" function where the "dummy" char
> > is declared and then passed into "write" without being initialized.
> > Initializing "dummy" to any character silences the warning.
> >
> > The second warning comes from the "sender" function where a "write" call
> > is made to write the contents from the "data" char array when it has not
> > yet been initialized. Calling memset on "data" silences the warning.
>
> So, this is just to silence MSAN, as it doesn't matter what is sent,
> whatever values are in those variables is ok, as it will not be used,
> right?

That's right.

Thanks,
Ian Rogers

> - Arnaldo
>
> > To reproduce this warning, build perf by running:
> > make -C tools/perf CLANG=1 CC=clang EXTRA_CFLAGS="-fsanitize=memory\
> >  -fsanitize-memory-track-origins"
> >
> > (Additionally, llvm might have to be installed and clang might have to
> > be specified as the compiler - export CC=/usr/bin/clang)
> >
> > then running: tools/perf/perf bench sched all
> >
> > Please see the cover letter for why false positive warnings may be
> > generated.
> >
> > Signed-off-by: Numfor Mbiziwo-Tiapo 
> > ---
> >  tools/perf/bench/sched-messaging.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/tools/perf/bench/sched-messaging.c 
> > b/tools/perf/bench/sched-messaging.c
> > index f9d7641ae833..d22d7b7b591d 100644
> > --- a/tools/perf/bench/sched-messaging.c
> > +++ b/tools/perf/bench/sched-messaging.c
> > @@ -69,7 +69,7 @@ static void fdpair(int fds[2])
> >  /* Block until we're ready to go */
> >  static void ready(int ready_out, int wakefd)
> >  {
> > - char dummy;
> > + char dummy = 'N';
> >   struct pollfd pollfd = { .fd = wakefd, .events = POLLIN };
> >
> >   /* Tell them we're ready. */
> > @@ -87,6 +87,7 @@ static void *sender(struct sender_context *ctx)
> >   char data[DATASIZE];
> >   unsigned int i, j;
> >
> > + memset(data, 'N', DATASIZE);
> >   ready(ctx->ready_out, ctx->wakefd);
> >
> >   /* Now pump to every receiver. */
> > --
> > 2.22.0.657.g960e92d24f-goog
>
> --
>
> - Arnaldo


Re: [PATCH v9 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT

2019-07-26 Thread Andrew Morton
On Fri, 26 Jul 2019 23:44:34 + Song Liu  wrote:

> 
> 
> > On Jul 26, 2019, at 4:02 PM, Andrew Morton  
> > wrote:
> > 
> > On Thu, 25 Jul 2019 22:46:54 -0700 Song Liu  wrote:
> > 
> >> This patches uses newly added FOLL_SPLIT_PMD in uprobe. This enables easy
> >> regroup of huge pmd after the uprobe is disabled (in next patch).
> > 
> > Confused.  There is no "next patch".
> 
> That was the patch 5, which was in earlier versions. I am working on 
> addressing Kirill's feedback for it. 
> 
> Do I need to resubmit 4/4 with modified change log? 

Please just send new changelog text now.  I assume this [4/4] patch is
useful without patch #5, but a description of why it is useful is
appropriate.

I trust the fifth patch is to be sent soon?


Re: [PATCH v2] mm: memcontrol: fix use after free in mem_cgroup_iter()

2019-07-26 Thread Miles Chen
On Fri, 2019-07-26 at 14:55 +0200, Michal Hocko wrote:
> On Fri 26-07-19 14:49:33, Michal Hocko wrote:
> > On Fri 26-07-19 10:12:47, Miles Chen wrote:
> > > This patch is sent to report an use after free in mem_cgroup_iter()
> > > after merging commit: be2657752e9e "mm: memcg: fix use after free in
> > > mem_cgroup_iter()".
> > > 
> > > I work with android kernel tree (4.9 & 4.14), and the commit:
> > > be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has
> > > been merged to the trees. However, I can still observe use after free
> > > issues addressed in the commit be2657752e9e.
> > > (on low-end devices, a few times this month)
> > > 
> > > backtrace:
> > >   css_tryget <- crash here
> > >   mem_cgroup_iter
> > >   shrink_node
> > >   shrink_zones
> > >   do_try_to_free_pages
> > >   try_to_free_pages
> > >   __perform_reclaim
> > >   __alloc_pages_direct_reclaim
> > >   __alloc_pages_slowpath
> > >   __alloc_pages_nodemask
> > > 
> > > To debug, I poisoned mem_cgroup before freeing it:
> > > 
> > > static void __mem_cgroup_free(struct mem_cgroup *memcg)
> > >   for_each_node(node)
> > >   free_mem_cgroup_per_node_info(memcg, node);
> > >   free_percpu(memcg->stat);
> > > +   /* poison memcg before freeing it */
> > > +   memset(memcg, 0x78, sizeof(struct mem_cgroup));
> > >   kfree(memcg);
> > > }
> > > 
> > > The coredump shows the position=0xdbbc2a00 is freed.
> > > 
> > > (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8]
> > > $13 = {position = 0xdbbc2a00, generation = 0x2efd}
> > > 
> > > 0xdbbc2a00: 0xdbbc2e00  0x  0xdbbc2800  0x0100
> > > 0xdbbc2a10: 0x0200  0x78787878  0x00026218  0x
> > > 0xdbbc2a20: 0xdcad6000  0x0001  0x78787800  0x
> > > 0xdbbc2a30: 0x7878  0x  0x0068fb84  0x78787878
> > > 0xdbbc2a40: 0x78787878  0x78787878  0x78787878  0xe3fa5cc0
> > > 0xdbbc2a50: 0x78787878  0x78787878  0x  0x
> > > 0xdbbc2a60: 0x  0x  0x  0x
> > > 0xdbbc2a70: 0x  0x  0x  0x
> > > 0xdbbc2a80: 0x  0x  0x  0x
> > > 0xdbbc2a90: 0x0001  0x  0x  0x0010
> > > 0xdbbc2aa0: 0x0001  0xdbbc2ac8  0x  0x
> > > 0xdbbc2ab0: 0x  0x  0x  0x
> > > 0xdbbc2ac0: 0x  0x  0xe5b02618  0x1000
> > > 0xdbbc2ad0: 0x  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2ae0: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2af0: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b00: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b10: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b20: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b30: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b40: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b50: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b60: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b70: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2b80: 0x78787878  0x78787878  0x  0x78787878
> > > 0xdbbc2b90: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 0xdbbc2ba0: 0x78787878  0x78787878  0x78787878  0x78787878
> > > 
> > > In the reclaim path, try_to_free_pages() does not setup
> > > sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ...,
> > > shrink_node().
> > > 
> > > In mem_cgroup_iter(), root is set to root_mem_cgroup because
> > > sc->target_mem_cgroup is NULL.
> > > It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in
> > > mem_cgroup_iter().
> > > 
> > >   try_to_free_pages
> > >   struct scan_control sc = {...}, target_mem_cgroup is 0x0;
> > >   do_try_to_free_pages
> > >   shrink_zones
> > >   shrink_node
> > >mem_cgroup *root = sc->target_mem_cgroup;
> > >memcg = mem_cgroup_iter(root, NULL, );
> > >   mem_cgroup_iter()
> > >   if (!root)
> > >   root = root_mem_cgroup;
> > >   ...
> > > 
> > >   css = css_next_descendant_pre(css, >css);
> > >   memcg = mem_cgroup_from_css(css);
> > >   cmpxchg(>position, pos, memcg);
> > > 
> > > My device uses memcg non-hierarchical mode.
> > > When we release a memcg: invalidate_reclaim_iterators() reaches only
> > > dead_memcg and its parents. If non-hierarchical mode is used,
> > > invalidate_reclaim_iterators() never reaches root_mem_cgroup.
> > > 
> > > static void 

[PATCH] sched/core: Don't use dying mm as active_mm for kernel threads

2019-07-26 Thread Waiman Long
It was found that a dying mm_struct where the owning task has exited can
stay on as active_mm of kernel threads as long as no other user tasks
run on those CPUs that use it as active_mm. This prolongs the lifetime
of the dying mm, holding up memory and other resources that cannot be freed.

Fix that by forcing the kernel threads to use init_mm as the active_mm
if the previous active_mm is dying.

Signed-off-by: Waiman Long 
---
 kernel/sched/core.c | 13 +++--
 mm/init-mm.c|  2 ++
 2 files changed, 13 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/core.c b/kernel/sched/core.c
index 2b037f195473..ca348e1f5a1e 100644
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@@ -3233,13 +3233,22 @@ context_switch(struct rq *rq, struct task_struct *prev,
 * Both of these contain the full memory barrier required by
 * membarrier after storing to rq->curr, before returning to
 * user-space.
+*
+* If mm is NULL and oldmm is dying (!owner), we switch to
+* init_mm instead to make sure that oldmm can be freed ASAP.
 */
-   if (!mm) {
+   if (!mm && oldmm->owner) {
next->active_mm = oldmm;
mmgrab(oldmm);
enter_lazy_tlb(oldmm, next);
-   } else
+   } else {
+   if (!mm) {
+   mm = _mm;
+   next->active_mm = mm;
+   mmgrab(mm);
+   }
switch_mm_irqs_off(oldmm, mm, next);
+   }
 
if (!prev->mm) {
prev->active_mm = NULL;
diff --git a/mm/init-mm.c b/mm/init-mm.c
index a787a319211e..5bfc6bc333ca 100644
--- a/mm/init-mm.c
+++ b/mm/init-mm.c
@@ -5,6 +5,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -36,5 +37,6 @@ struct mm_struct init_mm = {
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
.user_ns= _user_ns,
.cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
+   .owner  = _task,
INIT_MM_CONTEXT(init_mm)
 };
-- 
2.18.1



Re: [PATCH v9 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT

2019-07-26 Thread Song Liu



> On Jul 26, 2019, at 4:02 PM, Andrew Morton  wrote:
> 
> On Thu, 25 Jul 2019 22:46:54 -0700 Song Liu  wrote:
> 
>> This patches uses newly added FOLL_SPLIT_PMD in uprobe. This enables easy
>> regroup of huge pmd after the uprobe is disabled (in next patch).
> 
> Confused.  There is no "next patch".

That was the patch 5, which was in earlier versions. I am working on 
addressing Kirill's feedback for it. 

Do I need to resubmit 4/4 with modified change log? 

Thanks,
Song

[PATCH] dt-bindings: Fix more $id value mismatches filenames

2019-07-26 Thread Rob Herring
The path in the schema '$id' values are wrong. Fix them.

Signed-off-by: Rob Herring 
---
 Documentation/devicetree/bindings/arm/renesas.yaml  | 2 +-
 Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml   | 2 +-
 Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml| 2 +-
 .../firmware/intel,ixp4xx-network-processing-engine.yaml| 2 +-
 Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml| 2 +-
 Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml| 2 +-
 .../bindings/interrupt-controller/intel,ixp4xx-interrupt.yaml   | 2 +-
 ...x-queue-manager.yaml => intel,ixp4xx-ahb-queue-manager.yaml} | 2 +-
 .../devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml  | 2 +-
 .../devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml  | 2 +-
 Documentation/devicetree/bindings/timer/intel,ixp4xx-timer.yaml | 2 +-
 11 files changed, 11 insertions(+), 11 deletions(-)
 rename Documentation/devicetree/bindings/misc/{intel,ixp4xx-queue-manager.yaml 
=> intel,ixp4xx-ahb-queue-manager.yaml} (95%)

diff --git a/Documentation/devicetree/bindings/arm/renesas.yaml 
b/Documentation/devicetree/bindings/arm/renesas.yaml
index 08c923f8c257..28eb458f761a 100644
--- a/Documentation/devicetree/bindings/arm/renesas.yaml
+++ b/Documentation/devicetree/bindings/arm/renesas.yaml
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 %YAML 1.2
 ---
-$id: http://devicetree.org/schemas/arm/shmobile.yaml#
+$id: http://devicetree.org/schemas/arm/renesas.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
 title: Renesas SH-Mobile, R-Mobile, and R-Car Platform Device Tree Bindings
diff --git a/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml 
b/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml
index aae53fc3cb1e..2bd519d2e855 100644
--- a/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml
+++ b/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 %YAML 1.2
 ---
-$id: http://devicetree.org/schemas/arm/milbeaut.yaml#
+$id: http://devicetree.org/schemas/arm/socionext/milbeaut.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
 title: Milbeaut platforms device tree bindings
diff --git a/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml 
b/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml
index 4326d2cfa15d..a8765ba29476 100644
--- a/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml
+++ b/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 %YAML 1.2
 ---
-$id: http://devicetree.org/schemas/arm/ti/davinci.yaml#
+$id: http://devicetree.org/schemas/arm/ti/ti,davinci.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
 title: Texas Instruments DaVinci Platforms Device Tree Bindings
diff --git 
a/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml
 
b/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml
index 8cb136c376fb..4f0db8ee226a 100644
--- 
a/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml
+++ 
b/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml
@@ -2,7 +2,7 @@
 # Copyright 2019 Linaro Ltd.
 %YAML 1.2
 ---
-$id: 
"http://devicetree.org/schemas/firmware/intel-ixp4xx-network-processing-engine.yaml#"
+$id: 
"http://devicetree.org/schemas/firmware/intel,ixp4xx-network-processing-engine.yaml#"
 $schema: "http://devicetree.org/meta-schemas/core.yaml#"
 
 title: Intel IXP4xx Network Processing Engine
diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml 
b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml
index 7ba167e2e1ea..c602b6fe1c0c 100644
--- a/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml
+++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 %YAML 1.2
 ---
-$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl345.yaml#
+$id: http://devicetree.org/schemas/iio/accel/adi,adxl345.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
 title: Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers
diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml 
b/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml
index a7fafb9bf5c6..e7daffec88d3 100644
--- a/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml
+++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml
@@ -1,7 +1,7 @@
 # SPDX-License-Identifier: GPL-2.0
 %YAML 1.2
 ---
-$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl372.yaml#
+$id: http://devicetree.org/schemas/iio/accel/adi,adxl372.yaml#
 $schema: http://devicetree.org/meta-schemas/core.yaml#
 
 title: Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer
diff --git 

  1   2   3   4   5   6   7   8   9   10   >