[PATCH RT 4/8] sched: migrate disable: Protect cpus_ptr with lock
Various places assume that cpus_ptr is protected by rq/pi locks, so don't change it before grabbing those locks. Signed-off-by: Scott Wood --- kernel/sched/core.c | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 99a3cfccf4d3..38a9a9df5638 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7283,9 +7283,8 @@ void dump_cpu_task(int cpu) struct rq *rq; struct rq_flags rf; - p->cpus_ptr = cpumask_of(smp_processor_id()); - rq = task_rq_lock(p, &rf); + p->cpus_ptr = cpumask_of(smp_processor_id()); update_nr_migratory(p, -1); p->nr_cpus_allowed = 1; task_rq_unlock(rq, p, &rf); @@ -7297,9 +7296,8 @@ void dump_cpu_task(int cpu) struct rq *rq; struct rq_flags rf; - p->cpus_ptr = &p->cpus_mask; - rq = task_rq_lock(p, &rf); + p->cpus_ptr = &p->cpus_mask; p->nr_cpus_allowed = cpumask_weight(&p->cpus_mask); update_nr_migratory(p, 1); task_rq_unlock(rq, p, &rf); -- 1.8.3.1
[PATCH RT 1/8] sched: migrate_enable: Use sleeping_lock to indicate involuntary sleep
Without this, rcu_note_context_switch() will complain if an RCU read lock is held when migrate_enable() calls stop_one_cpu(). Signed-off-by: Scott Wood --- include/linux/sched.h | 4 ++-- kernel/rcu/tree_plugin.h | 2 +- kernel/sched/core.c | 2 ++ 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/include/linux/sched.h b/include/linux/sched.h index 4e218f8d8048..ad23ab939b35 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -673,7 +673,7 @@ struct task_struct { int migrate_disable_atomic; # endif #endif -#ifdef CONFIG_PREEMPT_RT_FULL +#ifdef CONFIG_PREEMPT_RT_BASE int sleeping_lock; #endif @@ -1873,7 +1873,7 @@ static __always_inline bool need_resched(void) return unlikely(tif_need_resched()); } -#ifdef CONFIG_PREEMPT_RT_FULL +#ifdef CONFIG_PREEMPT_RT_BASE static inline void sleeping_lock_inc(void) { current->sleeping_lock++; diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h index 23a54e4b649c..7a3aa085ce2c 100644 --- a/kernel/rcu/tree_plugin.h +++ b/kernel/rcu/tree_plugin.h @@ -292,7 +292,7 @@ void rcu_note_context_switch(bool preempt) barrier(); /* Avoid RCU read-side critical sections leaking down. */ trace_rcu_utilization(TPS("Start context switch")); lockdep_assert_irqs_disabled(); -#if defined(CONFIG_PREEMPT_RT_FULL) +#if defined(CONFIG_PREEMPT_RT_BASE) sleeping_l = t->sleeping_lock; #endif WARN_ON_ONCE(!preempt && t->rcu_read_lock_nesting > 0 && !sleeping_l); diff --git a/kernel/sched/core.c b/kernel/sched/core.c index d3c6542b306f..c3407707e367 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7405,7 +7405,9 @@ void migrate_enable(void) unpin_current_cpu(); preempt_lazy_enable(); preempt_enable(); + sleeping_lock_inc(); stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); + sleeping_lock_dec(); return; } } -- 1.8.3.1
[PATCH RT 6/8] sched: migrate_enable: Set state to TASK_RUNNING
If migrate_enable() is called while a task is preparing to sleep (state != TASK_RUNNING), that triggers a debug check in stop_one_cpu(). Explicitly reset state to acknowledge that we're accepting the spurious wakeup. Signed-off-by: Scott Wood --- kernel/sched/core.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 38a9a9df5638..eb27a9bf70d7 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7396,6 +7396,14 @@ void migrate_enable(void) unpin_current_cpu(); preempt_lazy_enable(); preempt_enable(); + + /* + * Avoid sleeping with an existing non-running + * state. This will result in a spurious wakeup + * for the calling context. + */ + __set_current_state(TASK_RUNNING); + sleeping_lock_inc(); stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg); sleeping_lock_dec(); -- 1.8.3.1
[PATCH] raid1: factor out a common routine to handle the completion of sync write
It's just code clean-up. Signed-off-by: Hou Tao --- drivers/md/raid1.c | 39 ++- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c index 1755d2233e4d..d73ed94764c1 100644 --- a/drivers/md/raid1.c +++ b/drivers/md/raid1.c @@ -1904,6 +1904,22 @@ static void abort_sync_write(struct mddev *mddev, struct r1bio *r1_bio) } while (sectors_to_go > 0); } +static void put_sync_write_buf(struct r1bio *r1_bio, int uptodate) +{ + if (atomic_dec_and_test(_bio->remaining)) { + struct mddev *mddev = r1_bio->mddev; + int s = r1_bio->sectors; + + if (test_bit(R1BIO_MadeGood, _bio->state) || + test_bit(R1BIO_WriteError, _bio->state)) + reschedule_retry(r1_bio); + else { + put_buf(r1_bio); + md_done_sync(mddev, s, uptodate); + } + } +} + static void end_sync_write(struct bio *bio) { int uptodate = !bio->bi_status; @@ -1930,16 +1946,7 @@ static void end_sync_write(struct bio *bio) ) set_bit(R1BIO_MadeGood, _bio->state); - if (atomic_dec_and_test(_bio->remaining)) { - int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, _bio->state) || - test_bit(R1BIO_WriteError, _bio->state)) - reschedule_retry(r1_bio); - else { - put_buf(r1_bio); - md_done_sync(mddev, s, uptodate); - } - } + put_sync_write_buf(r1_bio, uptodate); } static int r1_sync_page_io(struct md_rdev *rdev, sector_t sector, @@ -,17 +2229,7 @@ static void sync_request_write(struct mddev *mddev, struct r1bio *r1_bio) generic_make_request(wbio); } - if (atomic_dec_and_test(_bio->remaining)) { - /* if we're here, all write(s) have completed, so clean up */ - int s = r1_bio->sectors; - if (test_bit(R1BIO_MadeGood, _bio->state) || - test_bit(R1BIO_WriteError, _bio->state)) - reschedule_retry(r1_bio); - else { - put_buf(r1_bio); - md_done_sync(mddev, s, 1); - } - } + put_sync_write_buf(r1_bio, 1); } /* -- 2.22.0
[PATCH RT 5/8] sched/deadline: Reclaim cpuset bandwidth in .migrate_task_rq()
With the changes to migrate disabling, ->set_cpus_allowed() no longer gets deferred until migrate_enable(). To avoid releasing the bandwidth while the task may still be executing on the old CPU, move the subtraction to ->migrate_task_rq(). Signed-off-by: Scott Wood --- kernel/sched/deadline.c | 67 +++-- 1 file changed, 31 insertions(+), 36 deletions(-) diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c index c18be51f7608..2f18d0cf1b56 100644 --- a/kernel/sched/deadline.c +++ b/kernel/sched/deadline.c @@ -1606,14 +1606,42 @@ static void yield_task_dl(struct rq *rq) return cpu; } +static void free_old_cpuset_bw_dl(struct rq *rq, struct task_struct *p) +{ + struct root_domain *src_rd = rq->rd; + + /* +* Migrating a SCHED_DEADLINE task between exclusive +* cpusets (different root_domains) entails a bandwidth +* update. We already made space for us in the destination +* domain (see cpuset_can_attach()). +*/ + if (!cpumask_intersects(src_rd->span, p->cpus_ptr)) { + struct dl_bw *src_dl_b; + + src_dl_b = dl_bw_of(cpu_of(rq)); + /* +* We now free resources of the root_domain we are migrating +* off. In the worst case, sched_setattr() may temporary fail +* until we complete the update. +*/ + raw_spin_lock(_dl_b->lock); + __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); + raw_spin_unlock(_dl_b->lock); + } +} + static void migrate_task_rq_dl(struct task_struct *p, int new_cpu __maybe_unused) { struct rq *rq; - if (p->state != TASK_WAKING) + rq = task_rq(p); + + if (p->state != TASK_WAKING) { + free_old_cpuset_bw_dl(rq, p); return; + } - rq = task_rq(p); /* * Since p->state == TASK_WAKING, set_task_cpu() has been called * from try_to_wake_up(). 
Hence, p->pi_lock is locked, but @@ -2220,39 +2248,6 @@ static void task_woken_dl(struct rq *rq, struct task_struct *p) } } -static void set_cpus_allowed_dl(struct task_struct *p, - const struct cpumask *new_mask) -{ - struct root_domain *src_rd; - struct rq *rq; - - BUG_ON(!dl_task(p)); - - rq = task_rq(p); - src_rd = rq->rd; - /* -* Migrating a SCHED_DEADLINE task between exclusive -* cpusets (different root_domains) entails a bandwidth -* update. We already made space for us in the destination -* domain (see cpuset_can_attach()). -*/ - if (!cpumask_intersects(src_rd->span, new_mask)) { - struct dl_bw *src_dl_b; - - src_dl_b = dl_bw_of(cpu_of(rq)); - /* -* We now free resources of the root_domain we are migrating -* off. In the worst case, sched_setattr() may temporary fail -* until we complete the update. -*/ - raw_spin_lock(_dl_b->lock); - __dl_sub(src_dl_b, p->dl.dl_bw, dl_bw_cpus(task_cpu(p))); - raw_spin_unlock(_dl_b->lock); - } - - set_cpus_allowed_common(p, new_mask); -} - /* Assumes rq->lock is held */ static void rq_online_dl(struct rq *rq) { @@ -2407,7 +2402,7 @@ static void prio_changed_dl(struct rq *rq, struct task_struct *p, #ifdef CONFIG_SMP .select_task_rq = select_task_rq_dl, .migrate_task_rq= migrate_task_rq_dl, - .set_cpus_allowed = set_cpus_allowed_dl, + .set_cpus_allowed = set_cpus_allowed_common, .rq_online = rq_online_dl, .rq_offline = rq_offline_dl, .task_woken = task_woken_dl, -- 1.8.3.1
[PATCH RT 2/8] sched: __set_cpus_allowed_ptr: Check cpus_mask, not cpus_ptr
This function is concerned with the long-term cpu mask, not the transitory mask the task might have while migrate disabled. Before this patch, if a task was migrate disabled at the time __set_cpus_allowed_ptr() was called, and the new mask happened to be equal to the cpu that the task was running on, then the mask update would be lost. Signed-off-by: Scott Wood --- kernel/sched/core.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index c3407707e367..6e643d656d71 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1218,7 +1218,7 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, goto out; } - if (cpumask_equal(p->cpus_ptr, new_mask)) + if (cpumask_equal(&p->cpus_mask, new_mask)) goto out; if (!cpumask_intersects(new_mask, cpu_valid_mask)) { -- 1.8.3.1
[PATCH RT 7/8] sched: migrate_enable: Use select_fallback_rq()
migrate_enable() currently open-codes a variant of select_fallback_rq(). However, it does not have the "No more Mr. Nice Guy" fallback and thus it will pass an invalid CPU to the migration thread if cpus_mask only contains a CPU that is !active. Signed-off-by: Scott Wood --- This scenario will be more likely after the next patch, since the migrate_disable_update check goes away. However, it could happen anyway if cpus_mask was updated to a CPU other than the one we were pinned to, and that CPU subsequently became inactive. --- kernel/sched/core.c | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index eb27a9bf70d7..3a2d8251a30c 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -7368,6 +7368,7 @@ void migrate_enable(void) if (p->migrate_disable_update) { struct rq *rq; struct rq_flags rf; + int cpu = task_cpu(p); rq = task_rq_lock(p, &rf); update_rq_clock(rq); @@ -7377,21 +7378,15 @@ void migrate_enable(void) p->migrate_disable_update = 0; - WARN_ON(smp_processor_id() != task_cpu(p)); - if (!cpumask_test_cpu(task_cpu(p), &p->cpus_mask)) { - const struct cpumask *cpu_valid_mask = cpu_active_mask; - struct migration_arg arg; - unsigned int dest_cpu; - - if (p->flags & PF_KTHREAD) { - /* - * Kernel threads are allowed on online && !active CPUs - */ - cpu_valid_mask = cpu_online_mask; - } - dest_cpu = cpumask_any_and(cpu_valid_mask, &p->cpus_mask); - arg.task = p; - arg.dest_cpu = dest_cpu; + WARN_ON(smp_processor_id() != cpu); + if (!cpumask_test_cpu(cpu, &p->cpus_mask)) { + struct migration_arg arg = { p }; + struct rq_flags rf; + + rq = task_rq_lock(p, &rf); + update_rq_clock(rq); + arg.dest_cpu = select_fallback_rq(cpu, p); + task_rq_unlock(rq, p, &rf); unpin_current_cpu(); preempt_lazy_enable(); -- 1.8.3.1
[PATCH RT 3/8] sched: Remove dead __migrate_disabled() check
This code was unreachable given the __migrate_disabled() branch to "out" immediately beforehand. Signed-off-by: Scott Wood --- kernel/sched/core.c | 7 --- 1 file changed, 7 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 6e643d656d71..99a3cfccf4d3 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -1242,13 +1242,6 @@ static int __set_cpus_allowed_ptr(struct task_struct *p, if (cpumask_test_cpu(task_cpu(p), new_mask) || __migrate_disabled(p)) goto out; -#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) - if (__migrate_disabled(p)) { - p->migrate_disable_update = 1; - goto out; - } -#endif - dest_cpu = cpumask_any_and(cpu_valid_mask, new_mask); if (task_running(rq, p) || p->state == TASK_WAKING) { struct migration_arg arg = { p, dest_cpu }; -- 1.8.3.1
[PATCH RT 8/8] sched: Lazy migrate_disable processing
Avoid overhead on the majority of migrate disable/enable sequences by only manipulating scheduler data (and grabbing the relevant locks) when the task actually schedules while migrate-disabled. Very large speedups were seen during a kernel build. Instead of cpuhp_pin_lock, CPU hotplug is handled by keeping a per-CPU count of the number of pinned tasks (including tasks which have not scheduled in the migrate-disabled section); takedown_cpu() will wait until that reaches zero (confirmed by take_cpu_down() in stop machine context to deal with races) before migrating tasks off of the cpu. To simplify synchronization, updating cpus_mask is no longer deferred until migrate_enable(). This lets us not have to worry about migrate_enable() missing the update if it's on the fast path (didn't schedule during the migrate disabled section). It also makes the code a bit simpler and reduces deviation from mainline. While the main motivation for this is the performance benefit, lazy migrate disable also eliminates the restriction on calling migrate_disable() while atomic but leaving the atomic region prior to calling migrate_enable() -- though this won't help with local_bh_disable() (and thus rcutorture) unless something similar is done with the recently added local_lock. 
Signed-off-by: Scott Wood --- include/linux/cpu.h| 4 -- include/linux/sched.h | 11 +-- init/init_task.c | 4 ++ kernel/cpu.c | 97 + kernel/sched/core.c| 192 - kernel/sched/sched.h | 4 ++ lib/smp_processor_id.c | 3 + 7 files changed, 130 insertions(+), 185 deletions(-) diff --git a/include/linux/cpu.h b/include/linux/cpu.h index f4a772c12d14..2df500fdcbc4 100644 --- a/include/linux/cpu.h +++ b/include/linux/cpu.h @@ -113,8 +113,6 @@ static inline void cpu_maps_update_done(void) extern void cpu_hotplug_enable(void); void clear_tasks_mm_cpumask(int cpu); int cpu_down(unsigned int cpu); -extern void pin_current_cpu(void); -extern void unpin_current_cpu(void); #else /* CONFIG_HOTPLUG_CPU */ @@ -126,8 +124,6 @@ static inline void cpus_read_unlock(void) { } static inline void lockdep_assert_cpus_held(void) { } static inline void cpu_hotplug_disable(void) { } static inline void cpu_hotplug_enable(void) { } -static inline void pin_current_cpu(void) { } -static inline void unpin_current_cpu(void) { } #endif /* !CONFIG_HOTPLUG_CPU */ diff --git a/include/linux/sched.h b/include/linux/sched.h index ad23ab939b35..069c46dde15b 100644 --- a/include/linux/sched.h +++ b/include/linux/sched.h @@ -229,6 +229,8 @@ extern long io_schedule_timeout(long timeout); extern void io_schedule(void); +int cpu_nr_pinned(int cpu); + /** * struct prev_cputime - snapshot of system and user cputime * @utime: time spent in user mode @@ -661,16 +663,13 @@ struct task_struct { cpumask_t cpus_mask; #if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) int migrate_disable; - int migrate_disable_update; - int pinned_on_cpu; + boolmigrate_disable_scheduled; # ifdef CONFIG_SCHED_DEBUG - int migrate_disable_atomic; + int pinned_on_cpu; # endif - #elif !defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) # ifdef CONFIG_SCHED_DEBUG int migrate_disable; - int migrate_disable_atomic; # endif #endif #ifdef CONFIG_PREEMPT_RT_BASE @@ -2066,4 +2065,6 @@ static inline void rseq_syscall(struct pt_regs *regs) 
#endif +extern struct task_struct *takedown_cpu_task; + #endif diff --git a/init/init_task.c b/init/init_task.c index e402413dc47d..c0c7618fd2fb 100644 --- a/init/init_task.c +++ b/init/init_task.c @@ -81,6 +81,10 @@ struct task_struct init_task .cpus_ptr = _task.cpus_mask, .cpus_mask = CPU_MASK_ALL, .nr_cpus_allowed= NR_CPUS, +#if defined(CONFIG_SMP) && defined(CONFIG_PREEMPT_RT_BASE) && \ +defined(CONFIG_SCHED_DEBUG) + .pinned_on_cpu = -1, +#endif .mm = NULL, .active_mm = _mm, .restart_block = { diff --git a/kernel/cpu.c b/kernel/cpu.c index 885a195dfbe0..0096acf1a692 100644 --- a/kernel/cpu.c +++ b/kernel/cpu.c @@ -76,11 +76,6 @@ static DEFINE_PER_CPU(struct cpuhp_cpu_state, cpuhp_state) = { .fail = CPUHP_INVALID, }; -#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_PREEMPT_RT_FULL) -static DEFINE_PER_CPU(struct rt_rw_lock, cpuhp_pin_lock) = \ - __RWLOCK_RT_INITIALIZER(cpuhp_pin_lock); -#endif - #if defined(CONFIG_LOCKDEP) && defined(CONFIG_SMP) static struct lockdep_map cpuhp_state_up_map = STATIC_LOCKDEP_MAP_INIT("cpuhp_state-up", _state_up_map); @@ -287,55 +282,6 @@ void cpu_maps_update_done(void) #ifdef
[RT PATCH 0/8] migrate disable fixes and performance
With these patches, a kernel build on a 104-cpu machine took around 75% less wall time and 85% less system time. Note that there is a difference in v5.2-rt compared to v5.0-rt. The performance with these patches is similar in both cases, but without these patches v5.2-rt is substantially slower. In v5.0-rt with a previous version of these patches, lazy migrate disable reduced kernel build time by around 15-20% wall and 70-75% system. Scott Wood (8): sched: migrate_enable: Use sleeping_lock to indicate involuntary sleep sched: __set_cpus_allowed_ptr: Check cpus_mask, not cpus_ptr sched: Remove dead __migrate_disabled() check sched: migrate disable: Protect cpus_ptr with lock sched/deadline: Reclaim cpuset bandwidth in .migrate_task_rq() sched: migrate_enable: Set state to TASK_RUNNING sched: migrate_enable: Use select_fallback_rq() sched: Lazy migrate_disable processing include/linux/cpu.h | 4 - include/linux/sched.h| 15 ++-- init/init_task.c | 4 + kernel/cpu.c | 97 --- kernel/rcu/tree_plugin.h | 2 +- kernel/sched/core.c | 200 +++ kernel/sched/deadline.c | 67 kernel/sched/sched.h | 4 + lib/smp_processor_id.c | 3 + 9 files changed, 166 insertions(+), 230 deletions(-) -- 1.8.3.1
[RFC PATCH 00/21] x86/sgx: KVM: Add SGX virtualization
This is an "early" RFC series for adding SGX virtualization to KVM. SGX virtualization (more specifically, EPC virtualization) is dependent on the not-yet-merged SGX enabling series and so cannot be considered for inclusion any time soon. The primary goal of this RFC is to get feedback on the overall approach, e.g. code location, uAPI changes, functionality, etc... My hope is to sort out any major issues sooner rather than later, so that if/when the base SGX enabling is merged, virtualization support can quickly follow suit. That being said, nitpicking and bikeshedding is more than welcome :-) This code applies on top of a slightly modified version of v21 of the SGX enabling series[1]. The modifications on top of the SGX series are a few minor bug fixes that are not related to SGX virtualization, but affect code that is moved/modified by this series. The full source for the modified version of v21 can be found at: https://github.com/sean-jc/linux.git under the tag: sgx-v21-ish A corresponding Qemu RFC will (hopefully) follow next week, the Qemu patches need a bit more cleanup... 
[1] https://lkml.kernel.org/r/20190713170804.2340-1-jarkko.sakki...@linux.intel.com Sean Christopherson (21): x86/sgx: Add defines for SGX device minor numbers x86/sgx: Move bus registration and device init to common code x86/sgx: Move provisioning device to common code x86/sgx: Add /dev/sgx/virt_epc device to allocate "raw" EPC for VMs x86/sgx: Expose SGX architectural definitions to the kernel KVM: x86: Add SGX sub-features leaf to reverse CPUID table KVM: x86: Add WARN_ON_ONCE(index!=0) in __do_cpuid_ent KVM: x86: Add kvm_x86_ops hook to short circuit emulation KVM: VMX: Add basic handling of VM-Exit from SGX enclave KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for VMX/SGX KVM: x86: Export kvm_propagate_fault (as kvm_propagate_page_fault) KVM: x86: Define new #PF SGX error code bit x86/sgx: Move the intermediate EINIT helper into the driver x86/sgx: Add helpers to expose ECREATE and EINIT to KVM KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions KVM: VMX: Edd emulation of SGX Launch Control LE hash MSRs KVM: VMX: Add handler for ENCLS[EINIT] to support SGX Launch Control KVM: x86: Invoke kvm_x86_ops->cpuid_update() after kvm_update_cpuid() KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC x86/sgx: Export sgx_set_attribute() for use by KVM KVM: x86: Add capability to grant VM access to privileged SGX attribute Documentation/virtual/kvm/api.txt | 20 ++ arch/x86/Kconfig | 13 + arch/x86/include/asm/kvm_host.h | 8 +- arch/x86/include/asm/sgx.h| 17 + .../cpu/sgx/arch.h => include/asm/sgx_arch.h} | 1 + arch/x86/include/asm/vmx.h| 1 + arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kernel/cpu/sgx/Makefile | 1 + arch/x86/kernel/cpu/sgx/driver/driver.h | 3 +- arch/x86/kernel/cpu/sgx/driver/ioctl.c| 40 ++- arch/x86/kernel/cpu/sgx/driver/main.c | 73 + arch/x86/kernel/cpu/sgx/encl.c| 2 +- arch/x86/kernel/cpu/sgx/encls.h | 2 +- arch/x86/kernel/cpu/sgx/main.c| 141 ++-- arch/x86/kernel/cpu/sgx/sgx.h | 16 +- arch/x86/kernel/cpu/sgx/virt.c| 308 ++ 
arch/x86/kernel/cpu/sgx/virt.h| 14 + arch/x86/kvm/Makefile | 2 + arch/x86/kvm/cpuid.c | 135 ++-- arch/x86/kvm/cpuid.h | 20 ++ arch/x86/kvm/emulate.c| 1 + arch/x86/kvm/mmu.c| 12 - arch/x86/kvm/svm.c| 19 +- arch/x86/kvm/vmx/nested.c | 21 +- arch/x86/kvm/vmx/nested.h | 5 + arch/x86/kvm/vmx/sgx.c| 247 ++ arch/x86/kvm/vmx/sgx.h| 11 + arch/x86/kvm/vmx/vmcs12.c | 1 + arch/x86/kvm/vmx/vmcs12.h | 4 +- arch/x86/kvm/vmx/vmx.c| 251 +- arch/x86/kvm/vmx/vmx.h| 6 + arch/x86/kvm/x86.c| 40 ++- arch/x86/kvm/x86.h| 5 - include/uapi/linux/kvm.h | 1 + tools/testing/selftests/x86/sgx/defines.h | 2 +- 35 files changed, 1234 insertions(+), 210 deletions(-) create mode 100644 arch/x86/include/asm/sgx.h rename arch/x86/{kernel/cpu/sgx/arch.h => include/asm/sgx_arch.h} (99%) create mode 100644 arch/x86/kernel/cpu/sgx/virt.c create mode 100644 arch/x86/kernel/cpu/sgx/virt.h create mode 100644 arch/x86/kvm/vmx/sgx.c create mode 100644 arch/x86/kvm/vmx/sgx.h -- 2.22.0
[RFC PATCH 02/21] x86/sgx: Move bus registration and device init to common code
Move the SGX bus registration and initialization into common code in preparation for adding a virtual EPC device, which will reside outside of the native SGX userspace driver. Signed-off-by: Sean Christopherson --- arch/x86/kernel/cpu/sgx/driver/main.c | 48 + arch/x86/kernel/cpu/sgx/main.c| 50 ++- arch/x86/kernel/cpu/sgx/sgx.h | 4 +++ 3 files changed, 54 insertions(+), 48 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/driver/main.c b/arch/x86/kernel/cpu/sgx/driver/main.c index a2506a49c95a..d62bdc7ed4d9 100644 --- a/arch/x86/kernel/cpu/sgx/driver/main.c +++ b/arch/x86/kernel/cpu/sgx/driver/main.c @@ -158,42 +158,10 @@ const struct file_operations sgx_provision_fops = { .owner = THIS_MODULE, }; -static struct bus_type sgx_bus_type = { - .name = "sgx", -}; - static struct device sgx_encl_dev; static struct cdev sgx_encl_cdev; static struct device sgx_provision_dev; static struct cdev sgx_provision_cdev; -static dev_t sgx_devt; - -static void sgx_dev_release(struct device *dev) -{ -} - -static __init int sgx_dev_init(const char *name, struct device *dev, - struct cdev *cdev, - const struct file_operations *fops, int minor) -{ - int ret; - - device_initialize(dev); - - dev->bus = _bus_type; - dev->devt = MKDEV(MAJOR(sgx_devt), minor); - dev->release = sgx_dev_release; - - ret = dev_set_name(dev, name); - if (ret) { - put_device(dev); - return ret; - } - - cdev_init(cdev, fops); - cdev->owner = THIS_MODULE; - return 0; -} int __init sgx_drv_init(void) { @@ -207,14 +175,6 @@ int __init sgx_drv_init(void) return -ENODEV; } - ret = bus_register(_bus_type); - if (ret) - return ret; - - ret = alloc_chrdev_region(_devt, 0, SGX_MAX_NR_DEVICES, "sgx"); - if (ret < 0) - goto err_bus; - cpuid_count(SGX_CPUID, 0, , , , ); sgx_misc_reserved_mask = ~ebx | SGX_MISC_RESERVED_MASK; sgx_encl_size_max_64 = 1ULL << ((edx >> 8) & 0xFF); @@ -240,7 +200,7 @@ int __init sgx_drv_init(void) ret = sgx_dev_init("sgx/enclave", _encl_dev, _encl_cdev, _encl_fops, SGX_ENCL_DEV_MINOR); if (ret) - 
goto err_chrdev_region; + return ret; ret = sgx_dev_init("sgx/provision", _provision_dev, _provision_cdev, _provision_fops, @@ -277,11 +237,5 @@ int __init sgx_drv_init(void) err_encl_dev: put_device(_encl_dev); -err_chrdev_region: - unregister_chrdev_region(sgx_devt, SGX_MAX_NR_DEVICES); - -err_bus: - bus_unregister(_bus_type); - return ret; } diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index f790a03571c5..edbd465083c7 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -1,6 +1,7 @@ // SPDX-License-Identifier: (GPL-2.0 OR BSD-3-Clause) // Copyright(c) 2016-17 Intel Corporation. +#include #include #include #include @@ -329,6 +330,39 @@ static __init int sgx_page_cache_init(void) return 0; } +static struct bus_type sgx_bus_type = { + .name = "sgx", +}; +static dev_t sgx_devt; + +static void sgx_dev_release(struct device *dev) +{ + +} + +__init int sgx_dev_init(const char *name, struct device *dev, + struct cdev *cdev, const struct file_operations *fops, + int minor) +{ + int ret; + + device_initialize(dev); + + dev->bus = _bus_type; + dev->devt = MKDEV(MAJOR(sgx_devt), minor); + dev->release = sgx_dev_release; + + ret = dev_set_name(dev, name); + if (ret) { + put_device(dev); + return ret; + } + + cdev_init(cdev, fops); + cdev->owner = THIS_MODULE; + return 0; +} + static __init int sgx_init(void) { int ret; @@ -344,12 +378,26 @@ static __init int sgx_init(void) if (ret) goto err_page_cache; - ret = sgx_drv_init(); + ret = bus_register(_bus_type); if (ret) goto err_kthread; + ret = alloc_chrdev_region(_devt, 0, SGX_MAX_NR_DEVICES, "sgx"); + if (ret < 0) + goto err_bus; + + ret = sgx_drv_init(); + if (ret) + goto err_chrdev_region; + return 0; +err_chrdev_region: + unregister_chrdev_region(sgx_devt, SGX_MAX_NR_DEVICES); + +err_bus: + bus_unregister(_bus_type); + err_kthread: kthread_stop(ksgxswapd_tsk); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 
4e2c3ce94f63..85b3674e1d43 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -93,4 +93,8 @@ int sgx_einit(struct sgx_sigstruct *sigstruct, struct
[RFC PATCH 11/21] KVM: x86: Export kvm_propagate_fault (as kvm_propagate_page_fault)
Support for SGX Launch Control requires KVM to trap and execute ENCLS[EINIT] on behalf of the guest. Interception of ENCLS leafs occurs immediately after CPL0 checks, i.e. before any processing of the leaf-specific operands. As such, it's possible that KVM could intercept an EINIT from L2 and encounter an EPT fault while walking L1's EPT tables. Rather than force EINIT through the generic emulator, export kvm_propagate_fault() so that the EINIT handler can inject the proper page fault. Rename the function to kvm_propagate_page_fault() to clarify that it is only for page faults, and WARN if it is invoked with an exception other than PF_VECTOR. Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 1 + arch/x86/kvm/x86.c | 7 +-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 1341d8390ebe..397d755bb353 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1357,6 +1357,7 @@ void kvm_queue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_requeue_exception(struct kvm_vcpu *vcpu, unsigned nr); void kvm_requeue_exception_e(struct kvm_vcpu *vcpu, unsigned nr, u32 error_code); void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); +bool kvm_propagate_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault); int kvm_read_guest_page_mmu(struct kvm_vcpu *vcpu, struct kvm_mmu *mmu, gfn_t gfn, void *data, int offset, int len, u32 access); diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index 2b64bb854571..ec92c5534336 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -587,8 +587,10 @@ void kvm_inject_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) } EXPORT_SYMBOL_GPL(kvm_inject_page_fault); -static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) +bool kvm_propagate_page_fault(struct kvm_vcpu *vcpu, struct x86_exception *fault) { + 
WARN_ON(fault->vector != PF_VECTOR); + if (mmu_is_nested(vcpu) && !fault->nested_page_fault) vcpu->arch.nested_mmu.inject_page_fault(vcpu, fault); else @@ -596,6 +598,7 @@ static bool kvm_propagate_fault(struct kvm_vcpu *vcpu, struct x86_exception *fau return fault->nested_page_fault; } +EXPORT_SYMBOL_GPL(kvm_propagate_page_fault); void kvm_inject_nmi(struct kvm_vcpu *vcpu) { @@ -6089,7 +6092,7 @@ static bool inject_emulated_exception(struct kvm_vcpu *vcpu) { struct x86_emulate_ctxt *ctxt = >arch.emulate_ctxt; if (ctxt->exception.vector == PF_VECTOR) - return kvm_propagate_fault(vcpu, >exception); + return kvm_propagate_page_fault(vcpu, >exception); if (ctxt->exception.error_code_valid) kvm_queue_exception_e(vcpu, ctxt->exception.vector, -- 2.22.0
[RFC PATCH 13/21] x86/sgx: Move the intermediate EINIT helper into the driver
Providing sgx_einit() in the common SGX code was a bit premature. The thought was that the native SGX driver and KVM would be able to use a common EINIT helper, but that may or may not hold true depending on how KVM's implementation shakes out. For example, KVM may want to pass user pointers directly to EINIT in order to avoid copying large amounts of data to in-kernel temp structures. Signed-off-by: Sean Christopherson --- arch/x86/kernel/cpu/sgx/driver/ioctl.c | 21 +++-- arch/x86/kernel/cpu/sgx/main.c | 43 ++ arch/x86/kernel/cpu/sgx/sgx.h | 4 +-- 3 files changed, 30 insertions(+), 38 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/driver/ioctl.c b/arch/x86/kernel/cpu/sgx/driver/ioctl.c index b7aa06920d10..a1cb5f772363 100644 --- a/arch/x86/kernel/cpu/sgx/driver/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/driver/ioctl.c @@ -658,6 +658,23 @@ static int sgx_get_key_hash(const void *modulus, void *hash) return ret; } +static int __sgx_einit(struct sgx_sigstruct *sigstruct, + struct sgx_einittoken *token, struct sgx_epc_page *secs, + u64 *lepubkeyhash) +{ + int ret; + + preempt_disable(); + sgx_update_lepubkeyhash_msrs(lepubkeyhash, false); + ret = __einit(sigstruct, token, sgx_epc_addr(secs)); + if (ret == SGX_INVALID_EINITTOKEN) { + sgx_update_lepubkeyhash_msrs(lepubkeyhash, true); + ret = __einit(sigstruct, token, sgx_epc_addr(secs)); + } + preempt_enable(); + return ret; +} + static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token) { @@ -686,8 +703,8 @@ static int sgx_encl_init(struct sgx_encl *encl, struct sgx_sigstruct *sigstruct, for (i = 0; i < SGX_EINIT_SLEEP_COUNT; i++) { for (j = 0; j < SGX_EINIT_SPIN_COUNT; j++) { - ret = sgx_einit(sigstruct, token, encl->secs.epc_page, - mrsigner); + ret = __sgx_einit(sigstruct, token, + encl->secs.epc_page, mrsigner); if (ret == SGX_UNMASKED_EVENT) continue; else diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 532dd90e09e1..542427c6ae9c 
100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -166,7 +166,15 @@ void sgx_free_page(struct sgx_epc_page *page) WARN(ret > 0, "sgx: EREMOVE returned %d (0x%x)", ret, ret); } -static void sgx_update_lepubkeyhash_msrs(u64 *lepubkeyhash, bool enforce) +/** + * sgx_update_lepubkeyhash_msrs - Write the IA32_SGXLEPUBKEYHASHx MSRs + * @lepubkeyhash: array of desired MSRs values + * @enforce: force WRMSR regardless of cache status + * + * Write the IA32_SGXLEPUBKEYHASHx MSRs according to @lepubkeyhash if the + * last cached value doesn't match the desired value, or if @enforce is %true. + */ +void sgx_update_lepubkeyhash_msrs(u64 *lepubkeyhash, bool enforce) { u64 *cache; int i; @@ -180,39 +188,6 @@ static void sgx_update_lepubkeyhash_msrs(u64 *lepubkeyhash, bool enforce) } } -/** - * sgx_einit - initialize an enclave - * @sigstruct: a pointer a SIGSTRUCT - * @token: a pointer an EINITTOKEN (optional) - * @secs: a pointer a SECS - * @lepubkeyhash: the desired value for IA32_SGXLEPUBKEYHASHx MSRs - * - * Execute ENCLS[EINIT], writing the IA32_SGXLEPUBKEYHASHx MSRs according - * to @lepubkeyhash (if possible and necessary). 
- * - * Return: - * 0 on success, - * -errno or SGX error on failure - */ -int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token, - struct sgx_epc_page *secs, u64 *lepubkeyhash) -{ - int ret; - - if (!boot_cpu_has(X86_FEATURE_SGX_LC)) - return __einit(sigstruct, token, sgx_epc_addr(secs)); - - preempt_disable(); - sgx_update_lepubkeyhash_msrs(lepubkeyhash, false); - ret = __einit(sigstruct, token, sgx_epc_addr(secs)); - if (ret == SGX_INVALID_EINITTOKEN) { - sgx_update_lepubkeyhash_msrs(lepubkeyhash, true); - ret = __einit(sigstruct, token, sgx_epc_addr(secs)); - } - preempt_enable(); - return ret; -} - static __init void sgx_free_epc_section(struct sgx_epc_section *section) { struct sgx_epc_page *page; diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 748b1633d770..3f3311024bd0 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -85,8 +85,8 @@ void sgx_reclaim_pages(void); struct sgx_epc_page *sgx_alloc_page(void *owner, bool reclaim); int __sgx_free_page(struct sgx_epc_page *page); void sgx_free_page(struct sgx_epc_page *page); -int sgx_einit(struct sgx_sigstruct
[RFC PATCH 12/21] KVM: x86: Define new #PF SGX error code bit
Page faults that are signaled by the SGX Enclave Page Cache Map (EPCM), as opposed to the traditional IA32/EPT page tables, set an SGX bit in the error code to indicate that the #PF was induced by SGX. KVM will need to emulate this behavior as part of its trap-and-execute-EINIT scheme needed to virtualize SGX Launch Control, e.g. if EINIT itself faults due to the EPC being zapped by hardware after suspend-resume. Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 397d755bb353..103df8cbdd24 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -201,6 +201,7 @@ enum { #define PFERR_RSVD_BIT 3 #define PFERR_FETCH_BIT 4 #define PFERR_PK_BIT 5 +#define PFERR_SGX_BIT 15 #define PFERR_GUEST_FINAL_BIT 32 #define PFERR_GUEST_PAGE_BIT 33 @@ -210,6 +211,7 @@ enum { #define PFERR_RSVD_MASK (1U << PFERR_RSVD_BIT) #define PFERR_FETCH_MASK (1U << PFERR_FETCH_BIT) #define PFERR_PK_MASK (1U << PFERR_PK_BIT) +#define PFERR_SGX_MASK (1U << PFERR_SGX_BIT) #define PFERR_GUEST_FINAL_MASK (1ULL << PFERR_GUEST_FINAL_BIT) #define PFERR_GUEST_PAGE_MASK (1ULL << PFERR_GUEST_PAGE_BIT) -- 2.22.0
[RFC PATCH 01/21] x86/sgx: Add defines for SGX device minor numbers
Add defines to track the minor numbers for each SGX device in preparation for moving the helper code and provisioning device to the common subsystem, and in preparation for adding a third device, i.e. a virtual EPC device. Signed-off-by: Sean Christopherson --- arch/x86/kernel/cpu/sgx/driver/driver.h | 1 - arch/x86/kernel/cpu/sgx/driver/main.c | 9 + arch/x86/kernel/cpu/sgx/sgx.h | 4 3 files changed, 9 insertions(+), 5 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/driver/driver.h b/arch/x86/kernel/cpu/sgx/driver/driver.h index da60839b133a..6ce18c766a5a 100644 --- a/arch/x86/kernel/cpu/sgx/driver/driver.h +++ b/arch/x86/kernel/cpu/sgx/driver/driver.h @@ -15,7 +15,6 @@ #include "../encls.h" #include "../sgx.h" -#define SGX_DRV_NR_DEVICES 2 #define SGX_EINIT_SPIN_COUNT 20 #define SGX_EINIT_SLEEP_COUNT 50 #define SGX_EINIT_SLEEP_TIME 20 diff --git a/arch/x86/kernel/cpu/sgx/driver/main.c b/arch/x86/kernel/cpu/sgx/driver/main.c index bb7f1932529f..a2506a49c95a 100644 --- a/arch/x86/kernel/cpu/sgx/driver/main.c +++ b/arch/x86/kernel/cpu/sgx/driver/main.c @@ -211,7 +211,7 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = alloc_chrdev_region(_devt, 0, SGX_DRV_NR_DEVICES, "sgx"); + ret = alloc_chrdev_region(_devt, 0, SGX_MAX_NR_DEVICES, "sgx"); if (ret < 0) goto err_bus; @@ -238,12 +238,13 @@ int __init sgx_drv_init(void) } ret = sgx_dev_init("sgx/enclave", _encl_dev, _encl_cdev, - _encl_fops, 0); + _encl_fops, SGX_ENCL_DEV_MINOR); if (ret) goto err_chrdev_region; ret = sgx_dev_init("sgx/provision", _provision_dev, - _provision_cdev, _provision_fops, 1); + _provision_cdev, _provision_fops, + SGX_PROV_DEV_MINOR); if (ret) goto err_encl_dev; @@ -277,7 +278,7 @@ int __init sgx_drv_init(void) put_device(_encl_dev); err_chrdev_region: - unregister_chrdev_region(sgx_devt, SGX_DRV_NR_DEVICES); + unregister_chrdev_region(sgx_devt, SGX_MAX_NR_DEVICES); err_bus: bus_unregister(_bus_type); diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 
c9276d4b6ffe..4e2c3ce94f63 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -89,4 +89,8 @@ void sgx_free_page(struct sgx_epc_page *page); int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token, struct sgx_epc_page *secs, u64 *lepubkeyhash); +#define SGX_ENCL_DEV_MINOR 0 +#define SGX_PROV_DEV_MINOR 1 +#define SGX_MAX_NR_DEVICES 2 + #endif /* _X86_SGX_H */ -- 2.22.0
[RFC PATCH 09/21] KVM: VMX: Add basic handling of VM-Exit from SGX enclave
Intel SGX adds a new CPL3-only execution environment referred to as an "enclave". To protect the secrets of an enclave, the CPU's state is loaded with synthetic data when exiting the enclave (the enclave's state is saved/restored via protected memory), and the RIP is set to a defined exit value. This behavior also applies to VM-Exits from the enclave, e.g. GUEST_RIP may not necessarily reflect the actual RIP that triggered the VMExit. To help a VMM recognize and handle exits from enclaves, SGX adds bits to existing VMCS fields, VM_EXIT_REASON.VMX_EXIT_REASON_FROM_ENCLAVE and GUEST_INTERRUPTIBILITY_INFO.GUEST_INTR_STATE_ENCLAVE_INTR. Define the new architectural bits and add a boolean to struct vcpu_vmx to cache VMX_EXIT_REASON_FROM_ENCLAVE and clear the bit in exit_reason so that checks against exit_reason do not need to account for SGX, e.g. exit_reason == EXIT_REASON_EXCEPTION_NMI continues to work. As for new behavior for VM-Exits from enclaves, KVM is for the most part a passive observer of both bits, e.g. it needs to account for the bits when propagating information to a nested VMM, but otherwise doesn't need to act differently for VMExits from enclaves. The one scenario that is impacted is emulation, which becomes impossible since KVM does not have access to the RIP or instruction stream that triggered the VMExit[2]. This is largely a non-issue as most instructions that might trigger VM-Exit unconditionally #UD in enclaves; for the instructions that may VM-Exit but do not #UD, KVM either never sets the exiting control, e.g. PAUSE_EXITING[1], or sets it if and only if the feature is not exposed to the guest in order to inject a #UD, e.g. RDRAND_EXITING. But, because it is still possible for a guest to trigger emulation, e.g. MMIO, inject a #UD if KVM ever attempts emulation after a VM-Exit from an enclave. This is architecturally accurate for instruction VM-Exits, and for MMIO it's the least bad choice, e.g. it's preferable to killing the VM. 
In practice, only broken or particularly stupid guests should ever encounter this behavior. Add a WARN in skip_emulated_instruction to detect any attempt to modify the guest's RIP during an SGX enclave VM-Exit as all such flows should either be unreachable or must handle exits from enclaves before getting to skip_emulated_instruction. [1] PAUSE_LOOP_EXITING only affects CPL0 and enclaves exist only at CPL3, so we also don't need to worry about that interaction. [2] Impossible for all practical purposes. Not truly impossible since KVM could implement some form of para-virtualization scheme. Signed-off-by: Sean Christopherson --- arch/x86/include/asm/vmx.h | 1 + arch/x86/include/uapi/asm/vmx.h | 1 + arch/x86/kvm/vmx/nested.c | 2 ++ arch/x86/kvm/vmx/vmx.c | 42 ++--- arch/x86/kvm/vmx/vmx.h | 3 +++ 5 files changed, 46 insertions(+), 3 deletions(-) diff --git a/arch/x86/include/asm/vmx.h b/arch/x86/include/asm/vmx.h index a39136b0d509..a62ac47d2006 100644 --- a/arch/x86/include/asm/vmx.h +++ b/arch/x86/include/asm/vmx.h @@ -364,6 +364,7 @@ enum vmcs_field { #define GUEST_INTR_STATE_MOV_SS0x0002 #define GUEST_INTR_STATE_SMI 0x0004 #define GUEST_INTR_STATE_NMI 0x0008 +#define GUEST_INTR_STATE_ENCLAVE_INTR 0x0010 /* GUEST_ACTIVITY_STATE flags */ #define GUEST_ACTIVITY_ACTIVE 0 diff --git a/arch/x86/include/uapi/asm/vmx.h b/arch/x86/include/uapi/asm/vmx.h index d213ec5c3766..501a35bd4cc7 100644 --- a/arch/x86/include/uapi/asm/vmx.h +++ b/arch/x86/include/uapi/asm/vmx.h @@ -27,6 +27,7 @@ #define VMX_EXIT_REASONS_FAILED_VMENTRY 0x8000 +#define VMX_EXIT_REASON_FROM_ENCLAVE 0x0800 #define EXIT_REASON_EXCEPTION_NMI 0 #define EXIT_REASON_EXTERNAL_INTERRUPT 1 diff --git a/arch/x86/kvm/vmx/nested.c b/arch/x86/kvm/vmx/nested.c index 46af3a5e9209..fef4fb3e1aaa 100644 --- a/arch/x86/kvm/vmx/nested.c +++ b/arch/x86/kvm/vmx/nested.c @@ -3523,6 +3523,8 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12, /* update exit information fields: */ vmcs12->vm_exit_reason = 
exit_reason; + if (to_vmx(vcpu)->sgx_enclave_exit) + vmcs12->vm_exit_reason |= VMX_EXIT_REASON_FROM_ENCLAVE; vmcs12->exit_qualification = exit_qualification; vmcs12->vm_exit_intr_info = exit_intr_info; diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index f48fc990ca6d..abcd2f7a36f5 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1460,16 +1460,40 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data) static bool vmx_is_emulatable(struct kvm_vcpu *vcpu, void *insn, int insn_len) { + if (unlikely(to_vmx(vcpu)->sgx_enclave_exit)) { + kvm_queue_exception(vcpu, UD_VECTOR); + return false; + } return true; } static void skip_emulated_instruction(struct kvm_vcpu *vcpu) { +
[RFC PATCH 04/21] x86/sgx: Add /dev/sgx/virt_epc device to allocate "raw" EPC for VMs
Add an SGX device to enable userspace to allocate EPC without an associated enclave. The intended and only known use case for direct EPC allocation is to expose EPC to a KVM guest, hence the virt_epc moniker, virt.{c,h} files and INTEL_SGX_VIRTUALIZATION Kconfig. Although KVM is the end consumer of EPC, and will need hooks into the virtual EPC management if oversubscription of EPC for guest is ever supported (see below), implement direct access to EPC in the SGX subsystem instead of in KVM. Doing so has two major advantages: - Does not require changes to KVM's uAPI, e.g. EPC gets handled as just another memory backend for guests. - EPC management is wholly contained in the SGX subsystem, e.g. SGX does not have to export any symbols, changes to reclaim flows don't need to be routed through KVM, SGX's dirty laundry doesn't have to get aired out for the world to see, and so on and so forth. Oversubscription of EPC for KVM guests is not currently supported. Due to the complications of handling reclaim conflicts between guest and host, KVM EPC oversubscription is expected to be at least an order of magnitude more complex than basic support for SGX virtualization. Signed-off-by: Sean Christopherson --- arch/x86/Kconfig | 10 ++ arch/x86/kernel/cpu/sgx/Makefile | 1 + arch/x86/kernel/cpu/sgx/main.c | 3 + arch/x86/kernel/cpu/sgx/sgx.h| 3 +- arch/x86/kernel/cpu/sgx/virt.c | 253 +++ arch/x86/kernel/cpu/sgx/virt.h | 14 ++ 6 files changed, 283 insertions(+), 1 deletion(-) create mode 100644 arch/x86/kernel/cpu/sgx/virt.c create mode 100644 arch/x86/kernel/cpu/sgx/virt.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index 74ccb1bdea16..c1bdb9f85928 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1961,6 +1961,16 @@ config INTEL_SGX_DRIVER If unsure, say N. 
+config INTEL_SGX_VIRTUALIZATION + bool "Intel SGX Virtualization" + depends on INTEL_SGX && KVM_INTEL + help + Enabling support for SGX virtualization enables userspace to allocate + "raw" EPC for the purpose of exposing EPC to a KVM guest, i.e. a + virtual machine, via a device node (/dev/sgx/virt_epc by default). + + If unsure, say N. + config EFI bool "EFI runtime service support" depends on ACPI diff --git a/arch/x86/kernel/cpu/sgx/Makefile b/arch/x86/kernel/cpu/sgx/Makefile index e5d1e862969c..559fd0f9be50 100644 --- a/arch/x86/kernel/cpu/sgx/Makefile +++ b/arch/x86/kernel/cpu/sgx/Makefile @@ -1,2 +1,3 @@ obj-y += encl.o encls.o main.o reclaim.o obj-$(CONFIG_INTEL_SGX_DRIVER) += driver/ +obj-$(CONFIG_INTEL_SGX_VIRTUALIZATION) += virt.o diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 9f4473597620..ead827371139 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -14,6 +14,7 @@ #include "arch.h" #include "encls.h" #include "sgx.h" +#include "virt.h" struct sgx_epc_section sgx_epc_sections[SGX_MAX_EPC_SECTIONS]; int sgx_nr_epc_sections; @@ -422,7 +423,9 @@ static __init int sgx_init(void) if (ret) goto err_provision_dev; + /* Success if the native *or* virtual driver initialized cleanly. */ ret = sgx_drv_init(); + ret = sgx_virt_epc_init() ? 
ret : 0; if (ret) goto err_provision_cdev; diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index a0af8849c7c3..16cdb935aaa7 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -91,7 +91,8 @@ int sgx_einit(struct sgx_sigstruct *sigstruct, struct sgx_einittoken *token, #define SGX_ENCL_DEV_MINOR 0 #define SGX_PROV_DEV_MINOR 1 -#define SGX_MAX_NR_DEVICES 2 +#define SGX_VIRT_DEV_MINOR 2 +#define SGX_MAX_NR_DEVICES 3 __init int sgx_dev_init(const char *name, struct device *dev, struct cdev *cdev, const struct file_operations *fops, diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c new file mode 100644 index ..79ee5917a4fc --- /dev/null +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -0,0 +1,253 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include + +#include "encls.h" +#include "sgx.h" +#include "virt.h" + +struct sgx_virt_epc_page { + struct sgx_epc_page *epc_page; +}; + +struct sgx_virt_epc { + struct radix_tree_root page_tree; + struct rw_semaphore lock; +}; + +static inline unsigned long sgx_virt_epc_calc_index(struct vm_area_struct *vma, + unsigned long addr) +{ + return vma->vm_pgoff + PFN_DOWN(addr - vma->vm_start); +} + +static struct sgx_virt_epc_page *__sgx_virt_epc_fault(struct sgx_virt_epc *epc, + struct vm_area_struct *vma, +
[RFC PATCH 08/21] KVM: x86: Add kvm_x86_ops hook to short circuit emulation
Similar to the existing AMD #NPF case where emulation of the current instruction is not possible due to lack of information, virtualization of Intel SGX will introduce a scenario where emulation is not possible due to the VMExit occurring in an SGX enclave. And again similar to the AMD case, emulation can be initiated by kvm_mmu_page_fault(), i.e. outside of the control of the vendor-specific code. While the cause and architecturally visible behavior of the two cases is different, e.g. Intel SGX will inject a #UD whereas AMD #NPF is a clean resume or complete shutdown, the impact on the common emulation code is identical: KVM must stop emulation immediately and resume the guest. Replace the existing need_emulation_on_page_fault() with a more generic is_emulatable() kvm_x86_ops callback, which is called unconditionally by x86_emulate_instruction(). Query is_emulatable() in handle_ud() as well so that the force_emulation_prefix code doesn't incorrectly modify RIP before calling emulate_instruction() in the absurdly unlikely scenario that we encounter forced emulation in conjunction with "do not emulate". Do this for both Intel and AMD so that any future changes to AMD's emulation logic take effect as expected for handle_ud(). 
Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 2 +- arch/x86/kvm/mmu.c | 12 arch/x86/kvm/svm.c | 19 +-- arch/x86/kvm/vmx/vmx.c | 11 +-- arch/x86/kvm/x86.c | 9 - 5 files changed, 31 insertions(+), 22 deletions(-) diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 26d1eb83f72a..1341d8390ebe 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -1198,7 +1198,7 @@ struct kvm_x86_ops { uint16_t *vmcs_version); uint16_t (*nested_get_evmcs_version)(struct kvm_vcpu *vcpu); - bool (*need_emulation_on_page_fault)(struct kvm_vcpu *vcpu); + bool (*is_emulatable)(struct kvm_vcpu *vcpu, void *insn, int insn_len); }; struct kvm_arch_async_pf { diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c index 98f6e4f88b04..bf6952f8f330 100644 --- a/arch/x86/kvm/mmu.c +++ b/arch/x86/kvm/mmu.c @@ -5412,18 +5412,6 @@ int kvm_mmu_page_fault(struct kvm_vcpu *vcpu, gva_t cr2, u64 error_code, if (!mmio_info_in_cache(vcpu, cr2, direct) && !is_guest_mode(vcpu)) emulation_type = EMULTYPE_ALLOW_RETRY; emulate: - /* -* On AMD platforms, under certain conditions insn_len may be zero on #NPF. -* This can happen if a guest gets a page-fault on data access but the HW -* table walker is not able to read the instruction page (e.g instruction -* page is not present in memory). In those cases we simply restart the -* guest, with the exception of AMD Erratum 1096 which is unrecoverable. 
-*/ - if (unlikely(insn && !insn_len)) { - if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu)) - return 1; - } - er = x86_emulate_instruction(vcpu, cr2, emulation_type, insn, insn_len); switch (er) { diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c index 48c865a4e5dd..0fb8b60eb136 100644 --- a/arch/x86/kvm/svm.c +++ b/arch/x86/kvm/svm.c @@ -7115,10 +7115,25 @@ static int nested_enable_evmcs(struct kvm_vcpu *vcpu, return -ENODEV; } -static bool svm_need_emulation_on_page_fault(struct kvm_vcpu *vcpu) +static bool svm_is_emulatable(struct kvm_vcpu *vcpu, void *insn, int insn_len) { bool is_user, smap; + if (likely(!insn || insn_len)) + return true; + + /* +* Under certain conditions insn_len may be zero on #NPF. This can +* happen if a guest gets a page-fault on data access but the HW table +* walker is not able to read the instruction page (e.g instruction +* page is not present in memory). In those cases we simply restart the +* guest, with the exception of AMD Erratum 1096 which is unrecoverable. +*/ + if (unlikely(insn && !insn_len)) { + if (!kvm_x86_ops->need_emulation_on_page_fault(vcpu)) + return 1; + } + is_user = svm_get_cpl(vcpu) == 3; smap = !kvm_read_cr4_bits(vcpu, X86_CR4_SMAP); @@ -7279,7 +7294,7 @@ static struct kvm_x86_ops svm_x86_ops __ro_after_init = { .nested_enable_evmcs = nested_enable_evmcs, .nested_get_evmcs_version = nested_get_evmcs_version, - .need_emulation_on_page_fault = svm_need_emulation_on_page_fault, + .is_emulatable = svm_is_emulatable, }; static int __init svm_init(void) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index d98eac371c0a..f48fc990ca6d 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -1458,6 +1458,10 @@ static int vmx_rtit_ctl_check(struct kvm_vcpu *vcpu, u64 data)
[RFC PATCH 10/21] KVM: x86: Export kvm_mmu_gva_to_gpa_{read,write}() for VMX/SGX
Support for SGX Launch Control requires KVM to trap and execute ENCLS[ECREATE] and ENCLS[EINIT] on behalf of the guest, which requires obtaining the GPA of a Secure Enclave Control Structure (SECS) in order to get its corresponding HVA. Because the SECS must reside in the Enclave Page Cache (EPC), copying the SECS's data to a host-controlled buffer via existing exported helpers is not a viable option as the EPC is not readable or writable by the kernel. Translating GVA->HVA for non-EPC pages is also desirable, as passing user pointers directly to ECREATE and EINIT avoids having to copy pages worth of data into the kernel. Signed-off-by: Sean Christopherson --- arch/x86/kvm/x86.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index afcc01a59421..2b64bb854571 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -5089,6 +5089,7 @@ gpa_t kvm_mmu_gva_to_gpa_read(struct kvm_vcpu *vcpu, gva_t gva, u32 access = (kvm_x86_ops->get_cpl(vcpu) == 3) ? PFERR_USER_MASK : 0; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_read); gpa_t kvm_mmu_gva_to_gpa_fetch(struct kvm_vcpu *vcpu, gva_t gva, struct x86_exception *exception) @@ -5105,6 +5106,7 @@ gpa_t kvm_mmu_gva_to_gpa_write(struct kvm_vcpu *vcpu, gva_t gva, access |= PFERR_WRITE_MASK; return vcpu->arch.walk_mmu->gva_to_gpa(vcpu, gva, access, exception); } +EXPORT_SYMBOL_GPL(kvm_mmu_gva_to_gpa_write); /* uses this to access any guest's mapped memory without checking CPL */ gpa_t kvm_mmu_gva_to_gpa_system(struct kvm_vcpu *vcpu, gva_t gva, -- 2.22.0
[RFC PATCH 16/21] KVM: VMX: Add emulation of SGX Launch Control LE hash MSRs
SGX Launch Control (LC) modifies the behavior of ENCLS[EINIT] to query a set of user-controllable MSRs (Launch Enclave, a.k.a. LE, Hash MSRs) when verifying the key used to sign an enclave. On CPUs without LC support, the public key hash of allowed LEs is hardwired into the CPU to an Intel controlled key (the Intel key is also the reset value of the LE hash MSRs). Track the guest's desired hash and stuff it into hardware when executing EINIT on behalf of the guest (in a future patch). Note, KVM allows writes to the LE hash MSRs if IA32_FEATURE_CONTROL is unlocked. This is technically not arch behavior, but it's roughly equivalent to the arch behavior of the MSRs being writable prior to activating SGX[1]. Emulating SGX activation is feasible, but adds no tangible benefits and would just create extra work for KVM and guest firmware. [1] SGX related bits in IA32_FEATURE_CONTROL cannot be set until SGX is activated, e.g. by firmware. SGX activation is triggered by setting bit 0 in MSR 0x7a. Until SGX is activated, the LE hash MSRs are writable, e.g. to allow firmware to lock down the LE root key with a non-Intel value. Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/vmx.c | 42 ++ arch/x86/kvm/vmx/vmx.h | 2 ++ 2 files changed, 44 insertions(+) diff --git a/arch/x86/kvm/vmx/vmx.c b/arch/x86/kvm/vmx/vmx.c index abcd2f7a36f5..819c47fee157 100644 --- a/arch/x86/kvm/vmx/vmx.c +++ b/arch/x86/kvm/vmx/vmx.c @@ -390,6 +390,8 @@ static const struct kvm_vmx_segment_field { u64 host_efer; +static u64 sgx_pubkey_hash[4] __ro_after_init; + /* * Though SYSCALL is only supported in 64-bit mode on Intel CPUs, kvm * will emulate SYSCALL in legacy mode if the vendor string in guest @@ -1740,6 +1742,13 @@ static int vmx_get_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) case MSR_IA32_FEATURE_CONTROL: msr_info->data = vmx->msr_ia32_feature_control; break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... 
MSR_IA32_SGXLEPUBKEYHASH3: + if (!msr_info->host_initiated && + !guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC)) + return 1; + msr_info->data = to_vmx(vcpu)->msr_ia32_sgxlepubkeyhash + [msr_info->index - MSR_IA32_SGXLEPUBKEYHASH0]; + break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!nested_vmx_allowed(vcpu)) return 1; @@ -1953,6 +1962,15 @@ static int vmx_set_msr(struct kvm_vcpu *vcpu, struct msr_data *msr_info) if (msr_info->host_initiated && data == 0) vmx_leave_nested(vcpu); break; + case MSR_IA32_SGXLEPUBKEYHASH0 ... MSR_IA32_SGXLEPUBKEYHASH3: + if (!msr_info->host_initiated && + (!guest_cpuid_has(vcpu, X86_FEATURE_SGX_LC) || + ((vmx->msr_ia32_feature_control & FEATURE_CONTROL_LOCKED) && + !(vmx->msr_ia32_feature_control & FEATURE_CONTROL_SGX_LE_WR + return 1; + vmx->msr_ia32_sgxlepubkeyhash + [msr_index - MSR_IA32_SGXLEPUBKEYHASH0] = data; + break; case MSR_IA32_VMX_BASIC ... MSR_IA32_VMX_VMFUNC: if (!msr_info->host_initiated) return 1; /* they are read-only */ @@ -6698,6 +6716,9 @@ static struct kvm_vcpu *vmx_create_vcpu(struct kvm *kvm, unsigned int id) else memset(>nested.msrs, 0, sizeof(vmx->nested.msrs)); + memcpy(vmx->msr_ia32_sgxlepubkeyhash, sgx_pubkey_hash, + sizeof(sgx_pubkey_hash)); + vmx->nested.posted_intr_nv = -1; vmx->nested.current_vmptr = -1ull; @@ -7588,6 +7609,27 @@ static __init int hardware_setup(void) if (!enable_ept || !cpu_has_vmx_intel_pt()) pt_mode = PT_MODE_SYSTEM; + /* +* Use Intel's default value for Skylake hardware if Launch Control is +* not supported, i.e. Intel's hash is hardcoded into silicon, or if +* Launch Control is supported and enabled, i.e. mimic the reset value +* and let the guest write the MSRs at will. If Launch Control is +* supported but disabled, then we have to use the current MSR values +* as the MSRs the hash MSRs exist but are locked and not writable. 
+*/ + if (boot_cpu_has(X86_FEATURE_SGX_LC) || + rdmsrl_safe(MSR_IA32_SGXLEPUBKEYHASH0, _pubkey_hash[0])) { + sgx_pubkey_hash[0] = 0xa6053e051270b7acULL; + sgx_pubkey_hash[1] = 0x6cfbe8ba8b3b413dULL; + sgx_pubkey_hash[2] = 0xc4916d99f2b3735dULL; + sgx_pubkey_hash[3] = 0xd4f8c05909f9bb3bULL; + } else { + /* MSR_IA32_SGXLEPUBKEYHASH0 is read above */ + rdmsrl(MSR_IA32_SGXLEPUBKEYHASH1, sgx_pubkey_hash[1]); +
[RFC PATCH 17/21] KVM: VMX: Add handler for ENCLS[EINIT] to support SGX Launch Control
SGX Launch Control (LC) modifies the behavior of ENCLS[EINIT] to query a set of user-controllable MSRs (Launch Enclave, a.k.a. LE, Hash MSRs) when verifying the key used to sign an enclave. On CPUs without LC support, the public key hash of allowed LEs is hardwired into the CPU to an Intel controlled key (the Intel key is also the reset value of the LE hash MSRs). When LC is enabled in the host, EINIT must be intercepted and executed in the host using the guest's LE hash MSR value, even if the guest's values are fixed to hardware default values. The MSRs are not switched on VM-Enter/VM-Exit as writing the MSRs is extraordinarily expensive, e.g. each WRMSR is 4x slower than a regular WRMSR and on par with a full VM-Enter -> VM-Exit transition. Furthermore, the MSRs aren't allowed in the hardware-supported lists, i.e. they would need to be manually read and written. On the other hand, EINIT takes tens of thousands of cycles to execute (it's so slow that it's interruptible), i.e. the ~1k cycles of overhead to trap-and-execute EINIT is unlikely to be noticed by the guest, let alone impact the overall performance of SGX. Actual usage of the handler will be added in a future patch, i.e. when SGX virtualization is fully enabled. 
Signed-off-by: Sean Christopherson --- arch/x86/kvm/vmx/sgx.c | 24 1 file changed, 24 insertions(+) diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c index 5b08e7dcc3a3..2bcfa3b6c75e 100644 --- a/arch/x86/kvm/vmx/sgx.c +++ b/arch/x86/kvm/vmx/sgx.c @@ -221,3 +221,27 @@ int handle_encls_ecreate(struct kvm_vcpu *vcpu) return sgx_encls_postamble(vcpu, ret, trapnr, secs_gva); } + +int handle_encls_einit(struct kvm_vcpu *vcpu) +{ + unsigned long sig_hva, secs_hva, token_hva; + struct vcpu_vmx *vmx = to_vmx(vcpu); + gva_t sig_gva, secs_gva, token_gva; + int ret, trapnr; + + if (sgx_get_encls_gva(vcpu, kvm_rbx_read(vcpu), 1808, 4096, _gva) || + sgx_get_encls_gva(vcpu, kvm_rcx_read(vcpu), 4096, 4096, _gva) || + sgx_get_encls_gva(vcpu, kvm_rdx_read(vcpu), 304, 512, _gva)) + return 1; + + if (sgx_gva_to_hva(vcpu, sig_gva, false, _hva) || + sgx_gva_to_hva(vcpu, secs_gva, true, _hva) || + sgx_gva_to_hva(vcpu, token_gva, false, _hva)) + return 1; + + ret = sgx_einit((void __user *)sig_hva, (void __user *)token_hva, + (void __user *)secs_hva, vmx->msr_ia32_sgxlepubkeyhash, + ); + + return sgx_encls_postamble(vcpu, ret, trapnr, secs_hva); +} -- 2.22.0
[RFC PATCH 05/21] x86/sgx: Expose SGX architectural definitions to the kernel
KVM will use many of the architectural constants and structs to virtualize SGX. Signed-off-by: Sean Christopherson --- arch/x86/{kernel/cpu/sgx/arch.h => include/asm/sgx_arch.h} | 0 arch/x86/kernel/cpu/sgx/driver/driver.h| 2 +- arch/x86/kernel/cpu/sgx/encl.c | 2 +- arch/x86/kernel/cpu/sgx/encls.h| 2 +- arch/x86/kernel/cpu/sgx/main.c | 2 +- arch/x86/kernel/cpu/sgx/sgx.h | 3 +-- tools/testing/selftests/x86/sgx/defines.h | 2 +- 7 files changed, 6 insertions(+), 7 deletions(-) rename arch/x86/{kernel/cpu/sgx/arch.h => include/asm/sgx_arch.h} (100%) diff --git a/arch/x86/kernel/cpu/sgx/arch.h b/arch/x86/include/asm/sgx_arch.h similarity index 100% rename from arch/x86/kernel/cpu/sgx/arch.h rename to arch/x86/include/asm/sgx_arch.h diff --git a/arch/x86/kernel/cpu/sgx/driver/driver.h b/arch/x86/kernel/cpu/sgx/driver/driver.h index 6ce18c766a5a..4dc133f3c186 100644 --- a/arch/x86/kernel/cpu/sgx/driver/driver.h +++ b/arch/x86/kernel/cpu/sgx/driver/driver.h @@ -10,7 +10,7 @@ #include #include #include -#include "../arch.h" +#include #include "../encl.h" #include "../encls.h" #include "../sgx.h" diff --git a/arch/x86/kernel/cpu/sgx/encl.c b/arch/x86/kernel/cpu/sgx/encl.c index 836c55d4352d..8549fd95f02d 100644 --- a/arch/x86/kernel/cpu/sgx/encl.c +++ b/arch/x86/kernel/cpu/sgx/encl.c @@ -7,7 +7,7 @@ #include #include #include -#include "arch.h" +#include #include "encl.h" #include "encls.h" #include "sgx.h" diff --git a/arch/x86/kernel/cpu/sgx/encls.h b/arch/x86/kernel/cpu/sgx/encls.h index aea3b9d09936..1b49c7419767 100644 --- a/arch/x86/kernel/cpu/sgx/encls.h +++ b/arch/x86/kernel/cpu/sgx/encls.h @@ -8,7 +8,7 @@ #include #include #include -#include "arch.h" +#include /** * ENCLS_FAULT_FLAG - flag signifying an ENCLS return code is a trapnr diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index ead827371139..532dd90e09e1 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -10,8 +10,8 @@ #include #include #include 
+#include #include "driver/driver.h" -#include "arch.h" #include "encls.h" #include "sgx.h" #include "virt.h" diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 16cdb935aaa7..748b1633d770 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -8,10 +8,9 @@ #include #include #include +#include #include -#include "arch.h" - struct sgx_epc_page { unsigned long desc; struct sgx_encl_page *owner; diff --git a/tools/testing/selftests/x86/sgx/defines.h b/tools/testing/selftests/x86/sgx/defines.h index 3ff73a9d9b93..ebc4c6cf57c4 100644 --- a/tools/testing/selftests/x86/sgx/defines.h +++ b/tools/testing/selftests/x86/sgx/defines.h @@ -33,7 +33,7 @@ typedef uint64_t u64; (((~0ULL) - (1ULL << (l)) + 1) & \ (~0ULL >> (BITS_PER_LONG_LONG - 1 - (h -#include "../../../../../arch/x86/kernel/cpu/sgx/arch.h" +#include "../../../../../arch/x86/include/asm/sgx_arch.h" #include "../../../../../arch/x86/include/uapi/asm/sgx.h" #endif /* TYPES_H */ -- 2.22.0
[RFC PATCH 21/21] KVM: x86: Add capability to grant VM access to privileged SGX attribute
The SGX subsystem restricts access to a subset of enclave attributes to provide additional security for an uncompromised kernel, e.g. to prevent malware from using the PROVISIONKEY to ensure its nodes are running inside a genuine SGX enclave and/or to obtain a stable fingerprint. To prevent userspace from circumventing such restrictions by running an enclave in a VM, KVM restricts guest access to privileged attributes by default. Add a capability, KVM_CAP_SGX_ATTRIBUTE, that can be used by userspace to grant a VM access to a privileged attribute, with args[0] holding a file handle to a valid SGX attribute file corresponding to an attribute that is restricted by KVM (currently only PROVISIONKEY). Cc: Andy Lutomirski Signed-off-by: Sean Christopherson --- Documentation/virtual/kvm/api.txt | 20 arch/x86/kvm/cpuid.c | 2 +- arch/x86/kvm/x86.c| 22 ++ include/uapi/linux/kvm.h | 1 + 4 files changed, 44 insertions(+), 1 deletion(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index 383b292966fa..b1c0ff4e9224 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -5013,6 +5013,26 @@ it hard or impossible to use it correctly. The availability of KVM_CAP_MANUAL_DIRTY_LOG_PROTECT2 signals that those bugs are fixed. Userspace should not try to use KVM_CAP_MANUAL_DIRTY_LOG_PROTECT. +7.19 KVM_CAP_SGX_ATTRIBUTE +​ +Architectures: x86 +Parameters: args[0] is a file handle of a SGX attribute file in securityfs +Returns: 0 on success, -EINVAL if the file handle is invalid or if a requested +attribute is not supported by KVM. + +The SGX subsystem restricts access to a subset of enclave attributes, e.g. the +PROVISIONKEY, to provide additional security for an uncompromised kernel, e.g. +to prevent malware from using the PROVISIONKEY to ensure its nodes are running +inside a genuine SGX enclave and/or to obtain a stable system fingerprint. 
+ +To prevent userspace from circumventing such restrictions by running an enclave +in a VM, KVM prevents access to privileged attributes by default. Userspace +can use KVM_CAP_SGX_ATTRIBUTE to grant a VM access to a privileged attribute. +args[0] must hold a file handle to a valid SGX attribute file corresponding to +an attribute that is supported/restricted by KVM (currently only PROVISIONKEY). + +See Documentation/x86/sgx/2.Kernel-internals.rst for more details. + 8. Other capabilities. -- diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 73a0326a1968..73af09edb2fa 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -439,7 +439,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, /* cpuid 12.1.eax*/ const u32 kvm_cpuid_12_1_eax_sgx_features = - SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | 0 /* PROVISIONKEY */ | + SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | SGX_ATTR_PROVISIONKEY | SGX_ATTR_EINITTOKENKEY | SGX_ATTR_KSS; /* cpuid 12.1.ebx*/ diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c index ec92c5534336..9144909d4a8e 100644 --- a/arch/x86/kvm/x86.c +++ b/arch/x86/kvm/x86.c @@ -67,6 +67,8 @@ #include #include #include +#include +#include #include #define CREATE_TRACE_POINTS @@ -3090,6 +3092,9 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext) case KVM_CAP_GET_MSR_FEATURES: case KVM_CAP_MSR_PLATFORM_INFO: case KVM_CAP_EXCEPTION_PAYLOAD: +#ifdef CONFIG_INTEL_SGX_VIRTUALIZATION + case KVM_CAP_SGX_ATTRIBUTE: +#endif r = 1; break; case KVM_CAP_SYNC_REGS: @@ -4626,6 +4631,23 @@ int kvm_vm_ioctl_enable_cap(struct kvm *kvm, kvm->arch.exception_payload_enabled = cap->args[0]; r = 0; break; +#ifdef CONFIG_INTEL_SGX_VIRTUALIZATION + case KVM_CAP_SGX_ATTRIBUTE: { + u64 allowed_attributes = 0; + + r = sgx_set_attribute(_attributes, cap->args[0]); + if (r) + break; + + /* KVM only supports the PROVISIONKEY privileged attribute. 
*/ + if ((allowed_attributes & SGX_ATTR_PROVISIONKEY) && + !(allowed_attributes & ~SGX_ATTR_PROVISIONKEY)) + kvm->arch.sgx_provisioning_allowed = true; + else + r = -EINVAL; + break; + } +#endif default: r = -EINVAL; break; diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 2fe12b40d503..b16708c2b6c9 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -993,6 +993,7 @@ struct kvm_ppc_resize_hpt { #define KVM_CAP_ARM_SVE 170 #define KVM_CAP_ARM_PTRAUTH_ADDRESS 171 #define KVM_CAP_ARM_PTRAUTH_GENERIC 172 +#define KVM_CAP_SGX_ATTRIBUTE 200 #ifdef
[RFC PATCH 20/21] x86/sgx: Export sgx_set_attribute() for use by KVM
To prevent userspace from circumventing access to the PROVISIONKEY by running an enclave in a VM, KVM will deny access to the PROVISIONKEY unless userspace proves to KVM that it is allowed to access the key. Export sgx_set_attribute() so that it may be used by KVM to verify an SGX attribute file. Signed-off-by: Sean Christopherson --- arch/x86/include/asm/sgx.h | 2 ++ arch/x86/kernel/cpu/sgx/driver/ioctl.c | 1 + arch/x86/kernel/cpu/sgx/main.c | 1 + arch/x86/kernel/cpu/sgx/sgx.h | 1 - 4 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h index f0f0176b8e2f..65c9417d3a80 100644 --- a/arch/x86/include/asm/sgx.h +++ b/arch/x86/include/asm/sgx.h @@ -6,6 +6,8 @@ struct sgx_pageinfo; +int sgx_set_attribute(u64 *allowed_attributes, unsigned int attribute_fd); + #if IS_ENABLED(CONFIG_KVM_INTEL) int sgx_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, int *trapnr); int sgx_einit(void __user *sigstruct, void __user *token, diff --git a/arch/x86/kernel/cpu/sgx/driver/ioctl.c b/arch/x86/kernel/cpu/sgx/driver/ioctl.c index a1cb5f772363..1b7a05cd9d02 100644 --- a/arch/x86/kernel/cpu/sgx/driver/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/driver/ioctl.c @@ -2,6 +2,7 @@ // Copyright(c) 2016-19 Intel Corporation. 
#include +#include #include #include #include diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index 542427c6ae9c..68e5c704378a 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -336,6 +336,7 @@ int sgx_set_attribute(u64 *allowed_attributes, unsigned int attribute_fd) *allowed_attributes |= SGX_ATTR_PROVISIONKEY; return 0; } +EXPORT_SYMBOL_GPL(sgx_set_attribute); static void sgx_dev_release(struct device *dev) { diff --git a/arch/x86/kernel/cpu/sgx/sgx.h b/arch/x86/kernel/cpu/sgx/sgx.h index 3f3311024bd0..fab12cc0e7c5 100644 --- a/arch/x86/kernel/cpu/sgx/sgx.h +++ b/arch/x86/kernel/cpu/sgx/sgx.h @@ -96,6 +96,5 @@ void sgx_update_lepubkeyhash_msrs(u64 *lepubkeyhash, bool enforce); __init int sgx_dev_init(const char *name, struct device *dev, struct cdev *cdev, const struct file_operations *fops, int minor); -int sgx_set_attribute(u64 *allowed_attributes, unsigned int attribute_fd); #endif /* _X86_SGX_H */ -- 2.22.0
[RFC PATCH 19/21] KVM: VMX: Enable SGX virtualization for SGX1, SGX2 and LC
SGX adds a basic support bit to CPUID(7, 0), and enumerates SGX capabilities, e.g. EPC info, ENCLS leafs, etc..., in CPUID(0x12, *). All SGX1 and SGX2 ENCLS leafs (supported in hardware) can be exposed to the guest unconditionally. All other ENCLS leafs (currently the ENCLS_C leafs) and all ENCLV leafs currently cannot be exposed to the guest. Flexible Launch Control, a.k.a. SGX LC, allows software to control the key that is used to verify the signer of an enclave. Because SGX LC impacts guest operation even if it's not exposed to the guest, i.e. EINIT is affected by hardware's LE hash MSRs, SGX cannot be exposed to the guest if the host supports LC without explicit LC support in KVM. In other words, LC support is required to run on platforms with LC enabled in the host, thus making exposure of SGX LC to the guest a formality. Access to the provision key is not supported in this patch. Access to the provision key is controlled via securityfs, a future patch will plumb in the ability for the userspace hypervisor to grant a VM access to the provision key by passing in an appropriate file descriptor. 
Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 72 +- arch/x86/kvm/vmx/nested.c | 19 - arch/x86/kvm/vmx/nested.h | 5 ++ arch/x86/kvm/vmx/sgx.h| 11 +++ arch/x86/kvm/vmx/vmcs12.c | 1 + arch/x86/kvm/vmx/vmcs12.h | 4 +- arch/x86/kvm/vmx/vmx.c| 156 -- arch/x86/kvm/vmx/vmx.h| 1 + 8 files changed, 254 insertions(+), 15 deletions(-) create mode 100644 arch/x86/kvm/vmx/sgx.h diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 4c235af5318c..73a0326a1968 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -18,6 +18,7 @@ #include #include #include +#include #include "cpuid.h" #include "lapic.h" #include "mmu.h" @@ -117,6 +118,21 @@ int kvm_update_cpuid(struct kvm_vcpu *vcpu) if (best && (best->eax & (F(XSAVES) | F(XSAVEC best->ebx = xstate_required_size(vcpu->arch.xcr0, true); + /* +* Bits 127:0 of the allowed SECS.ATTRIBUTES (CPUID.0x12.0x1) enumerate +* the supported XSAVE Feature Request Mask (XFRM), i.e. the enclave's +* requested XCR0 value. The enclave's XFRM must be a subset of XCR0 +* at the time of EENTER, thus adjust the allowed XFRM by the guest's +* supported XCR0. Similar to XCR0 handling, FP and SSE are forced to +* '1' even on CPUs that don't support XSAVE. +*/ + best = kvm_find_cpuid_entry(vcpu, 0x12, 0x1); + if (best) { + best->ecx &= vcpu->arch.guest_supported_xcr0 & 0xffffffff; + best->edx &= vcpu->arch.guest_supported_xcr0 >> 32; + best->ecx |= XFEATURE_MASK_FPSSE; + } + /* * The existing code assumes virtual address is 48-bit or 57-bit in the * canonical address checks; exit if it is ever changed. 
@@ -393,7 +409,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(BMI2) | F(ERMS) | f_invpcid | F(RTM) | f_mpx | F(RDSEED) | F(ADX) | F(SMAP) | F(AVX512IFMA) | F(AVX512F) | F(AVX512PF) | F(AVX512ER) | F(AVX512CD) | F(CLFLUSHOPT) | F(CLWB) | F(AVX512DQ) | - F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt; + F(SHA_NI) | F(AVX512BW) | F(AVX512VL) | f_intel_pt | F(SGX); /* cpuid 0xD.1.eax */ const u32 kvm_cpuid_D_1_eax_x86_features = @@ -404,7 +420,7 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(AVX512VBMI) | F(LA57) | F(PKU) | 0 /*OSPKE*/ | F(AVX512_VPOPCNTDQ) | F(UMIP) | F(AVX512_VBMI2) | F(GFNI) | F(VAES) | F(VPCLMULQDQ) | F(AVX512_VNNI) | F(AVX512_BITALG) | - F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B); + F(CLDEMOTE) | F(MOVDIRI) | F(MOVDIR64B) | F(SGX_LC); /* cpuid 7.0.edx*/ const u32 kvm_cpuid_7_0_edx_x86_features = @@ -412,6 +428,23 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR); + /* cpuid 12.0.eax*/ + const u32 kvm_cpuid_12_0_eax_x86_features = + F(SGX1) | F(SGX2) | 0 /* Reserved */ | 0 /* Reserved */ | + 0 /* Reserved */ | 0 /* ENCLV */ | 0 /* ENCLS_C */; + + /* cpuid 12.0.ebx*/ + const u32 kvm_cpuid_12_0_ebx_sgx_features = + SGX_MISC_EXINFO; + + /* cpuid 12.1.eax*/ + const u32 kvm_cpuid_12_1_eax_sgx_features = + SGX_ATTR_DEBUG | SGX_ATTR_MODE64BIT | 0 /* PROVISIONKEY */ | + SGX_ATTR_EINITTOKENKEY | SGX_ATTR_KSS; + + /* cpuid 12.1.ebx*/ + const u32 kvm_cpuid_12_1_ebx_sgx_features = 0; + /* * The code below assumes index == 0, which simplifies handling
[RFC PATCH 15/21] KVM: VMX: Add SGX ENCLS[ECREATE] handler to enforce CPUID restrictions
Userspace can restrict what bits can be set in MISCSELECT, ATTRIBUTES and XFRM via CPUID. Intercept ECREATE when any of the aforementioned masks diverges from hardware in order to enforce the desired CPUID model, i.e. inject #GP if the guest attempts to set a bit that hasn't been enumerated as allowed-1 in CPUID. Add the handler in a dedicated SGX file under the VMX sub-directory so as to confine the ugliness of the SGX specific code (re-executing ENCLS leafs is messy due to the need to follow pointers from structs, get EPC pages, etc...) and to save compilation cycles when SGX functionality is disabled in the kernel. The ENCLS handlers will soon grow to ~300 lines of code when Launch Control support is added, and in the distant future could balloon significantly if/when EPC oversubscription is supported. Actual usage of the handler will be added in a future patch, i.e. when SGX virtualization is fully enabled. Note, access to the PROVISIONKEY is not yet supported. Signed-off-by: Sean Christopherson --- arch/x86/include/asm/kvm_host.h | 3 + arch/x86/include/asm/sgx_arch.h | 1 + arch/x86/kvm/Makefile | 2 + arch/x86/kvm/vmx/sgx.c | 223 4 files changed, 229 insertions(+) create mode 100644 arch/x86/kvm/vmx/sgx.c diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h index 103df8cbdd24..27841a5d7851 100644 --- a/arch/x86/include/asm/kvm_host.h +++ b/arch/x86/include/asm/kvm_host.h @@ -928,6 +928,9 @@ struct kvm_arch { bool guest_can_read_msr_platform_info; bool exception_payload_enabled; + + /* Guest can access the SGX PROVISIONKEY. 
*/ + bool sgx_provisioning_allowed; }; struct kvm_vm_stat { diff --git a/arch/x86/include/asm/sgx_arch.h b/arch/x86/include/asm/sgx_arch.h index 39f731580ea8..e06f3ff717b4 100644 --- a/arch/x86/include/asm/sgx_arch.h +++ b/arch/x86/include/asm/sgx_arch.h @@ -8,6 +8,7 @@ #ifndef _ASM_X86_SGX_ARCH_H #define _ASM_X86_SGX_ARCH_H +#include #include #define SGX_CPUID 0x12 diff --git a/arch/x86/kvm/Makefile b/arch/x86/kvm/Makefile index 31ecf7a76d5a..f919c3e6abd7 100644 --- a/arch/x86/kvm/Makefile +++ b/arch/x86/kvm/Makefile @@ -13,6 +13,8 @@ kvm-y += x86.o mmu.o emulate.o i8259.o irq.o lapic.o \ hyperv.o page_track.o debugfs.o kvm-intel-y+= vmx/vmx.o vmx/vmenter.o vmx/pmu_intel.o vmx/vmcs12.o vmx/evmcs.o vmx/nested.o +kvm-intel-$(CONFIG_INTEL_SGX_VIRTUALIZATION) += vmx/sgx.o + kvm-amd-y += svm.o pmu_amd.o obj-$(CONFIG_KVM) += kvm.o diff --git a/arch/x86/kvm/vmx/sgx.c b/arch/x86/kvm/vmx/sgx.c new file mode 100644 index ..5b08e7dcc3a3 --- /dev/null +++ b/arch/x86/kvm/vmx/sgx.c @@ -0,0 +1,223 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include + +#include "cpuid.h" +#include "kvm_cache_regs.h" +#include "vmx.h" +#include "x86.h" + +/* + * ENCLS's memory operands use a fixed segment (DS) and a fixed + * address size based on the mode. Related prefixes are ignored. + */ +static int sgx_get_encls_gva(struct kvm_vcpu *vcpu, unsigned long offset, +int size, int alignment, gva_t *gva) +{ + struct kvm_segment s; + bool fault; + + vmx_get_segment(vcpu, , VCPU_SREG_DS); + + *gva = s.base + offset; + + if (!IS_ALIGNED(*gva, alignment)) { + fault = true; + } else if (is_long_mode(vcpu)) { + fault = is_noncanonical_address(*gva, vcpu); + } else { + *gva &= 0x; + fault = (s.unusable) || + (s.type != 2 && s.type != 3) || + (*gva > s.limit) || + ((s.base != 0 || s.limit != 0x) && + (((u64)*gva + size - 1) > s.limit + 1)); + } + if (fault) + kvm_inject_gp(vcpu, 0); + return fault ? 
-EINVAL : 0; +} + +static int sgx_read_gva(struct kvm_vcpu *vcpu, gva_t gva, void *data, +unsigned int size) +{ + struct x86_exception ex; + + if (kvm_read_guest_virt(vcpu, gva, data, size, )) { + kvm_propagate_page_fault(vcpu, ); + return -EFAULT; + } + return 0; +} + +static int sgx_read_hva(struct kvm_vcpu *vcpu, unsigned long hva, void *data, + unsigned int size) +{ + if (__copy_from_user(data, (void __user *)hva, size)) { + vcpu->run->exit_reason = KVM_EXIT_INTERNAL_ERROR; + vcpu->run->internal.suberror = KVM_INTERNAL_ERROR_EMULATION; + vcpu->run->internal.ndata = 2; + vcpu->run->internal.data[0] = hva; + vcpu->run->internal.data[1] = size; + return -EFAULT; + } + return 0; +} + +static int sgx_gva_to_hva(struct kvm_vcpu *vcpu, gva_t gva, bool write, + unsigned long *hva) +{
[RFC PATCH 18/21] KVM: x86: Invoke kvm_x86_ops->cpuid_update() after kvm_update_cpuid()
VMX's virtualization of SGX adds a lovely dependency on the guest's supported xcr0, which is calculated in kvm_update_cpuid(). VMX must toggle its interception of SGX instructions based on the supported xcr0, i.e. kvm_x86_ops->cpuid_update() is certainly the correct location for the dependent code. kvm_update_cpuid() was originally added by commit 2acf923e38fb ("KVM: VMX: Enable XSAVE/XRSTOR for guest"). There is no indication that its placement after kvm_x86_ops->cpuid_update() was anything more than a "new function at the end" decision. Inspection of the current code reveals no dependency on kvm_x86_ops's cpuid_update() in kvm_update_cpuid() or any of its helpers. - SVM's sole update is to conditionally clear X86_FEATURE_X2APIC. X86_FEATURE_X2APIC is only consumed by kvm_apic_set_state(), which is already called immediately prior to kvm_x86_ops->cpuid_update(). - VMX updates only nested VMX MSRs, allowed FEATURE_CONTROL bits, and VMCS fields, e.g. secondary execution controls, none of which should bleed back into kvm_update_cpuid() barring an egregious dependency bug somewhere else. Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 6 -- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 70e488951f25..4c235af5318c 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -222,8 +222,9 @@ int kvm_vcpu_ioctl_set_cpuid(struct kvm_vcpu *vcpu, vcpu->arch.cpuid_nent = cpuid->nent; cpuid_fix_nx_cap(vcpu); kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); + if (!r) + kvm_x86_ops->cpuid_update(vcpu); out: vfree(cpuid_entries); @@ -245,8 +246,9 @@ int kvm_vcpu_ioctl_set_cpuid2(struct kvm_vcpu *vcpu, goto out; vcpu->arch.cpuid_nent = cpuid->nent; kvm_apic_set_version(vcpu); - kvm_x86_ops->cpuid_update(vcpu); r = kvm_update_cpuid(vcpu); + if (!r) + kvm_x86_ops->cpuid_update(vcpu); out: return r; } -- 2.22.0
[RFC PATCH 07/21] KVM: x86: Add WARN_ON_ONCE(index!=0) in __do_cpuid_ent
Except for one outlier, function 7, all cases in __do_cpuid_ent and its children assume that the index passed in is zero. Furthermore, the index is fully under KVM's control and all callers pass an index of zero. In other words, a non-zero index would indicate either a bug in the caller or a new case that is expected to be handled. WARN and return an error on a non-zero index and remove the now unreachable code in function 7 for handling a non-zero index. Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.c | 57 ++-- 1 file changed, 29 insertions(+), 28 deletions(-) diff --git a/arch/x86/kvm/cpuid.c b/arch/x86/kvm/cpuid.c index 4992e7c99588..70e488951f25 100644 --- a/arch/x86/kvm/cpuid.c +++ b/arch/x86/kvm/cpuid.c @@ -410,6 +410,14 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, F(SPEC_CTRL_SSBD) | F(ARCH_CAPABILITIES) | F(INTEL_STIBP) | F(MD_CLEAR); + /* +* The code below assumes index == 0, which simplifies handling leafs +* with a dynamic number of sub-leafs. The index is fully under KVM's +* control, i.e. a non-zero value is a bug. +*/ + if (WARN_ON_ONCE(index != 0)) + return -EINVAL; + /* all calls to cpuid_count() should be made on the same cpu */ get_cpu(); @@ -480,38 +488,31 @@ static inline int __do_cpuid_ent(struct kvm_cpuid_entry2 *entry, u32 function, entry->ecx = 0; entry->edx = 0; break; - case 7: { + case 7: entry->flags |= KVM_CPUID_FLAG_SIGNIFCANT_INDEX; /* Mask ebx against host capability word 9 */ - if (index == 0) { - entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; - cpuid_mask(>ebx, CPUID_7_0_EBX); - // TSC_ADJUST is emulated - entry->ebx |= F(TSC_ADJUST); - entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; - f_la57 = entry->ecx & F(LA57); - cpuid_mask(>ecx, CPUID_7_ECX); - /* Set LA57 based on hardware capability. */ - entry->ecx |= f_la57; - entry->ecx |= f_umip; - /* PKU is not yet implemented for shadow paging. 
*/ - if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) - entry->ecx &= ~F(PKU); - entry->edx &= kvm_cpuid_7_0_edx_x86_features; - cpuid_mask(>edx, CPUID_7_EDX); - /* -* We emulate ARCH_CAPABILITIES in software even -* if the host doesn't support it. -*/ - entry->edx |= F(ARCH_CAPABILITIES); - } else { - entry->ebx = 0; - entry->ecx = 0; - entry->edx = 0; - } + entry->ebx &= kvm_cpuid_7_0_ebx_x86_features; + cpuid_mask(>ebx, CPUID_7_0_EBX); + // TSC_ADJUST is emulated + entry->ebx |= F(TSC_ADJUST); + entry->ecx &= kvm_cpuid_7_0_ecx_x86_features; + f_la57 = entry->ecx & F(LA57); + cpuid_mask(>ecx, CPUID_7_ECX); + /* Set LA57 based on hardware capability. */ + entry->ecx |= f_la57; + entry->ecx |= f_umip; + /* PKU is not yet implemented for shadow paging. */ + if (!tdp_enabled || !boot_cpu_has(X86_FEATURE_OSPKE)) + entry->ecx &= ~F(PKU); + entry->edx &= kvm_cpuid_7_0_edx_x86_features; + cpuid_mask(>edx, CPUID_7_EDX); + /* +* We emulate ARCH_CAPABILITIES in software even +* if the host doesn't support it. +*/ + entry->edx |= F(ARCH_CAPABILITIES); entry->eax = 0; break; - } case 9: break; case 0xa: { /* Architectural Performance Monitoring */ -- 2.22.0
[RFC PATCH 14/21] x86/sgx: Add helpers to expose ECREATE and EINIT to KVM
Provide wrappers around __ecreate() and __einit() to export their functionality for use by KVM without having to export a large amount of SGX boilerplate code. Intermediate helpers also shelter KVM from the ugliness of overloading the ENCLS return value to encode multiple error formats in a single int. KVM will use the helpers to trap-and-execute ECREATE and EINIT as part its SGX virtualization. Signed-off-by: Sean Christopherson --- arch/x86/Kconfig | 3 ++ arch/x86/include/asm/sgx.h | 15 ++ arch/x86/kernel/cpu/sgx/virt.c | 55 ++ 3 files changed, 73 insertions(+) create mode 100644 arch/x86/include/asm/sgx.h diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index c1bdb9f85928..8bbc6a30588d 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -1969,6 +1969,9 @@ config INTEL_SGX_VIRTUALIZATION "raw" EPC for the purpose of exposing EPC to a KVM guest, i.e. a virtual machine, via a device node (/dev/sgx/virt_epc by default). + SGX virtualization also adds helpers that are used by KVM to trap + and execute certain ENCLS instructions on behalf of a KVM guest. + If unsure, say N. 
config EFI diff --git a/arch/x86/include/asm/sgx.h b/arch/x86/include/asm/sgx.h new file mode 100644 index ..f0f0176b8e2f --- /dev/null +++ b/arch/x86/include/asm/sgx.h @@ -0,0 +1,15 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _ASM_X86_SGX_H +#define _ASM_X86_SGX_H + +#include + +struct sgx_pageinfo; + +#if IS_ENABLED(CONFIG_KVM_INTEL) +int sgx_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, int *trapnr); +int sgx_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr); +#endif + +#endif /* _ASM_X86_SGX_H */ diff --git a/arch/x86/kernel/cpu/sgx/virt.c b/arch/x86/kernel/cpu/sgx/virt.c index 79ee5917a4fc..9e5bf4450bf7 100644 --- a/arch/x86/kernel/cpu/sgx/virt.c +++ b/arch/x86/kernel/cpu/sgx/virt.c @@ -251,3 +251,58 @@ int __init sgx_virt_epc_init(void) return ret; } + +#if IS_ENABLED(CONFIG_KVM_INTEL) +int sgx_ecreate(struct sgx_pageinfo *pageinfo, void __user *secs, int *trapnr) +{ + int ret; + + __uaccess_begin(); + ret = __ecreate(pageinfo, (void *)secs); + __uaccess_end(); + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return -EFAULT; + } + return ret; +} +EXPORT_SYMBOL_GPL(sgx_ecreate); + +static int __sgx_einit(void __user *sigstruct, void __user *token, + void __user *secs) +{ + int ret; + + __uaccess_begin(); + ret = __einit((void *)sigstruct, (void *)token, (void *)secs); + __uaccess_end(); + return ret; +} + +int sgx_einit(void __user *sigstruct, void __user *token, + void __user *secs, u64 *lepubkeyhash, int *trapnr) +{ + int ret; + + if (!boot_cpu_has(X86_FEATURE_SGX_LC)) { + ret = __sgx_einit(sigstruct, token, secs); + } else { + preempt_disable(); + sgx_update_lepubkeyhash_msrs(lepubkeyhash, false); + ret = __sgx_einit(sigstruct, token, secs); + if (ret == SGX_INVALID_EINITTOKEN) { + sgx_update_lepubkeyhash_msrs(lepubkeyhash, true); + ret = __sgx_einit(sigstruct, token, secs); + } + preempt_enable(); + } + + if (encls_faulted(ret)) { + *trapnr = ENCLS_TRAPNR(ret); + return 
-EFAULT; + } + return ret; +} +EXPORT_SYMBOL_GPL(sgx_einit); +#endif -- 2.22.0
[RFC PATCH 06/21] KVM: x86: Add SGX sub-features leaf to reverse CPUID table
CPUID_12_EAX is an Intel-defined feature bits leaf dedicated for SGX that enumerates the SGX instruction sets that are supported by the CPU, e.g. SGX1, SGX2, etc... Since Linux only cares about two bits at this time (SGX1 and SGX2), the SGX bits were relocated to to Linux-defined word 8, i.e. CPUID_LNX_3, instead of adding a dedicated SGX word so as to conserve space. But, to make KVM's life simple, the bit numbers of the SGX features were intentionally kept the same between the Intel-defined leaf and the Linux-defined leaf. Add build-time assertions to ensure X86_FEATURE_SGX{1,2} are at the expected locations, and that KVM isn't trying to do a reverse CPUID lookup on a non-SGX bit in CPUID_LNX_3. Relocate bit() to cpuid.h where it belongs (it's NOT a generic bit function) and add a beefy comment explaining what the hell it's doing. Signed-off-by: Sean Christopherson --- arch/x86/kvm/cpuid.h | 20 arch/x86/kvm/emulate.c | 1 + arch/x86/kvm/x86.h | 5 - 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/arch/x86/kvm/cpuid.h b/arch/x86/kvm/cpuid.h index d78a61408243..aed49d639c3b 100644 --- a/arch/x86/kvm/cpuid.h +++ b/arch/x86/kvm/cpuid.h @@ -53,6 +53,7 @@ static const struct cpuid_reg reverse_cpuid[] = { [CPUID_7_ECX] = { 7, 0, CPUID_ECX}, [CPUID_8000_0007_EBX] = {0x8007, 0, CPUID_EBX}, [CPUID_7_EDX] = { 7, 0, CPUID_EDX}, + [CPUID_LNX_3] = { 0x12, 0, CPUID_EAX}, }; static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) @@ -61,6 +62,7 @@ static __always_inline struct cpuid_reg x86_feature_cpuid(unsigned x86_feature) BUILD_BUG_ON(x86_leaf >= ARRAY_SIZE(reverse_cpuid)); BUILD_BUG_ON(reverse_cpuid[x86_leaf].function == 0); + BUILD_BUG_ON(x86_leaf == CPUID_LNX_3 && (x86_feature & 31) > 1); return reverse_cpuid[x86_leaf]; } @@ -89,6 +91,24 @@ static __always_inline int *guest_cpuid_get_register(struct kvm_vcpu *vcpu, unsi } } +/* + * Retrieve the bit from an X86_FEATURE_* definition using a simple AND to + * isolate the bit number from 
the feature definition. Note that this works + * only for features that are NOT scattered, i.e. the X86_FEATURE_* bit number + * must match the hardware-defined CPUID bit number. The only exception to + * this rule is the SGX sub-features leaf, which is scattered but only in the + * sense that its bits are relocated from hardware-defined leaf 0x12.0.EAX to + * Linux defined word 8, but its bit numbers are maintained (KVM asserts this + * expectation at build time). + */ +static __always_inline u32 bit(unsigned x86_feature) +{ + BUILD_BUG_ON((X86_FEATURE_SGX1 & 31) != 0); + BUILD_BUG_ON((X86_FEATURE_SGX2 & 31) != 1); + + return 1 << (x86_feature & 31); +} + static __always_inline bool guest_cpuid_has(struct kvm_vcpu *vcpu, unsigned x86_feature) { int *reg; diff --git a/arch/x86/kvm/emulate.c b/arch/x86/kvm/emulate.c index 4a387a235424..6ffe23febcd7 100644 --- a/arch/x86/kvm/emulate.c +++ b/arch/x86/kvm/emulate.c @@ -29,6 +29,7 @@ #include "tss.h" #include "mmu.h" #include "pmu.h" +#include "cpuid.h" /* * Operand types diff --git a/arch/x86/kvm/x86.h b/arch/x86/kvm/x86.h index a470ff0868c5..1e0c7b17effa 100644 --- a/arch/x86/kvm/x86.h +++ b/arch/x86/kvm/x86.h @@ -139,11 +139,6 @@ static inline int is_paging(struct kvm_vcpu *vcpu) return likely(kvm_read_cr0_bits(vcpu, X86_CR0_PG)); } -static inline u32 bit(int bitno) -{ - return 1 << (bitno & 31); -} - static inline u8 vcpu_virt_addr_bits(struct kvm_vcpu *vcpu) { return kvm_read_cr4_bits(vcpu, X86_CR4_LA57) ? 57 : 48; -- 2.22.0
[RFC PATCH 03/21] x86/sgx: Move provisioning device to common code
Move the provisioning device to common code in preparation for adding support for SGX virtualization. The provisioning device will need to be instantiated if the native SGX driver *or* the virtual EPC "driver" is loaded. Signed-off-by: Sean Christopherson --- arch/x86/kernel/cpu/sgx/driver/ioctl.c | 18 ++- arch/x86/kernel/cpu/sgx/driver/main.c | 24 +- arch/x86/kernel/cpu/sgx/main.c | 44 +- arch/x86/kernel/cpu/sgx/sgx.h | 1 + 4 files changed, 47 insertions(+), 40 deletions(-) diff --git a/arch/x86/kernel/cpu/sgx/driver/ioctl.c b/arch/x86/kernel/cpu/sgx/driver/ioctl.c index 89b3fb81c15b..b7aa06920d10 100644 --- a/arch/x86/kernel/cpu/sgx/driver/ioctl.c +++ b/arch/x86/kernel/cpu/sgx/driver/ioctl.c @@ -794,26 +794,12 @@ static long sgx_ioc_enclave_set_attribute(struct file *filep, void __user *arg) { struct sgx_encl *encl = filep->private_data; struct sgx_enclave_set_attribute params; - struct file *attribute_file; - int ret; if (copy_from_user(, arg, sizeof(params))) return -EFAULT; - attribute_file = fget(params.attribute_fd); - if (!attribute_file) - return -EINVAL; - - if (attribute_file->f_op != _provision_fops) { - ret = -EINVAL; - goto out; - } - - encl->allowed_attributes |= SGX_ATTR_PROVISIONKEY; - -out: - fput(attribute_file); - return ret; + return sgx_set_attribute(>allowed_attributes, +params.attribute_fd); } long sgx_ioctl(struct file *filep, unsigned int cmd, unsigned long arg) diff --git a/arch/x86/kernel/cpu/sgx/driver/main.c b/arch/x86/kernel/cpu/sgx/driver/main.c index d62bdc7ed4d9..1e107dd0d909 100644 --- a/arch/x86/kernel/cpu/sgx/driver/main.c +++ b/arch/x86/kernel/cpu/sgx/driver/main.c @@ -154,14 +154,8 @@ static const struct file_operations sgx_encl_fops = { .get_unmapped_area = sgx_get_unmapped_area, }; -const struct file_operations sgx_provision_fops = { - .owner = THIS_MODULE, -}; - static struct device sgx_encl_dev; static struct cdev sgx_encl_cdev; -static struct device sgx_provision_dev; -static struct cdev sgx_provision_cdev; int __init 
sgx_drv_init(void) { @@ -202,38 +196,22 @@ int __init sgx_drv_init(void) if (ret) return ret; - ret = sgx_dev_init("sgx/provision", _provision_dev, - _provision_cdev, _provision_fops, - SGX_PROV_DEV_MINOR); - if (ret) - goto err_encl_dev; - sgx_encl_wq = alloc_workqueue("sgx-encl-wq", WQ_UNBOUND | WQ_FREEZABLE, 1); if (!sgx_encl_wq) { ret = -ENOMEM; - goto err_provision_dev; + goto err_encl_dev; } ret = cdev_device_add(_encl_cdev, _encl_dev); if (ret) goto err_encl_wq; - ret = cdev_device_add(_provision_cdev, _provision_dev); - if (ret) - goto err_encl_cdev; - return 0; -err_encl_cdev: - cdev_device_del(_encl_cdev, _encl_dev); - err_encl_wq: destroy_workqueue(sgx_encl_wq); -err_provision_dev: - put_device(_provision_dev); - err_encl_dev: put_device(_encl_dev); diff --git a/arch/x86/kernel/cpu/sgx/main.c b/arch/x86/kernel/cpu/sgx/main.c index edbd465083c7..9f4473597620 100644 --- a/arch/x86/kernel/cpu/sgx/main.c +++ b/arch/x86/kernel/cpu/sgx/main.c @@ -2,6 +2,7 @@ // Copyright(c) 2016-17 Intel Corporation. 
#include +#include #include #include #include @@ -335,6 +336,31 @@ static struct bus_type sgx_bus_type = { }; static dev_t sgx_devt; +const struct file_operations sgx_provision_fops = { + .owner = THIS_MODULE, +}; + +static struct device sgx_provision_dev; +static struct cdev sgx_provision_cdev; + +int sgx_set_attribute(u64 *allowed_attributes, unsigned int attribute_fd) +{ + struct file *attribute_file; + + attribute_file = fget(attribute_fd); + if (!attribute_file) + return -EINVAL; + + if (attribute_file->f_op != _provision_fops) { + fput(attribute_file); + return -EINVAL; + } + fput(attribute_file); + + *allowed_attributes |= SGX_ATTR_PROVISIONKEY; + return 0; +} + static void sgx_dev_release(struct device *dev) { @@ -386,12 +412,28 @@ static __init int sgx_init(void) if (ret < 0) goto err_bus; - ret = sgx_drv_init(); + ret = sgx_dev_init("sgx/provision", _provision_dev, + _provision_cdev, _provision_fops, + SGX_PROV_DEV_MINOR); if (ret) goto err_chrdev_region; + ret = cdev_device_add(_provision_cdev, _provision_dev); + if (ret) + goto err_provision_dev; + + ret =
[PATCH V3 net-next 10/10] net: hns3: use dev_info() instead of pr_info()
dev_info() is more appropriate for printing messages when driver initialization is done, so switch to dev_info(). Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 4 +++- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 ++- 2 files changed, 5 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 30a7074..4138780 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -8862,7 +8862,9 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_state_init(hdev); hdev->last_reset_time = jiffies; - pr_info("%s driver initialization finished.\n", HCLGE_DRIVER_NAME); + dev_info(&hdev->pdev->dev, "%s driver initialization finished.\n", +HCLGE_DRIVER_NAME); + return 0; err_mdiobus_unreg: diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c index a13a0e1..ae0e6a6 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c @@ -2695,7 +2695,8 @@ static int hclgevf_init_hdev(struct hclgevf_dev *hdev) } hdev->last_reset_time = jiffies; - pr_info("finished initializing %s driver\n", HCLGEVF_DRIVER_NAME); + dev_info(&hdev->pdev->dev, "finished initializing %s driver\n", +HCLGEVF_DRIVER_NAME); return 0; -- 2.7.4
[PATCH V3 net-next 07/10] net: hns3: make hclge_service use delayed workqueue
From: Yunsheng Lin Use delayed work instead of using timers to trigger the hclge_serive. Simplify the code with one less middle function and in order to support misc irq affinity. Signed-off-by: Yunsheng Lin Reviewed-by: Peng Li Signed-off-by: Huazhong Tan --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 52 +- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h| 3 +- 2 files changed, 21 insertions(+), 34 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 14199c4..13c9697 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -2513,8 +2513,12 @@ static void hclge_task_schedule(struct hclge_dev *hdev) { if (!test_bit(HCLGE_STATE_DOWN, >state) && !test_bit(HCLGE_STATE_REMOVING, >state) && - !test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, >state)) - (void)schedule_work(>service_task); + !test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, >state)) { + hdev->hw_stats.stats_timer++; + hdev->fd_arfs_expire_timer++; + mod_delayed_work(system_wq, >service_task, +round_jiffies_relative(HZ)); + } } static int hclge_get_mac_link_status(struct hclge_dev *hdev) @@ -2729,25 +2733,6 @@ static int hclge_get_status(struct hnae3_handle *handle) return hdev->hw.mac.link; } -static void hclge_service_timer(struct timer_list *t) -{ - struct hclge_dev *hdev = from_timer(hdev, t, service_timer); - - mod_timer(>service_timer, jiffies + HZ); - hdev->hw_stats.stats_timer++; - hdev->fd_arfs_expire_timer++; - hclge_task_schedule(hdev); -} - -static void hclge_service_complete(struct hclge_dev *hdev) -{ - WARN_ON(!test_bit(HCLGE_STATE_SERVICE_SCHED, >state)); - - /* Flush memory before next watchdog */ - smp_mb__before_atomic(); - clear_bit(HCLGE_STATE_SERVICE_SCHED, >state); -} - static u32 hclge_check_event_cause(struct hclge_dev *hdev, u32 *clearval) { u32 rst_src_reg, cmdq_src_reg, msix_src_reg; @@ -3594,7 +3579,9 @@ static void 
hclge_update_vport_alive(struct hclge_dev *hdev) static void hclge_service_task(struct work_struct *work) { struct hclge_dev *hdev = - container_of(work, struct hclge_dev, service_task); + container_of(work, struct hclge_dev, service_task.work); + + clear_bit(HCLGE_STATE_SERVICE_SCHED, >state); if (hdev->hw_stats.stats_timer >= HCLGE_STATS_TIMER_INTERVAL) { hclge_update_stats_for_all(hdev); @@ -3609,7 +3596,8 @@ static void hclge_service_task(struct work_struct *work) hclge_rfs_filter_expire(hdev); hdev->fd_arfs_expire_timer = 0; } - hclge_service_complete(hdev); + + hclge_task_schedule(hdev); } struct hclge_vport *hclge_get_vport(struct hnae3_handle *handle) @@ -6148,10 +6136,13 @@ static void hclge_set_timer_task(struct hnae3_handle *handle, bool enable) struct hclge_dev *hdev = vport->back; if (enable) { - mod_timer(>service_timer, jiffies + HZ); + hclge_task_schedule(hdev); } else { - del_timer_sync(>service_timer); - cancel_work_sync(>service_task); + /* Set the DOWN flag here to disable the service to be +* scheduled again +*/ + set_bit(HCLGE_STATE_DOWN, >state); + cancel_delayed_work_sync(>service_task); clear_bit(HCLGE_STATE_SERVICE_SCHED, >state); } } @@ -8590,12 +8581,10 @@ static void hclge_state_uninit(struct hclge_dev *hdev) set_bit(HCLGE_STATE_DOWN, >state); set_bit(HCLGE_STATE_REMOVING, >state); - if (hdev->service_timer.function) - del_timer_sync(>service_timer); if (hdev->reset_timer.function) del_timer_sync(>reset_timer); - if (hdev->service_task.func) - cancel_work_sync(>service_task); + if (hdev->service_task.work.func) + cancel_delayed_work_sync(>service_task); if (hdev->rst_service_task.func) cancel_work_sync(>rst_service_task); if (hdev->mbx_service_task.func) @@ -8800,9 +8789,8 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) hclge_dcb_ops_set(hdev); - timer_setup(>service_timer, hclge_service_timer, 0); timer_setup(>reset_timer, hclge_reset_timer, 0); - INIT_WORK(>service_task, hclge_service_task); + 
INIT_DELAYED_WORK(>service_task, hclge_service_task); INIT_WORK(>rst_service_task, hclge_reset_service_task); INIT_WORK(>mbx_service_task, hclge_mailbox_service_task); diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h
[PATCH V3 net-next 08/10] net: hns3: add interrupt affinity support for misc interrupt
From: Yunsheng Lin The misc interrupt is used to schedule the reset and mailbox subtask, and service_task delayed_work is used to do periodic management work each second. This patch sets the above three subtask's affinity using the misc interrupt' affinity. Also this patch setups a affinity notify for misc interrupt to allow user to change the above three subtask's affinity. Signed-off-by: Yunsheng Lin Signed-off-by: Peng Li Signed-off-by: Huazhong Tan --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 53 -- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.h| 4 ++ 2 files changed, 53 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 13c9697..30a7074 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -1270,6 +1270,12 @@ static int hclge_configure(struct hclge_dev *hdev) hclge_init_kdump_kernel_config(hdev); + /* Set the init affinity based on pci func number */ + i = cpumask_weight(cpumask_of_node(dev_to_node(>pdev->dev))); + i = i ? 
PCI_FUNC(hdev->pdev->devfn) % i : 0; + cpumask_set_cpu(cpumask_local_spread(i, dev_to_node(>pdev->dev)), + >affinity_mask); + return ret; } @@ -2499,14 +2505,16 @@ static void hclge_mbx_task_schedule(struct hclge_dev *hdev) { if (!test_bit(HCLGE_STATE_CMD_DISABLE, >state) && !test_and_set_bit(HCLGE_STATE_MBX_SERVICE_SCHED, >state)) - schedule_work(>mbx_service_task); + queue_work_on(cpumask_first(>affinity_mask), system_wq, + >mbx_service_task); } static void hclge_reset_task_schedule(struct hclge_dev *hdev) { if (!test_bit(HCLGE_STATE_REMOVING, >state) && !test_and_set_bit(HCLGE_STATE_RST_SERVICE_SCHED, >state)) - schedule_work(>rst_service_task); + queue_work_on(cpumask_first(>affinity_mask), system_wq, + >rst_service_task); } static void hclge_task_schedule(struct hclge_dev *hdev) @@ -2516,8 +2524,9 @@ static void hclge_task_schedule(struct hclge_dev *hdev) !test_and_set_bit(HCLGE_STATE_SERVICE_SCHED, >state)) { hdev->hw_stats.stats_timer++; hdev->fd_arfs_expire_timer++; - mod_delayed_work(system_wq, >service_task, -round_jiffies_relative(HZ)); + mod_delayed_work_on(cpumask_first(>affinity_mask), + system_wq, >service_task, + round_jiffies_relative(HZ)); } } @@ -2903,6 +2912,36 @@ static void hclge_get_misc_vector(struct hclge_dev *hdev) hdev->num_msi_used += 1; } +static void hclge_irq_affinity_notify(struct irq_affinity_notify *notify, + const cpumask_t *mask) +{ + struct hclge_dev *hdev = container_of(notify, struct hclge_dev, + affinity_notify); + + cpumask_copy(>affinity_mask, mask); +} + +static void hclge_irq_affinity_release(struct kref *ref) +{ +} + +static void hclge_misc_affinity_setup(struct hclge_dev *hdev) +{ + irq_set_affinity_hint(hdev->misc_vector.vector_irq, + >affinity_mask); + + hdev->affinity_notify.notify = hclge_irq_affinity_notify; + hdev->affinity_notify.release = hclge_irq_affinity_release; + irq_set_affinity_notifier(hdev->misc_vector.vector_irq, + >affinity_notify); +} + +static void hclge_misc_affinity_teardown(struct hclge_dev 
*hdev) +{ + irq_set_affinity_notifier(hdev->misc_vector.vector_irq, NULL); + irq_set_affinity_hint(hdev->misc_vector.vector_irq, NULL); +} + static int hclge_misc_irq_init(struct hclge_dev *hdev) { int ret; @@ -8794,6 +8833,11 @@ static int hclge_init_ae_dev(struct hnae3_ae_dev *ae_dev) INIT_WORK(>rst_service_task, hclge_reset_service_task); INIT_WORK(>mbx_service_task, hclge_mailbox_service_task); + /* Setup affinity after service timer setup because add_timer_on +* is called in affinity notify. +*/ + hclge_misc_affinity_setup(hdev); + hclge_clear_all_event_cause(hdev); hclge_clear_resetting_state(hdev); @@ -8955,6 +8999,7 @@ static void hclge_uninit_ae_dev(struct hnae3_ae_dev *ae_dev) struct hclge_dev *hdev = ae_dev->priv; struct hclge_mac *mac = >hw.mac; + hclge_misc_affinity_teardown(hdev); hclge_state_uninit(hdev); if (mac->phydev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h index dde8f22..688e425 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.h +++
[PATCH V3 net-next 02/10] net: hns3: add a check for get_reset_level
From: Guangbin Huang In some cases, ops->get_reset_level may not be implemented, so we should check whether it is NULL before calling get_reset_level. Signed-off-by: Guangbin Huang Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 08af782..4d58c53 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -1963,7 +1963,7 @@ static pci_ers_result_t hns3_slot_reset(struct pci_dev *pdev) ops = ae_dev->ops; /* request the reset */ - if (ops->reset_event) { + if (ops->reset_event && ops->get_reset_level) { if (ae_dev->hw_err_reset_req) { reset_type = ops->get_reset_level(ae_dev, &ae_dev->hw_err_reset_req); -- 2.7.4
[PATCH V3 net-next 00/10] net: hns3: some code optimizations & bugfixes & features
This patch-set includes code optimizations, bugfixes and features for the HNS3 ethernet controller driver. [patch 1/10] checks reset status before setting channel. [patch 2/10] adds a NULL pointer checking. [patch 3/10] removes reset level upgrading when current reset fails. [patch 4/10] fixes a GFP flags errors when holding spin_lock. [patch 5/10] modifies firmware version format. [patch 6/10] adds some print information which is off by default. [patch 7/10 - 8/10] adds two code optimizations about interrupt handler and work task. [patch 9/10] adds support for using order 1 pages with a 4K buffer. [patch 10/10] modifies messages prints with dev_info() instead of pr_info(). Change log: V2->V3: fixes comments from Saeed Mahameed and Joe Perches. V1->V2: fixes comments from Saeed Mahameed and removes previous [patch 4/11] and [patch 11/11] which needs further discussion, and adds a new patch [11/11] suggested by Saeed Mahameed. Guangbin Huang (1): net: hns3: add a check for get_reset_level Huazhong Tan (2): net: hns3: remove upgrade reset level when reset fail net: hns3: use dev_info() instead of pr_info() Jian Shen (1): net: hns3: add reset checking before set channels Yonglong Liu (1): net: hns3: add debug messages to identify eth down cause Yufeng Mo (2): net: hns3: change GFP flag during lock period net: hns3: modify firmware version display format Yunsheng Lin (3): net: hns3: make hclge_service use delayed workqueue net: hns3: add interrupt affinity support for misc interrupt net: hns3: Add support for using order 1 pages with a 4K buffer drivers/net/ethernet/hisilicon/hns3/hnae3.h| 9 ++ drivers/net/ethernet/hisilicon/hns3/hns3_enet.c| 33 - drivers/net/ethernet/hisilicon/hns3/hns3_enet.h| 15 ++- drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 34 +- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 10 +- .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c | 11 ++ .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 135 - 
.../ethernet/hisilicon/hns3/hns3pf/hclge_main.h| 7 +- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c | 10 +- .../ethernet/hisilicon/hns3/hns3vf/hclgevf_main.c | 3 +- 10 files changed, 195 insertions(+), 72 deletions(-) -- 2.7.4
[PATCH V3 net-next 04/10] net: hns3: change GFP flag during lock period
From: Yufeng Mo When allocating memory, the GFP_KERNEL cannot be used during the spin_lock period. This is because it may cause scheduling when holding spin_lock. This patch changes the GFP flag to GFP_ATOMIC in this case. Fixes: dd74f815dd41 ("net: hns3: Add support for rule add/delete for flow director") Signed-off-by: Yufeng Mo Signed-off-by: lipeng 00277521 Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 3c64d70..14199c4 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -5796,7 +5796,7 @@ static int hclge_add_fd_entry_by_arfs(struct hnae3_handle *handle, u16 queue_id, return -ENOSPC; } - rule = kzalloc(sizeof(*rule), GFP_KERNEL); + rule = kzalloc(sizeof(*rule), GFP_ATOMIC); if (!rule) { spin_unlock_bh(&hdev->fd_rule_lock); -- 2.7.4
[PATCH V3 net-next 06/10] net: hns3: add debug messages to identify eth down cause
From: Yonglong Liu Some times just see the eth interface have been down/up via dmesg, but can not know why the eth down. So adds some debug messages to identify the cause for this. Signed-off-by: Yonglong Liu Signed-off-by: Peng Li Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 18 ++ drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c| 19 +++ .../net/ethernet/hisilicon/hns3/hns3pf/hclge_dcb.c| 11 +++ 3 files changed, 48 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 4d58c53..973c57b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -459,6 +459,9 @@ static int hns3_nic_net_open(struct net_device *netdev) h->ae_algo->ops->set_timer_task(priv->ae_handle, true); hns3_config_xps(priv); + + netif_info(h, drv, netdev, "net open\n"); + return 0; } @@ -519,6 +522,8 @@ static int hns3_nic_net_stop(struct net_device *netdev) if (test_and_set_bit(HNS3_NIC_STATE_DOWN, >state)) return 0; + netif_info(h, drv, netdev, "net stop\n"); + if (h->ae_algo->ops->set_timer_task) h->ae_algo->ops->set_timer_task(priv->ae_handle, false); @@ -1550,6 +1555,8 @@ static int hns3_setup_tc(struct net_device *netdev, void *type_data) h = hns3_get_handle(netdev); kinfo = >kinfo; + netif_info(h, drv, netdev, "setup tc: num_tc=%u\n", tc); + return (kinfo->dcb_ops && kinfo->dcb_ops->setup_tc) ? 
kinfo->dcb_ops->setup_tc(h, tc, prio_tc) : -EOPNOTSUPP; } @@ -1593,6 +1600,10 @@ static int hns3_ndo_set_vf_vlan(struct net_device *netdev, int vf, u16 vlan, struct hnae3_handle *h = hns3_get_handle(netdev); int ret = -EIO; + netif_info(h, drv, netdev, + "set vf vlan: vf=%d, vlan=%u, qos=%u, vlan_proto=%u\n", + vf, vlan, qos, vlan_proto); + if (h->ae_algo->ops->set_vf_vlan_filter) ret = h->ae_algo->ops->set_vf_vlan_filter(h, vf, vlan, qos, vlan_proto); @@ -1611,6 +1622,9 @@ static int hns3_nic_change_mtu(struct net_device *netdev, int new_mtu) if (!h->ae_algo->ops->set_mtu) return -EOPNOTSUPP; + netif_info(h, drv, netdev, + "change mtu from %u to %d\n", netdev->mtu, new_mtu); + ret = h->ae_algo->ops->set_mtu(h, new_mtu); if (ret) netdev_err(netdev, "failed to change MTU in hardware %d\n", @@ -4395,6 +4409,10 @@ int hns3_set_channels(struct net_device *netdev, if (kinfo->rss_size == new_tqp_num) return 0; + netif_info(h, drv, netdev, + "set channels: tqp_num=%u, rxfh=%d\n", + new_tqp_num, rxfh_configured); + ret = hns3_reset_notify(h, HNAE3_DOWN_CLIENT); if (ret) return ret; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index e71c92b..8553200 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -311,6 +311,8 @@ static void hns3_self_test(struct net_device *ndev, if (eth_test->flags != ETH_TEST_FL_OFFLINE) return; + netif_info(h, drv, ndev, "self test start"); + st_param[HNAE3_LOOP_APP][0] = HNAE3_LOOP_APP; st_param[HNAE3_LOOP_APP][1] = h->flags & HNAE3_SUPPORT_APP_LOOPBACK; @@ -374,6 +376,8 @@ static void hns3_self_test(struct net_device *ndev, if (if_running) ndev->netdev_ops->ndo_open(ndev); + + netif_info(h, drv, ndev, "self test end\n"); } static int hns3_get_sset_count(struct net_device *netdev, int stringset) @@ -604,6 +608,10 @@ static int hns3_set_pauseparam(struct net_device *netdev, { struct hnae3_handle *h = 
hns3_get_handle(netdev); + netif_info(h, drv, netdev, + "set pauseparam: autoneg=%u, rx:%u, tx:%u\n", + param->autoneg, param->rx_pause, param->tx_pause); + if (h->ae_algo->ops->set_pauseparam) return h->ae_algo->ops->set_pauseparam(h, param->autoneg, param->rx_pause, @@ -743,6 +751,11 @@ static int hns3_set_link_ksettings(struct net_device *netdev, if (cmd->base.speed == SPEED_1000 && cmd->base.duplex == DUPLEX_HALF) return -EINVAL; + netif_info(handle, drv, netdev, + "set link(%s): autoneg=%u, speed=%u, duplex=%u\n", + netdev->phydev ? "phy" : "mac", + cmd->base.autoneg, cmd->base.speed, cmd->base.duplex); + /* Only support ksettings_set for netdev with phy
[PATCH V3 net-next 01/10] net: hns3: add reset checking before set channels
From: Jian Shen hns3_set_channels() should check the resetting status firstly, since the device will reinitialize when resetting. If the reset has not completed, the hns3_set_channels() may access invalid memory. Signed-off-by: Jian Shen Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 69f7ef8..08af782 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -4378,6 +4378,9 @@ int hns3_set_channels(struct net_device *netdev, u16 org_tqp_num; int ret; + if (hns3_nic_resetting(netdev)) + return -EBUSY; + if (ch->rx_count || ch->tx_count) return -EINVAL; -- 2.7.4
[PATCH V3 net-next 03/10] net: hns3: remove upgrade reset level when reset fail
Currently, hclge_reset_err_handle() will assert a global reset when the failing count is smaller than MAX_RESET_FAIL_CNT, which will affect other running functions. So this patch removes this upgrading, and uses re-scheduling reset task to do it. Signed-off-by: Huazhong Tan Reviewed-by: Yunsheng Lin --- .../ethernet/hisilicon/hns3/hns3pf/hclge_main.c| 28 +++--- 1 file changed, 8 insertions(+), 20 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c index 3fde5471..3c64d70 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_main.c @@ -3305,7 +3305,7 @@ static int hclge_reset_prepare_wait(struct hclge_dev *hdev) return ret; } -static bool hclge_reset_err_handle(struct hclge_dev *hdev, bool is_timeout) +static bool hclge_reset_err_handle(struct hclge_dev *hdev) { #define MAX_RESET_FAIL_CNT 5 @@ -3322,20 +3322,11 @@ static bool hclge_reset_err_handle(struct hclge_dev *hdev, bool is_timeout) return false; } else if (hdev->reset_fail_cnt < MAX_RESET_FAIL_CNT) { hdev->reset_fail_cnt++; - if (is_timeout) { - set_bit(hdev->reset_type, >reset_pending); - dev_info(>pdev->dev, -"re-schedule to wait for hw reset done\n"); - return true; - } - - dev_info(>pdev->dev, "Upgrade reset level\n"); - hclge_clear_reset_cause(hdev); - set_bit(HNAE3_GLOBAL_RESET, >default_reset_request); - mod_timer(>reset_timer, - jiffies + HCLGE_RESET_INTERVAL); - - return false; + set_bit(hdev->reset_type, >reset_pending); + dev_info(>pdev->dev, +"re-schedule reset task(%d)\n", +hdev->reset_fail_cnt); + return true; } hclge_clear_reset_cause(hdev); @@ -3382,7 +3373,6 @@ static int hclge_reset_stack(struct hclge_dev *hdev) static void hclge_reset(struct hclge_dev *hdev) { struct hnae3_ae_dev *ae_dev = pci_get_drvdata(hdev->pdev); - bool is_timeout = false; int ret; /* Initialize ae_dev reset status as well, in case enet layer wants to @@ -3410,10 
+3400,8 @@ static void hclge_reset(struct hclge_dev *hdev) if (ret) goto err_reset; - if (hclge_reset_wait(hdev)) { - is_timeout = true; + if (hclge_reset_wait(hdev)) goto err_reset; - } hdev->rst_stats.hw_reset_done_cnt++; @@ -3465,7 +3453,7 @@ static void hclge_reset(struct hclge_dev *hdev) err_reset_lock: rtnl_unlock(); err_reset: - if (hclge_reset_err_handle(hdev, is_timeout)) + if (hclge_reset_err_handle(hdev)) hclge_reset_task_schedule(hdev); } -- 2.7.4
[PATCH V3 net-next 05/10] net: hns3: modify firmware version display format
From: Yufeng Mo This patch modifies firmware version display format in hclge(vf)_cmd_init() and hns3_get_drvinfo(). Also, adds some optimizations for firmware version display format. Signed-off-by: Yufeng Mo Signed-off-by: Peng Li Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hnae3.h | 9 + drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c | 15 +-- drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c | 10 +- drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c | 10 +- 4 files changed, 40 insertions(+), 4 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hnae3.h b/drivers/net/ethernet/hisilicon/hns3/hnae3.h index 48c7b70..a4624db 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hnae3.h +++ b/drivers/net/ethernet/hisilicon/hns3/hnae3.h @@ -179,6 +179,15 @@ struct hnae3_vector_info { #define HNAE3_RING_GL_RX 0 #define HNAE3_RING_GL_TX 1 +#define HNAE3_FW_VERSION_BYTE3_SHIFT 24 +#define HNAE3_FW_VERSION_BYTE3_MASKGENMASK(31, 24) +#define HNAE3_FW_VERSION_BYTE2_SHIFT 16 +#define HNAE3_FW_VERSION_BYTE2_MASKGENMASK(23, 16) +#define HNAE3_FW_VERSION_BYTE1_SHIFT 8 +#define HNAE3_FW_VERSION_BYTE1_MASKGENMASK(15, 8) +#define HNAE3_FW_VERSION_BYTE0_SHIFT 0 +#define HNAE3_FW_VERSION_BYTE0_MASKGENMASK(7, 0) + struct hnae3_ring_chain_node { struct hnae3_ring_chain_node *next; u32 tqp_index; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c index 5bff98a..e71c92b 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_ethtool.c @@ -527,6 +527,7 @@ static void hns3_get_drvinfo(struct net_device *netdev, { struct hns3_nic_priv *priv = netdev_priv(netdev); struct hnae3_handle *h = priv->ae_handle; + u32 fw_version; if (!h->ae_algo->ops->get_fw_version) { netdev_err(netdev, "could not get fw version!\n"); @@ -545,8 +546,18 @@ static void hns3_get_drvinfo(struct net_device *netdev, sizeof(drvinfo->bus_info)); 
drvinfo->bus_info[ETHTOOL_BUSINFO_LEN - 1] = '\0'; - snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), "0x%08x", -priv->ae_handle->ae_algo->ops->get_fw_version(h)); + fw_version = priv->ae_handle->ae_algo->ops->get_fw_version(h); + + snprintf(drvinfo->fw_version, sizeof(drvinfo->fw_version), +"%lu.%lu.%lu.%lu", +hnae3_get_field(fw_version, HNAE3_FW_VERSION_BYTE3_MASK, +HNAE3_FW_VERSION_BYTE3_SHIFT), +hnae3_get_field(fw_version, HNAE3_FW_VERSION_BYTE2_MASK, +HNAE3_FW_VERSION_BYTE2_SHIFT), +hnae3_get_field(fw_version, HNAE3_FW_VERSION_BYTE1_MASK, +HNAE3_FW_VERSION_BYTE1_SHIFT), +hnae3_get_field(fw_version, HNAE3_FW_VERSION_BYTE0_MASK, +HNAE3_FW_VERSION_BYTE0_SHIFT)); } static u32 hns3_get_link(struct net_device *netdev) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c index 22f6acd..d9858f2 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3pf/hclge_cmd.c @@ -419,7 +419,15 @@ int hclge_cmd_init(struct hclge_dev *hdev) } hdev->fw_version = version; - dev_info(>pdev->dev, "The firmware version is %08x\n", version); + dev_info(>pdev->dev, "The firmware version is %lu.%lu.%lu.%lu\n", +hnae3_get_field(version, HNAE3_FW_VERSION_BYTE3_MASK, +HNAE3_FW_VERSION_BYTE3_SHIFT), +hnae3_get_field(version, HNAE3_FW_VERSION_BYTE2_MASK, +HNAE3_FW_VERSION_BYTE2_SHIFT), +hnae3_get_field(version, HNAE3_FW_VERSION_BYTE1_MASK, +HNAE3_FW_VERSION_BYTE1_SHIFT), +hnae3_get_field(version, HNAE3_FW_VERSION_BYTE0_MASK, +HNAE3_FW_VERSION_BYTE0_SHIFT)); return 0; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c index 652b796..8f21eb3 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3vf/hclgevf_cmd.c @@ -405,7 +405,15 @@ int hclgevf_cmd_init(struct hclgevf_dev *hdev) } hdev->fw_version = version; - 
dev_info(>pdev->dev, "The firmware version is %08x\n", version); + dev_info(>pdev->dev, "The firmware version is %lu.%lu.%lu.%lu\n", +hnae3_get_field(version, HNAE3_FW_VERSION_BYTE3_MASK, +HNAE3_FW_VERSION_BYTE3_SHIFT), +hnae3_get_field(version,
[PATCH V3 net-next 09/10] net: hns3: Add support for using order 1 pages with a 4K buffer
From: Yunsheng Lin Hardware supports 0.5K, 1K, 2K, 4K RX buffer size, the RX buffer can not be reused because the hns3_page_order return 0 when page size and RX buffer size are both 4096. So this patch changes the hns3_page_order to return 1 when RX buffer is greater than half of the page size and page size is less the 8192, and dev_alloc_pages has already been used to allocate the compound page for RX buffer. This patch also changes hnae3_* to hns3_* for page order and RX buffer size calculation because they are used in hns3 module. Signed-off-by: Yunsheng Lin Reviewed-by: Peng Li Signed-off-by: Huazhong Tan --- drivers/net/ethernet/hisilicon/hns3/hns3_enet.c | 10 +- drivers/net/ethernet/hisilicon/hns3/hns3_enet.h | 15 --- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c index 973c57b..59a6076 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c @@ -2081,7 +2081,7 @@ static void hns3_set_default_feature(struct net_device *netdev) static int hns3_alloc_buffer(struct hns3_enet_ring *ring, struct hns3_desc_cb *cb) { - unsigned int order = hnae3_page_order(ring); + unsigned int order = hns3_page_order(ring); struct page *p; p = dev_alloc_pages(order); @@ -2092,7 +2092,7 @@ static int hns3_alloc_buffer(struct hns3_enet_ring *ring, cb->page_offset = 0; cb->reuse_flag = 0; cb->buf = page_address(p); - cb->length = hnae3_page_size(ring); + cb->length = hns3_page_size(ring); cb->type = DESC_TYPE_PAGE; return 0; @@ -2395,7 +2395,7 @@ static void hns3_nic_reuse_page(struct sk_buff *skb, int i, { struct hns3_desc *desc = >desc[ring->next_to_clean]; int size = le16_to_cpu(desc->rx.size); - u32 truesize = hnae3_buf_size(ring); + u32 truesize = hns3_buf_size(ring); skb_add_rx_frag(skb, i, desc_cb->priv, desc_cb->page_offset + pull_len, size - pull_len, truesize); @@ -2410,7 +2410,7 @@ static void 
hns3_nic_reuse_page(struct sk_buff *skb, int i, /* Move offset up to the next cache line */ desc_cb->page_offset += truesize; - if (desc_cb->page_offset + truesize <= hnae3_page_size(ring)) { + if (desc_cb->page_offset + truesize <= hns3_page_size(ring)) { desc_cb->reuse_flag = 1; /* Bump ref count on page before it is given */ get_page(desc_cb->priv); @@ -2692,7 +2692,7 @@ static int hns3_add_frag(struct hns3_enet_ring *ring, struct hns3_desc *desc, } if (ring->tail_skb) { - head_skb->truesize += hnae3_buf_size(ring); + head_skb->truesize += hns3_buf_size(ring); head_skb->data_len += le16_to_cpu(desc->rx.size); head_skb->len += le16_to_cpu(desc->rx.size); skb = ring->tail_skb; diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h index 848b866..1a17856 100644 --- a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h +++ b/drivers/net/ethernet/hisilicon/hns3/hns3_enet.h @@ -608,9 +608,18 @@ static inline bool hns3_nic_resetting(struct net_device *netdev) #define tx_ring_data(priv, idx) ((priv)->ring_data[idx]) -#define hnae3_buf_size(_ring) ((_ring)->buf_size) -#define hnae3_page_order(_ring) (get_order(hnae3_buf_size(_ring))) -#define hnae3_page_size(_ring) (PAGE_SIZE << (u32)hnae3_page_order(_ring)) +#define hns3_buf_size(_ring) ((_ring)->buf_size) + +static inline unsigned int hns3_page_order(struct hns3_enet_ring *ring) +{ +#if (PAGE_SIZE < 8192) + if (ring->buf_size > (PAGE_SIZE / 2)) + return 1; +#endif + return 0; +} + +#define hns3_page_size(_ring) (PAGE_SIZE << hns3_page_order(_ring)) /* iterator for handling rings in ring group */ #define hns3_for_each_ring(pos, head) \ -- 2.7.4
Re: [PATCH 5.2 00/66] 5.2.4-stable review
On Fri, 26 Jul 2019 at 20:55, Greg Kroah-Hartman wrote: > > This is the start of the stable review cycle for the 5.2.4 release. > There are 66 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC. > Anything received after that time might be too late. > > The whole patch series can be found in one patch at: > > https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.4-rc1.gz > or in the git tree and branch at: > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > linux-5.2.y > and the diffstat can be found below. > > thanks, > > greg k-h Results from Linaro’s test farm. No regressions on arm64, arm, x86_64, and i386. Summary kernel: 5.2.4-rc1 git repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git git branch: linux-5.2.y git commit: d61e440a1852a64d8a2d0d358b9582b19157e039 git describe: v5.2.3-67-gd61e440a1852 Test details: https://qa-reports.linaro.org/lkft/linux-stable-rc-5.2-oe/build/v5.2.3-67-gd61e440a1852 No regressions (compared to build v5.2.3) No fixes (compared to build v5.2.3) Ran 22512 total tests in the following environments and test suites. 
Environments -- - dragonboard-410c - hi6220-hikey - i386 - juno-r2 - qemu_arm - qemu_arm64 - qemu_i386 - qemu_x86_64 - x15 - x86 Test Suites --- * build * install-android-platform-tools-r2600 * kselftest * libgpiod * libhugetlbfs * ltp-cap_bounds-tests * ltp-commands-tests * ltp-containers-tests * ltp-cpuhotplug-tests * ltp-cve-tests * ltp-dio-tests * ltp-fcntl-locktests-tests * ltp-filecaps-tests * ltp-fs_bind-tests * ltp-fs_perms_simple-tests * ltp-fsx-tests * ltp-hugetlb-tests * ltp-io-tests * ltp-ipc-tests * ltp-math-tests * ltp-mm-tests * ltp-nptl-tests * ltp-pty-tests * ltp-sched-tests * ltp-securebits-tests * ltp-syscalls-tests * ltp-timers-tests * network-basic-tests * perf * spectre-meltdown-checker-test * v4l2-compliance * ltp-fs-tests * ltp-open-posix-tests * kselftest-vsyscall-mode-native * kselftest-vsyscall-mode-none * kvm-unit-tests -- Linaro LKFT https://lkft.linaro.org
Re: [PATCH 5.1 00/62] 5.1.21-stable review
On Fri, 26 Jul 2019 at 20:59, Greg Kroah-Hartman wrote: > > Note, this will be the LAST 5.1.y kernel release. Everyone should move > to the 5.2.y series at this point in time. > > This is the start of the stable review cycle for the 5.1.21 release. > There are 62 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC. > Anything received after that time might be too late. > > The whole patch series can be found in one patch at: > > https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.1.21-rc1.gz > or in the git tree and branch at: > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > linux-5.1.y > and the diffstat can be found below. > > thanks, > > greg k-h Results from Linaro’s test farm. No regressions on arm64, arm, x86_64, and i386. Summary kernel: 5.1.21-rc1 git repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git git branch: linux-5.1.y git commit: f878628d8f1efc883e9bd6f9f81173194b4a01dd git describe: v5.1.20-63-gf878628d8f1e Test details: https://qa-reports.linaro.org/lkft/linux-stable-rc-5.1-oe/build/v5.1.20-63-gf878628d8f1e No regressions (compared to build v5.1.20) No fixes (compared to build v5.1.20) Ran 21561 total tests in the following environments and test suites. 
Environments -- - dragonboard-410c - hi6220-hikey - i386 - juno-r2 - qemu_arm - qemu_arm64 - qemu_i386 - qemu_x86_64 - x15 - x86 Test Suites --- * build * install-android-platform-tools-r2600 * kselftest * libgpiod * libhugetlbfs * ltp-cap_bounds-tests * ltp-commands-tests * ltp-containers-tests * ltp-cpuhotplug-tests * ltp-cve-tests * ltp-dio-tests * ltp-fcntl-locktests-tests * ltp-filecaps-tests * ltp-fs-tests * ltp-fs_bind-tests * ltp-fs_perms_simple-tests * ltp-fsx-tests * ltp-hugetlb-tests * ltp-io-tests * ltp-ipc-tests * ltp-math-tests * ltp-mm-tests * ltp-nptl-tests * ltp-pty-tests * ltp-sched-tests * ltp-securebits-tests * ltp-syscalls-tests * ltp-timers-tests * network-basic-tests * perf * spectre-meltdown-checker-test * v4l2-compliance * ltp-open-posix-tests * kvm-unit-tests * kselftest-vsyscall-mode-native * kselftest-vsyscall-mode-none -- Linaro LKFT https://lkft.linaro.org
Re: [PATCH] hv_sock: use HV_HYP_PAGE_SIZE instead of PAGE_SIZE_4K
Hi Himadri, Thank you for the patch! Yet something to improve: [auto build test ERROR on linus/master] [cannot apply to v5.3-rc1 next-20190726] [if your patch is applied to the wrong git tree, please drop us a note to help improve the system] url: https://github.com/0day-ci/linux/commits/Himadri-Pandya/hv_sock-use-HV_HYP_PAGE_SIZE-instead-of-PAGE_SIZE_4K/20190726-085229 config: x86_64-allyesconfig (attached as .config) compiler: gcc-7 (Debian 7.4.0-10) 7.4.0 reproduce: # save the attached .config to linux build tree make ARCH=x86_64 If you fix the issue, kindly add following tag Reported-by: kbuild test robot All error/warnings (new ones prefixed by >>): >> net/vmw_vsock/hyperv_transport.c:58:28: error: 'HV_HYP_PAGE_SIZE' undeclared >> here (not in a function); did you mean 'HV_MESSAGE_SIZE'? #define HVS_SEND_BUF_SIZE (HV_HYP_PAGE_SIZE - sizeof(struct vmpipe_proto_header)) ^ >> net/vmw_vsock/hyperv_transport.c:65:10: note: in expansion of macro >> 'HVS_SEND_BUF_SIZE' u8 data[HVS_SEND_BUF_SIZE]; ^ In file included from include/linux/list.h:9:0, from include/linux/module.h:9, from net/vmw_vsock/hyperv_transport.c:11: net/vmw_vsock/hyperv_transport.c: In function 'hvs_open_connection': >> include/linux/kernel.h:845:2: error: first argument to >> '__builtin_choose_expr' not a constant __builtin_choose_expr(__safe_cmp(x, y), \ ^ include/linux/kernel.h:921:27: note: in expansion of macro '__careful_cmp' #define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) ^ >> net/vmw_vsock/hyperv_transport.c:390:12: note: in expansion of macro 'max_t' sndbuf = max_t(int, sk->sk_sndbuf, RINGBUFFER_HVS_SND_SIZE); ^ >> include/linux/kernel.h:845:2: error: first argument to >> '__builtin_choose_expr' not a constant __builtin_choose_expr(__safe_cmp(x, y), \ ^ include/linux/kernel.h:913:27: note: in expansion of macro '__careful_cmp' #define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) ^ >> net/vmw_vsock/hyperv_transport.c:391:12: note: in expansion of macro 'min_t' 
sndbuf = min_t(int, sndbuf, RINGBUFFER_HVS_MAX_SIZE); ^ >> include/linux/kernel.h:845:2: error: first argument to >> '__builtin_choose_expr' not a constant __builtin_choose_expr(__safe_cmp(x, y), \ ^ include/linux/kernel.h:921:27: note: in expansion of macro '__careful_cmp' #define max_t(type, x, y) __careful_cmp((type)(x), (type)(y), >) ^ net/vmw_vsock/hyperv_transport.c:393:12: note: in expansion of macro 'max_t' rcvbuf = max_t(int, sk->sk_rcvbuf, RINGBUFFER_HVS_RCV_SIZE); ^ >> include/linux/kernel.h:845:2: error: first argument to >> '__builtin_choose_expr' not a constant __builtin_choose_expr(__safe_cmp(x, y), \ ^ include/linux/kernel.h:913:27: note: in expansion of macro '__careful_cmp' #define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) ^ net/vmw_vsock/hyperv_transport.c:394:12: note: in expansion of macro 'min_t' rcvbuf = min_t(int, rcvbuf, RINGBUFFER_HVS_MAX_SIZE); ^ net/vmw_vsock/hyperv_transport.c: In function 'hvs_stream_enqueue': >> include/linux/kernel.h:845:2: error: first argument to >> '__builtin_choose_expr' not a constant __builtin_choose_expr(__safe_cmp(x, y), \ ^ include/linux/kernel.h:913:27: note: in expansion of macro '__careful_cmp' #define min_t(type, x, y) __careful_cmp((type)(x), (type)(y), <) ^ net/vmw_vsock/hyperv_transport.c:681:14: note: in expansion of macro 'min_t' to_write = min_t(ssize_t, to_write, HVS_SEND_BUF_SIZE); ^ vim +58 net/vmw_vsock/hyperv_transport.c --- 0-DAY kernel test infrastructureOpen Source Technology Center https://lists.01.org/pipermail/kbuild-all Intel Corporation .config.gz Description: application/gzip
linux-next: Fixes tag needs some work in the usb tree
Hi all, In commit 6269e4c76eac ("usb: host: xhci-hub: fix extra endianness conversion") Fixes tag Fixes: 395f540 "xhci: support new USB 3.1 hub request to get extended port status" has these problem(s): - SHA1 should be at least 12 digits long Can be fixed by setting core.abbrev to 12 (or more) or (for git v2.11 or later) just making sure it is not set (or set to "auto"). -- Cheers, Stephen Rothwell pgpocCJcVz2AL.pgp Description: OpenPGP digital signature
[PATCH] clocksource/drivers: hyperv_timer: Fix CPU offlining by unbinding the timer
The commit fd1fea6834d0 says "No behavior is changed", but actually it removes the clockevents_unbind_device() call from hv_synic_cleanup(). In the discussion earlier this month, I thought the unbind call is unnecessary (see https://www.spinics.net/lists/arm-kernel/msg739888.html), however, after more investigation, when a VM runs on Hyper-V, it turns out the unbind call must be kept, otherwise CPU offlining may not work, because a per-cpu timer device is still needed, after hv_synic_cleanup() disables the per-cpu Hyper-V timer device. The issue is found in the hibernation test. These are the details: 1. CPU0 hangs in wait_for_ap_thread(), when trying to offline CPU1: hibernation_snapshot create_image suspend_disable_secondary_cpus freeze_secondary_cpus _cpu_down(1, 1, CPUHP_OFFLINE) cpuhp_kick_ap_work cpuhp_kick_ap __cpuhp_kick_ap wait_for_ap_thread() 2. CPU0 hangs because CPU1 hangs this way: after CPU1 disables the per-cpu Hyper-V timer device in hv_synic_cleanup(), CPU1 sets a timer... Please read on to see how this can happen. 2.1 By "_cpu_down(1, 1, CPUHP_OFFLINE):", CPU0 first tries to move CPU1 to the CPUHP_TEARDOWN_CPU state and this wakes up the cpuhp/1 thread on CPU1; the thread is basically a loop of executing various callbacks defined in the global array cpuhp_hp_states[]: see smpboot_thread_fn(). 2.2 This is how a callback is called on CPU1: smpboot_thread_fn ht->thread_fn(td->cpu), i.e. cpuhp_thread_fun cpuhp_invoke_callback state = st->state st->state-- cpuhp_get_step(state)->teardown.single() 2.3 At first, the state of CPU1 is CPUHP_ONLINE, which defines a .teardown.single of NULL, so the execution of the code returns to the loop in smpboot_thread_fn(), and then reruns cpuhp_invoke_callback() with a smaller st->state. 2.4 The .teardown.single of every state between CPUHP_ONLINE and CPUHP_TEARDOWN_CPU runs one by one. 2.5 When it comes to the CPUHP_AP_ONLINE_DYN range, hv_synic_cleanup() runs: see vmbus_bus_init(). 
It calls hv_stimer_cleanup() -> hv_ce_shutdown() to disable the per-cpu timer device, so timer interrupt will no longer happen on CPU1. 2.6 Later, the .teardown.single of CPUHP_AP_SMPBOOT_THREADS, i.e. smpboot_park_threads(), starts to run, trying to park all the other hotplug_threads, e.g. ksoftirqd/1 and rcuc/1; here a timer can be set up this way and the timer will never be fired since CPU1 doesn't have an active timer device now, so CPU1 hangs and can not be offlined: smpboot_park_threads smpboot_park_thread kthread_park wait_task_inactive schedule_hrtimeout(, HRTIMER_MODE_REL) With this patch, when the per-cpu Hyper-V timer device is disabled, the system switches to the Local APIC timer, and the hang issue can not happen. Fixes: fd1fea6834d0 ("clocksource/drivers: Make Hyper-V clocksource ISA agnostic") Signed-off-by: Dexuan Cui --- drivers/clocksource/hyperv_timer.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/clocksource/hyperv_timer.c b/drivers/clocksource/hyperv_timer.c index 41c31a7ac0e4..8f3422c66cbb 100644 --- a/drivers/clocksource/hyperv_timer.c +++ b/drivers/clocksource/hyperv_timer.c @@ -139,6 +139,7 @@ void hv_stimer_cleanup(unsigned int cpu) /* Turn off clockevent device */ if (ms_hyperv.features & HV_MSR_SYNTIMER_AVAILABLE) { ce = per_cpu_ptr(hv_clock_event, cpu); + clockevents_unbind_device(ce, cpu); hv_ce_shutdown(ce); } } -- 2.19.1
linux-next: Signed-off-by missing for commit in the crypto tree
Hi all, Commit 53a5d5192803 ("crypto: ccp - Log an error message when ccp-crypto fails to load") is missing a Signed-off-by from its author. -- Cheers, Stephen Rothwell pgp8CGxhoQd5K.pgp Description: OpenPGP digital signature
Re: [PATCH 4.19 00/50] 4.19.62-stable review
On Fri, 26 Jul 2019 at 21:03, Greg Kroah-Hartman wrote: > > This is the start of the stable review cycle for the 4.19.62 release. > There are 50 patches in this series, all will be posted as a response > to this one. If anyone has any issues with these being applied, please > let me know. > > Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC. > Anything received after that time might be too late. > > The whole patch series can be found in one patch at: > > https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.62-rc1.gz > or in the git tree and branch at: > > git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git > linux-4.19.y > and the diffstat can be found below. > > thanks, > > greg k-h Results from Linaro’s test farm. No regressions on arm64, arm, x86_64, and i386. Summary kernel: 4.19.62-rc1 git repo: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git git branch: linux-4.19.y git commit: 213a5f3ac1f5e2af0e25fd4b26497590ec290be0 git describe: v4.19.61-51-g213a5f3ac1f5 Test details: https://qa-reports.linaro.org/lkft/linux-stable-rc-4.19-oe/build/v4.19.61-51-g213a5f3ac1f5 No regressions (compared to build v4.19.61) No fixes (compared to build v4.19.61) Ran 23490 total tests in the following environments and test suites. 
Environments -- - dragonboard-410c - arm64 - hi6220-hikey - arm64 - i386 - juno-r2 - arm64 - qemu_arm - qemu_arm64 - qemu_i386 - qemu_x86_64 - x15 - arm - x86_64 Test Suites --- * build * install-android-platform-tools-r2600 * kselftest * libgpiod * libhugetlbfs * ltp-cap_bounds-tests * ltp-commands-tests * ltp-containers-tests * ltp-cpuhotplug-tests * ltp-cve-tests * ltp-dio-tests * ltp-fcntl-locktests-tests * ltp-filecaps-tests * ltp-fs-tests * ltp-fs_bind-tests * ltp-fs_perms_simple-tests * ltp-fsx-tests * ltp-hugetlb-tests * ltp-io-tests * ltp-ipc-tests * ltp-math-tests * ltp-mm-tests * ltp-nptl-tests * ltp-pty-tests * ltp-sched-tests * ltp-securebits-tests * ltp-syscalls-tests * ltp-timers-tests * network-basic-tests * perf * spectre-meltdown-checker-test * v4l2-compliance * ltp-open-posix-tests * kvm-unit-tests * kselftest-vsyscall-mode-native * kselftest-vsyscall-mode-none -- Linaro LKFT https://lkft.linaro.org
Re: mmotm 2019-07-24-21-39 uploaded (mm/memcontrol)
On Fri, Jul 26, 2019 at 09:19:52PM -0700, Andrew Morton wrote: > On Fri, 26 Jul 2019 20:42:05 -0700 Nathan Chancellor > wrote: > > > > @@ -2414,8 +2414,9 @@ void mem_cgroup_handle_over_high(void) > > >*/ > > > clamped_high = max(high, 1UL); > > > > > > - overage = ((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT) > > > - / clamped_high; > > > + overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT; > > > + do_div(overage, clamped_high); > > > + > > > penalty_jiffies = ((u64)overage * overage * HZ) > > > >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); > > > > > > _ > > > > > > > This causes a build error on arm: > > > > Ah. > > It's rather unclear why that u64 cast is there anyway. We're dealing > with ulongs all over this code. The below will suffice. I was thinking the same thing. > Chris, please take a look? > > --- > a/mm/memcontrol.c~mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix-fix-fix > +++ a/mm/memcontrol.c > @@ -2415,7 +2415,7 @@ void mem_cgroup_handle_over_high(void) > clamped_high = max(high, 1UL); > > overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT; > - do_div(overage, clamped_high); > + overage /= clamped_high; > > penalty_jiffies = ((u64)overage * overage * HZ) > >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); > _ > I assume this will get folded in with the original patch but for completeness (multi_v7_defconfig + CONFIG_MEMCG): Tested-by: Nathan Chancellor Thanks for the quick fix!
Re: [PATCH 5.1 00/62] 5.1.21-stable review
stable-rc/linux-5.1.y boot: 127 boots: 2 failed, 81 passed with 44 offline (v5.1.20-63-gf878628d8f1e) Full Boot Summary: https://kernelci.org/boot/all/job/stable-rc/branch/linux-5.1.y/kernel/v5.1.20-63-gf878628d8f1e/ Full Build Summary: https://kernelci.org/build/stable-rc/branch/linux-5.1.y/kernel/v5.1.20-63-gf878628d8f1e/ Tree: stable-rc Branch: linux-5.1.y Git Describe: v5.1.20-63-gf878628d8f1e Git Commit: f878628d8f1efc883e9bd6f9f81173194b4a01dd Git URL: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git Tested: 74 unique boards, 27 SoC families, 17 builds out of 209 Boot Failures Detected: arm64: defconfig: gcc-8: meson-g12a-x96-max: 1 failed lab arm: multi_v7_defconfig: gcc-8: bcm4708-smartrg-sr400ac: 1 failed lab Offline Platforms: arm64: defconfig: gcc-8 meson-axg-s400: 1 offline lab meson-g12a-u200: 1 offline lab meson-g12a-x96-max: 1 offline lab meson-gxbb-odroidc2: 1 offline lab meson-gxl-s905d-p230: 1 offline lab meson-gxl-s905x-libretech-cc: 1 offline lab meson-gxl-s905x-nexbox-a95x: 1 offline lab meson-gxl-s905x-p212: 1 offline lab meson-gxm-nexbox-a1: 1 offline lab rk3399-firefly: 1 offline lab sun50i-a64-pine64-plus: 1 offline lab mips: pistachio_defconfig: gcc-8 pistachio_marduk: 1 offline lab arm: exynos_defconfig: gcc-8 exynos5250-arndale: 1 offline lab exynos5420-arndale-octa: 1 offline lab exynos5800-peach-pi: 1 offline lab multi_v7_defconfig: gcc-8 bcm72521-bcm97252sffe: 1 offline lab bcm7445-bcm97445c: 1 offline lab exynos5250-arndale: 1 offline lab exynos5420-arndale-octa: 1 offline lab exynos5800-peach-pi: 1 offline lab imx6dl-wandboard_dual: 1 offline lab imx6dl-wandboard_solo: 1 offline lab imx6q-wandboard: 1 offline lab imx7s-warp: 1 offline lab meson8b-odroidc1: 1 offline lab omap3-beagle: 1 offline lab omap4-panda: 1 offline lab qcom-apq8064-ifc6410: 1 offline lab stih410-b2120: 1 offline lab sun4i-a10-cubieboard: 1 offline lab sun7i-a20-bananapi: 1 offline lab vf610-colibri-eval-v3: 1 offline lab 
omap2plus_defconfig: gcc-8 omap3-beagle: 1 offline lab omap4-panda: 1 offline lab qcom_defconfig: gcc-8 qcom-apq8064-ifc6410: 1 offline lab davinci_all_defconfig: gcc-8 da850-evm: 1 offline lab dm365evm,legacy: 1 offline lab imx_v6_v7_defconfig: gcc-8 imx6dl-wandboard_dual: 1 offline lab imx6dl-wandboard_solo: 1 offline lab imx6q-wandboard: 1 offline lab imx7s-warp: 1 offline lab vf610-colibri-eval-v3: 1 offline lab sunxi_defconfig: gcc-8 sun4i-a10-cubieboard: 1 offline lab sun7i-a20-bananapi: 1 offline lab --- For more info write to
Re: mmotm 2019-07-24-21-39 uploaded (mm/memcontrol)
On Fri, 26 Jul 2019 20:42:05 -0700 Nathan Chancellor wrote: > > @@ -2414,8 +2414,9 @@ void mem_cgroup_handle_over_high(void) > > */ > > clamped_high = max(high, 1UL); > > > > - overage = ((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT) > > - / clamped_high; > > + overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT; > > + do_div(overage, clamped_high); > > + > > penalty_jiffies = ((u64)overage * overage * HZ) > > >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); > > > > _ > > > > This causes a build error on arm: > Ah. It's rather unclear why that u64 cast is there anyway. We're dealing with ulongs all over this code. The below will suffice. Chris, please take a look? --- a/mm/memcontrol.c~mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix-fix-fix +++ a/mm/memcontrol.c @@ -2415,7 +2415,7 @@ void mem_cgroup_handle_over_high(void) clamped_high = max(high, 1UL); overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT; - do_div(overage, clamped_high); + overage /= clamped_high; penalty_jiffies = ((u64)overage * overage * HZ) >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); _
Re: mmotm 2019-07-24-21-39 uploaded (mm/memcontrol)
On Thu, Jul 25, 2019 at 04:39:59PM -0700, Andrew Morton wrote: > On Thu, 25 Jul 2019 15:02:59 -0700 Randy Dunlap wrote: > > > On 7/24/19 9:40 PM, a...@linux-foundation.org wrote: > > > The mm-of-the-moment snapshot 2019-07-24-21-39 has been uploaded to > > > > > >http://www.ozlabs.org/~akpm/mmotm/ > > > > > > mmotm-readme.txt says > > > > > > README for mm-of-the-moment: > > > > > > http://www.ozlabs.org/~akpm/mmotm/ > > > > > > This is a snapshot of my -mm patch queue. Uploaded at random hopefully > > > more than once a week. > > > > > > You will need quilt to apply these patches to the latest Linus release > > > (5.x > > > or 5.x-rcY). The series file is in broken-out.tar.gz and is duplicated in > > > http://ozlabs.org/~akpm/mmotm/series > > > > > > > on i386: > > > > ld: mm/memcontrol.o: in function `mem_cgroup_handle_over_high': > > memcontrol.c:(.text+0x6235): undefined reference to `__udivdi3' > > Thanks. This? > > --- > a/mm/memcontrol.c~mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix-fix > +++ a/mm/memcontrol.c > @@ -2414,8 +2414,9 @@ void mem_cgroup_handle_over_high(void) >*/ > clamped_high = max(high, 1UL); > > - overage = ((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT) > - / clamped_high; > + overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT; > + do_div(overage, clamped_high); > + > penalty_jiffies = ((u64)overage * overage * HZ) > >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT); > > _ > This causes a build error on arm: In file included from ../arch/arm/include/asm/div64.h:127, from ../include/linux/kernel.h:18, from ../include/linux/page_counter.h:6, from ../mm/memcontrol.c:25: ../mm/memcontrol.c: In function 'mem_cgroup_handle_over_high': ../include/asm-generic/div64.h:222:28: warning: comparison of distinct pointer types lacks a cast 222 | (void)(((typeof((n)) *)0) == ((uint64_t *)0)); \ |^~ ../mm/memcontrol.c:2423:2: note: in expansion of macro 'do_div' 2423 | do_div(overage, clamped_high); | ^~ In file 
included from ../arch/arm/include/asm/atomic.h:11, from ../include/linux/atomic.h:7, from ../include/linux/page_counter.h:5, from ../mm/memcontrol.c:25: ../include/asm-generic/div64.h:235:25: warning: right shift count >= width of type [-Wshift-count-overflow] 235 | } else if (likely(((n) >> 32) == 0)) { \ | ^~ ../include/linux/compiler.h:77:40: note: in definition of macro 'likely' 77 | # define likely(x) __builtin_expect(!!(x), 1) |^ ../mm/memcontrol.c:2423:2: note: in expansion of macro 'do_div' 2423 | do_div(overage, clamped_high); | ^~ In file included from ../arch/arm/include/asm/div64.h:127, from ../include/linux/kernel.h:18, from ../include/linux/page_counter.h:6, from ../mm/memcontrol.c:25: ../include/asm-generic/div64.h:239:22: error: passing argument 1 of '__div64_32' from incompatible pointer type [-Werror=incompatible-pointer-types] 239 | __rem = __div64_32(&(n), __base); \ | ^~~~ | | | long unsigned int * ../mm/memcontrol.c:2423:2: note: in expansion of macro 'do_div' 2423 | do_div(overage, clamped_high); | ^~ In file included from ../include/linux/kernel.h:18, from ../include/linux/page_counter.h:6, from ../mm/memcontrol.c:25: ../arch/arm/include/asm/div64.h:33:45: note: expected 'uint64_t *' {aka 'long long unsigned int *'} but argument is of type 'long unsigned int *' 33 | static inline uint32_t __div64_32(uint64_t *n, uint32_t base) | ~~^ cc1: some warnings being treated as errors make[3]: *** [../scripts/Makefile.build:274: mm/memcontrol.o] Error 1 make[2]: *** [../Makefile:1768: mm/memcontrol.o] Error 2 make[1]: *** [/home/nathan/cbl/linux-next/Makefile:330: __build_one_by_one] Error 2 make: *** [Makefile:179: sub-make] Error 2 I fixed it up like so but no idea if that is the ideal function to use. 
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 5c7b9facb0eb..04b621f1cb6b 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2419,8 +2419,8 @@ void mem_cgroup_handle_over_high(void) */ clamped_high = max(high, 1UL); - overage = (u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT; - do_div(overage, clamped_high); + overage = div64_u64((u64)(usage - high) << MEMCG_DELAY_PRECISION_SHIFT, + clamped_high); penalty_jiffies = ((u64)overage * overage * HZ) >> (MEMCG_DELAY_PRECISION_SHIFT + MEMCG_DELAY_SCALING_SHIFT);
[PATCH v7] driver core: Fix use-after-free and double free on glue directory
There is a race condition between removing glue directory and adding a new device under the glue dir. It can be reproduced in following test: CPU1: CPU2: device_add() get_device_parent() class_dir_create_and_add() kobject_add_internal() create_dir()// create glue_dir device_add() get_device_parent() kobject_get() // get glue_dir device_del() cleanup_glue_dir() kobject_del(glue_dir) kobject_add() kobject_add_internal() create_dir() // in glue_dir sysfs_create_dir_ns() kernfs_create_dir_ns(sd) sysfs_remove_dir() // glue_dir->sd=NULL sysfs_put()// free glue_dir->sd // sd is freed kernfs_new_node(sd) kernfs_get(glue_dir) kernfs_add_one() kernfs_put() Before CPU1 remove last child device under glue dir, if CPU2 add a new device under glue dir, the glue_dir kobject reference count will be increase to 2 via kobject_get() in get_device_parent(). And CPU2 has been called kernfs_create_dir_ns(), but not call kernfs_new_node(). Meanwhile, CPU1 call sysfs_remove_dir() and sysfs_put(). This result in glue_dir->sd is freed and it's reference count will be 0. Then CPU2 call kernfs_get(glue_dir) will trigger a warning in kernfs_get() and increase it's reference count to 1. Because glue_dir->sd is freed by CPU1, the next call kernfs_add_one() by CPU2 will fail(This is also use-after-free) and call kernfs_put() to decrease reference count. Because the reference count is decremented to 0, it will also call kmem_cache_free() to free the glue_dir->sd again. This will result in double free. In order to avoid this happening, we also should make sure that kernfs_node for glue_dir is released in CPU1 only when refcount for glue_dir kobj is 1 to fix this race. The following calltrace is captured in kernel 4.14 with the following patch applied: commit 726e41097920 ("drivers: core: Remove glue dirs from sysfs earlier") -- [3.633703] WARNING: CPU: 4 PID: 513 at .../fs/kernfs/dir.c:494 Here is WARN_ON(!atomic_read(>count) in kernfs_get(). 
[3.633986] Call trace: [3.633991] kernfs_create_dir_ns+0xa8/0xb0 [3.633994] sysfs_create_dir_ns+0x54/0xe8 [3.634001] kobject_add_internal+0x22c/0x3f0 [3.634005] kobject_add+0xe4/0x118 [3.634011] device_add+0x200/0x870 [3.634017] _request_firmware+0x958/0xc38 [3.634020] request_firmware_into_buf+0x4c/0x70 [3.634064] kernel BUG at .../mm/slub.c:294! Here is BUG_ON(object == fp) in set_freepointer(). [3.634346] Call trace: [3.634351] kmem_cache_free+0x504/0x6b8 [3.634355] kernfs_put+0x14c/0x1d8 [3.634359] kernfs_create_dir_ns+0x88/0xb0 [3.634362] sysfs_create_dir_ns+0x54/0xe8 [3.634366] kobject_add_internal+0x22c/0x3f0 [3.634370] kobject_add+0xe4/0x118 [3.634374] device_add+0x200/0x870 [3.634378] _request_firmware+0x958/0xc38 [3.634381] request_firmware_into_buf+0x4c/0x70 -- Fixes: 726e41097920 ("drivers: core: Remove glue dirs from sysfs earlier") Signed-off-by: Muchun Song Reviewed-by: Mukesh Ojha --- Change in v7: 1. Update commit message. Change in v6: 1. Remove hardcoding "1 " Change in v5: 1. Revert to the v1 fix. 2. Add some comment to explain why we need do this in cleanup_glue_dir(). Change in v4: 1. Add some kerneldoc comment. 2. Remove unlock_if_glue_dir(). 3. Rename get_device_parent_locked_if_glue_dir() to get_device_parent_locked. 4. Update commit message. Change in v3: Add change log. Change in v2: Fix device_move() also. drivers/base/core.c | 53 - 1 file changed, 52 insertions(+), 1 deletion(-) diff --git a/drivers/base/core.c b/drivers/base/core.c index 4aeaa0c92bda..edc55160c5f0 100644 --- a/drivers/base/core.c +++ b/drivers/base/core.c @@ -1820,12 +1820,63 @@ static inline struct kobject *get_glue_dir(struct device *dev) */ static void cleanup_glue_dir(struct device *dev, struct kobject *glue_dir) { + unsigned int ref; + /* see if we live in a "glue"
Re: [PATCH V2 net-next 07/11] net: hns3: adds debug messages to identify eth down cause
On Sat, 2019-07-27 at 10:28 +0800, liuyonglong wrote: > On 2019/7/27 6:18, Joe Perches wrote: > > On Fri, 2019-07-26 at 22:00 +, Saeed Mahameed wrote: > > > On Fri, 2019-07-26 at 11:24 +0800, Huazhong Tan wrote: > > > > From: Yonglong Liu > > > > > > > > Some times just see the eth interface have been down/up via > > > > dmesg, but can not know why the eth down. So adds some debug > > > > messages to identify the cause for this. > > [] > > > > diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c > > > [] > > > > @@ -459,6 +459,10 @@ static int hns3_nic_net_open(struct net_device > > > > *netdev) > > > > h->ae_algo->ops->set_timer_task(priv->ae_handle, true); > > > > > > > > hns3_config_xps(priv); > > > > + > > > > + if (netif_msg_drv(h)) > > > > + netdev_info(netdev, "net open\n"); > > > > + > > > > > > to make sure this is only intended for debug, and to avoid repetition. > > > #define hns3_dbg(__dev, format, args...) \ > > > ({\ > > > if (netif_msg_drv(h)) \ > > > netdev_info(h->netdev, format, ##args); \ > > > }) > > > > netif_dbg(h, drv, h->netdev, "net open\n") > > > > Hi, Saeed && Joe: > For our cases, maybe netif_info() can be use for HNS3 drivers? > netif_dbg need to open dynamic debug options additional. Your code, your choice. I do think littering dmesg with "net open" style messages and such may be unnecessary. KERN_DEBUG seems a more appropriate log level.
Re: [GIT PULL] SELinux fixes for v5.3 (#1)
The pull request you sent on Fri, 26 Jul 2019 18:13:53 -0400: > git://git.kernel.org/pub/scm/linux/kernel/git/pcmoore/selinux.git > tags/selinux-pr-20190726 has been merged into torvalds/linux.git: https://git.kernel.org/torvalds/c/40233e7c447367ffc615b524187970732848d5e3 Thank you! -- Deet-doot-dot, I am a bot. https://korg.wiki.kernel.org/userdoc/prtracker
Re: [PATCH v2] .gitignore: Add compilation database file
On Wed, Jul 24, 2019 at 9:22 AM Toru Komatsu wrote: > > This file is used by clangd to use language server protocol. > It can be generated at each compile using scripts/gen_compile_commands.py. > Therefore it is different depending on the environment and should be > ignored. > > Signed-off-by: Toru Komatsu > --- > .gitignore | 3 +++ > 1 file changed, 3 insertions(+) Applied to linux-kbuild/fixes. Thanks. -- Best Regards Masahiro Yamada
[PATCH] gen_compile_commands: lower the entry count threshold
Running gen_compile_commands.py after building with allnoconfig gave this: $ ./scripts/gen_compile_commands.py WARNING: Found 449 entries. Have you compiled the kernel? Signed-off-by: Masahiro Yamada --- scripts/gen_compile_commands.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/scripts/gen_compile_commands.py b/scripts/gen_compile_commands.py index 7915823b92a5..c458696ef3a7 100755 --- a/scripts/gen_compile_commands.py +++ b/scripts/gen_compile_commands.py @@ -21,9 +21,9 @@ _LINE_PATTERN = r'^cmd_[^ ]*\.o := (.* )([^ ]*\.c)$' _VALID_LOG_LEVELS = ['DEBUG', 'INFO', 'WARNING', 'ERROR', 'CRITICAL'] # A kernel build generally has over 2000 entries in its compile_commands.json -# database. If this code finds 500 or fewer, then warn the user that they might +# database. If this code finds 300 or fewer, then warn the user that they might # not have all the .cmd files, and they might need to compile the kernel. -_LOW_COUNT_THRESHOLD = 500 +_LOW_COUNT_THRESHOLD = 300 def parse_arguments(): -- 2.17.1
Re: [PATCH] ext4: Fix deadlock on page reclaim
On 2019/07/27 7:55, Theodore Y. Ts'o wrote: > On Sat, Jul 27, 2019 at 08:44:23AM +1000, Dave Chinner wrote: >>> >>> This looks like something that could hit every file systems, so >>> shouldn't we fix this in common code? We could also look into >>> just using memalloc_nofs_save for the page cache allocation path >>> instead of the per-mapping gfp_mask. >> >> I think it has to be the entire IO path - any allocation from the >> underlying filesystem could recurse into the top level filesystem >> and then deadlock if the memory reclaim submits IO or blocks on >> IO completion from the upper filesystem. That's a bloody big hammer >> for something that is only necessary when there are stacked >> filesystems like this > > Yeah that's why using memalloc_nofs_save() probably makes the most > sense, and dm_zoned should use that before it calls into ext4. Unfortunately, with this particular setup, that will not solve the problem. dm-zoned submit BIOs to its backend drive in response to XFS activity. The requests for these BIOs are passed along to the kernel tcmu HBA and end up in that HBA command ring. The commands themselves are read from the ring and executed by the tcmu-runner user process which executes them doing pread()/pwrite() to the ext4 file. The tcmu-runner process being a different context than the dm-zoned worker thread issuing the BIO, memalloc_nofs_save/restore() calls in dm-zoned will have no effect. We tried a simpler setup using loopback mount (XFS used directly in an ext4 file) and running the same workload. We failed to recreate a similar deadlock in this case, but I am strongly suspecting that it can happen too. It is simply much harder to hit because the IO path from XFS to ext4 is all in-kernel and asynchronous, whereas tcmu-runner ZBC handler is a synchronous QD=1 path for IOs which makes it relatively easy to get inter-dependent writes or read+write queued back-to-back and create the deadlock. 
So back to Dave's point, we may be needing the big-hammer solution in the case of stacked file systems, while a non-stack setups do not necessarily need it (that is for the FS to decide). But I do not see how to implement this big hammer conditionally. How can a file system tell if it is at the top of the stack (big hammer not needed) or lower than the top level (big hammer needed) ? One simple hack would be an fcntl() or mount option to tell the FS to use GFP_NOFS unconditionally, but avoiding the bug would mean making sure that the applications or system setup is correct. So not so safe. -- Damien Le Moal Western Digital Research
Re: memory leak in kobject_set_name_vargs (2)
> On Jul 26, 2019, at 10:29 PM, Linus Torvalds > wrote: > > On Fri, Jul 26, 2019 at 4:26 PM syzbot > wrote: >> >> syzbot has bisected this bug to: >> >> commit 0e034f5c4bc408c943f9c4a06244415d75d7108c >> Author: Linus Torvalds >> Date: Wed May 18 18:51:25 2016 + >> >> iwlwifi: fix mis-merge that breaks the driver > > While this bisection looks more likely than the other syzbot entry > that bisected to a version change, I don't think it is correct eitger. > > The bisection ended up doing a lot of "git bisect skip" because of the > >undefined reference to `nf_nat_icmp_reply_translation' > > issue. Also, the memory leak doesn't seem to be entirely reliable: > when the bisect does 10 runs to verify that some test kernel is bad, > there are a couple of cases where only one or two of the ten run > failed. > > Which makes me wonder if one or two of the "everything OK" runs were > actually buggy, but just happened to have all ten pass… Real bisection should point to, 8ed633b9baf9e (“Revert "net-sysfs: Fix memory leak in netdev_register_kobject”") I did encounter those memory leak and comes up with a similar fix in, 6b70fc94afd1 ("net-sysfs: Fix memory leak in netdev_register_kobject”) but those error handling paths are tricky that seems nobody did much testing there, so it will keep hitting other bugs in upper functions.
Re: WARNING in ovl_real_fdget_meta
Hello, syzbot tried to test the proposed patch but build/boot failed: vmalloc) [6.623186][T1] TCP established hash table entries: 65536 (order: 7, 524288 bytes, vmalloc) [6.629001][T1] TCP bind hash table entries: 65536 (order: 10, 4194304 bytes, vmalloc) [6.633571][T1] TCP: Hash tables configured (established 65536 bind 65536) [6.635510][T1] UDP hash table entries: 4096 (order: 7, 655360 bytes, vmalloc) [6.637367][T1] UDP-Lite hash table entries: 4096 (order: 7, 655360 bytes, vmalloc) [6.639861][T1] NET: Registered protocol family 1 [6.642372][T1] RPC: Registered named UNIX socket transport module. [6.643458][T1] RPC: Registered udp transport module. [6.644319][T1] RPC: Registered tcp transport module. [6.645199][T1] RPC: Registered tcp NFSv4.1 backchannel transport module. [6.647753][T1] NET: Registered protocol family 44 [6.648732][T1] pci :00:00.0: Limiting direct PCI/PCI transfers [6.649837][T1] PCI: CLS 0 bytes, default 64 [6.654238][T1] PCI-DMA: Using software bounce buffering for IO (SWIOTLB) [6.655433][T1] software IO TLB: mapped [mem 0xaa80-0xae80] (64MB) [6.660080][T1] RAPL PMU: API unit is 2^-32 Joules, 0 fixed counters, 10737418240 ms ovfl timer [6.663698][T1] kvm: already loaded the other module [6.664750][T1] clocksource: tsc: mask: 0x max_cycles: 0x212735223b2, max_idle_ns: 440795277976 ns [6.666833][T1] clocksource: Switched to clocksource tsc [6.667884][T1] mce: Machine check injector initialized [6.672842][T1] check: Scanning for low memory corruption every 60 seconds [6.784695][T1] Initialise system trusted keyrings [6.786453][T1] workingset: timestamp_bits=40 max_order=21 bucket_order=0 [6.788062][T1] zbud: loaded [6.793680][T1] DLM installed [6.795747][T1] squashfs: version 4.0 (2009/01/31) Phillip Lougher [6.799822][T1] FS-Cache: Netfs 'nfs' registered for caching [6.802062][T1] NFS: Registering the id_resolver key type [6.803162][T1] Key type id_resolver registered [6.804299][T1] Key type id_legacy registered [6.805300][T1] 
nfs4filelayout_init: NFSv4 File Layout Driver Registering... [6.806905][T1] Installing knfsd (copyright (C) 1996 o...@monad.swb.de). [6.811461][T1] ntfs: driver 2.1.32 [Flags: R/W]. [6.813297][T1] fuse: init (API version 7.31) [6.816259][T1] JFS: nTxBlock = 8192, nTxLock = 65536 [6.826202][T1] SGI XFS with ACLs, security attributes, realtime, no debug enabled [6.832172][T1] 9p: Installing v9fs 9p2000 file system support [6.833515][T1] FS-Cache: Netfs '9p' registered for caching [6.838070][T1] gfs2: GFS2 installed [6.841163][T1] FS-Cache: Netfs 'ceph' registered for caching [6.842969][T1] ceph: loaded (mds proto 32) [6.850819][T1] NET: Registered protocol family 38 [6.852584][T1] async_tx: api initialized (async) [6.853585][T1] Key type asymmetric registered [6.854272][T1] Asymmetric key parser 'x509' registered [6.855126][T1] Asymmetric key parser 'pkcs8' registered [6.855903][T1] Key type pkcs7_test registered [6.856598][T1] Asymmetric key parser 'tpm_parser' registered [6.857618][T1] Block layer SCSI generic (bsg) driver version 0.4 loaded (major 246) [6.859381][T1] io scheduler mq-deadline registered [6.860444][T1] io scheduler kyber registered [6.861501][T1] io scheduler bfq registered [6.866618][T1] input: Power Button as /devices/LNXSYSTM:00/LNXPWRBN:00/input/input0 [6.869055][T1] ACPI: Power Button [PWRF] [6.870629][T1] input: Sleep Button as /devices/LNXSYSTM:00/LNXSLPBN:00/input/input1 [6.872202][T1] ACPI: Sleep Button [SLPF] [6.877520][T1] ioatdma: Intel(R) QuickData Technology Driver 5.00 [6.889497][T1] PCI Interrupt Link [LNKC] enabled at IRQ 11 [6.890599][T1] virtio-pci :00:03.0: virtio_pci: leaving for legacy driver [6.903444][T1] PCI Interrupt Link [LNKD] enabled at IRQ 10 [6.904470][T1] virtio-pci :00:04.0: virtio_pci: leaving for legacy driver [7.39][T1] HDLC line discipline maxframe=4096 [7.223063][T1] N_HDLC line discipline registered. 
[7.223876][T1] Serial: 8250/16550 driver, 4 ports, IRQ sharing enabled [7.247483][T1] 00:03: ttyS0 at I/O 0x3f8 (irq = 4, base_baud = 115200) is a 16550A [7.273815][T1] 00:04: ttyS1 at I/O 0x2f8 (irq = 3, base_baud = 115200) is a 16550A [7.299513][T1] 00:05: ttyS2 at I/O 0x3e8 (irq = 6, base_baud = 115200) is a 16550A [7.325004][T1] 00:06: ttyS3 at I/O 0x2e8 (irq = 7, base_baud = 115200) is a 16550A [7.335983][T1] Non-volatile memory driver v1.3 [7.337472][
Re: [PATCH 4.19 00/50] 4.19.62-stable review
On 7/26/19 9:24 AM, Greg Kroah-Hartman wrote: This is the start of the stable review cycle for the 4.19.62 release. There are 50 patches in this series, all will be posted as a response to this one. If anyone has any issues with these being applied, please let me know. Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC. Anything received after that time might be too late. The whole patch series can be found in one patch at: https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-4.19.62-rc1.gz or in the git tree and branch at: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-4.19.y and the diffstat can be found below. thanks, greg k-h Compiled and booted on my test system. No dmesg regressions. thanks, -- Shuah
Re: [PATCH 02/10] mm/page_alloc: use unsigned int for "order" in __rmqueue_fallback()
On Fri, Jul 26, 2019 at 5:36 PM Rasmus Villemoes wrote: > > On 25/07/2019 20.42, Pengfei Li wrote: > > Because "order" will never be negative in __rmqueue_fallback(), > > so just make "order" unsigned int. > > And modify trace_mm_page_alloc_extfrag() accordingly. > > > > > diff --git a/mm/page_alloc.c b/mm/page_alloc.c > > index 75c18f4fd66a..1432cbcd87cd 100644 > > --- a/mm/page_alloc.c > > +++ b/mm/page_alloc.c > > @@ -2631,8 +2631,8 @@ static bool unreserve_highatomic_pageblock(const > > struct alloc_context *ac, > > * condition simpler. > > */ > > static __always_inline bool > > -__rmqueue_fallback(struct zone *zone, int order, int start_migratetype, > > - unsigned int alloc_flags) > > +__rmqueue_fallback(struct zone *zone, unsigned int order, > > + int start_migratetype, unsigned int alloc_flags) > > { > > Please read the last paragraph of the comment above this function, run > git blame to figure out when that was introduced, and then read the full > commit description. Thanks for your comments. I have read the commit info of commit b002529d2563 ("mm/page_alloc.c: eliminate unsigned confusion in __rmqueue_fallback"). And I looked at the discussion at https://lkml.org/lkml/2017/6/21/684 in detail. > Here be dragons. At the very least, this patch is > wrong in that it makes that comment inaccurate. I wonder if you noticed the commit 6bb154504f8b ("mm, page_alloc: spread allocations across zones before introducing fragmentation"). Commit 6bb154504f8b introduces a local variable min_order in __rmqueue_fallback(). And you can see for (current_order = MAX_ORDER - 1; current_order >= min_order; --current_order) { The “current_order” and "min_order" are int, so here is ok. Since __rmqueue_fallback() is only called by __rmqueue() and "order" is unsigned int in __rmqueue(), then I think that making "order" is also unsigned int is good. Maybe I should also modify the comments here? > > Rasmus Thank you again for your review. -- Pengfei
Re: [PATCH 5.1 00/62] 5.1.21-stable review
On 7/26/19 9:24 AM, Greg Kroah-Hartman wrote: Note, this will be the LAST 5.1.y kernel release. Everyone should move to the 5.2.y series at this point in time. This is the start of the stable review cycle for the 5.1.21 release. There are 62 patches in this series, all will be posted as a response to this one. If anyone has any issues with these being applied, please let me know. Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC. Anything received after that time might be too late. The whole patch series can be found in one patch at: https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.1.21-rc1.gz or in the git tree and branch at: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.1.y and the diffstat can be found below. thanks, greg k-h Compiled and booted on my test system. No dmesg regressions. thanks, -- Shuah
Re: [PATCH 5.2 00/66] 5.2.4-stable review
On 7/26/19 9:23 AM, Greg Kroah-Hartman wrote: This is the start of the stable review cycle for the 5.2.4 release. There are 66 patches in this series, all will be posted as a response to this one. If anyone has any issues with these being applied, please let me know. Responses should be made by Sun 28 Jul 2019 03:21:13 PM UTC. Anything received after that time might be too late. The whole patch series can be found in one patch at: https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.4-rc1.gz or in the git tree and branch at: git://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git linux-5.2.y and the diffstat can be found below. thanks, greg k-h Compiled and booted on my test system. No dmesg regressions, thanks, -- Shuah
Re: memory leak in kobject_set_name_vargs (2)
On Fri, Jul 26, 2019 at 4:26 PM syzbot wrote: > > syzbot has bisected this bug to: > > commit 0e034f5c4bc408c943f9c4a06244415d75d7108c > Author: Linus Torvalds > Date: Wed May 18 18:51:25 2016 +0000 > > iwlwifi: fix mis-merge that breaks the driver While this bisection looks more likely than the other syzbot entry that bisected to a version change, I don't think it is correct either. The bisection ended up doing a lot of "git bisect skip" because of the undefined reference to `nf_nat_icmp_reply_translation' issue. Also, the memory leak doesn't seem to be entirely reliable: when the bisect does 10 runs to verify that some test kernel is bad, there are a couple of cases where only one or two of the ten runs failed. Which makes me wonder if one or two of the "everything OK" runs were actually buggy, but just happened to have all ten pass... Linus
Re: Regression in 5.3 for some FS_USERNS_MOUNT (aka user-namespace-mountable) filesystems
On Fri, Jul 26, 2019 at 07:46:18PM -0500, Eric W. Biederman wrote: > If someone had bothered to actually look at how I was proposing to clean > things up before the new mount api we would already have that. Sigh. > > You should be able to get away with something like this which moves the > checks earlier and makes things clearer. My old patch against the pre > new mount api code. Check your instances of ->permission(); AFAICS in all cases it's (in current terms) return ns_capable(fc->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM; In principle I like killing FS_USERNS_MOUNT flag, but when a method is always either NULL or exact same function...
Re: [PATCH V2 net-next 07/11] net: hns3: adds debug messages to identify eth down cause
On 2019/7/27 6:18, Joe Perches wrote: > On Fri, 2019-07-26 at 22:00 +, Saeed Mahameed wrote: >> On Fri, 2019-07-26 at 11:24 +0800, Huazhong Tan wrote: >>> From: Yonglong Liu >>> >>> Some times just see the eth interface have been down/up via >>> dmesg, but can not know why the eth down. So adds some debug >>> messages to identify the cause for this. > [] >>> diff --git a/drivers/net/ethernet/hisilicon/hns3/hns3_enet.c >> [] >>> @@ -459,6 +459,10 @@ static int hns3_nic_net_open(struct net_device >>> *netdev) >>> h->ae_algo->ops->set_timer_task(priv->ae_handle, true); >>> >>> hns3_config_xps(priv); >>> + >>> + if (netif_msg_drv(h)) >>> + netdev_info(netdev, "net open\n"); >>> + >> >> to make sure this is only intended for debug, and to avoid repetition. >> #define hns3_dbg(__dev, format, args...) \ >> ({ \ >> if (netif_msg_drv(h)) \ >> netdev_info(h->netdev, format, ##args); \ >> }) > > netif_dbg(h, drv, h->netdev, "net open\n") > Hi, Saeed && Joe: For our cases, maybe netif_info() can be use for HNS3 drivers? netif_dbg need to open dynamic debug options additional.
Re: Regression in 5.3 for some FS_USERNS_MOUNT (aka user-namespace-mountable) filesystems
On Sat, Jul 27, 2019 at 12:22:20AM +0100, Al Viro wrote: > On Fri, Jul 26, 2019 at 03:47:02PM -0700, Linus Torvalds wrote: > > > Of course, then later on, commit 20284ab7427f ("switch mount_capable() > > to fs_context") drops that argument entirely, and hardcodes the > > decision to look at fc->global. > > > > But that fc->global decision wasn't there originally, and is incorrect > > since it breaks existing users. > > > > What gets much more confusing about this is that the two different > > users then moved around. The sget_userns() case got moved to > > legacy_get_tree(), and then joined together in vfs_get_tree(), and > > then split and moved out to do_new_mount() and vfs_fsconfig_locked(). > > > > And that "joined together into vfs_get_tree()" must be wrong, because > > the two cases used two different namespace rules. The sget_userns() > > case *did* have that "global" flag check, while the sget_fc() did not. > > > > Messy. Al? > > Digging through that mess... It's my fuckup, and we obviously need to > restore the old behaviour, but I really hope to manage that with > checks _not_ in superblock allocator ;-/ It shouldn't have looked at fc->global for those checks. In any cases. sget_fc() should indeed have been passing fc->user_ns, not userns. And as for sget_userns(), by the time of 20284ab7427f its checks had been moved to legacy_get_tree(). In form of if (!mount_capable(fc->fs_type, fc->user_ns)) as it bloody well ought to. So the first mistake (wrong argument passed to mount_capable() by sget_fc() in 0ce0cf12fc4c) has been completed by 20284ab7427f - that conversion was, actually, an equivalent transformation (callers of legacy_get_tree() never have fc->global set, so it's all the same). However, the bug introduced in the earlier commit was now spelled out in mount_capable() itself. IOW, the minimal fix should be as below. 
In principle, I'm not against Eric's "add a method instead of setting FS_USERNS_MOUNT", but note that in *all* cases the instances of his method end up being equivalent to return ns_capable(fc->user_ns, CAP_SYS_ADMIN) ? 0 : -EPERM; Anyway, AFAICS the regression fix should be simply this: Unbreak mount_capable() In "consolidate the capability checks in sget_{fc,userns}()" the wrong argument had been passed to mount_capable() by sget_fc(). That mistake had been further obscured later, when switching mount_capable() to fs_context has moved the calculation of bogus argument from sget_fc() to mount_capable() itself. It should've been fc->user_ns all along. Screwed-up-by: Al Viro Reported-by: Christian Brauner Signed-off-by: Al Viro --- diff --git a/fs/super.c b/fs/super.c index 113c58f19425..5960578a4076 100644 --- a/fs/super.c +++ b/fs/super.c @@ -478,13 +478,10 @@ EXPORT_SYMBOL(generic_shutdown_super); bool mount_capable(struct fs_context *fc) { - struct user_namespace *user_ns = fc->global ? &init_user_ns - : fc->user_ns; - if (!(fc->fs_type->fs_flags & FS_USERNS_MOUNT)) return capable(CAP_SYS_ADMIN); else - return ns_capable(user_ns, CAP_SYS_ADMIN); + return ns_capable(fc->user_ns, CAP_SYS_ADMIN); } /**
RE: [PATCH 3/4] RISC-V: Support case insensitive ISA string parsing.
> -Original Message- > From: Paul Walmsley > Sent: Saturday, July 27, 2019 5:00 AM > To: Atish Patra > Cc: linux-kernel@vger.kernel.org; Alan Kao ; > Albert Ou ; Allison Randal ; > Anup Patel ; Daniel Lezcano > ; Greg Kroah-Hartman > ; Johan Hovold ; linux- > ri...@lists.infradead.org; Palmer Dabbelt ; Thomas > Gleixner > Subject: Re: [PATCH 3/4] RISC-V: Support case insensitive ISA string parsing. > > On Fri, 26 Jul 2019, Atish Patra wrote: > > > On 7/26/19 1:47 PM, Paul Walmsley wrote: > > > On Fri, 26 Jul 2019, Atish Patra wrote: > > > > > > > As per riscv specification, ISA naming strings are case > > > > insensitive. However, currently only lower case strings are parsed > > > > during cpu procfs. > > > > > > > > Support parsing of upper case letters as well. > > > > > > > > Signed-off-by: Atish Patra > > > > > > Is there a use case that's driving this, or > > > > Currently, we use all lower case isa string in kvmtool. But somebody > > can have uppercase letters in future as spec allows it. > > > > > > can we just say, "use > > > lowercase letters" and leave it at that? > > > > > > > In that case, it will not comply with RISC-V spec. Is that okay ? > > I think that section of the specification is mostly concerned with someone > trying to define "f" as a different extension than "F", or something like > that. > I'm not sure that it imposes any constraint that software must accept both > upper and lower case ISA strings. > > What gives me pause here is that this winds up impacting DT schema > validation: > > https://git.kernel.org/pub/scm/linux/kernel/git/torvalds/linux.git/tree/Docu > mentation/devicetree/bindings/riscv/cpus.yaml#n41 If 'f' and 'F' mean same extension as-per RISC-V spec then software should also interpret it that way hence this patch. Regards, Anup
Re: [PATCH v3 2/3] augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro
On Fri, Jul 26, 2019 at 06:44:19PM -0700, Andrew Morton wrote: > On Mon, 8 Jul 2019 05:24:09 -0700 Michel Lespinasse wrote: > > > Syncing up with v5.2, I see that there is a new use for augmented > > rbtrees in mm/vmalloc.c which does not compile after applying my > > patchset. > > > > It's an easy fix though: > > It still doesn't build. > > lib/rbtree_test.c: In function check_augmented: > lib/rbtree_test.c:225:35: error: implicit declaration of function > augment_recompute [-Werror=implicit-function-declaration] >WARN_ON_ONCE(node->augmented != augment_recompute(node)); grumpf, sorry about that. I thought I had rbtree_test enabled in my build, but turned out I only had interval_tree_test :/ I would suggest the following fix, which reintroduces the code to compute node->augmented as was previously done in augment_recompute(): --- 8< After introducing RB_DECLARE_CALLBACKS_MAX, we do not need the augment_recompute function to recompute node->augmented during rbtree rebalancing callbacks. However, this function was also used in check_augmented() to verify that node->augmented was correctly set, so we need to reintroduce the code for that check. 
Signed-off-by: Michel Lespinasse --- lib/rbtree_test.c | 15 ++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/lib/rbtree_test.c b/lib/rbtree_test.c index 1939419ba869..41ae3c7570d3 100644 --- a/lib/rbtree_test.c +++ b/lib/rbtree_test.c @@ -222,7 +222,20 @@ static void check_augmented(int nr_nodes) check(nr_nodes); for (rb = rb_first(&test_root); rb; rb = rb_next(rb)) { struct test_node *node = rb_entry(rb, struct test_node, rb); - WARN_ON_ONCE(node->augmented != augment_recompute(node)); + u32 subtree, max = node->val; + if (node->rb.rb_left) { + subtree = rb_entry(node->rb.rb_left, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + if (node->rb.rb_right) { + subtree = rb_entry(node->rb.rb_right, struct test_node, + rb)->augmented; + if (max < subtree) + max = subtree; + } + WARN_ON_ONCE(node->augmented != max); } } -- Michel "Walken" Lespinasse A program is never fully debugged until the last user dies.
Re: [PATCH 4.19 00/50] 4.19.62-stable review
stable-rc/linux-4.19.y boot: 118 boots: 1 failed, 77 passed with 40 offline (v4.19.61-51-g213a5f3ac1f5) Full Boot Summary: https://kernelci.org/boot/all/job/stable-rc/branch/linux-4.19.y/kernel/v4.19.61-51-g213a5f3ac1f5/ Full Build Summary: https://kernelci.org/build/stable-rc/branch/linux-4.19.y/kernel/v4.19.61-51-g213a5f3ac1f5/ Tree: stable-rc Branch: linux-4.19.y Git Describe: v4.19.61-51-g213a5f3ac1f5 Git Commit: 213a5f3ac1f5e2af0e25fd4b26497590ec290be0 Git URL: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git Tested: 68 unique boards, 27 SoC families, 17 builds out of 206 Boot Failure Detected: arc: hsdk_defconfig: gcc-8: hsdk: 1 failed lab Offline Platforms: arm64: defconfig: gcc-8 meson-axg-s400: 1 offline lab meson-gxbb-odroidc2: 1 offline lab meson-gxl-s905d-p230: 1 offline lab meson-gxl-s905x-libretech-cc: 1 offline lab meson-gxl-s905x-nexbox-a95x: 1 offline lab meson-gxl-s905x-p212: 1 offline lab meson-gxm-nexbox-a1: 1 offline lab rk3399-firefly: 1 offline lab sun50i-a64-pine64-plus: 1 offline lab mips: pistachio_defconfig: gcc-8 pistachio_marduk: 1 offline lab arm: exynos_defconfig: gcc-8 exynos5250-arndale: 1 offline lab exynos5420-arndale-octa: 1 offline lab exynos5800-peach-pi: 1 offline lab multi_v7_defconfig: gcc-8 exynos5250-arndale: 1 offline lab exynos5420-arndale-octa: 1 offline lab exynos5800-peach-pi: 1 offline lab imx6dl-wandboard_dual: 1 offline lab imx6dl-wandboard_solo: 1 offline lab imx6q-wandboard: 1 offline lab imx7s-warp: 1 offline lab meson8b-odroidc1: 1 offline lab omap3-beagle: 1 offline lab omap4-panda: 1 offline lab qcom-apq8064-ifc6410: 1 offline lab stih410-b2120: 1 offline lab sun4i-a10-cubieboard: 1 offline lab sun7i-a20-bananapi: 1 offline lab vf610-colibri-eval-v3: 1 offline lab omap2plus_defconfig: gcc-8 omap3-beagle: 1 offline lab omap4-panda: 1 offline lab qcom_defconfig: gcc-8 qcom-apq8064-ifc6410: 1 offline lab davinci_all_defconfig: gcc-8 da850-evm: 1 offline lab dm365evm,legacy: 1 offline 
lab imx_v6_v7_defconfig: gcc-8 imx6dl-wandboard_dual: 1 offline lab imx6dl-wandboard_solo: 1 offline lab imx6q-wandboard: 1 offline lab imx7s-warp: 1 offline lab vf610-colibri-eval-v3: 1 offline lab sunxi_defconfig: gcc-8 sun4i-a10-cubieboard: 1 offline lab sun7i-a20-bananapi: 1 offline lab --- For more info write to
Re: [PATCH 5.2 00/66] 5.2.4-stable review
stable-rc/linux-5.2.y boot: 129 boots: 1 failed, 83 passed with 45 offline (v5.2.3-67-gd61e440a1852) Full Boot Summary: https://kernelci.org/boot/all/job/stable-rc/branch/linux-5.2.y/kernel/v5.2.3-67-gd61e440a1852/ Full Build Summary: https://kernelci.org/build/stable-rc/branch/linux-5.2.y/kernel/v5.2.3-67-gd61e440a1852/ Tree: stable-rc Branch: linux-5.2.y Git Describe: v5.2.3-67-gd61e440a1852 Git Commit: d61e440a1852a64d8a2d0d358b9582b19157e039 Git URL: https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git Tested: 76 unique boards, 28 SoC families, 17 builds out of 209 Boot Failure Detected: arm: omap2plus_defconfig: gcc-8: omap4-panda: 1 failed lab Offline Platforms: riscv: defconfig: gcc-8 sifive_fu540: 1 offline lab arm64: defconfig: gcc-8 meson-axg-s400: 1 offline lab meson-g12a-u200: 1 offline lab meson-g12a-x96-max: 1 offline lab meson-gxbb-odroidc2: 1 offline lab meson-gxl-s905d-p230: 1 offline lab meson-gxl-s905x-libretech-cc: 1 offline lab meson-gxl-s905x-nexbox-a95x: 1 offline lab meson-gxl-s905x-p212: 1 offline lab meson-gxm-nexbox-a1: 1 offline lab rk3399-firefly: 1 offline lab sun50i-a64-pine64-plus: 1 offline lab mips: pistachio_defconfig: gcc-8 pistachio_marduk: 1 offline lab arm: exynos_defconfig: gcc-8 exynos5250-arndale: 1 offline lab exynos5420-arndale-octa: 1 offline lab exynos5800-peach-pi: 1 offline lab multi_v7_defconfig: gcc-8 bcm72521-bcm97252sffe: 1 offline lab bcm7445-bcm97445c: 1 offline lab exynos5250-arndale: 1 offline lab exynos5420-arndale-octa: 1 offline lab exynos5800-peach-pi: 1 offline lab imx6dl-wandboard_dual: 1 offline lab imx6dl-wandboard_solo: 1 offline lab imx6q-wandboard: 1 offline lab imx7s-warp: 1 offline lab meson8b-odroidc1: 1 offline lab omap3-beagle: 1 offline lab omap4-panda: 1 offline lab qcom-apq8064-ifc6410: 1 offline lab stih410-b2120: 1 offline lab sun4i-a10-cubieboard: 1 offline lab sun7i-a20-bananapi: 1 offline lab vf610-colibri-eval-v3: 1 offline lab omap2plus_defconfig: gcc-8 
omap3-beagle: 1 offline lab omap4-panda: 1 offline lab qcom_defconfig: gcc-8 qcom-apq8064-ifc6410: 1 offline lab davinci_all_defconfig: gcc-8 da850-evm: 1 offline lab dm365evm,legacy: 1 offline lab imx_v6_v7_defconfig: gcc-8 imx6dl-wandboard_dual: 1 offline lab imx6dl-wandboard_solo: 1 offline lab imx6q-wandboard: 1 offline lab imx7s-warp: 1 offline lab vf610-colibri-eval-v3: 1 offline lab sunxi_defconfig: gcc-8 sun4i-a10-cubieboard: 1 offline lab sun7i-a20-bananapi: 1 offline lab --- For more info write to
mmotm 2019-07-26-19-00 uploaded
The mm-of-the-moment snapshot 2019-07-26-19-00 has been uploaded to http://www.ozlabs.org/~akpm/mmotm/ mmotm-readme.txt says README for mm-of-the-moment: http://www.ozlabs.org/~akpm/mmotm/ This is a snapshot of my -mm patch queue. Uploaded at random hopefully more than once a week. You will need quilt to apply these patches to the latest Linus release (5.x or 5.x-rcY). The series file is in broken-out.tar.gz and is duplicated in http://ozlabs.org/~akpm/mmotm/series The file broken-out.tar.gz contains two datestamp files: .DATE and .DATE-yyyy-mm-dd-hh-mm-ss. Both contain the string yyyy-mm-dd-hh-mm-ss, followed by the base kernel version against which this patch series is to be applied. This tree is partially included in linux-next. To see which patches are included in linux-next, consult the `series' file. Only the patches within the #NEXT_PATCHES_START/#NEXT_PATCHES_END markers are included in linux-next. A full copy of the full kernel tree with the linux-next and mmotm patches already applied is available through git within an hour of the mmotm release. Individual mmotm releases are tagged. The master branch always points to the latest release, so it's constantly rebasing. http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/ The directory http://www.ozlabs.org/~akpm/mmots/ (mm-of-the-second) contains daily snapshots of the -mm tree. It is updated more frequently than mmotm, and is untested. A git copy of this tree is available at http://git.cmpxchg.org/cgit.cgi/linux-mmots.git/ and use of this tree is similar to http://git.cmpxchg.org/cgit.cgi/linux-mmotm.git/, described above.
This mmotm tree contains the following patches against 5.3-rc1: (patches marked "*" will be included in linux-next) origin.patch * docs-signal-fix-a-kernel-doc-markup.patch * revert-kmemleak-allow-to-coexist-with-fault-injection.patch * ocfs2-remove-set-but-not-used-variable-last_hash.patch * mm-vmscan-check-if-mem-cgroup-is-disabled-or-not-before-calling-memcg-slab-shrinker.patch * mm-migrate-fix-reference-check-race-between-__find_get_block-and-migration.patch * mm-compaction-avoid-100%-cpu-usage-during-compaction-when-a-task-is-killed.patch * kasan-remove-clang-version-check-for-kasan_stack.patch * ubsan-build-ubsanc-more-conservatively.patch * page-flags-prioritize-kasan-bits-over-last-cpuid.patch * page-flags-prioritize-kasan-bits-over-last-cpuid-fix.patch * coredump-split-pipe-command-whitespace-before-expanding-template.patch * mm-migrate-initialize-pud_entry-in-migrate_vma.patch * mm-hotplug-remove-unneeded-return-for-void-function.patch * cgroup-kselftest-relax-fs_spec-checks.patch * asm-generic-fix-wtype-limits-compiler-warnings.patch * asm-generic-fix-wtype-limits-compiler-warnings-fix.patch * asm-generic-fix-wtype-limits-compiler-warnings-v2.patch * test_meminit-use-gfp_atomic-in-rcu-critical-section.patch * proc-kpageflags-prevent-an-integer-overflow-in-stable_page_flags.patch * proc-kpageflags-do-not-use-uninitialized-struct-pages.patch * mm-document-zone-device-struct-page-field-usage.patch * mm-hmm-fix-zone_device-anon-page-mapping-reuse.patch * mm-hmm-fix-bad-subpage-pointer-in-try_to_unmap_one.patch * mm-hmm-fix-bad-subpage-pointer-in-try_to_unmap_one-v3.patch * acpi-scan-acquire-device_hotplug_lock-in-acpi_scan_init.patch * mm-mempolicy-make-the-behavior-consistent-when-mpol_mf_move-and-mpol_mf_strict-were-specified.patch * mm-mempolicy-make-the-behavior-consistent-when-mpol_mf_move-and-mpol_mf_strict-were-specified-v4.patch * mm-mempolicy-handle-vma-with-unmovable-pages-mapped-correctly-in-mbind.patch * 
mm-mempolicy-handle-vma-with-unmovable-pages-mapped-correctly-in-mbind-v4.patch * mm-z3foldc-fix-z3fold_destroy_pool-ordering.patch * mm-z3foldc-fix-z3fold_destroy_pool-race-condition.patch * kbuild-clean-compressed-initramfs-image.patch * ocfs2-use-jbd2_inode-dirty-range-scoping.patch * jbd2-remove-jbd2_journal_inode_add_.patch * ocfs2-clear-zero-in-unaligned-direct-io.patch * ocfs2-clear-zero-in-unaligned-direct-io-checkpatch-fixes.patch * ocfs2-wait-for-recovering-done-after-direct-unlock-request.patch * ocfs2-checkpoint-appending-truncate-log-transaction-before-flushing.patch * ramfs-support-o_tmpfile.patch mm.patch * mm-slab-extend-slab-shrink-to-shrink-all-memcg-caches.patch * mm-slab-move-memcg_cache_params-structure-to-mm-slabh.patch * memremap-move-from-kernel-to-mm.patch * mm-page_poison-fix-a-typo-in-a-comment.patch * mm-rmapc-remove-set-but-not-used-variable-cstart.patch * mm-introduce-page_size.patch * mm-introduce-page_shift.patch * mm-introduce-page_shift-fix.patch * mm-introduce-compound_nr.patch * mm-replace-list_move_tail-with-add_page_to_lru_list_tail.patch * mm-filemap-rewrite-mapping_needs_writeback-in-less-fancy-manner.patch * mm-throttle-allocators-when-failing-reclaim-over-memoryhigh.patch * mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix.patch * mm-throttle-allocators-when-failing-reclaim-over-memoryhigh-fix-fix.patch *
Re: [PATCH] sched/core: Don't use dying mm as active_mm for kernel threads
On 7/26/19 7:45 PM, Waiman Long wrote: > It was found that a dying mm_struct where the owning task has exited can > stay on as active_mm of kernel threads as long as no other user tasks > run on those CPUs that use it as active_mm. This prolongs the life time > of dying mm holding up memory and other resources that cannot be freed. > > Fix that by forcing the kernel threads to use init_mm as the active_mm > if the previous active_mm is dying. > > Signed-off-by: Waiman Long > --- > kernel/sched/core.c | 13 +++-- > mm/init-mm.c| 2 ++ > 2 files changed, 13 insertions(+), 2 deletions(-) Sorry, I didn't realize that mm->owner depends on CONFIG_MEMCG. I will need to refresh the patch and send out v2 when I am done testing. Cheers, Longman
Re: [PATCH] isdn/gigaset: check endpoint null in gigaset_probe
On 7/26/19 9:22 PM, Paul Bolle wrote: Phong Tran schreef op vr 26-07-2019 om 20:35 [+0700]: This fixed the potential reference NULL pointer while using variable endpoint. Reported-by: syzbot+35b1c403a14f5c89e...@syzkaller.appspotmail.com Tested by syzbot: https://groups.google.com/d/msg/syzkaller-bugs/wnHG8eRNWEA/Qn2HhjNdBgAJ Signed-off-by: Phong Tran --- drivers/isdn/gigaset/usb-gigaset.c | 9 + This is now drivers/staging/isdn/gigaset/usb-gigaset.c. this patch was created base on branch kasan/usb-fuzzer-usb-testing-2019.07.11 [1] I did not notice about the driver was moved to staging. 1 file changed, 9 insertions(+) diff --git a/drivers/isdn/gigaset/usb-gigaset.c b/drivers/isdn/gigaset/usb-gigaset.c index 1b9b43659bdf..2e011f3db59e 100644 --- a/drivers/isdn/gigaset/usb-gigaset.c +++ b/drivers/isdn/gigaset/usb-gigaset.c @@ -703,6 +703,10 @@ static int gigaset_probe(struct usb_interface *interface, usb_set_intfdata(interface, cs); endpoint = >endpoint[0].desc; +if (!endpoint) { + dev_err(cs->dev, "Couldn't get control endpoint\n"); + return -ENODEV; + } When can this happen? Is this one of those bugs that one can only trigger with a specially crafted (evil) usb device? Yes, in my understanding, this only happens with random test of syzbot. buffer_size = le16_to_cpu(endpoint->wMaxPacketSize); ucs->bulk_out_size = buffer_size; @@ -722,6 +726,11 @@ static int gigaset_probe(struct usb_interface *interface, } endpoint = >endpoint[1].desc; +if (!endpoint) { + dev_err(cs->dev, "Endpoint not available\n"); + retval = -ENODEV; + goto error; + } ucs->busy = 0; Please note that I'm very close to getting cut off from the ISDN network, so the chances of being able to testi this on a live system are getting small. This bug can be invalid now. Do you agree? There is an instruction to report invalid bug to syzbot [2]. 
Thanks, Paul Bolle [1] https://github.com/google/kasan/commits/usb-fuzzer-usb-testing-2019.07.11 [2] https://github.com/google/syzkaller/blob/master/docs/syzbot.md#communication-with-syzbot Thanks, Phong
Re: [PATCH v3 2/3] augmented rbtree: add new RB_DECLARE_CALLBACKS_MAX macro
On Mon, 8 Jul 2019 05:24:09 -0700 Michel Lespinasse wrote: > Syncing up with v5.2, I see that there is a new use for augmented > rbtrees in mm/vmalloc.c which does not compile after applying my > patchset. > > It's an easy fix though: It still doesn't build. lib/rbtree_test.c: In function check_augmented: lib/rbtree_test.c:225:35: error: implicit declaration of function augment_recompute [-Werror=implicit-function-declaration] WARN_ON_ONCE(node->augmented != augment_recompute(node)); I think I'll just do this: --- a/lib/rbtree_test.c~augmented-rbtree-add-new-rb_declare_callbacks_max-macro-fix-2 +++ a/lib/rbtree_test.c @@ -220,10 +220,6 @@ static void check_augmented(int nr_nodes struct rb_node *rb; check(nr_nodes); - for (rb = rb_first(&root.rb_root); rb; rb = rb_next(rb)) { - struct test_node *node = rb_entry(rb, struct test_node, rb); - WARN_ON_ONCE(node->augmented != augment_recompute(node)); - } } static int __init rbtree_test_init(void) although there may be something we can do here to restore the lost coverage?
Re: [PATCH bpf-next v10 06/10] bpf,landlock: Add a new map type: inode
On Sun, Jul 21, 2019 at 11:31:12PM +0200, Mickaël Salaün wrote: > FIXME: 64-bits in the doc > > This new map store arbitrary values referenced by inode keys. The map > can be updated from user space with file descriptor pointing to inodes > tied to a file system. From an eBPF (Landlock) program point of view, > such a map is read-only and can only be used to retrieved a value tied > to a given inode. This is useful to recognize an inode tagged by user > space, without access right to this inode (i.e. no need to have a write > access to this inode). > > Add dedicated BPF functions to handle this type of map: > * bpf_inode_htab_map_update_elem() > * bpf_inode_htab_map_lookup_elem() > * bpf_inode_htab_map_delete_elem() > > This new map require a dedicated helper inode_map_lookup_elem() because > of the key which is a pointer to an opaque data (only provided by the > kernel). This act like a (physical or cryptographic) key, which is why > it is also not allowed to get the next key. > > Signed-off-by: Mickaël Salaün there are too many things to comment on. Let's do this patch. imo inode_map concept is interesting, but see below... > + > + /* > + * Limit number of entries in an inode map to the maximum number of > + * open files for the current process. The maximum number of file > + * references (including all inode maps) for a process is then > + * (RLIMIT_NOFILE - 1) * RLIMIT_NOFILE. If the process' RLIMIT_NOFILE > + * is 0, then any entry update is forbidden. > + * > + * An eBPF program can inherit all the inode map FD. The worse case is > + * to fill a bunch of arraymaps, create an eBPF program, close the > + * inode map FDs, and start again. The maximum number of inode map > + * entries can then be close to RLIMIT_NOFILE^3. > + */ > + if (attr->max_entries > rlimit(RLIMIT_NOFILE)) > + return -EMFILE; rlimit is checked, but no fd are consumed. Once created such inode map_fd can be passed to a different process. map_fd can be pinned into bpffs. etc. 
what the value of the check? > + > + /* decorelate UAPI from kernel API */ > + attr->key_size = sizeof(struct inode *); > + > + return htab_map_alloc_check(attr); > +} > + > +static void inode_htab_put_key(void *key) > +{ > + struct inode **inode = key; > + > + if ((*inode)->i_state & I_FREEING) > + return; checking the state without take a lock? isn't it racy? > + iput(*inode); > +} > + > +/* called from syscall or (never) from eBPF program */ > +static int map_get_next_no_key(struct bpf_map *map, void *key, void > *next_key) > +{ > + /* do not leak a file descriptor */ what this comment suppose to mean? > + return -ENOTSUPP; > +} > + > +/* must call iput(inode) after this call */ > +static struct inode *inode_from_fd(int ufd, bool check_access) > +{ > + struct inode *ret; > + struct fd f; > + int deny; > + > + f = fdget(ufd); > + if (unlikely(!f.file)) > + return ERR_PTR(-EBADF); > + /* TODO?: add this check when called from an eBPF program too (already > + * checked by the LSM parent hooks anyway) */ > + if (unlikely(IS_PRIVATE(file_inode(f.file { > + ret = ERR_PTR(-EINVAL); > + goto put_fd; > + } > + /* check if the FD is tied to a mount point */ > + /* TODO?: add this check when called from an eBPF program too */ > + if (unlikely(f.file->f_path.mnt->mnt_flags & MNT_INTERNAL)) { > + ret = ERR_PTR(-EINVAL); > + goto put_fd; > + } a bunch of TODOs do not inspire confidence. > + if (check_access) { > + /* > + * must be allowed to access attributes from this file to then > + * be able to compare an inode to its map entry > + */ > + deny = security_inode_getattr(>f_path); > + if (deny) { > + ret = ERR_PTR(deny); > + goto put_fd; > + } > + } > + ret = file_inode(f.file); > + ihold(ret); > + > +put_fd: > + fdput(f); > + return ret; > +} > + > +/* > + * The key is a FD when called from a syscall, but an inode address when > called > + * from an eBPF program. 
> + */ > + > +/* called from syscall */ > +int bpf_inode_fd_htab_map_lookup_elem(struct bpf_map *map, int *key, void > *value) > +{ > + void *ptr; > + struct inode *inode; > + int ret; > + > + /* check inode access */ > + inode = inode_from_fd(*key, true); > + if (IS_ERR(inode)) > + return PTR_ERR(inode); > + > + rcu_read_lock(); > + ptr = htab_map_lookup_elem(map, &inode); > + iput(inode); > + if (IS_ERR(ptr)) { > + ret = PTR_ERR(ptr); > + } else if (!ptr) { > + ret = -ENOENT; > + } else { > + ret = 0; > + copy_map_value(map, value, ptr); > + } > + rcu_read_unlock(); > + return ret; > +} > + > +/*
[PATCH v3 -next] staging: vc04_services: fix unused-but-set-variable warning
Fix gcc used-but-set-variable warning: drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c: In function vchiq_release_internal: drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:2827:16: warning: variable local_entity_uc set but not used [-Wunused-but-set-variable] drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:2827:6: warning: variable local_uc set but not used [-Wunused-but-set-variable] Remove the unused variables 'local_entity_uc' and 'local_uc' Reported-by: Hulk Robot Signed-off-by: YueHaibing Acked-by: Stefan Wahren --- v3: fix patch title v2: remove the unused variable --- drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c index cc4383d..b1595b1 100644 --- a/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c +++ b/drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c @@ -2824,7 +2824,6 @@ vchiq_release_internal(struct vchiq_state *state, struct vchiq_service *service) VCHIQ_STATUS_T ret = VCHIQ_SUCCESS; char entity[16]; int *entity_uc; - int local_uc, local_entity_uc; if (!arm_state) goto out; @@ -2849,8 +2848,8 @@ vchiq_release_internal(struct vchiq_state *state, struct vchiq_service *service) ret = VCHIQ_ERROR; goto unlock; } - local_uc = --arm_state->videocore_use_count; - local_entity_uc = --(*entity_uc); + --arm_state->videocore_use_count; + --(*entity_uc); if (!vchiq_videocore_wanted(state)) { if (vchiq_platform_use_suspend_timer() && -- 2.7.4
[RFC PATCH v3 2/2] printk-rb: add test module
This module does some heavy write stress testing on the ringbuffer with a reader that is checking for integrity. Signed-off-by: John Ogness --- kernel/printk/Makefile | 2 + kernel/printk/test_prb.c | 256 +++ 2 files changed, 258 insertions(+) create mode 100644 kernel/printk/test_prb.c diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 567999aa93af..24365ecee348 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -5,3 +5,5 @@ obj-$(CONFIG_PRINTK)+= ringbuffer.o obj-$(CONFIG_PRINTK) += numlist.o obj-$(CONFIG_PRINTK) += dataring.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o + +obj-m += test_prb.o diff --git a/kernel/printk/test_prb.c b/kernel/printk/test_prb.c new file mode 100644 index ..1ecb4fcbf823 --- /dev/null +++ b/kernel/printk/test_prb.c @@ -0,0 +1,256 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include +#include +#include +#include +#include +#include +#include "ringbuffer.h" + +/* + * This is a test module that starts "num_online_cpus() - 1" writer threads + * and 1 reader thread. The writer threads each write strings of varying + * length. They do this as fast as they can. + * + * The reader thread reads as fast as it can and performs sanity checks on + * the data. + * + * Because the threads are running in such tight loops, they will call + * schedule() from time to time so the system stays alive. + * + * If either the writers or the reader encounter an error, the test is + * aborted. Test results are recorded to the ftrace buffers, with some + * additional information also provided via printk. The test can be aborted + * manually by removing the module. (Ideally the test should never abort on + * its own.) 
+ */ + +struct rbdata { + int len; + char text[0]; +}; + +static char *test_running; +static int halt_test; + +static void dump_rb(struct printk_ringbuffer *rb) +{ + DECLARE_PRINTKRB_ENTRY(entry, 160); + DECLARE_PRINTKRB_ITER(iter, rb, ); + unsigned long last_seq = 0; + struct rbdata *dat; + char buf[160]; + int len; + + trace_printk("BEGIN full dump\n"); + + prb_for_each_entry_continue(, len) { + if (entry.seq - last_seq != 1) { + trace_printk("LOST %lu\n", +entry.seq - (last_seq + 1)); + } + last_seq = entry.seq; + + dat = (struct rbdata *)[0]; + + snprintf(buf, sizeof(buf), "%s", dat->text); + buf[sizeof(buf) - 1] = 0; + trace_printk("seq=%lu len=%d textlen=%d dataval=%s\n", +entry.seq, len, dat->len, buf); + } + + trace_printk("END full dump\n"); +} + +DECLARE_PRINTKRB(test_rb, 7, 5); + +static int prbtest_writer(void *data) +{ + unsigned long num = (unsigned long)data; + struct prb_reserved_entry e; + char id = 'A' + num; + struct rbdata *dat; + int count = 0; + int len; + + pr_err("prbtest: start thread %lu (writer)\n", num); + + for (;;) { + len = sizeof(struct rbdata) + (prandom_u32() & 0x7f) + 2; + + dat = (struct rbdata *)prb_reserve(, _rb, len); + if (!IS_ERR(dat)) { + len -= sizeof(struct rbdata) + 1; + memset(>text[0], id, len); + dat->text[len] = 0; + dat->len = len; + prb_commit(); + } else { + WRITE_ONCE(halt_test, 1); + trace_printk("writer%lu (%c) reserve failed (%ld)\n", +num, id, PTR_ERR(dat)); + } + + if ((count++ & 0x3fff) == 0) + schedule(); + + if (READ_ONCE(halt_test) == 1) + break; + } + + pr_err("prbtest: end thread %lu (writer)\n", num); + + test_running[num] = 0; + + return 0; +} + +static int prbtest_reader(void *data) +{ + unsigned long num = (unsigned long)data; + DECLARE_PRINTKRB_ENTRY(entry, 160); + DECLARE_PRINTKRB_ITER(iter, _rb, ); + unsigned long total_lost = 0; + unsigned long last_seq = 0; + unsigned long max_lost = 0; + unsigned long count = 0; + struct rbdata *dat; + int did_sched = 1; + int len; + + pr_err("prbtest: start 
thread %lu (reader)\n", num); + + for (;;) { + prb_for_each_entry_continue(, len) { + if (entry.seq < last_seq) { + WRITE_ONCE(halt_test, 1); + trace_printk( + "reader%lu invalid seq %lu -> %lu\n", + num, last_seq, entry.seq); + goto out; + } + +
[RFC PATCH v3 0/2] printk: new ringbuffer implementation
Hello, This is a follow-up RFC on the work to re-implement much of the core of printk. The threads for the previous RFC versions are here: v1[0], v2[1]. As was planned[2], this is only the first piece: a new lockless ringbuffer. Changes from v2: - Moved all code into kernel/printk/. Let's keep it private for now. - Split the ringbuffer into 3 components: * a data ringbuffer (dataring) to manage the raw data and data descriptors * a numbered list (numlist) to manage committed entries and their sequence numbers * the printk_ringbuffer, which is the high-level structure providing the reader/writer API and glue for the other structures Splitting the components apart helped to document their roles and their related memory barriers (and will hopefully also simplify the review process). - Renamed most functions, structures, and variables based on v2 feedback. - Rewrote and reformatted nearly all comments (particularly the memory barrier comments) based on v2 feedback. - Addressed implementation issues with v2: * invalid data blocks potentially becoming valid because of overflows * weak associations between data blocks and descriptors * excessive freeing of data blocks due to unavailable descriptors - Improved error handling and data integrity checks in the test module. For the memory barrier work I wrote a litmus test for nearly every memory barrier. I did not include these in the series. Should I? If yes, where should they be placed? I would like to point out that Petr Mladek posted a proof-of-concept[3] alternate implementation. I wanted to base my v3 on his work, but ran into too many problems getting it to run acceptably. I will address those issues in that thread. This is why my v3 is based directly on my v2. 
John Ogness [0] https://lkml.kernel.org/r/20190212143003.48446-1-john.ogn...@linutronix.de [1] https://lkml.kernel.org/r/20190607162349.18199-1-john.ogn...@linutronix.de [2] https://lkml.kernel.org/r/87y35hn6ih@linutronix.de [3] https://lkml.kernel.org/r/20190704103321.10022-1-pmla...@suse.com John Ogness (2): printk-rb: add a new printk ringbuffer implementation printk-rb: add test module kernel/printk/Makefile | 5 + kernel/printk/dataring.c | 761 ++ kernel/printk/dataring.h | 95 ++ kernel/printk/numlist.c| 375 + kernel/printk/numlist.h| 72 kernel/printk/ringbuffer.c | 800 + kernel/printk/ringbuffer.h | 288 kernel/printk/test_prb.c | 256 +++ 8 files changed, 2652 insertions(+) create mode 100644 kernel/printk/dataring.c create mode 100644 kernel/printk/dataring.h create mode 100644 kernel/printk/numlist.c create mode 100644 kernel/printk/numlist.h create mode 100644 kernel/printk/ringbuffer.c create mode 100644 kernel/printk/ringbuffer.h create mode 100644 kernel/printk/test_prb.c -- 2.11.0
[RFC PATCH v3 1/2] printk-rb: add a new printk ringbuffer implementation
See documentation for details. For the real patch the "prb overview" documentation section in kernel/printk/ringbuffer.c will be included in the commit message. Signed-off-by: John Ogness --- kernel/printk/Makefile | 3 + kernel/printk/dataring.c | 761 ++ kernel/printk/dataring.h | 95 ++ kernel/printk/numlist.c| 375 + kernel/printk/numlist.h| 72 kernel/printk/ringbuffer.c | 800 + kernel/printk/ringbuffer.h | 288 7 files changed, 2394 insertions(+) create mode 100644 kernel/printk/dataring.c create mode 100644 kernel/printk/dataring.h create mode 100644 kernel/printk/numlist.c create mode 100644 kernel/printk/numlist.h create mode 100644 kernel/printk/ringbuffer.c create mode 100644 kernel/printk/ringbuffer.h diff --git a/kernel/printk/Makefile b/kernel/printk/Makefile index 4d052fc6bcde..567999aa93af 100644 --- a/kernel/printk/Makefile +++ b/kernel/printk/Makefile @@ -1,4 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only obj-y = printk.o obj-$(CONFIG_PRINTK) += printk_safe.o +obj-$(CONFIG_PRINTK) += ringbuffer.o +obj-$(CONFIG_PRINTK) += numlist.o +obj-$(CONFIG_PRINTK) += dataring.o obj-$(CONFIG_A11Y_BRAILLE_CONSOLE) += braille.o diff --git a/kernel/printk/dataring.c b/kernel/printk/dataring.c new file mode 100644 index ..911bac593ec1 --- /dev/null +++ b/kernel/printk/dataring.c @@ -0,0 +1,761 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "dataring.h" + +/** + * DOC: dataring overview + * + * A dataring is a lockless ringbuffer consisting of variable length data + * blocks, each of which are assigned an ID. The IDs map to descriptors, which + * contain metadata about the data block. The lookup function mapping IDs to + * descriptors is implemented by the user. + * + * Data Blocks + * --- + * All ringbuffer data is stored within a single static byte array. This is + * to ensure that any pointers to the data (past and present) will always + * point to valid memory. 
This is important because the lockless readers + * and writers may preempt for long periods of time and when they resume may + * be working with expired pointers. + * + * Data blocks are specified by begin and end logical positions (lpos) that + * map directly to byte array offsets. Using logical positions indirectly + * provides tagged state references for the data blocks to avoid ABA issues + * when the ringbuffer wraps. The number of tagged states per index is:: + * + * sizeof(long) / size of byte array + * + * If a data block starts near the end of the byte array but would extend + * beyond it, that data block is handled differently: a special "wrapping data + * block" is inserted in the space available at the end of the byte array and + * a "content data block" is placed at the beginning of the byte array. This + * can waste space at the end of the byte array, but simplifies the + * implementation by allowing writers to always work with contiguous buffers. + * For example, for a 1000 byte array, a descriptor may show a start lpos of + * 1950 and an end lpos of 2100. The data block associated with this + * descriptor is 100 bytes in size. Its ID is located in the "wrapping" data + * block (located at offset 950 of the byte array) and its data is found in + * the "content" data block (located at offset 0 of the byte array). + * + * Descriptors + * --- + * A descriptor is a handle to a data block. How descriptors are structured + * and mapped to IDs is implemented by the user. + * + * Descriptors contain the begin (begin_lpos) and end (next_lpos) logical + * positions of the data block they represent. The end logical position + * matches the begin logical position of the adjacent data block. + * + * Why Descriptors? + * + * The data ringbuffer supports variable length entities, which means that + * data blocks will not always begin at a predictable offset of the byte + * array. 
This is a major problem for lockless writers that, for example, will + * compete to expire and reuse old data blocks when the ringbuffer is full. + * Without a predictable begin for the data blocks, a writer has no reliable + * information about the status of the "free" area. Are any flags or state + * variables already set or is it just garbage left over from previous usage? + * + * Descriptors allow safe and controlled access to data block metadata by + * providing predictable offsets for such metadata. This is key to supporting + * multiple concurrent lockless writers. + * + * Behavior + * + * The data ringbuffer allows writers to commit data without regard for + * readers. Readers must pre- and post-validate the data blocks they are + * processing to be sure the processed data is consistent. A function + * dataring_datablock_isvalid() is available for that. Readers can only + * iterate data blocks by utilizing an external
Re: [PATCH v2 -next] staging: vc04_services: fix used-but-set-variable warning
On 2019/7/26 23:57, Stefan Wahren wrote: > Hi Yue, > > Am 26.07.19 um 11:26 schrieb YueHaibing: >> Fix gcc used-but-set-variable warning: > > just a nit. It is call "unused-but-set-variable" Oh, yes, thanks! > > Acked-by: Stefan Wahren > >> >> drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c: In function >> vchiq_release_internal: >> drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:2827:16: >> warning: >> variable local_entity_uc set but not used [-Wunused-but-set-variable] >> drivers/staging/vc04_services/interface/vchiq_arm/vchiq_arm.c:2827:6: >> warning: >> variable local_uc set but not used [-Wunused-but-set-variable] >> >> Remove the unused variables 'local_entity_uc' and 'local_uc' >> >> Reported-by: Hulk Robot >> Signed-off-by: YueHaibing >> --- > > . >
Re: [patch 0/8] core, x86: Preparatory steps for RT
On Fri, 26 Jul 2019 23:19:36 +0200 Thomas Gleixner wrote: > CONFIG_PREEMPTION is selected by CONFIG_PREEMPT and by > CONFIG_PREEMPT_RT. Both PREEMPT and PREEMPT_RT require the same > functionality which today depends on CONFIG_PREEMPT. > > The following series adjusts the core and x86 code to use > CONFIG_PREEMPTION where appropriate and extends the x86 dumpstack > implementation to display PREEMPT_RT instead of PREEMPT on a RT > enabled kernel. > Hmm, I'm looking at v5.3-rc1 and I don't see a CONFIG_PREEMPTION defined. And the first patch doesn't define it. Did I miss a patch series that adds it? -- Steve
Re: [PATCH v3 1/2] dt-bindings: i3c: Document MediaTek I3C master bindings
On Wed, 2019-07-24 at 14:21 -0600, Rob Herring wrote: > On Tue, Jul 09, 2019 at 09:09:21PM +0800, Qii Wang wrote: > > Document MediaTek I3C master DT bindings. > > > > Signed-off-by: Qii Wang > > --- > > .../devicetree/bindings/i3c/mtk,i3c-master.txt | 48 > > > > 1 file changed, 48 insertions(+) > > create mode 100644 Documentation/devicetree/bindings/i3c/mtk,i3c-master.txt > > > > diff --git a/Documentation/devicetree/bindings/i3c/mtk,i3c-master.txt > > b/Documentation/devicetree/bindings/i3c/mtk,i3c-master.txt > > new file mode 100644 > > index 000..d32eda6 > > --- /dev/null > > +++ b/Documentation/devicetree/bindings/i3c/mtk,i3c-master.txt > > @@ -0,0 +1,48 @@ > > +Bindings for MediaTek I3C master block > > += > > + > > +Required properties: > > + > > +- compatible: shall be "mediatek,i3c-master" > > Needs to be SoC specific. > We hope that the SOCs will use the same driver and try to avoid big changes. If there are inevitable changes in the future, then we will modify the compatible to be SoC specific. cdns,i3c-master.txt is not SoC specific either. > > +- reg: physical base address of the controller and apdma base, length of > > + memory mapped region. > > +- reg-names: shall be "main" for master controller and "dma" for apdma. > > +- interrupts: the interrupt line connected to this I3C master. > > +- clocks: shall reference the i3c and apdma clocks. > > +- clock-names: shall include "main" and "dma". > > + > > +Mandatory properties defined by the generic binding (see > > +Documentation/devicetree/bindings/i3c/i3c.txt for more details): > > + > > +- #address-cells: shall be set to 3 > > +- #size-cells: shall be set to 0 > > + > > +Optional properties defined by the generic binding (see > > +Documentation/devicetree/bindings/i3c/i3c.txt for more details): > > + > > +- i2c-scl-hz > > +- i3c-scl-hz > > + > > +I3C device connected on the bus follow the generic description (see > > +Documentation/devicetree/bindings/i3c/i3c.txt for more details). 
> > + > > +Example: > > + > > + i3c0: i3c@1100d000 { > > + compatible = "mediatek,i3c-master"; > > + reg = <0x1100d000 0x1000>, > > + <0x11000300 0x80>; > > + reg-names = "main", "dma"; > > + interrupts = ; > > + clocks = < CLK_INFRA_I3C0>, > > +< CLK_INFRA_AP_DMA>; > > + clock-names = "main", "dma"; > > + #address-cells = <3>; > > + #size-cells = <0>; > > + i2c-scl-hz = <10>; > > + > > + nunchuk: nunchuk@52 { > > + compatible = "nintendo,nunchuk"; > > + reg = <0x52 0x0 0x10>; > > + }; > > + }; > > -- > > 1.7.9.5 > >
[PATCH] PM / wakeup: Avoid dev_name collisions in wakeup class
If a device is wakeup capable and the driver calls device_wakeup_init() on it during probe and then userspace writes 'enabled' to that device's power/wakeup file in sysfs we'll try to create the same named wakeup device in sysfs. The kernel will complain about duplicate file names. sysfs: cannot create duplicate filename '/devices/virtual/wakeup/1-1.1' kobject_add_internal failed for 1-1.1 with -EEXIST, don't try to register things with the same name in the same directory. It may be advantageous to not write 'enabled' to the wakeup file (see wakeup_store()) from userspace for these devices because we allocate devices and register them and then throw them all away later on if the device driver has already initialized the wakeup attribute. The implementation currently tries to avoid taking locks here so it seems best to optimize that path in a separate patch. Let's rename the wakeup class devices as 'wakeupN' with an IDA that's simple enough to just return some sort of number. In addition, let's make the device registering the wakeup the parent and include a 'name' attribute in case userspace wants to figure out the type of wakeup it is (in the case of virtual wakeups) or the device associated with the wakeup. This makes it easier for userspace to go from /sys/class/wakeup to a place in the device hierarchy where the wakeup is generated from like an input device. 
Cc: Tri Vo Cc: Kalesh Singh Cc: Greg Kroah-Hartman Cc: Ravi Chandra Sadineni Signed-off-by: Stephen Boyd --- drivers/acpi/device_pm.c | 2 +- drivers/base/power/wakeup.c | 8 +--- drivers/base/power/wakeup_stats.c | 31 ++- fs/eventpoll.c| 4 ++-- include/linux/pm_wakeup.h | 12 kernel/power/autosleep.c | 2 +- kernel/power/wakelock.c | 2 +- kernel/time/alarmtimer.c | 2 +- 8 files changed, 45 insertions(+), 18 deletions(-) diff --git a/drivers/acpi/device_pm.c b/drivers/acpi/device_pm.c index 28cffaaf9d82..0863be1e42d6 100644 --- a/drivers/acpi/device_pm.c +++ b/drivers/acpi/device_pm.c @@ -495,7 +495,7 @@ acpi_status acpi_add_pm_notifier(struct acpi_device *adev, struct device *dev, goto out; mutex_lock(_pm_notifier_lock); - adev->wakeup.ws = wakeup_source_register(dev_name(>dev)); + adev->wakeup.ws = wakeup_source_register(>dev, dev_name(>dev)); adev->wakeup.context.dev = dev; adev->wakeup.context.func = func; adev->wakeup.flags.notifier_present = true; diff --git a/drivers/base/power/wakeup.c b/drivers/base/power/wakeup.c index 2b8def0ea59f..7ba242b49831 100644 --- a/drivers/base/power/wakeup.c +++ b/drivers/base/power/wakeup.c @@ -201,15 +201,17 @@ EXPORT_SYMBOL_GPL(wakeup_source_remove); /** * wakeup_source_register - Create wakeup source and add it to the list. * @name: Name of the wakeup source to register. 
+ * @dev: Device wakeup source is associated with (or NULL if virtual) */ -struct wakeup_source *wakeup_source_register(const char *name) +struct wakeup_source *wakeup_source_register(struct device *dev, +const char *name) { struct wakeup_source *ws; int ret; ws = wakeup_source_create(name); if (ws) { - ret = wakeup_source_sysfs_add(ws); + ret = wakeup_source_sysfs_add(dev, ws); if (ret) { kfree_const(ws->name); kfree(ws); @@ -273,7 +275,7 @@ int device_wakeup_enable(struct device *dev) if (pm_suspend_target_state != PM_SUSPEND_ON) dev_dbg(dev, "Suspicious %s() during system transition!\n", __func__); - ws = wakeup_source_register(dev_name(dev)); + ws = wakeup_source_register(dev, dev_name(dev)); if (!ws) return -ENOMEM; diff --git a/drivers/base/power/wakeup_stats.c b/drivers/base/power/wakeup_stats.c index 9c01150f1213..927cc84d3392 100644 --- a/drivers/base/power/wakeup_stats.c +++ b/drivers/base/power/wakeup_stats.c @@ -7,8 +7,9 @@ * Copyright (c) 2019 Google Inc. */ -#include +#include #include +#include #include "power.h" @@ -80,6 +81,15 @@ static ssize_t last_change_ms_show(struct device *dev, } static DEVICE_ATTR_RO(last_change_ms); +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, char *buf) +{ + struct wakeup_source *ws = dev_get_drvdata(dev); + + return sprintf(buf, "%s\n", ws->name); +} +static DEVICE_ATTR_RO(name); + static ssize_t prevent_suspend_time_ms_show(struct device *dev, struct device_attribute *attr, char *buf) @@ -96,6 +106,7 @@ static ssize_t prevent_suspend_time_ms_show(struct device *dev, static DEVICE_ATTR_RO(prevent_suspend_time_ms); static struct attribute *wakeup_source_attrs[] = { +
Re: [PATCH 1/3 v2] fs: ocfs2: Fix possible null-pointer dereferences in ocfs2_xa_prepare_entry()
On 19/7/26 18:14, Jia-Ju Bai wrote: > In ocfs2_xa_prepare_entry(), there is an if statement on line 2136 to > check whether loc->xl_entry is NULL: > if (loc->xl_entry) > > When loc->xl_entry is NULL, it is used on line 2158: > ocfs2_xa_add_entry(loc, name_hash); > loc->xl_entry->xe_name_hash = cpu_to_le32(name_hash); > loc->xl_entry->xe_name_offset = cpu_to_le16(loc->xl_size); > and line 2164: > ocfs2_xa_add_namevalue(loc, xi); > loc->xl_entry->xe_value_size = cpu_to_le64(xi->xi_value_len); > loc->xl_entry->xe_name_len = xi->xi_name_len; > > Thus, possible null-pointer dereferences may occur. > > To fix these bugs, if loc-xl_entry is NULL, ocfs2_xa_prepare_entry() > abnormally returns with -EINVAL. > > These bugs are found by a static analysis tool STCheck written by us. > > Signed-off-by: Jia-Ju Bai Reviewed-by: Joseph Qi > --- > v2: > * Directly return -EINVAL if loc-xl_entry is NULL. > Thank Joseph for helpful advice. > > --- > fs/ocfs2/xattr.c | 44 +++- > 1 file changed, 23 insertions(+), 21 deletions(-) > > diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c > index 385f3aaa2448..4b876c82a35c 100644 > --- a/fs/ocfs2/xattr.c > +++ b/fs/ocfs2/xattr.c > @@ -2133,29 +2133,31 @@ static int ocfs2_xa_prepare_entry(struct ocfs2_xa_loc > *loc, > if (rc) > goto out; > > - if (loc->xl_entry) { > - if (ocfs2_xa_can_reuse_entry(loc, xi)) { > - orig_value_size = loc->xl_entry->xe_value_size; > - rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); > - if (rc) > - goto out; > - goto alloc_value; > - } > + if (!loc->xl_entry) { > + rc = -EINVAL; > + goto out; > + } > > - if (!ocfs2_xattr_is_local(loc->xl_entry)) { > - orig_clusters = ocfs2_xa_value_clusters(loc); > - rc = ocfs2_xa_value_truncate(loc, 0, ctxt); > - if (rc) { > - mlog_errno(rc); > - ocfs2_xa_cleanup_value_truncate(loc, > - "overwriting", > - orig_clusters); > - goto out; > - } > + if (ocfs2_xa_can_reuse_entry(loc, xi)) { > + orig_value_size = loc->xl_entry->xe_value_size; > + rc = ocfs2_xa_reuse_entry(loc, xi, ctxt); > + if 
(rc) > + goto out; > + goto alloc_value; > + } > + > + if (!ocfs2_xattr_is_local(loc->xl_entry)) { > + orig_clusters = ocfs2_xa_value_clusters(loc); > + rc = ocfs2_xa_value_truncate(loc, 0, ctxt); > + if (rc) { > + mlog_errno(rc); > + ocfs2_xa_cleanup_value_truncate(loc, > + "overwriting", > + orig_clusters); > + goto out; > } > - ocfs2_xa_wipe_namevalue(loc); > - } else > - ocfs2_xa_add_entry(loc, name_hash); > + } > + ocfs2_xa_wipe_namevalue(loc); > > /* >* If we get here, we have a blank entry. Fill it. We grow our >
Re: Regression in 5.3 for some FS_USERNS_MOUNT (aka user-namespace-mountable) filesystems
Al Viro writes: > On Fri, Jul 26, 2019 at 03:47:02PM -0700, Linus Torvalds wrote: > >> Of course, then later on, commit 20284ab7427f ("switch mount_capable() >> to fs_context") drops that argument entirely, and hardcodes the >> decision to look at fc->global. >> >> But that fc->global decision wasn't there originally, and is incorrect >> since it breaks existing users. >> >> What gets much more confusing about this is that the two different >> users then moved around. The sget_userns() case got moved to >> legacy_get_tree(), and then joined together in vfs_get_tree(), and >> then split and moved out to do_new_mount() and vfs_fsconfig_locked(). >> >> And that "joined together into vfs_get_tree()" must be wrong, because >> the two cases used two different namespace rules. The sget_userns() >> case *did* have that "global" flag check, while the sget_fc() did not. >> >> Messy. Al? > > Digging through that mess... It's my fuckup, and we obviously need to > restore the old behaviour, but I really hope to manage that with > checks _not_ in superblock allocator ;-/ If someone had bothered to actually look at how I was proposing to clean things up before the new mount api we would already have that. Sigh. You should be able to get away with something like this which moves the checks earlier and makes things clearer. My old patch against the pre new mount api code. I am running at undependable speed due to the new baby so it is probably better for someone else to forward port this, but I will attempt it otherwise. Eric From: "Eric W. Biederman" Date: Wed, 21 Nov 2018 11:17:01 -0600 Subject: [PATCH] vfs: Replace FS_USERNS_MOUNT with file_system_type->permission Permission checking of the user to see if the can mount an individual filesystem using FS_USERNS_MOUNT and checks in sget is not very comprehensible. Further by pushing the logic down into sget the attack surface on filesystems that don't support unprivilged mounts is much larger than it should be. 
Now that it is understood what the permission checks need to be refactor the checks into a simple per filesystem permission check. If no permission check is implemented the default check becomes a simple capable(CAP_SYS_ADMIN). The result is code that is much simpler to understand and much easier to maintain. Signed-off-by: "Eric W. Biederman" --- fs/devpts/inode.c | 2 +- fs/fuse/inode.c| 3 ++- fs/namespace.c | 15 +++ fs/proc/root.c | 8 +++- fs/ramfs/inode.c | 2 +- fs/super.c | 20 ++-- fs/sysfs/mount.c | 13 +++-- include/linux/fs.h | 4 +++- ipc/mqueue.c | 8 +++- kernel/cgroup/cgroup.c | 16 mm/shmem.c | 4 ++-- 11 files changed, 59 insertions(+), 36 deletions(-) diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index c53814539070..1418912efc7d 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -519,9 +519,9 @@ static void devpts_kill_sb(struct super_block *sb) static struct file_system_type devpts_fs_type = { .name = "devpts", + .permission = userns_mount_permission, .mount = devpts_mount, .kill_sb= devpts_kill_sb, - .fs_flags = FS_USERNS_MOUNT, }; /* diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index 0b94b23b02d4..e9f6aa9974f8 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -1259,7 +1259,8 @@ static void fuse_kill_sb_anon(struct super_block *sb) static struct file_system_type fuse_fs_type = { .owner = THIS_MODULE, .name = "fuse", - .fs_flags = FS_HAS_SUBTYPE | FS_USERNS_MOUNT, + .fs_flags = FS_HAS_SUBTYPE, + .permission = userns_mount_permission, .mount = fuse_mount, .kill_sb= fuse_kill_sb_anon, }; diff --git a/fs/namespace.c b/fs/namespace.c index 74f64294a410..44935dbdb162 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -2448,6 +2448,16 @@ static int do_add_mount(struct mount *newmnt, struct path *path, int mnt_flags) static bool mount_too_revealing(struct vfsmount *mnt, int *new_mnt_flags); +static int new_mount_permission(struct file_system_type *type) +{ + int err = 0; + if (type->permission) + err = type->permission(); + else if 
(!capable(CAP_SYS_ADMIN)) + err = -EPERM; + return err; +} + /* * create a new mount for userspace and request it to be added into the * namespace's tree @@ -2466,6 +2476,11 @@ static int do_new_mount(struct path *path, const char *fstype, int sb_flags, if (!type) return -ENODEV; + /* Verify the mounter has permission to mount the filesystem */ + err = new_mount_permission(type); + if (err) + return err; + mnt = vfs_kern_mount(type, sb_flags, name, data); if (!IS_ERR(mnt) && (type->fs_flags & FS_HAS_SUBTYPE) &&
Re: [PATCH v6 0/9] Add Error Disconnect Recover (EDR) support
On 7/26/2019 6:33 PM, sathyanarayanan kuppuswamy wrote: > +Austin , Huong > > On 7/26/19 2:53 PM, Keith Busch wrote: >> On Fri, Jul 26, 2019 at 02:43:10PM -0700, >> sathyanarayanan.kuppusw...@linux.intel.com wrote: >>> From: Kuppuswamy Sathyanarayanan >>> >>> >>> This patchset adds support for following features: >>> >>> 1. Error Disconnect Recover (EDR) support. >>> 2. _OSC based negotiation support for DPC. >>> >>> You can find EDR spec in the following link. >>> >>> https://members.pcisig.com/wg/PCI-SIG/document/12614 >> Thank you for sticking with this. I've reviewed the series and I think >> this looks good for the next merge window. >> >> Acked-by: Keith Busch Tested on a DPC-enabled PCIe switch (Broadcom PEX9733) in a Dell PowerEdge R740xd. Injected fatal and non-fatal errors on an NVMe endpoint below the switch and on the switch downstream port itself and verified errors were contained and then recovered at the PCIe level. Tested-by: Austin Bolen >>
Re: [PATCH] mm: Make kvfree safe to call
On Fri, Jul 26, 2019 at 05:25:03PM -0400, Jeff Layton wrote: > On Fri, 2019-07-26 at 14:10 -0700, Alexander Duyck wrote: > > On Fri, Jul 26, 2019 at 2:01 PM Matthew Wilcox wrote: > > > From: "Matthew Wilcox (Oracle)" > > > > > > Since vfree() can sleep, calling kvfree() from contexts where sleeping > > > is not permitted (eg holding a spinlock) is a bit of a lottery whether > > > it'll work. Introduce kvfree_safe() for situations where we know we can > > > sleep, but make kvfree() safe by default. > > > > > > Reported-by: Jeff Layton > > > Cc: Alexander Viro > > > Cc: Luis Henriques > > > Cc: Christoph Hellwig > > > Cc: Carlos Maiolino > > > Signed-off-by: Matthew Wilcox (Oracle) > > > > So you say you are adding kvfree_safe() in the patch description, but > > it looks like you are introducing kvfree_fast() below. Did something > > change and the patch description wasn't updated, or is this just the > > wrong description for this patch? Oops, bad description. Thanks, I'll fix it for v2. > > > +/** > > > + * kvfree_fast() - Free memory. > > > + * @addr: Pointer to allocated memory. > > > + * > > > + * kvfree_fast frees memory allocated by any of vmalloc(), kmalloc() or > > > + * kvmalloc(). It is slightly more efficient to use kfree() or vfree() > > > if > > > + * you are certain that you know which one to use. > > > + * > > > + * Context: Either preemptible task context or not-NMI interrupt. Must > > > not > > > + * hold a spinlock as it can sleep. > > > + */ > > > +void kvfree_fast(const void *addr) > > > +{ > > > + might_sleep(); > > > + > > might_sleep_if(!in_interrupt()); > > That's what vfree does anyway, so we might as well exempt the case where > you are. True, but if we are in interrupt, then we may as well call kvfree() since it'll do the same thing, and this way the rules are clearer. 
> > > + if (is_vmalloc_addr(addr)) > > > + vfree(addr); > > > + else > > > + kfree(addr); > > > +} > > > +EXPORT_SYMBOL(kvfree_fast); > > > + > > That said -- is this really useful? > > The only way to know that this is safe is to know what sort of > allocation it is, and in that case you can just call kfree or vfree as > appropriate. It's safe if you know you're not holding any spinlocks, for example ...
Re: [PATCH v9 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT
> On Jul 26, 2019, at 4:52 PM, Andrew Morton wrote: > > On Fri, 26 Jul 2019 23:44:34 + Song Liu wrote: > >> >> >>> On Jul 26, 2019, at 4:02 PM, Andrew Morton >>> wrote: >>> >>> On Thu, 25 Jul 2019 22:46:54 -0700 Song Liu wrote: >>> This patches uses newly added FOLL_SPLIT_PMD in uprobe. This enables easy regroup of huge pmd after the uprobe is disabled (in next patch). >>> >>> Confused. There is no "next patch". >> >> That was the patch 5, which was in earlier versions. I am working on >> addressing Kirill's feedback for it. >> >> Do I need to resubmit 4/4 with modified change log? > > Please just send new changelog text now. I assume this [4/4] patch is > useful without patch #5, but a description of why it is useful is > appropriate. Yes, 4/4 is useful without #5. Please find the updated change log. = 8< This patch uses newly added FOLL_SPLIT_PMD in uprobe. This preserves the huge page when the uprobe is enabled. When the uprobe is disabled, newer instances of the same application could still benefit from huge page. For the next step, we will enable khugepaged to regroup the pmd, so that existing instances of the application could also benefit from huge page after the uprobe is disabled. Acked-by: Kirill A. Shutemov Reviewed-by: Srikar Dronamraju Signed-off-by: Song Liu = 8< > > I trust the fifth patch is to be sent soon? Yes, I am working on it. Thanks, Song
[GIT PULL] Devicetree fixes for 5.3-rc, take 2
Hi Linus, Please pull some more DT fixes for 5.3. The nvmem changes would typically go thru Greg's tree, but they were missed in the merge window and I've been unable to get a response (partly because Srinivas is out on vacation it appears). Rob The following changes since commit e2297f7c3ab3b68dda2ac732b1767212019d3bdf: dt-bindings: pinctrl: stm32: Fix missing 'clocks' property in examples (2019-07-20 20:28:53 -0600) are available in the Git repository at: git://git.kernel.org/pub/scm/linux/kernel/git/robh/linux.git tags/devicetree-fixes-for-5.3-2 for you to fetch changes up to e1ff7390f58e609aa113a2452a953f669abce6cc: dt-bindings: Fix more $id value mismatches filenames (2019-07-26 17:41:41 -0600) Devicetree fixes for 5.3-rc: - Fix mismatches in $id values and actual filenames. Now checked by tools. - Convert nvmem binding to DT schema - Fix a typo in of_property_read_bool() kerneldoc - Remove some redundant description in al-fic interrupt-controller Maxime Ripard (2): dt-bindings: nvmem: Add YAML schemas for the generic NVMEM bindings dt-bindings: nvmem: SID: Fix the examples node names Rob Herring (2): dt-bindings: clk: allwinner,sun4i-a10-ccu: Correct path in $id dt-bindings: Fix more $id value mismatches filenames Talel Shenhar (1): dt-bindings: interrupt-controller: al-fic: remove redundant binding Thierry Reding (1): of: Fix typo in kerneldoc Documentation/devicetree/bindings/arm/renesas.yaml | 2 +- .../bindings/arm/socionext/milbeaut.yaml | 2 +- .../devicetree/bindings/arm/ti/ti,davinci.yaml | 2 +- .../bindings/clock/allwinner,sun4i-a10-ccu.yaml| 2 +- .../intel,ixp4xx-network-processing-engine.yaml| 2 +- .../devicetree/bindings/iio/accel/adi,adxl345.yaml | 2 +- .../devicetree/bindings/iio/accel/adi,adxl372.yaml | 2 +- .../interrupt-controller/amazon,al-fic.txt | 16 ++-- .../intel,ixp4xx-interrupt.yaml| 2 +- ...er.yaml => intel,ixp4xx-ahb-queue-manager.yaml} | 2 +- .../bindings/net/allwinner,sun8i-a83t-emac.yaml| 2 +- 
.../bindings/nvmem/allwinner,sun4i-a10-sid.yaml| 4 +- .../devicetree/bindings/nvmem/nvmem-consumer.yaml | 45 +++ Documentation/devicetree/bindings/nvmem/nvmem.txt | 81 +-- Documentation/devicetree/bindings/nvmem/nvmem.yaml | 93 ++ .../phy/allwinner,sun6i-a31-mipi-dphy.yaml | 2 +- .../bindings/timer/intel,ixp4xx-timer.yaml | 2 +- include/linux/of.h | 2 +- 18 files changed, 161 insertions(+), 104 deletions(-) rename Documentation/devicetree/bindings/misc/{intel,ixp4xx-queue-manager.yaml => intel,ixp4xx-ahb-queue-manager.yaml} (95%) create mode 100644 Documentation/devicetree/bindings/nvmem/nvmem-consumer.yaml create mode 100644 Documentation/devicetree/bindings/nvmem/nvmem.yaml
[PATCH v3] mm: memcontrol: fix use after free in mem_cgroup_iter()
This patch is sent to report an use after free in mem_cgroup_iter() after merging commit: be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()". I work with android kernel tree (4.9 & 4.14), and the commit: be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has been merged to the trees. However, I can still observe use after free issues addressed in the commit be2657752e9e. (on low-end devices, a few times this month) backtrace: css_tryget <- crash here mem_cgroup_iter shrink_node shrink_zones do_try_to_free_pages try_to_free_pages __perform_reclaim __alloc_pages_direct_reclaim __alloc_pages_slowpath __alloc_pages_nodemask To debug, I poisoned mem_cgroup before freeing it: static void __mem_cgroup_free(struct mem_cgroup *memcg) for_each_node(node) free_mem_cgroup_per_node_info(memcg, node); free_percpu(memcg->stat); + /* poison memcg before freeing it */ + memset(memcg, 0x78, sizeof(struct mem_cgroup)); kfree(memcg); } The coredump shows the position=0xdbbc2a00 is freed. 
(gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8] $13 = {position = 0xdbbc2a00, generation = 0x2efd} 0xdbbc2a00: 0xdbbc2e00 0x 0xdbbc2800 0x0100 0xdbbc2a10: 0x0200 0x78787878 0x00026218 0x 0xdbbc2a20: 0xdcad6000 0x0001 0x78787800 0x 0xdbbc2a30: 0x7878 0x 0x0068fb84 0x78787878 0xdbbc2a40: 0x78787878 0x78787878 0x78787878 0xe3fa5cc0 0xdbbc2a50: 0x78787878 0x78787878 0x 0x 0xdbbc2a60: 0x 0x 0x 0x 0xdbbc2a70: 0x 0x 0x 0x 0xdbbc2a80: 0x 0x 0x 0x 0xdbbc2a90: 0x0001 0x 0x 0x0010 0xdbbc2aa0: 0x0001 0xdbbc2ac8 0x 0x 0xdbbc2ab0: 0x 0x 0x 0x 0xdbbc2ac0: 0x 0x 0xe5b02618 0x1000 0xdbbc2ad0: 0x 0x78787878 0x78787878 0x78787878 0xdbbc2ae0: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2af0: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b00: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b10: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b20: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b30: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b40: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b50: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b60: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b70: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2b80: 0x78787878 0x78787878 0x 0x78787878 0xdbbc2b90: 0x78787878 0x78787878 0x78787878 0x78787878 0xdbbc2ba0: 0x78787878 0x78787878 0x78787878 0x78787878 In the reclaim path, try_to_free_pages() does not setup sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ..., shrink_node(). In mem_cgroup_iter(), root is set to root_mem_cgroup because sc->target_mem_cgroup is NULL. It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in mem_cgroup_iter(). try_to_free_pages struct scan_control sc = {...}, target_mem_cgroup is 0x0; do_try_to_free_pages shrink_zones shrink_node mem_cgroup *root = sc->target_mem_cgroup; memcg = mem_cgroup_iter(root, NULL, ); mem_cgroup_iter() if (!root) root = root_mem_cgroup; ... 
css = css_next_descendant_pre(css, >css); memcg = mem_cgroup_from_css(css); cmpxchg(>position, pos, memcg); My device uses memcg non-hierarchical mode. When we release a memcg: invalidate_reclaim_iterators() reaches only dead_memcg and its parents. If non-hierarchical mode is used, invalidate_reclaim_iterators() never reaches root_mem_cgroup. static void invalidate_reclaim_iterators(struct mem_cgroup *dead_memcg) { struct mem_cgroup *memcg = dead_memcg; for (; memcg; memcg = parent_mem_cgroup(memcg) ... } So the use after free scenario looks like: CPU1CPU2 try_to_free_pages do_try_to_free_pages shrink_zones shrink_node mem_cgroup_iter() if (!root) root = root_mem_cgroup; ... css = css_next_descendant_pre(css, >css); memcg = mem_cgroup_from_css(css); cmpxchg(>position, pos, memcg);
Re: [PATCH 3/3] Fix sched-messaging.c use of uninitialized value errors
On Fri, Jul 26, 2019 at 12:32 PM Arnaldo Carvalho de Melo wrote: > > Em Wed, Jul 24, 2019 at 04:45:00PM -0700, Numfor Mbiziwo-Tiapo escreveu: > > Our local MSAN (Memory Sanitizer) build of perf throws use of > > uninitialized value warnings in "tools/perf/bench/sched-messaging.c" > > when running perf bench. > > > > The first warning comes from the "ready" function where the "dummy" char > > is declared and then passed into "write" without being initialized. > > Initializing "dummy" to any character silences the warning. > > > > The second warning comes from the "sender" function where a "write" call > > is made to write the contents from the "data" char array when it has not > > yet been initialized. Calling memset on "data" silences the warning. > > So, this is just to silence MSAN, as it doesn't matter what is sent, > whatever values are in those variables is ok, as it will not be used, > right? That's right. Thanks, Ian Rogers > - Arnaldo > > > To reproduce this warning, build perf by running: > > make -C tools/perf CLANG=1 CC=clang EXTRA_CFLAGS="-fsanitize=memory\ > > -fsanitize-memory-track-origins" > > > > (Additionally, llvm might have to be installed and clang might have to > > be specified as the compiler - export CC=/usr/bin/clang) > > > > then running: tools/perf/perf bench sched all > > > > Please see the cover letter for why false positive warnings may be > > generated. 
> > > > Signed-off-by: Numfor Mbiziwo-Tiapo > > --- > > tools/perf/bench/sched-messaging.c | 3 ++- > > 1 file changed, 2 insertions(+), 1 deletion(-) > > > > diff --git a/tools/perf/bench/sched-messaging.c > > b/tools/perf/bench/sched-messaging.c > > index f9d7641ae833..d22d7b7b591d 100644 > > --- a/tools/perf/bench/sched-messaging.c > > +++ b/tools/perf/bench/sched-messaging.c > > @@ -69,7 +69,7 @@ static void fdpair(int fds[2]) > > /* Block until we're ready to go */ > > static void ready(int ready_out, int wakefd) > > { > > - char dummy; > > + char dummy = 'N'; > > struct pollfd pollfd = { .fd = wakefd, .events = POLLIN }; > > > > /* Tell them we're ready. */ > > @@ -87,6 +87,7 @@ static void *sender(struct sender_context *ctx) > > char data[DATASIZE]; > > unsigned int i, j; > > > > + memset(data, 'N', DATASIZE); > > ready(ctx->ready_out, ctx->wakefd); > > > > /* Now pump to every receiver. */ > > -- > > 2.22.0.657.g960e92d24f-goog > > -- > > - Arnaldo
Re: [PATCH v9 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT
On Fri, 26 Jul 2019 23:44:34 + Song Liu wrote: > > > > On Jul 26, 2019, at 4:02 PM, Andrew Morton > > wrote: > > > > On Thu, 25 Jul 2019 22:46:54 -0700 Song Liu wrote: > > > >> This patches uses newly added FOLL_SPLIT_PMD in uprobe. This enables easy > >> regroup of huge pmd after the uprobe is disabled (in next patch). > > > > Confused. There is no "next patch". > > That was the patch 5, which was in earlier versions. I am working on > addressing Kirill's feedback for it. > > Do I need to resubmit 4/4 with modified change log? Please just send new changelog text now. I assume this [4/4] patch is useful without patch #5, but a description of why it is useful is appropriate. I trust the fifth patch is to be sent soon?
Re: [PATCH v2] mm: memcontrol: fix use after free in mem_cgroup_iter()
On Fri, 2019-07-26 at 14:55 +0200, Michal Hocko wrote: > On Fri 26-07-19 14:49:33, Michal Hocko wrote: > > On Fri 26-07-19 10:12:47, Miles Chen wrote: > > > This patch is sent to report an use after free in mem_cgroup_iter() > > > after merging commit: be2657752e9e "mm: memcg: fix use after free in > > > mem_cgroup_iter()". > > > > > > I work with android kernel tree (4.9 & 4.14), and the commit: > > > be2657752e9e "mm: memcg: fix use after free in mem_cgroup_iter()" has > > > been merged to the trees. However, I can still observe use after free > > > issues addressed in the commit be2657752e9e. > > > (on low-end devices, a few times this month) > > > > > > backtrace: > > > css_tryget <- crash here > > > mem_cgroup_iter > > > shrink_node > > > shrink_zones > > > do_try_to_free_pages > > > try_to_free_pages > > > __perform_reclaim > > > __alloc_pages_direct_reclaim > > > __alloc_pages_slowpath > > > __alloc_pages_nodemask > > > > > > To debug, I poisoned mem_cgroup before freeing it: > > > > > > static void __mem_cgroup_free(struct mem_cgroup *memcg) > > > for_each_node(node) > > > free_mem_cgroup_per_node_info(memcg, node); > > > free_percpu(memcg->stat); > > > + /* poison memcg before freeing it */ > > > + memset(memcg, 0x78, sizeof(struct mem_cgroup)); > > > kfree(memcg); > > > } > > > > > > The coredump shows the position=0xdbbc2a00 is freed. 
> > > > > > (gdb) p/x ((struct mem_cgroup_per_node *)0xe5009e00)->iter[8] > > > $13 = {position = 0xdbbc2a00, generation = 0x2efd} > > > > > > 0xdbbc2a00: 0xdbbc2e00 0x 0xdbbc2800 0x0100 > > > 0xdbbc2a10: 0x0200 0x78787878 0x00026218 0x > > > 0xdbbc2a20: 0xdcad6000 0x0001 0x78787800 0x > > > 0xdbbc2a30: 0x7878 0x 0x0068fb84 0x78787878 > > > 0xdbbc2a40: 0x78787878 0x78787878 0x78787878 0xe3fa5cc0 > > > 0xdbbc2a50: 0x78787878 0x78787878 0x 0x > > > 0xdbbc2a60: 0x 0x 0x 0x > > > 0xdbbc2a70: 0x 0x 0x 0x > > > 0xdbbc2a80: 0x 0x 0x 0x > > > 0xdbbc2a90: 0x0001 0x 0x 0x0010 > > > 0xdbbc2aa0: 0x0001 0xdbbc2ac8 0x 0x > > > 0xdbbc2ab0: 0x 0x 0x 0x > > > 0xdbbc2ac0: 0x 0x 0xe5b02618 0x1000 > > > 0xdbbc2ad0: 0x 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2ae0: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2af0: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b00: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b10: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b20: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b30: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b40: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b50: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b60: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b70: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2b80: 0x78787878 0x78787878 0x 0x78787878 > > > 0xdbbc2b90: 0x78787878 0x78787878 0x78787878 0x78787878 > > > 0xdbbc2ba0: 0x78787878 0x78787878 0x78787878 0x78787878 > > > > > > In the reclaim path, try_to_free_pages() does not setup > > > sc.target_mem_cgroup and sc is passed to do_try_to_free_pages(), ..., > > > shrink_node(). > > > > > > In mem_cgroup_iter(), root is set to root_mem_cgroup because > > > sc->target_mem_cgroup is NULL. > > > It is possible to assign a memcg to root_mem_cgroup.nodeinfo.iter in > > > mem_cgroup_iter(). 
> > > > > > try_to_free_pages > > > struct scan_control sc = {...}, target_mem_cgroup is 0x0; > > > do_try_to_free_pages > > > shrink_zones > > > shrink_node > > >mem_cgroup *root = sc->target_mem_cgroup; > > >memcg = mem_cgroup_iter(root, NULL, ); > > > mem_cgroup_iter() > > > if (!root) > > > root = root_mem_cgroup; > > > ... > > > > > > css = css_next_descendant_pre(css, >css); > > > memcg = mem_cgroup_from_css(css); > > > cmpxchg(>position, pos, memcg); > > > > > > My device uses memcg non-hierarchical mode. > > > When we release a memcg: invalidate_reclaim_iterators() reaches only > > > dead_memcg and its parents. If non-hierarchical mode is used, > > > invalidate_reclaim_iterators() never reaches root_mem_cgroup. > > > > > > static void
[PATCH] sched/core: Don't use dying mm as active_mm for kernel threads
It was found that a dying mm_struct where the owning task has exited can stay on as active_mm of kernel threads as long as no other user tasks run on those CPUs that use it as active_mm. This prolongs the life time of dying mm holding up memory and other resources that cannot be freed. Fix that by forcing the kernel threads to use init_mm as the active_mm if the previous active_mm is dying. Signed-off-by: Waiman Long --- kernel/sched/core.c | 13 +++-- mm/init-mm.c| 2 ++ 2 files changed, 13 insertions(+), 2 deletions(-) diff --git a/kernel/sched/core.c b/kernel/sched/core.c index 2b037f195473..ca348e1f5a1e 100644 --- a/kernel/sched/core.c +++ b/kernel/sched/core.c @@ -3233,13 +3233,22 @@ context_switch(struct rq *rq, struct task_struct *prev, * Both of these contain the full memory barrier required by * membarrier after storing to rq->curr, before returning to * user-space. +* +* If mm is NULL and oldmm is dying (!owner), we switch to +* init_mm instead to make sure that oldmm can be freed ASAP. */ - if (!mm) { + if (!mm && oldmm->owner) { next->active_mm = oldmm; mmgrab(oldmm); enter_lazy_tlb(oldmm, next); - } else + } else { + if (!mm) { + mm = _mm; + next->active_mm = mm; + mmgrab(mm); + } switch_mm_irqs_off(oldmm, mm, next); + } if (!prev->mm) { prev->active_mm = NULL; diff --git a/mm/init-mm.c b/mm/init-mm.c index a787a319211e..5bfc6bc333ca 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -36,5 +37,6 @@ struct mm_struct init_mm = { .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns= _user_ns, .cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0}, + .owner = _task, INIT_MM_CONTEXT(init_mm) }; -- 2.18.1
Re: [PATCH v9 4/4] uprobe: use FOLL_SPLIT_PMD instead of FOLL_SPLIT
> On Jul 26, 2019, at 4:02 PM, Andrew Morton wrote: > > On Thu, 25 Jul 2019 22:46:54 -0700 Song Liu wrote: > >> This patches uses newly added FOLL_SPLIT_PMD in uprobe. This enables easy >> regroup of huge pmd after the uprobe is disabled (in next patch). > > Confused. There is no "next patch". That was the patch 5, which was in earlier versions. I am working on addressing Kirill's feedback for it. Do I need to resubmit 4/4 with modified change log? Thanks, Song
[PATCH] dt-bindings: Fix more $id value mismatches filenames
The path in the schema '$id' values are wrong. Fix them. Signed-off-by: Rob Herring --- Documentation/devicetree/bindings/arm/renesas.yaml | 2 +- Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml | 2 +- Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml| 2 +- .../firmware/intel,ixp4xx-network-processing-engine.yaml| 2 +- Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml| 2 +- Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml| 2 +- .../bindings/interrupt-controller/intel,ixp4xx-interrupt.yaml | 2 +- ...x-queue-manager.yaml => intel,ixp4xx-ahb-queue-manager.yaml} | 2 +- .../devicetree/bindings/net/allwinner,sun8i-a83t-emac.yaml | 2 +- .../devicetree/bindings/phy/allwinner,sun6i-a31-mipi-dphy.yaml | 2 +- Documentation/devicetree/bindings/timer/intel,ixp4xx-timer.yaml | 2 +- 11 files changed, 11 insertions(+), 11 deletions(-) rename Documentation/devicetree/bindings/misc/{intel,ixp4xx-queue-manager.yaml => intel,ixp4xx-ahb-queue-manager.yaml} (95%) diff --git a/Documentation/devicetree/bindings/arm/renesas.yaml b/Documentation/devicetree/bindings/arm/renesas.yaml index 08c923f8c257..28eb458f761a 100644 --- a/Documentation/devicetree/bindings/arm/renesas.yaml +++ b/Documentation/devicetree/bindings/arm/renesas.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 %YAML 1.2 --- -$id: http://devicetree.org/schemas/arm/shmobile.yaml# +$id: http://devicetree.org/schemas/arm/renesas.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Renesas SH-Mobile, R-Mobile, and R-Car Platform Device Tree Bindings diff --git a/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml b/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml index aae53fc3cb1e..2bd519d2e855 100644 --- a/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml +++ b/Documentation/devicetree/bindings/arm/socionext/milbeaut.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 %YAML 1.2 --- -$id: 
http://devicetree.org/schemas/arm/milbeaut.yaml# +$id: http://devicetree.org/schemas/arm/socionext/milbeaut.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Milbeaut platforms device tree bindings diff --git a/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml b/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml index 4326d2cfa15d..a8765ba29476 100644 --- a/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml +++ b/Documentation/devicetree/bindings/arm/ti/ti,davinci.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 %YAML 1.2 --- -$id: http://devicetree.org/schemas/arm/ti/davinci.yaml# +$id: http://devicetree.org/schemas/arm/ti/ti,davinci.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Texas Instruments DaVinci Platforms Device Tree Bindings diff --git a/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml b/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml index 8cb136c376fb..4f0db8ee226a 100644 --- a/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml +++ b/Documentation/devicetree/bindings/firmware/intel,ixp4xx-network-processing-engine.yaml @@ -2,7 +2,7 @@ # Copyright 2019 Linaro Ltd. 
%YAML 1.2 --- -$id: "http://devicetree.org/schemas/firmware/intel-ixp4xx-network-processing-engine.yaml#; +$id: "http://devicetree.org/schemas/firmware/intel,ixp4xx-network-processing-engine.yaml#; $schema: "http://devicetree.org/meta-schemas/core.yaml#; title: Intel IXP4xx Network Processing Engine diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml index 7ba167e2e1ea..c602b6fe1c0c 100644 --- a/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml +++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl345.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 %YAML 1.2 --- -$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl345.yaml# +$id: http://devicetree.org/schemas/iio/accel/adi,adxl345.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Analog Devices ADXL345/ADXL375 3-Axis Digital Accelerometers diff --git a/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml b/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml index a7fafb9bf5c6..e7daffec88d3 100644 --- a/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml +++ b/Documentation/devicetree/bindings/iio/accel/adi,adxl372.yaml @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 %YAML 1.2 --- -$id: http://devicetree.org/schemas/iio/accelerometers/adi,adxl372.yaml# +$id: http://devicetree.org/schemas/iio/accel/adi,adxl372.yaml# $schema: http://devicetree.org/meta-schemas/core.yaml# title: Analog Devices ADXL372 3-Axis, +/-(200g) Digital Accelerometer diff --git