[RFC PATCH 66/73] x86/pvm: Use new cpu feature to describe XENPV and PVM

2024-02-26 Thread Lai Jiangshan
From: Hou Wenlong 

Some PVOPS are patched to the native version directly when the guest is
not a XENPV guest. However, this approach no longer works once a PVM
guest is introduced. To address this, use a new CPU feature to describe
both XENPV and PVM, and ensure that those PVOPS are patched to the native
version only when the kernel is not running as a paravirtualized guest.

Signed-off-by: Hou Wenlong 
Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_64.S  |  5 ++---
 arch/x86/include/asm/cpufeatures.h |  1 +
 arch/x86/include/asm/paravirt.h| 14 +++---
 arch/x86/kernel/pvm.c  |  1 +
 arch/x86/xen/enlighten_pv.c|  1 +
 5 files changed, 12 insertions(+), 10 deletions(-)
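Note (illustration only): the one-line additions to pvm.c and enlighten_pv.c
listed in the diffstat above presumably just force-set the new synthetic
feature from each guest's early setup code. A sketch of what that would look
like, not the actual hunks:

    /* Sketch: both paravirtual flavors advertise the shared bit so that
     * ALT_NOT(X86_FEATURE_PV_GUEST) alternatives keep their PV form. */

    /* arch/x86/xen/enlighten_pv.c, Xen PV early setup */
    setup_force_cpu_cap(X86_FEATURE_XENPV);
    setup_force_cpu_cap(X86_FEATURE_PV_GUEST);        /* new */

    /* arch/x86/kernel/pvm.c, PVM early setup */
    setup_force_cpu_cap(X86_FEATURE_KVM_PVM_GUEST);
    setup_force_cpu_cap(X86_FEATURE_PV_GUEST);        /* new */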

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index fe12605b3c05..6b41a1837698 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -127,9 +127,8 @@ SYM_INNER_LABEL(entry_SYSCALL_64_after_hwframe, SYM_L_GLOBAL)
 * In the PVM guest case we must use eretu synthetic instruction.
 */
 
-   ALTERNATIVE_2 "testb %al, %al; jz 
swapgs_restore_regs_and_return_to_usermode", \
-   "jmp swapgs_restore_regs_and_return_to_usermode", 
X86_FEATURE_XENPV, \
-   "jmp swapgs_restore_regs_and_return_to_usermode", 
X86_FEATURE_KVM_PVM_GUEST
+   ALTERNATIVE "testb %al, %al; jz 
swapgs_restore_regs_and_return_to_usermode", \
+   "jmp swapgs_restore_regs_and_return_to_usermode", 
X86_FEATURE_PV_GUEST
 
/*
 * We win! This label is here just for ease of understanding
diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h
index e17e72f13423..72ef58a2db19 100644
--- a/arch/x86/include/asm/cpufeatures.h
+++ b/arch/x86/include/asm/cpufeatures.h
@@ -238,6 +238,7 @@
 #define X86_FEATURE_VCPUPREEMPT( 8*32+21) /* "" PV vcpu_is_preempted function */
 #define X86_FEATURE_TDX_GUEST  ( 8*32+22) /* Intel Trust Domain Extensions Guest */
 #define X86_FEATURE_KVM_PVM_GUEST  ( 8*32+23) /* KVM Pagetable-based Virtual Machine guest */
+#define X86_FEATURE_PV_GUEST   ( 8*32+24) /* "" Paravirtual guest */
 
 /* Intel-defined CPU features, CPUID level 0x0007:0 (EBX), word 9 */
 #define X86_FEATURE_FSGSBASE   ( 9*32+ 0) /* RDFSBASE, WRFSBASE, RDGSBASE, WRGSBASE instructions*/
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index deaee9ec575e..a864ee481ca2 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -143,7 +143,7 @@ static __always_inline unsigned long read_cr2(void)
 {
return PVOP_ALT_CALLEE0(unsigned long, mmu.read_cr2,
"mov %%cr2, %%rax;",
-   ALT_NOT(X86_FEATURE_XENPV));
+   ALT_NOT(X86_FEATURE_PV_GUEST));
 }
 
 static __always_inline void write_cr2(unsigned long x)
@@ -154,13 +154,13 @@ static __always_inline void write_cr2(unsigned long x)
 static inline unsigned long __read_cr3(void)
 {
return PVOP_ALT_CALL0(unsigned long, mmu.read_cr3,
- "mov %%cr3, %%rax;", ALT_NOT(X86_FEATURE_XENPV));
+ "mov %%cr3, %%rax;", 
ALT_NOT(X86_FEATURE_PV_GUEST));
 }
 
 static inline void write_cr3(unsigned long x)
 {
PVOP_ALT_VCALL1(mmu.write_cr3, x,
-   "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_XENPV));
+   "mov %%rdi, %%cr3", ALT_NOT(X86_FEATURE_PV_GUEST));
 }
 
 static inline void __write_cr4(unsigned long x)
@@ -694,17 +694,17 @@ bool __raw_callee_save___native_vcpu_is_preempted(long cpu);
 static __always_inline unsigned long arch_local_save_flags(void)
 {
return PVOP_ALT_CALLEE0(unsigned long, irq.save_fl, "pushf; pop %%rax;",
-   ALT_NOT(X86_FEATURE_XENPV));
+   ALT_NOT(X86_FEATURE_PV_GUEST));
 }
 
 static __always_inline void arch_local_irq_disable(void)
 {
-   PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_XENPV));
+   PVOP_ALT_VCALLEE0(irq.irq_disable, "cli;", ALT_NOT(X86_FEATURE_PV_GUEST));
 }
 
 static __always_inline void arch_local_irq_enable(void)
 {
-   PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_XENPV));
+   PVOP_ALT_VCALLEE0(irq.irq_enable, "sti;", ALT_NOT(X86_FEATURE_PV_GUEST));
 }
 
 static __always_inline unsigned long arch_local_irq_save(void)
@@ -776,7 +776,7 @@ void native_pv_lock_init(void) __init;
 .endm
 
 #define SAVE_FLAGS ALTERNATIVE "PARA_IRQ_save_fl;", "pushf; pop %rax;", \
-   ALT_NOT(X86_FEATURE_XENPV)
+   ALT_NOT(X86_FEATURE_PV_GUEST)
 #endif
 #endif /* CONFIG_PARAVIRT_XXL */
 #endif /* CONFIG_X86_64 */
diff --git a/arch

Re: [PATCH 2/2] KVM: x86: Fix split-irqchip vs interrupt injection window request

2021-04-15 Thread Lai Jiangshan
On Thu, Apr 15, 2021 at 2:07 PM Paolo Bonzini  wrote:
>
> On 15/04/21 02:59, Lai Jiangshan wrote:
> > The next call to inject_pending_event() will reach here AT FIRST with
> > vcpu->arch.exception.injected==false and vcpu->arch.exception.pending==false
> >
> >>   ... if (!vcpu->arch.exception.pending) {
> >>   if (vcpu->arch.nmi_injected) {
> >>   static_call(kvm_x86_set_nmi)(vcpu);
> >>   can_inject = false;
> >>   } else if (vcpu->arch.interrupt.injected) {
> >>   static_call(kvm_x86_set_irq)(vcpu);
> >>   can_inject = false;
> >
> > And comes here and vcpu->arch.interrupt.injected is true for there is
> > an interrupt queued by KVM_INTERRUPT for pure user irqchip. It then does
> > the injection of the interrupt without checking the EFLAGS.IF.
>
> Ok, understood now.  Yeah, that could be a problem for userspace irqchip
> so we should switch it to use pending_external_vector instead.  Are you
> going to write the patch or should I?
>

I would prefer that you do it.  I haven't figured out how to write a clean
test for it and confirm it upstream, but I will backport your patch and
test it.

My fix changes the behavior back to what it was before 664f8e26b00c7,
where arch.exception.pending=true prevented ready_for_interrupt_injection
from being non-zero.  That way KVM_INTERRUPT keeps its original behavior
of injecting the IRQ into the guest immediately.  (Userspace may regret
an IRQ injected at an unexpected time, since it has no way to revise or
cancel it.)  Your fix, on the other hand, will unify the behavior of all
kinds of irqchips.
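Roughly, the condition I have in mind looks like this (a sketch only,
paraphrasing the pre-664f8e26b00c7 logic quoted earlier in the thread, not
an actual patch):

    /*
     * Sketch: a pending exception is covered by
     * kvm_event_needs_reinjection(), so userspace never sees
     * ready_for_interrupt_injection=1 while an exception is pending.
     */
    static bool kvm_vcpu_ready_for_interrupt_injection(struct kvm_vcpu *vcpu)
    {
            return kvm_arch_interrupt_allowed(vcpu) &&
                   !kvm_event_needs_reinjection(vcpu) &&
                   kvm_cpu_accept_dm_intr(vcpu);
    }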

Thanks
Lai


> Thanks!
>
> Paolo
>
> > My question is that what stops the next call to inject_pending_event()
> > to reach here when KVM_INTERRUPT is called with exepction pending.
>


Re: [PATCH 2/2] KVM: x86: Fix split-irqchip vs interrupt injection window request

2021-04-14 Thread Lai Jiangshan
On Thu, Apr 15, 2021 at 12:58 AM Paolo Bonzini  wrote:
>
> On 14/04/21 04:28, Lai Jiangshan wrote:
> > On Tue, Apr 13, 2021 at 8:15 PM Paolo Bonzini  wrote:
> >>
> >> On 13/04/21 13:03, Lai Jiangshan wrote:
> >>> This patch claims that it has a place to
> >>> stash the IRQ when EFLAGS.IF=0, but inject_pending_event() seams to ignore
> >>> EFLAGS.IF and queues the IRQ to the guest directly in the first branch
> >>> of using "kvm_x86_ops.set_irq(vcpu)".
> >>
> >> This is only true for pure-userspace irqchip.  For split-irqchip, in
> >> which case the "place to stash" the interrupt is
> >> vcpu->arch.pending_external_vector.
> >>
> >> For pure-userspace irqchip, KVM_INTERRUPT only cares about being able to
> >> stash the interrupt in vcpu->arch.interrupt.injected.  It is indeed
> >> wrong for userspace to call KVM_INTERRUPT if the vCPU is not ready for
> >> interrupt injection, but KVM_INTERRUPT does not return an error.
> >
> > Thanks for the reply.
> >
> > May I ask what is the correct/practical way of using KVM_INTERRUPT ABI
> > for pure-userspace irqchip.
> >
> > gVisor is indeed a pure-userspace irqchip, it will call KVM_INTERRUPT
> > when kvm_run->ready_for_interrupt_injection=1 (along with other conditions
> > unrelated to our discussion).
> >
> > https://github.com/google/gvisor/blob/a9441aea2780da8c93da1c73da860219f98438de/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go#L105
> >
> > if kvm_run->ready_for_interrupt_injection=1 when expection pending or
> > EFLAGS.IF=0, it would be unexpected for gVisor.
>
> Not with EFLAGS.IF=0.  For pending exception, there is code to handle it
> in inject_pending_event:
>

Thanks for the reply.
(I have rearranged your summary here.)

> so what happens is:
>
> - the interrupt will not be injected before the exception
>
> - KVM will schedule an immediate vmexit to inject the interrupt as well
>
> - if (as is likely) the exception has turned off interrupts, the next
> call to inject_pending_event will reach
> static_call(kvm_x86_enable_irq_window) and the interrupt will only be
> injected when IF becomes 1 again.

The next call to inject_pending_event() will FIRST reach this point with
vcpu->arch.exception.injected==false and vcpu->arch.exception.pending==false:

>  ... if (!vcpu->arch.exception.pending) {
>  if (vcpu->arch.nmi_injected) {
>  static_call(kvm_x86_set_nmi)(vcpu);
>  can_inject = false;
>  } else if (vcpu->arch.interrupt.injected) {
>  static_call(kvm_x86_set_irq)(vcpu);
>  can_inject = false;

Then it comes here, and vcpu->arch.interrupt.injected is true because an
interrupt was queued by KVM_INTERRUPT for the pure userspace irqchip.  It
then injects the interrupt without checking EFLAGS.IF.

My question is: what stops the next call to inject_pending_event() from
reaching this point when KVM_INTERRUPT is called while an exception is
pending?

Or, what makes kvm_run->ready_for_interrupt_injection zero while an
exception is pending, so that userspace is disallowed from calling
KVM_INTERRUPT?


>  }
>  }
> ...
>  if (vcpu->arch.exception.pending) {
> ...
>  can_inject = false;
>  }
> // this is vcpu->arch.interrupt.injected for userspace LAPIC
>  if (kvm_cpu_has_injectable_intr(vcpu)) {
>  r = can_inject ?
> static_call(kvm_x86_interrupt_allowed)(vcpu, true) : -EBUSY;
> if (r < 0)
> goto busy;
> ...
> }
>
>
> Paolo
>


Re: [PATCH 2/2] KVM: x86: Fix split-irqchip vs interrupt injection window request

2021-04-13 Thread Lai Jiangshan
On Tue, Apr 13, 2021 at 8:15 PM Paolo Bonzini  wrote:
>
> On 13/04/21 13:03, Lai Jiangshan wrote:
> > This patch claims that it has a place to
> > stash the IRQ when EFLAGS.IF=0, but inject_pending_event() seams to ignore
> > EFLAGS.IF and queues the IRQ to the guest directly in the first branch
> > of using "kvm_x86_ops.set_irq(vcpu)".
>
> This is only true for pure-userspace irqchip.  For split-irqchip, in
> which case the "place to stash" the interrupt is
> vcpu->arch.pending_external_vector.
>
> For pure-userspace irqchip, KVM_INTERRUPT only cares about being able to
> stash the interrupt in vcpu->arch.interrupt.injected.  It is indeed
> wrong for userspace to call KVM_INTERRUPT if the vCPU is not ready for
> interrupt injection, but KVM_INTERRUPT does not return an error.

Thanks for the reply.

May I ask what the correct/practical way of using the KVM_INTERRUPT ABI
is for a pure-userspace irqchip?

gVisor is indeed a pure-userspace irqchip; it calls KVM_INTERRUPT when
kvm_run->ready_for_interrupt_injection=1 (along with other conditions
unrelated to our discussion).

https://github.com/google/gvisor/blob/a9441aea2780da8c93da1c73da860219f98438de/pkg/sentry/platform/kvm/bluepill_amd64_unsafe.go#L105

If kvm_run->ready_for_interrupt_injection=1 while an exception is pending
or EFLAGS.IF=0, that would be unexpected for gVisor.
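In other words, the pattern gVisor relies on is roughly the following
(illustrative C, not gVisor's actual Go code; vcpu_fd, run and vector stand
in for the caller's own state):

    struct kvm_interrupt irq = { .irq = vector };

    /* Only assert the interrupt when the last KVM_RUN exit said the vCPU
     * can take it; otherwise ask KVM for an interrupt-window exit. */
    if (run->ready_for_interrupt_injection && run->if_flag)
            ioctl(vcpu_fd, KVM_INTERRUPT, &irq);
    else
            run->request_interrupt_window = 1;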

Thanks
Lai

>
> Ignoring the fact that this would be incorrect use of the API, are you
> saying that the incorrect injection was not possible before this patch?
>
> Paolo
>


Re: [PATCH 2/2] KVM: x86: Fix split-irqchip vs interrupt injection window request

2021-04-13 Thread Lai Jiangshan
On Tue, Apr 13, 2021 at 5:43 AM Sean Christopherson  wrote:
>
> On Fri, Apr 09, 2021, Lai Jiangshan wrote:
> > On Fri, Nov 27, 2020 at 7:26 PM Paolo Bonzini  wrote:
> > >
> > > kvm_cpu_accept_dm_intr and kvm_vcpu_ready_for_interrupt_injection are
> > > a hodge-podge of conditions, hacked together to get something that
> > > more or less works.  But what is actually needed is much simpler;
> > > in both cases the fundamental question is, do we have a place to stash
> > > an interrupt if userspace does KVM_INTERRUPT?
> > >
> > > In userspace irqchip mode, that is !vcpu->arch.interrupt.injected.
> > > Currently kvm_event_needs_reinjection(vcpu) covers it, but it is
> > > unnecessarily restrictive.
> > >
> > > In split irqchip mode it's a bit more complicated, we need to check
> > > kvm_apic_accept_pic_intr(vcpu) (the IRQ window exit is basically an INTACK
> > > cycle and thus requires ExtINTs not to be masked) as well as
> > > !pending_userspace_extint(vcpu).  However, there is no need to
> > > check kvm_event_needs_reinjection(vcpu), since split irqchip keeps
> > > pending ExtINT state separate from event injection state, and checking
> > > kvm_cpu_has_interrupt(vcpu) is wrong too since ExtINT has higher
> > > priority than APIC interrupts.  In fact the latter fixes a bug:
> > > when userspace requests an IRQ window vmexit, an interrupt in the
> > > local APIC can cause kvm_cpu_has_interrupt() to be true and thus
> > > kvm_vcpu_ready_for_interrupt_injection() to return false.  When this
> > > happens, vcpu_run does not exit to userspace but the interrupt window
> > > vmexits keep occurring.  The VM loops without any hope of making progress.
> > >
> > > Once we try to fix these with something like
> > >
> > >  return kvm_arch_interrupt_allowed(vcpu) &&
> > > -!kvm_cpu_has_interrupt(vcpu) &&
> > > -!kvm_event_needs_reinjection(vcpu) &&
> > > -kvm_cpu_accept_dm_intr(vcpu);
> > > +(!lapic_in_kernel(vcpu)
> > > + ? !vcpu->arch.interrupt.injected
> > > + : (kvm_apic_accept_pic_intr(vcpu)
> > > +&& !pending_userspace_extint(v)));
> > >
> > > we realize two things.  First, thanks to the previous patch the complex
> > > conditional can reuse !kvm_cpu_has_extint(vcpu).  Second, the interrupt
> > > window request in vcpu_enter_guest()
> > >
> > > bool req_int_win =
> > > dm_request_for_irq_injection(vcpu) &&
> > > kvm_cpu_accept_dm_intr(vcpu);
> > >
> > > should be kept in sync with kvm_vcpu_ready_for_interrupt_injection():
> > > it is unnecessary to ask the processor for an interrupt window
> > > if we would not be able to return to userspace.  Therefore, the
> > > complex conditional is really the correct implementation of
> > > kvm_cpu_accept_dm_intr(vcpu).  It all makes sense:
> > >
> > > - we can accept an interrupt from userspace if there is a place
> > >   to stash it (and, for irqchip split, ExtINTs are not masked).
> > >   Interrupts from userspace _can_ be accepted even if right now
> > >   EFLAGS.IF=0.
> >
> > Hello, Paolo
> >
> > If userspace does KVM_INTERRUPT, vcpu->arch.interrupt.injected is
> > set immediately, and in inject_pending_event(), we have
> >
> > else if (!vcpu->arch.exception.pending) {
> > if (vcpu->arch.nmi_injected) {
> > kvm_x86_ops.set_nmi(vcpu);
> > can_inject = false;
> > } else if (vcpu->arch.interrupt.injected) {
> > kvm_x86_ops.set_irq(vcpu);
> > can_inject = false;
> > }
> > }
> >
> > I'm curious about that can the kvm_x86_ops.set_irq() here be possible
> > to queue the irq with EFLAGS.IF=0? If not, which code prevents it?
>
> The interrupt is only directly injected if the local APIC is _not_ in-kernel.
> If userspace is managing the local APIC, my understanding is that userspace is
> also responsible for honoring EFLAGS.IF, though KVM aids userspace by updating
> vcpu->run->ready_for_interrupt_injection when exiting to userspace.  When
> userspace is modeling the local APIC, that resolves to
> kvm_vcpu_ready_for_interrupt_injection():
>
> return kvm_arch_interrupt_allowed(vcpu) &&
> kvm_cpu_accep

Re: [PATCH 2/2] KVM: x86: Fix split-irqchip vs interrupt injection window request

2021-04-09 Thread Lai Jiangshan
On Fri, Nov 27, 2020 at 7:26 PM Paolo Bonzini  wrote:
>
> kvm_cpu_accept_dm_intr and kvm_vcpu_ready_for_interrupt_injection are
> a hodge-podge of conditions, hacked together to get something that
> more or less works.  But what is actually needed is much simpler;
> in both cases the fundamental question is, do we have a place to stash
> an interrupt if userspace does KVM_INTERRUPT?
>
> In userspace irqchip mode, that is !vcpu->arch.interrupt.injected.
> Currently kvm_event_needs_reinjection(vcpu) covers it, but it is
> unnecessarily restrictive.
>
> In split irqchip mode it's a bit more complicated, we need to check
> kvm_apic_accept_pic_intr(vcpu) (the IRQ window exit is basically an INTACK
> cycle and thus requires ExtINTs not to be masked) as well as
> !pending_userspace_extint(vcpu).  However, there is no need to
> check kvm_event_needs_reinjection(vcpu), since split irqchip keeps
> pending ExtINT state separate from event injection state, and checking
> kvm_cpu_has_interrupt(vcpu) is wrong too since ExtINT has higher
> priority than APIC interrupts.  In fact the latter fixes a bug:
> when userspace requests an IRQ window vmexit, an interrupt in the
> local APIC can cause kvm_cpu_has_interrupt() to be true and thus
> kvm_vcpu_ready_for_interrupt_injection() to return false.  When this
> happens, vcpu_run does not exit to userspace but the interrupt window
> vmexits keep occurring.  The VM loops without any hope of making progress.
>
> Once we try to fix these with something like
>
>  return kvm_arch_interrupt_allowed(vcpu) &&
> -!kvm_cpu_has_interrupt(vcpu) &&
> -!kvm_event_needs_reinjection(vcpu) &&
> -kvm_cpu_accept_dm_intr(vcpu);
> +(!lapic_in_kernel(vcpu)
> + ? !vcpu->arch.interrupt.injected
> + : (kvm_apic_accept_pic_intr(vcpu)
> +&& !pending_userspace_extint(v)));
>
> we realize two things.  First, thanks to the previous patch the complex
> conditional can reuse !kvm_cpu_has_extint(vcpu).  Second, the interrupt
> window request in vcpu_enter_guest()
>
> bool req_int_win =
> dm_request_for_irq_injection(vcpu) &&
> kvm_cpu_accept_dm_intr(vcpu);
>
> should be kept in sync with kvm_vcpu_ready_for_interrupt_injection():
> it is unnecessary to ask the processor for an interrupt window
> if we would not be able to return to userspace.  Therefore, the
> complex conditional is really the correct implementation of
> kvm_cpu_accept_dm_intr(vcpu).  It all makes sense:
>
> - we can accept an interrupt from userspace if there is a place
>   to stash it (and, for irqchip split, ExtINTs are not masked).
>   Interrupts from userspace _can_ be accepted even if right now
>   EFLAGS.IF=0.

Hello, Paolo

If userspace does KVM_INTERRUPT, vcpu->arch.interrupt.injected is
set immediately, and in inject_pending_event(), we have

else if (!vcpu->arch.exception.pending) {
if (vcpu->arch.nmi_injected) {
kvm_x86_ops.set_nmi(vcpu);
can_inject = false;
} else if (vcpu->arch.interrupt.injected) {
kvm_x86_ops.set_irq(vcpu);
can_inject = false;
}
}

I'm curious: can the kvm_x86_ops.set_irq() here end up queueing the IRQ
with EFLAGS.IF=0?  If not, which code prevents it?

I'm asking because I just noticed that an interrupt can be queued while
an exception is pending, and this patch relaxes that even further.

Note: an interrupt could NOT be queued while an exception was pending
until 664f8e26b00c7 ("KVM: X86: Fix loss of exception which
has not yet been injected"), which I think is dangerous.

Thanks
Lai

>
> - in order to tell userspace we will inject its interrupt ("IRQ
>   window open" i.e. kvm_vcpu_ready_for_interrupt_injection), both
>   KVM and the vCPU need to be ready to accept the interrupt.
>
> ... and this is what the patch implements.
>
> Reported-by: David Woodhouse 
> Analyzed-by: David Woodhouse 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Paolo Bonzini 
> ---
>  arch/x86/include/asm/kvm_host.h |  1 +
>  arch/x86/kvm/irq.c  |  2 +-
>  arch/x86/kvm/x86.c  | 17 +++--
>  3 files changed, 9 insertions(+), 11 deletions(-)
>
> diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
> index d44858b69353..ddaf3e01a854 100644
> --- a/arch/x86/include/asm/kvm_host.h
> +++ b/arch/x86/include/asm/kvm_host.h
> @@ -1655,6 +1655,7 @@ int kvm_test_age_hva(struct kvm *kvm, unsigned long hva);
>  int kvm_set_spte_hva(struct kvm *kvm, unsigned long hva, pte_t pte);
>  int kvm_cpu_has_injectable_intr(struct kvm_vcpu *v);
>  int kvm_cpu_has_interrupt(struct kvm_vcpu *vcpu);
> +int kvm_cpu_has_extint(struct kvm_vcpu *v);
>  int kvm_arch_interrupt_allowed(struct kvm_vcpu *vcpu);
>  int kvm_cpu_get_interrupt(struct kvm_vcpu *v);
>  void kvm_vcpu_reset(struct kvm_vcpu *vcpu, bool init_event);
> diff --git 

[tip: x86/cleanups] x86/process/64: Move cpu_current_top_of_stack out of TSS

2021-03-28 Thread tip-bot2 for Lai Jiangshan
The following commit has been merged into the x86/cleanups branch of tip:

Commit-ID: 1591584e2e762edecefde403c44d9c26c9ff72c9
Gitweb:
https://git.kernel.org/tip/1591584e2e762edecefde403c44d9c26c9ff72c9
Author:Lai Jiangshan 
AuthorDate:Tue, 26 Jan 2021 01:34:29 +08:00
Committer: Thomas Gleixner 
CommitterDate: Sun, 28 Mar 2021 22:40:10 +02:00

x86/process/64: Move cpu_current_top_of_stack out of TSS

cpu_current_top_of_stack is currently stored in TSS.sp1. TSS is exposed
through the cpu_entry_area which is visible with user CR3 when PTI is
enabled and active.

This makes it a coveted fruit for attackers.  An attacker can fetch the
kernel stack top from it and continue next steps of actions based on the
kernel stack.

But it is actually not necessary to be stored in the TSS.  It is only
accessed after the entry code switched to kernel CR3 and kernel GS_BASE
which means it can be in any regular percpu variable.

The reason why it is in TSS is historical (pre PTI) because TSS is also
used as scratch space in SYSCALL_64 and therefore cache hot.

A syscall also needs the per CPU variable current_task and eventually
__preempt_count, so placing cpu_current_top_of_stack next to them makes it
likely that they end up in the same cache line which should avoid
performance regressions. This is not enforced as the compiler is free to
place these variables, so these entry relevant variables should move into
a data structure to make this enforceable.

The seccomp_benchmark doesn't show any performance loss in the "getpid
native" test result.  Actually, the result changes from 93ns before to 92ns
with this change when KPTI is disabled. The test is very stable and
although the test doesn't show a higher degree of precision it gives enough
confidence that moving cpu_current_top_of_stack does not cause a
regression.

[ tglx: Removed unneeded export. Massaged changelog ]

Signed-off-by: Lai Jiangshan 
Signed-off-by: Thomas Gleixner 
Link: https://lore.kernel.org/r/20210125173444.22696-2-jiangshan...@gmail.com
---
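Illustration (not part of the diff below): the shape of the change, sketched.

    /* Before: an alias into the TSS, which sits in the cpu_entry_area and
     * is therefore mapped in the user page tables when PTI is on. */
    #define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1

    /* After: an ordinary percpu variable, reachable only once the entry
     * code runs with the kernel CR3 and kernel GSBASE. */
    DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);

    static __always_inline unsigned long current_top_of_stack(void)
    {
            return this_cpu_read_stable(cpu_current_top_of_stack);
    }
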
 arch/x86/include/asm/processor.h   | 10 --
 arch/x86/include/asm/switch_to.h   |  7 +--
 arch/x86/include/asm/thread_info.h |  8 +---
 arch/x86/kernel/cpu/common.c   |  2 ++
 arch/x86/kernel/process.c  |  7 +--
 arch/x86/mm/pti.c  |  7 +++
 6 files changed, 8 insertions(+), 33 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 8b3ed21..185142b 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
-   /*
-* We store cpu_current_top_of_stack in sp1 so it's always accessible.
-* Linux does not use ring 1, so sp1 is not otherwise needed.
-*/
u64 sp1;
 
/*
@@ -426,12 +421,7 @@ struct irq_stack {
charstack[IRQ_STACK_SIZE];
 } __aligned(IRQ_STACK_SIZE);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc4..b5f0d2f 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct *task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-   /*
-* x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-* doesn't work on x86-32 because sp1 and
-* cpu_current_top_of_stack have different values (because of
-* the non-zero stack-padding on 32bit).
-*/
+   /* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 06b740b..de406d9 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,13 +197,7 @@ static inline int arch_within_stack_frames(const void * const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
-#endif
+#endif  /* !__ASSEMBLY__ */
 
 /*
  * Thread-synchronous status.
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 1aa5f0a..3401078 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1748,6 +1748,8 @@ DEFINE_PER_CPU(bool, hardirq_stack_inuse);
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__

Re: [RFC PATCH 0/6] [RFC] Faultable tracepoints (v2)

2021-02-25 Thread Lai Jiangshan
On Thu, Feb 25, 2021 at 9:15 AM Mathieu Desnoyers
 wrote:
>
> - On Feb 24, 2021, at 11:22 AM, Michael Jeanson mjean...@efficios.com 
> wrote:
>
> > [ Adding Mathieu Desnoyers in CC ]
> >
> > On 2021-02-23 21 h 16, Steven Rostedt wrote:
> >> On Thu, 18 Feb 2021 17:21:19 -0500
> >> Michael Jeanson  wrote:
> >>
> >>> This series only implements the tracepoint infrastructure required to
> >>> allow tracers to handle page faults. Modifying each tracer to handle
> >>> those page faults would be a next step after we all agree on this piece
> >>> of instrumentation infrastructure.
> >>
> >> I started taking a quick look at this, and came up with the question: how
> >> do you allow preemption when dealing with per-cpu buffers or storage to
> >> record the data?
> >>
> >> That is, perf, bpf and ftrace are all using some kind of per-cpu data, and
> >> this is the reason for the need to disable preemption. What's the solution
> >> that LTTng is using for this? I know it has a per cpu buffers too, but does
> >> it have some kind of "per task" buffer that is being used to extract the
> >> data that can fault?
>
> As a prototype solution, what I've done currently is to copy the user-space
> data into a kmalloc'd buffer in a preparation step before disabling preemption
> and copying data over into the per-cpu buffers. It works, but I think we 
> should
> be able to do it without the needless copy.
>
> What I have in mind as an efficient solution (not implemented yet) for the 
> LTTng
> kernel tracer goes as follows:
>
> #define COMMIT_LOCAL 0
> #define COMMIT_REMOTE 1
>
> - faultable probe is called from system call tracepoint [ 
> preemption/blocking/migration is allowed ]

label:
restart:

>   - probe code calculate the length which needs to be reserved to store the 
> event
> (e.g. user strlen),

Does "user strlen" makes the content fault in?

Is it possible to make the sleepable faulting happen only here, between
"restart" and the following "preempt disable"?  The code here would do a
prefetch operation such as the "user strlen".

Then we could keep preemption disabled while copying the data.  If a
fault happens while copying, we restart from the "restart" label.

Just an immature thought.
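Something along these lines, with hypothetical helper names just to
illustrate the flow (not LTTng code):

    restart:
            /* Preemptible: touch the user data so it is faulted in. */
            len = hypothetical_user_strnlen(ustr, MAX_LEN); /* may fault/sleep */

            preempt_disable();
            slot = hypothetical_ringbuf_reserve(smp_processor_id(), len);
            /* Non-sleeping copy; fails if the page went away meanwhile. */
            if (copy_from_user_nofault(slot, ustr, len)) {
                    hypothetical_ringbuf_discard(slot);
                    preempt_enable();
                    goto restart;           /* fault it in again and retry */
            }
            hypothetical_ringbuf_commit(slot);
            preempt_enable();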

Thanks
Lai

>
>   - preempt disable -> [ preemption/blocking/migration is not allowed from 
> here ]
> - reserve_cpu = smp_processor_id()
> - reserve space in the ring buffer for reserve_cpu
>   [ from that point on, we have _exclusive_ access to write into the ring 
> buffer "slot"
> from any cpu until we commit. ]
>   - preempt enable -> [ preemption/blocking/migration is allowed from here ]
>
>   - copy data from user-space to the ring buffer "slot",
>
>   - preempt disable -> [ preemption/blocking/migration is not allowed from 
> here ]
> commit_cpu = smp_processor_id()
> if (commit_cpu == reserve_cpu)
>use local_add to increment the 
> buf[commit_cpu].subbuffer[current].commit_count[COMMIT_LOCAL]
> else
>use atomic_add to increment the 
> buf[commit_cpu].subbuffer[current].commit_count[COMMIT_REMOTE]
>   - preempt enable -> [ preemption/blocking/migration is allowed from here ]
>
> Given that lttng uses per-buffer/per-sub-buffer commit counters as simple 
> free-running
> accumulators, the trick here is to use two commit counters rather than single 
> one for each
> sub-buffer. Whenever we need to read a commit count value, we always sum the 
> total of the
> LOCAL and REMOTE counter.
>
> This allows dealing with migration between reserve and commit without 
> requiring the overhead
> of an atomic operation on the fast-path (LOCAL case).
>
> I had to design this kind of dual-counter trick in the context of user-space 
> use of restartable
> sequences. It looks like it may have a role to play in the kernel as well. :)
>
> Or am I missing something important that would not survive real-life ?
>
> Thanks,
>
> Mathieu
>
> --
> Mathieu Desnoyers
> EfficiOS Inc.
> http://www.efficios.com


Re: [PATCH] workqueue: Remove rcu_read_lock/unlock() in workqueue_congested()

2021-02-17 Thread Lai Jiangshan
+CC Paul


On Wed, Feb 17, 2021 at 7:58 PM  wrote:
>
> From: Zqiang 
>
> The RCU read critical area already by preempt_disable/enable()
> (equivalent to rcu_read_lock_sched/unlock_sched()) mark, so remove
> rcu_read_lock/unlock().

I think we can leave it as is, since it acts as documentation, especially
because workqueue_congested() is not performance critical.  Either way
is OK with me.

If it does need to be changed, please also do the same for the
rcu_read_lock() in wq_watchdog_timer_fn().

__queue_work() and try_to_grab_pending() also use local_irq_save() and
rcu_read_lock() at the same time, but I don't know whether those
local_irq_save() calls will be changed to raw_local_irq_save() on
PREEMPT_RT.
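For reference, the pattern being discussed looks roughly like this (a sketch
with made-up types, not the actual workqueue code):

    static bool example_congested(struct example_queue *q, int cpu)
    {
            struct example_pcpu *p;
            bool ret;

            /* With the consolidated RCU flavors, a preempt-disabled region
             * is already an RCU read-side critical section, so the explicit
             * rcu_read_lock()/unlock() pair documents intent rather than
             * adding protection. */
            preempt_disable();
            rcu_read_lock();
            p = rcu_dereference(q->pcpu[cpu]);
            ret = !list_empty(&p->works);
            rcu_read_unlock();
            preempt_enable();

            return ret;
    }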


>
> Signed-off-by: Zqiang 
> ---
>  kernel/workqueue.c | 2 --
>  1 file changed, 2 deletions(-)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 0d150da252e8..c599835ad6c3 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -4540,7 +4540,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
> struct pool_workqueue *pwq;
> bool ret;
>
> -   rcu_read_lock();
> preempt_disable();
>
> if (cpu == WORK_CPU_UNBOUND)
> @@ -4553,7 +4552,6 @@ bool workqueue_congested(int cpu, struct workqueue_struct *wq)
>
> ret = !list_empty(&pwq->delayed_works);
> preempt_enable();
> -   rcu_read_unlock();
>
> return ret;
>  }
> --
> 2.25.1
>


Re: [PATCH] workqueue: Move the position of debug_work_activate() in __queue_work()

2021-02-17 Thread Lai Jiangshan
On Thu, Feb 11, 2021 at 4:24 PM  wrote:
>
> From: Zqiang 
>
> The debug_work_activate() is called on the premise that
> the work can be inserted, because if wq be in WQ_DRAINING
> status, insert work may be failed.
>

Please add:
Fixes: e41e704bc4f49 ("workqueue: improve destroy_workqueue() debuggability")

The code looks good to me.

Reviewed-by: Lai Jiangshan 

> Signed-off-by: Zqiang 
> ---
>  kernel/workqueue.c | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index 0d150da252e8..21fb00b52def 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1412,7 +1412,6 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
>  */
> lockdep_assert_irqs_disabled();
>
> -   debug_work_activate(work);
>
> /* if draining, only works from the same workqueue are allowed */
> if (unlikely(wq->flags & __WQ_DRAINING) &&
> @@ -1494,6 +1493,7 @@ static void __queue_work(int cpu, struct workqueue_struct *wq,
> worklist = &pwq->delayed_works;
> }
>
> +   debug_work_activate(work);
> insert_work(pwq, work, worklist, work_flags);
>
>  out:
> --
> 2.25.1
>


Re: [PATCH V4 0/6] x86: Don't abuse tss.sp1

2021-02-10 Thread Lai Jiangshan
Hi Mark

Thank you for your reply.

On Thu, Feb 11, 2021 at 7:42 AM mark gross  wrote:
>
> On Wed, Feb 10, 2021 at 09:39:11PM +0800, Lai Jiangshan wrote:
> > From: Lai Jiangshan 
> >
> > In x86_64, tss.sp1 is reused as cpu_current_top_of_stack.  We'd better
> > directly use percpu since CR3 and gs_base is correct when it is used.
> Be more direct if not using percpu is incorrect in some way.

Sure, in the future I will pull the most important reasons from the
changelogs up into the cover letter.

> >
> > In x86_32, tss.sp1 is resued as thread.sp0 in three places in entry
> s/resued/reused

Sorry, I got it wrong in every cover letter even though I noticed it after
V2 was sent.  I forgot to run a spellchecker on the cover letter.

> > code.  We have the correct CR3 and %fs at two of the places.  The last
> > one is sysenter.  This patchset makes %fs available earlier so that
> > we can also use percpu in sysenter.  And add a percpu cpu_current_thread_sp0
> > for thread.sp0 instead of tss.sp1
> >
> > [V3]: 
> > https://lore.kernel.org/lkml/20210127163231.12709-1-jiangshan...@gmail.com/
> > [V2]: 
> > https://lore.kernel.org/lkml/20210125173444.22696-1-jiangshan...@gmail.com/
> > [V1]: 
> > https://lore.kernel.org/lkml/20210123084900.3118-1-jiangshan...@gmail.com/
> >
> > Changed from V3:
> >   Update subjects as Borislav's imperative request. ^_^
> >   Update changelog as Borislav suggested.
> >   Change EXPORT_PER_CPU_SYMBOL to EXPORT_PER_CPU_SYMBOL_GPL.
> >
> > Changed from V2:
> >   Add missing "%ss:" reported by Brian Gerst.
> >
> > Changed from V1:
> >   Requested from Andy to also fix sp1 for x86_32.
> >   Update comments in the x86_64 patch as Andy sugguested.
> >
> > Lai Jiangshan (6):
> >   x86/entry/64: Move cpu_current_top_of_stack out of TSS
> >   x86/entry/32: Use percpu instead of offset-calculation to get
> > thread.sp0 in SWITCH_TO_KERNEL_STACK
> >   x86/entry/32: Switch to the task stack without emptying the entry
> > stack
> >   x86/entry/32: Restore %fs before switching stack
> >   x86/entry/32: Use percpu to get thread.sp0 in SYSENTER
> >   x86/entry/32: Introduce cpu_current_thread_sp0 to replace
> > cpu_tss_rw.x86_tss.sp1
> >
> >  arch/x86/entry/entry_32.S  | 38 +-
> >  arch/x86/include/asm/processor.h   | 12 ++
> >  arch/x86/include/asm/switch_to.h   |  8 +--
> >  arch/x86/include/asm/thread_info.h |  6 -
> >  arch/x86/kernel/asm-offsets.c  |  1 -
> >  arch/x86/kernel/asm-offsets_32.c   | 10 
> >  arch/x86/kernel/cpu/common.c   | 12 +-
> >  arch/x86/kernel/process.c  |  7 --
> >  arch/x86/mm/pti.c  |  7 +++---
> >  9 files changed, 39 insertions(+), 62 deletions(-)
> >
> > --
> > 2.19.1.6.gb485710b
> >


[PATCH V4 4/6] x86/entry/32: Restore %fs before switching stack

2021-02-10 Thread Lai Jiangshan
From: Lai Jiangshan 

entry_SYSENTER_32 saves the user %fs on the entry stack and restores the
kernel %fs before loading the task stack pointer for the stack switch, so
that percpu data can be used before switching stacks in the next patch.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3e693db0963d..01f098c5b017 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -279,11 +279,13 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 skip_fs=0 unwind_espfix=0
cld
 .if \skip_gs == 0
PUSH_GS
 .endif
+
+.if \skip_fs == 0
pushl   %fs
 
pushl   %eax
@@ -293,6 +295,7 @@
UNWIND_ESPFIX_STACK
 .endif
popl%eax
+.endif
 
FIXUP_FRAME
pushl   %es
@@ -906,18 +909,27 @@ SYM_FUNC_START(entry_SYSENTER_32)
BUG_IF_WRONG_CR3 no_user_check=1
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
 
+   /* Restore kernel %fs, so that we can use PERCPU */
+   pushl   %fs
+   movl$(__KERNEL_PERCPU), %eax
+   movl%eax, %fs
+
/* Switch to task stack */
movl%esp, %eax
-   movl(2*4+TSS_entry2task_stack)(%esp), %esp
+   movl(3*4+TSS_entry2task_stack)(%esp), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
pushl   $0  /* pt_regs->sp (placeholder) */
-   pushl   %ss:4(%eax) /* pt_regs->flags (except IF = 0) */
+   pushl   %ss:8(%eax) /* pt_regs->flags (except IF = 0) */
pushl   $__USER_CS  /* pt_regs->cs */
pushl   $0  /* pt_regs->ip = 0 (placeholder) */
-   pushl   %ss:(%eax)  /* pt_regs->orig_ax */
-   SAVE_ALL pt_regs_ax=$-ENOSYS/* save rest, stack already switched */
+   pushl   %ss:4(%eax) /* pt_regs->orig_ax */
+   PUSH_GS /* pt_regs->gs */
+   pushl   %ss:(%eax)  /* pt_regs->fs */
+   /* save rest, stack and %fs already switched */
+   SAVE_ALL pt_regs_ax=$-ENOSYS skip_gs=1 skip_fs=1
+   SET_KERNEL_GS %edx
 
/*
 * SYSENTER doesn't filter flags, so we need to clear NT, AC
-- 
2.19.1.6.gb485710b



[PATCH V4 6/6] x86/entry/32: Introduce cpu_current_thread_sp0 to replace cpu_tss_rw.x86_tss.sp1

2021-02-10 Thread Lai Jiangshan
From: Lai Jiangshan 

TSS sp1 is not used by hardware and is used as a copy of thread.sp0.

It should just use a percpu variable instead, so we introduce
cpu_current_thread_sp0 for it.

And we remove the unneeded TSS_sp1.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S| 6 +++---
 arch/x86/include/asm/processor.h | 2 ++
 arch/x86/include/asm/switch_to.h | 2 +-
 arch/x86/kernel/asm-offsets.c| 1 -
 arch/x86/kernel/cpu/common.c | 9 -
 arch/x86/kernel/process.c| 2 --
 6 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d5b5b43fd0c0..55dcf5c35141 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -472,7 +472,7 @@
movl%esp, %esi
 
/* Load top of task-stack into %edi */
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %edi
 
/* Special case - entry from kernel mode via entry stack */
 #ifdef CONFIG_VM86
@@ -658,7 +658,7 @@
movlPER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
 
/* Bytes on the task-stack to ecx */
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %ecx
subl%esi, %ecx
 
/* Allocate stack-frame on entry-stack */
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
/* Switch to task stack */
movl%esp, %eax
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index e197de05d0aa..a40bade32105 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -776,6 +776,8 @@ static inline void spin_lock_prefetch(const void *x)
 
 #define KSTK_ESP(task) (task_pt_regs(task)->sp)
 
+DECLARE_PER_CPU(unsigned long, cpu_current_thread_sp0);
+
 #else
 #define INIT_THREAD { }
 
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index f0ba06bcba0b..eb0d3ae8a54d 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -69,7 +69,7 @@ static inline void update_task_stack(struct task_struct *task)
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task->thread.sp0);
else
-   this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+   this_cpu_write(cpu_current_thread_sp0, task->thread.sp0);
 #else
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 60b9f42ce3c1..3b63b6062792 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -98,6 +98,5 @@ static void __used common(void)
 
/* Offset for fields in tss_struct */
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
-   OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9c531ec73f5c..86485d55949e 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1792,12 +1792,19 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
 /*
  * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
  * the top of the kernel stack.  Use an extra percpu variable to track the
- * top of the kernel stack directly.
+ * top of the kernel stack directly and a percpu variable to track
+ * thread.sp0 for use in entry code.  cpu_current_top_of_stack and
+ * cpu_current_thread_sp0 have different values because of the non-zero
+ * stack-padding on 32bit.  See the comments at TOP_OF_KERNEL_STACK_PADDING
+ * and vm86.
  */
 DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_thread_sp0) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_current_thread_sp0);
+
 #ifdef CONFIG_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 296de77da4b2..e6d4b5399a81 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -64,8 +64,6 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
 #ifdef CONFIG_X86_32
-   .sp1 = TOP_OF_INIT_STACK,
-
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
 #endif
-- 
2.19.1.6.gb485710b



[PATCH V4 5/6] x86/entry/32: Use percpu to get thread.sp0 in SYSENTER

2021-02-10 Thread Lai Jiangshan
From: Lai Jiangshan 

TSS_entry2task_stack is used to refer to tss.sp1 which is a copy of
thread.sp0.

When TSS_entry2task_stack is used in entry_SYSENTER_32, the CR3 is
already kernel CR3 and the kernel %fs is loaded.

So it directly uses percpu instead of offset-calculation via
TSS_entry2task_stack.

And we remove the unused TSS_entry2task_stack.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S|  2 +-
 arch/x86/kernel/asm-offsets_32.c | 10 --
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 01f098c5b017..d5b5b43fd0c0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
/* Switch to task stack */
movl%esp, %eax
-   movl(3*4+TSS_entry2task_stack)(%esp), %esp
+   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6e043f295a60..6d4143cfbf03 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -43,16 +43,6 @@ void foo(void)
OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
 
-   /*
-* Offset from the entry stack to task stack stored in TSS. Kernel entry
-* happens on the per-cpu entry-stack, and the asm code switches to the
-* task-stack pointer stored in x86_tss.sp1, which is a copy of
-* task->thread.sp0 where entry code can find it.
-*/
-   DEFINE(TSS_entry2task_stack,
-  offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
-  offsetofend(struct cpu_entry_area, entry_stack_page.stack));
-
 #ifdef CONFIG_STACKPROTECTOR
BLANK();
OFFSET(stack_canary_offset, stack_canary, canary);
-- 
2.19.1.6.gb485710b



[PATCH V4 3/6] x86/entry/32: Switch to the task stack without emptying the entry stack

2021-02-10 Thread Lai Jiangshan
From: Lai Jiangshan 

Like the way x86_64 uses the entry stack when switching to the task stack,
entry_SYSENTER_32 can also save the entry stack pointer in a register and
then switch to the task stack.  That way it doesn't need to empty the
entry stack by popping its contents into registers, and it has more room
on the entry stack for saving data or scratch registers.

This is a preparation for the next patches, which need to save the user
%fs on the entry stack before restoring the kernel %fs and loading the
task stack for the stack switch.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3b4d1a63d1f0..3e693db0963d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -905,19 +905,18 @@ SYM_FUNC_START(entry_SYSENTER_32)
pushl   %eax
BUG_IF_WRONG_CR3 no_user_check=1
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
-   popl%eax
-   popfl
 
-   /* Stack empty again, switch to task stack */
-   movlTSS_entry2task_stack(%esp), %esp
+   /* Switch to task stack */
+   movl%esp, %eax
+   movl(2*4+TSS_entry2task_stack)(%esp), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
pushl   $0  /* pt_regs->sp (placeholder) */
-   pushfl  /* pt_regs->flags (except IF = 0) */
+   pushl   %ss:4(%eax) /* pt_regs->flags (except IF = 0) */
pushl   $__USER_CS  /* pt_regs->cs */
pushl   $0  /* pt_regs->ip = 0 (placeholder) */
-   pushl   %eax/* pt_regs->orig_ax */
+   pushl   %ss:(%eax)  /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS/* save rest, stack already switched */
 
/*
-- 
2.19.1.6.gb485710b



[PATCH V4 2/6] x86/entry/32: Use percpu instead of offset-calculation to get thread.sp0 in SWITCH_TO_KERNEL_STACK

2021-02-10 Thread Lai Jiangshan
From: Lai Jiangshan 

TSS_entry2task_stack is used to refer to tss.sp1 which is a copy of
thread.sp0.

When TSS_entry2task_stack is used in SWITCH_TO_KERNEL_STACK, the CR3 is
already kernel CR3 and the kernel segments are loaded.

So it directly uses percpu to get tss.sp1 (thread.sp0) instead of the
complicated offset calculation via TSS_entry2task_stack.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..3b4d1a63d1f0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -465,16 +465,11 @@
cmpl$SIZEOF_entry_stack, %ecx
jae .Lend_\@
 
-   /* Load stack pointer into %esi and %edi */
+   /* Load stack pointer into %esi */
movl%esp, %esi
-   movl%esi, %edi
-
-   /* Move %edi to the top of the entry stack */
-   andl$(MASK_entry_stack), %edi
-   addl$(SIZEOF_entry_stack), %edi
 
/* Load top of task-stack into %edi */
-   movlTSS_entry2task_stack(%edi), %edi
+   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
 
/* Special case - entry from kernel mode via entry stack */
 #ifdef CONFIG_VM86
-- 
2.19.1.6.gb485710b



[PATCH V4 1/6] x86/entry/64: Move cpu_current_top_of_stack out of TSS

2021-02-10 Thread Lai Jiangshan
From: Lai Jiangshan 

In x86_64, cpu_current_top_of_stack is an alias of cpu_tss_rw.x86_tss.sp1.

When the CPU has the Meltdown vulnerability (X86_BUG_CPU_MELTDOWN), it
becomes a coveted fruit even if kernel page-table isolation is enabled,
since the CPU TSS must also be mapped in the user CR3.  An attacker can
fetch the kernel stack top from it through that vulnerability and base
further steps of the attack on the kernel stack.

Besides the possible leakage of the kernel stack address, there is no need
for it to be in the TSS at all.  Although it is heavily used in the entry
code, it is only used when CR3 is already the kernel CR3 and gs_base is
already the kernel gs_base, which means it can be a normal percpu variable
instead of an alias to a field in the TSS.

The major reason it reuses a field in the TSS is performance, because the
TSS is normally hot in the cache and TLB since entry_SYSCALL_64 uses sp2
as scratch space to stash the user RSP value.

This patch makes it a percpu variable placed near other hot percpu
variables, such as current_task and __preempt_count, so that they end up
in the same cache line.

Signed-off-by: Lai Jiangshan 
---
tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result.  Actually, the result
changes from 93ns before the patch to 92ns after the patch with !KPTI.
The test is very stable; although it doesn't offer a higher degree of
precision, it is enough to show that the change doesn't cause a regression.

 arch/x86/include/asm/processor.h   | 10 --
 arch/x86/include/asm/switch_to.h   |  6 --
 arch/x86/include/asm/thread_info.h |  6 --
 arch/x86/kernel/cpu/common.c   |  3 +++
 arch/x86/kernel/process.c  |  7 +--
 arch/x86/mm/pti.c  |  7 +++
 6 files changed, 7 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index a411466a6e74..e197de05d0aa 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -316,11 +316,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
-   /*
-* We store cpu_current_top_of_stack in sp1 so it's always accessible.
-* Linux does not use ring 1, so sp1 is not otherwise needed.
-*/
u64 sp1;
 
/*
@@ -430,12 +425,7 @@ struct irq_stack {
 
 DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..f0ba06bcba0b 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,6 @@ static inline void update_task_stack(struct task_struct *task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-   /*
-* x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-* doesn't work on x86-32 because sp1 and
-* cpu_current_top_of_stack have different values (because of
-* the non-zero stack-padding on 32bit).
-*/
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h
index 33b637442b9e..f72404991d01 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -199,12 +199,6 @@ static inline int arch_within_stack_frames(const void * const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
 #endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 9215b91bc044..9c531ec73f5c 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1748,6 +1748,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL_GPL(cpu_current_top_of_stack);
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..296de77da4b2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, cpu_tss_rw) = {
 */
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-   /*
-* .sp1 is cpu_current_top_of_st

[PATCH V4 0/6] x86: Don't abuse tss.sp1

2021-02-10 Thread Lai Jiangshan
From: Lai Jiangshan 

In x86_64, tss.sp1 is reused as cpu_current_top_of_stack.  We'd better
use percpu directly, since CR3 and gs_base are already correct when it
is used.

In x86_32, tss.sp1 is reused as thread.sp0 in three places in the entry
code.  We have the correct CR3 and %fs at two of those places; the last
one is sysenter.  This patchset makes %fs available earlier so that we
can also use percpu in sysenter, and adds a percpu cpu_current_thread_sp0
for thread.sp0 instead of tss.sp1.

[V3]: 
https://lore.kernel.org/lkml/20210127163231.12709-1-jiangshan...@gmail.com/
[V2]: 
https://lore.kernel.org/lkml/20210125173444.22696-1-jiangshan...@gmail.com/
[V1]: https://lore.kernel.org/lkml/20210123084900.3118-1-jiangshan...@gmail.com/

Changed from V3:
Update subjects as Borislav's imperative request. ^_^
Update changelog as Borislav suggested.
Change EXPORT_PER_CPU_SYMBOL to EXPORT_PER_CPU_SYMBOL_GPL.

Changed from V2:
Add missing "%ss:" reported by Brian Gerst.

Changed from V1:
Requested from Andy to also fix sp1 for x86_32.
Update comments in the x86_64 patch as Andy suggested.

Lai Jiangshan (6):
  x86/entry/64: Move cpu_current_top_of_stack out of TSS
  x86/entry/32: Use percpu instead of offset-calculation to get
thread.sp0 in SWITCH_TO_KERNEL_STACK
  x86/entry/32: Switch to the task stack without emptying the entry
stack
  x86/entry/32: Restore %fs before switching stack
  x86/entry/32: Use percpu to get thread.sp0 in SYSENTER
  x86/entry/32: Introduce cpu_current_thread_sp0 to replace
cpu_tss_rw.x86_tss.sp1

 arch/x86/entry/entry_32.S  | 38 +-
 arch/x86/include/asm/processor.h   | 12 ++
 arch/x86/include/asm/switch_to.h   |  8 +--
 arch/x86/include/asm/thread_info.h |  6 -
 arch/x86/kernel/asm-offsets.c  |  1 -
 arch/x86/kernel/asm-offsets_32.c   | 10 
 arch/x86/kernel/cpu/common.c   | 12 +-
 arch/x86/kernel/process.c  |  7 --
 arch/x86/mm/pti.c  |  7 +++---
 9 files changed, 39 insertions(+), 62 deletions(-)

-- 
2.19.1.6.gb485710b



[tip: x86/urgent] x86/debug: Prevent data breakpoints on cpu_dr7

2021-02-05 Thread tip-bot2 for Lai Jiangshan
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: 3943abf2dbfae9ea4d2da05c1db569a0603f76da
Gitweb:
https://git.kernel.org/tip/3943abf2dbfae9ea4d2da05c1db569a0603f76da
Author:Lai Jiangshan 
AuthorDate:Thu, 04 Feb 2021 23:27:07 +08:00
Committer: Thomas Gleixner 
CommitterDate: Fri, 05 Feb 2021 20:13:12 +01:00

x86/debug: Prevent data breakpoints on cpu_dr7

local_db_save() is called at the start of exc_debug_kernel(), reads DR7 and
disables breakpoints to prevent recursion.

When running in a guest (X86_FEATURE_HYPERVISOR), local_db_save() reads the
per-cpu variable cpu_dr7 to check whether a breakpoint is active or not
before it accesses DR7.

A data breakpoint on cpu_dr7 therefore results in infinite #DB recursion.

Disallow data breakpoints on cpu_dr7 to prevent that.

Fixes: 84b6a3491567a ("x86/entry: Optimize local_db_save() for virt")
Signed-off-by: Lai Jiangshan 
Signed-off-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Link: https://lore.kernel.org/r/20210204152708.21308-2-jiangshan...@gmail.com

---
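Context (not part of the patch): the percpu read in question happens in
local_db_save(), which looks roughly like this (paraphrased):

    static __always_inline unsigned long local_db_save(void)
    {
            unsigned long dr7;

            /* hw_breakpoint_active() reads the percpu cpu_dr7 here, i.e.
             * before DR7 is cleared, so a data breakpoint on cpu_dr7
             * re-enters #DB at this point. */
            if (static_cpu_has(X86_FEATURE_HYPERVISOR) && !hw_breakpoint_active())
                    return 0;

            get_debugreg(dr7, 7);
            dr7 &= ~0x400;          /* architecturally set bit */
            if (dr7)
                    set_debugreg(0, 7);

            return dr7;
    }
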
 arch/x86/kernel/hw_breakpoint.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 012ed82..668a4a6 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -307,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
(unsigned long)&per_cpu(cpu_tlbstate, cpu),
sizeof(struct tlb_state)))
return true;
+
+   /*
+* When in guest (X86_FEATURE_HYPERVISOR), local_db_save()
+* will read per-cpu cpu_dr7 before clear dr7 register.
+*/
+   if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
+   sizeof(cpu_dr7)))
+   return true;
}
 
return false;


[tip: x86/urgent] x86/debug: Prevent data breakpoints on __per_cpu_offset

2021-02-05 Thread tip-bot2 for Lai Jiangshan
The following commit has been merged into the x86/urgent branch of tip:

Commit-ID: c4bed4b96918ff1d062ee81fdae4d207da4fa9b0
Gitweb:
https://git.kernel.org/tip/c4bed4b96918ff1d062ee81fdae4d207da4fa9b0
Author:Lai Jiangshan 
AuthorDate:Thu, 04 Feb 2021 23:27:06 +08:00
Committer: Thomas Gleixner 
CommitterDate: Fri, 05 Feb 2021 20:13:11 +01:00

x86/debug: Prevent data breakpoints on __per_cpu_offset

When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU GSBASE value
via __per_cpu_offset or pcpu_unit_offsets.

When a data breakpoint is set on __per_cpu_offset[cpu] (read-write
operation), the specific CPU will be stuck in an infinite #DB loop.

RCU will try to send an NMI to the specific CPU, but it is not working
either since NMI also relies on paranoid_entry(). Which means it's
undebuggable.

Fixes: eaad981291ee3 ("x86/entry/64: Introduce the FIND_PERCPU_BASE macro")
Signed-off-by: Lai Jiangshan 
Signed-off-by: Thomas Gleixner 
Cc: sta...@vger.kernel.org
Link: https://lore.kernel.org/r/20210204152708.21308-1-jiangshan...@gmail.com

---
 arch/x86/kernel/hw_breakpoint.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 6694c0f..012ed82 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, unsigned long end)
CPU_ENTRY_AREA_TOTAL_SIZE))
return true;
 
+   /*
+* When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
+* GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
+*/
+#ifdef CONFIG_SMP
+   if (within_area(addr, end, (unsigned long)__per_cpu_offset,
+   sizeof(unsigned long) * nr_cpu_ids))
+   return true;
+#else
+   if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
+   sizeof(pcpu_unit_offsets)))
+   return true;
+#endif
+
for_each_possible_cpu(cpu) {
/* The original rw GDT is being used after load_direct_gdt() */
if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),


Re: [patch 11/12] softirq: Allow inlining do_softirq_own_stack()

2021-02-05 Thread Lai Jiangshan
On Fri, Feb 5, 2021 at 10:04 AM Thomas Gleixner  wrote:
>
> The function to switch to the irq stack on x86 is now minimal and there is
> only a single caller. Allow the stack switch to be inlined.
>
> Signed-off-by: Thomas Gleixner 
> ---
>  include/linux/interrupt.h |2 ++
>  kernel/softirq.c  |4 
>  2 files changed, 6 insertions(+)
>
> --- a/include/linux/interrupt.h
> +++ b/include/linux/interrupt.h
> @@ -570,7 +570,9 @@ asmlinkage void do_softirq(void);
>  asmlinkage void __do_softirq(void);
>
>  #ifdef __ARCH_HAS_DO_SOFTIRQ
> +# ifndef __ARCH_HAS_DO_SOFTIRQ_INLINE
>  void do_softirq_own_stack(void);
> +# endif
>  #else
>  static inline void do_softirq_own_stack(void)
>  {

Hello

This patch and the next one add three "#if[n]def"s for
__ARCH_HAS_DO_SOFTIRQ_INLINE, and this one is nested inside the
#ifdef __ARCH_HAS_DO_SOFTIRQ block.

I wonder if we can use __ARCH_HAS_DO_SOFTIRQ only.

For example, we could move "void do_softirq_own_stack(void);" next to the
code where __ARCH_HAS_DO_SOFTIRQ is defined in each architecture.
(And for x86, do_softirq_own_stack() would be a macro instead of a function
declaration, as the next patch shows; see the sketch below.)

Thanks
Lai

> --- a/kernel/softirq.c
> +++ b/kernel/softirq.c
> @@ -26,6 +26,10 @@
>  #include 
>  #include 
>
> +#ifdef __ARCH_HAS_DO_SOFTIRQ_INLINE
> +# include 
> +#endif
> +
>  #define CREATE_TRACE_POINTS
>  #include 
>
>


[PATCH 2/2] x86/hw_breakpoint: Prevent data breakpoints on cpu_dr7

2021-02-04 Thread Lai Jiangshan
From: Lai Jiangshan 

When running in a guest (X86_FEATURE_HYPERVISOR), local_db_save() reads
the per-cpu variable cpu_dr7 before clearing the DR7 register.

local_db_save() is called at the start of exc_debug_kernel().
To avoid recursive #DB, data breakpoints on cpu_dr7 have to be
disallowed.

Fixes: 84b6a3491567a("x86/entry: Optimize local_db_save() for virt")
Signed-off-by: Lai Jiangshan 
---
 arch/x86/kernel/hw_breakpoint.c | 8 
 1 file changed, 8 insertions(+)

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index bc7493a0736f..de34dd49d317 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -307,6 +307,14 @@ static inline bool within_cpu_entry(unsigned long addr, 
unsigned long end)
(unsigned long)&per_cpu(cpu_tlbstate, cpu),
sizeof(struct tlb_state)))
return true;
+
+   /*
+* When in guest (X86_FEATURE_HYPERVISOR), local_db_save()
+* will read per-cpu cpu_dr7 before clear dr7 register.
+*/
+   if (within_area(addr, end, (unsigned long)&per_cpu(cpu_dr7, cpu),
+   sizeof(cpu_dr7)))
+   return true;
}
 
return false;
-- 
2.19.1.6.gb485710b



[PATCH 1/2] x86/hw_breakpoint: Prevent data breakpoints on __per_cpu_offset

2021-02-04 Thread Lai Jiangshan
From: Lai Jiangshan 

When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
GSBASE value via __per_cpu_offset or pcpu_unit_offsets.

When a data breakpoint is set on __per_cpu_offset[cpu] (a read-write
operation), that CPU will be stuck in an infinite #DB loop.
RCU will try to send an NMI to that CPU, but it will not work either,
since NMI entry also relies on paranoid_entry().

Fixes: eaad981291ee3("x86/entry/64: Introduce the FIND_PERCPU_BASE macro")
Signed-off-by: Lai Jiangshan 
---
 arch/x86/kernel/hw_breakpoint.c | 14 ++
 1 file changed, 14 insertions(+)

diff --git a/arch/x86/kernel/hw_breakpoint.c b/arch/x86/kernel/hw_breakpoint.c
index 03aa33b58165..bc7493a0736f 100644
--- a/arch/x86/kernel/hw_breakpoint.c
+++ b/arch/x86/kernel/hw_breakpoint.c
@@ -269,6 +269,20 @@ static inline bool within_cpu_entry(unsigned long addr, 
unsigned long end)
CPU_ENTRY_AREA_TOTAL_SIZE))
return true;
 
+   /*
+* When FSGSBASE is enabled, paranoid_entry() fetches the per-CPU
+* GSBASE value via __per_cpu_offset or pcpu_unit_offsets.
+*/
+#ifdef CONFIG_SMP
+   if (within_area(addr, end, (unsigned long)__per_cpu_offset,
+   sizeof(unsigned long) * nr_cpu_ids))
+   return true;
+#else
+   if (within_area(addr, end, (unsigned long)&pcpu_unit_offsets,
+   sizeof(pcpu_unit_offsets)))
+   return true;
+#endif
+
for_each_possible_cpu(cpu) {
/* The original rw GDT is being used after load_direct_gdt() */
if (within_area(addr, end, (unsigned long)get_cpu_gdt_rw(cpu),
-- 
2.19.1.6.gb485710b



Re: [PATCH V3 0/6] x86: don't abuse tss.sp1

2021-01-29 Thread Lai Jiangshan
On Sat, Jan 30, 2021 at 12:43 AM Borislav Petkov  wrote:
>
> On Fri, Jan 29, 2021 at 11:35:46PM +0800, Lai Jiangshan wrote:
> > Any feedback?
>
> Yes: be patient please.
>
> Thx.
>
> --
> Regards/Gruss,
> Boris.
>
> https://people.kernel.org/tglx/notes-about-netiquette

Thank you for your reply and this gentle hint.

I did "reply to all", but I have no clue why the mail failed to be
delivered to LKML; maybe it also failed to reach the other people on
the cc list.

Hopefully the same won't happen to this reply.

Thanks
Lai


[PATCH V3 2/6] x86_32: use percpu instead of offset-calculation to get thread.sp0 when SWITCH_TO_KERNEL_STACK

2021-01-27 Thread Lai Jiangshan
From: Lai Jiangshan 

TSS_entry2task_stack is used to refer to tss.sp1, which stores the value
of thread.sp0.

At the point where TSS_entry2task_stack is used in SWITCH_TO_KERNEL_STACK,
the CR3 is already the kernel CR3 and the kernel segments are loaded.

So the per-cpu variable can be used directly to get tss.sp1 (thread.sp0)
instead of the complex offset calculation.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..3b4d1a63d1f0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -465,16 +465,11 @@
cmpl$SIZEOF_entry_stack, %ecx
jae .Lend_\@
 
-   /* Load stack pointer into %esi and %edi */
+   /* Load stack pointer into %esi */
movl%esp, %esi
-   movl%esi, %edi
-
-   /* Move %edi to the top of the entry stack */
-   andl$(MASK_entry_stack), %edi
-   addl$(SIZEOF_entry_stack), %edi
 
/* Load top of task-stack into %edi */
-   movlTSS_entry2task_stack(%edi), %edi
+   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
 
/* Special case - entry from kernel mode via entry stack */
 #ifdef CONFIG_VM86
-- 
2.19.1.6.gb485710b



[PATCH V3 3/6] x86_32/sysenter: switch to the task stack without emptying the entry stack

2021-01-27 Thread Lai Jiangshan
From: Lai Jiangshan 

Like the way x86_64 uses the "old" stack, save the entry stack pointer
in a register and switch to the task stack, so that there is space on
the "old" stack to save more things or scratch registers.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3b4d1a63d1f0..3e693db0963d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -905,19 +905,18 @@ SYM_FUNC_START(entry_SYSENTER_32)
pushl   %eax
BUG_IF_WRONG_CR3 no_user_check=1
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
-   popl%eax
-   popfl
 
-   /* Stack empty again, switch to task stack */
-   movlTSS_entry2task_stack(%esp), %esp
+   /* Switch to task stack */
+   movl%esp, %eax
+   movl(2*4+TSS_entry2task_stack)(%esp), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
pushl   $0  /* pt_regs->sp (placeholder) */
-   pushfl  /* pt_regs->flags (except IF = 0) */
+   pushl   %ss:4(%eax) /* pt_regs->flags (except IF = 0) */
pushl   $__USER_CS  /* pt_regs->cs */
pushl   $0  /* pt_regs->ip = 0 (placeholder) */
-   pushl   %eax/* pt_regs->orig_ax */
+   pushl   %ss:(%eax)  /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS/* save rest, stack already switched */
 
/*
-- 
2.19.1.6.gb485710b



[PATCH V3 5/6] x86_32/sysenter: use percpu to get thread.sp0 when sysenter

2021-01-27 Thread Lai Jiangshan
From: Lai Jiangshan 

TSS_entry2task_stack is used to refer to tss.sp1, which stores the value
of thread.sp0.

At the point where TSS_entry2task_stack is used in sysenter, the CR3 is
already the kernel CR3 and the kernel segments are loaded.

So percpu can be used directly for it instead of the offset calculation.

Also remove the now-unused TSS_entry2task_stack.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S|  2 +-
 arch/x86/kernel/asm-offsets_32.c | 10 --
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 01f098c5b017..d5b5b43fd0c0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
/* Switch to task stack */
movl%esp, %eax
-   movl(3*4+TSS_entry2task_stack)(%esp), %esp
+   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6e043f295a60..6d4143cfbf03 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -43,16 +43,6 @@ void foo(void)
OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
 
-   /*
-* Offset from the entry stack to task stack stored in TSS. Kernel entry
-* happens on the per-cpu entry-stack, and the asm code switches to the
-* task-stack pointer stored in x86_tss.sp1, which is a copy of
-* task->thread.sp0 where entry code can find it.
-*/
-   DEFINE(TSS_entry2task_stack,
-  offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
-  offsetofend(struct cpu_entry_area, entry_stack_page.stack));
-
 #ifdef CONFIG_STACKPROTECTOR
BLANK();
OFFSET(stack_canary_offset, stack_canary, canary);
-- 
2.19.1.6.gb485710b



[PATCH V3 4/6] x86_32/sysenter: restore %fs before switching stack

2021-01-27 Thread Lai Jiangshan
From: Lai Jiangshan 

Prepare for using percpu and removing TSS_entry2task_stack

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3e693db0963d..01f098c5b017 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -279,11 +279,13 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 skip_fs=0 
unwind_espfix=0
cld
 .if \skip_gs == 0
PUSH_GS
 .endif
+
+.if \skip_fs == 0
pushl   %fs
 
pushl   %eax
@@ -293,6 +295,7 @@
UNWIND_ESPFIX_STACK
 .endif
popl%eax
+.endif
 
FIXUP_FRAME
pushl   %es
@@ -906,18 +909,27 @@ SYM_FUNC_START(entry_SYSENTER_32)
BUG_IF_WRONG_CR3 no_user_check=1
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
 
+   /* Restore kernel %fs, so that we can use PERCPU */
+   pushl   %fs
+   movl$(__KERNEL_PERCPU), %eax
+   movl%eax, %fs
+
/* Switch to task stack */
movl%esp, %eax
-   movl(2*4+TSS_entry2task_stack)(%esp), %esp
+   movl(3*4+TSS_entry2task_stack)(%esp), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
pushl   $0  /* pt_regs->sp (placeholder) */
-   pushl   %ss:4(%eax) /* pt_regs->flags (except IF = 0) */
+   pushl   %ss:8(%eax) /* pt_regs->flags (except IF = 0) */
pushl   $__USER_CS  /* pt_regs->cs */
pushl   $0  /* pt_regs->ip = 0 (placeholder) */
-   pushl   %ss:(%eax)  /* pt_regs->orig_ax */
-   SAVE_ALL pt_regs_ax=$-ENOSYS/* save rest, stack already switched */
+   pushl   %ss:4(%eax) /* pt_regs->orig_ax */
+   PUSH_GS /* pt_regs->gs */
+   pushl   %ss:(%eax)  /* pt_regs->fs */
+   /* save rest, stack and %fs already switched */
+   SAVE_ALL pt_regs_ax=$-ENOSYS skip_gs=1 skip_fs=1
+   SET_KERNEL_GS %edx
 
/*
 * SYSENTER doesn't filter flags, so we need to clear NT, AC
-- 
2.19.1.6.gb485710b



[PATCH V3 6/6] x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1

2021-01-27 Thread Lai Jiangshan
From: Lai Jiangshan 

sp1 is not used by the hardware (Linux does not use ring 1) and is
reused to hold thread.sp0.  Just use a new percpu variable for that
instead.

And remove the now-unneeded TSS_sp1.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S| 6 +++---
 arch/x86/include/asm/processor.h | 2 ++
 arch/x86/include/asm/switch_to.h | 2 +-
 arch/x86/kernel/asm-offsets.c| 1 -
 arch/x86/kernel/cpu/common.c | 9 -
 arch/x86/kernel/process.c| 2 --
 6 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index d5b5b43fd0c0..55dcf5c35141 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -472,7 +472,7 @@
movl%esp, %esi
 
/* Load top of task-stack into %edi */
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %edi
 
/* Special case - entry from kernel mode via entry stack */
 #ifdef CONFIG_VM86
@@ -658,7 +658,7 @@
movlPER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
 
/* Bytes on the task-stack to ecx */
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %ecx
subl%esi, %ecx
 
/* Allocate stack-frame on entry-stack */
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
/* Switch to task stack */
movl%esp, %eax
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 886d32da1318..4265884c33e7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -774,6 +774,8 @@ static inline void spin_lock_prefetch(const void *x)
 
 #define KSTK_ESP(task) (task_pt_regs(task)->sp)
 
+DECLARE_PER_CPU(unsigned long, cpu_current_thread_sp0);
+
 #else
 #define INIT_THREAD { }
 
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index b5f0d2ff47e4..e27eb7974797 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -69,7 +69,7 @@ static inline void update_task_stack(struct task_struct *task)
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task->thread.sp0);
else
-   this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+   this_cpu_write(cpu_current_thread_sp0, task->thread.sp0);
 #else
/* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 60b9f42ce3c1..3b63b6062792 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -98,6 +98,5 @@ static void __used common(void)
 
/* Offset for fields in tss_struct */
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
-   OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f3d7fd7e9684..b2c37d369137 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1789,12 +1789,19 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
 /*
  * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
  * the top of the kernel stack.  Use an extra percpu variable to track the
- * top of the kernel stack directly.
+ * top of the kernel stack directly and an percpu variable to track the
+ * thread.sp0 for using in entry code.  cpu_current_top_of_stack and
+ * cpu_current_thread_sp0 are different value because of the non-zero
+ * stack-padding on 32bit.  See more comment at TOP_OF_KERNEL_STACK_PADDING
+ * and vm86.
  */
 DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_thread_sp0) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_thread_sp0);
+
 #ifdef CONFIG_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 296de77da4b2..e6d4b5399a81 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -64,8 +64,6 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, 
cpu_tss_rw) = {
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
 #ifdef CONFIG_X86_32
-   .sp1 = TOP_OF_INIT_STACK,
-
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
 #endif
-- 
2.19.1.6.gb485710b



[PATCH V3 1/6] x86_64: move cpu_current_top_of_stack out of TSS

2021-01-27 Thread Lai Jiangshan
From: Lai Jiangshan 

When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
cpu_current_top_of_stack lives in the TSS, which is also mapped in the
user CR3, and it becomes a coveted target.  An attacker can fetch the
kernel stack top from it and base the next steps of an attack on the
kernel stack.

The address might not be very useful to an attacker, but it does not
need to be in the TSS either.  It is only accessed when CR3 is the
kernel CR3 and gs_base is the kernel gs_base, which means it can live
in any percpu variable.

The major reason it is in the TSS might be performance, because it is
hot in the cache and TLB since sp2 is accessed as scratch space in the
syscall path anyway.

So it can be moved to a percpu variable near other hot percpu variables,
such as current_task and __preempt_count, which share the same cache
line.
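
A minimal sketch of what the access becomes once the field is an ordinary
percpu variable (simplified; assuming the existing current_top_of_stack()
helper keeps its shape):

	/* Sketch only: C code reads the stack top with a plain per-cpu
	 * access instead of going through the TSS field. */
	static __always_inline unsigned long current_top_of_stack(void)
	{
		return this_cpu_read_stable(cpu_current_top_of_stack);
	}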

tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result.  Actually, the result
changes from 93ns before the patch to 92ns after it when !KPTI; the test
is very stable and, while it doesn't offer a high degree of precision,
it is enough to show that the change causes no regression.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/include/asm/processor.h   | 10 --
 arch/x86/include/asm/switch_to.h   |  7 +--
 arch/x86/include/asm/thread_info.h |  6 --
 arch/x86/kernel/cpu/common.c   |  3 +++
 arch/x86/kernel/process.c  |  7 +--
 arch/x86/mm/pti.c  |  7 +++
 6 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..886d32da1318 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
-   /*
-* We store cpu_current_top_of_stack in sp1 so it's always accessible.
-* Linux does not use ring 1, so sp1 is not otherwise needed.
-*/
u64 sp1;
 
/*
@@ -428,12 +423,7 @@ struct irq_stack {
 
 DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..b5f0d2ff47e4 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct 
*task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-   /*
-* x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-* doesn't work on x86-32 because sp1 and
-* cpu_current_top_of_stack have different values (because of
-* the non-zero stack-padding on 32bit).
-*/
+   /* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..3dc93d8df425 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * 
const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
 #endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f3d7fd7e9684 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1745,6 +1745,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..296de77da4b2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, 
cpu_tss_rw) = {
 */
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-   /*
-* .sp1 is cpu_current_top_of_stack.  The init task never
-* runs user code, but cpu_current_top_of_stack should still
-* be well defined before the first context switch.
-*/
+#ifdef CONFIG_X86_32
.sp1 = TOP_O

[PATCH V3 0/6] x86: don't abuse tss.sp1

2021-01-27 Thread Lai Jiangshan
From: Lai Jiangshan 

On x86_64, tss.sp1 is reused as cpu_current_top_of_stack.  But percpu can
be used directly, since CR3 and gs_base are correct when it is accessed.

On x86_32, tss.sp1 is reused as thread.sp0 in three places in the entry
code.  The CR3 and %fs are correct at two of those places; the last one
is sysenter.  This patchset makes %fs available earlier so that percpu
can also be used in sysenter, and it adds a percpu variable
cpu_current_thread_sp0 to hold thread.sp0 instead of tss.sp1.  (A rough
sketch of the end result is shown below.)
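
Roughly, update_task_stack() ends up as follows (assembled from the hunks
in patches 1/6 and 6/6 below, not a verbatim copy):

	static inline void update_task_stack(struct task_struct *task)
	{
	#ifdef CONFIG_X86_32
		if (static_cpu_has(X86_FEATURE_XENPV))
			load_sp0(task->thread.sp0);
		else
			this_cpu_write(cpu_current_thread_sp0, task->thread.sp0);
	#else
		/* Xen PV enters the kernel on the thread stack. */
		if (static_cpu_has(X86_FEATURE_XENPV))
			load_sp0(task_top_of_stack(task));
	#endif
	}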

[V2]: 
https://lore.kernel.org/lkml/20210125173444.22696-1-jiangshan...@gmail.com/
[V1]: https://lore.kernel.org/lkml/20210123084900.3118-1-jiangshan...@gmail.com/

Changed from V2
Add missing "%ss:" reported by Brian Gerst.

Changed from V1
Requested by Andy to also fix sp1 for x86_32.
Update comments in the x86_64 patch as Andy suggested.

Lai Jiangshan (6):
  x86_64: move cpu_current_top_of_stack out of TSS
  x86_32: use percpu instead of offset-calculation to get thread.sp0
when SWITCH_TO_KERNEL_STACK
  x86_32/sysenter: switch to the task stack without emptying the entry
stack
  x86_32/sysenter: restore %fs before switching stack
  x86_32/sysenter: use percpu to get thread.sp0 when sysenter
  x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1

 arch/x86/entry/entry_32.S  | 38 +-
 arch/x86/include/asm/processor.h   | 12 ++
 arch/x86/include/asm/switch_to.h   |  9 ++-
 arch/x86/include/asm/thread_info.h |  6 -
 arch/x86/kernel/asm-offsets.c  |  1 -
 arch/x86/kernel/asm-offsets_32.c   | 10 
 arch/x86/kernel/cpu/common.c   | 12 +-
 arch/x86/kernel/process.c  |  7 --
 arch/x86/mm/pti.c  |  7 +++---
 9 files changed, 40 insertions(+), 62 deletions(-)

-- 
2.19.1.6.gb485710b



[PATCH V2 3/6] x86_32/sysenter: switch to the task stack without emptying the entry stack

2021-01-25 Thread Lai Jiangshan
From: Lai Jiangshan 

Like the way x86_64 uses the "old" stack, save the entry stack pointer
in a register and switch to the task stack, so that there is space on
the "old" stack to save more things or scratch registers.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 11 +--
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3b4d1a63d1f0..4513702ba45d 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -905,19 +905,18 @@ SYM_FUNC_START(entry_SYSENTER_32)
pushl   %eax
BUG_IF_WRONG_CR3 no_user_check=1
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
-   popl%eax
-   popfl
 
-   /* Stack empty again, switch to task stack */
-   movlTSS_entry2task_stack(%esp), %esp
+   /* Switch to task stack */
+   movl%esp, %eax
+   movl(2*4+TSS_entry2task_stack)(%esp), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
pushl   $0  /* pt_regs->sp (placeholder) */
-   pushfl  /* pt_regs->flags (except IF = 0) */
+   pushl   4(%eax) /* pt_regs->flags (except IF = 0) */
pushl   $__USER_CS  /* pt_regs->cs */
pushl   $0  /* pt_regs->ip = 0 (placeholder) */
-   pushl   %eax/* pt_regs->orig_ax */
+   pushl   (%eax)  /* pt_regs->orig_ax */
SAVE_ALL pt_regs_ax=$-ENOSYS/* save rest, stack already switched */
 
/*
-- 
2.19.1.6.gb485710b



[PATCH V2 5/6] x86_32/sysenter: use percpu to get thread.sp0 when sysenter

2021-01-25 Thread Lai Jiangshan
From: Lai Jiangshan 

TSS_entry2task_stack is used to refer to tss.sp1, which stores the value
of thread.sp0.

At the point where TSS_entry2task_stack is used in sysenter, the CR3 is
already the kernel CR3 and the kernel segments are loaded.

So percpu can be used directly for it instead of the offset calculation.

Also remove the now-unused TSS_entry2task_stack.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S|  2 +-
 arch/x86/kernel/asm-offsets_32.c | 10 --
 2 files changed, 1 insertion(+), 11 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index a8d2640394f9..3cb42efb3c04 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
/* Switch to task stack */
movl%esp, %eax
-   movl(3*4+TSS_entry2task_stack)(%esp), %esp
+   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
diff --git a/arch/x86/kernel/asm-offsets_32.c b/arch/x86/kernel/asm-offsets_32.c
index 6e043f295a60..6d4143cfbf03 100644
--- a/arch/x86/kernel/asm-offsets_32.c
+++ b/arch/x86/kernel/asm-offsets_32.c
@@ -43,16 +43,6 @@ void foo(void)
OFFSET(saved_context_gdt_desc, saved_context, gdt_desc);
BLANK();
 
-   /*
-* Offset from the entry stack to task stack stored in TSS. Kernel entry
-* happens on the per-cpu entry-stack, and the asm code switches to the
-* task-stack pointer stored in x86_tss.sp1, which is a copy of
-* task->thread.sp0 where entry code can find it.
-*/
-   DEFINE(TSS_entry2task_stack,
-  offsetof(struct cpu_entry_area, tss.x86_tss.sp1) -
-  offsetofend(struct cpu_entry_area, entry_stack_page.stack));
-
 #ifdef CONFIG_STACKPROTECTOR
BLANK();
OFFSET(stack_canary_offset, stack_canary, canary);
-- 
2.19.1.6.gb485710b



[PATCH V2 2/6] x86_32: use percpu instead of offset-calculation to get thread.sp0 when SWITCH_TO_KERNEL_STACK

2021-01-25 Thread Lai Jiangshan
From: Lai Jiangshan 

TSS_entry2task_stack is used to refer to tss.sp1, which stores the value
of thread.sp0.

At the point where TSS_entry2task_stack is used in SWITCH_TO_KERNEL_STACK,
the CR3 is already the kernel CR3 and the kernel segments are loaded.

So the per-cpu variable can be used directly to get tss.sp1 (thread.sp0)
instead of the complex offset calculation.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 9 ++---
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index df8c017e6161..3b4d1a63d1f0 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -465,16 +465,11 @@
cmpl$SIZEOF_entry_stack, %ecx
jae .Lend_\@
 
-   /* Load stack pointer into %esi and %edi */
+   /* Load stack pointer into %esi */
movl%esp, %esi
-   movl%esi, %edi
-
-   /* Move %edi to the top of the entry stack */
-   andl$(MASK_entry_stack), %edi
-   addl$(SIZEOF_entry_stack), %edi
 
/* Load top of task-stack into %edi */
-   movlTSS_entry2task_stack(%edi), %edi
+   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
 
/* Special case - entry from kernel mode via entry stack */
 #ifdef CONFIG_VM86
-- 
2.19.1.6.gb485710b



[PATCH V2 6/6] x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1

2021-01-25 Thread Lai Jiangshan
From: Lai Jiangshan 

sp1 is not used by the hardware (Linux does not use ring 1) and is
reused to hold thread.sp0.  Just use a new percpu variable for that
instead.

And remove the now-unneeded TSS_sp1.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S| 6 +++---
 arch/x86/include/asm/processor.h | 2 ++
 arch/x86/include/asm/switch_to.h | 2 +-
 arch/x86/kernel/asm-offsets.c| 1 -
 arch/x86/kernel/cpu/common.c | 9 -
 arch/x86/kernel/process.c| 2 --
 6 files changed, 14 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 3cb42efb3c04..22cd3d8fd23e 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -472,7 +472,7 @@
movl%esp, %esi
 
/* Load top of task-stack into %edi */
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %edi
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %edi
 
/* Special case - entry from kernel mode via entry stack */
 #ifdef CONFIG_VM86
@@ -658,7 +658,7 @@
movlPER_CPU_VAR(cpu_tss_rw + TSS_sp0), %edi
 
/* Bytes on the task-stack to ecx */
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %ecx
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %ecx
subl%esi, %ecx
 
/* Allocate stack-frame on entry-stack */
@@ -916,7 +916,7 @@ SYM_FUNC_START(entry_SYSENTER_32)
 
/* Switch to task stack */
movl%esp, %eax
-   movlPER_CPU_VAR(cpu_tss_rw + TSS_sp1), %esp
+   movlPER_CPU_VAR(cpu_current_thread_sp0), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index 886d32da1318..4265884c33e7 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -774,6 +774,8 @@ static inline void spin_lock_prefetch(const void *x)
 
 #define KSTK_ESP(task) (task_pt_regs(task)->sp)
 
+DECLARE_PER_CPU(unsigned long, cpu_current_thread_sp0);
+
 #else
 #define INIT_THREAD { }
 
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index b5f0d2ff47e4..e27eb7974797 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -69,7 +69,7 @@ static inline void update_task_stack(struct task_struct *task)
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task->thread.sp0);
else
-   this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
+   this_cpu_write(cpu_current_thread_sp0, task->thread.sp0);
 #else
/* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
diff --git a/arch/x86/kernel/asm-offsets.c b/arch/x86/kernel/asm-offsets.c
index 60b9f42ce3c1..3b63b6062792 100644
--- a/arch/x86/kernel/asm-offsets.c
+++ b/arch/x86/kernel/asm-offsets.c
@@ -98,6 +98,5 @@ static void __used common(void)
 
/* Offset for fields in tss_struct */
OFFSET(TSS_sp0, tss_struct, x86_tss.sp0);
-   OFFSET(TSS_sp1, tss_struct, x86_tss.sp1);
OFFSET(TSS_sp2, tss_struct, x86_tss.sp2);
 }
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index f3d7fd7e9684..b2c37d369137 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1789,12 +1789,19 @@ EXPORT_PER_CPU_SYMBOL(__preempt_count);
 /*
  * On x86_32, vm86 modifies tss.sp0, so sp0 isn't a reliable way to find
  * the top of the kernel stack.  Use an extra percpu variable to track the
- * top of the kernel stack directly.
+ * top of the kernel stack directly and an percpu variable to track the
+ * thread.sp0 for using in entry code.  cpu_current_top_of_stack and
+ * cpu_current_thread_sp0 are different value because of the non-zero
+ * stack-padding on 32bit.  See more comment at TOP_OF_KERNEL_STACK_PADDING
+ * and vm86.
  */
 DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) =
(unsigned long)&init_thread_union + THREAD_SIZE;
 EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_thread_sp0) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_thread_sp0);
+
 #ifdef CONFIG_STACKPROTECTOR
 DEFINE_PER_CPU_ALIGNED(struct stack_canary, stack_canary);
 #endif
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 296de77da4b2..e6d4b5399a81 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -64,8 +64,6 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, 
cpu_tss_rw) = {
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
 #ifdef CONFIG_X86_32
-   .sp1 = TOP_OF_INIT_STACK,
-
.ss0 = __KERNEL_DS,
.ss1 = __KERNEL_CS,
 #endif
-- 
2.19.1.6.gb485710b



[PATCH V2 4/6] x86_32/sysenter: restore %fs before switching stack

2021-01-25 Thread Lai Jiangshan
From: Lai Jiangshan 

Prepare for using percpu and removing TSS_entry2task_stack

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_32.S | 22 +-
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/arch/x86/entry/entry_32.S b/arch/x86/entry/entry_32.S
index 4513702ba45d..a8d2640394f9 100644
--- a/arch/x86/entry/entry_32.S
+++ b/arch/x86/entry/entry_32.S
@@ -279,11 +279,13 @@
 .Lfinished_frame_\@:
 .endm
 
-.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 unwind_espfix=0
+.macro SAVE_ALL pt_regs_ax=%eax switch_stacks=0 skip_gs=0 skip_fs=0 
unwind_espfix=0
cld
 .if \skip_gs == 0
PUSH_GS
 .endif
+
+.if \skip_fs == 0
pushl   %fs
 
pushl   %eax
@@ -293,6 +295,7 @@
UNWIND_ESPFIX_STACK
 .endif
popl%eax
+.endif
 
FIXUP_FRAME
pushl   %es
@@ -906,18 +909,27 @@ SYM_FUNC_START(entry_SYSENTER_32)
BUG_IF_WRONG_CR3 no_user_check=1
SWITCH_TO_KERNEL_CR3 scratch_reg=%eax
 
+   /* Restore kernel %fs, so that we can use PERCPU */
+   pushl   %fs
+   movl$(__KERNEL_PERCPU), %eax
+   movl%eax, %fs
+
/* Switch to task stack */
movl%esp, %eax
-   movl(2*4+TSS_entry2task_stack)(%esp), %esp
+   movl(3*4+TSS_entry2task_stack)(%esp), %esp
 
 .Lsysenter_past_esp:
pushl   $__USER_DS  /* pt_regs->ss */
pushl   $0  /* pt_regs->sp (placeholder) */
-   pushl   4(%eax) /* pt_regs->flags (except IF = 0) */
+   pushl   8(%eax) /* pt_regs->flags (except IF = 0) */
pushl   $__USER_CS  /* pt_regs->cs */
pushl   $0  /* pt_regs->ip = 0 (placeholder) */
-   pushl   (%eax)  /* pt_regs->orig_ax */
-   SAVE_ALL pt_regs_ax=$-ENOSYS/* save rest, stack already switched */
+   pushl   4(%eax) /* pt_regs->orig_ax */
+   PUSH_GS /* pt_regs->gs */
+   pushl   (%eax)  /* pt_regs->fs */
+   /* save rest, stack and %fs already switched */
+   SAVE_ALL pt_regs_ax=$-ENOSYS skip_gs=1 skip_fs=1
+   SET_KERNEL_GS %edx
 
/*
 * SYSENTER doesn't filter flags, so we need to clear NT, AC
-- 
2.19.1.6.gb485710b



[PATCH V2 1/6] x86_64: move cpu_current_top_of_stack out of TSS

2021-01-25 Thread Lai Jiangshan
From: Lai Jiangshan 

When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
cpu_current_top_of_stack lives in the TSS, which is also mapped in the
user CR3, and it becomes a coveted target.  An attacker can fetch the
kernel stack top from it and base the next steps of an attack on the
kernel stack.

The address might not be very useful to an attacker, but it does not
need to be in the TSS either.  It is only accessed when CR3 is the
kernel CR3 and gs_base is the kernel gs_base, which means it can live
in any percpu variable.

The major reason it is in the TSS might be performance, because it is
hot in the cache and TLB since sp2 is accessed as scratch space in the
syscall path anyway.

So it can be moved to a percpu variable near other hot percpu variables,
such as current_task and __preempt_count, which share the same cache
line.

tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result.  Actually, the result
changes from 93ns before the patch to 92ns after it when !KPTI; the test
is very stable and, while it doesn't offer a high degree of precision,
it is enough to show that the change causes no regression.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/include/asm/processor.h   | 10 --
 arch/x86/include/asm/switch_to.h   |  7 +--
 arch/x86/include/asm/thread_info.h |  6 --
 arch/x86/kernel/cpu/common.c   |  3 +++
 arch/x86/kernel/process.c  |  7 +--
 arch/x86/mm/pti.c  |  7 +++
 6 files changed, 8 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..886d32da1318 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
-   /*
-* We store cpu_current_top_of_stack in sp1 so it's always accessible.
-* Linux does not use ring 1, so sp1 is not otherwise needed.
-*/
u64 sp1;
 
/*
@@ -428,12 +423,7 @@ struct irq_stack {
 
 DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..b5f0d2ff47e4 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct 
*task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-   /*
-* x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-* doesn't work on x86-32 because sp1 and
-* cpu_current_top_of_stack have different values (because of
-* the non-zero stack-padding on 32bit).
-*/
+   /* Xen PV enters the kernel on the thread stack. */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..3dc93d8df425 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * 
const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
 #endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f3d7fd7e9684 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1745,6 +1745,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..296de77da4b2 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,9 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, 
cpu_tss_rw) = {
 */
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-   /*
-* .sp1 is cpu_current_top_of_stack.  The init task never
-* runs user code, but cpu_current_top_of_stack should still
-* be well defined before the first context switch.
-*/
+#ifdef CONFIG_X86_32
.sp1 = TOP_O

[PATCH V2 0/6] x86: don't abuse tss.sp1

2021-01-25 Thread Lai Jiangshan
From: Lai Jiangshan 

On x86_64, tss.sp1 is reused as cpu_current_top_of_stack.  But percpu can
be used directly, since CR3 and gs_base are correct when it is accessed.

On x86_32, tss.sp1 is reused as thread.sp0 in three places in the entry
code.  The CR3 and %fs are correct at two of those places; the last one
is sysenter.  This patchset makes %fs available earlier so that percpu
can also be used in sysenter, and it adds a percpu variable
cpu_current_thread_sp0 to hold thread.sp0 instead of tss.sp1.

Changed from V1
Requested by Andy to also fix sp1 for x86_32.
Update comments in the x86_64 patch as Andy suggested.

Lai Jiangshan (6):
  x86_64: move cpu_current_top_of_stack out of TSS
  x86_32: use percpu instead of offset-calculation to get thread.sp0
when SWITCH_TO_KERNEL_STACK
  x86_32/sysenter: switch to the task stack without emptying the entry
stack
  x86_32/sysenter: restore %fs before switching stack
  x86_32/sysenter: use percpu to get thread.sp0 when sysenter
  x86_32: use cpu_current_thread_sp0 instead of cpu_tss_rw.x86_tss.sp1

 arch/x86/entry/entry_32.S  | 38 +-
 arch/x86/include/asm/processor.h   | 12 ++
 arch/x86/include/asm/switch_to.h   |  9 ++-
 arch/x86/include/asm/thread_info.h |  6 -
 arch/x86/kernel/asm-offsets.c  |  1 -
 arch/x86/kernel/asm-offsets_32.c   | 10 
 arch/x86/kernel/cpu/common.c   | 12 +-
 arch/x86/kernel/process.c  |  7 --
 arch/x86/mm/pti.c  |  7 +++---
 9 files changed, 40 insertions(+), 62 deletions(-)

-- 
2.19.1.6.gb485710b



[PATCH V2] x86/entry/64: De-Xen-ify our NMI code further

2021-01-24 Thread Lai Jiangshan
From: Lai Jiangshan 

Commit 929bacec21478 ("x86/entry/64: De-Xen-ify our NMI code") simplified
the NMI code by changing paravirt code into native code and left a comment
about "inspecting RIP instead".  But until now, "inspecting RIP instead"
has not happened, and this patch completes it.

The comments in the code are from Andy Lutomirski.  Thanks!
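
To make the "inspect RIP" idea concrete, here is a toy C model of the
nested-NMI test; it is an illustration only (the real check is the asm
below), and the label address is an assumption:

	#include <stdbool.h>
	#include <stdint.h>

	/* Address of the .Lnmi_iret label, i.e. the instruction right after
	 * "NMI executing" is cleared (assumed for illustration). */
	extern const uint64_t nmi_iret_rip;

	/* A new NMI is nested if the outer NMI is still flagged as running,
	 * or if it hit the one-instruction window between clearing the flag
	 * and IRET, detected by comparing the interrupted RIP against
	 * .Lnmi_iret. */
	static bool nmi_is_nested(bool nmi_executing, uint64_t interrupted_rip)
	{
		return nmi_executing || interrupted_rip == nmi_iret_rip;
	}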

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_64.S | 44 ++-
 1 file changed, 11 insertions(+), 33 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cad08703c4ad..21f67ea62341 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1268,32 +1268,14 @@ SYM_CODE_START(asm_exc_nmi)
je  nested_nmi
 
/*
-* Now test if the previous stack was an NMI stack.  This covers
-* the case where we interrupt an outer NMI after it clears
-* "NMI executing" but before IRET.  We need to be careful, though:
-* there is one case in which RSP could point to the NMI stack
-* despite there being no NMI active: naughty userspace controls
-* RSP at the very beginning of the SYSCALL targets.  We can
-* pull a fast one on naughty userspace, though: we program
-* SYSCALL to mask DF, so userspace cannot cause DF to be set
-* if it controls the kernel's RSP.  We set DF before we clear
-* "NMI executing".
+* Now test if we interrupted an outer NMI that just cleared "NMI
+* executing" and is about to IRET.  This is a single-instruction
+* window.  This check does not handle the case in which we get a
+* nested interrupt (#MC, #VE, #VC, etc.) after clearing
+* "NMI executing" but before the outer NMI executes IRET.
 */
-   lea 6*8(%rsp), %rdx
-   /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) 
*/
-   cmpq%rdx, 4*8(%rsp)
-   /* If the stack pointer is above the NMI stack, this is a normal NMI */
-   ja  first_nmi
-
-   subq$EXCEPTION_STKSZ, %rdx
-   cmpq%rdx, 4*8(%rsp)
-   /* If it is below the NMI stack, it is a normal NMI */
-   jb  first_nmi
-
-   /* Ah, it is within the NMI stack. */
-
-   testb   $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
-   jz  first_nmi   /* RSP was user controlled. */
+   cmpq$.Lnmi_iret, 8(%rsp)
+   jne first_nmi
 
/* This is a nested NMI. */
 
@@ -1438,17 +1420,13 @@ nmi_restore:
addq$6*8, %rsp
 
/*
-* Clear "NMI executing".  Set DF first so that we can easily
-* distinguish the remaining code between here and IRET from
-* the SYSCALL entry and exit paths.
-*
-* We arguably should just inspect RIP instead, but I (Andy) wrote
-* this code when I had the misapprehension that Xen PV supported
-* NMIs, and Xen PV would break that approach.
+* Clear "NMI executing".  This leaves a window in which a nested NMI
+* could observe "NMI executing" cleared, and a nested NMI will detect
+* this by inspecting RIP.
 */
-   std
movq$0, 5*8(%rsp)   /* clear "NMI executing" */
 
+.Lnmi_iret: /* must be immediately after clearing "NMI executing" */
/*
 * iretq reads the "iret" frame and exits the NMI stack in a
 * single instruction.  We are returning to kernel mode, so this
-- 
2.19.1.6.gb485710b



[PATCH] x86/entry/64: De-Xen-ify our NMI code further

2021-01-24 Thread Lai Jiangshan
From: Lai Jiangshan 

Commit 929bacec21478 ("x86/entry/64: De-Xen-ify our NMI code") simplified
the NMI code by changing paravirt code into native code and left a comment
about "inspecting RIP instead".  But until now, "inspecting RIP instead"
has not happened, and this patch completes it.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/entry/entry_64.S | 46 +++
 1 file changed, 13 insertions(+), 33 deletions(-)

diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S
index cad08703c4ad..cb6b8a6c6652 100644
--- a/arch/x86/entry/entry_64.S
+++ b/arch/x86/entry/entry_64.S
@@ -1268,32 +1268,12 @@ SYM_CODE_START(asm_exc_nmi)
je  nested_nmi
 
/*
-* Now test if the previous stack was an NMI stack.  This covers
-* the case where we interrupt an outer NMI after it clears
-* "NMI executing" but before IRET.  We need to be careful, though:
-* there is one case in which RSP could point to the NMI stack
-* despite there being no NMI active: naughty userspace controls
-* RSP at the very beginning of the SYSCALL targets.  We can
-* pull a fast one on naughty userspace, though: we program
-* SYSCALL to mask DF, so userspace cannot cause DF to be set
-* if it controls the kernel's RSP.  We set DF before we clear
-* "NMI executing".
+* Now test if we interrupt an outer NMI after it clears
+* "NMI executing" but before iret.
 */
-   lea 6*8(%rsp), %rdx
-   /* Compare the NMI stack (rdx) with the stack we came from (4*8(%rsp)) 
*/
-   cmpq%rdx, 4*8(%rsp)
-   /* If the stack pointer is above the NMI stack, this is a normal NMI */
-   ja  first_nmi
-
-   subq$EXCEPTION_STKSZ, %rdx
-   cmpq%rdx, 4*8(%rsp)
-   /* If it is below the NMI stack, it is a normal NMI */
-   jb  first_nmi
-
-   /* Ah, it is within the NMI stack. */
-
-   testb   $(X86_EFLAGS_DF >> 8), (3*8 + 1)(%rsp)
-   jz  first_nmi   /* RSP was user controlled. */
+   movq$nmi_executing_cleared, %rdx
+   cmpq8(%rsp), %rdx
+   jne first_nmi
 
/* This is a nested NMI. */
 
@@ -1438,16 +1418,16 @@ nmi_restore:
addq$6*8, %rsp
 
/*
-* Clear "NMI executing".  Set DF first so that we can easily
-* distinguish the remaining code between here and IRET from
-* the SYSCALL entry and exit paths.
-*
-* We arguably should just inspect RIP instead, but I (Andy) wrote
-* this code when I had the misapprehension that Xen PV supported
-* NMIs, and Xen PV would break that approach.
+* Clear "NMI executing".  It also leaves a window after it before
+* iret which should be also considered to be "NMI executing" albeit
+* with "NMI executing" variable being zero.  So we should also check
+* the RIP after it when checking "NMI executing".  See the code
+* before nested_nmi.  No code is allowed to be added to between
+* clearing "NMI executing" and iret unless we check a larger window
+* with a range of RIPs instead of currently a single-RIP window.
 */
-   std
movq$0, 5*8(%rsp)   /* clear "NMI executing" */
+nmi_executing_cleared:
 
/*
 * iretq reads the "iret" frame and exits the NMI stack in a
-- 
2.19.1.6.gb485710b



Re: [PATCH v7 45/72] x86/entry/64: Add entry code for #VC handler

2021-01-24 Thread Lai Jiangshan
> +
> +   /*
> +* No need to switch back to the IST stack. The current stack is 
> either
> +* identical to the stack in the IRET frame or the VC fall-back stack,
> +* so it is definitly mapped even with PTI enabled.
> +*/
> +   jmp paranoid_exit
> +
>

Hello

I know we don't enable PTI on AMD, but the above comment doesn't match
the code that follows.

Assume PTI is enabled, as the comment says "even with PTI enabled".

When #VC happens after entry_SYSCALL_64 but before it switches to the
kernel CR3, vc_switch_off_ist() will switch the stack to the kernel stack,
and paranoid_exit can't work when it switches to the user CR3 while still
on the kernel stack.

The comment above misses the fact that the current stack may be the
kernel stack, which is not mapped in the user CR3.

Maybe I missed something.

Thanks
Lai

> +#ifdef CONFIG_AMD_MEM_ENCRYPT
> +asmlinkage __visible noinstr struct pt_regs *vc_switch_off_ist(struct 
> pt_regs *regs)
> +{
> +   unsigned long sp, *stack;
> +   struct stack_info info;
> +   struct pt_regs *regs_ret;
> +
> +   /*
> +* In the SYSCALL entry path the RSP value comes from user-space - 
> don't
> +* trust it and switch to the current kernel stack
> +*/
> +   if (regs->ip >= (unsigned long)entry_SYSCALL_64 &&
> +   regs->ip <  (unsigned long)entry_SYSCALL_64_safe_stack) {
> +   sp = this_cpu_read(cpu_current_top_of_stack);
> +   goto sync;
> +   }


[PATCH] x86_64: move cpu_current_top_of_stack out of TSS

2021-01-22 Thread Lai Jiangshan
From: Lai Jiangshan 

When X86_BUG_CPU_MELTDOWN is present and KPTI is enabled,
cpu_current_top_of_stack lives in the TSS, which is also mapped in the
user CR3, and it becomes a coveted target.  An attacker can fetch the
kernel stack top from it and base the next steps of an attack on the
kernel stack.

The address might not be very useful to an attacker, but it does not
need to be in the TSS either.  It is only accessed when CR3 is the
kernel CR3 and gs_base is the kernel gs_base, which means it can live
in any percpu variable.

The major reason it is in the TSS might be performance, because it is
hot in the cache and TLB since sp2 is accessed as scratch space in the
syscall path anyway.

So it can be moved to a percpu variable near other hot percpu variables,
such as current_task and __preempt_count, which share the same cache
line.

tools/testing/selftests/seccomp/seccomp_benchmark doesn't show any
performance loss in the "getpid native" result.  Actually, the result
changes from 93ns before the patch to 92ns after it when !KPTI; the test
is very stable and, while it doesn't offer a high degree of precision,
it is enough to show that the change causes no regression.

Signed-off-by: Lai Jiangshan 
---
 arch/x86/include/asm/processor.h   | 10 --
 arch/x86/include/asm/switch_to.h   |  7 +--
 arch/x86/include/asm/thread_info.h |  6 --
 arch/x86/kernel/cpu/common.c   |  3 +++
 arch/x86/kernel/process.c  |  8 ++--
 arch/x86/mm/pti.c  |  7 +++
 6 files changed, 9 insertions(+), 32 deletions(-)

diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h
index c20a52b5534b..886d32da1318 100644
--- a/arch/x86/include/asm/processor.h
+++ b/arch/x86/include/asm/processor.h
@@ -314,11 +314,6 @@ struct x86_hw_tss {
 struct x86_hw_tss {
u32 reserved1;
u64 sp0;
-
-   /*
-* We store cpu_current_top_of_stack in sp1 so it's always accessible.
-* Linux does not use ring 1, so sp1 is not otherwise needed.
-*/
u64 sp1;
 
/*
@@ -428,12 +423,7 @@ struct irq_stack {
 
 DECLARE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 
-#ifdef CONFIG_X86_32
 DECLARE_PER_CPU(unsigned long, cpu_current_top_of_stack);
-#else
-/* The RO copy can't be accessed with this_cpu_xyz(), so use the RW copy. */
-#define cpu_current_top_of_stack cpu_tss_rw.x86_tss.sp1
-#endif
 
 #ifdef CONFIG_X86_64
 struct fixed_percpu_data {
diff --git a/arch/x86/include/asm/switch_to.h b/arch/x86/include/asm/switch_to.h
index 9f69cc497f4b..4f0bc8533a54 100644
--- a/arch/x86/include/asm/switch_to.h
+++ b/arch/x86/include/asm/switch_to.h
@@ -71,12 +71,7 @@ static inline void update_task_stack(struct task_struct 
*task)
else
this_cpu_write(cpu_tss_rw.x86_tss.sp1, task->thread.sp0);
 #else
-   /*
-* x86-64 updates x86_tss.sp1 via cpu_current_top_of_stack. That
-* doesn't work on x86-32 because sp1 and
-* cpu_current_top_of_stack have different values (because of
-* the non-zero stack-padding on 32bit).
-*/
+   /* XENPV keeps its entry stack to be kernel stack.  */
if (static_cpu_has(X86_FEATURE_XENPV))
load_sp0(task_top_of_stack(task));
 #endif
diff --git a/arch/x86/include/asm/thread_info.h 
b/arch/x86/include/asm/thread_info.h
index 0d751d5da702..3dc93d8df425 100644
--- a/arch/x86/include/asm/thread_info.h
+++ b/arch/x86/include/asm/thread_info.h
@@ -197,12 +197,6 @@ static inline int arch_within_stack_frames(const void * 
const stack,
 #endif
 }
 
-#else /* !__ASSEMBLY__ */
-
-#ifdef CONFIG_X86_64
-# define cpu_current_top_of_stack (cpu_tss_rw + TSS_sp1)
-#endif
-
 #endif
 
 #ifdef CONFIG_COMPAT
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 35ad8480c464..f3d7fd7e9684 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1745,6 +1745,9 @@ DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 DEFINE_PER_CPU(int, __preempt_count) = INIT_PREEMPT_COUNT;
 EXPORT_PER_CPU_SYMBOL(__preempt_count);
 
+DEFINE_PER_CPU(unsigned long, cpu_current_top_of_stack) = TOP_OF_INIT_STACK;
+EXPORT_PER_CPU_SYMBOL(cpu_current_top_of_stack);
+
 /* May not be marked __init: used by software suspend */
 void syscall_init(void)
 {
diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c
index 145a7ac0c19a..7c4d0184a44a 100644
--- a/arch/x86/kernel/process.c
+++ b/arch/x86/kernel/process.c
@@ -63,14 +63,10 @@ __visible DEFINE_PER_CPU_PAGE_ALIGNED(struct tss_struct, 
cpu_tss_rw) = {
 */
.sp0 = (1UL << (BITS_PER_LONG-1)) + 1,
 
-   /*
-* .sp1 is cpu_current_top_of_stack.  The init task never
-* runs user code, but cpu_current_top_of_stack should still
-* be well defined before the first context switch.
-*/
+#ifdef CONFIG_X86_32
+   /* .sp1 is used v

[tip: sched/urgent] workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity

2021-01-22 Thread tip-bot2 for Lai Jiangshan
The following commit has been merged into the sched/urgent branch of tip:

Commit-ID: 547a77d02f8cfb345631ce23b5b548d27afa0fc4
Gitweb:
https://git.kernel.org/tip/547a77d02f8cfb345631ce23b5b548d27afa0fc4
Author:Lai Jiangshan 
AuthorDate:Mon, 11 Jan 2021 23:26:33 +08:00
Committer: Peter Zijlstra 
CommitterDate: Fri, 22 Jan 2021 15:09:41 +01:00

workqueue: Use cpu_possible_mask instead of cpu_active_mask to break affinity

The scheduler won't break affinity for us any more, so workqueue should
"emulate" the behavior the scheduler used to apply when it broke affinity
for us: changing the cpumask to cpu_possible_mask.

Also, other CPUs might come online later while the worker is still
running with pending work items.  The worker should be allowed to use
those later-online CPUs as before and process the work items ASAP.
Using cpu_active_mask here cannot achieve this, but cpu_possible_mask
can.

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Signed-off-by: Lai Jiangshan 
Signed-off-by: Peter Zijlstra (Intel) 
Reviewed-by: Valentin Schneider 
Acked-by: Tejun Heo 
Tested-by: Paul E. McKenney 
Tested-by: Valentin Schneider 
Link: https://lkml.kernel.org/r/2021052638.2417-4-jiangshan...@gmail.com
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9880b6c..1646331 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4920,7 +4920,7 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
 
for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
 
mutex_unlock(&wq_pool_attach_mutex);
 


Re: [PATCH] workqueue: fix annotation for WQ_SYSFS

2021-01-18 Thread Lai Jiangshan
On Mon, Jan 18, 2021 at 4:05 PM  wrote:
>
> From: Menglong Dong 
>
> 'wq_sysfs_register()' in annotation for 'WQ_SYSFS' is unavailable,
> change it to 'workqueue_sysfs_register()'.
>
> Signed-off-by: Menglong Dong 

Reviewed-by: Lai Jiangshan 

> ---
>  include/linux/workqueue.h | 2 +-
>  1 file changed, 1 insertion(+), 1 deletion(-)
>
> diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
> index 26de0cae2a0a..d15a7730ee18 100644
> --- a/include/linux/workqueue.h
> +++ b/include/linux/workqueue.h
> @@ -311,7 +311,7 @@ enum {
> WQ_MEM_RECLAIM  = 1 << 3, /* may be used for memory reclaim */
> WQ_HIGHPRI  = 1 << 4, /* high priority */
> WQ_CPU_INTENSIVE= 1 << 5, /* cpu intensive workqueue */
> -   WQ_SYSFS= 1 << 6, /* visible in sysfs, see 
> wq_sysfs_register() */
> +   WQ_SYSFS= 1 << 6, /* visible in sysfs, see 
> workqueue_sysfs_register() */
>
> /*
>  * Per-cpu workqueues are generally preferred because they tend to
> --
> 2.25.1
>


Re: [PATCH 8/8] sched: Relax the set_cpus_allowed_ptr() semantics

2021-01-16 Thread Lai Jiangshan
On Sat, Jan 16, 2021 at 7:43 PM Peter Zijlstra  wrote:
>
> Now that we have KTHREAD_IS_PER_CPU to denote the critical per-cpu
> tasks to retain during CPU offline, we can relax the warning in
> set_cpus_allowed_ptr(). Any spurious kthread that wants to get on at
> the last minute will get pushed off before it can run.
>
> While during CPU online there is no harm, and actual benefit, to
> allowing kthreads back on early, it simplifies hotplug code.
>
> Signed-off-by: Peter Zijlstra (Intel) 

Thanks!

Relaxing set_cpus_allowed_ptr() was also one of the choices I listed;
it can really simplify the hotplug code in workqueue and maybe other
hotplug code as well.

Reviewed-by: Lai jiangshan 

> ---
>  kernel/sched/core.c |   20 +---
>  1 file changed, 9 insertions(+), 11 deletions(-)
>
> --- a/kernel/sched/core.c
> +++ b/kernel/sched/core.c
> @@ -2342,7 +2342,9 @@ static int __set_cpus_allowed_ptr(struct
>
> if (p->flags & PF_KTHREAD || is_migration_disabled(p)) {
> /*
> -* Kernel threads are allowed on online && !active CPUs.
> +* Kernel threads are allowed on online && !active CPUs,
> +* however, during cpu-hot-unplug, even these might get pushed
> +* away if not KTHREAD_IS_PER_CPU.
>  *
>  * Specifically, migration_disabled() tasks must not fail the
>  * cpumask_any_and_distribute() pick below, esp. so on
> @@ -2386,16 +2388,6 @@ static int __set_cpus_allowed_ptr(struct
>
> __do_set_cpus_allowed(p, new_mask, flags);
>
> -   if (p->flags & PF_KTHREAD) {
> -   /*
> -* For kernel threads that do indeed end up on online &&
> -* !active we want to ensure they are strict per-CPU threads.
> -*/
> -   WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
> -   !cpumask_intersects(new_mask, cpu_active_mask) &&
> -   p->nr_cpus_allowed != 1);
> -   }
> -
> return affine_move_task(rq, p, &rf, dest_cpu, flags);
>
>  out:
> @@ -7519,6 +7511,12 @@ int sched_cpu_deactivate(unsigned int cp
>  */
> synchronize_rcu();
>
> +   /*
> +* From this point forward, this CPU will refuse to run any task that
> +* is not: migrate_disable() or KTHREAD_IS_PER_CPU, and will actively
> +* push those tasks away until this gets cleared, see
> +* sched_cpu_dying().
> +*/
> balance_push_set(cpu, true);
>
> rq_lock_irqsave(rq, );
>
>


Re: [PATCH 3/4] workqueue: Tag bound workers with KTHREAD_IS_PER_CPU

2021-01-16 Thread Lai Jiangshan
On Sat, Jan 16, 2021 at 11:16 PM Peter Zijlstra  wrote:
>
> On Sat, Jan 16, 2021 at 10:45:04PM +0800, Lai Jiangshan wrote:
> > On Sat, Jan 16, 2021 at 8:45 PM Peter Zijlstra  wrote:
> > > It is also the exact sequence normal per-cpu threads (smpboot) use to
> > > preserve affinity.
> >
> > Other per-cpu threads normally do short-live works. wq's work can be
> > lengthy, cpu-intensive, heavy-lock-acquiring or even call
> > get_online_cpus() which might result in a deadlock with kthread_park().
>
> kthread_park() is called by the migration thread running the
> workqueue_online_cpu() callback.
>
> kthread_parkme() is called by the worker thread, after it completes a
> work and has no locks held from that context.
>
>

BP:                          AP:                       worker:
cpus_write_lock()
bringup_cpu()                                          work_item_func()
  bringup_wait_for_ap                                    get_online_cpus()
                             kthread_park(worker)


Re: [PATCH 3/4] workqueue: Tag bound workers with KTHREAD_IS_PER_CPU

2021-01-16 Thread Lai Jiangshan
On Sat, Jan 16, 2021 at 8:45 PM Peter Zijlstra  wrote:
>
> On Sat, Jan 16, 2021 at 02:27:09PM +0800, Lai Jiangshan wrote:
> > On Thu, Jan 14, 2021 at 11:35 PM Peter Zijlstra  
> > wrote:
> >
> > >
> > > -void kthread_set_per_cpu(struct task_struct *k, bool set)
> > > +void kthread_set_per_cpu(struct task_struct *k, int cpu)
> > >  {
> > > struct kthread *kthread = to_kthread(k);
> > > if (!kthread)
> > > return;
> > >
> > > -   if (set) {
> > > -   WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
> > > -   WARN_ON_ONCE(k->nr_cpus_allowed != 1);
> > > -   set_bit(KTHREAD_IS_PER_CPU, >flags);
> > > -   } else {
> > > +   WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
> > > +
> > > +   if (cpu < 0) {
> > > clear_bit(KTHREAD_IS_PER_CPU, >flags);
> > > +   return;
> > > }
> > > +
> > > +   kthread->cpu = cpu;
> > > +   set_bit(KTHREAD_IS_PER_CPU, >flags);
> > >  }
> > >
> >
> > I don't see the code to set the mask of the cpu to the task
> > since set_cpus_allowed_ptr() is removed from rebind_worker().
> >
> > Is it somewhere I missed?
>
> kthread_unpark().
>
> > > @@ -4978,9 +4982,9 @@ static void rebind_workers(struct worker_pool *pool)
> > >  * from CPU_ONLINE, the following shouldn't fail.
> > >  */
> > > for_each_pool_worker(worker, pool) {
> > > -   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
> > > - pool->attrs->cpumask) < 
> > > 0);
> > > -   kthread_set_per_cpu(worker->task, true);
> > > +   WARN_ON_ONCE(kthread_park(worker->task) < 0);
> > > +   kthread_set_per_cpu(worker->task, pool->cpu);
> > > +   kthread_unpark(worker->task);
> >
> > I feel nervous to use kthread_park() here and kthread_parkme() in
> > worker thread.  And adding kthread_should_park() to the fast path
> > also daunt me.
>
> Is that really such a hot path that an additional load is problematic?
>
> > How about using a new KTHREAD_ instead of KTHREAD_IS_PER_CPU,
> > so that we can set and clear KTHREAD_ freely, especially before
> > set_cpus_allowed_ptr().
>
> KTHREAD_IS_PER_CPU is exactly what we need, why make another flag?
>
> The above sequence is nice in that it restores both the
> KTHREAD_IS_PER_CPU flag and affinity while the task is frozen, so there
> are no races where one is observed and not the other.
>
> It is also the exact sequence normal per-cpu threads (smpboot) use to
> preserve affinity.

Other per-cpu threads normally do short-lived work. A wq's work item can be
lengthy, cpu-intensive, heavy-lock-acquiring, or even call
get_online_cpus(), which might result in a deadlock with kthread_park().


Re: [PATCH 3/4] workqueue: Tag bound workers with KTHREAD_IS_PER_CPU

2021-01-15 Thread Lai Jiangshan
On Thu, Jan 14, 2021 at 11:35 PM Peter Zijlstra  wrote:

>
> -void kthread_set_per_cpu(struct task_struct *k, bool set)
> +void kthread_set_per_cpu(struct task_struct *k, int cpu)
>  {
> struct kthread *kthread = to_kthread(k);
> if (!kthread)
> return;
>
> -   if (set) {
> -   WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
> -   WARN_ON_ONCE(k->nr_cpus_allowed != 1);
> -   set_bit(KTHREAD_IS_PER_CPU, >flags);
> -   } else {
> +   WARN_ON_ONCE(!(k->flags & PF_NO_SETAFFINITY));
> +
> +   if (cpu < 0) {
> clear_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
> +   return;
> }
> +
> +   kthread->cpu = cpu;
> +   set_bit(KTHREAD_IS_PER_CPU, &kthread->flags);
>  }
>

I don't see the code that sets the CPU's mask on the task,
since set_cpus_allowed_ptr() is removed from rebind_workers().

Is it somewhere I missed?
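
(The answer, per the follow-up exchange, is kthread_unpark(); roughly, from
mainline kthread.c of that era, quoted here only for context:)

void kthread_unpark(struct task_struct *k)
{
        struct kthread *kthread = to_kthread(k);

        /*
         * A newly created kthread was parked when the CPU was offline.
         * The binding was lost and needs to be set again.
         */
        if (test_bit(KTHREAD_IS_PER_CPU, &kthread->flags))
                __kthread_bind(k, kthread->cpu, TASK_PARKED);

        clear_bit(KTHREAD_SHOULD_PARK, &kthread->flags);
        wake_up_state(k, TASK_PARKED);
}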

>
> @@ -2371,6 +2371,7 @@ static int worker_thread(void *__worker)
> /* tell the scheduler that this is a workqueue worker */
> set_pf_worker(true);
>  woke_up:
> +   kthread_parkme();
> raw_spin_lock_irq(&pool->lock);
>
> /* am I supposed to die? */
> @@ -2428,7 +2429,7 @@ static int worker_thread(void *__worker)
> move_linked_works(work, &worker->scheduled, NULL);
> process_scheduled_works(worker);
> }
> -   } while (keep_working(pool));
> +   } while (keep_working(pool) && !kthread_should_park());
>
> worker_set_flags(worker, WORKER_PREP);
>  sleep:
> @@ -2440,9 +2441,12 @@ static int worker_thread(void *__worker)
>  * event.
>  */
> worker_enter_idle(worker);
> -   __set_current_state(TASK_IDLE);
> +   set_current_state(TASK_IDLE);
> raw_spin_unlock_irq(&pool->lock);
> -   schedule();
> +
> +   if (!kthread_should_park())
> +   schedule();
> +
> goto woke_up;
>  }
>
> @@ -4923,7 +4927,7 @@ static void unbind_workers(int cpu)
> raw_spin_unlock_irq(&pool->lock);
>
> for_each_pool_worker(worker, pool) {
> -   kthread_set_per_cpu(worker->task, false);
> +   kthread_set_per_cpu(worker->task, -1);
> WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
> cpu_possible_mask) < 0);
> }
>
> @@ -4978,9 +4982,9 @@ static void rebind_workers(struct worker_pool *pool)
>  * from CPU_ONLINE, the following shouldn't fail.
>  */
> for_each_pool_worker(worker, pool) {
> -   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
> - pool->attrs->cpumask) < 0);
> -   kthread_set_per_cpu(worker->task, true);
> +   WARN_ON_ONCE(kthread_park(worker->task) < 0);
> +   kthread_set_per_cpu(worker->task, pool->cpu);
> +   kthread_unpark(worker->task);

I feel nervous about using kthread_park() here and kthread_parkme() in
the worker thread.  And adding kthread_should_park() to the fast path
also daunts me.

How about using a new KTHREAD_* flag instead of KTHREAD_IS_PER_CPU,
so that we can set and clear it freely, especially before
set_cpus_allowed_ptr()?

For example, we can add a new KTHREAD_ACTIVE_MASK_ONLY which means
even when
  is_per_cpu_kthread() && the_cpu_is_online() &&
  the_cpu_is_not_active() && KTHREAD_ACTIVE_MASK_ONLY
we should also break the affinity.

That way we can easily set KTHREAD_ACTIVE_MASK_ONLY in unbind_workers()
and clear KTHREAD_ACTIVE_MASK_ONLY here, and avoid adding new
synchronization like kthread_park().
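
A rough sketch of the proposed semantics (KTHREAD_ACTIVE_MASK_ONLY and the
helper below are hypothetical and purely illustrative; this flag was never
merged):

static bool kthread_should_break_affinity(struct task_struct *p, int cpu)
{
        /* non-per-cpu kthreads already get their affinity broken */
        if (!is_per_cpu_kthread(p))
                return false;

        /* hypothetical flag: opt this per-cpu kthread back into breaking */
        return cpu_online(cpu) && !cpu_active(cpu) &&
               test_bit(KTHREAD_ACTIVE_MASK_ONLY, &to_kthread(p)->flags);
}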

> }
>
> raw_spin_lock_irq(&pool->lock);


Re: [PATCH -tip V3 0/8] workqueue: break affinity initiatively

2021-01-15 Thread Lai Jiangshan
On Fri, Jan 15, 2021 at 9:05 PM Peter Zijlstra  wrote:
>
> On Fri, Jan 15, 2021 at 10:11:51AM +0100, Peter Zijlstra wrote:
> > On Tue, Jan 12, 2021 at 03:53:24PM -0800, Paul E. McKenney wrote:
> > > An SRCU-P run on the new series reproduced the warning below.  Repeat-by:
> > >
> > > tools/testing/selftests/rcutorture/bin/kvm.sh --allcpus --duration 10 
> > > --configs "112*SRCU-P" --bootargs 
> > > "rcupdate.rcu_cpu_stall_suppress_at_boot=1 torture.disable_onoff_at_boot 
> > > rcupdate.rcu_task_stall_timeout=3 rcutree.softirq=0" --trust-make
> >
> > Lemme go wake up an EX ;-)
>
> Whee, rescuer thread goes wobbly... took a few hours but there you have
> it.
>
> All I've got so far is fugly, gotta think harder.

Hello,

With the help of this patch, you can solve the problem.

https://lore.kernel.org/lkml/20210116065753.2163-1-jiangshan...@gmail.com/


[PATCH] workqueue: keep unbound rescuer's cpumask to be default cpumask

2021-01-15 Thread Lai Jiangshan
From: Lai Jiangshan 

When we attach a rescuer to a pool, we set the rescuer's cpumask
to the pool's cpumask.  If a hotplug operation is ongoing, this may
leave the rescuer running on the dying CPU and cause a bug, or it may
trigger the warning about setting an online&!active cpumask.

So we have to find a reliable way to set the cpumask when attaching
the rescuer.

When the pool is a percpu pool, we have an easy way to reliably
set the cpumask with the help of %POOL_DISASSOCIATED.

But when it is an unbound rescuer, the problem becomes harder, because
we can neither use get_online_cpus() here nor test cpu_active_mask
without synchronization.

After investigation, and noticing the unbound nature of the unbound
rescuer, we decide to make it use the wq's default pwq's cpumask so
that we don't need to set the rescuer's cpumask when attaching.

To implement this, we set the unbound rescuer's cpumask to the
default pwq's cpumask at creation time and maintain it across CPU
hotplug and whenever the default pwq is changed.

Signed-off-by: Lai Jiangshan 
---
NOTE:
this patch is designed to be a complement of Peter's
patchset https://lore.kernel.org/lkml/20210112144344.850850...@infradead.org/
where it has a problem dealing with rescuer.

 kernel/workqueue.c | 101 +
 1 file changed, 92 insertions(+), 9 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9880b6c0e272..901abab945d4 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1849,16 +1849,37 @@ static void worker_attach_to_pool(struct worker *worker,
mutex_lock(&wq_pool_attach_mutex);
 
/*
-* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-* online CPUs.  It'll be re-applied when any of the CPUs come up.
-*/
-   set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
-
-   /*
+* If we called from create_worker(), we don't need to call
+* set_cpus_allowed_ptr() since we just call kthread_bind_mask()
+* on it with proper cpumask.
+*
+* The only other path gets us here is rescuer_thread().  And we
+* only call set_cpus_allowed_ptr() to rescuer for percpu pool.
+*
+* For bound percpu pool, the task's mask is set to the pool's
+* mask.  For unbound percpu pool whose cpu is offline, the task's
+* mask is set to cpu_possible_mask instead of sticking to the
+* previous served percpu pool's mask.
+*
+* Unbound wq's rescuer's cpumask is kept the same as the wq's
+* default pwq's cpumask and maintained when cpu hotplug and
+* the update of the wq's default pwq.
+*
+* set_cpus_allowed_ptr() will not fail here since the pool's CPU
+* is online for !POOL_DISASSOCIATED and set_cpus_allowed_ptr() can
+* not fail on cpu_possible_mask.
+*
 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
 * stable across this function.  See the comments above the flag
 * definition for details.
 */
+   if (worker->rescue_wq && pool->cpu >= 0) {
+   if (!(pool->flags & POOL_DISASSOCIATED))
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0);
+   else
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+   }
+
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
 
@@ -4003,6 +4024,25 @@ static void apply_wqattrs_commit(struct apply_wqattrs_ctx *ctx)
link_pwq(ctx->dfl_pwq);
swap(ctx->wq->dfl_pwq, ctx->dfl_pwq);
 
+   if (ctx->wq->rescuer) {
+   struct task_struct *task = ctx->wq->rescuer->task;
+   struct cpumask *dfl_mask = ctx->dfl_pwq->pool->attrs->cpumask;
+
+   /*
+* set the rescuer's cpumask to default pwq's cpumask.
+*
+* There might be no CPUs online in the default pwq's
+* cpumask and set_cpus_allowed_ptr() will fail on this
+* cpumask, so we use cpu_possible_mask instead and
+* restore_unbound_rescuer_cpumask() will restore for us
+* when the CPUs are back.
+*/
+   if (cpumask_intersects(dfl_mask, cpu_online_mask))
+   WARN_ON_ONCE(set_cpus_allowed_ptr(task, dfl_mask) < 0);
+   else
+   WARN_ON_ONCE(set_cpus_allowed_ptr(task, cpu_possible_mask) < 0);
+   }
+
mutex_unlock(&ctx->wq->mutex);
 }
 
@@ -4241,7 +4281,14 @@ static int init_rescuer(struct workqueue_struct *wq)
}
 
wq->rescuer = rescuer;
-   kthread_bind_mask(rescuer->task, cpu_possible_mask);
+   if (wq->flags & WQ_UNBOUND) {
+   /* grab wq->mutex 

Re: [PATCH 3/4] workqueue: Tag bound workers with KTHREAD_IS_PER_CPU

2021-01-13 Thread Lai Jiangshan
On Tue, Jan 12, 2021 at 10:51 PM Peter Zijlstra  wrote:
>
> Mark the per-cpu workqueue workers as KTHREAD_IS_PER_CPU.
>
> Workqueues have unfortunate semantics in that per-cpu workers are not
> default flushed and parked during hotplug, however a subset does
> manual flush on hotplug and hard relies on them for correctness.
>
> Therefore play silly games..
>
> Signed-off-by: Peter Zijlstra (Intel) 
> Tested-by: Paul E. McKenney 
> ---
>  kernel/workqueue.c |   11 +--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1861,6 +1861,8 @@ static void worker_attach_to_pool(struct
>  */
> if (pool->flags & POOL_DISASSOCIATED)
> worker->flags |= WORKER_UNBOUND;
> +   else
> +   kthread_set_per_cpu(worker->task, true);
>
> list_add_tail(>node, >workers);
> worker->pool = pool;
> @@ -1883,6 +1885,7 @@ static void worker_detach_from_pool(stru
>
> mutex_lock(_pool_attach_mutex);
>
> +   kthread_set_per_cpu(worker->task, false);
> list_del(>node);
> worker->pool = NULL;
>
> @@ -4919,8 +4922,10 @@ static void unbind_workers(int cpu)
>
> raw_spin_unlock_irq(>lock);
>
> -   for_each_pool_worker(worker, pool)
> +   for_each_pool_worker(worker, pool) {
> +   kthread_set_per_cpu(worker->task, false);
> WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
> cpu_possible_mask) < 0);
> +   }
>
> mutex_unlock(_pool_attach_mutex);
>
> @@ -4972,9 +4977,11 @@ static void rebind_workers(struct worker
>  * of all workers first and then clear UNBOUND.  As we're called
>  * from CPU_ONLINE, the following shouldn't fail.
>  */
> -   for_each_pool_worker(worker, pool)
> +   for_each_pool_worker(worker, pool) {
> WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
>   pool->attrs->cpumask) < 0);
> +   kthread_set_per_cpu(worker->task, true);

Will the scheduler break affinity in the middle of these two lines, due to
patch 4 allowing it, and result in Paul's reported splat?

> +   }
>
> raw_spin_lock_irq(>lock);
>
>
>


Re: [PATCH -tip V3 0/8] workqueue: break affinity initiatively

2021-01-13 Thread Lai Jiangshan



On 2021/1/13 19:10, Peter Zijlstra wrote:

On Tue, Jan 12, 2021 at 11:38:12PM +0800, Lai Jiangshan wrote:


But the hard problem is "how to suppress the warning of
online&!active in __set_cpus_allowed_ptr()" for late spawned
unbound workers during hotplug.


I cannot see create_worker() go bad like that.

The thing is, it uses:

   kthread_bind_mask(worker->task, pool->attr->cpumask)
   worker_attach_to_pool()
     set_cpus_allowed_ptr(worker->task, pool->attr->cpumask)

which means set_cpus_allowed_ptr() must be a NOP, because the affinity
is already set by kthread_bind_mask(). Further, the first wakeup of that
worker will then hit:

   select_task_rq()
 is_cpu_allowed()
   is_per_cpu_kthread() -- false
 select_fallback_rq()


So normally that really isn't a problem. I can only see a tiny hole
there, where someone changes the cpumask between kthread_bind_mask() and
set_cpus_allowed_ptr(). AFAICT that can be fixed in two ways:

  - add wq_pool_mutex around things in create_worker(), or
  - move the set_cpus_allowed_ptr() out of worker_attach_to_pool() and
into rescuer_thread().

Which then brings us to rescuer_thread...  If we manage to trigger the
rescuer during hotplug, then yes, I think that can go wobbly.



How about the following idea (not compiled, not tested)?
It does not call set_cpus_allowed_ptr() for just-created workers.
It does not change the cpumask for the rescuer except when it is attaching
to a per-cpu pool.

The only problem is that the unbound rescue worker complies with neither
wq_unbound_cpumask nor wq->unbound_attrs->cpumask.  Another 50 lines
of code could make it comply, but I don't want to type that in email
and complicate the idea.

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9880b6c0e272..df2082283c1e 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1849,10 +1849,30 @@ static void worker_attach_to_pool(struct worker *worker,
mutex_lock(&wq_pool_attach_mutex);

/*
-* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-* online CPUs.  It'll be re-applied when any of the CPUs come up.
+* If we called from create_worker(), we don't need to call
+* set_cpus_allowed_ptr() since we just kthread_bind_mask() it.
+*
+* The only other path gets us here is rescuer_thread().
+*
+* When !(pool->flags & POOL_DISASSOCIATED), it is per-cpu pool
+* and we should rebind the rescuer worker to the target CPU.
+*
+* When it is a rescuer worker attaching to unbound pool, we keep
+* the affinity for rescuer worker to be cpu_possible_mask.
+*
+* Note: unbound rescue worker doesn't comply with wq_unbound_cpumask
+* nor wq->unbound_attrs->cpumask.  The optimal choice is to keep
+* the affinity for rescuer worker to be
+*  wq_unbound_cpumask & wq->unbound_attrs->cpumask
+* but there is no reliable way to set it back via
+* set_cpus_allowed_ptr() when its affinity is changed by scheduler
+* due to CPU hotplug, so we just use cpu_possible_mask for the rescuer.
+*
+* set_cpus_allowed_ptr() will not fail since
+* !(pool->flags & POOL_DISASSOCIATED)
 */
-   set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+   if (worker->rescue_wq && !(pool->flags & POOL_DISASSOCIATED))
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0);

/*
 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
@@ -5043,7 +5063,8 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)

/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
+   if (!worker->rescue_wq)
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
 }

 int workqueue_prepare_cpu(unsigned int cpu)






Re: [PATCH -tip V3 0/8] workqueue: break affinity initiatively

2021-01-13 Thread Lai Jiangshan
On Wed, Jan 13, 2021 at 7:11 PM Peter Zijlstra  wrote:
>
> On Tue, Jan 12, 2021 at 11:38:12PM +0800, Lai Jiangshan wrote:
>
> > But the hard problem is "how to suppress the warning of
> > online&!active in __set_cpus_allowed_ptr()" for late spawned
> > unbound workers during hotplug.
>
> I cannot see create_worker() go bad like that.
>
> The thing is, it uses:
>
>   kthread_bind_mask(worker->task, pool->attr->cpumask)
>   worker_attach_to_pool()
>     set_cpus_allowed_ptr(worker->task, pool->attr->cpumask)
>
> which means set_cpus_allowed_ptr() must be a NOP, because the affinity
> is already set by kthread_bind_mask(). Further, the first wakeup of that
> worker will then hit:
>
>   select_task_rq()
> is_cpu_allowed()
>   is_per_cpu_kthread() -- false
> select_fallback_rq()
>
>
> So normally that really isn't a problem. I can only see a tiny hole
> there, where someone changes the cpumask between kthread_bind_mask() and
> set_cpus_allowed_ptr(). AFAICT that can be fixed in two ways:
>
>  - add wq_pool_mutex around things in create_worker(), or
>  - move the set_cpus_allowed_ptr() out of worker_attach_to_pool() and
>into rescuer_thread().
>
> Which then brings us to rescuer_thread...  If we manage to trigger the
> rescuer during hotplug, then yes, I think that can go wobbly.

Oh, I forgot that set_cpus_allowed_ptr() is a NOP when combined with
kthread_bind_mask() (as in create_worker()).

So the problem becomes "how to suppress the warning of online&!active in
__set_cpus_allowed_ptr()" for late *attached unbound rescuer* workers
during hotplug.
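
For context, the warning in question comes from this check in
__set_cpus_allowed_ptr() (the same hunk that "sched: Relax the
set_cpus_allowed_ptr() semantics" later removes):

        if (p->flags & PF_KTHREAD) {
                /*
                 * For kernel threads that do indeed end up on online &&
                 * !active we want to ensure they are strict per-CPU threads.
                 */
                WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
                        !cpumask_intersects(new_mask, cpu_active_mask) &&
                        p->nr_cpus_allowed != 1);
        }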


>
> Let me consider that a bit more while I try and make sense of that splat
> Paul reported.
>
> ---
>
> diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> index ec0771e4a3fb..fe05308dc472 100644
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1844,15 +1844,19 @@ static struct worker *alloc_worker(int node)
>   * cpu-[un]hotplugs.
>   */
>  static void worker_attach_to_pool(struct worker *worker,
> -  struct worker_pool *pool)
> + struct worker_pool *pool,
> + bool set_affinity)
>  {
> mutex_lock(&wq_pool_attach_mutex);
>
> -   /*
> -* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
> -* online CPUs.  It'll be re-applied when any of the CPUs come up.
> -*/
> -   set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
> +   if (set_affinity) {
> +   /*
> +* set_cpus_allowed_ptr() will fail if the cpumask doesn't 
> have
> +* any online CPUs.  It'll be re-applied when any of the CPUs
> +* come up.
> +*/
> +   set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
> +   }
>
> /*
>  * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
> @@ -1944,7 +1948,7 @@ static struct worker *create_worker(struct worker_pool 
> *pool)
> kthread_bind_mask(worker->task, pool->attrs->cpumask);
>
> /* successful, attach the worker to the pool */
> -   worker_attach_to_pool(worker, pool);
> +   worker_attach_to_pool(worker, pool, false);
>
> /* start the newly created worker */
> raw_spin_lock_irq(&pool->lock);
> @@ -2509,7 +2513,11 @@ static int rescuer_thread(void *__rescuer)
>
> raw_spin_unlock_irq(&wq_mayday_lock);
>
> -   worker_attach_to_pool(rescuer, pool);
> +   /*
> +* XXX can go splat when running during hot-un-plug and
> +* the pool affinity is wobbly.
> +*/
> +   worker_attach_to_pool(rescuer, pool, true);
>
> raw_spin_lock_irq(&pool->lock);
>


Re: [PATCH 3/4] workqueue: Tag bound workers with KTHREAD_IS_PER_CPU

2021-01-12 Thread Lai Jiangshan
On Tue, Jan 12, 2021 at 10:51 PM Peter Zijlstra  wrote:
>
> Mark the per-cpu workqueue workers as KTHREAD_IS_PER_CPU.
>
> Workqueues have unfortunate semantics in that per-cpu workers are not
> default flushed and parked during hotplug, however a subset does
> manual flush on hotplug and hard relies on them for correctness.
>
> Therefore play silly games..
>
> Signed-off-by: Peter Zijlstra (Intel) 
> Tested-by: Paul E. McKenney 
> ---

Reviewed-by: Lai Jiangshan 

I like this patchset in that the scheduler takes care of the
affinities of the tasks when we don't want them to be per-cpu.

>  kernel/workqueue.c |   11 +--
>  1 file changed, 9 insertions(+), 2 deletions(-)
>
> --- a/kernel/workqueue.c
> +++ b/kernel/workqueue.c
> @@ -1861,6 +1861,8 @@ static void worker_attach_to_pool(struct
>  */
> if (pool->flags & POOL_DISASSOCIATED)
> worker->flags |= WORKER_UNBOUND;
> +   else
> +   kthread_set_per_cpu(worker->task, true);
>
> list_add_tail(>node, >workers);
> worker->pool = pool;
> @@ -1883,6 +1885,7 @@ static void worker_detach_from_pool(stru
>
> mutex_lock(_pool_attach_mutex);
>
> +   kthread_set_per_cpu(worker->task, false);
> list_del(>node);
> worker->pool = NULL;
>
> @@ -4919,8 +4922,10 @@ static void unbind_workers(int cpu)
>
> raw_spin_unlock_irq(>lock);
>
> -   for_each_pool_worker(worker, pool)
> +   for_each_pool_worker(worker, pool) {
> +   kthread_set_per_cpu(worker->task, false);
> WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
> cpu_possible_mask) < 0);
> +   }
>
> mutex_unlock(_pool_attach_mutex);
>
> @@ -4972,9 +4977,11 @@ static void rebind_workers(struct worker
>  * of all workers first and then clear UNBOUND.  As we're called
>  * from CPU_ONLINE, the following shouldn't fail.
>  */
> -   for_each_pool_worker(worker, pool)
> +   for_each_pool_worker(worker, pool) {
> WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
>   pool->attrs->cpumask) < 0);
> +   kthread_set_per_cpu(worker->task, true);
> +   }
>
> raw_spin_lock_irq(>lock);
>
>
>


Re: [PATCH -tip V3 0/8] workqueue: break affinity initiatively

2021-01-12 Thread Lai Jiangshan
On Tue, Jan 12, 2021 at 10:53 PM Peter Zijlstra  wrote:
>
> On Tue, Jan 12, 2021 at 12:33:03PM +0800, Lai Jiangshan wrote:
> > > Well yes, but afaict the workqueue stuff hasn't been settled yet, and
> > > the rcutorture patch Paul did was just plain racy and who knows what
> > > other daft kthread users are out there. That and we're at -rc3.
> >
> > I just send the V4 patchset for the workqueue.  Please take a look.
>
> Yes, I've seen it. But I think this approach is smaller and simpler.
>
> By distinguishing between geniuine per-cpu kthreads and kthreads that
> happen to have a single CPU affinity, things become much simpler and
> robust.

Again, fixing the problem of tasks running on a dying cpu is easy.
(In other words, adjusting affinity correctly is easy, no matter whether we
adjust affinity initiatively like here or passively via the new
!KTHREAD_IS_PER_CPU.)

For workqueue, Valentin Schneider's patch can almost complete it; only
a small additional fix for percpu unbound workers is needed.

And all four versions of my patchset can complete it.

But the hard problem is "how to suppress the warning of
online&!active in __set_cpus_allowed_ptr()" for late spawned
unbound workers during hotplug.

It is not handled completely until my fourth version patchset.

And your patchset ignores this theoretical case; I agree that is
okay since no one has reported the warning in practice so far.

So something like CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE is still needed
after your patchset merged.

Or you could move the warning in __set_cpus_allowed_ptr() to deeper
places where the problems really happen.


Re: [PATCH -tip V3 0/8] workqueue: break affinity initiatively

2021-01-11 Thread Lai Jiangshan
> Well yes, but afaict the workqueue stuff hasn't been settled yet, and
> the rcutorture patch Paul did was just plain racy and who knows what
> other daft kthread users are out there. That and we're at -rc3.

I just send the V4 patchset for the workqueue.  Please take a look.

> @@ -1861,6 +1861,8 @@ static void worker_attach_to_pool(struct worker *worker,
> */
> if (pool->flags & POOL_DISASSOCIATED)
> worker->flags |= WORKER_UNBOUND;
> +   else
> +   kthread_set_per_cpu(worker->task, true);

I think kthread_set_per_cpu(worker->task, false) is also needed in
worker_detach_from_pool(), or at least for rescuer_thread(), which doesn't
go on to die after being detached from the pool.

> I thought only pcpu pools would get the POOL_DISASSOCIATED flag on
> offline, but it seems unbound pools also get it at init time. Did I get
> that right?

You are right.

The POOL_DISASSOCIATED flag indicates whether the pool is concurrency-managed
or not (in the negative sense: POOL_DISASSOCIATED means "not concurrency
managed"). So it is applied to all unbound pools.

!POOL_DISASSOCIATED means it is a percpu pool whose pool->cpu is online
and whose offline callback has not been called yet, even if pool->cpu is
about to go offline.  So !POOL_DISASSOCIATED is used a lot in the code.
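
A minimal sketch of how that distinction is typically tested (illustrative
helper, not from the patchset; pool->cpu >= 0 identifies a percpu pool):

static bool pool_is_associated_percpu(const struct worker_pool *pool)
{
        /* percpu pool whose CPU is online and not yet torn down by hotplug */
        return pool->cpu >= 0 && !(pool->flags & POOL_DISASSOCIATED);
}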


[PATCH -tip V4 7/8] workqueue: Manually break affinity on hotplug for unbound pool

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

It is possible that a per-node pool/worker's affinity is a single
CPU.  This can happen when the workqueue user changes the cpumask of the
workqueue or when wq_unbound_cpumask is changed by the system admin via
/sys/devices/virtual/workqueue/cpumask.  And pool->attrs->cpumask
is the workqueue's cpumask & wq_unbound_cpumask & possible_cpumask_of_the_node,
which can be a single CPU and makes the pool's workers effectively "per-cpu
kthreads".

And the scheduler won't break affinity on such "per-cpu kthread" workers
when the CPU is going down, so we have to do it by ourselves.

We do it by introducing a new break_unbound_workers_cpumask() which is a
symmetric version of restore_unbound_workers_cpumask().   When the last
online CPU of the pool goes down, it is time to break the affinity.

The way to break affinity is to set the workers' affinity to
cpu_possible_mask, so that we preserve the same behavior as when
the scheduler breaks affinity for us.
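
An illustrative sketch of the case described above (the helper is
hypothetical, not a hunk of this patch): when the intersection of the wq's
cpumask, wq_unbound_cpumask and the node's cpumask collapses to one CPU, the
workers count as per-cpu kthreads and the scheduler leaves their affinity
alone.

static bool pool_workers_look_like_percpu_kthreads(const struct worker_pool *pool)
{
        /* a single allowed CPU makes the scheduler skip breaking affinity */
        return cpumask_weight(pool->attrs->cpumask) == 1;
}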

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Tested-by: Paul E. McKenney 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 65 +++---
 1 file changed, 62 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f2793749bd97..b012adbeff9f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5035,8 +5035,9 @@ static void rebind_workers(struct worker_pool *pool)
  *
  * An unbound pool may end up with a cpumask which doesn't have any online
  * CPUs.  When a worker of such pool get scheduled, the scheduler resets
- * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
- * online CPU before, cpus_allowed of all its workers should be restored.
- * its cpus_allowed or we had reset it earlier in break_unbound_workers_cpumask().
+ * If @cpu is in @pool's cpumask which didn't have any online CPU before,
+ * cpus_allowed of all its workers should be restored.
  */
 static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 {
@@ -5061,6 +5062,50 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
  pool->attrs->cpumask) < 0);
 }
 
+/**
+ * break_unbound_workers_cpumask - break cpumask of unbound workers
+ * @pool: unbound pool of interest
+ * @cpu: the CPU which is going down
+ *
+ * An unbound pool may end up with a cpumask which doesn't have any online
+ * CPUs.  When a worker of such pool get scheduled, the scheduler resets
+ * its cpus_allowed unless there is only one CPU in the cpus_allowed, which
+ * is the special case we need to handle on our own to avoid blocking
+ * the hotplug process or causing further harm.
+ */
+static void break_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
+{
+   struct worker *worker;
+
+   lockdep_assert_held(_pool_mutex);
+   lockdep_assert_held(_pool_attach_mutex);
+
+   /* is @cpu allowed for @pool? */
+   if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
+   return;
+
+   /*
+* is @cpu the last online for @pool?  If so, the scheduler or we
+* need to break affinity for the workers.
+*/
+   if (cpumask_intersects(pool->attrs->cpumask, wq_unbound_online_cpumask))
+   return;
+
+   /*
+* is @cpu the only possible CPU for @pool?  If not, scheduler
+* will take care of breaking affinity for the workers since the
+* workers are all non-per-cpu-kthread.  It is the usual case
+* for unbound pools/workers and we don't need to bother to do it.
+*/
+   if (cpumask_weight(pool->attrs->cpumask) > 1)
+   return;
+
+   /* as we're setting it to cpu_possible_mask, the following shouldn't fail */
+   for_each_pool_worker(worker, pool)
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+ cpu_possible_mask) < 0);
+}
+
 int workqueue_prepare_cpu(unsigned int cpu)
 {
struct worker_pool *pool;
@@ -5126,13 +5171,27 @@ int workqueue_unbound_online_cpu(unsigned int cpu)
 
 int workqueue_unbound_offline_cpu(unsigned int cpu)
 {
+   struct worker_pool *pool;
struct workqueue_struct *wq;
+   int pi;
 
-   /* update NUMA affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
cpumask_clear_cpu(cpu, wq_unbound_online_cpumask);
+
+   /* update CPU affinity of workers of unbound pools */
+   for_each_pool(pool, pi) {
+   mutex_lock(&wq_pool_attach_mutex);
+
+   if (pool->cpu < 0)
+   break_unbound_workers_cpumask(pool, cpu);
+
+   mutex_unlock(&wq_pool_attach_mutex);
+   }
+
+   /* update NUMA affinity of unbound workqueues */
list_for_each_entry(wq, &workqueues, list)
wq_update_unboun

[PATCH -tip V4 8/8] workqueue: Fix affinity of kworkers when attaching into pool

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

When worker_attach_to_pool() is called, we should not put the workers
to pool->attrs->cpumask when there is or will be no CPU online in it.

Otherwise, it may cause BUG_ON(): (quote from Valentin:)
  Per-CPU kworkers forcefully migrated away by hotplug via
  workqueue_offline_cpu() can end up spawning more kworkers via

manage_workers() -> maybe_create_worker()

  Workers created at this point will be bound using

pool->attrs->cpumask

  which in this case is wrong, as the hotplug state machine already
  migrated all pinned kworkers away from this CPU. This ends up
  triggering the BUG_ON condition is sched_cpu_dying() (i.e. there's
  a kworker enqueued on the dying rq).
(end of quote)

We need to find out where we are in the hotplug stages to determine
whether pool->attrs->cpumask is valid.  So we have to check
%POOL_DISASSOCIATED and wq_unbound_online_cpumask, which are indications
of the hotplug stages.

For the per-CPU kworker case, %POOL_DISASSOCIATED marks whether the kworkers
of the pool are bound or unbound, so it is used to detect whether
pool->attrs->cpumask is valid to use at attachment time.

For unbound workers, we should not set an online&!active cpumask on workers.
The just-introduced wq_unbound_online_cpumask has the property that a
going-down cpu is cleared earlier in it than in cpu_active_mask and a
bring-up cpu is set later in it than in cpu_active_mask.  So it is perfect
for detecting whether pool->attrs->cpumask is valid to use.

To use wq_unbound_online_cpumask in worker_attach_to_pool(), we need to
protect wq_unbound_online_cpumask with wq_pool_attach_mutex.

Cc: Qian Cai 
Cc: Peter Zijlstra 
Cc: Vincent Donnefort 
Link: 
https://lore.kernel.org/lkml/20201210163830.21514-3-valentin.schnei...@arm.com/
Link: 
https://lore.kernel.org/r/ff62e3ee994efb3620177bf7b19fab16f4866845.ca...@redhat.com
Reported-by: Qian Cai 
Reviewed-by: Valentin Schneider 
Acked-by: Tejun Heo 
Tested-by: Paul E. McKenney 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 39 ++-
 1 file changed, 30 insertions(+), 9 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index b012adbeff9f..d1f1b863c52a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -310,7 +310,7 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
 /* PL: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
-/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
+/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
 static cpumask_var_t wq_unbound_online_cpumask;
 
 /* CPU where unbound work was last round robin scheduled from this CPU */
@@ -1849,19 +1849,36 @@ static struct worker *alloc_worker(int node)
 static void worker_attach_to_pool(struct worker *worker,
   struct worker_pool *pool)
 {
+   bool pool_cpumask_active;
+
mutex_lock(&wq_pool_attach_mutex);
 
/*
-* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-* online CPUs.  It'll be re-applied when any of the CPUs come up.
+* The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED and
+* wq_unbound_online_cpumask remain stable across this function.
+* See the comments above the definitions of the flag and
+* wq_unbound_online_cpumask for details.
+*
+* For percpu pools, whether pool->attrs->cpumask is legitimate
+* for @worker task depends on where it is in the hotplug stages
+* divided by workqueue_online/offline_cpu().  Refer the functions
+* to see how they toggle %POOL_DISASSOCIATED and update cpumask
+* of the workers.
+*
+* For unbound pools, whether pool->attrs->cpumask is legitimate
+* for @worker task depends on where it is in the hotplug stages
+* divided by workqueue_unbound_online/offline_cpu().  Refer the
+* functions to see how they update wq_unbound_online_cpumask and
+* update cpumask of the workers.
 */
-   set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+   pool_cpumask_active = pool->cpu >= 0 ? !(pool->flags & POOL_DISASSOCIATED) :
+   cpumask_intersects(pool->attrs->cpumask, wq_unbound_online_cpumask);
+
+   if (pool_cpumask_active)
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0);
+   else
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
 
-   /*
-* The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
-* stable across this function.  See the comments above the flag
-* definition for details.
-*/
if (pool->flags & POOL_DISASSOCIATED)
worker->flags |= WORKER_UNBOUND;
 
@@ -5149,7 +5166

[PATCH -tip V4 4/8] workqueue: Manually break affinity on pool detachment

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

The pool->attrs->cpumask might be a single CPU and it may go
down after detachment, and the scheduler won't force-break
affinity for us since the worker is a per-cpu kthread.  So we have to
do it on our own and unbind this worker, which can't be unbound
by workqueue_offline_cpu() since it doesn't belong to any pool
after detachment.  Do it unconditionally since there is no harm
in breaking affinity for a non-per-cpu kthread and we don't need to
rely on the scheduler's policy on when to break affinity.

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Tested-by: Paul E. McKenney 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 3e92bc4f8a36..aed08eddeb83 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1888,6 +1888,19 @@ static void worker_detach_from_pool(struct worker *worker)
 
if (list_empty(&pool->workers))
detach_completion = pool->detach_completion;
+
+   /*
+* The pool->attrs->cpumask might be a single CPU and it may go
+* down after detachment, and the scheduler won't force to break
+* affinity for us since it is a per-cpu kthread.  So we have to
+* do it on our own and unbind this worker which can't be unbound
+* by workqueue_offline_cpu() since it doesn't belong to any pool
+* after detachment.  Do it unconditionally for there is no harm
+* to break affinity for a non-per-cpu kthread and we don't need to
+* rely on the scheduler's policy on when to break affinity.
+*/
+   set_cpus_allowed_ptr(worker->task, cpu_possible_mask);
+
mutex_unlock(&wq_pool_attach_mutex);
 
/* clear leftover flags without pool->lock after it is detached */
-- 
2.19.1.6.gb485710b



[PATCH -tip V4 6/8] workqueue: use wq_unbound_online_cpumask in restore_unbound_workers_cpumask()

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

restore_unbound_workers_cpumask() is called at CPU_ONLINE, where
wq_unbound_online_cpumask equals cpu_online_mask.  So no functionality
is changed.

Acked-by: Tejun Heo 
Tested-by: Paul E. McKenney 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d01cca8e51f9..f2793749bd97 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5043,6 +5043,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
static cpumask_t cpumask;
struct worker *worker;
 
+   lockdep_assert_held(_pool_mutex);
lockdep_assert_held(_pool_attach_mutex);
 
/* is @cpu allowed for @pool? */
@@ -5050,7 +5051,7 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
return;
 
/* is @cpu the only online CPU? */
-   cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
+   cpumask_and(&cpumask, pool->attrs->cpumask, wq_unbound_online_cpumask);
if (cpumask_weight(&cpumask) != 1)
return;
 
-- 
2.19.1.6.gb485710b



[PATCH -tip V4 5/8] workqueue: introduce wq_unbound_online_cpumask

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

wq_unbound_online_cpumask is the cached result of cpu_online_mask with
the going-down cpu cleared before the cpu is cleared from cpu_active_mask.
It is used to track the cpu hotplug process, so that the creation/attachment
of unbound workers can know where it is in the process when hotplug is
ongoing, and so that workqueue can cooperate correctly with hotplug and
the scheduler in later patches to set the correct cpumask for workers and
break affinity initiatively.

The first usage of wq_unbound_online_cpumask is also in this patch.
wq_calc_node_cpumask() and wq_update_unbound_numa() can be simplified
a little.
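
A hedged sketch of how the new mask would be maintained from the split
unbound hotplug callbacks (a reconstruction assuming it mirrors the offline
side shown in patch 7; not an exact hunk of this patch):

int workqueue_unbound_online_cpu(unsigned int cpu)
{
        mutex_lock(&wq_pool_mutex);
        cpumask_set_cpu(cpu, wq_unbound_online_cpumask);
        /* ... restore unbound workers' cpumask, update NUMA-affine pwqs ... */
        mutex_unlock(&wq_pool_mutex);

        return 0;
}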

Acked-by: Tejun Heo 
Tested-by: Paul E. McKenney 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 34 ++
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index aed08eddeb83..d01cca8e51f9 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -310,6 +310,9 @@ static bool workqueue_freezing; /* PL: have wqs started freezing? */
 /* PL: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
+/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
+static cpumask_var_t wq_unbound_online_cpumask;
+
 /* CPU where unbound work was last round robin scheduled from this CPU */
 static DEFINE_PER_CPU(int, wq_rr_cpu_last);
 
@@ -3835,12 +3838,10 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
  * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
  * @attrs: the wq_attrs of the default pwq of the target workqueue
  * @node: the target NUMA node
- * @cpu_going_down: if >= 0, the CPU to consider as offline
  * @cpumask: outarg, the resulting cpumask
  *
- * Calculate the cpumask a workqueue with @attrs should use on @node.  If
- * @cpu_going_down is >= 0, that cpu is considered offline during
- * calculation.  The result is stored in @cpumask.
+ * Calculate the cpumask a workqueue with @attrs should use on @node.
+ * The result is stored in @cpumask.
  *
  * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
  * enabled and @node has online CPUs requested by @attrs, the returned
@@ -3854,15 +3855,14 @@ static struct pool_workqueue *alloc_unbound_pwq(struct workqueue_struct *wq,
  * %false if equal.
  */
 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
-int cpu_going_down, cpumask_t *cpumask)
+cpumask_t *cpumask)
 {
if (!wq_numa_enabled || attrs->no_numa)
goto use_dfl;
 
/* does @node have any online CPUs @attrs wants? */
cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
-   if (cpu_going_down >= 0)
-   cpumask_clear_cpu(cpu_going_down, cpumask);
+   cpumask_and(cpumask, cpumask, wq_unbound_online_cpumask);
 
if (cpumask_empty(cpumask))
goto use_dfl;
@@ -3971,7 +3971,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
goto out_free;
 
for_each_node(node) {
-   if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
+   if (wq_calc_node_cpumask(new_attrs, node, tmp_attrs->cpumask)) {
ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
if (!ctx->pwq_tbl[node])
goto out_free;
@@ -4096,7 +4096,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
  * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
  * @wq: the target workqueue
  * @cpu: the CPU coming up or going down
- * @online: whether @cpu is coming up or going down
  *
  * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
  * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of
@@ -4114,11 +4113,9 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
  * affinity, it's the user's responsibility to flush the work item from
  * CPU_DOWN_PREPARE.
  */
-static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
-  bool online)
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu)
 {
int node = cpu_to_node(cpu);
-   int cpu_off = online ? -1 : cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
cpumask_t *cpumask;
@@ -4146,7 +4143,7 @@ static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
 * and create a new one if they don't match.  If the target cpumask
 * equals the default pwq's, the default pwq should be used.
 */
-   if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, 
cpumask)) {
+   if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, nod

[PATCH -tip V4 3/8] workqueue: use cpu_possible_mask instead of cpu_active_mask to break affinity

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

The scheduler won't break affinity for us any more, and we should
"emulate" the same behavior when the scheduler breaks affinity for
us.  The behavior is "changing the cpumask to cpu_possible_mask".

And there might be some other CPUs online later while the worker is
still running with the pending work items.  The worker should be allowed
to use the later online CPUs as before and process the work items ASAP.
If we use cpu_active_mask here, we can't achieve this goal but
using cpu_possible_mask can.

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Tested-by: Paul E. McKenney 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 38628e2a622c..3e92bc4f8a36 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4920,7 +4920,7 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
 
for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
 
mutex_unlock(&wq_pool_attach_mutex);
 
-- 
2.19.1.6.gb485710b



[PATCH -tip V4 1/8] workqueue: split cpuhotplug callbacks for unbound workqueue

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

Unbound workers are normally not per-cpu kthreads, but on cpu hotplug
we still need to update the pools of unbound workqueues based on
whether the relevant node has any CPU online, for every workqueue.

The code reuses the current cpu hotplug callbacks, which are designed for
per-cpu workqueues and do not fit well with unbound workqueues/pools/workers.

For example, workqueue_offline_cpu() runs very late; work items of an unbound
workqueue might delay the offline process, or even worse stall it
due to back-to-back work items which do not really need to be per-cpu.

It is also very bad when unbound workers are created after
sched_cpu_deactivate(): set_cpus_allowed_ptr() with an online&!active
cpumask (multiple CPUs) will trigger the warning, and nobody will deactivate
such late-spawned workers, which might cause a later BUG_ON().

Similarly, workqueue_online_cpu() runs very early; work items of an unbound
workqueue might delay the online process.  It is also very bad when
unbound workers are created before sched_cpu_activate():
set_cpus_allowed_ptr() with an online&!active cpumask (multiple CPUs) will
trigger the warning.  Commit d945b5e9f0e ("workqueue: Fix
setting affinity of unbound worker threads") fixed some cases
of the problem, leaving other cases unfixed and leaving the comment
mismatched with the fixing code.

So we need to split the cpu hotplug callbacks for unbound workqueues and
put the new callbacks in the proper places.

Normally, we could split them and put them at CPUHP_AP_ONLINE_DYN.  But that
doesn't solve the problem of set_cpus_allowed_ptr() with online&!active
cpumasks.  So we have to use an offline callback earlier than
sched_cpu_deactivate() and an online callback later than sched_cpu_activate().

This patch just introduces CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE and
splits the callbacks.  The follow-up fixes are in the later patches.

Signed-off-by: Lai Jiangshan 
---
 include/linux/cpuhotplug.h |  4 
 include/linux/workqueue.h  |  2 ++
 kernel/cpu.c   |  5 +
 kernel/workqueue.c | 36 ++--
 4 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0042ef362511..ac2103deb20b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -20,6 +20,9 @@
  *   |   ^
  *   v   |
  *  AP_ACTIVE  AP_ACTIVE
+ *   |   ^
+ *   v   |
+ *  ONLINE ONLINE
  */
 
 enum cpuhp_state {
@@ -194,6 +197,7 @@ enum cpuhp_state {
CPUHP_AP_X86_HPET_ONLINE,
CPUHP_AP_X86_KVM_CLK_ONLINE,
CPUHP_AP_ACTIVE,
+   CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE,
CPUHP_ONLINE,
 };
 
diff --git a/include/linux/workqueue.h b/include/linux/workqueue.h
index 26de0cae2a0a..98300ddee308 100644
--- a/include/linux/workqueue.h
+++ b/include/linux/workqueue.h
@@ -663,6 +663,8 @@ static inline void wq_watchdog_touch(int cpu) { }
 int workqueue_prepare_cpu(unsigned int cpu);
 int workqueue_online_cpu(unsigned int cpu);
 int workqueue_offline_cpu(unsigned int cpu);
+int workqueue_unbound_online_cpu(unsigned int cpu);
+int workqueue_unbound_offline_cpu(unsigned int cpu);
 #endif
 
 void __init workqueue_init_early(void);
diff --git a/kernel/cpu.c b/kernel/cpu.c
index 4e11e91010e1..f654ca0a104e 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -1665,6 +1665,11 @@ static struct cpuhp_step cpuhp_hp_states[] = {
.startup.single = sched_cpu_activate,
.teardown.single= sched_cpu_deactivate,
},
+   [CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE] = {
+   .name   = "workqueue_unbound:online",
+   .startup.single = workqueue_unbound_online_cpu,
+   .teardown.single= workqueue_unbound_offline_cpu,
+   },
 #endif
 
/* CPU is fully up and running. */
diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 9880b6c0e272..d7bdb7885e55 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5060,6 +5060,29 @@ int workqueue_prepare_cpu(unsigned int cpu)
 }
 
 int workqueue_online_cpu(unsigned int cpu)
+{
+   struct worker_pool *pool;
+
+   for_each_cpu_worker_pool(pool, cpu) {
+   mutex_lock(&wq_pool_attach_mutex);
+   rebind_workers(pool);
+   mutex_unlock(&wq_pool_attach_mutex);
+   }
+
+   return 0;
+}
+
+int workqueue_offline_cpu(unsigned int cpu)
+{
+   /* unbinding per-cpu workers should happen on the local CPU */
+   if (WARN_ON(cpu != smp_processor_id()))
+   return -1;
+
+   unbind_workers(cpu);
+   return 0;
+}
+
+int workqueue_unbound_online_cpu(unsigned int cpu)
 {
struct w

[PATCH -tip V4 2/8] workqueue: set pool->attr->cpumask to workers when cpu online

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

Commit d945b5e9f0e ("workqueue: Fix setting affinity of unbound worker
threads") fixed a problem of set_cpus_allowed_ptr() with an online&!active
cpumask (more than one CPU) in restore_unbound_workers_cpumask() in the
online callback.

But now the new online callback for unbound workqueues is called later
than sched_cpu_activate().  So the cpu is already set in cpu_active_mask and
set_cpus_allowed_ptr() with pool->attrs->cpumask in the code won't
cause such a warning any more.

The said commit also left the comments outdated and confusing.
It is better to change the code back.

Cc: Hillf Danton 
Reported-by: Hillf Danton 
Acked-by: Tejun Heo 
Tested-by: Paul E. McKenney 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index d7bdb7885e55..38628e2a622c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5039,11 +5039,15 @@ static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
return;
 
+   /* is @cpu the only online CPU? */
cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
+   if (cpumask_weight(&cpumask) != 1)
+   return;
 
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
+ pool->attrs->cpumask) < 0);
 }
 
 int workqueue_prepare_cpu(unsigned int cpu)
-- 
2.19.1.6.gb485710b



[PATCH -tip V4 0/8] workqueue: break affinity initiatively

2021-01-11 Thread Lai Jiangshan
From: Lai Jiangshan 

06249738a41a ("workqueue: Manually break affinity on hotplug")
said that scheduler will not force break affinity for us.

But workqueue highly depends on the old behavior. Many parts of the code
rely on it; 06249738a41a ("workqueue: Manually break affinity on hotplug")
is not enough to change that, and the commit has flaws in itself too.

It doesn't handle worker detachment.
It doesn't handle worker attachment, especially worker creation,
  which is handled by Valentin Schneider's patch [1].
It doesn't handle unbound workers which might effectively be
per-cpu kthreads.

We need to thoroughly update the way workqueue handles affinity
on cpu hot[un]plug, which is what this patchset intends to do; it also
replaces Valentin Schneider's patch [1].  The equivalent patch
is patch 8 here.

The patchset is based on tip/master rather than workqueue tree,
because the patchset is a complement for 06249738a41a ("workqueue:
Manually break affinity on hotplug") which is only in tip/master by now.

And TJ acked to route the series through tip.

Changed from V3:
split hotplug callbacks

introduce break_unbound_workers_cpumask() rather than reusing
restore_unbound_workers_cpumask().

Changed from V2:
Drop V2's patch4, which causes warning about setting cpumask
online&!active to kthread reported by several people:
Dexuan Cui 
kernel test robot 

Drop V2's patch 1, which can also cause a warning about setting an
online&!active cpumask on a kthread.  restore_unbound_workers_cpumask()
is changed when we are bringing a cpu online.  And that causes V2's patch 7
(V3's patch 5) to be changed accordingly.

Marked patch8 Reviewed-by: Valentin Schneider 


Changed from V1:
Add TJ's acked-by for the whole patchset

Add more words to the comments and the changelog, mainly derived
from discussion with Peter.

Update the comments as TJ suggested.

Update a line of code as Valentin suggested.

Add Valentin's ack for patch 10 because "Seems alright to me." and
add Valentin's comments to the changelog which is integral.

[1]: 
https://lore.kernel.org/r/ff62e3ee994efb3620177bf7b19fab16f4866845.ca...@redhat.com
[V1 patchset]: 
https://lore.kernel.org/lkml/20201214155457.3430-1-jiangshan...@gmail.com/
[V2 patchset]: 
https://lore.kernel.org/lkml/20201218170919.2950-1-jiangshan...@gmail.com/
[V3 patchset]: 
https://lore.kernel.org/lkml/20201226025117.2770-1-jiangshan...@gmail.com/

Lai Jiangshan (8):
  workqueue: split cpuhotplug callbacks for unbound workqueue
  workqueue: set pool->attr->cpumask to workers when cpu online
  workqueue: use cpu_possible_mask instead of cpu_active_mask to break
affinity
  workqueue: Manually break affinity on pool detachment
  workqueue: introduce wq_unbound_online_cpumask
  workqueue: use wq_unbound_online_cpumask in
restore_unbound_workers_cpumask()
  workqueue: Manually break affinity on hotplug for unbound pool
  workqueue: Fix affinity of kworkers when attaching into pool

 include/linux/cpuhotplug.h |   4 +
 include/linux/workqueue.h  |   2 +
 kernel/cpu.c   |   5 +
 kernel/workqueue.c | 192 +
 4 files changed, 165 insertions(+), 38 deletions(-)

-- 
2.19.1.6.gb485710b



Re: [PATCH -tip V3 3/8] workqueue: introduce wq_online_cpumask

2021-01-05 Thread Lai Jiangshan
On Tue, Jan 5, 2021 at 10:37 PM Lai Jiangshan  wrote:
>
> On Tue, Jan 5, 2021 at 9:17 PM Peter Zijlstra  wrote:
> >
> > On Tue, Jan 05, 2021 at 04:23:44PM +0800, Lai Jiangshan wrote:
> > > On Tue, Jan 5, 2021 at 10:41 AM Lai Jiangshan  
> > > wrote:
> > > > On Mon, Jan 4, 2021 at 9:56 PM Peter Zijlstra  
> > > > wrote:
> > > > > On Sat, Dec 26, 2020 at 10:51:11AM +0800, Lai Jiangshan wrote:
> > > > > > From: Lai Jiangshan 
> > > > > >
> > > > > > wq_online_cpumask is the cached result of cpu_online_mask with the
> > > > > > going-down cpu cleared.
> > > > >
> > > > > You can't use cpu_active_mask ?
> > > >
> > > > When a cpu is going out:
> > > > (cpu_active_mask is not protected by workqueue mutexs.)
> >
> > But it is protected by the hotplug lock, which is really all you need
> > afaict.
> >
> > If the worker thread gets spawned before workqueue_offline_cpu(), said
> > function will observe it and adjust the mask, if it gets spawned after
> > it, it must observe a 'reduced' cpu_active_mask.
>
> Making the workqueue set workers' cpumask correctly is easy.
> The hard part is how to suppress the warning.
>
> It is true that said function will observe it and adjust the mask,
> but the warning is already issued.
>
> >
> > > >
> > > > create_worker() for unbound pool  |  cpu offlining
> > > > check cpu_active_mask |
> > > check wq_online_cpumask
> > > >   |  remove bit from cpu_active_mask
> > > >   |  no cpu in pool->attrs->cpumask is 
> > > > active
> > > > set pool->attrs->cpumask to worker|
> > > > and hit the warning
> > > |  remove bit from wq_online_cpumask
> > >
> > > Even with the help of wq_online_cpumask, the patchset can't silence
> > > the warning in __set_cpus_allowed_ptr() in this case.  It is indeed
> > > hard to suppress the warning for unbound pools.  Maybe we need something
> > > like this (outmost callback of CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE,
> > > so that workqueue can do preparation when offlining before AP_ACTIVE):
> > >
> > > diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> > > index 0042ef362511..ac2103deb20b 100644
> > > --- a/include/linux/cpuhotplug.h
> > > +++ b/include/linux/cpuhotplug.h
> > > @@ -20,6 +20,9 @@
> > >   *   |   ^
> > >   *   v   |
> > >   *  AP_ACTIVE  AP_ACTIVE
> > > + *   |   ^
> > > + *   v   |
> > > + *  ONLINE ONLINE
> > >   */
> > >
> > >  enum cpuhp_state {
> > > @@ -194,6 +197,7 @@ enum cpuhp_state {
> > > CPUHP_AP_X86_HPET_ONLINE,
> > > CPUHP_AP_X86_KVM_CLK_ONLINE,
> > > CPUHP_AP_ACTIVE,
> > > +   CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE,
> > > CPUHP_ONLINE,
> > >  };
> > >
> >
> > That's waay to late, by then userspace is long running and expecting
> > things to 'just-work'.
>
> I don't like this way either, I just list three ways I can think of.
> I prefer the way that __set_cpus_allowed_ptr() doesn't warn
> for kworkers.
>
> >
> > But afaict, things will mostly work for you when you use cpu_active_mask
> > on cpu-down and cpu_online_mask on cpu-up.
> >
> > But I think I see the problem, it is spawning a new worker after
> > workqueue_online_cpu() but before sched_cpu_activate(), right? That
> > wants to have the wider mask set.
> >
> > To solve that, the spawning of workers thing needs to know where we are
> > in the hotplug process, and it can track that using
> > workqueue_{on,off}line_cpu(). If it happens after offline, it needs to
> > use cpu_active_mask, if it happens after online cpu_online_mask is your
> > guy.
> >
> > Does that make sense?
>
> There are six stages we need to know when spawning a worker:
>
> stageA ap_deactive stageB workqueue_offline stageC
> stageD workqueue_online stageE ap_active stageF
>
> I don't think create_worker()/worker_attach_to_pool() can know where
> it is in the hotplug process unless it uses get_online_cpus() so that
> it knows it is not in the hotplug process.  There is no way to maintain
> needed information since there are no workqueue callbacks in the proper
> stages in the hotplug process.
>
> Again, making the workqueue set workers' cpumask correctly is easy.
> But we can't distinguish stageA or stageE to suppress the warning
> in __set_cpus_allowed_ptr() for new unbound workers when pool->attr->cpumask
> has only one cpu online&!active since there is no way to keep
> cpu_active_mask stable except get_online_cpus().

Correction: when pool->attrs->cpumask has multiple cpus but only one cpu online&!active.


Re: [PATCH -tip V3 3/8] workqueue: introduce wq_online_cpumask

2021-01-05 Thread Lai Jiangshan
On Tue, Jan 5, 2021 at 9:17 PM Peter Zijlstra  wrote:
>
> On Tue, Jan 05, 2021 at 04:23:44PM +0800, Lai Jiangshan wrote:
> > On Tue, Jan 5, 2021 at 10:41 AM Lai Jiangshan  
> > wrote:
> > > On Mon, Jan 4, 2021 at 9:56 PM Peter Zijlstra  
> > > wrote:
> > > > On Sat, Dec 26, 2020 at 10:51:11AM +0800, Lai Jiangshan wrote:
> > > > > From: Lai Jiangshan 
> > > > >
> > > > > wq_online_cpumask is the cached result of cpu_online_mask with the
> > > > > going-down cpu cleared.
> > > >
> > > > You can't use cpu_active_mask ?
> > >
> > > When a cpu is going out:
> > > (cpu_active_mask is not protected by workqueue mutexs.)
>
> But it is protected by the hotplug lock, which is really all you need
> afaict.
>
> If the worker thread gets spawned before workqueue_offline_cpu(), said
> function will observe it and adjust the mask, if it gets spawned after
> it, it must observe a 'reduced' cpu_active_mask.

Making the workqueue set workers' cpumask correctly is easy.
The hard part is how to suppress the warning.

It is true that said function will observe it and adjust the mask,
but the warning is already issued.

>
> > >
> > > create_worker() for unbound pool  |  cpu offlining
> > > check cpu_active_mask |
> > check wq_online_cpumask
> > >   |  remove bit from cpu_active_mask
> > >   |  no cpu in pool->attrs->cpumask is 
> > > active
> > > set pool->attrs->cpumask to worker|
> > > and hit the warning
> > |  remove bit from wq_online_cpumask
> >
> > Even with the help of wq_online_cpumask, the patchset can't silence
> > the warning in __set_cpus_allowed_ptr() in this case.  It is indeed
> > hard to suppress the warning for unbound pools.  Maybe we need something
> > like this (outmost callback of CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE,
> > so that workqueue can do preparation when offlining before AP_ACTIVE):
> >
> > diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
> > index 0042ef362511..ac2103deb20b 100644
> > --- a/include/linux/cpuhotplug.h
> > +++ b/include/linux/cpuhotplug.h
> > @@ -20,6 +20,9 @@
> >   *   |   ^
> >   *   v   |
> >   *  AP_ACTIVE  AP_ACTIVE
> > + *   |   ^
> > + *   v   |
> > + *  ONLINE ONLINE
> >   */
> >
> >  enum cpuhp_state {
> > @@ -194,6 +197,7 @@ enum cpuhp_state {
> > CPUHP_AP_X86_HPET_ONLINE,
> > CPUHP_AP_X86_KVM_CLK_ONLINE,
> > CPUHP_AP_ACTIVE,
> > +   CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE,
> > CPUHP_ONLINE,
> >  };
> >
>
> That's waay to late, by then userspace is long running and expecting
> things to 'just-work'.

I don't like this way either, I just listed three ways I can think of.
I prefer the way that __set_cpus_allowed_ptr() doesn't warn
for kworkers.

>
> But afaict, things will mostly work for you when you use cpu_active_mask
> on cpu-down and cpu_online_mask on cpu-up.
>
> But I think I see the problem, it is spawning a new worker after
> workqueue_online_cpu() but before sched_cpu_activate(), right? That
> wants to have the wider mask set.
>
> To solve that, the spawning of workers thing needs to know where we are
> in the hotplug process, and it can track that using
> workqueue_{on,off}line_cpu(). If it happens after offline, it needs to
> use cpu_active_mask, if it happens after online cpu_online_mask is your
> guy.
>
> Does that make sense?

There are six stages we need to know when spawning a worker:

stageA ap_deactive stageB workqueue_offline stageC
stageD workqueue_online stageE ap_active stageF

I don't think create_worker()/worker_attach_to_pool() can know where
it is in the hotplug process unless it uses get_online_cpus() so that
it knows it is not in the hotplug process.  There is no way to maintain the
needed information since there are no workqueue callbacks at the proper
stages of the hotplug process.

Again, making the workqueue set workers' cpumask correctly is easy.
But we can't distinguish stageA from stageE to suppress the warning
in __set_cpus_allowed_ptr() for new unbound workers when pool->attrs->cpumask
has only one cpu online&!active, since there is no way to keep
cpu_active_mask stable except with get_online_cpus().


Re: [PATCH -tip V3 3/8] workqueue: introduce wq_online_cpumask

2021-01-05 Thread Lai Jiangshan
On Tue, Jan 5, 2021 at 10:41 AM Lai Jiangshan  wrote:
>
> On Mon, Jan 4, 2021 at 9:56 PM Peter Zijlstra  wrote:
> >
> > On Sat, Dec 26, 2020 at 10:51:11AM +0800, Lai Jiangshan wrote:
> > > From: Lai Jiangshan 
> > >
> > > wq_online_cpumask is the cached result of cpu_online_mask with the
> > > going-down cpu cleared.
> >
> > You can't use cpu_active_mask ?
>
>
> When a cpu is going out:
> (cpu_active_mask is not protected by workqueue mutexs.)
>
> create_worker() for unbound pool  |  cpu offlining
> check cpu_active_mask |
check wq_online_cpumask
>   |  remove bit from cpu_active_mask
>   |  no cpu in pool->attrs->cpumask is active
> set pool->attrs->cpumask to worker|
> and hit the warning
|  remove bit from wq_online_cpumask

Even with the help of wq_online_cpumask, the patchset can't silence
the warning in __set_cpus_allowed_ptr() in this case.  It is indeed
hard to suppress the warning for unbound pools.  Maybe we need something
like this (outermost callback of CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE,
so that workqueue can do preparation when offlining before AP_ACTIVE):

diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 0042ef362511..ac2103deb20b 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -20,6 +20,9 @@
  *   |   ^
  *   v   |
  *  AP_ACTIVE  AP_ACTIVE
+ *   |   ^
+ *   v   |
+ *  ONLINE ONLINE
  */

 enum cpuhp_state {
@@ -194,6 +197,7 @@ enum cpuhp_state {
CPUHP_AP_X86_HPET_ONLINE,
CPUHP_AP_X86_KVM_CLK_ONLINE,
CPUHP_AP_ACTIVE,
+   CPUHP_AP_WORKQUEUE_UNBOUND_ONLINE,
CPUHP_ONLINE,
 };


The other way is to modify __set_cpus_allowed_ptr() to suppress the
warning for kworkers and believe/let the workqueue handle cpumask correctly.

And the third way is to use get_online_cpus() in worker_attach_to_pool(),
which might delay work items being processed during cpu hotplug and might
be dangerous when someone calls flush_work() in cpu hotplug callbacks.
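A rough sketch of that third way, for illustration only (not a proposed
patch, and the ordering of get_online_cpus() against wq_pool_attach_mutex
is an assumption of the sketch):

static void worker_attach_to_pool(struct worker *worker,
				  struct worker_pool *pool)
{
	get_online_cpus();			/* block cpu hot[un]plug */
	mutex_lock(&wq_pool_attach_mutex);

	/* cpu_online_mask cannot change here, so this check cannot race */
	if (cpumask_intersects(pool->attrs->cpumask, cpu_online_mask))
		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
						  pool->attrs->cpumask) < 0);
	else
		WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task,
						  cpu_possible_mask) < 0);

	if (pool->flags & POOL_DISASSOCIATED)
		worker->flags |= WORKER_UNBOUND;

	list_add_tail(&worker->node, &pool->workers);
	worker->pool = pool;

	mutex_unlock(&wq_pool_attach_mutex);
	put_online_cpus();	/* holding the hotplug lock is what can delay hotplug */
}

The cost is exactly the one mentioned above: attaching now waits on the
hotplug lock, so it can be delayed by a hotplug operation, and a hotplug
callback that flushes work needing a new worker could deadlock.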

Any thoughts?

Thanks,
Lai

>
>
> And when a cpu is onlining, there may be some workers which were just created
> after the workqueue hotplug callback is finished but before cpu_active_mask
> was updated. workqueue has not call back after cpu_active_mask updated and
> these workers' cpumask is not updated.
>
> For percpu workers, these problems can be handled with the help of
> POOL_DISASSOCIATED which is protected by workqueue mutexs and the
> help of sched/core.c which doesn't warn when per-cpu-kthread.
>
> For unbound workers, the way to handle it without using wq_online_cpumask
> is much more complex when a cpu is going out.


Re: [PATCH -tip V3 3/8] workqueue: introduce wq_online_cpumask

2021-01-04 Thread Lai Jiangshan
On Tue, Jan 5, 2021 at 10:41 AM Lai Jiangshan  wrote:
>
> On Mon, Jan 4, 2021 at 9:56 PM Peter Zijlstra  wrote:
> >
> > On Sat, Dec 26, 2020 at 10:51:11AM +0800, Lai Jiangshan wrote:
> > > From: Lai Jiangshan 
> > >
> > > wq_online_cpumask is the cached result of cpu_online_mask with the
> > > going-down cpu cleared.
> >
> > You can't use cpu_active_mask ?
>
>
> When a cpu is going out:
> (cpu_active_mask is not protected by workqueue mutexs.)
>
> create_worker() for unbound pool  |  cpu offlining
> check cpu_active_mask |
>   |  remove bit from cpu_active_mask
>   |  no cpu in pool->attrs->cpumask is active
> set pool->attrs->cpumask to worker|
> and hit the warning
>
>
> And when a cpu is onlining, there may be some workers which were just created
> after the workqueue hotplug callback is finished but before cpu_active_mask
> was updated. workqueue has not call back after cpu_active_mask updated and
> these workers' cpumask is not updated.
>
> For percpu workers, these problems can be handled with the help of
> POOL_DISASSOCIATED which is protected by workqueue mutexs and the
> help of sched/core.c which doesn't warn when per-cpu-kthread.
>
> For unbound workers, the way to handle it without using wq_online_cpumask
> is much more complex when a cpu is going out.

To have replied too soon, let me think about it again.


Re: [PATCH -tip V3 3/8] workqueue: introduce wq_online_cpumask

2021-01-04 Thread Lai Jiangshan
On Mon, Jan 4, 2021 at 9:56 PM Peter Zijlstra  wrote:
>
> On Sat, Dec 26, 2020 at 10:51:11AM +0800, Lai Jiangshan wrote:
> > From: Lai Jiangshan 
> >
> > wq_online_cpumask is the cached result of cpu_online_mask with the
> > going-down cpu cleared.
>
> You can't use cpu_active_mask ?


When a cpu is going out:
(cpu_active_mask is not protected by the workqueue mutexes.)

create_worker() for unbound pool  |  cpu offlining
check cpu_active_mask |
  |  remove bit from cpu_active_mask
  |  no cpu in pool->attrs->cpumask is active
set pool->attrs->cpumask to worker|
and hit the warning


And when a cpu is onlining, there may be some workers which were just created
after the workqueue hotplug callback has finished but before cpu_active_mask
was updated.  workqueue has no callback after cpu_active_mask is updated, and
these workers' cpumask is not updated.

For percpu workers, these problems can be handled with the help of
POOL_DISASSOCIATED, which is protected by the workqueue mutexes, and the
help of sched/core.c, which doesn't warn for per-cpu kthreads.
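
For comparison, the percpu case works roughly like this (a simplified
sketch of the idea, not the exact mainline code):

	mutex_lock(&wq_pool_attach_mutex);
	if (pool->flags & POOL_DISASSOCIATED) {
		/* the pool's CPU is offline or going down: don't bind to it */
		set_cpus_allowed_ptr(worker->task, cpu_possible_mask);
		worker->flags |= WORKER_UNBOUND;
	} else {
		/*
		 * POOL_DISASSOCIATED is only flipped under this mutex on the
		 * hotplug path, so binding to the pool's cpumask here cannot
		 * race with the unbind step, and sched/core.c does not warn
		 * for per-cpu kthreads anyway.
		 */
		set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
	}
	mutex_unlock(&wq_pool_attach_mutex);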

For unbound workers, the way to handle it without using wq_online_cpumask
is much more complex when a cpu is going out.


Re: [PATCH -tip V3 8/8] workqueue: Fix affinity of kworkers when attaching into pool

2020-12-29 Thread Lai Jiangshan
On Tue, Dec 29, 2020 at 6:06 PM Hillf Danton  wrote:
>
> On Sat, 26 Dec 2020 10:51:16 +0800
> > From: Lai Jiangshan 
> >
> > When worker_attach_to_pool() is called, we should not put the workers
> > to pool->attrs->cpumask when there is not CPU online in it.
> >
> > We have to use wq_online_cpumask in worker_attach_to_pool() to check
> > if pool->attrs->cpumask is valid rather than cpu_online_mask or
> > cpu_active_mask due to gaps between stages in cpu hot[un]plug.
>
> In 5/8 pool->attrs->cpumask is not restored to avoid triggering
> the warning added in e9d867a67fd03ccc ("sched: Allow
> per-cpu kernel threads to run on online && !active"), is it likely
> needed to repeat that trick here?
> Is the above gap no longer existing here at the presence of
> wq_online_cpumask?

It still exists.  When onlining, wq_online_cpumask is always equal to
cpu_online_mask, so nothing has changed.

An alternative way is to move the code into a work item, which adds
the proper protection against cpu hotplug and does the work.

I don't want to add too much complexity in this patchset.
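
Purely as an illustration of that alternative (attach_req and
attach_work_fn are made-up names for the sketch):

struct attach_req {
	struct work_struct	work;
	struct task_struct	*task;
	struct worker_pool	*pool;
};

static void attach_work_fn(struct work_struct *work)
{
	struct attach_req *req = container_of(work, struct attach_req, work);

	/* this runs in a context where pinning cpu hotplug is safe */
	get_online_cpus();
	if (cpumask_intersects(req->pool->attrs->cpumask, cpu_online_mask))
		set_cpus_allowed_ptr(req->task, req->pool->attrs->cpumask);
	else
		set_cpus_allowed_ptr(req->task, cpu_possible_mask);
	put_online_cpus();
}

worker_attach_to_pool() would queue such an item instead of touching the
cpumask itself; that extra indirection is the complexity being avoided here.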


Re: [PATCH -tip V2 00/10] workqueue: break affinity initiatively

2020-12-27 Thread Lai Jiangshan
On Sat, Dec 26, 2020 at 10:52 PM Paul E. McKenney  wrote:

> >
> > Can you please specify a bit what you encountered in rcutorture
> > before this patchset? You know we cant have a correct estimation
> > of the fix diameter without your help.

>
> It triggers the following in sched_cpu_dying() in kernel/sched/core.c,
> exactly the same as for Lai Jiangshan:
>
> BUG_ON(rq->nr_running != 1 || rq_has_pinned_tasks(rq))
>
> Which is in fact the "this" in my earlier "rcutorture hits this".  ;-)
>
> Thanx, Paul
>

Hi, Hillf

https://lkml.org/lkml/2020/12/22/141

From the email, I think rcutorture encountered the same problem.

Hi, Paul

I'm sorry I forgot to add your Tested-by.

Thanks
Lai


Re: [PATCH -tip V3 5/8] workqueue: Manually break affinity on hotplug for unbound pool

2020-12-27 Thread Lai Jiangshan
On Sat, Dec 26, 2020 at 6:16 PM Hillf Danton  wrote:
>
> Sat, 26 Dec 2020 10:51:13 +0800
> > From: Lai Jiangshan 
> >
> > There is possible that a per-node pool/woker's affinity is a single
> > CPU.  It can happen when the workqueue user changes the cpumask of the
> > workqueue or when wq_unbound_cpumask is changed by system adim via
> > /sys/devices/virtual/workqueue/cpumask.  And pool->attrs->cpumask
> > is workqueue's cpumask & wq_unbound_cpumask & possible_cpumask_of_the_node,
> > which can be a single CPU and makes the pool's workers to be "per cpu
> > kthread".
> >
> > And it can also happen when the cpu is the first online and has been
> > the only online cpu in pool->attrs->cpumask.  In this case, the worker
> > task cpumask is single cpu no matter what pool->attrs->cpumask since
> > commit d945b5e9f0e3 ("workqueue: Fix setting affinity of unbound worker
> > threads").
> >
> > And the scheduler won't break affinity on the "per cpu kthread" workers
> > when the CPU is going down, so we have to do it by our own.
> >
> > We do it by reusing existing restore_unbound_workers_cpumask() and rename
> > it to update_unbound_workers_cpumask().  When the number of the online
> > CPU of the pool goes from 1 to 0, we break the affinity initiatively.
> >
> > Note here, we even break the affinity for non-per-cpu-kthread workers,
> > because first, the code path is slow path which is not worth too much to
> > optimize, second, we don't need to rely on the code/conditions when the
> > scheduler forces breaking affinity for us.
> >
> > The way to break affinity is to set the workers' affinity to
> > cpu_possible_mask, so that we preserve the same behavisor when
> > the scheduler breaks affinity for us.
> >
> > Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
> > Acked-by: Tejun Heo 
> > Signed-off-by: Lai Jiangshan 
> > ---
> >  kernel/workqueue.c | 48 ++
> >  1 file changed, 40 insertions(+), 8 deletions(-)
> >
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index 0a95ae14d46f..79cc87df0cda 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -5019,16 +5019,18 @@ static void rebind_workers(struct worker_pool *pool)
> >  }
> >
> >  /**
> > - * restore_unbound_workers_cpumask - restore cpumask of unbound workers
> > + * update_unbound_workers_cpumask - update cpumask of unbound workers
> >   * @pool: unbound pool of interest
> > - * @cpu: the CPU which is coming up
> > + * @online: whether @cpu is coming up or going down
> > + * @cpu: the CPU which is coming up or going down
> >   *
> >   * An unbound pool may end up with a cpumask which doesn't have any online
> > - * CPUs.  When a worker of such pool get scheduled, the scheduler resets
> > - * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
> > - * online CPU before, cpus_allowed of all its workers should be restored.
> > + * CPUs.  We have to reset workers' cpus_allowed of such pool.  And we
> > + * restore the workers' cpus_allowed when the pool's cpumask has online
> > + * CPU.
> >   */
> > -static void restore_unbound_workers_cpumask(struct worker_pool *pool, int 
> > cpu)
> > +static void update_unbound_workers_cpumask(struct worker_pool *pool,
> > +bool online, int cpu)
> >  {
> >   static cpumask_t cpumask;
> >   struct worker *worker;
> > @@ -5042,6 +5044,23 @@ static void restore_unbound_workers_cpumask(struct 
> > worker_pool *pool, int cpu)
> >
> >   cpumask_and(&cpumask, pool->attrs->cpumask, wq_online_cpumask);
> >
> > + if (!online) {
> > + if (cpumask_weight(&cpumask) > 0)
> > + return;
>
> We can apply the weight check also to the online case.
>
> > + /*
> > +  * All unbound workers can be possibly "per cpu kthread"
> > +  * if this is the only online CPU in pool->attrs->cpumask
> > +  * from the last time it has been brought up until now.
> > +  * And the scheduler won't break affinity on the "per cpu
> > +  * kthread" workers when the CPU is going down, so we have
> > +  * to do it by our own.
> > +  */
> > + for_each_pool_worker(worker, pool)
> > + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
>

[PATCH -tip V3 1/8] workqueue: use cpu_possible_mask instead of cpu_active_mask to break affinity

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

The scheduler won't break affinity for us any more, and we should
"emulate" the same behavior when the scheduler breaks affinity for
us.  The behavior is "changing the cpumask to cpu_possible_mask".

And there might be some other CPUs online later while the worker is
still running with the pending work items.  The worker should be allowed
to use the later online CPUs as before and process the work items ASAP.
If we use cpu_active_mask here, we can't achieve this goal but
using cpu_possible_mask can.

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c71da2a59e12..f2b8f3d458d1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4910,7 +4910,7 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
 
for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_active_mask) < 0);
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
 
mutex_unlock(&wq_pool_attach_mutex);
 
-- 
2.19.1.6.gb485710b



[PATCH -tip V3 7/8] workqueue: reorganize workqueue_offline_cpu() unbind_workers()

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

Just move around the code, no functionality changed.
Only wq_pool_attach_mutex protected region becomes a little larger.

It prepares for later patch protecting wq_online_cpumask
in wq_pool_attach_mutex.

Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 90 +++---
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 94545e6feda5..dd32398edf55 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4896,61 +4896,57 @@ void wq_worker_comm(char *buf, size_t size, struct 
task_struct *task)
  * cpu comes back online.
  */
 
-static void unbind_workers(int cpu)
+static void unbind_workers(struct worker_pool *pool)
 {
-   struct worker_pool *pool;
struct worker *worker;
 
-   for_each_cpu_worker_pool(pool, cpu) {
-   mutex_lock(&wq_pool_attach_mutex);
-   raw_spin_lock_irq(&pool->lock);
+   lockdep_assert_held(&wq_pool_attach_mutex);
 
-   /*
-* We've blocked all attach/detach operations. Make all workers
-* unbound and set DISASSOCIATED.  Before this, all workers
-* except for the ones which are still executing works from
-* before the last CPU down must be on the cpu.  After
-* this, they may become diasporas.
-*/
-   for_each_pool_worker(worker, pool)
-   worker->flags |= WORKER_UNBOUND;
+   raw_spin_lock_irq(&pool->lock);
 
-   pool->flags |= POOL_DISASSOCIATED;
+   /*
+* We've blocked all attach/detach operations. Make all workers
+* unbound and set DISASSOCIATED.  Before this, all workers
+* except for the ones which are still executing works from
+* before the last CPU down must be on the cpu.  After
+* this, they may become diasporas.
+*/
+   for_each_pool_worker(worker, pool)
+   worker->flags |= WORKER_UNBOUND;
 
-   raw_spin_unlock_irq(&pool->lock);
+   pool->flags |= POOL_DISASSOCIATED;
 
-   for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+   raw_spin_unlock_irq(&pool->lock);
 
-   mutex_unlock(&wq_pool_attach_mutex);
+   for_each_pool_worker(worker, pool)
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
 
-   /*
-* Call schedule() so that we cross rq->lock and thus can
-* guarantee sched callbacks see the %WORKER_UNBOUND flag.
-* This is necessary as scheduler callbacks may be invoked
-* from other cpus.
-*/
-   schedule();
+   /*
+* Call schedule() so that we cross rq->lock and thus can
+* guarantee sched callbacks see the %WORKER_UNBOUND flag.
+* This is necessary as scheduler callbacks may be invoked
+* from other cpus.
+*/
+   schedule();
 
-   /*
-* Sched callbacks are disabled now.  Zap nr_running.
-* After this, nr_running stays zero and need_more_worker()
-* and keep_working() are always true as long as the
-* worklist is not empty.  This pool now behaves as an
-* unbound (in terms of concurrency management) pool which
-* are served by workers tied to the pool.
-*/
-   atomic_set(&pool->nr_running, 0);
+   /*
+* Sched callbacks are disabled now.  Zap nr_running.
+* After this, nr_running stays zero and need_more_worker()
+* and keep_working() are always true as long as the
+* worklist is not empty.  This pool now behaves as an
+* unbound (in terms of concurrency management) pool which
+* are served by workers tied to the pool.
+*/
+   atomic_set(&pool->nr_running, 0);
 
-   /*
-* With concurrency management just turned off, a busy
-* worker blocking could lead to lengthy stalls.  Kick off
-* unbound chain execution of currently pending work items.
-*/
-   raw_spin_lock_irq(&pool->lock);
-   wake_up_worker(pool);
-   raw_spin_unlock_irq(&pool->lock);
-   }
+   /*
+* With concurrency management just turned off, a busy
+* worker blocking could lead to lengthy stalls.  Kick off
+* unbound chain execution of currently pending work items.
+*/
+   raw_spin_lock_irq(&pool->lock);
+   wake_up_worker(pool);
+   raw_spin_unlock_irq(&pool->lock);
 }
 
 /**
@@ -5122,7 +5118,11 @@ int workqueue_offline_cpu(unsigned int cpu)
if (WARN_ON(cpu != smp_processor_id()))
return -1;
 
-   unb

[PATCH -tip V3 6/8] workqueue: reorganize workqueue_online_cpu()

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

Just move around the code, no functionality changed.

It prepares for later patch protecting wq_online_cpumask
in wq_pool_attach_mutex.

Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 79cc87df0cda..94545e6feda5 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5088,12 +5088,17 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_lock(&wq_pool_mutex);
cpumask_set_cpu(cpu, wq_online_cpumask);
 
+   for_each_cpu_worker_pool(pool, cpu) {
+   mutex_lock(&wq_pool_attach_mutex);
+   rebind_workers(pool);
+   mutex_unlock(&wq_pool_attach_mutex);
+   }
+
+   /* update CPU affinity of workers of unbound pools */
for_each_pool(pool, pi) {
mutex_lock(&wq_pool_attach_mutex);
 
-   if (pool->cpu == cpu)
-   rebind_workers(pool);
-   else if (pool->cpu < 0)
+   if (pool->cpu < 0)
update_unbound_workers_cpumask(pool, true, cpu);
 
mutex_unlock(&wq_pool_attach_mutex);
-- 
2.19.1.6.gb485710b



[PATCH -tip V3 5/8] workqueue: Manually break affinity on hotplug for unbound pool

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

It is possible that a per-node pool/worker's affinity is a single
CPU.  It can happen when the workqueue user changes the cpumask of the
workqueue or when wq_unbound_cpumask is changed by the system admin via
/sys/devices/virtual/workqueue/cpumask.  And pool->attrs->cpumask
is workqueue's cpumask & wq_unbound_cpumask & possible_cpumask_of_the_node,
which can be a single CPU and makes the pool's workers "per cpu
kthread".

And it can also happen when the cpu is the first online and has been
the only online cpu in pool->attrs->cpumask.  In this case, the worker
task's cpumask is a single cpu no matter what pool->attrs->cpumask is, since
commit d945b5e9f0e3 ("workqueue: Fix setting affinity of unbound worker
threads").

And the scheduler won't break affinity on the "per cpu kthread" workers
when the CPU is going down, so we have to do it by our own.

We do it by reusing existing restore_unbound_workers_cpumask() and rename
it to update_unbound_workers_cpumask().  When the number of the online
CPU of the pool goes from 1 to 0, we break the affinity initiatively.

Note here, we even break the affinity for non-per-cpu-kthread workers,
because first, the code path is slow path which is not worth too much to
optimize, second, we don't need to rely on the code/conditions when the
scheduler forces breaking affinity for us.

The way to break affinity is to set the workers' affinity to
cpu_possible_mask, so that we preserve the same behavior when
the scheduler breaks affinity for us.

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 48 ++
 1 file changed, 40 insertions(+), 8 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 0a95ae14d46f..79cc87df0cda 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5019,16 +5019,18 @@ static void rebind_workers(struct worker_pool *pool)
 }
 
 /**
- * restore_unbound_workers_cpumask - restore cpumask of unbound workers
+ * update_unbound_workers_cpumask - update cpumask of unbound workers
  * @pool: unbound pool of interest
- * @cpu: the CPU which is coming up
+ * @online: whether @cpu is coming up or going down
+ * @cpu: the CPU which is coming up or going down
  *
  * An unbound pool may end up with a cpumask which doesn't have any online
- * CPUs.  When a worker of such pool get scheduled, the scheduler resets
- * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
- * online CPU before, cpus_allowed of all its workers should be restored.
+ * CPUs.  We have to reset workers' cpus_allowed of such pool.  And we
+ * restore the workers' cpus_allowed when the pool's cpumask has online
+ * CPU.
  */
-static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
+static void update_unbound_workers_cpumask(struct worker_pool *pool,
+  bool online, int cpu)
 {
static cpumask_t cpumask;
struct worker *worker;
@@ -5042,6 +5044,23 @@ static void restore_unbound_workers_cpumask(struct 
worker_pool *pool, int cpu)
 
cpumask_and(&cpumask, pool->attrs->cpumask, wq_online_cpumask);
 
+   if (!online) {
+   if (cpumask_weight(&cpumask) > 0)
+   return;
+   /*
+* All unbound workers can be possibly "per cpu kthread"
+* if this is the only online CPU in pool->attrs->cpumask
+* from the last time it has been brought up until now.
+* And the scheduler won't break affinity on the "per cpu
+* kthread" workers when the CPU is going down, so we have
+* to do it by our own.
+*/
+   for_each_pool_worker(worker, pool)
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
+
+   return;
+   }
+
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, pool)
WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
@@ -5075,7 +5094,7 @@ int workqueue_online_cpu(unsigned int cpu)
if (pool->cpu == cpu)
rebind_workers(pool);
else if (pool->cpu < 0)
-   restore_unbound_workers_cpumask(pool, cpu);
+   update_unbound_workers_cpumask(pool, true, cpu);
 
mutex_unlock(&wq_pool_attach_mutex);
}
@@ -5090,7 +5109,9 @@ int workqueue_online_cpu(unsigned int cpu)
 
 int workqueue_offline_cpu(unsigned int cpu)
 {
+   struct worker_pool *pool;
struct workqueue_struct *wq;
+   int pi;
 
/* unbinding per-cpu workers should happen on the local CPU */
if (WARN_ON(cpu != smp_processor_id()))

[PATCH -tip V3 8/8] workqueue: Fix affinity of kworkers when attaching into pool

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

When worker_attach_to_pool() is called, we should not put the workers
to pool->attrs->cpumask when there is no CPU online in it.

We have to use wq_online_cpumask in worker_attach_to_pool() to check
if pool->attrs->cpumask is valid rather than cpu_online_mask or
cpu_active_mask due to gaps between stages in cpu hot[un]plug.

So for that late-spawned per-CPU kworker case: the outgoing CPU should have
already been cleared from wq_online_cpumask, so it gets its affinity reset
to the possible mask and the subsequent wakeup will ensure it's put on an
active CPU.

To use wq_online_cpumask in worker_attach_to_pool(), we need to protect
wq_online_cpumask in wq_pool_attach_mutex and we modify workqueue_online_cpu()
and workqueue_offline_cpu() to enlarge wq_pool_attach_mutex protected
region.  We also put the update of wq_online_cpumask and [re|un]bind_workers()
in the same wq_pool_attach_mutex protected region to make the update
for the percpu workqueue atomic.

Cc: Qian Cai 
Cc: Peter Zijlstra 
Cc: Vincent Donnefort 
Link: 
https://lore.kernel.org/lkml/20201210163830.21514-3-valentin.schnei...@arm.com/
Reviewed-by: Valentin Schneider 
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 32 +++-
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dd32398edf55..25d50050257c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -310,7 +310,7 @@ static bool workqueue_freezing; /* PL: have wqs 
started freezing? */
 /* PL: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
-/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
+/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
 static cpumask_var_t wq_online_cpumask;
 
 /* CPU where unbound work was last round robin scheduled from this CPU */
@@ -1848,11 +1848,11 @@ static void worker_attach_to_pool(struct worker *worker,
 {
mutex_lock(&wq_pool_attach_mutex);
 
-   /*
-* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-* online CPUs.  It'll be re-applied when any of the CPUs come up.
-*/
-   set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+   /* Is there any cpu in pool->attrs->cpumask online? */
+   if (cpumask_intersects(pool->attrs->cpumask, wq_online_cpumask))
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0);
+   else
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
 
/*
 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
@@ -5082,13 +5082,12 @@ int workqueue_online_cpu(unsigned int cpu)
int pi;
 
mutex_lock(&wq_pool_mutex);
-   cpumask_set_cpu(cpu, wq_online_cpumask);
 
-   for_each_cpu_worker_pool(pool, cpu) {
-   mutex_lock(&wq_pool_attach_mutex);
+   mutex_lock(&wq_pool_attach_mutex);
+   cpumask_set_cpu(cpu, wq_online_cpumask);
+   for_each_cpu_worker_pool(pool, cpu)
rebind_workers(pool);
-   mutex_unlock(&wq_pool_attach_mutex);
-   }
+   mutex_unlock(&wq_pool_attach_mutex);
 
/* update CPU affinity of workers of unbound pools */
for_each_pool(pool, pi) {
@@ -5118,14 +5117,13 @@ int workqueue_offline_cpu(unsigned int cpu)
if (WARN_ON(cpu != smp_processor_id()))
return -1;
 
-   for_each_cpu_worker_pool(pool, cpu) {
-   mutex_lock(&wq_pool_attach_mutex);
-   unbind_workers(pool);
-   mutex_unlock(&wq_pool_attach_mutex);
-   }
-
mutex_lock(&wq_pool_mutex);
+
+   mutex_lock(&wq_pool_attach_mutex);
cpumask_clear_cpu(cpu, wq_online_cpumask);
+   for_each_cpu_worker_pool(pool, cpu)
+   unbind_workers(pool);
+   mutex_unlock(&wq_pool_attach_mutex);
 
/* update CPU affinity of workers of unbound pools */
for_each_pool(pool, pi) {
-- 
2.19.1.6.gb485710b



[PATCH -tip V3 3/8] workqueue: introduce wq_online_cpumask

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

wq_online_cpumask is the cached result of cpu_online_mask with the
going-down cpu cleared.  It is needed for later patches for setting
correct cpumask for workers and break affinity initiatively.

The first usage of wq_online_cpumask is also in this patch.
wq_calc_node_cpumask() and wq_update_unbound_numa() can be simplified
a little.

Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 34 ++
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index ccbceacaea1b..6f75f7ebeb17 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -310,6 +310,9 @@ static bool workqueue_freezing; /* PL: have wqs 
started freezing? */
 /* PL: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
+/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
+static cpumask_var_t wq_online_cpumask;
+
 /* CPU where unbound work was last round robin scheduled from this CPU */
 static DEFINE_PER_CPU(int, wq_rr_cpu_last);
 
@@ -3825,12 +3828,10 @@ static struct pool_workqueue *alloc_unbound_pwq(struct 
workqueue_struct *wq,
  * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
  * @attrs: the wq_attrs of the default pwq of the target workqueue
  * @node: the target NUMA node
- * @cpu_going_down: if >= 0, the CPU to consider as offline
  * @cpumask: outarg, the resulting cpumask
  *
- * Calculate the cpumask a workqueue with @attrs should use on @node.  If
- * @cpu_going_down is >= 0, that cpu is considered offline during
- * calculation.  The result is stored in @cpumask.
+ * Calculate the cpumask a workqueue with @attrs should use on @node.
+ * The result is stored in @cpumask.
  *
  * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
  * enabled and @node has online CPUs requested by @attrs, the returned
@@ -3844,15 +3845,14 @@ static struct pool_workqueue *alloc_unbound_pwq(struct 
workqueue_struct *wq,
  * %false if equal.
  */
 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
-int cpu_going_down, cpumask_t *cpumask)
+cpumask_t *cpumask)
 {
if (!wq_numa_enabled || attrs->no_numa)
goto use_dfl;
 
/* does @node have any online CPUs @attrs wants? */
cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
-   if (cpu_going_down >= 0)
-   cpumask_clear_cpu(cpu_going_down, cpumask);
+   cpumask_and(cpumask, cpumask, wq_online_cpumask);
 
if (cpumask_empty(cpumask))
goto use_dfl;
@@ -3961,7 +3961,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
goto out_free;
 
for_each_node(node) {
-   if (wq_calc_node_cpumask(new_attrs, node, -1, tmp_attrs->cpumask)) {
+   if (wq_calc_node_cpumask(new_attrs, node, tmp_attrs->cpumask)) {
ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
if (!ctx->pwq_tbl[node])
goto out_free;
@@ -4086,7 +4086,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
  * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
  * @wq: the target workqueue
  * @cpu: the CPU coming up or going down
- * @online: whether @cpu is coming up or going down
  *
  * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
  * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of
@@ -4104,11 +4103,9 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
  * affinity, it's the user's responsibility to flush the work item from
  * CPU_DOWN_PREPARE.
  */
-static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
-  bool online)
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu)
 {
int node = cpu_to_node(cpu);
-   int cpu_off = online ? -1 : cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
cpumask_t *cpumask;
@@ -4136,7 +4133,7 @@ static void wq_update_unbound_numa(struct 
workqueue_struct *wq, int cpu,
 * and create a new one if they don't match.  If the target cpumask
 * equals the default pwq's, the default pwq should be used.
 */
-   if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, cpumask)) {
+   if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpumask)) {
if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
return;
} else {
@@ -5069,6 +5066,7 @@ int workqueue_online_cpu(unsigned int cpu)
int pi;
 
mutex_lock(&wq_pool_mutex);
+   cpumask_set_cpu(cpu, wq_online_cpumask);
 
for_each_pool(

[PATCH -tip V3 2/8] workqueue: Manually break affinity on pool detachment

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

The pool->attrs->cpumask might be a single CPU and it may go
down after detachment, and the scheduler won't force to break
affinity for us since it is a per-cpu-kthread.  So we have to
do it on our own and unbind this worker which can't be unbound
by workqueue_offline_cpu() since it doesn't belong to any pool
after detachment.  Do it unconditionally for there is no harm
to break affinity for non-per-cpu-kthread and we don't need to
rely on the scheduler's policy on when to break affinity.

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index f2b8f3d458d1..ccbceacaea1b 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1885,6 +1885,19 @@ static void worker_detach_from_pool(struct worker 
*worker)
 
if (list_empty(&pool->workers))
detach_completion = pool->detach_completion;
+
+   /*
+* The pool->attrs->cpumask might be a single CPU and it may go
+* down after detachment, and the scheduler won't force to break
+* affinity for us since it is a per-cpu-kthread.  So we have to
+* do it on our own and unbind this worker which can't be unbound
+* by workqueue_offline_cpu() since it doesn't belong to any pool
+* after detachment.  Do it unconditionally for there is no harm
+* to break affinity for non-per-cpu-kthread and we don't need to
+* rely on the scheduler's policy on when to break affinity.
+*/
+   set_cpus_allowed_ptr(worker->task, cpu_possible_mask);
+
mutex_unlock(&wq_pool_attach_mutex);
 
/* clear leftover flags without pool->lock after it is detached */
-- 
2.19.1.6.gb485710b



[PATCH -tip V3 4/8] workqueue: use wq_online_cpumask in restore_unbound_workers_cpumask()

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

restore_unbound_workers_cpumask() is called when CPU_ONLINE, where
wq_online_cpumask is equal to cpu_online_mask.  So no functionality is
changed.

Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 6f75f7ebeb17..0a95ae14d46f 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5033,13 +5033,14 @@ static void restore_unbound_workers_cpumask(struct 
worker_pool *pool, int cpu)
static cpumask_t cpumask;
struct worker *worker;
 
+   lockdep_assert_held(&wq_pool_mutex);
lockdep_assert_held(&wq_pool_attach_mutex);
 
/* is @cpu allowed for @pool? */
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
return;
 
-   cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
+   cpumask_and(&cpumask, pool->attrs->cpumask, wq_online_cpumask);
 
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, pool)
-- 
2.19.1.6.gb485710b



[PATCH -tip V3 0/8] workqueue: break affinity initiatively

2020-12-25 Thread Lai Jiangshan
From: Lai Jiangshan 

06249738a41a ("workqueue: Manually break affinity on hotplug")
said that scheduler will not force break affinity for us.

But workqueue highly depends on the old behavior.  Many parts of the code
rely on it; 06249738a41a ("workqueue: Manually break affinity on hotplug")
is not enough to change that, and the commit has flaws in itself too.

It doesn't handle worker detachment.
It doesn't handle worker attachment, especially worker creation,
  which is handled by Valentin Schneider's patch [1].
It doesn't handle unbound workers which might possibly be
per-cpu kthreads.

We need to thoroughly update the way workqueue handles affinity
in cpu hot[un]plug, which is what this patchset intends to do, and it
replaces Valentin Schneider's patch [1].  The equivalent patch
is patch 10.

The patchset is based on tip/master rather than workqueue tree,
because the patchset is a complement for 06249738a41a ("workqueue:
Manually break affinity on hotplug") which is only in tip/master by now.

And TJ acked to route the series through tip.

Changed from V2:
Drop V2's patch4, which causes warning about setting cpumask
online&!active to kthread reported by several people:
Dexuan Cui 
kernel test robot 

Drop V2's patch 1, which can also cause warning about setting
cpumask online&!active to kthread.  restore_unbound_workers_cpumask()
is changed when we are bringing the cpu online.  And it causes V2's patch7
(V3's patch5) to be changed accordingly.

Marked patch8 Reviewed-by: Valentin Schneider 


Changed from V1:
Add TJ's acked-by for the whole patchset

Add more words to the comments and the changelog, mainly derived
from discussion with Peter.

Update the comments as TJ suggested.

Update a line of code as Valentin suggested.

Add Valentin's ack for patch 10 because "Seems alright to me." and
add Valentin's comments to the changelog which is integral.

[1]: 
https://lore.kernel.org/r/ff62e3ee994efb3620177bf7b19fab16f4866845.ca...@redhat.com
[V1 patchset]: 
https://lore.kernel.org/lkml/20201214155457.3430-1-jiangshan...@gmail.com/
[V2 patchset]: 
https://lore.kernel.org/lkml/20201218170919.2950-1-jiangshan...@gmail.com/

Lai Jiangshan (8):
  workqueue: use cpu_possible_mask instead of cpu_active_mask to break
affinity
  workqueue: Manually break affinity on pool detachment
  workqueue: introduce wq_online_cpumask
  workqueue: use wq_online_cpumask in restore_unbound_workers_cpumask()
  workqueue: Manually break affinity on hotplug for unbound pool
  workqueue: reorganize workqueue_online_cpu()
  workqueue: reorganize workqueue_offline_cpu() unbind_workers()
  workqueue: Fix affinity of kworkers when attaching into pool

 kernel/workqueue.c | 207 -
 1 file changed, 129 insertions(+), 78 deletions(-)

-- 
2.19.1.6.gb485710b



Re: [PATCH -tip V2 00/10] workqueue: break affinity initiatively

2020-12-23 Thread Lai Jiangshan
On Wed, Dec 23, 2020 at 5:39 AM Dexuan-Linux Cui  wrote:
>
> On Fri, Dec 18, 2020 at 8:11 AM Lai Jiangshan  wrote:
> >
> > From: Lai Jiangshan 
> >
> > 06249738a41a ("workqueue: Manually break affinity on hotplug")
> > said that scheduler will not force break affinity for us.
> >
> > But workqueue highly depends on the old behavior. Many parts of the codes
> > relies on it, 06249738a41a ("workqueue: Manually break affinity on hotplug")
> > is not enough to change it, and the commit has flaws in itself too.
> >
> > It doesn't handle for worker detachment.
> > It doesn't handle for worker attachement, mainly worker creation
> >   which is handled by Valentin Schneider's patch [1].
> > It doesn't handle for unbound workers which might be possible
> > per-cpu-kthread.
> >
> > We need to thoroughly update the way workqueue handles affinity
> > in cpu hot[un]plug, what is this patchset intends to do and
> > replace the Valentin Schneider's patch [1].  The equivalent patch
> > is patch 10.
> >
> > Patch 1 fixes a flaw reported by Hillf Danton .
> > I have to include this fix because later patches depends on it.
> >
> > The patchset is based on tip/master rather than workqueue tree,
> > because the patchset is a complement for 06249738a41a ("workqueue:
> > Manually break affinity on hotplug") which is only in tip/master by now.
> >
> > And TJ acked to route the series through tip.
> >
> > Changed from V1:
> > Add TJ's acked-by for the whole patchset
> >
> > Add more words to the comments and the changelog, mainly derived
> > from discussion with Peter.
> >
> > Update the comments as TJ suggested.
> >
> > Update a line of code as Valentin suggested.
> >
> > Add Valentin's ack for patch 10 because "Seems alright to me." and
> > add Valentin's comments to the changelog which is integral.
> >
> > [1]: 
> > https://lore.kernel.org/r/ff62e3ee994efb3620177bf7b19fab16f4866845.ca...@redhat.com
> > [V1 patcheset]: 
> > https://lore.kernel.org/lkml/20201214155457.3430-1-jiangshan...@gmail.com/
> >
> > Cc: Hillf Danton 
> > Cc: Valentin Schneider 
> > Cc: Qian Cai 
> > Cc: Peter Zijlstra 
> > Cc: Vincent Donnefort 
> > Cc: Tejun Heo 
> >
> > Lai Jiangshan (10):
> >   workqueue: restore unbound_workers' cpumask correctly
> >   workqueue: use cpu_possible_mask instead of cpu_active_mask to break
> > affinity
> >   workqueue: Manually break affinity on pool detachment
> >   workqueue: don't set the worker's cpumask when kthread_bind_mask()
> >   workqueue: introduce wq_online_cpumask
> >   workqueue: use wq_online_cpumask in restore_unbound_workers_cpumask()
> >   workqueue: Manually break affinity on hotplug for unbound pool
> >   workqueue: reorganize workqueue_online_cpu()
> >   workqueue: reorganize workqueue_offline_cpu() unbind_workers()
> >   workqueue: Fix affinity of kworkers when attaching into pool
> >
> >  kernel/workqueue.c | 214 -
> >  1 file changed, 132 insertions(+), 82 deletions(-)
> >
> > --
> > 2.19.1.6.gb485710b
>
> Hi,
> I tested this patchset on today's tip.git's master branch
> (981316394e35 ("Merge branch 'locking/urgent'")).
>
> Every time the kernel boots with 32 CPUs (I'm running the Linux VM on
> Hyper-V), I get the below warning.
> (BTW, with 8 or 16 CPUs, I don't see the warning).
> By printing the cpumasks with "%*pbl", I know the warning happens because:
> new_mask = 16-31
> cpu_online_mask= 0-16
> cpu_active_mask= 0-15
> p->nr_cpus_allowed=16
>
> 2374 if (p->flags & PF_KTHREAD) {
> 2375 /*
> 2376  * For kernel threads that do indeed end up on online &&
> 2377  * !active we want to ensure they are strict
> per-CPU threads.
> 2378  */
> 2379 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
> 2380 !cpumask_intersects(new_mask, cpu_active_mask) &&
> 2381 p->nr_cpus_allowed != 1);
> 2382 }
> 2383
>

Hello, Dexuan

Could you omit patch4 of the patchset and test it again, please?
("workqueue: don't set the worker's cpumask when kthread_bind_mask()")

kthread_bind_mask() sets the worker task's cpumask to the pool's cpumask without
any check.  And set_cpus_allowed_ptr() then finds that the task's cpumask
is unchanged (already set by kthread_bind_mask()) and skips all the checks.
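
In other words, the sequence is roughly (an annotated sketch of the
description above, not the exact kernel code):

	/* at worker creation, with patch4 omitted: */
	kthread_bind_mask(worker->task, pool->attrs->cpumask);
		/* applies the mask directly, without the online/active check */

	/* later, in worker_attach_to_pool(): */
	set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
		/* the new mask equals the current one, so, as described
		 * above, it bails out before re-checking against
		 * cpu_active_mask and the warning is never reached */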

And I found that numa=fake=2U seems broken on cpumask_of_node() in my box.

Thanks,
Lai


Re: [PATCH -tip V2 00/10] workqueue: break affinity initiatively

2020-12-23 Thread Lai Jiangshan
On Wed, Dec 23, 2020 at 5:39 AM Dexuan-Linux Cui  wrote:
>
> On Fri, Dec 18, 2020 at 8:11 AM Lai Jiangshan  wrote:
> >
> > From: Lai Jiangshan 
> >
> > 06249738a41a ("workqueue: Manually break affinity on hotplug")
> > said that scheduler will not force break affinity for us.
> >
> > But workqueue highly depends on the old behavior. Many parts of the codes
> > relies on it, 06249738a41a ("workqueue: Manually break affinity on hotplug")
> > is not enough to change it, and the commit has flaws in itself too.
> >
> > It doesn't handle for worker detachment.
> > It doesn't handle for worker attachement, mainly worker creation
> >   which is handled by Valentin Schneider's patch [1].
> > It doesn't handle for unbound workers which might be possible
> > per-cpu-kthread.
> >
> > We need to thoroughly update the way workqueue handles affinity
> > in cpu hot[un]plug, what is this patchset intends to do and
> > replace the Valentin Schneider's patch [1].  The equivalent patch
> > is patch 10.
> >
> > Patch 1 fixes a flaw reported by Hillf Danton .
> > I have to include this fix because later patches depends on it.
> >
> > The patchset is based on tip/master rather than workqueue tree,
> > because the patchset is a complement for 06249738a41a ("workqueue:
> > Manually break affinity on hotplug") which is only in tip/master by now.
> >
> > And TJ acked to route the series through tip.
> >
> > Changed from V1:
> > Add TJ's acked-by for the whole patchset
> >
> > Add more words to the comments and the changelog, mainly derived
> > from discussion with Peter.
> >
> > Update the comments as TJ suggested.
> >
> > Update a line of code as Valentin suggested.
> >
> > Add Valentin's ack for patch 10 because "Seems alright to me." and
> > add Valentin's comments to the changelog which is integral.
> >
> > [1]: 
> > https://lore.kernel.org/r/ff62e3ee994efb3620177bf7b19fab16f4866845.ca...@redhat.com
> > [V1 patcheset]: 
> > https://lore.kernel.org/lkml/20201214155457.3430-1-jiangshan...@gmail.com/
> >
> > Cc: Hillf Danton 
> > Cc: Valentin Schneider 
> > Cc: Qian Cai 
> > Cc: Peter Zijlstra 
> > Cc: Vincent Donnefort 
> > Cc: Tejun Heo 
> >
> > Lai Jiangshan (10):
> >   workqueue: restore unbound_workers' cpumask correctly
> >   workqueue: use cpu_possible_mask instead of cpu_active_mask to break
> > affinity
> >   workqueue: Manually break affinity on pool detachment
> >   workqueue: don't set the worker's cpumask when kthread_bind_mask()
> >   workqueue: introduce wq_online_cpumask
> >   workqueue: use wq_online_cpumask in restore_unbound_workers_cpumask()
> >   workqueue: Manually break affinity on hotplug for unbound pool
> >   workqueue: reorganize workqueue_online_cpu()
> >   workqueue: reorganize workqueue_offline_cpu() unbind_workers()
> >   workqueue: Fix affinity of kworkers when attaching into pool
> >
> >  kernel/workqueue.c | 214 -
> >  1 file changed, 132 insertions(+), 82 deletions(-)
> >
> > --
> > 2.19.1.6.gb485710b
>
> Hi,

Hello,

thanks for reporting.

I have only been debugging it for a short time; I will continue tomorrow.


> I tested this patchset on today's tip.git's master branch
> (981316394e35 ("Merge branch 'locking/urgent'")).
>
> Every time the kernel boots with 32 CPUs (I'm running the Linux VM on
> Hyper-V), I get the below warning.
> (BTW, with 8 or 16 CPUs, I don't see the warning).
> By printing the cpumasks with "%*pbl", I know the warning happens because:
> new_mask = 16-31
> cpu_online_mask= 0-16
> cpu_active_mask= 0-15
> p->nr_cpus_allowed=16


From the call stack, we can see that we are bringing cpu#16 up.
workqueue_online_cpu() is being called and sched_cpu_activate()
has not been called yet.  So cpu_online_mask = 0-16, cpu_active_mask = 0-15.

Why isn't it legitimate to set the worker's cpumask
to be new_mask(16-31) since cpu#16 is being brought up?

Anyway, it shows there must be a problem in the patchset,
which raised the warning.
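
For reference, plugging the reported values into the warning condition
quoted below:

	cpumask_intersects(new_mask, cpu_online_mask)  : 16-31 vs 0-16 -> true (cpu 16)
	!cpumask_intersects(new_mask, cpu_active_mask) : 16-31 vs 0-15 -> true (no overlap)
	p->nr_cpus_allowed != 1                        : 16 != 1       -> true

All three terms are true, so the WARN_ON() fires.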

>
> 2374 if (p->flags & PF_KTHREAD) {
> 2375 /*
> 2376  * For kernel threads that do indeed end up on online &&
> 2377  * !active we want to ensure they are strict
> per-CPU threads.
> 2378  */
> 2379 WARN_ON(cpumask_intersects(new_mask, cpu_online_mask) &&
> 2380  

Re: [PATCH -tip V2 10/10] workqueue: Fix affinity of kworkers when attaching into pool

2020-12-18 Thread Lai Jiangshan
On Sat, Dec 19, 2020 at 1:59 AM Valentin Schneider
 wrote:
>
>
> On 18/12/20 17:09, Lai Jiangshan wrote:
> > From: Lai Jiangshan 
> >
> > When worker_attach_to_pool() is called, we should not put the workers
> > to pool->attrs->cpumask when there is not CPU online in it.
> >
> > We have to use wq_online_cpumask in worker_attach_to_pool() to check
> > if pool->attrs->cpumask is valid rather than cpu_online_mask or
> > cpu_active_mask due to gaps between stages in cpu hot[un]plug.
> >
> > So for that late-spawned per-CPU kworker case: the outgoing CPU should have
> > already been cleared from wq_online_cpumask, so it gets its affinity reset
> > to the possible mask and the subsequent wakeup will ensure it's put on an
> > active CPU.
> >
> > To use wq_online_cpumask in worker_attach_to_pool(), we need to protect
> > wq_online_cpumask in wq_pool_attach_mutex and we modify 
> > workqueue_online_cpu()
> > and workqueue_offline_cpu() to enlarge wq_pool_attach_mutex protected
> > region. We also put updating wq_online_cpumask and [re|un]bind_workers()
> > in the same wq_pool_attach_mutex protected region to make the update
> > for percpu workqueue atomically.
> >
> > Cc: Qian Cai 
> > Cc: Peter Zijlstra 
> > Cc: Vincent Donnefort 
> > Link: 
> > https://lore.kernel.org/lkml/20201210163830.21514-3-valentin.schnei...@arm.com/
> > Acked-by: Valentin Schneider 
>
> So an etiquette thing: I never actually gave an Acked-by. I did say it
> looked good to me, and that probably should've been bundled with a
> Reviewed-by, but it wasn't (I figured I'd wait for v2). Forging is bad,
> m'kay.
>
> When in doubt (e.g. someone says they're ok with your patch but don't give
> any Ack/Reviewed-by), just ask via mail or on IRC.

Hello, Valentin

I'm sorry not to have asked for your opinion.  When I saw
"Seems alright to me." I felt hugely encouraged and rushed.

I was in doubt whether I should promote "Seems alright to me." to an "Ack".
Instead of asking, I wrongly did it right away.  I knew it might be seen as
forging, and added a note in the cover letter:

>Add Valentin's ack for patch 10 because "Seems alright to me." and
>add Valentin's comments to the changelog which is integral.

Anyway, it is my bad and I learnt.

>
> For now, please make this a:
>
> Reviewed-by: Valentin Schneider 

Hello Peter, could you help change it if there is no other
feedback that causes a V3 patchset to be made?

Thanks
Lai

>
> > Acked-by: Tejun Heo 
> > Signed-off-by: Lai Jiangshan 
> > ---
> >  kernel/workqueue.c | 32 +++-
> >  1 file changed, 15 insertions(+), 17 deletions(-)
> >
> > diff --git a/kernel/workqueue.c b/kernel/workqueue.c
> > index 65270729454c..eeb726598f80 100644
> > --- a/kernel/workqueue.c
> > +++ b/kernel/workqueue.c
> > @@ -310,7 +310,7 @@ static bool workqueue_freezing;   /* PL: have 
> > wqs started freezing? */
> >  /* PL: allowable cpus for unbound wqs and work items */
> >  static cpumask_var_t wq_unbound_cpumask;
> >
> > -/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
> > +/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
> >  static cpumask_var_t wq_online_cpumask;
> >
> >  /* CPU where unbound work was last round robin scheduled from this CPU */
> > @@ -1848,11 +1848,11 @@ static void worker_attach_to_pool(struct worker 
> > *worker,
> >  {
> >   mutex_lock(&wq_pool_attach_mutex);
> >
> > - /*
> > -  * set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
> > -  * online CPUs.  It'll be re-applied when any of the CPUs come up.
> > -  */
> > - set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
> > + /* Is there any cpu in pool->attrs->cpumask online? */
> > + if (cpumask_intersects(pool->attrs->cpumask, wq_online_cpumask))
> > + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask) < 0);
> > + else
> > + WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, cpu_possible_mask) < 0);
> >
> >   /*
> >* The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
> > @@ -5081,13 +5081,12 @@ int workqueue_online_cpu(unsigned int cpu)
> >   int pi;
> >
> >   mutex_lock(&wq_pool_mutex);
> > - cpumask_set_cpu(cpu, wq_online_cpumask);
> >
> > - for_each_cpu_worker_pool(pool, cpu) {
> > - mutex_lock(_pool_atta

[PATCH -tip V2 08/10] workqueue: reorganize workqueue_online_cpu()

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

Just move around the code, no functionality changed.

It prepares for later patch protecting wq_online_cpumask
in wq_pool_attach_mutex.

Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 11 ---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c2b66679c0aa..dc891b5c0868 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5087,12 +5087,17 @@ int workqueue_online_cpu(unsigned int cpu)
mutex_lock(&wq_pool_mutex);
cpumask_set_cpu(cpu, wq_online_cpumask);
 
+   for_each_cpu_worker_pool(pool, cpu) {
+   mutex_lock(&wq_pool_attach_mutex);
+   rebind_workers(pool);
+   mutex_unlock(&wq_pool_attach_mutex);
+   }
+
+   /* update CPU affinity of workers of unbound pools */
for_each_pool(pool, pi) {
mutex_lock(&wq_pool_attach_mutex);
 
-   if (pool->cpu == cpu)
-   rebind_workers(pool);
-   else if (pool->cpu < 0)
+   if (pool->cpu < 0)
update_unbound_workers_cpumask(pool, cpu);
 
mutex_unlock(&wq_pool_attach_mutex);
-- 
2.19.1.6.gb485710b



[PATCH -tip V2 09/10] workqueue: reorganize workqueue_offline_cpu() unbind_workers()

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

Just move around the code, no functionality changed.
Only the wq_pool_attach_mutex protected region becomes a little larger.

It prepares for a later patch protecting wq_online_cpumask
with wq_pool_attach_mutex.

Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 90 +++---
 1 file changed, 45 insertions(+), 45 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index dc891b5c0868..65270729454c 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4904,61 +4904,57 @@ void wq_worker_comm(char *buf, size_t size, struct 
task_struct *task)
  * cpu comes back online.
  */
 
-static void unbind_workers(int cpu)
+static void unbind_workers(struct worker_pool *pool)
 {
-   struct worker_pool *pool;
struct worker *worker;
 
-   for_each_cpu_worker_pool(pool, cpu) {
-   mutex_lock(&wq_pool_attach_mutex);
-   raw_spin_lock_irq(&pool->lock);
+   lockdep_assert_held(&wq_pool_attach_mutex);
 
-   /*
-* We've blocked all attach/detach operations. Make all workers
-* unbound and set DISASSOCIATED.  Before this, all workers
-* except for the ones which are still executing works from
-* before the last CPU down must be on the cpu.  After
-* this, they may become diasporas.
-*/
-   for_each_pool_worker(worker, pool)
-   worker->flags |= WORKER_UNBOUND;
+   raw_spin_lock_irq(&pool->lock);
 
-   pool->flags |= POOL_DISASSOCIATED;
+   /*
+* We've blocked all attach/detach operations. Make all workers
+* unbound and set DISASSOCIATED.  Before this, all workers
+* except for the ones which are still executing works from
+* before the last CPU down must be on the cpu.  After
+* this, they may become diasporas.
+*/
+   for_each_pool_worker(worker, pool)
+   worker->flags |= WORKER_UNBOUND;
 
-   raw_spin_unlock_irq(&pool->lock);
+   pool->flags |= POOL_DISASSOCIATED;
 
-   for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
cpu_possible_mask) < 0);
+   raw_spin_unlock_irq(&pool->lock);
 
-   mutex_unlock(&wq_pool_attach_mutex);
+   for_each_pool_worker(worker, pool)
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
cpu_possible_mask) < 0);
 
-   /*
-* Call schedule() so that we cross rq->lock and thus can
-* guarantee sched callbacks see the %WORKER_UNBOUND flag.
-* This is necessary as scheduler callbacks may be invoked
-* from other cpus.
-*/
-   schedule();
+   /*
+* Call schedule() so that we cross rq->lock and thus can
+* guarantee sched callbacks see the %WORKER_UNBOUND flag.
+* This is necessary as scheduler callbacks may be invoked
+* from other cpus.
+*/
+   schedule();
 
-   /*
-* Sched callbacks are disabled now.  Zap nr_running.
-* After this, nr_running stays zero and need_more_worker()
-* and keep_working() are always true as long as the
-* worklist is not empty.  This pool now behaves as an
-* unbound (in terms of concurrency management) pool which
-* are served by workers tied to the pool.
-*/
-   atomic_set(&pool->nr_running, 0);
+   /*
+* Sched callbacks are disabled now.  Zap nr_running.
+* After this, nr_running stays zero and need_more_worker()
+* and keep_working() are always true as long as the
+* worklist is not empty.  This pool now behaves as an
+* unbound (in terms of concurrency management) pool which
+* are served by workers tied to the pool.
+*/
+   atomic_set(&pool->nr_running, 0);
 
-   /*
-* With concurrency management just turned off, a busy
-* worker blocking could lead to lengthy stalls.  Kick off
-* unbound chain execution of currently pending work items.
-*/
-   raw_spin_lock_irq(&pool->lock);
-   wake_up_worker(pool);
-   raw_spin_unlock_irq(&pool->lock);
-   }
+   /*
+* With concurrency management just turned off, a busy
+* worker blocking could lead to lengthy stalls.  Kick off
+* unbound chain execution of currently pending work items.
+*/
+   raw_spin_lock_irq(&pool->lock);
+   wake_up_worker(pool);
+   raw_spin_unlock_irq(&pool->lock);
 }
 
 /**
@@ -5121,7 +5117,11 @@ int workqueue_offline_cpu(unsigned int cpu)
if (WARN_ON(cpu != smp_processor_id()))
return -1;
 
-   unb

[PATCH -tip V2 10/10] workqueue: Fix affinity of kworkers when attaching into pool

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

When worker_attach_to_pool() is called, we should not put the workers
on pool->attrs->cpumask when there is no CPU online in it.

We have to use wq_online_cpumask in worker_attach_to_pool() to check
whether pool->attrs->cpumask is valid, rather than cpu_online_mask or
cpu_active_mask, due to the gaps between stages in cpu hot[un]plug.

So for that late-spawned per-CPU kworker case: the outgoing CPU should have
already been cleared from wq_online_cpumask, so it gets its affinity reset
to the possible mask and the subsequent wakeup will ensure it's put on an
active CPU.

To use wq_online_cpumask in worker_attach_to_pool(), we need to protect
wq_online_cpumask in wq_pool_attach_mutex and we modify workqueue_online_cpu()
and workqueue_offline_cpu() to enlarge wq_pool_attach_mutex protected
region. We also put updating wq_online_cpumask and [re|un]bind_workers()
in the same wq_pool_attach_mutex protected region to make the update
for the percpu workqueues atomic.

Cc: Qian Cai 
Cc: Peter Zijlstra 
Cc: Vincent Donnefort 
Link: 
https://lore.kernel.org/lkml/20201210163830.21514-3-valentin.schnei...@arm.com/
Acked-by: Valentin Schneider 
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 32 +++-
 1 file changed, 15 insertions(+), 17 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 65270729454c..eeb726598f80 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -310,7 +310,7 @@ static bool workqueue_freezing; /* PL: have wqs 
started freezing? */
 /* PL: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
-/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
+/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
 static cpumask_var_t wq_online_cpumask;
 
 /* CPU where unbound work was last round robin scheduled from this CPU */
@@ -1848,11 +1848,11 @@ static void worker_attach_to_pool(struct worker *worker,
 {
mutex_lock(&wq_pool_attach_mutex);
 
-   /*
-* set_cpus_allowed_ptr() will fail if the cpumask doesn't have any
-* online CPUs.  It'll be re-applied when any of the CPUs come up.
-*/
-   set_cpus_allowed_ptr(worker->task, pool->attrs->cpumask);
+   /* Is there any cpu in pool->attrs->cpumask online? */
+   if (cpumask_intersects(pool->attrs->cpumask, wq_online_cpumask))
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
pool->attrs->cpumask) < 0);
+   else
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
cpu_possible_mask) < 0);
 
/*
 * The wq_pool_attach_mutex ensures %POOL_DISASSOCIATED remains
@@ -5081,13 +5081,12 @@ int workqueue_online_cpu(unsigned int cpu)
int pi;
 
mutex_lock(&wq_pool_mutex);
-   cpumask_set_cpu(cpu, wq_online_cpumask);
 
-   for_each_cpu_worker_pool(pool, cpu) {
-   mutex_lock(&wq_pool_attach_mutex);
+   mutex_lock(&wq_pool_attach_mutex);
+   cpumask_set_cpu(cpu, wq_online_cpumask);
+   for_each_cpu_worker_pool(pool, cpu)
rebind_workers(pool);
-   mutex_unlock(&wq_pool_attach_mutex);
-   }
+   mutex_unlock(&wq_pool_attach_mutex);
 
/* update CPU affinity of workers of unbound pools */
for_each_pool(pool, pi) {
@@ -5117,14 +5116,13 @@ int workqueue_offline_cpu(unsigned int cpu)
if (WARN_ON(cpu != smp_processor_id()))
return -1;
 
-   for_each_cpu_worker_pool(pool, cpu) {
-   mutex_lock(_pool_attach_mutex);
-   unbind_workers(pool);
-   mutex_unlock(_pool_attach_mutex);
-   }
-
mutex_lock(&wq_pool_mutex);
+
+   mutex_lock(&wq_pool_attach_mutex);
cpumask_clear_cpu(cpu, wq_online_cpumask);
+   for_each_cpu_worker_pool(pool, cpu)
+   unbind_workers(pool);
+   mutex_unlock(&wq_pool_attach_mutex);
 
/* update CPU affinity of workers of unbound pools */
for_each_pool(pool, pi) {
-- 
2.19.1.6.gb485710b



[PATCH -tip V2 05/10] workqueue: introduce wq_online_cpumask

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

wq_online_cpumask is the cached result of cpu_online_mask with the
going-down cpu cleared.  It is needed by later patches for setting the
correct cpumask for workers and breaking affinity initiatively.

The first usage of wq_online_cpumask is also in this patch.
wq_calc_node_cpumask() and wq_update_unbound_numa() can be simplified
a little.

Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 34 ++
 1 file changed, 18 insertions(+), 16 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 5f3c86eaed7a..84842f10e6a2 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -310,6 +310,9 @@ static bool workqueue_freezing; /* PL: have wqs 
started freezing? */
 /* PL: allowable cpus for unbound wqs and work items */
 static cpumask_var_t wq_unbound_cpumask;
 
+/* PL: online cpus (cpu_online_mask with the going-down cpu cleared) */
+static cpumask_var_t wq_online_cpumask;
+
 /* CPU where unbound work was last round robin scheduled from this CPU */
 static DEFINE_PER_CPU(int, wq_rr_cpu_last);
 
@@ -3833,12 +3836,10 @@ static struct pool_workqueue *alloc_unbound_pwq(struct 
workqueue_struct *wq,
  * wq_calc_node_cpumask - calculate a wq_attrs' cpumask for the specified node
  * @attrs: the wq_attrs of the default pwq of the target workqueue
  * @node: the target NUMA node
- * @cpu_going_down: if >= 0, the CPU to consider as offline
  * @cpumask: outarg, the resulting cpumask
  *
- * Calculate the cpumask a workqueue with @attrs should use on @node.  If
- * @cpu_going_down is >= 0, that cpu is considered offline during
- * calculation.  The result is stored in @cpumask.
+ * Calculate the cpumask a workqueue with @attrs should use on @node.
+ * The result is stored in @cpumask.
  *
  * If NUMA affinity is not enabled, @attrs->cpumask is always used.  If
  * enabled and @node has online CPUs requested by @attrs, the returned
@@ -3852,15 +3853,14 @@ static struct pool_workqueue *alloc_unbound_pwq(struct 
workqueue_struct *wq,
  * %false if equal.
  */
 static bool wq_calc_node_cpumask(const struct workqueue_attrs *attrs, int node,
-int cpu_going_down, cpumask_t *cpumask)
+cpumask_t *cpumask)
 {
if (!wq_numa_enabled || attrs->no_numa)
goto use_dfl;
 
/* does @node have any online CPUs @attrs wants? */
cpumask_and(cpumask, cpumask_of_node(node), attrs->cpumask);
-   if (cpu_going_down >= 0)
-   cpumask_clear_cpu(cpu_going_down, cpumask);
+   cpumask_and(cpumask, cpumask, wq_online_cpumask);
 
if (cpumask_empty(cpumask))
goto use_dfl;
@@ -3969,7 +3969,7 @@ apply_wqattrs_prepare(struct workqueue_struct *wq,
goto out_free;
 
for_each_node(node) {
-   if (wq_calc_node_cpumask(new_attrs, node, -1, 
tmp_attrs->cpumask)) {
+   if (wq_calc_node_cpumask(new_attrs, node, tmp_attrs->cpumask)) {
ctx->pwq_tbl[node] = alloc_unbound_pwq(wq, tmp_attrs);
if (!ctx->pwq_tbl[node])
goto out_free;
@@ -4094,7 +4094,6 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
  * wq_update_unbound_numa - update NUMA affinity of a wq for CPU hot[un]plug
  * @wq: the target workqueue
  * @cpu: the CPU coming up or going down
- * @online: whether @cpu is coming up or going down
  *
  * This function is to be called from %CPU_DOWN_PREPARE, %CPU_ONLINE and
  * %CPU_DOWN_FAILED.  @cpu is being hot[un]plugged, update NUMA affinity of
@@ -4112,11 +4111,9 @@ int apply_workqueue_attrs(struct workqueue_struct *wq,
  * affinity, it's the user's responsibility to flush the work item from
  * CPU_DOWN_PREPARE.
  */
-static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu,
-  bool online)
+static void wq_update_unbound_numa(struct workqueue_struct *wq, int cpu)
 {
int node = cpu_to_node(cpu);
-   int cpu_off = online ? -1 : cpu;
struct pool_workqueue *old_pwq = NULL, *pwq;
struct workqueue_attrs *target_attrs;
cpumask_t *cpumask;
@@ -4144,7 +4141,7 @@ static void wq_update_unbound_numa(struct 
workqueue_struct *wq, int cpu,
 * and create a new one if they don't match.  If the target cpumask
 * equals the default pwq's, the default pwq should be used.
 */
-   if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpu_off, 
cpumask)) {
+   if (wq_calc_node_cpumask(wq->dfl_pwq->pool->attrs, node, cpumask)) {
if (cpumask_equal(cpumask, pwq->pool->attrs->cpumask))
return;
} else {
@@ -5081,6 +5078,7 @@ int workqueue_online_cpu(unsigned int cpu)
int pi;
 
mutex_lock(&wq_pool_mutex);
+   cpumask_set_cpu(cpu, wq_online_cpumask);
 
for_each_pool(

[PATCH -tip V2 04/10] workqueue: don't set the worker's cpumask when kthread_bind_mask()

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

There might be no online cpu in the pool->attrs->cpumask.
We will set the worker's cpumask later in worker_attach_to_pool().

Cc: Peter Zijlstra 
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 10 +-
 1 file changed, 9 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 4d7575311198..5f3c86eaed7a 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1948,7 +1948,15 @@ static struct worker *create_worker(struct worker_pool 
*pool)
goto fail;
 
set_user_nice(worker->task, pool->attrs->nice);
-   kthread_bind_mask(worker->task, pool->attrs->cpumask);
+
+   /*
+* Set PF_NO_SETAFFINITY via kthread_bind_mask().  We use
+* cpu_possible_mask instead of pool->attrs->cpumask, because
+* there might not be any online cpu in the pool->attrs->cpumask.
+* The cpumask of the worker will be set properly later in
+* worker_attach_to_pool().
+*/
+   kthread_bind_mask(worker->task, cpu_possible_mask);
 
/* successful, attach the worker to the pool */
worker_attach_to_pool(worker, pool);
-- 
2.19.1.6.gb485710b



[PATCH -tip V2 07/10] workqueue: Manually break affinity on hotplug for unbound pool

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

It is possible that a per-node pool/worker's affinity is a single
CPU.  It can happen when wq_unbound_cpumask is changed by the system
admin via /sys/devices/virtual/workqueue/cpumask.  And pool->attrs->cpumask
is wq_unbound_cpumask & possible_cpumask_of_the_node, which can be a
single CPU and makes the pool's workers "per cpu kthreads".

And the scheduler won't break affinity on the "per cpu kthread" workers
when the CPU is going down, so we have to do it on our own.

We do it by reusing the existing restore_unbound_workers_cpumask() and
renaming it to update_unbound_workers_cpumask().  When the number of
online CPUs of the pool goes from 1 to 0, we break the affinity initiatively.

Note that we even break the affinity for non-per-cpu-kthread workers,
because, first, the code path is a slow path which is not worth too
much optimization, and second, we don't want to rely on the code or
conditions under which the scheduler forces breaking affinity for us.

The way to break affinity is to set the workers' affinity to
cpu_possible_mask, so that we preserve the same behavior as when
the scheduler breaks affinity for us.
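
As a side note for readers: the mask arithmetic above is easy to see
with a tiny userspace sketch (not part of the patch; the CPU/node
layout here is made up purely for illustration):

	#include <stdio.h>

	/* Illustrative only (not kernel code): bit i stands for CPU i. */
	int main(void)
	{
		/* admin wrote "0,2" to /sys/devices/virtual/workqueue/cpumask */
		unsigned long wq_unbound_cpumask = (1UL << 0) | (1UL << 2);
		/* possible CPUs of the node, e.g. node 1 spans CPUs 2-3 */
		unsigned long node_possible_mask = (1UL << 2) | (1UL << 3);

		/* pool->attrs->cpumask = wq_unbound_cpumask & possible_cpumask_of_the_node */
		unsigned long pool_cpumask = wq_unbound_cpumask & node_possible_mask;

		/* prints 0x4: only CPU 2, so the pool's workers act like per-cpu kthreads */
		printf("pool cpumask = 0x%lx\n", pool_cpumask);
		return 0;
	}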

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 49 --
 1 file changed, 34 insertions(+), 15 deletions(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index eda293097fe1..c2b66679c0aa 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5027,16 +5027,16 @@ static void rebind_workers(struct worker_pool *pool)
 }
 
 /**
- * restore_unbound_workers_cpumask - restore cpumask of unbound workers
+ * update_unbound_workers_cpumask - update cpumask of unbound workers
  * @pool: unbound pool of interest
- * @cpu: the CPU which is coming up
+ * @cpu: the CPU which is coming up or going down
  *
  * An unbound pool may end up with a cpumask which doesn't have any online
- * CPUs.  When a worker of such pool get scheduled, the scheduler resets
- * its cpus_allowed.  If @cpu is in @pool's cpumask which didn't have any
- * online CPU before, cpus_allowed of all its workers should be restored.
+ * CPUs.  We have to reset workers' cpus_allowed of such pool.  And we
+ * restore the workers' cpus_allowed when the pool's cpumask has online
+ * CPU for the first time after reset.
  */
-static void restore_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
+static void update_unbound_workers_cpumask(struct worker_pool *pool, int cpu)
 {
static cpumask_t cpumask;
struct worker *worker;
@@ -5050,13 +5050,19 @@ static void restore_unbound_workers_cpumask(struct 
worker_pool *pool, int cpu)
 
cpumask_and(&cpumask, pool->attrs->cpumask, wq_online_cpumask);
 
-   /* is @cpu the first one onlined for the @pool? */
-   if (cpumask_weight(&cpumask) > 1)
-   return;
-
-   /* as we're called from CPU_ONLINE, the following shouldn't fail */
-   for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
pool->attrs->cpumask) < 0);
+   switch (cpumask_weight(&cpumask)) {
+   case 0: /* @cpu is the last one going down for the @pool. */
+   for_each_pool_worker(worker, pool)
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
cpu_possible_mask) < 0);
+   break;
+   case 1: /* @cpu is the first one onlined for the @pool. */
+   /* as we're called from CPU_ONLINE, the following shouldn't 
fail */
+   for_each_pool_worker(worker, pool)
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
pool->attrs->cpumask) < 0);
+   break;
+   default: /* other cases, nothing to do */
+   break;
+   }
 }
 
 int workqueue_prepare_cpu(unsigned int cpu)
@@ -5087,7 +5093,7 @@ int workqueue_online_cpu(unsigned int cpu)
if (pool->cpu == cpu)
rebind_workers(pool);
else if (pool->cpu < 0)
-   restore_unbound_workers_cpumask(pool, cpu);
+   update_unbound_workers_cpumask(pool, cpu);
 
mutex_unlock(&wq_pool_attach_mutex);
}
@@ -5102,7 +5108,9 @@ int workqueue_online_cpu(unsigned int cpu)
 
 int workqueue_offline_cpu(unsigned int cpu)
 {
+   struct worker_pool *pool;
struct workqueue_struct *wq;
+   int pi;
 
/* unbinding per-cpu workers should happen on the local CPU */
if (WARN_ON(cpu != smp_processor_id()))
@@ -5110,9 +5118,20 @@ int workqueue_offline_cpu(unsigned int cpu)
 
unbind_workers(cpu);
 
-   /* update NUMA affinity of unbound workqueues */
mutex_lock(&wq_pool_mutex);
cpumask_clear_cpu(cpu, wq_online_cpumask);
+
+   /* update CPU affinity of workers of unbound pools */
+   for_each_pool(pool, p

[PATCH -tip V2 02/10] workqueue: use cpu_possible_mask instead of cpu_active_mask to break affinity

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

The scheduler won't break affinity for us any more, and we should
"emulate" the same behavior when the scheduler breaks affinity for
us.  The behavior is "changing the cpumask to cpu_possible_mask".

And there might be some other CPUs online later while the worker is
still running with the pending work items.  The worker should be allowed
to use the later online CPUs as before and process the work items ASAP.
If we use cpu_active_mask here, we can't achieve this goal but
using cpu_possible_mask can.

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index aba71ab359dd..fa71520822f0 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -4910,7 +4910,7 @@ static void unbind_workers(int cpu)
raw_spin_unlock_irq(&pool->lock);
 
for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
cpu_active_mask) < 0);
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
cpu_possible_mask) < 0);
 
mutex_unlock(&wq_pool_attach_mutex);
 
-- 
2.19.1.6.gb485710b



[PATCH -tip V2 03/10] workqueue: Manually break affinity on pool detachment

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

The pool->attrs->cpumask might be a single CPU and it may go
down after detachment, and the scheduler won't force to break
affinity for us since it is a per-cpu-kthread.  So we have to
do it on our own and unbind this worker which can't be unbound
by workqueue_offline_cpu() since it doesn't belong to any pool
after detachment.  Do it unconditionally for there is no harm
to break affinity for non-per-cpu-kthread and we don't need to
rely on the scheduler's policy on when to break affinity.

Fixes: 06249738a41a ("workqueue: Manually break affinity on hotplug")
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 13 +
 1 file changed, 13 insertions(+)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index fa71520822f0..4d7575311198 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -1885,6 +1885,19 @@ static void worker_detach_from_pool(struct worker 
*worker)
 
if (list_empty(&pool->workers))
detach_completion = pool->detach_completion;
+
+   /*
+* The pool->attrs->cpumask might be a single CPU and it may go
+* down after detachment, and the scheduler won't force to break
+* affinity for us since it is a per-cpu-kthread.  So we have to
+* do it on our own and unbind this worker which can't be unbound
+* by workqueue_offline_cpu() since it doesn't belong to any pool
+* after detachment.  Do it unconditionally for there is no harm
+* to break affinity for non-per-cpu-kthread and we don't need to
+* rely on the scheduler's policy on when to break affinity.
+*/
+   set_cpus_allowed_ptr(worker->task, cpu_possible_mask);
+
mutex_unlock(&wq_pool_attach_mutex);
 
/* clear leftover flags without pool->lock after it is detached */
-- 
2.19.1.6.gb485710b



[PATCH -tip V2 06/10] workqueue: use wq_online_cpumask in restore_unbound_workers_cpumask()

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

restore_unbound_workers_cpumask() is called when CPU_ONLINE, where
wq_online_cpumask equals cpu_online_mask.  So no functionality
changed.

Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index 84842f10e6a2..eda293097fe1 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5041,13 +5041,14 @@ static void restore_unbound_workers_cpumask(struct 
worker_pool *pool, int cpu)
static cpumask_t cpumask;
struct worker *worker;
 
+   lockdep_assert_held(&wq_pool_mutex);
lockdep_assert_held(&wq_pool_attach_mutex);
 
/* is @cpu allowed for @pool? */
if (!cpumask_test_cpu(cpu, pool->attrs->cpumask))
return;
 
-   cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
+   cpumask_and(&cpumask, pool->attrs->cpumask, wq_online_cpumask);
 
/* is @cpu the first one onlined for the @pool? */
if (cpumask_weight(&cpumask) > 1)
-- 
2.19.1.6.gb485710b



[PATCH -tip V2 00/10] workqueue: break affinity initiatively

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

06249738a41a ("workqueue: Manually break affinity on hotplug")
said that the scheduler will not force-break affinity for us.

But workqueue highly depends on the old behavior. Many parts of the code
rely on it; 06249738a41a ("workqueue: Manually break affinity on hotplug")
is not enough to change it, and the commit itself has flaws too.

It doesn't handle worker detachment.
It doesn't handle worker attachment, mainly worker creation,
  which is handled by Valentin Schneider's patch [1].
It doesn't handle unbound workers which might possibly be
per-cpu kthreads.

We need to thoroughly update the way workqueue handles affinity
in cpu hot[un]plug, which is what this patchset intends to do; it also
replaces Valentin Schneider's patch [1].  The equivalent patch
is patch 10.

Patch 1 fixes a flaw reported by Hillf Danton .
I have to include this fix because later patches depend on it.

The patchset is based on tip/master rather than workqueue tree,
because the patchset is a complement for 06249738a41a ("workqueue:
Manually break affinity on hotplug") which is only in tip/master by now.

And TJ acked to route the series through tip.

Changed from V1:
Add TJ's acked-by for the whole patchset

Add more words to the comments and the changelog, mainly derived
from discussion with Peter.

Update the comments as TJ suggested.

Update a line of code as Valentin suggested.

Add Valentin's ack for patch 10 because "Seems alright to me." and
add Valentin's comments to the changelog which is integral.

[1]: 
https://lore.kernel.org/r/ff62e3ee994efb3620177bf7b19fab16f4866845.ca...@redhat.com
[V1 patcheset]: 
https://lore.kernel.org/lkml/20201214155457.3430-1-jiangshan...@gmail.com/

Cc: Hillf Danton 
Cc: Valentin Schneider 
Cc: Qian Cai 
Cc: Peter Zijlstra 
Cc: Vincent Donnefort 
Cc: Tejun Heo 

Lai Jiangshan (10):
  workqueue: restore unbound_workers' cpumask correctly
  workqueue: use cpu_possible_mask instead of cpu_active_mask to break
affinity
  workqueue: Manually break affinity on pool detachment
  workqueue: don't set the worker's cpumask when kthread_bind_mask()
  workqueue: introduce wq_online_cpumask
  workqueue: use wq_online_cpumask in restore_unbound_workers_cpumask()
  workqueue: Manually break affinity on hotplug for unbound pool
  workqueue: reorganize workqueue_online_cpu()
  workqueue: reorganize workqueue_offline_cpu() unbind_workers()
  workqueue: Fix affinity of kworkers when attaching into pool

 kernel/workqueue.c | 214 -
 1 file changed, 132 insertions(+), 82 deletions(-)

-- 
2.19.1.6.gb485710b



[PATCH -tip V2 01/10] workqueue: restore unbound_workers' cpumask correctly

2020-12-18 Thread Lai Jiangshan
From: Lai Jiangshan 

When we restore the workers' cpumask, we should restore it to the
designated pool->attrs->cpumask.  And we need to do it only the first
time a CPU of the pool comes online.

Cc: Hillf Danton 
Reported-by: Hillf Danton 
Acked-by: Tejun Heo 
Signed-off-by: Lai Jiangshan 
---
 kernel/workqueue.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/workqueue.c b/kernel/workqueue.c
index c71da2a59e12..aba71ab359dd 100644
--- a/kernel/workqueue.c
+++ b/kernel/workqueue.c
@@ -5031,9 +5031,13 @@ static void restore_unbound_workers_cpumask(struct 
worker_pool *pool, int cpu)
 
cpumask_and(&cpumask, pool->attrs->cpumask, cpu_online_mask);
 
+   /* is @cpu the first one onlined for the @pool? */
+   if (cpumask_weight(&cpumask) > 1)
+   return;
+
/* as we're called from CPU_ONLINE, the following shouldn't fail */
for_each_pool_worker(worker, pool)
-   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, &cpumask) < 0);
+   WARN_ON_ONCE(set_cpus_allowed_ptr(worker->task, 
pool->attrs->cpumask) < 0);
 }
 
 int workqueue_prepare_cpu(unsigned int cpu)
-- 
2.19.1.6.gb485710b



Re: [PATCH V2 1/3] x86/mm/pti: handle unaligned address for pmd clone in pti_clone_pagetable()

2020-12-18 Thread Lai Jiangshan
Hello, Dave Hansen

Could you help review the patches, please?

I think they follow your suggestion, except for forcing alignment in the
caller.  The reason is explained in the code.

Thanks
Lai

On Thu, Dec 10, 2020 at 9:34 PM Lai Jiangshan  wrote:
>
> From: Lai Jiangshan 
>
> The commit 825d0b73cd752 ("x86/mm/pti: Handle unaligned address gracefully
> in pti_clone_pagetable()") handles unaligned addresses well for unmapped
> PUD/PMD etc.  But an unaligned address for a mapped pmd also needs to
> be handled.
>
> For a mapped pmd, if @addr is not aligned to PMD_SIZE, then with the
> current logic the next pmd (for PTI_CLONE_PMD, or when the next pmd is
> large) or the last ptes in the next pmd (for PTI_CLONE_PTE) will not
> be cloned when @end < @addr + PMD_SIZE.
>
> It is not a good idea to force alignment in the caller due to one of
> the cases (see the comments in the code), so it just handles the alignment
> in pti_clone_pagetable().
>
> Signed-off-by: Lai Jiangshan 
> ---
>  arch/x86/mm/pti.c | 15 +++
>  1 file changed, 15 insertions(+)
>
> diff --git a/arch/x86/mm/pti.c b/arch/x86/mm/pti.c
> index 1aab92930569..7ee99ef13a99 100644
> --- a/arch/x86/mm/pti.c
> +++ b/arch/x86/mm/pti.c
> @@ -342,6 +342,21 @@ pti_clone_pgtable(unsigned long start, unsigned long end,
> }
>
> if (pmd_large(*pmd) || level == PTI_CLONE_PMD) {
> +   /*
> +* pti_clone_kernel_text() might be called with
> +* @start not aligned to PMD_SIZE. We need to make
> +* it aligned, otherwise the next pmd or last ptes
> +* are not cloned when @end < @addr + PMD_SIZE.
> +*
> +* We can't force pti_clone_kernel_text() to align
> +* the @addr to PMD_SIZE when level == PTI_CLONE_PTE.
> +* But the problem can still possible exist when the
> +* first pmd is large. And it is not a good idea to
> +* check whether the first pmd is large or not in the
> +* caller, so we just simply align it here.
> +*/
> +   addr = round_down(addr, PMD_SIZE);
> +
> target_pmd = pti_user_pagetable_walk_pmd(addr);
> if (WARN_ON(!target_pmd))
> return;
> --
> 2.19.1.6.gb485710b
>
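
A small worked example of the unaligned case described in the changelog
above (userspace sketch, not kernel code; the addresses and the 2 MiB
PMD_SIZE are assumptions for illustration, and the loop is modeled as
advancing by PMD_SIZE from the unaligned @addr, as the changelog says):

	#include <stdio.h>
	#include <stdint.h>

	#define PMD_SIZE		(2UL << 20)	/* assume 2 MiB large pages */
	#define round_down(x, a)	((x) & ~((uint64_t)(a) - 1))

	int main(void)
	{
		uint64_t addr = 0xffffffff811f0000ULL;	/* inside the pmd at ...81000000 */
		uint64_t end  = 0xffffffff81210000ULL;	/* reaches into the pmd at ...81200000 */

		/* old logic: clone the pmd containing addr, then step past end */
		uint64_t next = addr + PMD_SIZE;	/* 0xffffffff813f0000 >= end, loop stops */
		printf("old: next=0x%jx >= end=0x%jx, second pmd never cloned\n",
		       (uintmax_t)next, (uintmax_t)end);

		/* fixed logic: align addr first, so the second pmd is still visited */
		next = round_down(addr, PMD_SIZE) + PMD_SIZE;	/* 0xffffffff81200000 < end */
		printf("new: next=0x%jx <  end=0x%jx, second pmd cloned too\n",
		       (uintmax_t)next, (uintmax_t)end);
		return 0;
	}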


[PATCH V3] kvm: check tlbs_dirty directly

2020-12-17 Thread Lai Jiangshan
From: Lai Jiangshan 

In kvm_mmu_notifier_invalidate_range_start(), tlbs_dirty is used as:
need_tlb_flush |= kvm->tlbs_dirty;
with need_tlb_flush's type being int and tlbs_dirty's type being long.

It means that tlbs_dirty is always used as an int and the higher 32 bits
are useless.  We need to check tlbs_dirty in a correct way, and this
change checks it directly without propagating it to need_tlb_flush.

Note: it's _extremely_ unlikely this neglecting of higher 32 bits can
cause problems in practice.  It would require encountering tlbs_dirty
on a 4 billion count boundary, and KVM would need to be using shadow
paging or be running a nested guest.
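
For readers, a tiny userspace sketch of the truncation described above
(not KVM code; it assumes a 64-bit build where long is 64 bits and int
is 32 bits, and the value is chosen to sit exactly on such a boundary):

	#include <stdio.h>

	int main(void)
	{
		long tlbs_dirty = 1L << 32;	/* stands in for kvm->tlbs_dirty */
		int need_tlb_flush = 0;		/* as in the old code */

		/* the old "need_tlb_flush |= kvm->tlbs_dirty;" keeps only the low 32 bits */
		need_tlb_flush |= tlbs_dirty;
		printf("old check sees flush needed: %d\n", need_tlb_flush != 0);	/* 0 */

		/* the fixed check looks at the long directly */
		printf("new check sees flush needed: %d\n",
		       need_tlb_flush || tlbs_dirty);				/* 1 */
		return 0;
	}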

Cc: sta...@vger.kernel.org
Fixes: a4ee1ca4a36e ("KVM: MMU: delay flush all tlbs on sync_page path")
Signed-off-by: Lai Jiangshan 
---
Changed from V1:
Update the patch and the changelog as Sean Christopherson suggested.

Changed from v2:
don't change the type of need_tlb_flush

 virt/kvm/kvm_main.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/virt/kvm/kvm_main.c b/virt/kvm/kvm_main.c
index 2541a17ff1c4..3083fb53861d 100644
--- a/virt/kvm/kvm_main.c
+++ b/virt/kvm/kvm_main.c
@@ -482,9 +482,8 @@ static int kvm_mmu_notifier_invalidate_range_start(struct 
mmu_notifier *mn,
kvm->mmu_notifier_count++;
need_tlb_flush = kvm_unmap_hva_range(kvm, range->start, range->end,
 range->flags);
-   need_tlb_flush |= kvm->tlbs_dirty;
/* we've to flush the tlb before the pages can be freed */
-   if (need_tlb_flush)
+   if (need_tlb_flush || kvm->tlbs_dirty)
kvm_flush_remote_tlbs(kvm);
 
spin_unlock(&kvm->mmu_lock);
-- 
2.19.1.6.gb485710b



[PATCH 2/2] selftest: parse the max cpu correctly from cpu list string

2020-12-17 Thread Lai Jiangshan
From: Lai Jiangshan 

"," is allowed in cpu list strings, such as "0-3,5".  We need
to handle these cases.

Signed-off-by: Lai Jiangshan 
---
 tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh 
b/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
index 5cdef96326a7..ac37bd54ea1c 100755
--- a/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
+++ b/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
@@ -30,7 +30,7 @@ prerequisite()
 
echo "CPU online/offline summary:"
online_cpus=`cat $SYSFS/devices/system/cpu/online`
-   online_max=${online_cpus##*-}
+   online_max=${online_cpus##*[-|,]}
 
if [[ "$online_cpus" = "$online_max" ]]; then
echo "$msg: since there is only one cpu: $online_cpus"
@@ -38,7 +38,7 @@ prerequisite()
fi
 
present_cpus=`cat $SYSFS/devices/system/cpu/present`
-   present_max=${present_cpus##*-}
+   present_max=${present_cpus##*[-|,]}
echo "present_cpus = $present_cpus present_max = $present_max"
 
echo -e "\t Cpus in online state: $online_cpus"
@@ -47,7 +47,7 @@ prerequisite()
if [[ "a$offline_cpus" = "a" ]]; then
offline_cpus=0
else
-   offline_max=${offline_cpus##*-}
+   offline_max=${offline_cpus##*[-|,]}
fi
echo -e "\t Cpus in offline state: $offline_cpus"
 }
-- 
2.19.1.6.gb485710b



[PATCH 1/2] selftest: don't offline the last CPU in cpu hotplug test

2020-12-17 Thread Lai Jiangshan
From: Lai Jiangshan 

On my box, all CPUs are allowed to be offlined.  The test tries to offline
all offline-able CPUs and fails on the last one.  We should just
skip offlining the last CPU.

Signed-off-by: Lai Jiangshan 
---
 tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh | 5 +
 1 file changed, 5 insertions(+)

diff --git a/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh 
b/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
index 0d26b5e3f966..5cdef96326a7 100755
--- a/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
+++ b/tools/testing/selftests/cpu-hotplug/cpu-on-off-test.sh
@@ -126,6 +126,11 @@ offline_cpu_expect_success()
 {
local cpu=$1
 
+   # don't offline the last CPU if all CPUs are offline-able
+   if [[ a$cpu = a`cat $SYSFS/devices/system/cpu/online` ]]; then
+   return
+   fi
+
if ! offline_cpu $cpu; then
echo $FUNCNAME $cpu: unexpected fail >&2
exit 1
-- 
2.19.1.6.gb485710b


