Re: [PATCH v2 08/10] PCI: layerscape: Add EP mode support for ls1088a and ls2088a

2019-08-23 Thread christophe leroy




On 24/08/2019 at 02:18, Xiaowei Bao wrote:




-Original Message-
From: Andrew Murray 
Sent: 2019年8月23日 22:28
To: Xiaowei Bao 
Cc: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
shawn...@kernel.org; Leo Li ; kis...@ti.com;
lorenzo.pieral...@arm.co; a...@arndb.de; gre...@linuxfoundation.org; M.h.
Lian ; Mingkai Hu ; Roy
Zang ; jingooh...@gmail.com;
gustavo.pimen...@synopsys.com; linux-...@vger.kernel.org;
devicet...@vger.kernel.org; linux-kernel@vger.kernel.org;
linux-arm-ker...@lists.infradead.org; linuxppc-...@lists.ozlabs.org
Subject: Re: [PATCH v2 08/10] PCI: layerscape: Add EP mode support for
ls1088a and ls2088a

On Thu, Aug 22, 2019 at 07:22:40PM +0800, Xiaowei Bao wrote:

Add PCIe EP mode support for ls1088a and ls2088a. There are some
differences between the LS1 and LS2 platforms, so refactor the code of
the EP driver.

Signed-off-by: Xiaowei Bao 
---
v2:
  - New mechanism for layerscape EP driver.


Was there a v1 of this patch?


Yes, but I don't know how to comment on it, ^_^


As far as I can see, in the previous version of the series 
(https://patchwork.ozlabs.org/project/linuxppc-dev/list/?series=125315&state=*), 
the 8/10 was something completely different, and I can't find any other 
patch in the series that could have been the v1 of this patch.


Christophe







  drivers/pci/controller/dwc/pci-layerscape-ep.c | 76 ++++++++++++++++------
  1 file changed, 58 insertions(+), 18 deletions(-)

diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c
b/drivers/pci/controller/dwc/pci-layerscape-ep.c
index 7ca5fe8..2a66f07 100644
--- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
+++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
@@ -20,27 +20,29 @@

  #define PCIE_DBI2_OFFSET  0x1000  /* DBI2 base address*/

-struct ls_pcie_ep {
-   struct dw_pcie  *pci;
-   struct pci_epc_features *ls_epc;
+#define to_ls_pcie_ep(x)   dev_get_drvdata((x)->dev)
+
+struct ls_pcie_ep_drvdata {
+   u32 func_offset;
+   const struct dw_pcie_ep_ops *ops;
+   const struct dw_pcie_ops*dw_pcie_ops;
  };

-#define to_ls_pcie_ep(x)   dev_get_drvdata((x)->dev)
+struct ls_pcie_ep {
+   struct dw_pcie  *pci;
+   struct pci_epc_features *ls_epc;
+   const struct ls_pcie_ep_drvdata *drvdata;
+};

  static int ls_pcie_establish_link(struct dw_pcie *pci)
  {
return 0;
  }

-static const struct dw_pcie_ops ls_pcie_ep_ops = {
+static const struct dw_pcie_ops dw_ls_pcie_ep_ops = {
   .start_link = ls_pcie_establish_link,
  };

-static const struct of_device_id ls_pcie_ep_of_match[] = {
-   { .compatible = "fsl,ls-pcie-ep",},
-   { },
-};
-
  static const struct pci_epc_features*
  ls_pcie_ep_get_features(struct dw_pcie_ep *ep)
  {
@@ -82,10 +84,44 @@ static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
    }
  }

-static const struct dw_pcie_ep_ops pcie_ep_ops = {
+static unsigned int ls_pcie_ep_func_conf_select(struct dw_pcie_ep *ep,
+   u8 func_no)
+{
+   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
+   struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);
+   u8 header_type;
+
+   header_type = ioread8(pci->dbi_base + PCI_HEADER_TYPE);
+
+   if (header_type & (1 << 7))
+   return pcie->drvdata->func_offset * func_no;
+   else
+   return 0;


It looks like there isn't a PCI define for multi-function devices; the
nearest I could find was PCI_HEADER_TYPE_MULTIDEVICE in hotplug/ibmphp.h.
A comment above the test might be helpful to explain it.


Yes, I did not find the PCI_HEADER_TYPE_MULTIDEVICE define either. OK, I will
add the comment in the next version of the patch.



As the ls_pcie_ep_drvdata structures are static, the unset .func_offset will be
initialised to 0, so you could just drop the test above.


OK, thanks



However something to the effect of the following may help spot
misconfiguration:

	WARN_ON(func_no && !pcie->drvdata->func_offset);
	return pcie->drvdata->func_offset * func_no;


Thanks a lot, this looks better.



The WARN is probably quite useful: if you are attempting to use non-zero
functions and func_offset isn't set, then things may appear to work normally
but will actually break horribly.


got it, thanks.
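
Putting the two suggestions together, the callback might end up looking
something like this (just a sketch; the comment wording and the exact
WARN placement are illustrative, not the final patch):

static unsigned int ls_pcie_ep_func_conf_select(struct dw_pcie_ep *ep,
						u8 func_no)
{
	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
	struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);

	/*
	 * Bit 7 of the Header Type register indicates a multi-function
	 * device; a per-function DBI offset only applies in that case.
	 */
	if (ioread8(pci->dbi_base + PCI_HEADER_TYPE) & 0x80) {
		WARN_ON(func_no && !pcie->drvdata->func_offset);
		return pcie->drvdata->func_offset * func_no;
	}

	return 0;
}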



Thanks,

Andrew Murray


+}
+
+static const struct dw_pcie_ep_ops ls_pcie_ep_ops = {
.ep_init = ls_pcie_ep_init,
.raise_irq = ls_pcie_ep_raise_irq,
.get_features = ls_pcie_ep_get_features,
+   .func_conf_select = ls_pcie_ep_func_conf_select,
+};
+
+static const struct ls_pcie_ep_drvdata ls1_ep_drvdata = {
+   .ops = &ls_pcie_ep_ops,
+   .dw_pcie_ops = &dw_ls_pcie_ep_ops,
+};
+
+static const struct ls_pcie_ep_drvdata ls2_ep_drvdata = {
+   .func_offset = 0x2,
+   .ops = &ls_pcie_ep_ops,
+   .dw_pcie_ops = &dw_ls_pcie_ep_ops,
+};
+
+static const struct of_device_id ls_pcie_ep_of_match[] = {
+   

Re: [PATCH v3 2/4] clk: qcom: clk-rpmh: Convert to parent data scheme

2019-08-23 Thread Bjorn Andersson
On Thu 22 Aug 10:01 PDT 2019, Vinod Koul wrote:

> Convert the rpmh clock driver to use the new parent data scheme by
> specifying the parent data for board clock.
> 
> Signed-off-by: Vinod Koul 
> ---
>  drivers/clk/qcom/clk-rpmh.c | 10 --
>  1 file changed, 8 insertions(+), 2 deletions(-)
> 
> diff --git a/drivers/clk/qcom/clk-rpmh.c b/drivers/clk/qcom/clk-rpmh.c
> index c3fd632af119..0bced7326a20 100644
> --- a/drivers/clk/qcom/clk-rpmh.c
> +++ b/drivers/clk/qcom/clk-rpmh.c
> @@ -95,7 +95,10 @@ static DEFINE_MUTEX(rpmh_clk_lock);
>   .hw.init = &(struct clk_init_data){ \
>   .ops = &clk_rpmh_ops,   \
>   .name = #_name, \
> - .parent_names = (const char *[]){ "xo_board" }, \
> + .parent_data =  &(const struct clk_parent_data){ \
> + .fw_name = "xo_board",  \
> + .name = "xo_board", \

IIUC, .name here refers to the global clock namespace and .fw_name refers
to the device_node-local namespace. As such I really prefer this to be:

  .fw_name = "xo",
  .name = "xo_board",

This ensures the backwards compatibility (when using global lookup),
without complicating the node-local naming.
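
Concretely, the hunk would then read (a sketch of this suggestion):

+		.parent_data =  &(const struct clk_parent_data){ \
+			.fw_name = "xo",	\
+			.name = "xo_board",	\
+		},	\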

Regards,
Bjorn

> + },  \
>   .num_parents = 1,   \
>   },  \
>   };  \
> @@ -110,7 +113,10 @@ static DEFINE_MUTEX(rpmh_clk_lock);
>   .hw.init = &(struct clk_init_data){ \
>   .ops = &clk_rpmh_ops,   \
>   .name = #_name_active,  \
> - .parent_names = (const char *[]){ "xo_board" }, \
> + .parent_data =  &(const struct clk_parent_data){ \
> + .fw_name = "xo_board",  \
> + .name = "xo_board", \
> + },  \
>   .num_parents = 1,   \
>   },  \
>   }
> -- 
> 2.20.1
> 


Re: [PATCH] mailmap: Add Simon Arlott (replacement for expired email address)

2019-08-23 Thread Stefan Wahren
Am 22.08.19 um 21:01 schrieb Simon Arlott:
> Add replacement email address for the one on my expired domain.
>
> Signed-off-by: Simon Arlott 
I have already sent out my pull requests. Can anyone else take care of this?


[RFC PATCH v2 1/3] x86/mm/tlb: Change __flush_tlb_one_user interface

2019-08-23 Thread Nadav Amit
__flush_tlb_one_user() currently flushes a single entry, and flushes it
both in the kernel and user page-tables, when PTI is enabled.

Change __flush_tlb_one_user() and related interfaces into
__flush_tlb_range() that flushes a range and does not flush the user
page-table.

This refactoring is needed for the next patch, but it makes sense
regardless and has several advantages. First, only Xen-PV, which does not
use PTI, implements the paravirtual interface of flush_tlb_one_user(), so
nothing is broken by separating the user and kernel page-table flushes,
and the interface is more intuitive.

Second, INVLPG can flush unrelated mappings, and it is also a
serializing instruction. It is better to have a tight loop that flushes
the entries.

Third, currently __flush_tlb_one_kernel() also flushes the user
page-tables, which is not needed. This change allows that redundant flush
to be avoided.

Cc: Boris Ostrovsky 
Cc: Juergen Gross 
Cc: Stefano Stabellini 
Cc: xen-de...@lists.xenproject.org
Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/paravirt.h   |  5 ++--
 arch/x86/include/asm/paravirt_types.h |  3 ++-
 arch/x86/include/asm/tlbflush.h   | 24 +
 arch/x86/kernel/paravirt.c|  7 ++---
 arch/x86/mm/tlb.c | 39 ++-
 arch/x86/xen/mmu_pv.c | 21 +--
 6 files changed, 62 insertions(+), 37 deletions(-)

diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index bc4829c9b3f9..7347328eacd3 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -57,9 +57,10 @@ static inline void __flush_tlb_global(void)
PVOP_VCALL0(mmu.flush_tlb_kernel);
 }
 
-static inline void __flush_tlb_one_user(unsigned long addr)
+static inline void __flush_tlb_range(unsigned long start, unsigned long end,
+u8 stride_shift)
 {
-   PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
+   PVOP_VCALL3(mmu.flush_tlb_range, start, end, stride_shift);
 }
 
 static inline void flush_tlb_multi(const struct cpumask *cpumask,
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 63fa751344bf..a87a5f236251 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -205,7 +205,8 @@ struct pv_mmu_ops {
/* TLB operations */
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
-   void (*flush_tlb_one_user)(unsigned long addr);
+   void (*flush_tlb_range)(unsigned long start, unsigned long end,
+   u8 stride_shift);
void (*flush_tlb_multi)(const struct cpumask *cpus,
const struct flush_tlb_info *info);
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 1f88ea410ff3..421bc82504e2 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -145,7 +145,7 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
 #else
 #define __flush_tlb() __native_flush_tlb()
 #define __flush_tlb_global() __native_flush_tlb_global()
-#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
+#define __flush_tlb_range(start, end, stride_shift) __native_flush_tlb_range(start, end, stride_shift)
 #endif
 
 struct tlb_context {
@@ -454,23 +454,13 @@ static inline void __native_flush_tlb_global(void)
 /*
  * flush one page in the user mapping
  */
-static inline void __native_flush_tlb_one_user(unsigned long addr)
+static inline void __native_flush_tlb_range(unsigned long start,
+   unsigned long end, u8 stride_shift)
 {
-   u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
+   unsigned long addr;
 
-   asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
-
-   if (!static_cpu_has(X86_FEATURE_PTI))
-   return;
-
-   /*
-* Some platforms #GP if we call invpcid(type=1/2) before CR4.PCIDE=1.
-* Just use invalidate_user_asid() in case we are called early.
-*/
-   if (!this_cpu_has(X86_FEATURE_INVPCID_SINGLE))
-   invalidate_user_asid(loaded_mm_asid);
-   else
-   invpcid_flush_one(user_pcid(loaded_mm_asid), addr);
+   for (addr = start; addr < end; addr += 1ul << stride_shift)
+   asm volatile("invlpg (%0)" ::"r" (addr) : "memory");
 }
 
 /*
@@ -512,7 +502,7 @@ static inline void __flush_tlb_one_kernel(unsigned long addr)
 * kernel address space and for its usermode counterpart, but it does
 * not flush it for other address spaces.
 */
-   __flush_tlb_one_user(addr);
+   __flush_tlb_range(addr, addr + PAGE_SIZE, PAGE_SHIFT);
 
if (!static_cpu_has(X86_FEATURE_PTI))
return;
diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c
index 5520a04c84ba..195f5577d0d5 100644
--- a/arch/x86/kernel/paravirt.c
+++ b/arch/x

[RFC PATCH v2 3/3] x86/mm/tlb: Avoid deferring PTI flushes on shootdown

2019-08-23 Thread Nadav Amit
When a shootdown is initiated, the initiating CPU has cycles to burn as
it waits for the responding CPUs to receive the IPI and acknowledge it.
In these cycles it is better to flush the user page-tables using
INVPCID, instead of deferring the TLB flush.

The best way to figure out whether there are cycles to burn is arguably
to expose from the SMP layer when an acknowledgment is received.
However, this would break some abstractions.

Instead, use a simpler solution: the initiating CPU of a TLB shootdown
would not defer PTI flushes. It is not always a win relative to deferring
user page-table flushes, but it prevents performance regression.

Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/tlbflush.h |  1 +
 arch/x86/mm/tlb.c   | 10 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index da56aa3ccd07..066b3804f876 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -573,6 +573,7 @@ struct flush_tlb_info {
unsigned intinitiating_cpu;
u8  stride_shift;
u8  freed_tables;
+   u8  shootdown;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 31260c55d597..ba50430275d4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -592,8 +592,13 @@ static void flush_tlb_user_pt_range(u16 asid, const struct flush_tlb_info *f)
 
/*
 * We can defer flushes as long as page-tables were not freed.
+*
+* However, if there is a shootdown the initiating CPU has cycles to
+* spare, while it waits for the other cores to respond. In this case,
+* deferring the flushing can cause overheads, so avoid it.
 */
-   if (IS_ENABLED(CONFIG_X86_64) && !f->freed_tables) {
+   if (IS_ENABLED(CONFIG_X86_64) && !f->freed_tables &&
+   (!f->shootdown || f->initiating_cpu != smp_processor_id())) {
flush_user_tlb_deferred(asid, start, end, stride_shift);
return;
}
@@ -861,6 +866,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables  = freed_tables;
info->new_tlb_gen   = new_tlb_gen;
info->initiating_cpu= smp_processor_id();
+   info->shootdown = false;
 
return info;
 }
@@ -903,6 +909,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 * flush_tlb_func_local() directly in this case.
 */
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+   info->shootdown = true;
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
@@ -970,6 +977,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 * flush_tlb_func_local() directly in this case.
 */
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+   info->shootdown = true;
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
-- 
2.17.1



[RFC PATCH v2 2/3] x86/mm/tlb: Defer PTI flushes

2019-08-23 Thread Nadav Amit
INVPCID is considerably slower than INVLPG of a single PTE. Using it to
flush the user page-tables when PTI is enabled therefore introduces
significant overhead.

Instead, unless page-tables are released, it is possible to defer the
flushing of the user page-tables until the time the code returns to
userspace. These page tables are not in use, so deferring them is not a
security hazard. When CR3 is loaded, as part of returning to userspace,
use INVLPG to flush the relevant PTEs. Use LFENCE to prevent speculative
executions that skip INVLPG.

There are some caveats, which sometimes require a full TLB flush of the
user page-tables. There are some (uncommon) code-paths that reload CR3 in
which there is no stack. If a context-switch happens and there are pending
flushes, tracking which TLB flushes are later needed is complicated and
expensive. If there are multiple TLB flushes of different ranges before
the kernel returns to userspace, the overhead of tracking them can exceed
the benefit.

In these cases, perform a full TLB flush. It is possible to avoid them
in some cases, but the benefit in doing so is questionable.
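
The per-CPU bookkeeping roughly amounts to the following (a sketch built
on the tlb_state fields added below; the helper name is made up and this
is not the patch itself):

/* sketch: record one deferred user page-table flush per CPU */
static void defer_user_flush(unsigned long start, unsigned long end,
			     u8 stride_shift)
{
	struct tlb_state *s = this_cpu_ptr(&cpu_tlbstate);

	if (!s->user_flush_end) {
		/* nothing pending: remember the range for the CR3 switch */
		s->user_flush_start = start;
		s->user_flush_end = end;
		s->user_flush_stride_shift = stride_shift;
	} else {
		/* a second pending range: tracking it would cost more than
		 * it saves, so degrade to a full user page-table flush */
		s->user_flush_end = TLB_FLUSH_ALL;
	}
}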

Signed-off-by: Nadav Amit 
---
 arch/x86/entry/calling.h| 52 ++--
 arch/x86/include/asm/tlbflush.h | 30 +++---
 arch/x86/kernel/asm-offsets.c   |  3 ++
 arch/x86/mm/tlb.c   | 70 +
 4 files changed, 147 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 515c0ceeb4a3..a4d46416853d 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
 
@@ -205,7 +206,16 @@ For 32-bit we have the following conventions - kernel is built with
 #define THIS_CPU_user_pcid_flush_mask   \
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
 
-.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+#define THIS_CPU_user_flush_start  \
+   PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_flush_start
+
+#define THIS_CPU_user_flush_end\
+   PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_flush_end
+
+#define THIS_CPU_user_flush_stride_shift   \
+   PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_flush_stride_shift
+
+.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req has_stack:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
 
@@ -221,9 +231,41 @@ For 32-bit we have the following conventions - kernel is built with
 
/* Flush needed, clear the bit */
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+.if \has_stack
+   cmpq    $(TLB_FLUSH_ALL), THIS_CPU_user_flush_end
+   jnz .Lpartial_flush_\@
+.Ldo_full_flush_\@:
+.endif
	movq    \scratch_reg2, \scratch_reg
jmp .Lwrcr3_pcid_\@
-
+.if \has_stack
+.Lpartial_flush_\@:
+   /* Prepare CR3 with PGD of user, and no flush set */
+   orq $(PTI_USER_PGTABLE_AND_PCID_MASK), \scratch_reg2
+   SET_NOFLUSH_BIT \scratch_reg2
+   pushq   %rsi
+   pushq   %rbx
+   pushq   %rcx
+   movb    THIS_CPU_user_flush_stride_shift, %cl
+   movq    $1, %rbx
+   shl     %cl, %rbx
+   movq    THIS_CPU_user_flush_start, %rsi
+   movq    THIS_CPU_user_flush_end, %rcx
+   /* Load the new cr3 and flush */
+   mov     \scratch_reg2, %cr3
+.Lflush_loop_\@:
+   invlpg  (%rsi)
+   addq    %rbx, %rsi
+   cmpq    %rsi, %rcx
+   ja      .Lflush_loop_\@
+   /* Prevent speculatively skipping flushes */
+   lfence
+
+   popq    %rcx
+   popq    %rbx
+   popq    %rsi
+   jmp .Lend_\@
+.endif
 .Lnoflush_\@:
	movq    \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg
@@ -239,9 +281,13 @@ For 32-bit we have the following conventions - kernel is built with
 .Lend_\@:
 .endm
 
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+   SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax has_stack=0
+.endm
+
 .macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
pushq   %rax
-   SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+   SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax 
has_stack=1
popq%rax
 .endm
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 421bc82504e2..da56aa3ccd07 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -2,6 +2,10 @@
 #ifndef _ASM_X86_TLBFLUSH_H
 #define _ASM_X86_TLBFLUSH_H
 
+#define TLB_FLUSH_ALL  -1UL
+
+#ifndef __ASSEMBLY__
+
 #include 
 #include 
 
@@ -222,6 +226,10 @@ struct tlb_state {
 * context 0.
 */
struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
+
+   unsigned long user_flush_start;
+   unsigned long user_flush_end;
+   unsigned long user_flush_stride_shift;
 };
 DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ 

[RFC PATCH v2 0/3] x86/mm/tlb: Defer TLB flushes with PTI

2019-08-23 Thread Nadav Amit
INVPCID is considerably slower than INVLPG of a single PTE, but it is
currently used to flush PTEs in the user page-table when PTI is used.

Instead, it is possible to defer TLB flushes until after the user
page-tables are loaded. Preventing speculation over the TLB flushes
should keep the whole thing safe. In some cases, deferring TLB flushes
in such a way can result in more full TLB flushes, but arguably this
behavior is oftentimes beneficial.

These patches are based and evaluated on top of the concurrent
TLB-flushes v4 patch-set.

I will provide more results later, but it might be easier to look at the
time an isolated TLB flush takes. These numbers are from Skylake, showing
the number of cycles that running madvise(DONTNEED), which results in
local TLB flushes, takes:

n_pages  concurrent  +deferred-pti  change
-------  ----------  -------------  ------
      1        2119           1986   -6.7%
     10        6791           5417    -20%

Please let me know if I missed something that affects security or
performance.

[ Yes, I know there is another pending RFC for async TLB flushes, but I
  think it might be easier to merge this one first ]

RFC v1 -> RFC v2:
  * Wrong patches were sent before

Nadav Amit (3):
  x86/mm/tlb: Change __flush_tlb_one_user interface
  x86/mm/tlb: Defer PTI flushes
  x86/mm/tlb: Avoid deferring PTI flushes on shootdown

 arch/x86/entry/calling.h  |  52 +++-
 arch/x86/include/asm/paravirt.h   |   5 +-
 arch/x86/include/asm/paravirt_types.h |   3 +-
 arch/x86/include/asm/tlbflush.h   |  55 +++-
 arch/x86/kernel/asm-offsets.c |   3 +
 arch/x86/kernel/paravirt.c|   7 +-
 arch/x86/mm/tlb.c | 117 --
 arch/x86/xen/mmu_pv.c |  21 +++--
 8 files changed, 218 insertions(+), 45 deletions(-)

-- 
2.17.1



Re: [RFC PATCH 0/3] x86/mm/tlb: Defer TLB flushes with PTI

2019-08-23 Thread Nadav Amit
Sorry, I made a mistake and included the wrong patches. I will send
RFC v2 in a few minutes.


> On Aug 23, 2019, at 3:46 PM, Nadav Amit  wrote:
> 
> INVPCID is considerably slower than INVLPG of a single PTE, but it is
> currently used to flush PTEs in the user page-table when PTI is used.
> 
> Instead, it is possible to defer TLB flushes until after the user
> page-tables are loaded. Preventing speculation over the TLB flushes
> should keep the whole thing safe. In some cases, deferring TLB flushes
> in such a way can result in more full TLB flushes, but arguably this
> behavior is oftentimes beneficial.
> 
> These patches are based and evaluated on top of the concurrent
> TLB-flushes v4 patch-set.
> 
> I will provide more results later, but it might be easier to look at the
> time an isolated TLB flush takes. These numbers are from Skylake, showing
> the number of cycles that running madvise(DONTNEED), which results in
> local TLB flushes, takes:
> 
> n_pages  concurrent  +deferred-pti  change
> -------  ----------  -------------  ------
>       1        2119           1986   -6.7%
>      10        6791           5417    -20%
> 
> Please let me know if I missed something that affects security or
> performance.
> 
> [ Yes, I know there is another pending RFC for async TLB flushes, but I
>  think it might be easier to merge this one first ]
> 
> Nadav Amit (3):
>  x86/mm/tlb: Defer PTI flushes
>  x86/mm/tlb: Avoid deferring PTI flushes on shootdown
>  x86/mm/tlb: Use lockdep irq assertions
> 
> arch/x86/entry/calling.h| 52 +++--
> arch/x86/include/asm/tlbflush.h | 31 ++--
> arch/x86/kernel/asm-offsets.c   |  3 ++
> arch/x86/mm/tlb.c   | 83 +++--
> 4 files changed, 158 insertions(+), 11 deletions(-)
> 
> -- 
> 2.17.1




[RFC PATCH 1/3] x86/mm/tlb: Defer PTI flushes

2019-08-23 Thread Nadav Amit
INVPCID is considerably slower than INVLPG of a single PTE. Using it to
flush the user page-tables when PTI is enabled therefore introduces
significant overhead.

Instead, unless page-tables are released, it is possible to defer the
flushing of the user page-tables until the time the code returns to
userspace. These page tables are not in use, so deferring them is not a
security hazard. When CR3 is loaded, as part of returning to userspace,
use INVLPG to flush the relevant PTEs. Use LFENCE to prevent speculative
executions that skip INVLPG.

There are some caveats, which sometimes require a full TLB flush of the
user page-tables. There are some (uncommon) code-paths that reload CR3 in
which there is no stack. If a context-switch happens and there are pending
flushes, tracking which TLB flushes are later needed is complicated and
expensive. If there are multiple TLB flushes of different ranges before
the kernel returns to userspace, the overhead of tracking them can exceed
the benefit.

In these cases, perform a full TLB flush. It is possible to avoid them
in some cases, but the benefit in doing so is questionable.

Signed-off-by: Nadav Amit 
---
 arch/x86/entry/calling.h| 52 ++--
 arch/x86/include/asm/tlbflush.h | 30 +++---
 arch/x86/kernel/asm-offsets.c   |  3 ++
 arch/x86/mm/tlb.c   | 70 +
 4 files changed, 147 insertions(+), 8 deletions(-)

diff --git a/arch/x86/entry/calling.h b/arch/x86/entry/calling.h
index 515c0ceeb4a3..a4d46416853d 100644
--- a/arch/x86/entry/calling.h
+++ b/arch/x86/entry/calling.h
@@ -6,6 +6,7 @@
 #include 
 #include 
 #include 
+#include 
 
 /*
 
@@ -205,7 +206,16 @@ For 32-bit we have the following conventions - kernel is built with
 #define THIS_CPU_user_pcid_flush_mask   \
PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_pcid_flush_mask
 
-.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+#define THIS_CPU_user_flush_start  \
+   PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_flush_start
+
+#define THIS_CPU_user_flush_end\
+   PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_flush_end
+
+#define THIS_CPU_user_flush_stride_shift   \
+   PER_CPU_VAR(cpu_tlbstate) + TLB_STATE_user_flush_stride_shift
+
+.macro SWITCH_TO_USER_CR3 scratch_reg:req scratch_reg2:req has_stack:req
ALTERNATIVE "jmp .Lend_\@", "", X86_FEATURE_PTI
mov %cr3, \scratch_reg
 
@@ -221,9 +231,41 @@ For 32-bit we have the following conventions - kernel is built with
 
/* Flush needed, clear the bit */
btr \scratch_reg, THIS_CPU_user_pcid_flush_mask
+.if \has_stack
+   cmpq    $(TLB_FLUSH_ALL), THIS_CPU_user_flush_end
+   jnz .Lpartial_flush_\@
+.Ldo_full_flush_\@:
+.endif
	movq    \scratch_reg2, \scratch_reg
jmp .Lwrcr3_pcid_\@
-
+.if \has_stack
+.Lpartial_flush_\@:
+   /* Prepare CR3 with PGD of user, and no flush set */
+   orq $(PTI_USER_PGTABLE_AND_PCID_MASK), \scratch_reg2
+   SET_NOFLUSH_BIT \scratch_reg2
+   pushq   %rsi
+   pushq   %rbx
+   pushq   %rcx
+   movb    THIS_CPU_user_flush_stride_shift, %cl
+   movq    $1, %rbx
+   shl     %cl, %rbx
+   movq    THIS_CPU_user_flush_start, %rsi
+   movq    THIS_CPU_user_flush_end, %rcx
+   /* Load the new cr3 and flush */
+   mov     \scratch_reg2, %cr3
+.Lflush_loop_\@:
+   invlpg  (%rsi)
+   addq    %rbx, %rsi
+   cmpq    %rsi, %rcx
+   ja      .Lflush_loop_\@
+   /* Prevent speculatively skipping flushes */
+   lfence
+
+   popq    %rcx
+   popq    %rbx
+   popq    %rsi
+   jmp .Lend_\@
+.endif
 .Lnoflush_\@:
	movq    \scratch_reg2, \scratch_reg
SET_NOFLUSH_BIT \scratch_reg
@@ -239,9 +281,13 @@ For 32-bit we have the following conventions - kernel is built with
 .Lend_\@:
 .endm
 
+.macro SWITCH_TO_USER_CR3_NOSTACK scratch_reg:req scratch_reg2:req
+   SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax has_stack=0
+.endm
+
 .macro SWITCH_TO_USER_CR3_STACK	scratch_reg:req
pushq   %rax
-   SWITCH_TO_USER_CR3_NOSTACK scratch_reg=\scratch_reg scratch_reg2=%rax
+   SWITCH_TO_USER_CR3 scratch_reg=\scratch_reg scratch_reg2=%rax has_stack=1
popq%rax
 .endm
 
diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 421bc82504e2..da56aa3ccd07 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -2,6 +2,10 @@
 #ifndef _ASM_X86_TLBFLUSH_H
 #define _ASM_X86_TLBFLUSH_H
 
+#define TLB_FLUSH_ALL  -1UL
+
+#ifndef __ASSEMBLY__
+
 #include 
 #include 
 
@@ -222,6 +226,10 @@ struct tlb_state {
 * context 0.
 */
struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
+
+   unsigned long user_flush_start;
+   unsigned long user_flush_end;
+   unsigned long user_flush_stride_shift;
 };
 DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);
 
@@ 

[RFC PATCH 2/3] x86/mm/tlb: Avoid deferring PTI flushes on shootdown

2019-08-23 Thread Nadav Amit
When a shootdown is initiated, the initiating CPU has cycles to burn as
it waits for the responding CPUs to receive the IPI and acknowledge it.
In these cycles it is better to flush the user page-tables using
INVPCID, instead of deferring the TLB flush.

The best way to figure out whether there are cycles to burn is arguably
to expose from the SMP layer when an acknowledgment is received.
However, this would break some abstractions.

Instead, use a simpler solution: the initiating CPU of a TLB shootdown
would not defer PTI flushes. It is not always a win relative to deferring
user page-table flushes, but it prevents performance regression.

Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/tlbflush.h |  1 +
 arch/x86/mm/tlb.c   | 10 +-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index da56aa3ccd07..066b3804f876 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -573,6 +573,7 @@ struct flush_tlb_info {
unsigned intinitiating_cpu;
u8  stride_shift;
u8  freed_tables;
+   u8  shootdown;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 31260c55d597..ba50430275d4 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -592,8 +592,13 @@ static void flush_tlb_user_pt_range(u16 asid, const struct flush_tlb_info *f)
 
/*
 * We can defer flushes as long as page-tables were not freed.
+*
+* However, if there is a shootdown the initiating CPU has cycles to
+* spare, while it waits for the other cores to respond. In this case,
+* deferring the flushing can cause overheads, so avoid it.
 */
-   if (IS_ENABLED(CONFIG_X86_64) && !f->freed_tables) {
+   if (IS_ENABLED(CONFIG_X86_64) && !f->freed_tables &&
+   (!f->shootdown || f->initiating_cpu != smp_processor_id())) {
flush_user_tlb_deferred(asid, start, end, stride_shift);
return;
}
@@ -861,6 +866,7 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
info->freed_tables  = freed_tables;
info->new_tlb_gen   = new_tlb_gen;
info->initiating_cpu= smp_processor_id();
+   info->shootdown = false;
 
return info;
 }
@@ -903,6 +909,7 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
 * flush_tlb_func_local() directly in this case.
 */
if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) {
+   info->shootdown = true;
flush_tlb_multi(mm_cpumask(mm), info);
} else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) {
lockdep_assert_irqs_enabled();
@@ -970,6 +977,7 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch)
 * flush_tlb_func_local() directly in this case.
 */
if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) {
+   info->shootdown = true;
flush_tlb_multi(&batch->cpumask, info);
} else if (cpumask_test_cpu(cpu, &batch->cpumask)) {
lockdep_assert_irqs_enabled();
-- 
2.17.1



[RFC PATCH 0/3] x86/mm/tlb: Defer TLB flushes with PTI

2019-08-23 Thread Nadav Amit
INVPCID is considerably slower than INVLPG of a single PTE, but it is
currently used to flush PTEs in the user page-table when PTI is used.

Instead, it is possible to defer TLB flushes until after the user
page-tables are loaded. Preventing speculation over the TLB flushes
should keep the whole thing safe. In some cases, deferring TLB flushes
in such a way can result in more full TLB flushes, but arguably this
behavior is oftentimes beneficial.

These patches are based and evaluated on top of the concurrent
TLB-flushes v4 patch-set.

I will provide more results later, but it might be easier to look at the
time an isolated TLB flush takes. These numbers are from Skylake, showing
the number of cycles that running madvise(DONTNEED), which results in
local TLB flushes, takes:

n_pages  concurrent  +deferred-pti  change
-------  ----------  -------------  ------
      1        2119           1986   -6.7%
     10        6791           5417    -20%

Please let me know if I missed something that affects security or
performance.

[ Yes, I know there is another pending RFC for async TLB flushes, but I
  think it might be easier to merge this one first ]

Nadav Amit (3):
  x86/mm/tlb: Defer PTI flushes
  x86/mm/tlb: Avoid deferring PTI flushes on shootdown
  x86/mm/tlb: Use lockdep irq assertions

 arch/x86/entry/calling.h| 52 +++--
 arch/x86/include/asm/tlbflush.h | 31 ++--
 arch/x86/kernel/asm-offsets.c   |  3 ++
 arch/x86/mm/tlb.c   | 83 +++--
 4 files changed, 158 insertions(+), 11 deletions(-)

-- 
2.17.1



[RFC PATCH 3/3] x86/mm/tlb: Use lockdep irq assertions

2019-08-23 Thread Nadav Amit
The assertions that check whether IRQs are disabled currently depend on
different debug features. Instead, use lockdep_assert_irqs_disabled(),
which is standard, is enabled by the same debug feature, and provides more
information upon failure.

Signed-off-by: Nadav Amit 
---
 arch/x86/mm/tlb.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index ba50430275d4..6f4ce02e2c5b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -293,8 +293,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 */
 
/* We don't want flush_tlb_func() to run concurrently with us. */
-   if (IS_ENABLED(CONFIG_PROVE_LOCKING))
-   WARN_ON_ONCE(!irqs_disabled());
+   lockdep_assert_irqs_disabled();
 
/*
 * Verify that CR3 is what we think it is.  This will catch
@@ -643,7 +642,7 @@ static void flush_tlb_func(void *info)
unsigned long nr_invalidate = 0;
 
/* This code cannot presently handle being reentered. */
-   VM_WARN_ON(!irqs_disabled());
+   lockdep_assert_irqs_disabled();
 
if (!local) {
inc_irq_stat(irq_tlb_count);
-- 
2.17.1



[PATCH 3/7] x86/percpu: Use C for percpu accesses when possible

2019-08-23 Thread Nadav Amit
The percpu code mostly uses inline assembly. Using segment qualifiers
allows C code to be used instead, which enables the compiler to perform
various optimizations (e.g., CSE). For example, in __schedule() the
following two instructions:

  mov    %gs:0x7e5f1eff(%rip),%edx    # 0x10350
  movslq %edx,%rdx

Turn with this patch into:

  movslq %gs:0x7e5f2e6e(%rip),%rax    # 0x10350

In addition, operations that have no guarantee against concurrent
interrupts or preemption, such as __this_cpu_cmpxchg(), can be further
optimized by the compiler when they are implemented in C, for example
in call_timer_fn().
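
As a toy illustration (not from the patch), two back-to-back reads of the
same per-cpu variable can be merged once the access is expressed in C,
since the "__" accessors make no guarantee against concurrency anyway:

/* illustrative: with __seg_gs-qualified C accesses, GCC may CSE the two
 * loads below into a single %gs-relative mov; the inline-assembly
 * implementation forces two reads. demo_counter is a made-up variable. */
DECLARE_PER_CPU(unsigned long, demo_counter);

static unsigned long demo_read_twice(void)
{
	return __this_cpu_read(demo_counter) + __this_cpu_read(demo_counter);
}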

Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/percpu.h | 115 +++---
 1 file changed, 105 insertions(+), 10 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 1fe348884477..13987f9bc82f 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -439,13 +439,88 @@ do {					\
  */
 #define this_cpu_read_stable(var)  percpu_stable_op("mov", var)
 
+#if USE_X86_SEG_SUPPORT
+
+#define __raw_cpu_read(qual, pcp)  \
+({ \
+   *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp));   \
+})
+
+#define __raw_cpu_write(qual, pcp, val)				\
+   do {\
+   *(qual __my_cpu_type(pcp) *)__my_cpu_ptr(&(pcp)) = (val); \
+   } while (0)
+
+/*
+ * Performance-wise, C operations are only more efficient than their inline
+ * assembly counterparts for non-volatile variables (__this_*) and for volatile
+ * loads and stores.
+ *
+ * Since we do not use assembly, we are free to define 64-bit operations
+ * on 32-bit architecture
+ */
+#define __raw_cpu_add(pcp, val)		do { __my_cpu_var(pcp) += (val); } while (0)
+#define __raw_cpu_and(pcp, val)		do { __my_cpu_var(pcp) &= (val); } while (0)
+#define __raw_cpu_or(pcp, val)		do { __my_cpu_var(pcp) |= (val); } while (0)
+#define __raw_cpu_add_return(pcp, val) ({ __my_cpu_var(pcp) += (val); })
+
+#define __raw_cpu_xchg(pcp, val)   \
+({ \
+   typeof(pcp) pxo_ret__ = __my_cpu_var(pcp);  \
+   \
+   __my_cpu_var(pcp) = (val);  \
+   pxo_ret__;  \
+})
+
+#define __raw_cpu_cmpxchg(pcp, oval, nval) \
+({ \
+   __my_cpu_type(pcp) *__p = __my_cpu_ptr(&(pcp)); \
+   \
+   typeof(pcp) __ret = *__p;   \
+   \
+   if (__ret == (oval))\
+   *__p = nval;\
+   __ret;  \
+})
+
+#define raw_cpu_read_1(pcp)		__raw_cpu_read(, pcp)
+#define raw_cpu_read_2(pcp)		__raw_cpu_read(, pcp)
+#define raw_cpu_read_4(pcp)		__raw_cpu_read(, pcp)
+#define raw_cpu_write_1(pcp, val)	__raw_cpu_write(, pcp, val)
+#define raw_cpu_write_2(pcp, val)	__raw_cpu_write(, pcp, val)
+#define raw_cpu_write_4(pcp, val)	__raw_cpu_write(, pcp, val)
+#define raw_cpu_add_1(pcp, val)		__raw_cpu_add(pcp, val)
+#define raw_cpu_add_2(pcp, val)		__raw_cpu_add(pcp, val)
+#define raw_cpu_add_4(pcp, val)		__raw_cpu_add(pcp, val)
+#define raw_cpu_and_1(pcp, val)		__raw_cpu_and(pcp, val)
+#define raw_cpu_and_2(pcp, val)		__raw_cpu_and(pcp, val)
+#define raw_cpu_and_4(pcp, val)		__raw_cpu_and(pcp, val)
+#define raw_cpu_or_1(pcp, val)		__raw_cpu_or(pcp, val)
+#define raw_cpu_or_2(pcp, val)		__raw_cpu_or(pcp, val)
+#define raw_cpu_or_4(pcp, val)		__raw_cpu_or(pcp, val)
+#define raw_cpu_xchg_1(pcp, val)	__raw_cpu_xchg(pcp, val)
+#define raw_cpu_xchg_2(pcp, val)	__raw_cpu_xchg(pcp, val)
+#define raw_cpu_xchg_4(pcp, val)	__raw_cpu_xchg(pcp, val)
+#define raw_cpu_add_return_1(pcp, val)	__raw_cpu_add_return(pcp, val)
+#define raw_cpu_add_return_2(pcp, val)	__raw_cpu_add_return(pcp, val)
+#define raw_cpu_add_return_4(pcp, val)	__raw_cpu_add_return(pcp, val)
+#define raw_cpu_add_return_8(pcp, val)	__raw_cpu_add_return(pcp, val)
+#define raw_cpu_cmpxchg_1(pcp, oval, nval)	__raw_cpu_cmpxchg(pcp, oval, nval)
+#defi

[PATCH 5/7] percpu: Assume preemption is disabled on per_cpu_ptr()

2019-08-23 Thread Nadav Amit
When per_cpu_ptr() is used, the caller should have preemption disabled,
as otherwise the pointer is meaningless. If the user wants an "unstable"
pointer he should call raw_cpu_ptr().

Add an assertion to check that indeed preemption is disabled, and
distinguish between the two cases to allow further, per-arch
optimizations.
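
In usage terms the rule is (a sketch; the variable name is made up):

DEFINE_PER_CPU(int, demo_stat);

static void demo(void)
{
	int *p;

	preempt_disable();
	p = this_cpu_ptr(&demo_stat);	/* ok: preemption is disabled */
	(*p)++;
	preempt_enable();

	p = raw_cpu_ptr(&demo_stat);	/* ok: explicitly asked for an
					 * "unstable" pointer */
}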

Signed-off-by: Nadav Amit 
---
 include/asm-generic/percpu.h | 12 
 include/linux/percpu-defs.h  | 33 -
 2 files changed, 44 insertions(+), 1 deletion(-)

diff --git a/include/asm-generic/percpu.h b/include/asm-generic/percpu.h
index c2de013b2cf4..7853605f4210 100644
--- a/include/asm-generic/percpu.h
+++ b/include/asm-generic/percpu.h
@@ -36,6 +36,14 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define my_cpu_offset __my_cpu_offset
 #endif
 
+/*
+ * Determine the offset of the current active processor when preemption is
+ * disabled. Can be overridden by arch code.
+ */
+#ifndef __raw_my_cpu_offset
+#define __raw_my_cpu_offset __my_cpu_offset
+#endif
+
 /*
  * Arch may define arch_raw_cpu_ptr() to provide more efficient address
  * translations for raw_cpu_ptr().
@@ -44,6 +52,10 @@ extern unsigned long __per_cpu_offset[NR_CPUS];
 #define arch_raw_cpu_ptr(ptr) SHIFT_PERCPU_PTR(ptr, __my_cpu_offset)
 #endif
 
+#ifndef arch_raw_cpu_ptr_preemptable
+#define arch_raw_cpu_ptr_preemptable(ptr) SHIFT_PERCPU_PTR(ptr, __raw_my_cpu_offset)
+
 #ifdef CONFIG_HAVE_SETUP_PER_CPU_AREA
 extern void setup_per_cpu_areas(void);
 #endif
diff --git a/include/linux/percpu-defs.h b/include/linux/percpu-defs.h
index a6fabd865211..13afca8a37e7 100644
--- a/include/linux/percpu-defs.h
+++ b/include/linux/percpu-defs.h
@@ -237,20 +237,51 @@ do {					\
SHIFT_PERCPU_PTR((ptr), per_cpu_offset((cpu))); \
 })
 
+#ifndef arch_raw_cpu_ptr_preemption_disabled
+#define arch_raw_cpu_ptr_preemption_disabled(ptr)  \
+   arch_raw_cpu_ptr(ptr)
+#endif
+
+#define raw_cpu_ptr_preemption_disabled(ptr)   \
+({ \
+   __verify_pcpu_ptr(ptr); \
+   arch_raw_cpu_ptr_preemption_disabled(ptr);  \
+})
+
+/*
+ * If preemption is enabled, we need to read the pointer atomically on
+ * raw_cpu_ptr(). However if it is disabled, we can use the
+ * raw_cpu_ptr_nopreempt(), which is potentially more efficient. Similarly, we
+ * can use the preemption-disabled version if the kernel is non-preemptable or
+ * if voluntary preemption is used.
+ */
+#ifdef CONFIG_PREEMPT
+
 #define raw_cpu_ptr(ptr)   \
 ({ \
__verify_pcpu_ptr(ptr); \
arch_raw_cpu_ptr(ptr);  \
 })
 
+#else
+
+#define raw_cpu_ptr(ptr)   raw_cpu_ptr_preemption_disabled(ptr)
+
+#endif
+
 #ifdef CONFIG_DEBUG_PREEMPT
+/*
+ * Unlike other this_cpu_* operations, this_cpu_ptr() requires that preemption
+ * will be disabled. In contrast, raw_cpu_ptr() does not require that.
+ */
 #define this_cpu_ptr(ptr)  \
 ({ \
+   __this_cpu_preempt_check("ptr");\
__verify_pcpu_ptr(ptr); \
SHIFT_PERCPU_PTR(ptr, my_cpu_offset);   \
 })
 #else
-#define this_cpu_ptr(ptr) raw_cpu_ptr(ptr)
+#define this_cpu_ptr(ptr) raw_cpu_ptr_preemption_disabled(ptr)
 #endif
 
 #else  /* CONFIG_SMP */
-- 
2.17.1



[PATCH 1/7] compiler: Report x86 segment support

2019-08-23 Thread Nadav Amit
GCC v6+ supports x86 segment qualifiers (__seg_gs and __seg_fs). Define
COMPILER_HAS_X86_SEG_SUPPORT when it is supported.

Signed-off-by: Nadav Amit 
---
 include/linux/compiler-gcc.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/include/linux/compiler-gcc.h b/include/linux/compiler-gcc.h
index d7ee4c6bad48..5967590a18c6 100644
--- a/include/linux/compiler-gcc.h
+++ b/include/linux/compiler-gcc.h
@@ -149,6 +149,10 @@
 #define COMPILER_HAS_GENERIC_BUILTIN_OVERFLOW 1
 #endif
 
+#if GCC_VERSION >= 60000
+#define COMPILER_HAS_X86_SEG_SUPPORT 1
+#endif
+
 /*
  * Turn individual warnings and errors on and off locally, depending
  * on version.
-- 
2.17.1



[PATCH 4/7] x86: Fix possible caching of current_task

2019-08-23 Thread Nadav Amit
this_cpu_read_stable() is allowed, and supposed, to cache and return the
same value, specifically for current_task. In practice it does not cache
current_task very well, which has masked the problem that such caching
becomes invalid when the task is switched in __switch_to().

Fix the potential invalid caching by avoiding the use of current in
__switch_to()'s dynamic extent.

Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/fpu/internal.h|  7 ---
 arch/x86/include/asm/resctrl_sched.h   | 14 +++---
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |  4 ++--
 arch/x86/kernel/process_32.c   |  4 ++--
 arch/x86/kernel/process_64.c   |  4 ++--
 5 files changed, 17 insertions(+), 16 deletions(-)

diff --git a/arch/x86/include/asm/fpu/internal.h b/arch/x86/include/asm/fpu/internal.h
index 4c95c365058a..b537788600fe 100644
--- a/arch/x86/include/asm/fpu/internal.h
+++ b/arch/x86/include/asm/fpu/internal.h
@@ -588,9 +588,10 @@ static inline void switch_fpu_prepare(struct fpu *old_fpu, int cpu)
 
 /*
  * Load PKRU from the FPU context if available. Delay loading of the
  * complete FPU state until the return to userland. Avoid reading current
  * during switch.
  */
-static inline void switch_fpu_finish(struct fpu *new_fpu)
+static inline void switch_fpu_finish(struct task_struct *task, struct fpu *new_fpu)
*new_fpu)
 {
u32 pkru_val = init_pkru_value;
struct pkru_state *pk;
@@ -598,7 +599,7 @@ static inline void switch_fpu_finish(struct fpu *new_fpu)
if (!static_cpu_has(X86_FEATURE_FPU))
return;
 
-   set_thread_flag(TIF_NEED_FPU_LOAD);
+   set_ti_thread_flag(task_thread_info(task), TIF_NEED_FPU_LOAD);
 
if (!cpu_feature_enabled(X86_FEATURE_OSPKE))
return;
diff --git a/arch/x86/include/asm/resctrl_sched.h b/arch/x86/include/asm/resctrl_sched.h
index f6b7fe2833cc..9a00d9df9d02 100644
--- a/arch/x86/include/asm/resctrl_sched.h
+++ b/arch/x86/include/asm/resctrl_sched.h
@@ -51,7 +51,7 @@ DECLARE_STATIC_KEY_FALSE(rdt_mon_enable_key);
  *   simple as possible.
  * Must be called with preemption disabled.
  */
-static void __resctrl_sched_in(void)
+static void __resctrl_sched_in(struct task_struct *task)
 {
struct resctrl_pqr_state *state = this_cpu_ptr(&pqr_state);
u32 closid = state->default_closid;
@@ -62,13 +62,13 @@ static void __resctrl_sched_in(void)
 * Else use the closid/rmid assigned to this cpu.
 */
if (static_branch_likely(&rdt_alloc_enable_key)) {
-   if (current->closid)
+   if (task->closid)
closid = current->closid;
}
 
if (static_branch_likely(&rdt_mon_enable_key)) {
-   if (current->rmid)
-   rmid = current->rmid;
+   if (task->rmid)
+   rmid = task->rmid;
}
 
if (closid != state->cur_closid || rmid != state->cur_rmid) {
@@ -78,15 +78,15 @@ static void __resctrl_sched_in(void)
}
 }
 
-static inline void resctrl_sched_in(void)
+static inline void resctrl_sched_in(struct task_struct *task)
 {
if (static_branch_likely(&rdt_enable_key))
-   __resctrl_sched_in();
+   __resctrl_sched_in(task);
 }
 
 #else
 
-static inline void resctrl_sched_in(void) {}
+static inline void resctrl_sched_in(struct task_struct *task) {}
 
 #endif /* CONFIG_X86_CPU_RESCTRL */
 
diff --git a/arch/x86/kernel/cpu/resctrl/rdtgroup.c b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
index a46dee8e78db..5fcf56cbf438 100644
--- a/arch/x86/kernel/cpu/resctrl/rdtgroup.c
+++ b/arch/x86/kernel/cpu/resctrl/rdtgroup.c
@@ -311,7 +311,7 @@ static void update_cpu_closid_rmid(void *info)
 * executing task might have its own closid selected. Just reuse
 * the context switch code.
 */
-   resctrl_sched_in();
+   resctrl_sched_in(current);
 }
 
 /*
@@ -536,7 +536,7 @@ static void move_myself(struct callback_head *head)
 
preempt_disable();
/* update PQR_ASSOC MSR to make resource group go into effect */
-   resctrl_sched_in();
+   resctrl_sched_in(current);
preempt_enable();
 
kfree(callback);
diff --git a/arch/x86/kernel/process_32.c b/arch/x86/kernel/process_32.c
index b8ceec4974fe..699a4c95ab13 100644
--- a/arch/x86/kernel/process_32.c
+++ b/arch/x86/kernel/process_32.c
@@ -292,10 +292,10 @@ __switch_to(struct task_struct *prev_p, struct task_struct *next_p)
 
this_cpu_write(current_task, next_p);
 
-   switch_fpu_finish(next_fpu);
+   switch_fpu_finish(next_p, next_fpu);
 
/* Load the Intel cache allocation PQR MSR. */
-   resctrl_sched_in();
+   resctrl_sched_in(next_p);
 
return prev_p;
 }
diff --git a/arch/x86/kernel/process_64.c b/arch/x86/kernel/process_64.c
index af64519b2695..bb811808936d 100644
--- a/arch/x86/kernel/process_64.c
+++ b/arch/x86/kernel/process_64.c
@@ -565,7 +565,7 @@ __switch_to(struct task_str

[PATCH 7/7] x86/current: Aggressive caching of current

2019-08-23 Thread Nadav Amit
The current_task is supposed to be constant in each thread and therefore
does not need to be reread. There is already an attempt to cache it
using inline assembly, using this_cpu_read_stable(), which hides the
dependency on the read memory address.

However, this caching is not working very well. For example,
sync_mm_rss() still reads current_task twice for no reason.

Allow more aggressive caching by aliasing current_task into a constant
const_current_task and reading from the constant copy. Doing so requires
the compiler to support x86 segment qualifiers. Hide const_current_task in
a different compilation unit to prevent the compiler from assuming that
the value is constant during compilation.
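
The intended effect, illustratively (the field names below are made up;
this is not part of the patch):

/* with the const alias, the two uses of current below can share one
 * %gs-relative load; this_cpu_read_stable() often re-reads current_task */
void demo(void)
{
	current->demo_a++;
	current->demo_b++;	/* the second read of current can be elided */
}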

Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/current.h | 30 ++
 arch/x86/kernel/cpu/Makefile   |  1 +
 arch/x86/kernel/cpu/common.c   |  7 +--
 arch/x86/kernel/cpu/current.c  | 16 
 include/linux/compiler.h   |  2 +-
 5 files changed, 49 insertions(+), 7 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/current.c

diff --git a/arch/x86/include/asm/current.h b/arch/x86/include/asm/current.h
index 3e204e6140b5..7f093e81a647 100644
--- a/arch/x86/include/asm/current.h
+++ b/arch/x86/include/asm/current.h
@@ -10,11 +10,41 @@ struct task_struct;
 
 DECLARE_PER_CPU(struct task_struct *, current_task);
 
+#if USE_X86_SEG_SUPPORT
+
+/*
+ * Hold a constant alias for current_task, which would allow to avoid caching
+ * of current task.
+ *
+ * We must mark const_current_task with the segment qualifiers, as otherwise
+ * gcc would do redundant reads of const_current_task.
+ */
+DECLARE_PER_CPU(struct task_struct * const __percpu_seg_override,
+		const_current_task);
+
+static __always_inline struct task_struct *get_current(void)
+{
+
+   /*
+* GCC is missing functionality of removing segment qualifiers, which
+* messes with per-cpu infrastructure that holds local copies. Use
+* __raw_cpu_read to avoid holding any copy.
+*/
+   return __raw_cpu_read(, const_current_task);
+}
+
+#else /* USE_X86_SEG_SUPPORT */
+
+/*
+ * Without segment qualifier support, the per-cpu infrastructure is not
+ * suitable for reading constants, so use this_cpu_read_stable() in this case.
+ */
 static __always_inline struct task_struct *get_current(void)
 {
return this_cpu_read_stable(current_task);
 }
 
+#endif /* USE_X86_SEG_SUPPORT */
+
 #define current get_current()
 
 #endif /* __ASSEMBLY__ */
diff --git a/arch/x86/kernel/cpu/Makefile b/arch/x86/kernel/cpu/Makefile
index d7a1e5a9331c..d816f03a37d7 100644
--- a/arch/x86/kernel/cpu/Makefile
+++ b/arch/x86/kernel/cpu/Makefile
@@ -19,6 +19,7 @@ CFLAGS_common.o   := $(nostackp)
 
 obj-y  := cacheinfo.o scattered.o topology.o
 obj-y  += common.o
+obj-y  += current.o
 obj-y  += rdrand.o
 obj-y  += match.o
 obj-y  += bugs.o
diff --git a/arch/x86/kernel/cpu/common.c b/arch/x86/kernel/cpu/common.c
index 5cc2d51cc25e..5f7c9ee57802 100644
--- a/arch/x86/kernel/cpu/common.c
+++ b/arch/x86/kernel/cpu/common.c
@@ -1619,13 +1619,8 @@ DEFINE_PER_CPU_FIRST(struct fixed_percpu_data,
 EXPORT_PER_CPU_SYMBOL_GPL(fixed_percpu_data);
 
 /*
- * The following percpu variables are hot.  Align current_task to
- * cacheline size such that they fall in the same cacheline.
+ * The following percpu variables are hot.
  */
-DEFINE_PER_CPU(struct task_struct *, current_task) cacheline_aligned =
-   &init_task;
-EXPORT_PER_CPU_SYMBOL(current_task);
-
 DEFINE_PER_CPU(struct irq_stack *, hardirq_stack_ptr);
 DEFINE_PER_CPU(unsigned int, irq_count) __visible = -1;
 
diff --git a/arch/x86/kernel/cpu/current.c b/arch/x86/kernel/cpu/current.c
new file mode 100644
index ..3238c6e34984
--- /dev/null
+++ b/arch/x86/kernel/cpu/current.c
@@ -0,0 +1,16 @@
+#include 
+#include 
+
+/*
+ * Align current_task to cacheline size such that they fall in the same
+ * cacheline.
+ */
+DEFINE_PER_CPU(struct task_struct *, current_task) cacheline_aligned =
+   &init_task;
+EXPORT_PER_CPU_SYMBOL(current_task);
+
+#if USE_X86_SEG_SUPPORT
+DECLARE_PER_CPU(struct task_struct * const __percpu_seg_override,
+		const_current_task)
+	__attribute__((alias("current_task")));
+EXPORT_PER_CPU_SYMBOL(const_current_task);
+#endif
diff --git a/include/linux/compiler.h b/include/linux/compiler.h
index f0fd5636fddb..1b6ee9ab6373 100644
--- a/include/linux/compiler.h
+++ b/include/linux/compiler.h
@@ -299,7 +299,7 @@ unsigned long read_word_at_a_time(const void *addr)
  */
 #define __ADDRESSABLE(sym) \
static void * __section(".discard.addressable") __used \
-   __PASTE(__addressable_##sym, __LINE__) = (void *)&sym;
+   __PASTE(__addressable_##sym, __LINE__) = (void *)(uintptr_t)&sym;
 
 /**
  * offset_to_ptr - convert a relative memory offset to an absolute pointer
-- 
2.17.1



[PATCH 0/7] x86/percpu: Use segment qualifiers

2019-08-23 Thread Nadav Amit
GCC 6+ supports segment qualifiers. Using them allows several
optimizations to be implemented:

1. Avoid unnecessary instructions when an operation is carried out on a
per-cpu value that is read or written, by letting the compiler emit
instructions that access the per-cpu value directly.

2. Make this_cpu_ptr() more efficient and allow its value to be cached,
since preemption must be disabled when this_cpu_ptr() is used.

3. Provide better alternative for this_cpu_read_stable() that caches
values more efficiently using alias attribute to const variable.

4. Allow the compiler to perform other optimizations (e.g. CSE).

5. Use rip-relative addressing in per_cpu_read_stable(), which makes it
PIE-ready.

"size" and Peter's compare do not seem to show the impact on code size
reduction correctly. Summing the code size according to nm on defconfig
shows a minor reduction from 11451310 to 11451310 (0.09%).

RFC->v1:
 * Fixing i386 build bug
 * Moving chunk to the right place [Peter]

Nadav Amit (7):
  compiler: Report x86 segment support
  x86/percpu: Use compiler segment prefix qualifier
  x86/percpu: Use C for percpu accesses when possible
  x86: Fix possible caching of current_task
  percpu: Assume preemption is disabled on per_cpu_ptr()
  x86/percpu: Optimized arch_raw_cpu_ptr()
  x86/current: Aggressive caching of current

 arch/x86/include/asm/current.h |  30 +++
 arch/x86/include/asm/fpu/internal.h|   7 +-
 arch/x86/include/asm/percpu.h  | 293 +++--
 arch/x86/include/asm/preempt.h |   3 +-
 arch/x86/include/asm/resctrl_sched.h   |  14 +-
 arch/x86/kernel/cpu/Makefile   |   1 +
 arch/x86/kernel/cpu/common.c   |   7 +-
 arch/x86/kernel/cpu/current.c  |  16 ++
 arch/x86/kernel/cpu/resctrl/rdtgroup.c |   4 +-
 arch/x86/kernel/process_32.c   |   4 +-
 arch/x86/kernel/process_64.c   |   4 +-
 include/asm-generic/percpu.h   |  12 +
 include/linux/compiler-gcc.h   |   4 +
 include/linux/compiler.h   |   2 +-
 include/linux/percpu-defs.h|  33 ++-
 15 files changed, 346 insertions(+), 88 deletions(-)
 create mode 100644 arch/x86/kernel/cpu/current.c

-- 
2.17.1



[PATCH 2/7] x86/percpu: Use compiler segment prefix qualifier

2019-08-23 Thread Nadav Amit
Using a segment prefix qualifier is cleaner than using a segment prefix
in the inline assembly, and provides the compiler with more information,
telling it that __seg_gs:[addr] is different than [addr] when it
analyzes data dependencies. It also enables various optimizations that
will be implemented in the next patches.

Use segment prefix qualifiers when they are supported. Unfortunately,
gcc does not provide a way to remove segment qualifiers, which is needed
to use typeof() to create local instances of the per-cpu variable. For
this reason, do not use the segment qualifier for per-cpu variables, and
do casting using the segment qualifier instead.
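
For reference, a minimal standalone illustration of the qualifier itself
(assumes gcc >= 6 on x86-64; the variable is made up and this is not part
of the patch):

/* __seg_gs makes the access %gs-relative, and the compiler knows it is
 * distinct from a plain [addr] access when analyzing dependencies */
extern __seg_gs unsigned long demo_pcpu_var;

static unsigned long demo_read(void)
{
	return demo_pcpu_var;	/* mov %gs:demo_pcpu_var(%rip),%rax */
}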

Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/percpu.h  | 153 ++---
 arch/x86/include/asm/preempt.h |   3 +-
 2 files changed, 104 insertions(+), 52 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 2278797c769d..1fe348884477 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -45,8 +45,33 @@
 #include 
 #include 
 
+#if defined(COMPILER_HAS_X86_SEG_SUPPORT) && IS_ENABLED(CONFIG_SMP)
+#define USE_X86_SEG_SUPPORT1
+#else
+#define USE_X86_SEG_SUPPORT0
+#endif
+
 #ifdef CONFIG_SMP
+
+#if USE_X86_SEG_SUPPORT
+
+#ifdef CONFIG_X86_64
+#define __percpu_seg_override  __seg_gs
+#else
+#define __percpu_seg_override  __seg_fs
+#endif
+
+#define __percpu_prefix""
+#define __force_percpu_prefix  "%%"__stringify(__percpu_seg)":"
+
+#else /* USE_X86_SEG_SUPPORT */
+
+#define __percpu_seg_override
 #define __percpu_prefix"%%"__stringify(__percpu_seg)":"
+#define __force_percpu_prefix  __percpu_prefix
+
+#endif /* USE_X86_SEG_SUPPORT */
+
 #define __my_cpu_offsetthis_cpu_read(this_cpu_off)
 
 /*
@@ -58,14 +83,21 @@
unsigned long tcp_ptr__;\
asm volatile("add " __percpu_arg(1) ", %0"  \
 : "=r" (tcp_ptr__) \
-: "m" (this_cpu_off), "0" (ptr));  \
+: "m" (__my_cpu_var(this_cpu_off)),\
+  "0" (ptr));  \
(typeof(*(ptr)) __kernel __force *)tcp_ptr__;   \
 })
-#else
+#else /* CONFIG_SMP */
+#define __percpu_seg_override
 #define __percpu_prefix""
-#endif
+#define __force_percpu_prefix  ""
+#endif /* CONFIG_SMP */
 
+#define __my_cpu_type(var) typeof(var) __percpu_seg_override
+#define __my_cpu_ptr(ptr)  (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
+#define __my_cpu_var(var)  (*__my_cpu_ptr(&var))
 #define __percpu_arg(x)__percpu_prefix "%" #x
+#define __force_percpu_arg(x)  __force_percpu_prefix "%" #x
 
 /*
  * Initialized pointers to per-cpu variables needed for the boot
@@ -98,22 +130,22 @@ do {					\
switch (sizeof(var)) {  \
case 1: \
asm qual (op "b %1,"__percpu_arg(0) \
-   : "+m" (var)\
+   : "+m" (__my_cpu_var(var))  \
: "qi" ((pto_T__)(val)));   \
break;  \
case 2: \
asm qual (op "w %1,"__percpu_arg(0) \
-   : "+m" (var)\
+   : "+m" (__my_cpu_var(var))  \
: "ri" ((pto_T__)(val)));   \
break;  \
case 4: \
asm qual (op "l %1,"__percpu_arg(0) \
-   : "+m" (var)\
+   : "+m" (__my_cpu_var(var))  \
: "ri" ((pto_T__)(val)));   \
break;  \
case 8: \
asm qual (op "q %1,"__percpu_arg(0) \
-   : "+m" (var)\
+   : "+m" (__my_cpu_var(var))  \
: "re" ((pto_T__)(val)));   \
break;  \
default: __bad_percpu_size();   \
@@ -138,71 +170,79 @@ do {					\
switch (sizeof(var)) {  \
case 1: \
if (pao_ID__ == 1)  \
-   asm qual ("incb "__percpu_arg(0) : "+m" (var)); \
+   asm qual ("incb "__percpu_arg(0) :  \
+ "+m" (__my_cpu_var(var)));\
else if (pao_ID__ == -1) 

[PATCH 6/7] x86/percpu: Optimized arch_raw_cpu_ptr()

2019-08-23 Thread Nadav Amit
Implementing arch_raw_cpu_ptr() in C allows the compiler to perform better
optimizations, such as setting an appropriate base to compute the address
instead of using an add instruction.

The benefit of this computation is relevant only when using compiler
segment qualifiers. This method is inapplicable when the address size is
greater than the maximum operand size, as is the case when building
vdso32.

Distinguish between the two cases in which preemption is disabled (as
happens when this_cpu_ptr() is used) and enabled (when raw_cpu_ptr() is
used).

This allows optimizations, for instance in rcu_dynticks_eqs_exit(),
the following code:

  mov    $0x2bbc0,%rax
  add    %gs:0x7ef07570(%rip),%rax  # 0x10358
  lock xadd %edx,0xd8(%rax)

Turns with this patch into:

  mov    %gs:0x7ef08aa5(%rip),%rax  # 0x10358
  lock xadd %edx,0x2bc58(%rax)

Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/percpu.h | 25 +++--
 1 file changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/x86/include/asm/percpu.h b/arch/x86/include/asm/percpu.h
index 13987f9bc82f..3d82df27d45c 100644
--- a/arch/x86/include/asm/percpu.h
+++ b/arch/x86/include/asm/percpu.h
@@ -73,20 +73,41 @@
 #endif /* USE_X86_SEG_SUPPORT */
 
 #define __my_cpu_offset		this_cpu_read(this_cpu_off)
+#define __raw_my_cpu_offset	__this_cpu_read(this_cpu_off)
+#define __my_cpu_ptr(ptr)  (__my_cpu_type(*ptr) *)(uintptr_t)(ptr)
 
+#if USE_X86_SEG_SUPPORT && !defined(BUILD_VDSO32) && defined(CONFIG_X86_64)
+/*
+ * Efficient implementation for cases in which the compiler supports C segments.
+ * Allows the compiler to perform additional optimizations that can save more
+ * instructions.
+ *
+ * This optimized version can only be used if the pointer size equals the native
+ * operand size, which does not happen when vdso32 is used.
+ */
+#define __arch_raw_cpu_ptr_qual(qual, ptr) \
+({ \
+   (qual typeof(*(ptr)) __kernel __force *)((uintptr_t)(ptr) + \
+   __my_cpu_offset);   \
+})
+#else /* USE_X86_SEG_SUPPORT && !defined(BUILD_VDSO32) && defined(CONFIG_X86_64) */
 /*
  * Compared to the generic __my_cpu_offset version, the following
  * saves one instruction and avoids clobbering a temp register.
  */
-#define arch_raw_cpu_ptr(ptr)  \
+#define __arch_raw_cpu_ptr_qual(qual, ptr) \
 ({ \
unsigned long tcp_ptr__;\
-   asm volatile("add " __percpu_arg(1) ", %0"  \
+   asm qual ("add " __percpu_arg(1) ", %0" \
 : "=r" (tcp_ptr__) \
 : "m" (__my_cpu_var(this_cpu_off)),\
   "0" (ptr));  \
(typeof(*(ptr)) __kernel __force *)tcp_ptr__;   \
 })
+#endif /* USE_X86_SEG_SUPPORT && !defined(BUILD_VDSO32) && defined(CONFIG_X86_64) */
+
+#define arch_raw_cpu_ptr(ptr)				__arch_raw_cpu_ptr_qual(volatile, ptr)
+#define arch_raw_cpu_ptr_preemption_disabled(ptr)	__arch_raw_cpu_ptr_qual( , ptr)
 #else /* CONFIG_SMP */
 #define __percpu_seg_override
 #define __percpu_prefix""
-- 
2.17.1



[PATCH 4.14] tcp: fix tcp_rtx_queue_tail in case of empty retransmit queue

2019-08-23 Thread Tim Froidcoeur
Commit 8c3088f895a0 ("tcp: be more careful in tcp_fragment()")
triggers the following stack trace:

[25244.848046] kernel BUG at ./include/linux/skbuff.h:1406!
[25244.859335] RIP: 0010:skb_queue_prev+0x9/0xc
[25244.888167] Call Trace:
[25244.889182]  
[25244.890001]  tcp_fragment+0x9c/0x2cf
[25244.891295]  tcp_write_xmit+0x68f/0x988
[25244.892732]  __tcp_push_pending_frames+0x3b/0xa0
[25244.894347]  tcp_data_snd_check+0x2a/0xc8
[25244.895775]  tcp_rcv_established+0x2a8/0x30d
[25244.897282]  tcp_v4_do_rcv+0xb2/0x158
[25244.898666]  tcp_v4_rcv+0x692/0x956
[25244.899959]  ip_local_deliver_finish+0xeb/0x169
[25244.901547]  __netif_receive_skb_core+0x51c/0x582
[25244.903193]  ? inet_gro_receive+0x239/0x247
[25244.904756]  netif_receive_skb_internal+0xab/0xc6
[25244.906395]  napi_gro_receive+0x8a/0xc0
[25244.907760]  receive_buf+0x9a1/0x9cd
[25244.909160]  ? load_balance+0x17a/0x7b7
[25244.910536]  ? vring_unmap_one+0x18/0x61
[25244.911932]  ? detach_buf+0x60/0xfa
[25244.913234]  virtnet_poll+0x128/0x1e1
[25244.914607]  net_rx_action+0x12a/0x2b1
[25244.915953]  __do_softirq+0x11c/0x26b
[25244.917269]  ? handle_irq_event+0x44/0x56
[25244.918695]  irq_exit+0x61/0xa0
[25244.919947]  do_IRQ+0x9d/0xbb
[25244.921065]  common_interrupt+0x85/0x85
[25244.922479]  

tcp_rtx_queue_tail() (called by tcp_fragment()) can call
tcp_write_queue_prev() on the first packet in the queue, which triggers
the BUG in skb_queue_prev(), because there is no previous packet.

This happens when the retransmit queue is empty, for example in case of a
zero window.
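
A sketch of the failing case under the pre-4.15 single-queue layout
(illustrative only):

	/*
	 * sk_write_queue: [skb0] -> [skb1] -> ...
	 *                   ^
	 *                   tcp_send_head() == queue head (nothing sent yet)
	 *
	 * With an empty retransmit queue, tcp_rtx_queue_tail() asks for the
	 * skb before the queue head; skb_queue_prev() BUG()s because no
	 * such skb exists.  The fix below returns NULL in that case.
	 */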

Patch is needed for 4.4, 4.9 and 4.14 stable branches.

Fixes: 8c3088f895a0 ("tcp: be more careful in tcp_fragment()")
Change-Id: I839bde7167ae59e2f7d916c913507372445765c5
Signed-off-by: Tim Froidcoeur 
Signed-off-by: Matthieu Baerts 
Reviewed-by: Christoph Paasch 
---
 include/net/tcp.h | 4 
 1 file changed, 4 insertions(+)

diff --git a/include/net/tcp.h b/include/net/tcp.h
index 9de2c8cdcc51..1e70ca75c8bf 100644
--- a/include/net/tcp.h
+++ b/include/net/tcp.h
@@ -1705,6 +1705,10 @@ static inline struct sk_buff *tcp_rtx_queue_tail(const struct sock *sk)
 {
struct sk_buff *skb = tcp_send_head(sk);

+   /* empty retransmit queue, for example due to zero window */
+   if (skb == tcp_write_queue_head(sk))
+   return NULL;
+
return skb ? tcp_write_queue_prev(sk, skb) : tcp_write_queue_tail(sk);
 }

--
2.23.0







[PATCH v4 2/9] x86/mm/tlb: Unify flush_tlb_func_local() and flush_tlb_func_remote()

2019-08-23 Thread Nadav Amit
Unifying these two functions allows them to be used in the updated
SMP infrastructure.

To do so, remove the reason argument from flush_tlb_func_local(), add
a member to struct flush_tlb_info that records which CPU initiated the
flush, and act accordingly. Optimize the size of flush_tlb_info while
we are at it.

Unfortunately, this prevents us from using a constant flush_tlb_info
for arch_tlbbatch_flush(), but at a later stage we will inline
flush_tlb_info into the IPI data, so eventually it should have no
impact.

Reviewed-by: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Josh Poimboeuf 
Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/tlbflush.h |  5 +-
 arch/x86/mm/tlb.c   | 85 +++--
 2 files changed, 41 insertions(+), 49 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 6f66d841262d..2f6e9be163ae 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -559,8 +559,9 @@ struct flush_tlb_info {
unsigned long   start;
unsigned long   end;
u64 new_tlb_gen;
-   unsigned intstride_shift;
-   boolfreed_tables;
+   unsigned intinitiating_cpu;
+   u8  stride_shift;
+   u8  freed_tables;
 };
 
 #define local_flush_tlb() __flush_tlb()
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index e6a9edc5baaf..2674f55ed9a1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -292,7 +292,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 * NB: leave_mm() calls us with prev == NULL and tsk == NULL.
 */
 
-   /* We don't want flush_tlb_func_* to run concurrently with us. */
+   /* We don't want flush_tlb_func() to run concurrently with us. */
if (IS_ENABLED(CONFIG_PROVE_LOCKING))
WARN_ON_ONCE(!irqs_disabled());
 
@@ -512,14 +512,13 @@ void initialize_tlbstate_and_flush(void)
 }
 
 /*
- * flush_tlb_func_common()'s memory ordering requirement is that any
+ * flush_tlb_func()'s memory ordering requirement is that any
  * TLB fills that happen after we flush the TLB are ordered after we
  * read active_mm's tlb_gen.  We don't need any explicit barriers
  * because all x86 flush operations are serializing and the
  * atomic64_read operation won't be reordered by the compiler.
  */
-static void flush_tlb_func_common(const struct flush_tlb_info *f,
- bool local, enum tlb_flush_reason reason)
+static void flush_tlb_func(void *info)
 {
/*
 * We have three different tlb_gen values in here.  They are:
@@ -530,14 +529,26 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 * - f->new_tlb_gen: the generation that the requester of the flush
 *   wants us to catch up to.
 */
+   const struct flush_tlb_info *f = info;
struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm);
u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
u64 mm_tlb_gen = atomic64_read(&loaded_mm->context.tlb_gen);
	u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen);
+   bool local = smp_processor_id() == f->initiating_cpu;
+   unsigned long nr_invalidate = 0;
 
/* This code cannot presently handle being reentered. */
VM_WARN_ON(!irqs_disabled());
 
+   if (!local) {
+   inc_irq_stat(irq_tlb_count);
+   count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED);
+
+   /* Can only happen on remote CPUs */
+   if (f->mm && f->mm != loaded_mm)
+   return;
+   }
+
if (unlikely(loaded_mm == &init_mm))
return;
 
@@ -565,8 +576,7 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
 * be handled can catch us all the way up, leaving no work for
 * the second flush.
 */
-   trace_tlb_flush(reason, 0);
-   return;
+   goto done;
}
 
WARN_ON_ONCE(local_tlb_gen > mm_tlb_gen);
@@ -613,46 +623,34 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
f->new_tlb_gen == local_tlb_gen + 1 &&
f->new_tlb_gen == mm_tlb_gen) {
/* Partial flush */
-	unsigned long nr_invalidate = (f->end - f->start) >> f->stride_shift;
unsigned long addr = f->start;
 
+   nr_invalidate = (f->end - f->start) >> f->stride_shift;
+
while (addr < f->end) {
__flush_tlb_one_user(addr);
addr += 1UL << f->stride_shift;
}
if (local)
count_vm_tlb_events(NR_TLB_LOCAL_FLUSH

[PATCH v4 5/9] x86/mm/tlb: Privatize cpu_tlbstate

2019-08-23 Thread Nadav Amit
cpu_tlbstate is mostly private, and only the is_lazy variable is
shared. This causes some false sharing when TLB flushes are performed.

Break cpu_tlbstate into cpu_tlbstate and cpu_tlbstate_shared, and mark
each one accordingly.
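
The false-sharing scenario this avoids, sketched (illustrative):

	/*
	 * Remote CPUs read is_lazy on every flush decision, while the owning
	 * CPU frequently rewrites neighbouring fields (loaded_mm,
	 * loaded_mm_asid, ctxs[]).  With everything in one cacheline, each
	 * local write invalidates the remote readers' copies; moving is_lazy
	 * into its own cpu_tlbstate_shared cacheline avoids that.
	 */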

Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Reviewed-by: Dave Hansen 
Signed-off-by: Nadav Amit 
---
 arch/x86/include/asm/tlbflush.h | 39 ++---
 arch/x86/mm/init.c  |  2 +-
 arch/x86/mm/tlb.c   | 15 -
 3 files changed, 31 insertions(+), 25 deletions(-)

diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h
index 559195f79c2f..1f88ea410ff3 100644
--- a/arch/x86/include/asm/tlbflush.h
+++ b/arch/x86/include/asm/tlbflush.h
@@ -178,23 +178,6 @@ struct tlb_state {
u16 loaded_mm_asid;
u16 next_asid;
 
-   /*
-* We can be in one of several states:
-*
-*  - Actively using an mm.  Our CPU's bit will be set in
-*mm_cpumask(loaded_mm) and is_lazy == false;
-*
-*  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
-*will not be set in mm_cpumask(&init_mm) and is_lazy == false.
-*
-*  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
-*is set in mm_cpumask(loaded_mm), but is_lazy == true.
-*We're heuristically guessing that the CR3 load we
-*skipped more than makes up for the overhead added by
-*lazy mode.
-*/
-   bool is_lazy;
-
/*
 * If set we changed the page tables in such a way that we
 * needed an invalidation of all contexts (aka. PCIDs / ASIDs).
@@ -240,7 +223,27 @@ struct tlb_state {
 */
struct tlb_context ctxs[TLB_NR_DYN_ASIDS];
 };
-DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate);
+DECLARE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate);
+
+struct tlb_state_shared {
+   /*
+* We can be in one of several states:
+*
+*  - Actively using an mm.  Our CPU's bit will be set in
+*mm_cpumask(loaded_mm) and is_lazy == false;
+*
+*  - Not using a real mm.  loaded_mm == &init_mm.  Our CPU's bit
+*will not be set in mm_cpumask(&init_mm) and is_lazy == false.
+*
+*  - Lazily using a real mm.  loaded_mm != &init_mm, our bit
+*is set in mm_cpumask(loaded_mm), but is_lazy == true.
+*We're heuristically guessing that the CR3 load we
+*skipped more than makes up for the overhead added by
+*lazy mode.
+*/
+   bool is_lazy;
+};
+DECLARE_PER_CPU_SHARED_ALIGNED(struct tlb_state_shared, cpu_tlbstate_shared);
 
 /*
  * Blindly accessing user memory from NMI context can be dangerous
diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c
index fd10d91a6115..34027f36a944 100644
--- a/arch/x86/mm/init.c
+++ b/arch/x86/mm/init.c
@@ -951,7 +951,7 @@ void __init zone_sizes_init(void)
free_area_init_nodes(max_zone_pfns);
 }
 
-__visible DEFINE_PER_CPU_SHARED_ALIGNED(struct tlb_state, cpu_tlbstate) = {
+__visible DEFINE_PER_CPU_ALIGNED(struct tlb_state, cpu_tlbstate) = {
.loaded_mm = &init_mm,
.next_asid = 1,
.cr4 = ~0UL,/* fail hard if we screw up cr4 shadow initialization */
diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 5376a5447bd0..24c9839e3d9b 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -145,7 +145,7 @@ void leave_mm(int cpu)
return;
 
/* Warn if we're not lazy. */
-   WARN_ON(!this_cpu_read(cpu_tlbstate.is_lazy));
+   WARN_ON(!this_cpu_read(cpu_tlbstate_shared.is_lazy));
 
switch_mm(NULL, &init_mm, NULL);
 }
@@ -277,7 +277,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
 {
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
-   bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
+   bool was_lazy = this_cpu_read(cpu_tlbstate_shared.is_lazy);
unsigned cpu = smp_processor_id();
u64 next_tlb_gen;
bool need_flush;
@@ -322,7 +322,7 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
__flush_tlb_all();
}
 #endif
-   this_cpu_write(cpu_tlbstate.is_lazy, false);
+   this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
 
/*
 * The membarrier system call requires a full memory barrier and
@@ -463,7 +463,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
return;
 
-   this_cpu_write(cpu_tlbstate.is_lazy, true);
+   this_cpu_write(cpu_tlbstate_shared.is_lazy, true);
 }
 
 /*
@@ -555,7 +555,7 @@ static void flush_tlb_func(void *info)
VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) !=
   loaded_mm

[PATCH v4 4/9] x86/mm/tlb: Flush remote and local TLBs concurrently

2019-08-23 Thread Nadav Amit
To improve TLB shootdown performance, flush the remote and local TLBs
concurrently. Introduce flush_tlb_multi() that does so. Introduce
paravirtual versions of flush_tlb_multi() for KVM, Xen and hyper-v (Xen
and hyper-v are only compile-tested).

While the updated smp infrastructure is capable of running a function on
a single local core, it is not optimized for this case. The multiple
function calls and the indirect branch introduce some overhead, and
might make local TLB flushes slower than they were before the recent
changes.

Before calling the SMP infrastructure, check whether only a local TLB
flush is needed, to restore the lost performance in this common case.
This requires checking mm_cpumask() one more time, but unless this
mask is updated very frequently, it should not impact performance
negatively.
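
The shape of that common-case check, sketched (hypothetical,
simplified from the approach described above):

	/* If this CPU is the only one that needs flushing, skip the SMP
	 * machinery and its indirect calls entirely. */
	if (cpumask_any_but(cpumask, smp_processor_id()) >= nr_cpu_ids)
		flush_tlb_func((void *)info);	/* local flush only */
	else
		flush_tlb_multi(cpumask, info);	/* concurrent local + remote */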

Cc: "K. Y. Srinivasan" 
Cc: Haiyang Zhang 
Cc: Stephen Hemminger 
Cc: Sasha Levin 
Cc: Thomas Gleixner 
Cc: Ingo Molnar 
Cc: Borislav Petkov 
Cc: x...@kernel.org
Cc: Juergen Gross 
Cc: Paolo Bonzini 
Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Cc: Boris Ostrovsky 
Cc: linux-hyp...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: k...@vger.kernel.org
Cc: xen-de...@lists.xenproject.org
Reviewed-by: Michael Kelley  # Hyper-v parts
Reviewed-by: Juergen Gross  # Xen and paravirt parts
Reviewed-by: Dave Hansen 
Signed-off-by: Nadav Amit 
---
 arch/x86/hyperv/mmu.c | 10 +++---
 arch/x86/include/asm/paravirt.h   |  6 ++--
 arch/x86/include/asm/paravirt_types.h |  4 +--
 arch/x86/include/asm/tlbflush.h   |  8 ++---
 arch/x86/include/asm/trace/hyperv.h   |  2 +-
 arch/x86/kernel/kvm.c | 11 +--
 arch/x86/kernel/paravirt.c|  2 +-
 arch/x86/mm/tlb.c | 45 ++-
 arch/x86/xen/mmu_pv.c | 11 +++
 include/trace/events/xen.h|  2 +-
 10 files changed, 61 insertions(+), 40 deletions(-)

diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c
index e65d7fe6489f..8740d8b21db3 100644
--- a/arch/x86/hyperv/mmu.c
+++ b/arch/x86/hyperv/mmu.c
@@ -50,8 +50,8 @@ static inline int fill_gva_list(u64 gva_list[], int offset,
return gva_n - offset;
 }
 
-static void hyperv_flush_tlb_others(const struct cpumask *cpus,
-   const struct flush_tlb_info *info)
+static void hyperv_flush_tlb_multi(const struct cpumask *cpus,
+  const struct flush_tlb_info *info)
 {
int cpu, vcpu, gva_n, max_gvas;
struct hv_tlb_flush **flush_pcpu;
@@ -59,7 +59,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
u64 status = U64_MAX;
unsigned long flags;
 
-   trace_hyperv_mmu_flush_tlb_others(cpus, info);
+   trace_hyperv_mmu_flush_tlb_multi(cpus, info);
 
if (!hv_hypercall_pg)
goto do_native;
@@ -156,7 +156,7 @@ static void hyperv_flush_tlb_others(const struct cpumask *cpus,
if (!(status & HV_HYPERCALL_RESULT_MASK))
return;
 do_native:
-   native_flush_tlb_others(cpus, info);
+   native_flush_tlb_multi(cpus, info);
 }
 
 static u64 hyperv_flush_tlb_others_ex(const struct cpumask *cpus,
@@ -231,6 +231,6 @@ void hyperv_setup_mmu_ops(void)
return;
 
pr_info("Using hypercall for remote TLB flush\n");
-   pv_ops.mmu.flush_tlb_others = hyperv_flush_tlb_others;
+   pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi;
pv_ops.mmu.tlb_remove_table = tlb_remove_table;
 }
diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h
index 69089d46f128..bc4829c9b3f9 100644
--- a/arch/x86/include/asm/paravirt.h
+++ b/arch/x86/include/asm/paravirt.h
@@ -62,10 +62,10 @@ static inline void __flush_tlb_one_user(unsigned long addr)
PVOP_VCALL1(mmu.flush_tlb_one_user, addr);
 }
 
-static inline void flush_tlb_others(const struct cpumask *cpumask,
-   const struct flush_tlb_info *info)
+static inline void flush_tlb_multi(const struct cpumask *cpumask,
+  const struct flush_tlb_info *info)
 {
-   PVOP_VCALL2(mmu.flush_tlb_others, cpumask, info);
+   PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info);
 }
 
 static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table)
diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h
index 70b654f3ffe5..63fa751344bf 100644
--- a/arch/x86/include/asm/paravirt_types.h
+++ b/arch/x86/include/asm/paravirt_types.h
@@ -206,8 +206,8 @@ struct pv_mmu_ops {
void (*flush_tlb_user)(void);
void (*flush_tlb_kernel)(void);
void (*flush_tlb_one_user)(unsigned long addr);
-   void (*flush_tlb_others)(const struct cpumask *cpus,
-const struct flush_tlb_info *info);
+   void (*flush_tlb_multi)(const struct cpumask *cpus,
+   cons

[PATCH v4 9/9] x86/mm/tlb: Remove unnecessary uses of the inline keyword

2019-08-23 Thread Nadav Amit
The compiler is smart enough without these hints.

Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Suggested-by: Dave Hansen 
Reviewed-by: Dave Hansen 
Signed-off-by: Nadav Amit 
---
 arch/x86/mm/tlb.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 3dca146edcf1..0addc6e84126 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -189,7 +189,7 @@ static void sync_current_stack_to_mm(struct mm_struct *mm)
}
 }
 
-static inline unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
+static unsigned long mm_mangle_tif_spec_ib(struct task_struct *next)
 {
unsigned long next_tif = task_thread_info(next)->flags;
unsigned long ibpb = (next_tif >> TIF_SPEC_IB) & LAST_USER_MM_IBPB;
@@ -741,7 +741,7 @@ static DEFINE_PER_CPU_SHARED_ALIGNED(struct flush_tlb_info, flush_tlb_info);
 static DEFINE_PER_CPU(unsigned int, flush_tlb_info_idx);
 #endif
 
-static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
+static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
unsigned long start, unsigned long end,
unsigned int stride_shift, bool freed_tables,
u64 new_tlb_gen)
@@ -768,7 +768,7 @@ static inline struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm,
return info;
 }
 
-static inline void put_flush_tlb_info(void)
+static void put_flush_tlb_info(void)
 {
 #ifdef CONFIG_DEBUG_VM
	/* Complete reentrancy prevention checks */
-- 
2.17.1



[PATCH v4 1/9] smp: Run functions concurrently in smp_call_function_many()

2019-08-23 Thread Nadav Amit
Currently, on_each_cpu() and similar functions do not exploit the
potential for concurrency: the function is first executed remotely and
only then locally. Functions such as a TLB flush can take considerable
time, so this provides an opportunity for performance optimization.

To do so, introduce __smp_call_function_many(), which allows the
callers to provide local and remote functions that should be executed,
and runs them concurrently. Keep the smp_call_function_many() semantics
as they are today for backward compatibility: in that case the called
function is not executed locally.

__smp_call_function_many() does not use the optimized version for a
single remote target that smp_call_function_single() implements. For
synchronous function call, smp_call_function_single() keeps a
call_single_data (which is used for synchronization) on the stack.
Interestingly, it seems that not using this optimization provides
greater performance improvements (greater speedup with a single remote
target than with multiple ones). Presumably, holding data structures
that are intended for synchronization on the stack can introduce
overheads due to TLB misses and false-sharing when the stack is used for
other purposes.

Adding support for running the functions concurrently required
removing a micro-optimization in on_each_cpu() that disabled/enabled
IRQs instead of saving/restoring them. The benefit of running the
local and remote code concurrently is expected to be greater.
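
A minimal usage sketch of the resulting API (hypothetical caller,
using the flags introduced below): run a function on all online CPUs,
including the local one, and wait for completion.

	static void example_flush_everywhere(smp_call_func_t func, void *info)
	{
		preempt_disable();
		__smp_call_function_many(cpu_online_mask, func, info,
					 SCF_WAIT | SCF_RUN_LOCAL);
		preempt_enable();
	}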

Reviewed-by: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Josh Poimboeuf 
Signed-off-by: Nadav Amit 
---
 include/linux/smp.h |  34 ---
 kernel/smp.c| 138 +---
 2 files changed, 92 insertions(+), 80 deletions(-)

diff --git a/include/linux/smp.h b/include/linux/smp.h
index 6fc856c9eda5..d18d54199635 100644
--- a/include/linux/smp.h
+++ b/include/linux/smp.h
@@ -32,11 +32,6 @@ extern unsigned int total_cpus;
 int smp_call_function_single(int cpuid, smp_call_func_t func, void *info,
 int wait);
 
-/*
- * Call a function on all processors
- */
-void on_each_cpu(smp_call_func_t func, void *info, int wait);
-
 /*
  * Call a function on processors specified by mask, which might include
  * the local one.
@@ -44,6 +39,17 @@ void on_each_cpu(smp_call_func_t func, void *info, int wait);
 void on_each_cpu_mask(const struct cpumask *mask, smp_call_func_t func,
void *info, bool wait);
 
+/*
+ * Call a function on all processors.  May be used during early boot while
+ * early_boot_irqs_disabled is set.
+ */
+static inline void on_each_cpu(smp_call_func_t func, void *info, int wait)
+{
+   preempt_disable();
+   on_each_cpu_mask(cpu_online_mask, func, info, wait);
+   preempt_enable();
+}
+
 /*
  * Call a function on each processor for which the supplied function
  * cond_func returns a positive value. This may include the local
@@ -102,8 +108,22 @@ extern void smp_cpus_done(unsigned int max_cpus);
  * Call a function on all other processors
  */
 void smp_call_function(smp_call_func_t func, void *info, int wait);
-void smp_call_function_many(const struct cpumask *mask,
-   smp_call_func_t func, void *info, bool wait);
+
+/*
+ * Flags to be used as the scf_flags argument of __smp_call_function_many().
+ */
+#define SCF_WAIT	(1U << 0)	/* Wait until function execution completed */
+#define SCF_RUN_LOCAL	(1U << 1)	/* Run also locally if local cpu is set in cpumask */
+
+void __smp_call_function_many(const struct cpumask *mask,
+ smp_call_func_t func, void *info,
+ unsigned int scf_flags);
+
+static inline void smp_call_function_many(const struct cpumask *mask,
+   smp_call_func_t func, void *info, bool wait)
+{
+   __smp_call_function_many(mask, func, info, wait ? SCF_WAIT : 0);
+}
 
 int smp_call_function_any(const struct cpumask *mask,
  smp_call_func_t func, void *info, int wait);
diff --git a/kernel/smp.c b/kernel/smp.c
index 7dbcb402c2fc..9faec688b34b 100644
--- a/kernel/smp.c
+++ b/kernel/smp.c
@@ -396,24 +396,28 @@ int smp_call_function_any(const struct cpumask *mask,
 EXPORT_SYMBOL_GPL(smp_call_function_any);
 
 /**
- * smp_call_function_many(): Run a function on a set of other CPUs.
+ * __smp_call_function_many(): Run a function on a set of CPUs.
  * @mask: The set of cpus to run on (only runs on online subset).
  * @func: The function to run. This must be fast and non-blocking.
  * @info: An arbitrary pointer to pass to the function.
- * @wait: If true, wait (atomically) until function has completed
- *on other CPUs.
- *
- * If @wait is true, then returns once @func has returned.
+ * @flags: Bitmask that controls the operation. If %SCF_WAIT is set, wait
+ *(atomically) until function has completed on other CPUs. If
+ *%SCF_RUN_

[PATCH v4 3/9] x86/mm/tlb: Open-code on_each_cpu_cond_mask() for tlb_is_not_lazy()

2019-08-23 Thread Nadav Amit
Open-code on_each_cpu_cond_mask() in native_flush_tlb_others() to
optimize the code. Open-coding eliminates the need for the indirect
branch that is used to call is_lazy(), and on CPUs that are vulnerable
to Spectre v2 it eliminates the retpoline. In addition, it allows the
use of a preallocated cpumask to compute the CPUs that should be
flushed.

This will later allow us to avoid adapting on_each_cpu_cond_mask() to
support local and remote functions.

Note that calling tlb_is_not_lazy() for every CPU that needs to be
flushed, as done in native_flush_tlb_others(), might look ugly, but it
is equivalent to what is currently done in on_each_cpu_cond_mask().
Actually, native_flush_tlb_others() does it more efficiently, since it
avoids using an indirect branch for the check.

Reviewed-by: Dave Hansen 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Thomas Gleixner 
Cc: Andy Lutomirski 
Cc: Josh Poimboeuf 
Signed-off-by: Nadav Amit 
---
 arch/x86/mm/tlb.c | 40 +---
 1 file changed, 33 insertions(+), 7 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 2674f55ed9a1..c3ca3545d78a 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -653,11 +653,13 @@ static void flush_tlb_func(void *info)
nr_invalidate);
 }
 
-static bool tlb_is_not_lazy(int cpu, void *data)
+static bool tlb_is_not_lazy(int cpu)
 {
return !per_cpu(cpu_tlbstate.is_lazy, cpu);
 }
 
+static DEFINE_PER_CPU(cpumask_t, flush_tlb_mask);
+
 void native_flush_tlb_others(const struct cpumask *cpumask,
 const struct flush_tlb_info *info)
 {
@@ -701,12 +703,36 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
 * up on the new contents of what used to be page tables, while
 * doing a speculative memory access.
 */
-   if (info->freed_tables)
-   smp_call_function_many(cpumask, flush_tlb_func,
-  (void *)info, 1);
-   else
-   on_each_cpu_cond_mask(tlb_is_not_lazy, flush_tlb_func,
-   (void *)info, 1, GFP_ATOMIC, cpumask);
+   if (info->freed_tables) {
+		smp_call_function_many(cpumask, flush_tlb_func, (void *)info, 1);
+   } else {
+   /*
+* Although we could have used on_each_cpu_cond_mask(),
+* open-coding it has performance advantages, as it eliminates
+* the need for indirect calls or retpolines. In addition, it
+* allows to use a designated cpumask for evaluating the
+* condition, instead of allocating one.
+*
+* This code works under the assumption that there are no nested
+* TLB flushes, an assumption that is already made in
+* flush_tlb_mm_range().
+*
+* cond_cpumask is logically a stack-local variable, but it is
+* more efficient to have it off the stack and not to allocate
+* it on demand. Preemption is disabled and this code is
+* non-reentrant.
+*/
+   struct cpumask *cond_cpumask = this_cpu_ptr(&flush_tlb_mask);
+   int cpu;
+
+   cpumask_clear(cond_cpumask);
+
+   for_each_cpu(cpu, cpumask) {
+   if (tlb_is_not_lazy(cpu))
+   __cpumask_set_cpu(cpu, cond_cpumask);
+   }
+	smp_call_function_many(cond_cpumask, flush_tlb_func, (void *)info, 1);
+   }
 }
 
 /*
-- 
2.17.1



[PATCH v4 0/9] x86/tlb: Concurrent TLB flushes

2019-08-23 Thread Nadav Amit
[ Similar cover letter to v3, with updated performance numbers on Skylake.
  Sorry for the time it took since the last version. ]

Currently, local and remote TLB flushes are not performed concurrently,
which introduces unnecessary overhead - each PTE flush can take hundreds
of cycles. This patch set allows TLB flushes to run concurrently: first
request the remote CPUs to initiate the flush, then run it locally, and
finally wait for the remote CPUs to finish their work.

In addition, there are various small optimizations to avoid, for
example, unwarranted false-sharing.

The proposed changes should also improve the performance of other
invocations of on_each_cpu(). Hopefully, no one has relied on the old
behavior of on_each_cpu(), which invoked functions first remotely and
only then locally [Peter says he remembers someone might do so, but
without further information it is hard to know how to address it].
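
In pseudo-C, the intended flow looks roughly like this (hypothetical
helper names, purely illustrative):

	static void concurrent_flush_sketch(const struct cpumask *cpus,
					    const struct flush_tlb_info *info)
	{
		request_remote_flushes(cpus, info);	/* hypothetical: send IPIs, don't wait */
		flush_tlb_func((void *)info);		/* do the local flush meanwhile */
		wait_for_remote_flushes(cpus);		/* hypothetical: completion barrier */
	}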

Running sysbench on dax/ext4 w/emulated-pmem, write-cache disabled on
2-socket, 56-logical-cores (28+SMT) Skylake, 5 repetitions:

sysbench fileio --file-total-size=3G --file-test-mode=rndwr \
 --file-io-mode=mmap --threads=X --file-fsync-mode=fdatasync run

 Th.   tip-aug22 avg (stdev)   +patch-set avg (stdev)   change
 ---   ---------------------   ----------------------   ------
 1     1152920 (7453)          1169469 (9059)           +1.4%
 2     1545832 (12555)         1584172 (10484)          +2.4%
 4     2480703 (12039)         2518641 (12875)          +1.5%
 8     3684486 (26007)         3840343 (44144)          +4.2%
 16    4981567 (23565)         5125756 (15458)          +2.8%
 32    5679542 (10116)         5887826 (6121)           +3.6%
 56    5630944 (17937)         5812514 (26264)          +3.2%

(Note that on configurations with up to 28 threads numactl was used to
set all threads on socket 1, which explains the drop in performance when
going to 32 threads).

Running the same benchmark with security mitigations disabled (PTI,
Spectre, MDS):

 Th.   tip-aug22 avg (stdev)   +patch-set avg (stdev)   change
 ---   ---------------------   ----------------------   ------
 1     1444119 (8524)          1469606 (10527)          +1.7%
 2     1921540 (24169)         1961899 (14450)          +2.1%
 4     3073716 (21786)         3199880 (16774)          +4.1%
 8     4700698 (49534)         4802312 (11043)          +2.1%
 16    6005180 (6366)          6006656 (31624)           0%
 32    6826466 (10496)         6886622 (19110)          +0.8%
 56    6832344 (13468)         6885586 (20646)          +0.8%

The results are somewhat different from those obtained on Haswell-X,
which were sent before, and the maximum performance improvement is
smaller. However, the performance improvement is still significant.

v3 -> v4:
  * Merge flush_tlb_func_local() and flush_tlb_func_remote() [Peter]
  * Prevent preemption in on_each_cpu(). It is not needed, but it
    addresses concerns. [Peter/tglx]
  * Add acked-by and reviewed-by tags

v2 -> v3:
* Open-code the remote/local-flush decision code [Andy]
* Fix hyper-v, Xen implementations [Andrew]
* Fix redundant TLB flushes.

v1 -> v2:
* Removing the patches that Thomas took [tglx]
* Adding hyper-v, Xen compile-tested implementations [Dave]
* Removing UV [Andy]
* Adding lazy optimization, removing inline keyword [Dave]
* Restructuring patch-set

RFCv2 -> v1:
* Fix comment on flush_tlb_multi [Juergen]
* Removing async invalidation optimizations [Andy]
* Adding KVM support [Paolo]

Cc: Andy Lutomirski 
Cc: Borislav Petkov 
Cc: Boris Ostrovsky 
Cc: Dave Hansen 
Cc: Haiyang Zhang 
Cc: Ingo Molnar 
Cc: Josh Poimboeuf 
Cc: Juergen Gross 
Cc: "K. Y. Srinivasan" 
Cc: Paolo Bonzini 
Cc: Peter Zijlstra 
Cc: Rik van Riel 
Cc: Sasha Levin 
Cc: Stephen Hemminger 
Cc: Thomas Gleixner 
Cc: k...@vger.kernel.org
Cc: linux-hyp...@vger.kernel.org
Cc: linux-kernel@vger.kernel.org
Cc: virtualizat...@lists.linux-foundation.org
Cc: x...@kernel.org
Cc: xen-de...@lists.xenproject.org

Nadav Amit (9):
  smp: Run functions concurrently in smp_call_function_many()
  x86/mm/tlb: Unify flush_tlb_func_local() and flush_tlb_func_remote()
  x86/mm/tlb: Open-code on_each_cpu_cond_mask() for tlb_is_not_lazy()
  x86/mm/tlb: Flush remote and local TLBs concurrently
  x86/mm/tlb: Privatize cpu_tlbstate
  x86/mm/tlb: Do not make is_lazy dirty for no reason
  cpumask: Mark functions as pure
  x86/mm/tlb: Remove UV special case
  x86/mm/tlb: Remove unnecessary uses of the inline keyword

 arch/x86/hyperv/mmu.c |  10 +-
 arch/x86/include/asm/paravirt.h   |   6 +-
 arch/x86/include/asm/paravirt_types.h |   4 +-
 arch/x86/include/asm/tlbflush.h   |  52 +++
 arch/x86/include/asm/trace/hyperv.h   |   2 +-
 arch/x86/kernel/kvm.c |  11 +-
 arch/x86/kernel/paravirt.c|   2 +-
 arch/x86/mm/init.c|   2 +-
 arch/x86/mm/tlb.c | 195 ++
 arch/x86/xen/mmu_pv.c |  11 +-
 include/linux/cpumask.h 

[PATCH v4 6/9] x86/mm/tlb: Do not make is_lazy dirty for no reason

2019-08-23 Thread Nadav Amit
Blindly writing to is_lazy when the written value is identical to the
old value makes the cacheline dirty for no reason. Avoid such writes
to prevent needless cache coherency traffic.
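
In other words, the conditional-write pattern (a sketch; the patch
itself reuses the was_lazy value already read in switch_mm_irqs_off()):

	if (this_cpu_read(cpu_tlbstate_shared.is_lazy))
		this_cpu_write(cpu_tlbstate_shared.is_lazy, false);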

Cc: Andy Lutomirski 
Cc: Peter Zijlstra 
Suggested-by: Dave Hansen 
Reviewed-by: Dave Hansen 
Signed-off-by: Nadav Amit 
---
 arch/x86/mm/tlb.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 24c9839e3d9b..1393b3cd3697 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -322,7 +322,8 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct 
mm_struct *next,
__flush_tlb_all();
}
 #endif
-   this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
+   if (was_lazy)
+   this_cpu_write(cpu_tlbstate_shared.is_lazy, false);
 
/*
 * The membarrier system call requires a full memory barrier and
-- 
2.17.1



[PATCH v4 7/9] cpumask: Mark functions as pure

2019-08-23 Thread Nadav Amit
cpumask_next_and() and cpumask_any_but() are pure, and marking them as
such seems to generate different and presumably better code for
native_flush_tlb_multi().
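
A small illustration (not from the patch) of what the qualifier
permits: because a __pure function's result depends only on its
arguments and on readable global state, the compiler may merge
repeated calls with identical arguments instead of emitting two
indirect calls.

	static int first_common_twice(const struct cpumask *m1,
				      const struct cpumask *m2)
	{
		int a = cpumask_next_and(-1, m1, m2);
		int b = cpumask_next_and(-1, m1, m2); /* may be folded into 'a' */

		return a + b;
	}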

Reviewed-by: Dave Hansen 
Signed-off-by: Nadav Amit 
---
 include/linux/cpumask.h | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/include/linux/cpumask.h b/include/linux/cpumask.h
index b5a5a1ed9efd..2579700bf44a 100644
--- a/include/linux/cpumask.h
+++ b/include/linux/cpumask.h
@@ -225,7 +225,7 @@ static inline unsigned int cpumask_last(const struct cpumask *srcp)
return find_last_bit(cpumask_bits(srcp), nr_cpumask_bits);
 }
 
-unsigned int cpumask_next(int n, const struct cpumask *srcp);
+unsigned int __pure cpumask_next(int n, const struct cpumask *srcp);
 
 /**
  * cpumask_next_zero - get the next unset cpu in a cpumask
@@ -242,8 +242,8 @@ static inline unsigned int cpumask_next_zero(int n, const struct cpumask *srcp)
return find_next_zero_bit(cpumask_bits(srcp), nr_cpumask_bits, n+1);
 }
 
-int cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
-int cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
+int __pure cpumask_next_and(int n, const struct cpumask *, const struct cpumask *);
+int __pure cpumask_any_but(const struct cpumask *mask, unsigned int cpu);
 unsigned int cpumask_local_spread(unsigned int i, int node);
 
 /**
-- 
2.17.1



[PATCH v4 8/9] x86/mm/tlb: Remove UV special case

2019-08-23 Thread Nadav Amit
The SGI UV TLB flushing code is outdated and will be replaced with a
compatible smp_call_many APIC function in the future. For now, simplify
the code by removing the UV special case.

Cc: Peter Zijlstra 
Suggested-by: Andy Lutomirski 
Acked-by: Mike Travis 
Reviewed-by: Dave Hansen 
Signed-off-by: Nadav Amit 
---
 arch/x86/mm/tlb.c | 25 -
 1 file changed, 25 deletions(-)

diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c
index 1393b3cd3697..3dca146edcf1 100644
--- a/arch/x86/mm/tlb.c
+++ b/arch/x86/mm/tlb.c
@@ -679,31 +679,6 @@ void native_flush_tlb_multi(const struct cpumask *cpumask,
trace_tlb_flush(TLB_REMOTE_SEND_IPI,
(info->end - info->start) >> PAGE_SHIFT);
 
-   if (is_uv_system()) {
-   /*
-* This whole special case is confused.  UV has a "Broadcast
-* Assist Unit", which seems to be a fancy way to send IPIs.
-* Back when x86 used an explicit TLB flush IPI, UV was
-* optimized to use its own mechanism.  These days, x86 uses
-* smp_call_function_many(), but UV still uses a manual IPI,
-* and that IPI's action is out of date -- it does a manual
-* flush instead of calling flush_tlb_func().  This
-* means that the percpu tlb_gen variables won't be updated
-* and we'll do pointless flushes on future context switches.
-*
-* Rather than hooking native_flush_tlb_multi() here, I think
-* that UV should be updated so that smp_call_function_many(),
-* etc, are optimal on UV.
-*/
-   flush_tlb_func((void *)info);
-
-   cpumask = uv_flush_tlb_others(cpumask, info);
-   if (cpumask)
-   smp_call_function_many(cpumask, flush_tlb_func,
-  (void *)info, 1);
-   return;
-   }
-
/*
 * If no page tables were freed, we can skip sending IPIs to
 * CPUs in lazy TLB mode. They will flush the CPU themselves
-- 
2.17.1



Re: [PATCH 5.2 000/135] 5.2.10-stable review

2019-08-23 Thread Sasha Levin

On Fri, Aug 23, 2019 at 07:32:58PM -0700, Greg KH wrote:

On Fri, Aug 23, 2019 at 09:18:05PM -0400, Sasha Levin wrote:

On Fri, Aug 23, 2019 at 10:36:27AM -0700, Greg KH wrote:
> On Fri, Aug 23, 2019 at 02:28:53AM -0400, Sasha Levin wrote:
> > On Fri, Aug 23, 2019 at 02:42:48AM +0200, Stefan Lippers-Hollmann wrote:
> > > Hi
> > >
> > > On 2019-08-22, Greg KH wrote:
> > > > On Fri, Aug 23, 2019 at 12:05:27AM +0200, Stefan Lippers-Hollmann wrote:
> > > > > On 2019-08-22, Greg KH wrote:
> > > > > > On Thu, Aug 22, 2019 at 01:05:56PM -0400, Sasha Levin wrote:
> > > [...]
> > > > > It might be down to kernel.org mirroring, but the patch file doesn't
> > > > > seem to be available yet (404), both in the wrong location listed
> > > > > above - and the expected one under
> > > > >
> > > > >
> > > > > https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.10-rc1.gz
> > > [...]
> > > > Ah, no, it's not a mirroring problem, Sasha and I didn't know if anyone
> > > > was actually using the patch files anymore, so it was simpler to do a
> > > > release without them to see what happens. :)
> > > >
> > > > Do you rely on these, or can you use the -rc git tree or the quilt
> > > > series?  If you do rely on them, we will work to fix this, it just
> > > > involves some scripting that we didn't get done this morning.
> > >
> > > "Rely" is a strong word, I can adapt if they're going away, but
> > > I've been using them so far, as in (slightly simplified):
> > >
> > > $ cd patches/upstream/
> > > $ wget https://cdn.kernel.org/pub/linux/kernel/v5.x/patch-5.2.9.xz
> > > $ xz -d patch-5.2.9.xz
> > > $ wget https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.10-rc1.gz
> > > $ gunzip patch-5.2.10-rc1.gz
> > > $ vim ../series
> > > $ quilt ...
> > >
> > > I can switch to importing the quilt queue with some sed magic (and I
> > > already do that, if interesting or just a larger amounts of patches are
> > > queuing up for more than a day or two), but using the -rc patches has
> > > been convenient in that semi-manual workflow, also to make sure to really
> > > get and test the formal -rc patch, rather than something inbetween.
> >
> > An easy way to generate a patch is to just use the git.kernel.org web
> > interface. A patch for 5.2.10-rc1 would be:
> > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git/patch/?id=linux-5.2.y&id2=v5.2.9
> >
> > Personally this patch upload story sounded to me like a pre-git era
> > artifact...
>
> Given that we no longer do patches for Linus's -rc releases for the past
> few years, maybe it is time to move to do the same for the stable
> releases to be consistent.

Or tarballs? Why do we generate tarballs (and go through kup)?
git.kernel.org already does it for us.


As I mentioned yesterday, but writing it down here for posterity,
there's a number of reasons.

First off, the release process doesn't require kup when a "real"
release happens; that's all now done on git.kernel.org with a process
involving a signed note and some other fun backend stuff.  We are
working on expanding that in the future with some other signature
validation as well, to make it easier to verify tarballs are "correct",
as what we do today is a bit different from other projects.


I think that I made it read like I want to remove tarballs altogether.
That's not the case: I just want to get rid of the magical signed note
process.

The way I understand it, we generate tarballs twice: once during the
magical signed note process, and once by the git interface. I'm just
suggesting we reduce that down to happen once.

Right now you can fetch tarballs from two different links on kernel.org.
For example, a 5.2.9 tarball is available at:

- https://mirrors.edge.kernel.org/pub/linux/kernel/v5.x/linux-5.2.9.tar.xz
- https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux.git/snapshot/linux-5.2.9.tar.gz

Can't we symlink one to the other?


As for the tarball itself, it's still needed for the same reasons we do
so on Linus's releases:
- distros use these.  Don't want all Gentoo users hammering on
  git.kernel.org for their updated builds, that's a huge waste.


We can just place git.kernel.org generated tarballs (for some repos) on
the CDN, no?


- mirroring works _so_ much better around the world for tarballs


Doing the above should solve this.


- legal reasons.  git is not yet "recognized" as being something
  that properly reflects a specific point in time, whereas online
  tarballs that are mirrored and stored around the world are
  finally almost properly recognized for this.


We still keep the tarballs.


- historical, people are used to using them, and workflows are
  built up around them.  People don't like rewriting scripts, as
  can be seen in my monstrosity of a mess that I use for
  releases :)


Right, this shouldn't require changing any scripts.

--
Thanks,
Sasha


RE: [PATCH v2 0/2] Simplify mtty driver and mdev core

2019-08-23 Thread Parav Pandit



> -Original Message-
> From: Alex Williamson 
> Sent: Saturday, August 24, 2019 10:29 AM
> To: Parav Pandit 
> Cc: Jiri Pirko ; Jiri Pirko ; David S .
> Miller ; Kirti Wankhede ;
> Cornelia Huck ; k...@vger.kernel.org; linux-
> ker...@vger.kernel.org; cjia ; net...@vger.kernel.org
> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> 
> On Sat, 24 Aug 2019 03:56:08 +
> Parav Pandit  wrote:
> 
> > > -Original Message-
> > > From: Alex Williamson 
> > > Sent: Saturday, August 24, 2019 1:14 AM
> > > To: Parav Pandit 
> > > Cc: Jiri Pirko ; Jiri Pirko ;
> > > David S . Miller ; Kirti Wankhede
> > > ; Cornelia Huck ;
> > > k...@vger.kernel.org; linux- ker...@vger.kernel.org; cjia
> > > ; net...@vger.kernel.org
> > > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > >
> > > On Fri, 23 Aug 2019 18:00:30 +
> > > Parav Pandit  wrote:
> > >
> > > > > -Original Message-
> > > > > From: Alex Williamson 
> > > > > Sent: Friday, August 23, 2019 10:47 PM
> > > > > To: Parav Pandit 
> > > > > Cc: Jiri Pirko ; Jiri Pirko
> > > > > ; David S . Miller ;
> > > > > Kirti Wankhede ; Cornelia Huck
> > > > > ; k...@vger.kernel.org; linux-
> > > > > ker...@vger.kernel.org; cjia ;
> > > > > net...@vger.kernel.org
> > > > > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > > > >
> > > > > On Fri, 23 Aug 2019 16:14:04 + Parav Pandit
> > > > >  wrote:
> > > > >
> > > > > > > > Idea is to have mdev alias as optional.
> > > > > > > > Each mdev_parent says whether it wants mdev_core to
> > > > > > > > generate an alias or not. So only networking device drivers
> would set it to true.
> > > > > > > > For rest, alias won't be generated, and won't be compared
> > > > > > > > either during creation time. User continue to provide only uuid.
> > > > > > >
> > > > > > > Ok
> > > > > > >
> > > > > > > > I am tempted to have alias collision detection only within
> > > > > > > > children mdevs of the same parent, but doing so will
> > > > > > > > always mandate to prefix in netdev name. And currently we
> > > > > > > > are left with only 3 characters to prefix it, so that may not be good either.
> > > > > > > > Hence, I think mdev core wide alias is better with 12 
> > > > > > > > characters.
> > > > > > >
> > > > > > > I suppose it depends on the API, if the vendor driver can
> > > > > > > ask the mdev core for an alias as part of the device
> > > > > > > creation process, then it could manage the netdev namespace
> > > > > > > for all its devices, choosing how many characters to use,
> > > > > > > and fail the creation if it can't meet a uniqueness
> > > > > > > requirement.  IOW, mdev-core would always provide a full
> > > > > > > sha1 and therefore gets itself out of the uniqueness/collision aspects.
> > > > > > >
> > > > > > This doesn't work. At mdev core level 20 bytes sha1 are
> > > > > > unique, so mdev core allowed to create a mdev.
> > > > >
> > > > > The mdev vendor driver has the opportunity to fail the device
> > > > > creation in mdev_parent_ops.create().
> > > > >
> > > > That is not helpful for below reasons.
> > > > 1. vendor driver doesn't have visibility in other vendor's alias.
> > > > 2. Even for single vendor, it needs to maintain global list of
> > > > devices to see
> > > collision.
> > > > 3. multiple vendors needs to implement same scheme.
> > > >
> > > > Mdev core should be the owner. Shifting ownership from one layer
> > > > to a lower layer in vendor driver doesn't solve the problem (if
> > > > there is one, which I think doesn't exist).
> > > >
> > > > > > And then devlink core chooses
> > > > > > only 6 bytes (12 characters) and there is collision. Things
> > > > > > fall apart. Since mdev provides unique uuid based scheme, it's
> > > > > > the mdev core's ownership to provide unique aliases.
> > > > >
> > > > > You're suggesting/contemplating multiple solutions here, 3-char
> > > > > prefix + 12- char sha1 vs  + ?-char sha1.  Also,
> > > > > the 15-char total limit is imposed by an external subsystem,
> > > > > where the vendor driver is the gateway between that subsystem
> > > > > and mdev.  How would mdev integrate with another subsystem that
> > > > > maybe only has 9-chars available?  Would the vendor driver API
> > > > > specify "I need an alias" or would it specify "I need an X-char length
> alias"?
> > > > Yes, Vendor driver should say how long the alias it wants.
> > > > However before we implement that, I suggest let such
> > > > vendor/user/driver arrive which needs that. Such variable length
> > > > alias can be added at that time and even with that alias collision
> > > > can be detected by single mdev module.
> > >
> > > If we agree that different alias lengths are possible, then I would
> > > request that minimally an mdev sample driver be modified to request
> > > an alias with a length that can be adjusted without recompiling in order
> > > to exercise the collision path.
> > >
Yes. this can be done. But I fail to understand the need to do so.
It is not the responsibility of the mdev core to show case sha1
collision efficiency/deficiency. So why do you insist exercise it?

Re: [RFC PATCH v2 00/19] RDMA/FS DAX truncate proposal V1,000,002 ;-)

2019-08-23 Thread Ira Weiny
On Sat, Aug 24, 2019 at 10:11:24AM +1000, Dave Chinner wrote:
> On Fri, Aug 23, 2019 at 09:04:29AM -0300, Jason Gunthorpe wrote:
> > On Fri, Aug 23, 2019 at 01:23:45PM +1000, Dave Chinner wrote:
> > 
> > > > But the fact that RDMA, and potentially others, can "pass the
> > > > pins" to other processes is something I spent a lot of time trying to 
> > > > work out.
> > > 
> > > There's nothing in file layout lease architecture that says you
> > > can't "pass the pins" to another process.  All the file layout lease
> > > requirements say is that if you are going to pass a resource for
> > > which the layout lease guarantees access for to another process,
> > > then the destination process already have a valid, active layout
> > > lease that covers the range of the pins being passed to it via the
> > > RDMA handle.
> > 
> > How would the kernel detect and enforce this? There are many ways to
> > pass a FD.
> 
> AFAIC, that's not really a kernel problem. It's more of an
> application design constraint than anything else. i.e. if the app
> passes the IB context to another process without a lease, then the
> original process is still responsible for recalling the lease and
> has to tell that other process to release the IB handle and it's
> resources.
> 
> > IMHO it is wrong to try and create a model where the file lease exists
> > independently from the kernel object relying on it. In other words the
> > IB MR object itself should hold a reference to the lease it relies
> > upon to function properly.
> 
> That still doesn't work. Leases are not individually trackable or
> reference counted objects objects - they are attached to a struct
> file bUt, in reality, they are far more restricted than a struct
> file.
> 
> That is, a lease specifically tracks the pid and the _open fd_ it
> was obtained for, so it is essentially owned by a specific process
> context.  Hence a lease is not able to be passed to a separate
> process context and have it still work correctly for lease break
> notifications.  i.e. the layout break signal gets delivered to
> original process that created the struct file, if it still exists
> and has the original fd still open. It does not get sent to the
> process that currently holds a reference to the IB context.
>

The fcntl man page says:

"Leases are associated with an open file description (see open(2)).  This means
that duplicate file descriptors (created by, for example, fork(2) or dup(2))
refer to the same lease, and this lease may be modified or released using any
of these descriptors.  Furthermore,  the lease is released by either an
explicit F_UNLCK operation on any of these duplicate file descriptors, or when
all such file descriptors have been closed."

From this I took it that the child process FD would have the lease as well
_and_ could release it.  I _assumed_ that applied to SCM_RIGHTS but it does not
seem to work the same way as dup() so I'm not so sure.
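
A minimal sketch of the dup() case (untested, illustrative):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <unistd.h>

	int lease_dup_demo(const char *path)
	{
		int fd = open(path, O_RDONLY);

		if (fd < 0 || fcntl(fd, F_SETLEASE, F_RDLCK) < 0)
			return -1;

		/* dup() shares the open file description, hence the lease. */
		int fd2 = dup(fd);

		/* Releasing through the duplicate releases the shared lease. */
		fcntl(fd2, F_SETLEASE, F_UNLCK);
		close(fd2);
		close(fd);
		return 0;
	}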

Ira

> 
> So while a struct file passed to another process might still have
> an active lease, and you can change the owner of the struct file
> via fcntl(F_SETOWN), you can't associate the existing lease with a
> the new fd in the new process and so layout break signals can't be
> directed at the lease fd
> 
> This really means that a lease can only be owned by a single process
> context - it can't be shared across multiple processes (so I was
> wrong about dup/pass as being a possible way of passing them)
> because there's only one process that can "own" a struct file, and
> that where signals are sent when the lease needs to be broken.
> 
> So, fundamentally, if you want to pass a resource that pins a file
> layout between processes, both processes need to hold a layout lease
> on that file range. And that means exclusive leases and passing
> layouts between processes are fundamentally incompatible because you
> can't hold two exclusive leases on the same file range
> 
> Cheers,
> 
> Dave.
> -- 
> Dave Chinner
> da...@fromorbit.com


Re: [PATCH v2 0/2] Simplify mtty driver and mdev core

2019-08-23 Thread Alex Williamson
On Sat, 24 Aug 2019 03:56:08 +
Parav Pandit  wrote:

> > -Original Message-
> > From: Alex Williamson 
> > Sent: Saturday, August 24, 2019 1:14 AM
> > To: Parav Pandit 
> > Cc: Jiri Pirko ; Jiri Pirko ; David S 
> > . Miller
> > ; Kirti Wankhede ; Cornelia
> > Huck ; k...@vger.kernel.org; linux-
> > ker...@vger.kernel.org; cjia ; net...@vger.kernel.org
> > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > 
> > On Fri, 23 Aug 2019 18:00:30 +
> > Parav Pandit  wrote:
> >   
> > > > -Original Message-
> > > > From: Alex Williamson 
> > > > Sent: Friday, August 23, 2019 10:47 PM
> > > > To: Parav Pandit 
> > > > Cc: Jiri Pirko ; Jiri Pirko ;
> > > > David S . Miller ; Kirti Wankhede
> > > > ; Cornelia Huck ;
> > > > k...@vger.kernel.org; linux- ker...@vger.kernel.org; cjia
> > > > ; net...@vger.kernel.org
> > > > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > > >
> > > > On Fri, 23 Aug 2019 16:14:04 +
> > > > Parav Pandit  wrote:
> > > >  
> > > > > > > Idea is to have mdev alias as optional.
> > > > > > > Each mdev_parent says whether it wants mdev_core to generate
> > > > > > > an alias or not. So only networking device drivers would set it 
> > > > > > > to true.
> > > > > > > For rest, alias won't be generated, and won't be compared
> > > > > > > either during creation time. User continue to provide only uuid.  
> > > > > >
> > > > > > Ok
> > > > > >  
> > > > > > > I am tempted to have alias collision detection only within
> > > > > > > children mdevs of the same parent, but doing so will always
> > > > > > > mandate to prefix in netdev name. And currently we are left
> > > > > > > with only 3 characters to prefix it, so that may not be good 
> > > > > > > either.
> > > > > > > Hence, I think mdev core wide alias is better with 12 characters. 
> > > > > > >  
> > > > > >
> > > > > > I suppose it depends on the API, if the vendor driver can ask
> > > > > > the mdev core for an alias as part of the device creation
> > > > > > process, then it could manage the netdev namespace for all its
> > > > > > devices, choosing how many characters to use, and fail the
> > > > > > creation if it can't meet a uniqueness requirement.  IOW,
> > > > > > mdev-core would always provide a full
> > > > > > sha1 and therefore gets itself out of the uniqueness/collision 
> > > > > > aspects.
> > > > > >  
> > > > > This doesn't work. At mdev core level 20 bytes sha1 are unique, so
> > > > > mdev core allowed to create a mdev.  
> > > >
> > > > The mdev vendor driver has the opportunity to fail the device
> > > > creation in mdev_parent_ops.create().
> > > >  
> > > That is not helpful for below reasons.
> > > 1. vendor driver doesn't have visibility in other vendor's alias.
> > > 2. Even for single vendor, it needs to maintain global list of devices to 
> > > see collision.
> > > 3. multiple vendors needs to implement same scheme.
> > >
> > > Mdev core should be the owner. Shifting ownership from one layer to a
> > > lower layer in vendor driver doesn't solve the problem (if there is
> > > one, which I think doesn't exist).
> > >  
> > > > > And then devlink core chooses
> > > > > only 6 bytes (12 characters) and there is collision. Things fall
> > > > > apart. Since mdev provides unique uuid based scheme, it's the mdev
> > > > > core's ownership to provide unique aliases.  
> > > >
> > > > You're suggesting/contemplating multiple solutions here, 3-char
> > > > prefix + 12- char sha1 vs  + ?-char sha1.  Also, the
> > > > 15-char total limit is imposed by an external subsystem, where the
> > > > vendor driver is the gateway between that subsystem and mdev.  How
> > > > would mdev integrate with another subsystem that maybe only has
> > > > 9-chars available?  Would the vendor driver API specify "I need an
> > > > alias" or would it specify "I need an X-char length alias"?  
> > > Yes, Vendor driver should say how long the alias it wants.
> > > However before we implement that, I suggest let such
> > > vendor/user/driver arrive which needs that. Such variable length alias
> > > can be added at that time and even with that alias collision can be
> > > detected by single mdev module.  
> > 
> > If we agree that different alias lengths are possible, then I would
> > request that minimally an mdev sample driver be modified to request
> > an alias with a length that can be adjusted without recompiling in
> > order to exercise the collision path.
> >
> Yes. this can be done. But I fail to understand the need to do so.
> It is not the responsibility of the mdev core to show case sha1
> collision efficiency/deficiency. So why do you insist exercise it?

I don't understand what you're trying to imply with "show case sha1
collision efficiency/deficiency".  Are you suggesting that I'm asking
for this feature to experimentally test the probability of collisions
at different character lengths?  We can use shell scripts for that.
I'm simply observing that col

Re: [RFC PATCH v2 00/19] RDMA/FS DAX truncate proposal V1,000,002 ;-)

2019-08-23 Thread Ira Weiny
On Fri, Aug 23, 2019 at 01:23:45PM +1000, Dave Chinner wrote:
> On Wed, Aug 21, 2019 at 01:44:21PM -0700, Ira Weiny wrote:
> > On Wed, Aug 21, 2019 at 04:48:10PM -0300, Jason Gunthorpe wrote:
> > > On Wed, Aug 21, 2019 at 11:57:03AM -0700, Ira Weiny wrote:
> > > 
> > > > > Oh, I didn't think we were talking about that. Hanging the close of
> > > > > the datafile fd contingent on some other FD's closure is a recipe for
> > > > > deadlock..
> > > > 
> > > > The discussion between Jan and Dave was concerning what happens when a 
> > > > user
> > > > calls
> > > > 
> > > > fd = open()
> > > > fnctl(...getlease...)
> > > > addr = mmap(fd...)
> > > > ib_reg_mr() 
> > > > munmap(addr...)
> > > > close(fd)
> > > 
> > > I don't see how blocking close(fd) could work.
> > 
> > Well Dave was saying this _could_ work. FWIW I'm not 100% sure it will but I
> > can't prove it won't..
> 
> Right, I proposed it as a possible way of making sure application
> developers don't do this. It _could_ be made to work (e.g. recording
> longterm page pins on the vma->file), but this is tangential to 
> the discussion of requiring active references to all resources
> covered by the layout lease.
> 
> I think allowing applications to behave like the above is simply
> poor system level design, regardless of the interaction with
> filesystems and layout leases.
> 
> > Maybe we are all just touching a different part of this
> > elephant[1] but the above scenario or one without munmap is very reasonably
> > something a user would do.  So we can either allow the close to complete (my
> > current patches) or try to make it block like Dave is suggesting.

My belief when writing the current series was that hanging the close would
cause deadlock.  But it seems I was wrong because of the delayed __fput().

So far, I have not been able to get RDMA to have an issue like Jason suggested
would happen (or used to happen).  So from that perspective it may be ok to
hang the close.
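
For concreteness, a minimal sketch of the open/fcntl/mmap/reg_mr sequence
quoted above (assuming libibverbs; the helper name is mine and error handling
is omitted):

#include <fcntl.h>
#include <sys/mman.h>
#include <unistd.h>
#include <infiniband/verbs.h>

/* Illustrative only: the pins taken by ibv_reg_mr() outlive both the
 * mapping and the file descriptor, which is exactly the situation at
 * issue in this thread. */
static struct ibv_mr *pin_then_drop_fd(struct ibv_pd *pd, const char *path,
                                       size_t len)
{
        int fd = open(path, O_RDONLY);

        fcntl(fd, F_SETLEASE, F_RDLCK);         /* stand-in for the lease */
        void *addr = mmap(NULL, len, PROT_READ, MAP_SHARED, fd, 0);
        struct ibv_mr *mr = ibv_reg_mr(pd, addr, len, 0); /* pins pages */
        munmap(addr, len);                      /* mapping is gone */
        close(fd);                              /* fd is gone, pins remain */
        return mr;
}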

> > 
> > I don't disagree with Dave with the semantics being nice and clean for the
> > filesystem.
> 
> I'm not trying to make it "nice and clean for the filesystem".
> 
> The problem is not just RDMA/DAX - anything that is directly
> accessing the block device under the filesystem has the same set of
> issues. That is, the filesystem controls the life cycle of the
> blocks in the block device, so direct access to the blocks by any
> means needs to be co-ordinated with the filesystem. Pinning direct
> access to a file via page pins attached to a hardware context that
> the filesystem knows nothing about is not an access model that the
> filesystems can support.
> 
> IOWs, anyone looking at this problem just from the RDMA POV of page
> pins is not seeing all the other direct storage access mechanisms
> that we need to support in the filesystems. RDMA on DAX is just one
> of them.  pNFS is another. Remote access via NVMeOF is another. XDP
> -> DAX (direct file data placement from the network hardware) is
> another. There are /lots/ of different direct storage access
> mechanisms that filesystems need to support and we sure as hell do
> not want to have to support special case semantics for every single
> one of them.

My use of struct file was based on the fact that FDs are a primary interface
for Linux, and my thought was that they would be more universal than having
file pin information stored in an RDMA-specific structure.

XDP is not as direct; it uses sockets.  But sockets also have a struct file
which I believe could be used in a similar manner.  I'm not 100% sure of the
xdp_umem lifetime yet but it seems that my choice of using struct file was a
good one in this respect.

> 
> Hence if we don't start with a sane model for arbitrating direct
> access to the storage at the filesystem level we'll never get this
> stuff to work reliably, let alone work together coherently.  An
> application that wants a direct data path to storage should have a
> single API that enables them to safely access the storage,
> regardless of how they are accessing the storage.
> 
> From that perspective, what we are talking about here with RDMA
> doing "mmap, page pin, unmap, close" and "pass page pins via
> SCM_RIGHTS" are fundamentally unworkable from the filesystem
> perspective. They are use-after-free situations from the filesystem
> perspective - they do not hold direct references to anything in the
> filesystem, and so the filesytem is completely unaware of them.

I see your point of view, but looking at it from a different angle I
don't see this as a "use after free".

The user has explicitly registered this memory (and layout) with another direct
access subsystem (RDMA for example) so why do they need to keep the FD around?

> 
> The filesystem needs to be aware of /all users/ of its resources if
> it's going to manage them sanely.

From the way I look at it, the underlying filesystem _is_ aware of the leases
with my patch set.  And so too is the user.  It is just not through the 

RE: [PATCH v2 0/2] Simplify mtty driver and mdev core

2019-08-23 Thread Parav Pandit
Hi Alex,

> -Original Message-
> From: linux-kernel-ow...@vger.kernel.org <linux-kernel-ow...@vger.kernel.org> On Behalf Of Parav Pandit
> Sent: Saturday, August 24, 2019 9:26 AM
> To: Alex Williamson 
> Cc: Jiri Pirko ; Jiri Pirko ; David S .
> Miller ; Kirti Wankhede ;
> Cornelia Huck ; k...@vger.kernel.org; linux-
> ker...@vger.kernel.org; cjia ; net...@vger.kernel.org
> Subject: RE: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > I don't understand this logic.  I'm simply asking that we have a way
> > to test the collision behavior without changing the binary.  The path
> > we're driving towards seems to be making this easier and easier.  If
> > the vendor can request an alias of a specific length, then a sample
> > driver with a module option to set the desired alias length to 1-char makes
> it trivially easy to induce a collision.
> Sure, it is easy to test collisions, but my point is that the mdev core is
> not a sha1 test module.
> Hence adding variable-alias-length functionality just to test collisions
> doesn't make sense.
> When the actual user arrives who needs small alias, we will be able to add
> additional pieces very easily.

My initial thought was to add a bool flag to parent_ops saying whether to
generate an alias or not.
However, instead of a bool, keeping it an unsigned int serves both purposes
with trivial handling: zero means skip the alias, and a non-zero length means
generate an alias of that length.
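
A rough sketch of what that could look like (hypothetical field and comments,
not the actual mdev API):

/* Hypothetical: an alias_length field in mdev_parent_ops, where 0 means
 * "do not generate an alias" and a non-zero value requests an alias of
 * that many characters derived from the sha1 of the uuid. */
struct mdev_parent_ops {
        /* ... existing callbacks (create, remove, ...) ... */
        unsigned int alias_length;      /* 0: skip alias generation */
};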




RE: [PATCH v2 0/2] Simplify mtty driver and mdev core

2019-08-23 Thread Parav Pandit



> -Original Message-
> From: Alex Williamson 
> Sent: Saturday, August 24, 2019 1:14 AM
> To: Parav Pandit 
> Cc: Jiri Pirko ; Jiri Pirko ; David S . 
> Miller
> ; Kirti Wankhede ; Cornelia
> Huck ; k...@vger.kernel.org; linux-
> ker...@vger.kernel.org; cjia ; net...@vger.kernel.org
> Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> 
> On Fri, 23 Aug 2019 18:00:30 +
> Parav Pandit  wrote:
> 
> > > -Original Message-
> > > From: Alex Williamson 
> > > Sent: Friday, August 23, 2019 10:47 PM
> > > To: Parav Pandit 
> > > Cc: Jiri Pirko ; Jiri Pirko ;
> > > David S . Miller ; Kirti Wankhede
> > > ; Cornelia Huck ;
> > > k...@vger.kernel.org; linux- ker...@vger.kernel.org; cjia
> > > ; net...@vger.kernel.org
> > > Subject: Re: [PATCH v2 0/2] Simplify mtty driver and mdev core
> > >
> > > On Fri, 23 Aug 2019 16:14:04 +
> > > Parav Pandit  wrote:
> > >
> > > > > > Idea is to have mdev alias as optional.
> > > > > > Each mdev_parent says whether it wants mdev_core to generate
> > > > > > an alias or not. So only networking device drivers would set it to 
> > > > > > true.
> > > > > > For rest, alias won't be generated, and won't be compared
> > > > > > either during creation time. User continue to provide only uuid.
> > > > >
> > > > > Ok
> > > > >
> > > > > > I am tempted to have alias collision detection only within
> > > > > > children mdevs of the same parent, but doing so will always
> > > > > > mandate a prefix in the netdev name. And currently we are left
> > > > > > with only 3 characters to prefix it, so that may not be good either.
> > > > > > Hence, I think an mdev-core-wide alias is better with 12 characters.
> > > > >
> > > > > I suppose it depends on the API, if the vendor driver can ask
> > > > > the mdev core for an alias as part of the device creation
> > > > > process, then it could manage the netdev namespace for all its
> > > > > devices, choosing how many characters to use, and fail the
> > > > > creation if it can't meet a uniqueness requirement.  IOW,
> > > > > mdev-core would always provide a full
> > > > > sha1 and therefore gets itself out of the uniqueness/collision 
> > > > > aspects.
> > > > >
> > > > This doesn't work. At the mdev core level the 20-byte sha1 values are
> > > > unique, so the mdev core is allowed to create an mdev.
> > >
> > > The mdev vendor driver has the opportunity to fail the device
> > > creation in mdev_parent_ops.create().
> > >
> > That is not helpful for the reasons below.
> > 1. A vendor driver doesn't have visibility into other vendors' aliases.
> > 2. Even for a single vendor, it needs to maintain a global list of devices
> > to see collisions.
> > 3. Multiple vendors need to implement the same scheme.
> >
> > Mdev core should be the owner. Shifting ownership from one layer to a
> > lower layer in vendor driver doesn't solve the problem (if there is
> > one, which I think doesn't exist).
> >
> > > > And then devlink core chooses
> > > > only 6 bytes (12 characters) and there is collision. Things fall
> > > > apart. Since mdev provides unique uuid based scheme, it's the mdev
> > > > core's ownership to provide unique aliases.
> > >
> > > You're suggesting/contemplating multiple solutions here, 3-char
> > > prefix + 12-char sha1 vs  + ?-char sha1.  Also, the
> > > 15-char total limit is imposed by an external subsystem, where the
> > > vendor driver is the gateway between that subsystem and mdev.  How
> > > would mdev integrate with another subsystem that maybe only has
> > > 9-chars available?  Would the vendor driver API specify "I need an
> > > alias" or would it specify "I need an X-char length alias"?
> > Yes, the vendor driver should say how long an alias it wants.
> > However, before we implement that, I suggest we let such a
> > vendor/user/driver arrive which needs that. Such a variable-length alias
> > can be added at that time, and even then alias collisions can be
> > detected by the single mdev module.
> 
> If we agree that different alias lengths are possible, then I would request
> that minimally an mdev sample driver be modified to request an alias with a
> length that can be adjusted without recompiling in order to exercise the
> collision path.
> 
Yes, this can be done. But I fail to understand the need to do so.
It is not the responsibility of the mdev core to showcase sha1 collision
efficiency or deficiency.
So why do you insist on exercising it?

> If mdev-core is guaranteeing uniqueness, does this indicate that each alias
> length constitutes a separate namespace?  ie. strictly a strcmp(), not a
> strncmp() to the shorter alias.
> 
Yes.
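
In other words, collision detection would behave like this hypothetical
helper (illustrative only, not mdev code):

static bool alias_collides(const char *a, const char *b)
{
        /* Same-length aliases are compared exactly; a shorter alias is
         * never treated as a colliding prefix of a longer one. */
        return strlen(a) == strlen(b) && !strcmp(a, b);
}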


> > > Does it make sense that mdev-core would fail creation of a device if
> > > there's a collision in the 12-char address space between different
> > > subsystems?  For example, does enm0123456789ab really
> > > collide with xyz0123456789ab?
> > I think so, because at the mdev level it's the 12-char alias that matters.
> > Choosing a prefix or not adding one is really a user-space choice.
> >
> > >  So if
> > > mdev were to 

Re: [PATCH 4/5] mtd: spi-nor: Move clear_sr_bp() to 'struct spi_nor_flash_parameter'

2019-08-23 Thread Tudor.Ambarus


On 08/23/2019 06:53 PM, Tudor Ambarus - M18064 wrote:
> +  * configuration register Quad Enable bit is one, only the the
^duplication


Re: [PATCH v1] [semaphore] Removed redundant code from semaphore's down family of function

2019-08-23 Thread Satendra Singh Thakur
On Thu, 22 Aug 2019 17:51:12 +0200, Peter Zijlstra wrote:
> On Mon, Aug 12, 2019 at 07:18:59PM +0530, Satendra Singh Thakur wrote:
> > -The semaphore code has four funcs
> > down,
> > down_interruptible,
> > down_killable,
> > down_timeout
> > -These four funcs have almost similar code except that
> > they all call lower level function __down_xyz.
> > -This lower level func in-turn call inline func
> > __down_common with appropriate arguments.
> > -This patch creates a common macro for above family of funcs
> > so that duplicate code is eliminated.
> > -Also, __down_common has been made noinline so that code is
> > functionally similar to previous one
> > -For example, earlier down_killable would call __down_killable
> > , which in-turn would call inline func __down_common
> > Now, down_killable calls noinline __down_common directly
> > through a macro
> > -The funcs __down_interruptible, __down_killable etc have been
> > removed as they were just wrapper to __down_common
>
> The above is unreadable and seems to lack a reason for this change.
Hi Mr Peter,
Thanks for the comments.
I will try to explain it further:

The semaphore has four functions named down*.
The call flow of the functions is

down* > __down* > inline __down_common

The code of the down* and __down* functions is redundant/duplicated, except
that __down_common is called with different arguments from each __down*
function.

This patch defines a macro, down_common, which contains the common
code of all the down* functions.

new call flow is

down* > noinline __down_common (through a macro down_common).
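
To make that concrete, here is a rough sketch of the kind of macro being
described (illustrative, not the actual patch; it assumes __down_common()
keeps its current argument list):

#define down_common(sem, state, timeout)                                \
({                                                                      \
        unsigned long _flags;                                           \
        int _ret = 0;                                                   \
                                                                        \
        raw_spin_lock_irqsave(&(sem)->lock, _flags);                    \
        if (likely((sem)->count > 0))                                   \
                (sem)->count--;                                         \
        else                                                            \
                _ret = __down_common(sem, state, timeout);              \
        raw_spin_unlock_irqrestore(&(sem)->lock, _flags);               \
        _ret;                                                           \
})

int down_killable(struct semaphore *sem)
{
        return down_common(sem, TASK_KILLABLE, MAX_SCHEDULE_TIMEOUT);
}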

> AFAICT from the actual patch, you're destroying the explicit
> instantiation of the __down*() functions
> through constant propagation into __down_common().
Instead of instantiating the __down* functions, we are instantiating
__down_common; is that a problem?

Thanks
Satendra


Re: [PATCH] Partially revert "mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones"

2019-08-23 Thread Yafang Shao
On Sat, Aug 24, 2019 at 6:33 AM Roman Gushchin  wrote:
>
> On Fri, Aug 16, 2019 at 05:47:26PM -0700, Roman Gushchin wrote:
> > Commit 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync
> > with the hierarchical ones") effectively decreased the precision of
> > per-memcg vmstats_local and per-memcg-per-node lruvec percpu counters.
> >
> > That's good for displaying in memory.stat, but brings a serious regression
> > into the reclaim process.
> >
> > One issue I've discovered and debugged is the following:
> > lruvec_lru_size() can return 0 instead of the actual number of pages
> > in the lru list, preventing the kernel from reclaiming the last remaining
> > pages. The result is yet more dying memory cgroup flooding.
> > The opposite is also happening: scanning an empty lru list
> > is a waste of cpu time.
> >
> > Also, inactive_list_is_low() can return incorrect values, preventing
> > the active lru from being scanned and freed. It can fail both because
> > the sizes of the active and inactive lists are inaccurate, and because
> > the number of workingset refaults isn't precise. In other words,
> > the result is pretty random.
> >
> > I'm not sure if using the approximate number of slab pages in
> > count_shadow_nodes() is acceptable, but the issues described above
> > are enough to partially revert the patch.
> >
> > Let's keep per-memcg vmstat_local batched (they are only used for
> > displaying stats to the userspace), but keep lruvec stats precise.
> > This change fixes the dead memcg flooding on my setup.
> >
> > Fixes: 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync with 
> > the hierarchical ones")
> > Signed-off-by: Roman Gushchin 
> > Cc: Yafang Shao 
> > Cc: Johannes Weiner 
>
> Any other concerns/comments here?
>
> I'd prefer to fix the regression: we're likely leaking several pages
> of memory for each created and destroyed memory cgroup. Plus
> all internal structures, which are measured in hundreds of kb.
>

Hi Roman,

Since it really does introduce issues, I agree with you that we should fix it first.

So for your fix,
Acked-by: Yafang Shao 

Thanks
Yafang
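
For readers wondering why batching makes the local counters imprecise, here
is a toy model of per-cpu stat batching (illustrative names and numbers, not
the memcg code): updates below the batch threshold stay CPU-local, so a
reader of the shared counter can undercount by up to NR_CPUS * STAT_BATCH
pages.

#define NR_CPUS         64      /* illustrative */
#define STAT_BATCH      32      /* illustrative */

struct batched_counter {
        long shared;                    /* what readers see */
        long delta[NR_CPUS];            /* pending per-cpu updates */
};

static void counter_add(struct batched_counter *c, int cpu, long n)
{
        long d = c->delta[cpu] + n;

        if (d > STAT_BATCH || d < -STAT_BATCH) {
                c->shared += d;         /* flush only when the batch fills */
                d = 0;
        }
        c->delta[cpu] = d;
}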


Re: WARNINGs in set_task_reclaim_state with memory cgroup and full memory usage

2019-08-23 Thread Yafang Shao
On Sat, Aug 24, 2019 at 10:57 AM Hillf Danton  wrote:
>
>
> On Fri, 23 Aug 2019 18:00:15 -0400 Adric Blake wrote:
> > Synopsis:
> > A WARN_ON_ONCE is hit twice in set_task_reclaim_state under the
> > following conditions:
> > - a memory cgroup has been created and a task assigned to it
> > - memory.limit_in_bytes has been set
> > - memory has filled up, likely from cache
> >
> Thanks for report.
>
> > In my usage, I create a cgroup under the current session scope and
> > assign a task to it. I then set memory.limit_in_bytes and
> > memory.soft_limit_in_bytes for the cgroup to reasonable values, say
> > 1G/512M. The program accesses large files frequently and gradually
> > fills memory with the page cache. The warnings appears when the
> > entirety of the system memory is filled, presumably from other
> > programs.
> >
> > If I wait until the program has filled the entirety of system memory
> > with cache and then assign a memory limit, the warnings appear
> > immediately.
> >
> > I am building the linux git. I first noticed this issue with the
> > drm-tip 5.3rc3 and 5.3rc4 kernels, and tested linux master after
> > 5.3rc5 to confirm the bug more resoundingly.
> >
> > Here are the warnings.
> >
> > [38491.963105] WARNING: CPU: 7 PID: 175 at mm/vmscan.c:245 
> > set_task_reclaim_state+0x1e/0x40
> > [38491.963106] Modules linked in: iwlmvm mac80211 libarc4 iwlwifi
> > cfg80211 xt_comment nls_iso8859_1 nls_cp437 vfat fat xfs jfs btrfs xor
> > raid6_pq libcrc32c ccm tun rfcomm fuse xt_tcpudp ip6t_REJECT
> > nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_multiport xt_owner
> > snd_hda_codec_hdmi ip6table_filter ip6_tables iptable_filter bnep ext4
> > crc32c_generic mbcache jbd2 snd_hda_codec_realtek
> > snd_hda_codec_generic snd_soc_skl snd_soc_hdac_hda snd_hda_ext_core
> > snd_soc_skl_ipc x86_pkg_temp_thermal intel_powerclamp snd_soc_sst_ipc
> > coretemp snd_soc_sst_dsp snd_soc_acpi_intel_match kvm_intel
> > snd_soc_acpi i915 snd_soc_core kvm snd_compress ac97_bus
> > snd_pcm_dmaengine snd_hda_intel i2c_algo_bit btusb irqbypass
> > drm_kms_helper btrtl snd_hda_codec dell_laptop btbcm crct10dif_pclmul
> > snd_hda_core crc32c_intel btintel iTCO_wdt ghash_clmulni_intel drm
> > ledtrig_audio aesni_intel iTCO_vendor_support snd_hwdep dell_wmi
> > rtsx_usb_ms r8169 dell_smbios aes_x86_64 mei_hdcp crypto_simd
> > intel_gtt bluetooth snd_pcm cryptd dcdbas
> > [38491.963155]  wmi_bmof dell_wmi_descriptor intel_rapl_msr
> > glue_helper snd_timer joydev intel_cstate snd realtek memstick
> > dell_smm_hwmon mousedev psmouse input_leds libphy intel_uncore
> > ecdh_generic ecc crc16 rfkill intel_rapl_perf soundcore i2c_i801
> > agpgart mei_me tpm_crb syscopyarea sysfillrect sysimgblt mei
> > intel_xhci_usb_role_switch fb_sys_fops idma64 tpm_tis roles
> > processor_thermal_device intel_rapl_common i2c_hid tpm_tis_core
> > int3403_thermal intel_soc_dts_iosf battery wmi intel_lpss_pci
> > intel_lpss intel_pch_thermal tpm int3400_thermal int3402_thermal
> > acpi_thermal_rel int340x_thermal_zone rng_core intel_hid ac
> > sparse_keymap evdev mac_hid crypto_user ip_tables x_tables
> > hid_multitouch rtsx_usb_sdmmc mmc_core rtsx_usb hid_logitech_hidpp
> > sr_mod cdrom sd_mod uas usb_storage hid_logitech_dj hid_generic usbhid
> > hid ahci serio_raw libahci atkbd libps2 libata xhci_pci scsi_mod
> > xhci_hcd crc32_pclmul i8042 serio f2fs [last unloaded: cfg80211]
> > [38491.963221] CPU: 7 PID: 175 Comm: kswapd0 Not tainted 
> > 5.3.0-rc5+149+gbb7ba8069de9 #1
> > [38491.963222] Hardware name: Dell Inc. Inspiron 5570/09YTN7, BIOS 1.2.3 
> > 05/15/2019
> > [38491.963226] RIP: 0010:set_task_reclaim_state+0x1e/0x40
> > [38491.963228] Code: 78 a9 e7 ff 0f 1f 84 00 00 00 00 00 0f 1f 44 00
> > 00 55 48 89 f5 53 48 89 fb 48 85 ed 48 8b 83 08 08 00 00 74 11 48 85
> > c0 74 02 <0f> 0b 48 89 ab 08 08 00 00 5b 5d c3 48 85 c0 75 f1 0f 0b 48
> > 89 ab
> > [38491.963229] RSP: 0018:8c898031fc60 EFLAGS: 00010286
> > [38491.963230] RAX: 8c898031fe28 RBX: 892aa04ddc40 RCX: 
> > 
> > [38491.963231] RDX: 8c898031fc60 RSI: 8c898031fcd0 RDI: 
> > 892aa04ddc40
> > [38491.963233] RBP: 8c898031fcd0 R08: 8c898031fd48 R09: 
> > 89279674b800
> > [38491.963234] R10:  R11:  R12: 
> > 8c898031fd48
> > [38491.963235] R13: 892a842ef000 R14: 892aaf7fc000 R15: 
> > 
> > [38491.963236] FS:  () GS:892aa33c() 
> > knlGS:
> > [38491.963238] CS:  0010 DS:  ES:  CR0: 80050033
> > [38491.963239] CR2: 7f90628fa000 CR3: 00027ee0a002 CR4: 
> > 003606e0
> > [38491.963239] Call Trace:
> > [38491.963246]  mem_cgroup_shrink_node+0x9b/0x1d0
> > [38491.963250]  mem_cgroup_soft_limit_reclaim+0x10c/0x3a0
> > [38491.963254]  balance_pgdat+0x276/0x540
> > [38491.963258]  kswapd+0x200/0x3f0
> > [38491.963261]  ? wait_woken+0x80/0x80
> > [38491.963265]  kthread+0xfd/0x130
> > [38491.963267]  ? balance_pgdat+0

Re: [PATCH v2] i2c: mediatek: disable zero-length transfers for mt8183

2019-08-23 Thread Qii Wang
On Fri, 2019-08-23 at 16:13 +0800, Hsin-Yi Wang wrote:
> On Fri, Aug 23, 2019 at 4:09 PM Qii Wang  wrote:
> 
> > >
> > >  static u32 mtk_i2c_functionality(struct i2c_adapter *adap)
> > >  {
> > > - return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
> > > + if (adap->quirks->flags & I2C_AQ_NO_ZERO_LEN)
> > > + return I2C_FUNC_I2C |
> > > + (I2C_FUNC_SMBUS_EMUL & ~I2C_FUNC_SMBUS_QUICK);
> > > + else
> > > + return I2C_FUNC_I2C | I2C_FUNC_SMBUS_EMUL;
> >
> > It can be removed?
> See previous discussion: https://patchwork.kernel.org/patch/10814391/#22484435
> but not all SoC's quirks has I2C_AQ_NO_ZERO_LEN.
OK, it looks good to me, thanks.
Reviewed-by: Qii Wang 




Re: [PATCH RT v2 2/3] sched: migrate_enable: Use sleeping_lock to indicate involuntary sleep

2019-08-23 Thread Joel Fernandes
On Fri, Aug 23, 2019 at 02:28:46PM -0500, Scott Wood wrote:
> On Fri, 2019-08-23 at 18:20 +0200, Sebastian Andrzej Siewior wrote:
> > On 2019-08-21 18:19:05 [-0500], Scott Wood wrote:
> > > Without this, rcu_note_context_switch() will complain if an RCU read
> > > lock is held when migrate_enable() calls stop_one_cpu().
> > > 
> > > Signed-off-by: Scott Wood 
> > > ---
> > > v2: Added comment.
> > > 
> > > If my migrate disable changes aren't taken, then pin_current_cpu()
> > > will also need to use sleeping_lock_inc() because calling
> > > __read_rt_lock() bypasses the usual place it's done.
> > > 
> > >  include/linux/sched.h| 4 ++--
> > >  kernel/rcu/tree_plugin.h | 2 +-
> > >  kernel/sched/core.c  | 8 
> > >  3 files changed, 11 insertions(+), 3 deletions(-)
> > > 
> > > --- a/kernel/sched/core.c
> > > +++ b/kernel/sched/core.c
> > > @@ -7405,7 +7405,15 @@ void migrate_enable(void)
> > >   unpin_current_cpu();
> > >   preempt_lazy_enable();
> > >   preempt_enable();
> > > +
> > > + /*
> > > +  * sleeping_lock_inc suppresses a debug check for
> > > +  * sleeping inside an RCU read side critical section
> > > +  */
> > > + sleeping_lock_inc();
> > >   stop_one_cpu(task_cpu(p), migration_cpu_stop, &arg);
> > > + sleeping_lock_dec();
> > 
> > this looks like an ugly hack. This sleeping_lock_inc() is used where we
> > actually hold a sleeping lock and schedule() which is okay. But this
> > would mean we hold an RCU lock and schedule() anyway. Is that okay?
> 
> Perhaps the name should be changed, but the concept is the same --
> RT-specific sleeping which should be considered involuntary for the purpose of
> debug checks.  Voluntary sleeping is not allowed in an RCU critical section
> because it will break the critical section on certain flavors of RCU, but
> that doesn't apply to the flavor used on RT.  Sleeping for a long time in an
> RCU critical section would also be a bad thing, but that also doesn't apply
> here.

I think the name should definitely be changed. At best, it is super confusing to
call it "sleeping_lock" for this scenario. In fact here, you are not even
blocking on a lock.

Maybe "sleeping_allowed" or some such.

thanks,

 - Joel



Re: [PATCH 5.2 000/135] 5.2.10-stable review

2019-08-23 Thread Greg KH
On Fri, Aug 23, 2019 at 12:41:03PM -0600, shuah wrote:
> On 8/22/19 11:05 AM, Sasha Levin wrote:
> > 
> > This is the start of the stable review cycle for the 5.2.10 release.
> > There are 135 patches in this series, all will be posted as a response
> > to this one.  If anyone has any issues with these being applied, please
> > let me know.
> > 
> > Responses should be made by Sat 24 Aug 2019 05:07:10 PM UTC.
> > Anything received after that time might be too late.
> > 
> > The whole patch series can be found in one patch at:
> >  
> > https://www.kernel.org/pub/linux/kernel/v4.x/stable-review/patch-5.2.10-rc1.gz
> 
> I am seeing "Sorry I can't find your kernels". Is this posted?

Ah, Sasha didn't generate the patch but it was still listed here, oops.
He copied my format and we didn't notice this, sorry about that.

As the thread shows, we didn't generate this file this time to see what
would happen.  If your test process requires it, we can generate it as I
don't want to break it.

thanks,

greg k-h


Re: linux-next: Tree for Aug 23

2019-08-23 Thread John Hubbard
On 8/23/19 2:26 AM, Stephen Rothwell wrote:
> Hi all,
> 
> Changes since 20190822:
> 
> The thermal tree gained a conflict against the jc_docs tree.
> 
> The rdma tree gained a conflict against the rdma-fixes tree.
> 
> The net-next tree gained conflicts against the pci tree.
> 
> The crypto tree gained a conflict against Linus' tree.
> 
> The drm tree gained a conflict against the drm-fixes tree.

Hi,

Even though I saw an email proposing fixes for one (maybe both) of the
following warnings, I'm still seeing them in this linux-next:

WARNING: "ahci_em_messages" [drivers/ata/libahci] is a static EXPORT_SYMBOL_GPL
WARNING: "ftrace_set_clr_event" [vmlinux] is a static EXPORT_SYMBOL_GPL

...and obviously these can be trivially fixed by:

diff --git a/drivers/ata/libahci.c b/drivers/ata/libahci.c
index e4c45d3cca79..bff369d9a1a7 100644
--- a/drivers/ata/libahci.c
+++ b/drivers/ata/libahci.c
@@ -175,7 +175,6 @@ struct ata_port_operations ahci_pmp_retry_srst_ops = {
 EXPORT_SYMBOL_GPL(ahci_pmp_retry_srst_ops);
 
 static bool ahci_em_messages __read_mostly = true;
-EXPORT_SYMBOL_GPL(ahci_em_messages);
 module_param(ahci_em_messages, bool, 0444);
 /* add other LED protocol types when they become supported */
 MODULE_PARM_DESC(ahci_em_messages,
diff --git a/kernel/trace/trace_events.c b/kernel/trace/trace_events.c
index c7506bc81b75..648930823b57 100644
--- a/kernel/trace/trace_events.c
+++ b/kernel/trace/trace_events.c
@@ -787,7 +787,7 @@ static int __ftrace_set_clr_event(struct trace_array *tr, const char *match,
return ret;
 }
 
-static int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
+int ftrace_set_clr_event(struct trace_array *tr, char *buf, int set)
 {
char *event = NULL, *sub = NULL, *match;
int ret;



...which I didn't create patches for, because I expect they are already
in flight. But if those somehow got lost or skipped, then here's an early
warning that these fixes still need to be applied.


thanks,
-- 
John Hubbard
NVIDIA


Re: [PATCH 5.2 000/135] 5.2.10-stable review

2019-08-23 Thread Greg KH
On Fri, Aug 23, 2019 at 09:18:05PM -0400, Sasha Levin wrote:
> On Fri, Aug 23, 2019 at 10:36:27AM -0700, Greg KH wrote:
> > On Fri, Aug 23, 2019 at 02:28:53AM -0400, Sasha Levin wrote:
> > > On Fri, Aug 23, 2019 at 02:42:48AM +0200, Stefan Lippers-Hollmann wrote:
> > > > Hi
> > > >
> > > > On 2019-08-22, Greg KH wrote:
> > > > > On Fri, Aug 23, 2019 at 12:05:27AM +0200, Stefan Lippers-Hollmann 
> > > > > wrote:
> > > > > > On 2019-08-22, Greg KH wrote:
> > > > > > > On Thu, Aug 22, 2019 at 01:05:56PM -0400, Sasha Levin wrote:
> > > > [...]
> > > > > > It might be down to kernel.org mirroring, but the patch file doesn't
> > > > > > seem to be available yet (404), both in the wrong location listed
> > > > > > above - and the expected one under
> > > > > >
> > > > > > 
> > > > > > https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.10-rc1.gz
> > > > [...]
> > > > > Ah, no, it's not a mirroring problem, Sasha and I didn't know if 
> > > > > anyone
> > > > > was actually using the patch files anymore, so it was simpler to do a
> > > > > release without them to see what happens. :)
> > > > >
> > > > > Do you rely on these, or can you use the -rc git tree or the quilt
> > > > > series?  If you do rely on them, we will work to fix this, it just
> > > > > involves some scripting that we didn't get done this morning.
> > > >
> > > > "Rely" is a strong word, I can adapt if they're going away, but
> > > > I've been using them so far, as in (slightly simplified):
> > > >
> > > > $ cd patches/upstream/
> > > > $ wget https://cdn.kernel.org/pub/linux/kernel/v5.x/patch-5.2.9.xz
> > > > $ xz -d patch-5.2.9.xz
> > > > $ wget 
> > > > https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.10-rc1.gz
> > > > $ gunzip patch-5.2.10-rc1.gz
> > > > $ vim ../series
> > > > $ quilt ...
> > > >
> > > > I can switch to importing the quilt queue with some sed magic (and I
> > > > already do that, if interesting or just a larger amounts of patches are
> > > > queuing up for more than a day or two), but using the -rc patches has
> > > > been convenient in that semi-manual workflow, also to make sure to 
> > > > really
> > > > get and test the formal -rc patch, rather than something inbetween.
> > > 
> > > An easy way to generate a patch is to just use the git.kernel.org web
> > > interface. A patch for 5.2.10-rc1 would be:
> > > https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git/patch/?id=linux-5.2.y&id2=v5.2.9
> > > 
> > > Personally this patch upload story sounded to me like a pre-git era
> > > artifact...
> > 
> > Given that we no longer do patches for Linus's -rc releases for the past
> > few years, maybe it is time to move to do the same for the stable
> > releases to be consistent.
> 
> Or tarballs? Why do we generate tarballs (and go through kup)?
> git.kernel.org already does it for us.

As I mentioned yesterday, but writing it down here for posterity,
there's a number of reasons.

First off, the release process doesn't require kup when a "real"
release happens; that's all now done on git.kernel.org with a process
involving a signed note and some other fun backend stuff.  We are
working on expanding that in the future with some other signature
validation as well, to make it easier to verify tarballs are "correct"
as what we do today is a bit different than other projects.

As for the tarball itself, it's still needed for the same reasons we do
so on Linus's releases:
- distros use these.  Don't want all Gentoo users hammering on
  git.kernel.org for their updated builds, that's a huge waste.
- mirroring works _so_ much better around the world for tarballs
- legal reasons.  git is not yet "recognized" as being something
  that properly is reflective of a specific point in time while
  as online tarballs that are mirrored and stored around the
  world are finally almost properly recognized for this.
- historical, people are used to using them, and workflows are
  built up around them.  People don't like rewriting scripts, as
  can be seen in my monstrosity of a mess that I use for
  releases :)

there are probably others, I know it came up when Linus stopped doing it
for the -rc releases and it was considered to do the same for the "real"
releases.

thanks,

greg k-h


Re: [PATCH] ARM: dts: vf610-zii-scu4-aib: Configure IRQ line for GPIO expander

2019-08-23 Thread Chris Healy
On Fri, Aug 23, 2019 at 5:27 PM Andrey Smirnov  wrote:
>
> Configure IRQ line for SX1503 GPIO expander. We already have
> appropriate pinmux entry and all that is missing is "interrupt-parent"
> and "interrupts" properties. Add them.
>
> Signed-off-by: Andrey Smirnov 
> Cc: Shawn Guo 
> Cc: Chris Healy 
> Cc: Cory Tusar 
> Cc: Fabio Estevam 
> Cc: linux-arm-ker...@lists.infradead.org
> Cc: linux-kernel@vger.kernel.org
> ---
>  arch/arm/boot/dts/vf610-zii-scu4-aib.dts | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/arch/arm/boot/dts/vf610-zii-scu4-aib.dts 
> b/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
> index e6c3621079e0..45a978defbdc 100644
> --- a/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
> +++ b/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
> @@ -570,6 +570,8 @@
> #gpio-cells = <2>;
> reg = <0x20>;
> gpio-controller;
> +   interrupt-parent = <&gpio1>;
> +   interrupts = <31 IRQ_TYPE_EDGE_FALLING>;
> };
>
> lm75@4e {
> --

Tested-by: Chris Healy 


Re: [PATCH v2] x86/mm/pti: in pti_clone_pgtable(), increase addr properly

2019-08-23 Thread Song Liu



> On Aug 21, 2019, at 3:30 AM, Peter Zijlstra  wrote:
> 
> On Wed, Aug 21, 2019 at 12:10:08PM +0200, Peter Zijlstra wrote:
>> On Tue, Aug 20, 2019 at 01:23:14PM -0700, Song Liu wrote:
> 
>>> host-5.2-after # grep "x  pmd" /sys/kernel/debug/page_tables/dump_pid
>>> 0x0060-0x00e0   8M USR ro PSE   
>>>   x  pmd
>>> 0x8100-0x81e0  14M ro PSE 
>>> GLB x  pmd
>>> 
>>> So after this patch, the 5.2 based kernel has 7 PMDs instead of 1 PMD
>>> in 4.16 kernel.
>> 
>> This basically gives rise to more questions than it provides answers.
>> You seem to have 'forgotten' to provide the equivalent mappings on the
>> two older kernels. The fact that they're not PMD is evident, but it
>> would be very good to know what is mapped, and what -- if anything --
>> lives in the holes we've (accidentally) created.
>> 
>> Can you please provide more complete mappings? Basically provide the
>> whole cpu_entry_area mapping.
> 
> I tried on my local machine and:
> 
>  cat /debug/page_tables/kernel | awk '/^---/ { p=0 } /CPU entry/ { p=1 } { if 
> (p) print $0 }' > ~/cea-{before,after}.txt
> 
> resulted in _identical_ files ?!?!
> 
> Can you share your before and after dumps?

I was really dumb on this. The actual issue is that a kprobe with
CONFIG_KPROBES_ON_FTRACE splits kernel text PMDs (0x8100-).

I will dig more into this. 

Sorry for being silent, somehow I didn't see this email until just now.

Song

Re: [PATCH V3 0/3] riscv: Add perf callchain support

2019-08-23 Thread Paul Walmsley
On Mon, 19 Aug 2019, Mao Han wrote:

> PS: I got some compile error while compiling glibc 2.30 with linux
> v5.3-rc4 header. vfork.S include linux/sched.h(./include/uapi/linux/sched.h)
> which has a struct clone_args inside, added by
> 7f192e3cd316ba58c88dfa26796cf77789dd9872.

Noticed that also.  Probably the sched.h uapi kernel header file needs an 
"#ifndef __ASSEMBLY__" around the struct clone_args...


- Paul


Re: [PATCH] PCI: Add missing link delays required by the PCIe spec

2019-08-23 Thread Bjorn Helgaas
Hi Mika,

I'm trying to figure out specifically why we need this and where it
should go.  Questions below.

On Wed, Aug 21, 2019 at 03:45:19PM +0300, Mika Westerberg wrote:
> Currently Linux does not follow the PCIe spec regarding the required delays
> after reset. A concrete example is a Thunderbolt add-in-card that
> consists of a PCIe switch and two PCIe endpoints:
> 
>   +-1b.0-[01-6b]00.0-[02-6b]--+-00.0-[03]00.0 TBT controller
>   +-01.0-[04-36]-- DS hotplug port
>   +-02.0-[37]00.0 xHCI controller
>   \-04.0-[38-6b]-- DS hotplug port
> 
> The root port (1b.0) and the PCIe switch downstream ports are all PCIe
> gen3 so they support 8GT/s link speeds.
> 
> We wait for the PCIe hierarchy to enter D3cold (runtime):
> 
>   pcieport :00:1b.0: power state changed by ACPI to D3cold
> 
> When it wakes up from D3cold, according to the PCIe 4.0 section 5.8 the
> PCIe switch is put to reset and its power is re-applied. This means that
> we must follow the rules in PCIe 4.0 section 6.6.1.
> 
> For the PCIe gen3 ports we are dealing with here, the following applies:
> 
>   With a Downstream Port that supports Link speeds greater than 5.0
>   GT/s, software must wait a minimum of 100 ms after Link training
>   completes before sending a Configuration Request to the device
>   immediately below that Port. Software can determine when Link training
>   completes by polling the Data Link Layer Link Active bit or by setting
>   up an associated interrupt (see Section 6.7.3.3).
> 
> Translating this into the above topology we would need to do this (DLLLA
> stands for Data Link Layer Link Active):
> 
>   pcieport :00:1b.0: wait for 100ms after DLLLA is set before access to 
> :01:00.0
>   pcieport :02:00.0: wait for 100ms after DLLLA is set before access to 
> :03:00.0
>   pcieport :02:02.0: wait for 100ms after DLLLA is set before access to 
> :37:00.0
> 
> I've instrumented the kernel with additional logging so we can see the
> actual delays the kernel performs:
> 
>   pcieport :00:1b.0: power state changed by ACPI to D0
>   pcieport :00:1b.0: waiting for D3cold delay of 100 ms
>   pcieport :00:1b.0: waking up bus
>   pcieport :00:1b.0: waiting for D3hot delay of 10 ms
>   pcieport :00:1b.0: restoring config space at offset 0x2c (was 0x60, 
> writing 0x60)
>   ...
>   pcieport :00:1b.0: PME# disabled
>   pcieport :01:00.0: restoring config space at offset 0x3c (was 0x1ff, 
> writing 0x201ff)
>   ...
>   pcieport :01:00.0: PME# disabled
>   pcieport :02:00.0: restoring config space at offset 0x3c (was 0x1ff, 
> writing 0x201ff)
>   ...
>   pcieport :02:00.0: PME# disabled
>   pcieport :02:01.0: restoring config space at offset 0x3c (was 0x1ff, 
> writing 0x201ff)
>   ...
>   pcieport :02:01.0: restoring config space at offset 0x4 (was 0x10, 
> writing 0x100407)
>   pcieport :02:01.0: PME# disabled
>   pcieport :02:02.0: restoring config space at offset 0x3c (was 0x1ff, 
> writing 0x201ff)
>   ...
>   pcieport :02:02.0: PME# disabled
>   pcieport :02:04.0: restoring config space at offset 0x3c (was 0x1ff, 
> writing 0x201ff)
>   ...
>   pcieport :02:04.0: PME# disabled
>   pcieport :02:01.0: PME# enabled
>   pcieport :02:01.0: waiting for D3hot delay of 10 ms
>   pcieport :02:04.0: PME# enabled
>   pcieport :02:04.0: waiting for D3hot delay of 10 ms
>   thunderbolt :03:00.0: restoring config space at offset 0x14 (was 0x0, 
> writing 0x8a04)
>   ...
>   thunderbolt :03:00.0: PME# disabled
>   xhci_hcd :37:00.0: restoring config space at offset 0x10 (was 0x0, 
> writing 0x73f0)
>   ...
>   xhci_hcd :37:00.0: PME# disabled
> 
> For the switch upstream port (01:00.0) we wait for 100ms but not taking
> into account the DLLLA requirement. We then wait 10ms for D3hot -> D0
> transition of the root port and the two downstream hotplug ports. This
> means that we deviate from what the spec requires.
> 
> Performing the same check for system sleep (s2idle) transitions we can
> see following when resuming from s2idle:
> 
>   pcieport :00:1b.0: power state changed by ACPI to D0
>   pcieport :00:1b.0: restoring config space at offset 0x2c (was 0x60, 
> writing 0x60)
>   ...
>   pcieport :01:00.0: restoring config space at offset 0x3c (was 0x1ff, 
> writing 0x201ff)
>   ...

I think the important thing in all the above logging is that it
doesn't show any delay, right?  If that's the case, you can just say
that in one line; I trust you even without 40 lines of config space
restore debug output :)

>   xhci_hcd :37:00.0: restoring config space at offset 0x10 (was 0x0, 
> writing 0x73f0)
>   ...
>   thunderbolt :03:00.0: restoring config space at offset 0x14 (was 0x0, 
> writing 0x8a04)
> 
> This is even worse. None of the mandatory delays are performed. If this
> would b
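
For reference, a minimal sketch of the wait that the spec text quoted above
calls for on a port faster than 5 GT/s (illustrative only, not the patch
under review; the helper name and timeout are arbitrary):

static int wait_for_link_then_delay(struct pci_dev *bridge)
{
        int timeout = 1000;     /* ms, arbitrary for this sketch */
        u16 lnksta;

        for (;;) {
                pcie_capability_read_word(bridge, PCI_EXP_LNKSTA, &lnksta);
                if (lnksta & PCI_EXP_LNKSTA_DLLLA)
                        break;          /* link training completed */
                if (timeout <= 0)
                        return -ETIMEDOUT;
                msleep(10);
                timeout -= 10;
        }
        msleep(100);    /* PCIe r4.0 sec 6.6.1: 100 ms before config requests */
        return 0;
}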

Re: [PATCH v2] x86/mm/pti: in pti_clone_pgtable(), increase addr properly

2019-08-23 Thread Song Liu



> On Aug 23, 2019, at 5:59 PM, Thomas Gleixner  wrote:
> 
> On Wed, 21 Aug 2019, Thomas Gleixner wrote:
>> On Wed, 21 Aug 2019, Song Liu wrote:
>>> On Aug 20, 2019, at 1:23 PM, Song Liu  wrote:
>>> 
>>> Before 32-bit support, pti_clone_pmds() always adds PMD_SIZE to addr.
>>> This behavior changes after the 32-bit support:  pti_clone_pgtable()
>>> increases addr by PUD_SIZE for pud_none(*pud) case, and increases addr by
>>> PMD_SIZE for pmd_none(*pmd) case. However, this is not accurate because
>>> addr may not be PUD_SIZE/PMD_SIZE aligned.
>>> 
>>> Fix this issue by properly rounding up addr to next PUD_SIZE/PMD_SIZE
>>> in these two cases.
>>> 
>>> After poking around more, I found the following doesn't really make 
>>> sense. 
>> 
>> I'm glad you figured that out yourself. Was about to write up something to
>> that effect.
>> 
>> Still interesting questions remain:
>> 
>>  1) How did you end up feeding an unaligned address into that which points
>> to a 0 PUD?
>> 
>>  2) Is this related to Facebook specific changes and unlikely to affect any
>> regular kernel? I can't come up with a way to trigger that in mainline
>> 
>>  3) As this is a user page table and the missing mapping is related to
>> mappings required by PTI, how is the machine going in/out of user
>> space in the first place? Or did I just trip over what you called
>> nonsense?
> 
> And just because this ended in silence, I looked at it myself after Peter
> told me that this was on a kernel with PTI disabled. Aside from that, my
> built-in distrust for debug war stories combined with fairy-tale changelogs
> triggered my curiosity anyway.

I am really sorry that I was silent. Somehow I didn't see this in my inbox
(or it didn't show up until just now?). 

For this patch, I really messed up this with something else. The issue we
are seeing is that a kprobe with CONFIG_KPROBES_ON_FTRACE splits the PMD
located at 0x81a0. I sent another patch last night, but that might not
be the right fix either. 

I haven't started testing our PTI enabled kernel, so I am not sure whether
there is really an issue with the PTI code. 

Thanks,
Song





Re: [PATCH 00/14] per memcg lru_lock

2019-08-23 Thread Hugh Dickins
On Wed, 21 Aug 2019, Alex Shi wrote:
> 在 2019/8/21 上午2:24, Hugh Dickins 写道:
> > I'll set aside what I'm doing, and switch to rebasing ours to v5.3-rc
> > and/or mmotm.  Then compare with what Alex has, to see if there's any
> > good reason to prefer one to the other: if no good reason to prefer ours,
> > I doubt we shall bother to repost, but just use it as basis for helping
> > to review or improve Alex's.
> 
> For your review, my patchset is pretty straight and simple.
> It just uses a per-lruvec lru_lock to replace the necessary pgdat lru_lock.
> Just this.  We could talk more after I am back to work. :)

Sorry to be the bearer of bad news, Alex, but when you said "straight and
simple", I feared that your patchset would turn out to be fundamentally
too simple.

And that is so. I have only to see the
lruvec = mem_cgroup_page_lruvec(page, pgdat);
line in isolate_migratepages_block() in mm/compaction.c, and check
that mem_cgroup_page_lruvec() is little changed in mm/mempolicy.c.

The central problem with per-memcg lru_lock is that you do not know
for sure what lock to take (which memcg a page belongs to) until you
have taken the expected lock, and then checked whether page->memcg
is still the same - backing out and trying again if not.
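
A minimal sketch of the resulting relock-and-recheck pattern (hypothetical
helper, assuming the per-lruvec lru_lock that such a patchset introduces):

static struct lruvec *lock_page_lruvec(struct page *page)
{
        struct lruvec *lruvec;

again:
        lruvec = mem_cgroup_page_lruvec(page, page_pgdat(page));
        spin_lock_irq(&lruvec->lru_lock);
        /* page->memcg may have changed before we took the lock */
        if (lruvec != mem_cgroup_page_lruvec(page, page_pgdat(page))) {
                spin_unlock_irq(&lruvec->lru_lock);
                goto again;
        }
        return lruvec;
}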

Fix that central problem, and you end up with a more complicated
patchset, much like ours.  It's true that when ours was first developed,
the memcg situation was more complicated in several ways, and perhaps
some aspects of our patchset could be simplified now (though I've not
identified any).  Johannes in particular has done a great deal of
simplifying work in memcg over the last few years, but there are still
situations in which a page's memcg can change (move_charge_at_immigrate
and swapin readahead spring to mind - or perhaps the latter is only an
issue when MEMCG_SWAP is not enabled, I forget; and I often wonder if
reparenting will be brought back one day).

I did not review your patchset in detail, and wasn't able to get very
far in testing it.  At first I was put off by set_task_reclaim_state
warnings from mm/vmscan.c, but those turned out to be in v5.3-rc5
itself, not from your patchset or mine (but I've not yet investigated
what's responsible for them).  Within a minute of starting swapping
load, kcompactd compact_lock_irqsave() in isolate_migratepages_block()
would deadlock, and I did not get further.  (Though I did also notice
that booting the CONFIG_MEMCG=y kernel with "cgroup_disable=memory"
froze in booting - tiresomely, one has to keep both the memcg and
no-memcg locking to cope with that case, and I guess you had not.)

Rather than duplicating effort, I would advise you to give our patchset
a try, and if it works for you, help towards getting that one merged:
but of course, it's up to you.

I've attached a tarfile of it rebased to v5.3-rc5: I do not want to
spam the list with patches yet, because I do not have any stats or
argument in support of the series, as Andrew asked for years ago and
Michal asks again now.  But aside from that I consider it ready, and
will let Shakeel take it over from here, while I get back to what I
diverted from (but of course I'll try to answer questions on it).

Thanks,
Hugh

lrulock503.tar
Description: per-memcg lru_lock on v5.3-rc5


Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER

2019-08-23 Thread Paul Walmsley
On Fri, 23 Aug 2019, David Abdurachmanov wrote:

> On Fri, Aug 23, 2019 at 5:30 PM Paul Walmsley  
> wrote:
> >
> > On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> >
> > > There is one failing kernel selftest: global.user_notification_signal
> >
> > Is this the only failing test?  Or are the rest of the selftests skipped
> > when this test fails, and no further tests are run, as seems to be shown
> > here:
> >
> >   
> > https://lore.kernel.org/linux-riscv/cadnnuqcmdmre1f+3jg8spr6jrrnbsy8vvd70vbkem0nqyeo...@mail.gmail.com/
> 
> Yes, it's a single test failing. After removing the
> global.user_notification_signal test, everything else passes and you get the
> results printed.

OK.

> Well the code states ".. and hope that it doesn't break when there
> is actually a signal :)". Maybe we are just unlucky. I don't have results
> from other architectures to compare.
> 
> I found that Linaro is running selftests, but SECCOMP is disabled
> and thus it's failing. Is there another CI which tracks selftests?

0day runs the kselftests, and at least on some architectures/Kconfigs, 
it's succeeding:

https://lore.kernel.org/lkml/20190726083740.GG22106@shao2-debian/

https://lore.kernel.org/lkml/20190712064850.GC20848@shao2-debian/

https://lore.kernel.org/lkml/20190311074115.GC10839@shao2-debian/

etc.


- Paul


Re: [PATCH 5.2 000/135] 5.2.10-stable review

2019-08-23 Thread Sasha Levin

On Fri, Aug 23, 2019 at 10:36:27AM -0700, Greg KH wrote:

On Fri, Aug 23, 2019 at 02:28:53AM -0400, Sasha Levin wrote:

On Fri, Aug 23, 2019 at 02:42:48AM +0200, Stefan Lippers-Hollmann wrote:
> Hi
>
> On 2019-08-22, Greg KH wrote:
> > On Fri, Aug 23, 2019 at 12:05:27AM +0200, Stefan Lippers-Hollmann wrote:
> > > On 2019-08-22, Greg KH wrote:
> > > > On Thu, Aug 22, 2019 at 01:05:56PM -0400, Sasha Levin wrote:
> [...]
> > > It might be down to kernel.org mirroring, but the patch file doesn't
> > > seem to be available yet (404), both in the wrong location listed
> > > above - and the expected one under
> > >
> > > https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.10-rc1.gz
> [...]
> > Ah, no, it's not a mirroring problem, Sasha and I didn't know if anyone
> > was actually using the patch files anymore, so it was simpler to do a
> > release without them to see what happens. :)
> >
> > Do you rely on these, or can you use the -rc git tree or the quilt
> > series?  If you do rely on them, we will work to fix this, it just
> > involves some scripting that we didn't get done this morning.
>
> "Rely" is a strong word, I can adapt if they're going away, but
> I've been using them so far, as in (slightly simplified):
>
> $ cd patches/upstream/
> $ wget https://cdn.kernel.org/pub/linux/kernel/v5.x/patch-5.2.9.xz
> $ xz -d patch-5.2.9.xz
> $ wget https://www.kernel.org/pub/linux/kernel/v5.x/stable-review/patch-5.2.10-rc1.gz
> $ gunzip patch-5.2.10-rc1.gz
> $ vim ../series
> $ quilt ...
>
> I can switch to importing the quilt queue with some sed magic (and I
> already do that, if interesting or just a larger amounts of patches are
> queuing up for more than a day or two), but using the -rc patches has
> been convenient in that semi-manual workflow, also to make sure to really
> get and test the formal -rc patch, rather than something inbetween.

An easy way to generate a patch is to just use the git.kernel.org web
interface. A patch for 5.2.10-rc1 would be:
https://git.kernel.org/pub/scm/linux/kernel/git/stable/linux-stable-rc.git/patch/?id=linux-5.2.y&id2=v5.2.9

Personally this patch upload story sounded to me like a pre-git era
artifact...


Given that we no longer do patches for Linus's -rc releases for the past
few years, maybe it is time to move to do the same for the stable
releases to be consistent.


Or tarballs? Why do we generate tarballs (and go through kup)?
git.kernel.org already does it for us.

--
Thanks,
Sasha


Re: [PATCH 11/15] sched,fair: flatten hierarchical runqueues

2019-08-23 Thread Rik van Riel
On Fri, 2019-08-23 at 20:14 +0200, Dietmar Eggemann wrote:
> 
> Looks like with the se->depth related code gone here in
> pick_next_task_fair() and the call to find_matching_se() in
> check_preempt_wakeup() you could remove se->depth entirely.
> 
> [...]

I've just done that in a separate patch in my series,
in case we need it again.  If we don't, diffstat tells
us we're getting this:

 include/linux/sched.h |  1 -
 kernel/sched/fair.c   | 50 ++----
 2 files changed, 2 insertions(+), 49 deletions(-)

Thank you!

-- 
All Rights Reversed.




Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER

2019-08-23 Thread David Abdurachmanov
On Fri, Aug 23, 2019 at 6:04 PM David Abdurachmanov
 wrote:
>
> On Fri, Aug 23, 2019 at 5:30 PM Paul Walmsley  
> wrote:
> >
> > On Thu, 22 Aug 2019, David Abdurachmanov wrote:
> >
> > > There is one failing kernel selftest: global.user_notification_signal
> >
> > Is this the only failing test?  Or are the rest of the selftests skipped
> > when this test fails, and no further tests are run, as seems to be shown
> > here:
> >
> >   
> > https://lore.kernel.org/linux-riscv/cadnnuqcmdmre1f+3jg8spr6jrrnbsy8vvd70vbkem0nqyeo...@mail.gmail.com/
>
> Yes, it's a single test failing. After removing the
> global.user_notification_signal test, everything else passes and you get the
> results printed.
>
> >
> > For example, looking at the source, I'd naively expect to see the
> > user_notification_closed_listener test result -- which follows right
> > after the failing test in the selftest source.  But there aren't any
> > results?
>
> Yes, it hangs at this point. You have to manually terminate it.
>
> >
> > Also - could you follow up with the author of this failing test to see if
> > we can get some more clarity about what might be going wrong here?  It
> > appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp:
> > add a return code to trap to userspace") by Tycho Andersen
> > .
>
> Well the code states ".. and hope that it doesn't break when there
> is actually a signal :)". Maybe we are just unlucky. I don't have results
> from other architectures to compare.
>
> I found that Linaro is running selftests, but SECCOMP is disabled
> and thus it's failing. Is there another CI which tracks selftests?
>
> https://qa-reports.linaro.org/lkft/linux-next-oe/tests/kselftest/seccomp_seccomp_bpf?top=next-20190823

Actually it seems that seccomp is enabled in the kernel, but not in
systemd, and somehow seccomp_bpf is missing on all arches, thus
causing an automatic failure.

> >
> >
> > - Paul


Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER

2019-08-23 Thread David Abdurachmanov
On Fri, Aug 23, 2019 at 5:30 PM Paul Walmsley  wrote:
>
> On Thu, 22 Aug 2019, David Abdurachmanov wrote:
>
> > There is one failing kernel selftest: global.user_notification_signal
>
> Is this the only failing test?  Or are the rest of the selftests skipped
> when this test fails, and no further tests are run, as seems to be shown
> here:
>
>   
> https://lore.kernel.org/linux-riscv/cadnnuqcmdmre1f+3jg8spr6jrrnbsy8vvd70vbkem0nqyeo...@mail.gmail.com/

Yes, it's a single test failing. After removing the
global.user_notification_signal test, everything else passes and you get the
results printed.

>
> For example, looking at the source, I'd naively expect to see the
> user_notification_closed_listener test result -- which follows right
> after the failing test in the selftest source.  But there aren't any
> results?

Yes, it hangs at this point. You have to manually terminate it.

>
> Also - could you follow up with the author of this failing test to see if
> we can get some more clarity about what might be going wrong here?  It
> appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp:
> add a return code to trap to userspace") by Tycho Andersen
> .

Well the code states ".. and hope that it doesn't break when there
is actually a signal :)". Maybe we are just unlucky. I don't have results
from other architectures to compare.

I found that Linaro is running selftests, but SECCOMP is disabled
and thus it's failing. Is there another CI which tracks selftests?

https://qa-reports.linaro.org/lkft/linux-next-oe/tests/kselftest/seccomp_seccomp_bpf?top=next-20190823

>
>
> - Paul


Re: WARNINGs in set_task_reclaim_state with memory cgroup and full memory usage

2019-08-23 Thread Yang Shi




On 8/23/19 3:00 PM, Adric Blake wrote:

Synopsis:
A WARN_ON_ONCE is hit twice in set_task_reclaim_state under the
following conditions:
- a memory cgroup has been created and a task assigned to it
- memory.limit_in_bytes has been set
- memory has filled up, likely from cache

In my usage, I create a cgroup under the current session scope and
assign a task to it. I then set memory.limit_in_bytes and
memory.soft_limit_in_bytes for the cgroup to reasonable values, say
1G/512M. The program accesses large files frequently and gradually
fills memory with the page cache. The warnings appears when the
entirety of the system memory is filled, presumably from other
programs.

If I wait until the program has filled the entirety of system memory
with cache and then assign a memory limit, the warnings appear
immediately.


It looks like the warning is triggered because kswapd sets reclaim_state and
then the memcg soft limit reclaim, running in the same kswapd context, sets
it again.


But kswapd and the memcg soft limit reclaim use different reclaim_state
instances from different scan controls. That doesn't sound correct; they
should use the same reclaim_state if they come from the same context, if my
understanding is right.
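
For context, the check being tripped looks roughly like this (paraphrased
from mm/vmscan.c around v5.3-rc):

static void set_task_reclaim_state(struct task_struct *task,
                                   struct reclaim_state *rs)
{
        /* Check for an overwrite */
        WARN_ON_ONCE(rs && task->reclaim_state);

        /* Check for the nulling of an already-nulled state */
        WARN_ON_ONCE(!rs && !task->reclaim_state);

        task->reclaim_state = rs;
}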




I am building the linux git. I first noticed this issue with the
drm-tip 5.3rc3 and 5.3rc4 kernels, and tested linux master after
5.3rc5 to confirm the bug more resoundingly.

Here are the warnings.

[38491.963105] WARNING: CPU: 7 PID: 175 at mm/vmscan.c:245
set_task_reclaim_state+0x1e/0x40
[38491.963106] Modules linked in: iwlmvm mac80211 libarc4 iwlwifi
cfg80211 xt_comment nls_iso8859_1 nls_cp437 vfat fat xfs jfs btrfs xor
raid6_pq libcrc32c ccm tun rfcomm fuse xt_tcpudp ip6t_REJECT
nf_reject_ipv6 ipt_REJECT nf_reject_ipv4 xt_multiport xt_owner
snd_hda_codec_hdmi ip6table_filter ip6_tables iptable_filter bnep ext4
crc32c_generic mbcache jbd2 snd_hda_codec_realtek
snd_hda_codec_generic snd_soc_skl snd_soc_hdac_hda snd_hda_ext_core
snd_soc_skl_ipc x86_pkg_temp_thermal intel_powerclamp snd_soc_sst_ipc
coretemp snd_soc_sst_dsp snd_soc_acpi_intel_match kvm_intel
snd_soc_acpi i915 snd_soc_core kvm snd_compress ac97_bus
snd_pcm_dmaengine snd_hda_intel i2c_algo_bit btusb irqbypass
drm_kms_helper btrtl snd_hda_codec dell_laptop btbcm crct10dif_pclmul
snd_hda_core crc32c_intel btintel iTCO_wdt ghash_clmulni_intel drm
ledtrig_audio aesni_intel iTCO_vendor_support snd_hwdep dell_wmi
rtsx_usb_ms r8169 dell_smbios aes_x86_64 mei_hdcp crypto_simd
intel_gtt bluetooth snd_pcm cryptd dcdbas
[38491.963155]  wmi_bmof dell_wmi_descriptor intel_rapl_msr
glue_helper snd_timer joydev intel_cstate snd realtek memstick
dell_smm_hwmon mousedev psmouse input_leds libphy intel_uncore
ecdh_generic ecc crc16 rfkill intel_rapl_perf soundcore i2c_i801
agpgart mei_me tpm_crb syscopyarea sysfillrect sysimgblt mei
intel_xhci_usb_role_switch fb_sys_fops idma64 tpm_tis roles
processor_thermal_device intel_rapl_common i2c_hid tpm_tis_core
int3403_thermal intel_soc_dts_iosf battery wmi intel_lpss_pci
intel_lpss intel_pch_thermal tpm int3400_thermal int3402_thermal
acpi_thermal_rel int340x_thermal_zone rng_core intel_hid ac
sparse_keymap evdev mac_hid crypto_user ip_tables x_tables
hid_multitouch rtsx_usb_sdmmc mmc_core rtsx_usb hid_logitech_hidpp
sr_mod cdrom sd_mod uas usb_storage hid_logitech_dj hid_generic usbhid
hid ahci serio_raw libahci atkbd libps2 libata xhci_pci scsi_mod
xhci_hcd crc32_pclmul i8042 serio f2fs [last unloaded: cfg80211]
[38491.963221] CPU: 7 PID: 175 Comm: kswapd0 Not tainted
5.3.0-rc5+149+gbb7ba8069de9 #1
[38491.963222] Hardware name: Dell Inc. Inspiron 5570/09YTN7, BIOS
1.2.3 05/15/2019
[38491.963226] RIP: 0010:set_task_reclaim_state+0x1e/0x40
[38491.963228] Code: 78 a9 e7 ff 0f 1f 84 00 00 00 00 00 0f 1f 44 00
00 55 48 89 f5 53 48 89 fb 48 85 ed 48 8b 83 08 08 00 00 74 11 48 85
c0 74 02 <0f> 0b 48 89 ab 08 08 00 00 5b 5d c3 48 85 c0 75 f1 0f 0b 48
89 ab
[38491.963229] RSP: 0018:8c898031fc60 EFLAGS: 00010286
[38491.963230] RAX: 8c898031fe28 RBX: 892aa04ddc40 RCX: 
[38491.963231] RDX: 8c898031fc60 RSI: 8c898031fcd0 RDI: 892aa04ddc40
[38491.963233] RBP: 8c898031fcd0 R08: 8c898031fd48 R09: 89279674b800
[38491.963234] R10:  R11:  R12: 8c898031fd48
[38491.963235] R13: 892a842ef000 R14: 892aaf7fc000 R15: 
[38491.963236] FS:  () GS:892aa33c()
knlGS:
[38491.963238] CS:  0010 DS:  ES:  CR0: 80050033
[38491.963239] CR2: 7f90628fa000 CR3: 00027ee0a002 CR4: 003606e0
[38491.963239] Call Trace:
[38491.963246]  mem_cgroup_shrink_node+0x9b/0x1d0
[38491.963250]  mem_cgroup_soft_limit_reclaim+0x10c/0x3a0
[38491.963254]  balance_pgdat+0x276/0x540
[38491.963258]  kswapd+0x200/0x3f0
[38491.963261]  ? wait_woken+0x80/0x80
[38491.963265]  kthread+0xfd/0x130
[38491.963267]  ? balance_pgdat+0x540/0x540
[38491.963269]  ? kthread_park+0x80/0x80
[38491.963273]  ret_from_fork+0x35/0x40
[3849

Re: [PATCH v2] x86/mm/pti: in pti_clone_pgtable(), increase addr properly

2019-08-23 Thread Thomas Gleixner
On Wed, 21 Aug 2019, Thomas Gleixner wrote:
> On Wed, 21 Aug 2019, Song Liu wrote:
> > > On Aug 20, 2019, at 1:23 PM, Song Liu  wrote:
> > > 
> > > Before 32-bit support, pti_clone_pmds() always adds PMD_SIZE to addr.
> > > This behavior changes after the 32-bit support:  pti_clone_pgtable()
> > > increases addr by PUD_SIZE for pud_none(*pud) case, and increases addr by
> > > PMD_SIZE for pmd_none(*pmd) case. However, this is not accurate because
> > > addr may not be PUD_SIZE/PMD_SIZE aligned.
> > > 
> > > Fix this issue by properly rounding up addr to next PUD_SIZE/PMD_SIZE
> > > in these two cases.
> > 
> > After poking around more, I found the following doesn't really make 
> > sense. 
> 
> I'm glad you figured that out yourself. Was about to write up something to
> that effect.
> 
> Still interesting questions remain:
> 
>   1) How did you end up feeding an unaligned address into that which points
>  to a 0 PUD?
> 
>   2) Is this related to Facebook specific changes and unlikely to affect any
>  regular kernel? I can't come up with a way to trigger that in mainline
> 
>   3) As this is a user page table and the missing mapping is related to
>  mappings required by PTI, how is the machine going in/out of user
>  space in the first place? Or did I just trip over what you called
>  nonsense?

And just because this ended in silence, I looked at it myself after Peter
told me that this was on a kernel with PTI disabled. Aside from that, my
built-in distrust of debug war stories combined with fairy-tale changelogs
triggered my curiosity anyway.

So that cannot be a kernel with PTI disabled compile time because in that
case the functions are not available, unless it's FB hackery which I do not
care about.

So the only way this can be reached is when PTI is configured but disabled
at boot time via pti=off or nopti.

For some silly reason - which goes back to before the 32bit support, and
Joerg did not notice it either when he introduced pti_finalize() - this
results in the following functions being called unconditionally:

 pti_clone_entry_text()
 pti_clone_kernel_text()

pti_clone_kernel_text() was called unconditionally before the 32bit support
already and the only reason why it did not have any effect in that
situation is that it invokes pti_kernel_image_global_ok() and that returns
false when PTI is disabled on the kernel command line. Oh well. It
obviously never checked whether X86_FEATURE_PTI was disabled or enabled in
the first place.

Now 32bit moved that around into pti_finalize() and added the call to
pti_clone_entry_text() which just runs unconditionally.

Now there is still the interesting question why this matters. The to be
cloned page table entries are mapped and the start address even if
unaligned never points to something unmapped. The unmapped case only covers
holes and holes are obviously aligned at the upper levels even if the
address of the hole is unaligned.

So colour me still confused what's wrong there but the proper fix is the
trivial:

--- a/arch/x86/mm/pti.c
+++ b/arch/x86/mm/pti.c
@@ -666,6 +666,8 @@ void __init pti_init(void)
  */
 void pti_finalize(void)
 {
+   if (!boot_cpu_has(X86_FEATURE_PTI))
+   return;
/*
 * We need to clone everything (again) that maps parts of the
 * kernel image.

Hmm?

I'm going to look whether that makes any difference in the page tables
tomorrow with brain awake, but I wanted to share this before the .us
vanishes into the weekend :)

Thanks,

tglx



Re: [PATCH V5 1/3] riscv: Add perf callchain support

2019-08-23 Thread Guo Ren
Please check CONFIG_FRAME_POINTER.

*frame = *((struct stackframe *)frame->fp - 1);

This code originally comes from riscv/kernel/stacktrace.c: walk_stackframe.

In the top-level Makefile, CONFIG_FRAME_POINTER adds the gcc options that
guarantee ra & the previous fp are stored relative to the frame pointer:

ifdef CONFIG_FRAME_POINTER
KBUILD_CFLAGS += -fno-omit-frame-pointer -fno-optimize-sibling-calls
endif

So --call-graph fp needs to depend on CONFIG_FRAME_POINTER.
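
A minimal sketch of the frame walk being discussed (the struct layout
matches arch/riscv/kernel/stacktrace.c; the walk_frames() wrapper, its
loop and the alignment check are simplified assumptions, not the actual
kernel code):

struct stackframe {
	unsigned long fp;	/* caller's frame pointer */
	unsigned long ra;	/* return address into the caller */
};

static void walk_frames(unsigned long fp, void (*emit)(unsigned long ra))
{
	/*
	 * With -fno-omit-frame-pointer, gcc saves the {fp, ra} pair
	 * immediately below the address the frame pointer points at.
	 */
	while (fp && !(fp & 0x7)) {	/* assumed alignment sanity check */
		struct stackframe frame = *((struct stackframe *)fp - 1);

		emit(frame.ra);		/* record the caller's address */
		fp = frame.fp;		/* hop to the next frame */
	}
}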

On Fri, Aug 23, 2019 at 4:56 PM Greentime Hu  wrote:
>
> Hi Mao,
>
> Mao Han  wrote on Fri, Aug 23, 2019 at 2:16 PM:
>
> >
> > This patch add support for perf callchain sampling on riscv platform.
> > The return address of leaf function is retrieved from pt_regs as
> > it is not saved in the outmost frame.
> >
> > Signed-off-by: Mao Han 
> > Cc: Paul Walmsley 
> > Cc: Greentime Hu 
> > Cc: Palmer Dabbelt 
> > Cc: linux-riscv 
> > Cc: Christoph Hellwig 
> > Cc: Guo Ren 
> > ---
> >  arch/riscv/Makefile|   3 +
> >  arch/riscv/kernel/Makefile |   3 +-
> >  arch/riscv/kernel/perf_callchain.c | 115 
> > +
> >  3 files changed, 120 insertions(+), 1 deletion(-)
> >  create mode 100644 arch/riscv/kernel/perf_callchain.c
>
> I just tested "./perf record -e cpu-clock --call-graph fp ls" on the
> Unleashed board and I got this failure.
> I took a look at it. It seems to fail here. Do you have any idea?
> It seems fine in QEMU.
>
> *frame = *((struct stackframe *)frame->fp - 1);
> ffe0001a198c: 00863a83 ld s5,8(a2)
> ffe0001a1990: ff093903 ld s2,-16(s2)
>
> ./perf record -e cpu-clock --call-graph fp ls
> [ 9619.423884] hrtimer: interrupt took 733000 ns
> [ 9619.977017] Unable to handle kernel paging request at virtual
> address ff94
> [ 9620.214736] Oops [#1]
> [ 9620.289893] Modules linked in:
> [ 9620.391378] CPU: 0 PID: 264 Comm: ls Not tainted
> 5.3.0-rc5-3-gb008f6bcd67c #4
> [ 9620.640176] sepc: ffe0001a198c ra : ffe0001a199a sp :
> ffe93720
> [ 9620.880366] gp : ffe00097dad8 tp : ffe82e40 t0 : 
> 00046000
> [ 9621.120564] t1 : 0002 t2 : 0007 s0 : 
> ffe93760
> [ 9621.360768] s1 : ffe93788 a0 : 0003 a1 : 
> 
> [ 9621.600991] a2 : ff8c a3 : 1fa0 a4 : 
> 0010
> [ 9621.841181] a5 : 0002 a6 : 0001 a7 : 
> ffe079b34e10
> [ 9622.081400] s2 : ff9c s3 : ffe0 s4 : 
> 1ff8
> [ 9622.321618] s5 : ffe93da0 s6 : ffe00097d540 s7 : 
> ffe07a1517a0
> [ 9622.561811] s8 : 08bf01c7ff60 s9 : ffe000235b2a s10: 
> 00020120
> [ 9622.802015] s11: 0001 t3 : ffe079b34e00 t4 : 
> 0001
> [ 9623.042194] t5 : 0008 t6 : ffe0009208d0
> [ 9623.218785] sstatus: 00020100 sbadaddr: ff94
> scause: 000d
> [ 9623.490850] ---[ end trace 49043f28e856d84d ]---
> [ 9623.644217] Kernel panic - not syncing: Fatal exception in interrupt
> [ 9623.855470] SMP: stopping secondary CPUs
> [ 9623.985955] ---[ end Kernel panic - not syncing: Fatal exception in
> interrupt ]---



-- 
Best Regards
 Guo Ren

ML: https://lore.kernel.org/linux-csky/


Re: [PATCH v5 12/13] pwm: mediatek: remove a property "has-clock"

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:42PM +0800, Sam Shih wrote:
> Due to we added clock-frequency property to fix
> mt7628 pwm during configure from userspace.
> We can alos use this property to determine whether
> the complex clock tree exists in the SoC or not.
> So we can safety remove has-clock property in the
> driver specific data.

Some suggestions in short form:

s/Due/Since/
s/alos/also/

Also please use more horizontal space, up to 76 chars per line is fine.

Other than that, I suggest first addressing the feedback for the earlier
patches, as the changes needed there have an influence on this patch.

Best regards
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 11/13] dt-bindings: pwm: update bindings for MT7629 SoC

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:41PM +0800, Sam Shih wrote:
> From: Ryder Lee 
> 
> This updates bindings for MT7629 pwm controller.
> 
> This patch is the same as
> https://patchwork.kernel.org/patch/10769381/
> and it has a Reviewed-by tag in v1

This paragraph doesn't belong in the commit log.
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 
> Reviewed-by: Matthias Brugger 

Thanks
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 10/13] arm: dts: mt7623: add a property "num-pwms" for PWM

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:40PM +0800, Sam Shih wrote:
> From: Ryder Lee 
> 
> This adds a property "num-pwms" for PWM controller.
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 
> ---
>  arch/arm/boot/dts/mt7623.dtsi | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/arm/boot/dts/mt7623.dtsi b/arch/arm/boot/dts/mt7623.dtsi
> index a79f0b6c3429..208e0d19a575 100644
> --- a/arch/arm/boot/dts/mt7623.dtsi
> +++ b/arch/arm/boot/dts/mt7623.dtsi
> @@ -452,6 +452,7 @@
><&pericfg CLK_PERI_PWM5>;
>   clock-names = "top", "main", "pwm1", "pwm2",
> "pwm3", "pwm4", "pwm5";
> + num-pwms = <5>;
>   status = "disabled";
>   };

FTR: The matching change to the binding is patch 7 in this series and
didn't get an Ack from the dt people yet.

Best regards
Uwe


-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 09/13] arm64: dts: mt7622: add a property "num-pwms" for PWM

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:39PM +0800, Sam Shih wrote:
> From: Ryder Lee 
> 
> This adds a property "num-pwms" for PWM controller.
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 
> ---
>  arch/arm64/boot/dts/mediatek/mt7622.dtsi | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/arm64/boot/dts/mediatek/mt7622.dtsi 
> b/arch/arm64/boot/dts/mediatek/mt7622.dtsi
> index d1e13d340e26..9a043938881f 100644
> --- a/arch/arm64/boot/dts/mediatek/mt7622.dtsi
> +++ b/arch/arm64/boot/dts/mediatek/mt7622.dtsi
> @@ -439,6 +439,7 @@
><&pericfg CLK_PERI_PWM6_PD>;
>   clock-names = "top", "main", "pwm1", "pwm2", "pwm3", "pwm4",
> "pwm5", "pwm6";
> + num-pwms = <6>;
>   status = "disabled";
>   };

FTR: The matching change to the binding is patch 7 in this series and
didn't get an Ack from the dt people yet.

Best regards
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 08/13] dt-bindings: pwm: update bindings for MT7628 SoC

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 04:12:20PM +0800, Yingjoe Chen wrote:
> On Thu, 2019-08-22 at 14:58 +0800, Sam Shih wrote:
> > This updates bindings for MT7628 pwm controller.
> > 
> > Signed-off-by: Sam Shih 
> > ---
> >  Documentation/devicetree/bindings/pwm/pwm-mediatek.txt | 4 
> >  1 file changed, 4 insertions(+)
> > 
> > diff --git a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt 
> > b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
> > index ea95b490a913..93980e3da261 100644
> > --- a/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
> > +++ b/Documentation/devicetree/bindings/pwm/pwm-mediatek.txt
> > @@ -21,6 +21,10 @@ Required properties:
> > See pinctrl/pinctrl-bindings.txt for details of the property values.
> >   - num-pwms: the number of PWM channels.
> > +
> > + Optional properties:
> > + - clock-frequency: fix clock frequency, this is only used in MT7628 SoC
> > +for period calculation. This SoC has no complex clock 
> > tree.
> 
> I'm sorry if this has been discussed before. 
> 
> Would it be simpler if you just provided a fixed-clock as the clock in the
> device tree? This way you wouldn't need this optional property and wouldn't
> need to handle it specially in the driver code.
> 
> After all, the hw is still connected to a simple clock tree.

This is what I just wrote in reply to patch 3 (which implements handling
the clock-frequency property) even before reading your feedback. So I
fully agree.

Best regards
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 06/13] pwm: mediatek: update license and switch to SPDX tag

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:36PM +0800, Sam Shih wrote:
> Add SPDX identifiers to pwm-mediatek.c
> Update license to GNU General Public License v2.0
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 

Reviewed-by: Uwe Kleine-König 

Thanks
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 05/13] pwm: mediatek: use pwm_mediatek as common prefix

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:35PM +0800, Sam Shih wrote:
> Use pwm_mediatek as common prefix to match the filename.
> No functional change intended.
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 
> ---
> Changes since v5:
> - Follow reviewers's comments
> The license stuff is a separate change

this is a nice cleanup, I like it.

Acked-by: Uwe Kleine-König 

Thanks
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 04/13] pwm: mediatek: allocate the clks array dynamically

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:34PM +0800, Sam Shih wrote:
> Instead of using fixed-size arrays, allocate the memory for them
> based on the information we get from the DT.
> 
> Also remove the check for num_pwms, since with dynamic allocation it
> can no longer cause an out-of-bounds array index.
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 

Reviewed-by: Uwe Kleine-König 

Thanks
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 03/13] pwm: mediatek: add a property "clock-frequency"

2019-08-23 Thread Uwe Kleine-König
Hello Sam,

On Thu, Aug 22, 2019 at 02:58:33PM +0800, Sam Shih wrote:
> This fixes mt7628 pwm configuration from userspace. The SoC
> is legacy MIPS and has no complex clock tree. This patch adds the property
> clock-frequency to the SoC-specific data, and legacy MIPS SoCs need to
> configure it in DT. This property is used for the period calculation.
> 
> We will improve this fix by dropping the has-clks attribute and using
> clock-frequency to do the same thing in a follow-up patch.
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 

I wonder if instead the platform could provide some dummy and optional
clocks.

You could add a fixed-clock for the clk that is used to determine the
clock rate and switch to devm_clk_get_optional for the others.
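
A sketch of what that could look like on the driver side (a minimal
illustration only: pwm_mediatek_chip, the clock names and
pwm_mediatek_get_clks() are assumed names, not the actual pwm-mediatek
code):

#include <linux/clk.h>
#include <linux/err.h>
#include <linux/platform_device.h>

struct pwm_mediatek_chip {		/* illustrative only */
	struct clk *clk_main;
	struct clk *clk_top;
	unsigned long clk_rate;
};

static int pwm_mediatek_get_clks(struct platform_device *pdev,
				 struct pwm_mediatek_chip *pc)
{
	/* the one clock every SoC provides - on MT7628 this could be
	 * a plain fixed-clock node in the DT */
	pc->clk_main = devm_clk_get(&pdev->dev, "main");
	if (IS_ERR(pc->clk_main))
		return PTR_ERR(pc->clk_main);

	/* returns a NULL (no-op) clk instead of an error when the DT
	 * simply omits the clock */
	pc->clk_top = devm_clk_get_optional(&pdev->dev, "top");
	if (IS_ERR(pc->clk_top))
		return PTR_ERR(pc->clk_top);

	/* the period calculation can then always use the main clock */
	pc->clk_rate = clk_get_rate(pc->clk_main);
	return 0;
}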

Best regards
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


[CFP] Real-Time Summit 2019 Call for Presentations

2019-08-23 Thread Daniel Bristot de Oliveira
The Real-Time Summit is organized by the Linux Foundation Real-Time Linux (RTL)
collaborative project. The event is intended to gather developers and users of
Linux as a Real-Time Operating System. The main intent is to provide room for
discussion between developers, tooling experts, and users.

The summit will take place alongside the Open Source Summit + Embedded Linux
Conference Europe 2019 in Lyon, France. The summit is planned the day after the
main conference, Thursday, October 31st, 2019, from 8:00 to 17:00 at the
conference venue. If you are already considering your travel arrangements for
the Open Source Summit + Embedded Linux Conference Europe 2019 in Lyon, France,
and you have a general interest in this topic, please extend your travel by one
day to be in Lyon on Thursday, 31st.

If you are interested in presenting, please submit a proposal [1] before
September 14th, 2019, at 23:59 EST. Please provide a title, an abstract
describing the proposed talk (900 characters maximum), a short biography (900
characters maximum), and a description of the targeted audience (900 characters
maximum). Please indicate the slot length you are aiming for: the format is a
single track with presentation slots 30, 45, or 60 minutes long. Note that the
presentation should use at most half of the slot time, leaving the rest of the
slot reserved for discussion. The focus of this event is the discussion.

We are welcoming presentations from both end-users and developers, on topics
covering, but not limited to:

 - Real-time Linux development
 - Real-time Linux evaluation
 - Real-time Linux use cases (Success and failures)
 - Real-time Linux tooling (tracing, configuration, …)
 - Real-time Linux academic work, already presented or under development, for
   direct feedback from the practitioners' community.

Those can cover recently available technologies, ongoing work, and new ideas.

Important Notes for Speakers:

 - All speakers are required to adhere to the Linux Foundation events’ Code of
   Conduct. We also highly recommend that speakers take the Linux Foundation
   online Inclusive Speaker Orientation Course.

 - Avoid sales or marketing pitches and discussing unlicensed or potentially
   closed-source technologies when preparing your proposal; these talks are
   almost always rejected due to the fact that they take away from the integrity
   of our events, and are rarely well-received by conference attendees.

 - All accepted speakers are required to submit their slides prior to the
event.

Submission must be received by 11:59 pm PST on September 14th, 2019

[1] Submission page: https://forms.gle/yQeqyrtJYezM5VRJA

Important Dates:

  - CFP Close: Saturday, September 14th, 2019, 11:59PM PST
  - Speaker notification: September 21st, 2019
  - Conference: Thursday, October 31st, 2019

Questions on submitting a proposal?
Email Daniel Bristot de Oliveira 

Thanks!
-- Daniel


Re: [PATCH v2] riscv: add support for SECCOMP and SECCOMP_FILTER

2019-08-23 Thread Paul Walmsley
On Thu, 22 Aug 2019, David Abdurachmanov wrote:

> There is one failing kernel selftest: global.user_notification_signal

Is this the only failing test?  Or are the rest of the selftests skipped 
when this test fails, and no further tests are run, as seems to be shown 
here:

  
https://lore.kernel.org/linux-riscv/cadnnuqcmdmre1f+3jg8spr6jrrnbsy8vvd70vbkem0nqyeo...@mail.gmail.com/

For example, looking at the source, I'd naively expect to see the 
user_notification_closed_listener test result -- which follows right 
after the failing test in the selftest source.  But there aren't any 
results?

Also - could you follow up with the author of this failing test to see if 
we can get some more clarity about what might be going wrong here?  It 
appears that the failing test was added in commit 6a21cc50f0c7f ("seccomp: 
add a return code to trap to userspace") by Tycho Andersen 
.


- Paul


Re: [PATCH 0/2] coresight: Add barrier packet when moving offset forward

2019-08-23 Thread Yabin Cui
Thanks for fixing this problem. I didn't notice it because I usually use a
buffer size >= the default ETR buffer size, which makes the problem harder to
reproduce.
The patches LGTM; maybe you also want to fix the problem Leo Yan commented on.
I tested the patches by recording etm data with a buffer size smaller than the
default ETR buffer size. I then saw barrier packets when decoding with OpenCSD,
and I could decode successfully without error messages.


Re: [PATCH v5 01/13] pwm: mediatek: add a property "num-pwms"

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:31PM +0800, Sam Shih wrote:
> From: Ryder Lee 
> 
> This adds a property "num-pwms" to avoid having an endless
> list of compatibles with no differences for the same driver.
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 

Reviewed-by: Uwe Kleine-König 

Thanks
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


Re: [PATCH v5 02/13] pwm: mediatek: droping the check for of_device_get_match_data

2019-08-23 Thread Uwe Kleine-König
On Thu, Aug 22, 2019 at 02:58:32PM +0800, Sam Shih wrote:
> This patch drops the check for of_device_get_match_data.
> Since the only way the driver can be probed is via a compatible
> match, the .data pointer, which points to the SoC-specific data, is
> always set by the driver and should not be NULL in our case.
> We can safely remove the check for of_device_get_match_data.
> 
> Signed-off-by: Ryder Lee 
> Signed-off-by: Sam Shih 

Acked-by: Uwe Kleine-König 

Thanks
Uwe

-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


[PATCH 1/2] media: imx: Move capture device init to registered

2019-08-23 Thread Steve Longerbeam
If the CSI is unregistered and then registered again without the
driver being removed and re-probed (which will happen when the media
device is removed and re-probed without also removing/re-probing the
CSI), the result is the kobject error and backtrace "tried to init an
initialized object". This is because the video device is left in an
initialized state after being unregistered, thus the video device's
underlying kobject is also left in an initialized state when the device
is registered again.

Fix this by moving imx_media_capture_device_init() and _remove()
into csi_registered() and csi_unregistered(). This will create a new
un-initialized video device when the CSI is re-registered. Do this for
all the subdevices that register a capture device.

Reported-by: Russell King 
Signed-off-by: Steve Longerbeam 
---
 drivers/staging/media/imx/imx-ic-prpencvf.c | 24 -
 drivers/staging/media/imx/imx-media-csi.c   | 20 ++---
 drivers/staging/media/imx/imx7-media-csi.c  | 22 +--
 3 files changed, 37 insertions(+), 29 deletions(-)

diff --git a/drivers/staging/media/imx/imx-ic-prpencvf.c 
b/drivers/staging/media/imx/imx-ic-prpencvf.c
index 67ffa46a8e96..301f5fce53c0 100644
--- a/drivers/staging/media/imx/imx-ic-prpencvf.c
+++ b/drivers/staging/media/imx/imx-ic-prpencvf.c
@@ -1271,17 +1271,26 @@ static int prp_registered(struct v4l2_subdev *sd)
if (ret)
return ret;
 
+   priv->vdev = imx_media_capture_device_init(ic_priv->ipu_dev,
+  &ic_priv->sd,
+  PRPENCVF_SRC_PAD);
+   if (IS_ERR(priv->vdev))
+   return PTR_ERR(priv->vdev);
+
ret = imx_media_capture_device_register(priv->vdev);
if (ret)
-   return ret;
+   goto remove_vdev;
 
ret = prp_init_controls(priv);
if (ret)
-   goto unreg;
+   goto unreg_vdev;
 
return 0;
-unreg:
+
+unreg_vdev:
imx_media_capture_device_unregister(priv->vdev);
+remove_vdev:
+   imx_media_capture_device_remove(priv->vdev);
return ret;
 }
 
@@ -1290,6 +1299,8 @@ static void prp_unregistered(struct v4l2_subdev *sd)
struct prp_priv *priv = sd_to_priv(sd);
 
imx_media_capture_device_unregister(priv->vdev);
+   imx_media_capture_device_remove(priv->vdev);
+
v4l2_ctrl_handler_free(&priv->ctrl_hdlr);
 }
 
@@ -1336,12 +1347,6 @@ static int prp_init(struct imx_ic_priv *ic_priv)
spin_lock_init(&priv->irqlock);
timer_setup(&priv->eof_timeout_timer, prp_eof_timeout, 0);
 
-   priv->vdev = imx_media_capture_device_init(ic_priv->ipu_dev,
-  &ic_priv->sd,
-  PRPENCVF_SRC_PAD);
-   if (IS_ERR(priv->vdev))
-   return PTR_ERR(priv->vdev);
-
mutex_init(&priv->lock);
 
return 0;
@@ -1352,7 +1357,6 @@ static void prp_remove(struct imx_ic_priv *ic_priv)
struct prp_priv *priv = ic_priv->task_priv;
 
mutex_destroy(&priv->lock);
-   imx_media_capture_device_remove(priv->vdev);
 }
 
 struct imx_ic_ops imx_ic_prpencvf_ops = {
diff --git a/drivers/staging/media/imx/imx-media-csi.c 
b/drivers/staging/media/imx/imx-media-csi.c
index 367e39f5b382..b3f1cf08a102 100644
--- a/drivers/staging/media/imx/imx-media-csi.c
+++ b/drivers/staging/media/imx/imx-media-csi.c
@@ -1797,12 +1797,22 @@ static int csi_registered(struct v4l2_subdev *sd)
if (ret)
goto free_fim;
 
+   priv->vdev = imx_media_capture_device_init(priv->sd.dev,
+  &priv->sd,
+  CSI_SRC_PAD_IDMAC);
+   if (IS_ERR(priv->vdev)) {
+   ret = PTR_ERR(priv->vdev);
+   goto free_fim;
+   }
+
ret = imx_media_capture_device_register(priv->vdev);
if (ret)
-   goto free_fim;
+   goto remove_vdev;
 
return 0;
 
+remove_vdev:
+   imx_media_capture_device_remove(priv->vdev);
 free_fim:
if (priv->fim)
imx_media_fim_free(priv->fim);
@@ -1816,6 +1826,7 @@ static void csi_unregistered(struct v4l2_subdev *sd)
struct csi_priv *priv = v4l2_get_subdevdata(sd);
 
imx_media_capture_device_unregister(priv->vdev);
+   imx_media_capture_device_remove(priv->vdev);
 
if (priv->fim)
imx_media_fim_free(priv->fim);
@@ -1963,11 +1974,6 @@ static int imx_csi_probe(struct platform_device *pdev)
imx_media_grp_id_to_sd_name(priv->sd.name, sizeof(priv->sd.name),
priv->sd.grp_id, ipu_get_num(priv->ipu));
 
-   priv->vdev = imx_media_capture_device_init(priv->sd.dev, &priv->sd,
-  CSI_SRC_PAD_IDMAC);
-   if (IS_ERR(priv->vdev))
-   return 

[PATCH 2/2] media: imx: Move pads init to probe

2019-08-23 Thread Steve Longerbeam
If a subdevice is unregistered and then registered again without the
driver being removed and re-probed (which will happen when the media
device is removed and re-probed without also removing/re-probing the
subdevice), media_device_register_entity() is called with a non-zero
entity->num_pads, and then the subdevice's .registered callback calls
media_entity_pads_init(). Thus the subdevice's pad objects are added
to the media device pad list twice, causing list corruption.

One way to fix this would be to create media_entity_pads_destroy(),
and call it in the subdevice's .unregistered callback. But calling
media_entity_pads_init() in the .registered callbacks was done for
legacy reasons and is no longer necessary, so move the call to
media_entity_pads_init() into the subdevice's probe functions. This
fixes the duplicate pad objects in the media device pad list.

Signed-off-by: Steve Longerbeam 
---
 drivers/staging/media/imx/imx-ic-prp.c| 25 ++--
 drivers/staging/media/imx/imx-ic-prpencvf.c   | 29 ++-
 drivers/staging/media/imx/imx-media-capture.c | 15 +-
 drivers/staging/media/imx/imx-media-csi.c | 21 +++---
 drivers/staging/media/imx/imx-media-vdic.c| 27 +++--
 drivers/staging/media/imx/imx6-mipi-csi2.c| 27 -
 drivers/staging/media/imx/imx7-media-csi.c| 18 +++-
 drivers/staging/media/imx/imx7-mipi-csis.c| 23 +--
 8 files changed, 82 insertions(+), 103 deletions(-)

diff --git a/drivers/staging/media/imx/imx-ic-prp.c 
b/drivers/staging/media/imx/imx-ic-prp.c
index 35e60a120dc1..2a4f77e83ed3 100644
--- a/drivers/staging/media/imx/imx-ic-prp.c
+++ b/drivers/staging/media/imx/imx-ic-prp.c
@@ -428,32 +428,19 @@ static int prp_s_frame_interval(struct v4l2_subdev *sd,
return 0;
 }
 
-/*
- * retrieve our pads parsed from the OF graph by the media device
- */
 static int prp_registered(struct v4l2_subdev *sd)
 {
struct prp_priv *priv = sd_to_priv(sd);
-   int i, ret;
u32 code;
 
-   for (i = 0; i < PRP_NUM_PADS; i++) {
-   priv->pad[i].flags = (i == PRP_SINK_PAD) ?
-   MEDIA_PAD_FL_SINK : MEDIA_PAD_FL_SOURCE;
-   }
-
/* init default frame interval */
priv->frame_interval.numerator = 1;
priv->frame_interval.denominator = 30;
 
/* set a default mbus format  */
imx_media_enum_ipu_format(&code, 0, CS_SEL_YUV);
-   ret = imx_media_init_mbus_fmt(&priv->format_mbus, 640, 480, code,
- V4L2_FIELD_NONE, NULL);
-   if (ret)
-   return ret;
-
-   return media_entity_pads_init(&sd->entity, PRP_NUM_PADS, priv->pad);
+   return imx_media_init_mbus_fmt(&priv->format_mbus, 640, 480, code,
+  V4L2_FIELD_NONE, NULL);
 }
 
 static const struct v4l2_subdev_pad_ops prp_pad_ops = {
@@ -487,6 +474,7 @@ static const struct v4l2_subdev_internal_ops 
prp_internal_ops = {
 static int prp_init(struct imx_ic_priv *ic_priv)
 {
struct prp_priv *priv;
+   int i;
 
priv = devm_kzalloc(ic_priv->ipu_dev, sizeof(*priv), GFP_KERNEL);
if (!priv)
@@ -496,7 +484,12 @@ static int prp_init(struct imx_ic_priv *ic_priv)
ic_priv->task_priv = priv;
priv->ic_priv = ic_priv;
 
-   return 0;
+   for (i = 0; i < PRP_NUM_PADS; i++)
+   priv->pad[i].flags = (i == PRP_SINK_PAD) ?
+   MEDIA_PAD_FL_SINK : MEDIA_PAD_FL_SOURCE;
+
+   return media_entity_pads_init(&ic_priv->sd.entity, PRP_NUM_PADS,
+ priv->pad);
 }
 
 static void prp_remove(struct imx_ic_priv *ic_priv)
diff --git a/drivers/staging/media/imx/imx-ic-prpencvf.c 
b/drivers/staging/media/imx/imx-ic-prpencvf.c
index 301f5fce53c0..09c4e3f33807 100644
--- a/drivers/staging/media/imx/imx-ic-prpencvf.c
+++ b/drivers/staging/media/imx/imx-ic-prpencvf.c
@@ -1240,21 +1240,16 @@ static int prp_s_frame_interval(struct v4l2_subdev *sd,
return 0;
 }
 
-/*
- * retrieve our pads parsed from the OF graph by the media device
- */
 static int prp_registered(struct v4l2_subdev *sd)
 {
struct prp_priv *priv = sd_to_priv(sd);
+   struct imx_ic_priv *ic_priv = priv->ic_priv;
int i, ret;
u32 code;
 
+   /* set a default mbus format  */
+   imx_media_enum_ipu_format(&code, 0, CS_SEL_YUV);
for (i = 0; i < PRPENCVF_NUM_PADS; i++) {
-   priv->pad[i].flags = (i == PRPENCVF_SINK_PAD) ?
-   MEDIA_PAD_FL_SINK : MEDIA_PAD_FL_SOURCE;
-
-   /* set a default mbus format  */
-   imx_media_enum_ipu_format(&code, 0, CS_SEL_YUV);
ret = imx_media_init_mbus_fmt(&priv->format_mbus[i],
  640, 480, code, V4L2_FIELD_NONE,
  &priv->cc[i]);
@@ -1266,11 +1261,6 @@ static int prp_registered(struct v4l2_subd

[PATCH] ARM: dts: vf610-zii-scu4-aib: Use generic names for DT nodes

2019-08-23 Thread Andrey Smirnov
The devicetree specification recommends using generic node names.

Some ZII dts files already follow this recommendation, but some don't,
so use generic node names for consistency among the ZII dts files.

Signed-off-by: Andrey Smirnov 
Cc: Shawn Guo 
Cc: Chris Healy 
Cc: Cory Tusar 
Cc: Fabio Estevam 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 arch/arm/boot/dts/vf610-zii-scu4-aib.dts | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/arm/boot/dts/vf610-zii-scu4-aib.dts 
b/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
index 45a978defbdc..6edd682dbd29 100644
--- a/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
+++ b/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
@@ -417,7 +417,7 @@
pinctrl-0 = <&pinctrl_dspi1>;
status = "okay";
 
-   spi-flash@0 {
+   flash@0 {
#address-cells = <1>;
#size-cells = <1>;
compatible = "jedec,spi-nor";
@@ -430,7 +430,7 @@
};
};
 
-   spi-flash@1 {
+   flash@1 {
#address-cells = <1>;
#size-cells = <1>;
compatible = "jedec,spi-nor";
@@ -519,7 +519,7 @@
#gpio-cells = <2>;
};
 
-   lm75@48 {
+   temp-sensor@48 {
compatible = "national,lm75";
reg = <0x48>;
};
@@ -534,7 +534,7 @@
reg = <0x52>;
};
 
-   ds1682@6b {
+   elapsed-time-recorder@6b {
compatible = "dallas,ds1682";
reg = <0x6b>;
};
@@ -551,7 +551,7 @@
reg = <0x38>;
};
 
-   adt7411@4a {
+   adc@4a {
compatible = "adi,adt7411";
reg = <0x4a>;
};
@@ -563,7 +563,7 @@
pinctrl-0 = <&pinctrl_i2c2>;
status = "okay";
 
-   gpio9: sx1503q@20 {
+   gpio9: io-expander@20 {
compatible = "semtech,sx1503q";
pinctrl-names = "default";
pinctrl-0 = <&pinctrl_sx1503_20>;
@@ -574,12 +574,12 @@
interrupts = <31 IRQ_TYPE_EDGE_FALLING>;
};
 
-   lm75@4e {
+   temp-sensor@4e {
compatible = "national,lm75";
reg = <0x4e>;
};
 
-   lm75@4f {
+   temp-sensor@4f {
compatible = "national,lm75";
reg = <0x4f>;
};
@@ -591,17 +591,17 @@
reg = <0x23>;
};
 
-   adt7411@4a {
+   adc@4a {
compatible = "adi,adt7411";
reg = <0x4a>;
};
 
-   at24c08@54 {
+   eeprom@54 {
compatible = "atmel,24c08";
reg = <0x54>;
};
 
-   tca9548@70 {
+   i2c-mux@70 {
compatible = "nxp,pca9548";
pinctrl-names = "default";
#address-cells = <1>;
@@ -639,7 +639,7 @@
};
};
 
-   tca9548@71 {
+   i2c-mux@71 {
compatible = "nxp,pca9548";
pinctrl-names = "default";
reg = <0x71>;
-- 
2.21.0



Re: [PATCH] pwm: mxs: use devm_platform_ioremap_resource() to simplify code

2019-08-23 Thread Uwe Kleine-König
Hello,

On Tue, Aug 20, 2019 at 05:56:40AM +, Anson Huang wrote:
> Gentle ping...

My impression[1] is that Thierry collects patches in bulk once or twice
per release cycle. The last two such bulks were between 5.2-rc6 and
5.2-rc7 and in the 5.2 merge window. So given we're at v5.3-rc5 now I
expect some action soon :-)

> > > From: anson.hu...@nxp.com 
> > > Sent: Thursday, July 18, 2019 9:32 AM
> > >
> > > Use the new helper devm_platform_ioremap_resource() which wraps the
> > > platform_get_resource() and devm_ioremap_resource() together, to
> > > simplify the code.
> > >
> > > Signed-off-by: Anson Huang 
> > 
> > Reviewed-by: Dong Aisheng 

Acked-by: Uwe Kleine-König 

Best regards
Uwe

[1] from git log --committer=Thierry --format=%ci drivers/pwm | cut -d\  -f1 | 
uniq -c
-- 
Pengutronix e.K.   | Uwe Kleine-König|
Industrial Linux Solutions | http://www.pengutronix.de/  |


[PATCH] ARM: dts: vf610-zii-scu4-aib: Configure IRQ line for GPIO expander

2019-08-23 Thread Andrey Smirnov
Configure the IRQ line for the SX1503 GPIO expander. We already have an
appropriate pinmux entry; all that is missing are the "interrupt-parent"
and "interrupts" properties. Add them.

Signed-off-by: Andrey Smirnov 
Cc: Shawn Guo 
Cc: Chris Healy 
Cc: Cory Tusar 
Cc: Fabio Estevam 
Cc: linux-arm-ker...@lists.infradead.org
Cc: linux-kernel@vger.kernel.org
---
 arch/arm/boot/dts/vf610-zii-scu4-aib.dts | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/arch/arm/boot/dts/vf610-zii-scu4-aib.dts 
b/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
index e6c3621079e0..45a978defbdc 100644
--- a/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
+++ b/arch/arm/boot/dts/vf610-zii-scu4-aib.dts
@@ -570,6 +570,8 @@
#gpio-cells = <2>;
reg = <0x20>;
gpio-controller;
+   interrupt-parent = <&gpio1>;
+   interrupts = <31 IRQ_TYPE_EDGE_FALLING>;
};
 
lm75@4e {
-- 
2.21.0



Re: [PATCH] uprobes/x86: fix detection of 32-bit user mode

2019-08-23 Thread Thomas Gleixner
On Fri, 23 Aug 2019, Andy Lutomirski wrote:
> > On Aug 23, 2019, at 5:03 PM, Thomas Gleixner  wrote:
> > 
> >> On Sat, 24 Aug 2019, Thomas Gleixner wrote:
> >> On Fri, 23 Aug 2019, Andy Lutomirski wrote:
>  On Aug 23, 2019, at 4:44 PM, Thomas Gleixner  wrote:
>  
> >> On Sat, 24 Aug 2019, Thomas Gleixner wrote:
> >> On Sun, 28 Jul 2019, Sebastian Mayr wrote:
> >> 
> >> -static inline int sizeof_long(void)
> >> +static inline int sizeof_long(struct pt_regs *regs)
> >> {
> >> -return in_ia32_syscall() ? 4 : 8;
> > 
> > This wants a comment.
> > 
> >> +return user_64bit_mode(regs) ? 8 : 4;
>  
>  The simpler one-liner is to check
>  
>    test_thread_flag(TIF_IA32)
> >>> 
> >>> I still want to finish killing TIF_IA32 some day.  Let’s please not add 
> >>> new users.
> >> 
> >> Well, yes and no. This needs to be backported 
> > 
> > And TBH the magic in user_64bit_mode() is not pretty either.
> > 
> It’s only magic on Xen. I should probably stick a
> cpu_feature_enabled(X86_FEATURE_XENPV) in there instead.

For backporting sake I really prefer the TIF version. One usage site more
is not the end of the world. We can add the user_64bit_mode() variant from
Sebastian on top as a cleanup right away so mainline is clean.
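
For reference, the two variants under discussion side by side (both
collected from the quotes in this thread; they are alternatives, and
neither is necessarily the final committed form):

/* backport-friendly variant using the thread flag: */
static inline int sizeof_long(void)
{
	return test_thread_flag(TIF_IA32) ? 4 : 8;
}

/* Sebastian's variant, keyed off the mode of the probed task: */
static inline int sizeof_long(struct pt_regs *regs)
{
	/*
	 * Check the actual user mode of the task when the probe hit,
	 * instead of in_ia32_syscall(), which is only valid while the
	 * task is inside a syscall.
	 */
	return user_64bit_mode(regs) ? 8 : 4;
}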

Thanks,

tglx

RE: [PATCH v2 03/10] PCI: designware-ep: Move the function of getting MSI capability forward

2019-08-23 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: August 23, 2019 21:39
> To: Xiaowei Bao 
> Cc: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.co; a...@arndb.de; gre...@linuxfoundation.org; M.h.
> Lian ; Mingkai Hu ; Roy
> Zang ; jingooh...@gmail.com;
> gustavo.pimen...@synopsys.com; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-...@lists.ozlabs.org
> Subject: Re: [PATCH v2 03/10] PCI: designware-ep: Move the function of
> getting MSI capability forward
> 
> On Thu, Aug 22, 2019 at 07:22:35PM +0800, Xiaowei Bao wrote:
> > Move the call that retrieves the MSI capability to the front of the init
> > function, because the init function of the EP platform driver will use
> > the value returned by it.
> >
> > Signed-off-by: Xiaowei Bao 
> 
> Reviewed-by: Andrew Murray 

Thanks a lot. I think moving this to ep_init is better.

> 
> > ---
> > v2:
> >  - No change.
> >
> >  drivers/pci/controller/dwc/pcie-designware-ep.c | 7 ---
> >  1 file changed, 4 insertions(+), 3 deletions(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > index b8388f8..0a6c199 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > @@ -656,6 +656,10 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
> > if (ret < 0)
> > epc->max_functions = 1;
> >
> > +   ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI);
> > +
> > +   ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX);
> > +
> > if (ep->ops->ep_init)
> > ep->ops->ep_init(ep);
> >
> > @@ -672,9 +676,6 @@ int dw_pcie_ep_init(struct dw_pcie_ep *ep)
> > dev_err(dev, "Failed to reserve memory for MSI/MSI-X\n");
> > return -ENOMEM;
> > }
> > -   ep->msi_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSI);
> > -
> > -   ep->msix_cap = dw_pcie_ep_find_capability(pci, PCI_CAP_ID_MSIX);
> >
> > offset = dw_pcie_ep_find_ext_capability(pci, PCI_EXT_CAP_ID_REBAR);
> > if (offset) {
> > --
> > 2.9.5
> >


Re: [RFC PATCH v2 00/19] RDMA/FS DAX truncate proposal V1,000,002 ;-)

2019-08-23 Thread Dave Chinner
On Fri, Aug 23, 2019 at 10:15:04AM -0700, Ira Weiny wrote:
> On Fri, Aug 23, 2019 at 10:59:14AM +1000, Dave Chinner wrote:
> > On Wed, Aug 21, 2019 at 11:02:00AM -0700, Ira Weiny wrote:
> > > On Tue, Aug 20, 2019 at 08:55:15AM -0300, Jason Gunthorpe wrote:
> > > > On Tue, Aug 20, 2019 at 11:12:10AM +1000, Dave Chinner wrote:
> > > > > On Mon, Aug 19, 2019 at 09:38:41AM -0300, Jason Gunthorpe wrote:
> > > > > > On Mon, Aug 19, 2019 at 07:24:09PM +1000, Dave Chinner wrote:
> > > > > > 
> > > > > > > So that leaves just the normal close() syscall exit case, where 
> > > > > > > the
> > > > > > > application has full control of the order in which resources are
> > > > > > > released. We've already established that we can block in this
> > > > > > > context.  Blocking in an interruptible state will allow fatal 
> > > > > > > signal
> > > > > > > delivery to wake us, and then we fall into the
> > > > > > > fatal_signal_pending() case if we get a SIGKILL while blocking.
> > > > > > 
> > > > > > The major problem with RDMA is that it doesn't always wait on 
> > > > > > close() for the
> > > > > > MR holding the page pins to be destoyed. This is done to avoid a
> > > > > > deadlock of the form:
> > > > > > 
> > > > > >uverbs_destroy_ufile_hw()
> > > > > >   mutex_lock()
> > > > > >[..]
> > > > > > mmput()
> > > > > >  exit_mmap()
> > > > > >   remove_vma()
> > > > > >fput();
> > > > > > file_operations->release()
> > > > > 
> > > > > I think this is wrong, and I'm pretty sure it's an example of why
> > > > > the final __fput() call is moved out of line.
> > > > 
> > > > Yes, I think so too, all I can say is this *used* to happen, as we
> > > > have special code avoiding it, which is the code that is messing up
> > > > Ira's lifetime model.
> > > > 
> > > > Ira, you could try unraveling the special locking, that solves your
> > > > lifetime issues?
> > > 
> > > Yes I will try to prove this out...  But I'm still not sure this fully 
> > > solves
> > > the problem.
> > > 
> > > This only ensures that the process which has the RDMA context (RDMA FD) 
> > > is safe
> > > with regard to hanging the close for the "data file FD" (the file which 
> > > has
> > > pinned pages) in that _same_ process.  But what about the scenario.
> > > 
> > > Process A has the RDMA context FD and data file FD (with lease) open.
> > > 
> > > Process A uses SCM_RIGHTS to pass the RDMA context FD to Process B.
> > 
> > Passing the RDMA context dependent on a file layout lease to another
> > process that doesn't have a file layout lease or a reference to the
> > original lease should be considered a violation of the layout lease.
> > Process B does not have an active layout lease, and so by the rules
> > of layout leases, it is not allowed to pin the layout of the file.
> > 
> 
> I don't disagree with the semantics of this.  I just don't know how to enforce
> it.
> 
> > > Process A attempts to exit (hangs because data file FD is pinned).
> > > 
> > > Admin kills process A.  kill works because we have allowed for it...
> > > 
> > > Process B _still_ has the RDMA context FD open _and_ therefore still 
> > > holds the
> > > file pins.
> > > 
> > > Truncation still fails.
> > > 
> > > Admin does not know which process is holding the pin.
> > > 
> > > What am I missing?
> > 
> > Application does not hold the correct file layout lease references.
> > Passing the fd via SCM_RIGHTS to a process without a layout lease
> > is equivalent to not using layout leases in the first place.
> 
> Ok, so if I understand you correctly, you would support a failure of
> SCM_RIGHTS in this case?  I'm ok with that but not sure how to implement it
> right now.
> 
> To that end, I would like to simplify this slightly because I'm not convinced
> that SCM_RIGHTS is a problem we need to solve right now.  ie I don't know of a
> user who wants to do this.

I don't think we can support it, let alone want to. SCM_RIGHTS was a
mistake made years ago that has been causing bugs, and complexity to
try and avoid those bugs, ever since.  I'm only talking about it
because someone else raised it, and I assumed they raised it because
they want it to "work".

> Right now duplication via SCM_RIGHTS could fail if _any_ file pins (and by
> definition leases) exist underneath the "RDMA FD" (or other direct access FD,
> like XDP etc) being duplicated.

Sounds like a fine idea to me.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


RE: [PATCH v2 08/10] PCI: layerscape: Add EP mode support for ls1088a and ls2088a

2019-08-23 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: August 23, 2019 22:28
> To: Xiaowei Bao 
> Cc: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.co; a...@arndb.de; gre...@linuxfoundation.org; M.h.
> Lian ; Mingkai Hu ; Roy
> Zang ; jingooh...@gmail.com;
> gustavo.pimen...@synopsys.com; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-...@lists.ozlabs.org
> Subject: Re: [PATCH v2 08/10] PCI: layerscape: Add EP mode support for
> ls1088a and ls2088a
> 
> On Thu, Aug 22, 2019 at 07:22:40PM +0800, Xiaowei Bao wrote:
> > Add PCIe EP mode support for ls1088a and ls2088a, there are some
> > difference between LS1 and LS2 platform, so refactor the code of the
> > EP driver.
> >
> > Signed-off-by: Xiaowei Bao 
> > ---
> > v2:
> >  - New mechanism for layerscape EP driver.
> 
> Was there a v1 of this patch?

Yes, but I don't know how to comments, ^_^

> 
> >
> >  drivers/pci/controller/dwc/pci-layerscape-ep.c | 76
> > --
> >  1 file changed, 58 insertions(+), 18 deletions(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > index 7ca5fe8..2a66f07 100644
> > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > @@ -20,27 +20,29 @@
> >
> >  #define PCIE_DBI2_OFFSET   0x1000  /* DBI2 base address*/
> >
> > -struct ls_pcie_ep {
> > -   struct dw_pcie  *pci;
> > -   struct pci_epc_features *ls_epc;
> > +#define to_ls_pcie_ep(x)   dev_get_drvdata((x)->dev)
> > +
> > +struct ls_pcie_ep_drvdata {
> > +   u32 func_offset;
> > +   const struct dw_pcie_ep_ops *ops;
> > +   const struct dw_pcie_ops*dw_pcie_ops;
> >  };
> >
> > -#define to_ls_pcie_ep(x)   dev_get_drvdata((x)->dev)
> > +struct ls_pcie_ep {
> > +   struct dw_pcie  *pci;
> > +   struct pci_epc_features *ls_epc;
> > +   const struct ls_pcie_ep_drvdata *drvdata; };
> >
> >  static int ls_pcie_establish_link(struct dw_pcie *pci)  {
> > return 0;
> >  }
> >
> > -static const struct dw_pcie_ops ls_pcie_ep_ops = {
> > +static const struct dw_pcie_ops dw_ls_pcie_ep_ops = {
> > .start_link = ls_pcie_establish_link,  };
> >
> > -static const struct of_device_id ls_pcie_ep_of_match[] = {
> > -   { .compatible = "fsl,ls-pcie-ep",},
> > -   { },
> > -};
> > -
> >  static const struct pci_epc_features*  ls_pcie_ep_get_features(struct
> > dw_pcie_ep *ep)  { @@ -82,10 +84,44 @@ static int
> > ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
> > }
> >  }
> >
> > -static const struct dw_pcie_ep_ops pcie_ep_ops = {
> > +static unsigned int ls_pcie_ep_func_conf_select(struct dw_pcie_ep *ep,
> > +   u8 func_no)
> > +{
> > +   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +   struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);
> > +   u8 header_type;
> > +
> > +   header_type = ioread8(pci->dbi_base + PCI_HEADER_TYPE);
> > +
> > +   if (header_type & (1 << 7))
> > +   return pcie->drvdata->func_offset * func_no;
> > +   else
> > +   return 0;
> 
> It looks like there isn't a PCI define for multi function, the nearest I 
> could find
> was PCI_HEADER_TYPE_MULTIDEVICE in hotplug/ibmphp.h. A comment
> above the test might be helpful to explain the test.

Yes, I have not find the PCI_HEADER_TYPE_MULTIDEVICE define. OK, I will add
The comments in next version patch.

> 
> As the ls_pcie_ep_drvdata structures are static, the unset .func_offset will 
> be
> initialised to 0, so you could just drop the test above.

OK, thanks

> 
> However something to the effect of the following may help spot
> misconfiguration:
> 
> WARN_ON(func_no && !pcie->drvdata->func_offset); return
> pcie->drvdata->func_offset * func_no;

Thanks a lot, this looks better.

> 
> The WARN is probably quite useful as if you are attempting to use non-zero
> functions and func_offset isn't set - then things may appear to work normally
> but actually will break horribly.

got it, thanks.
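
Putting the quoted suggestions together, the helper could end up looking
roughly like this (a sketch based on the patch above, not a tested
revision):

static unsigned int ls_pcie_ep_func_conf_select(struct dw_pcie_ep *ep,
						u8 func_no)
{
	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
	struct ls_pcie_ep *pcie = to_ls_pcie_ep(pci);

	/*
	 * A non-zero function without a configured offset would appear
	 * to work but break horribly, so warn loudly about it.
	 */
	WARN_ON(func_no && !pcie->drvdata->func_offset);
	return pcie->drvdata->func_offset * func_no;
}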

> 
> Thanks,
> 
> Andrew Murray
> 
> > +}
> > +
> > +static const struct dw_pcie_ep_ops ls_pcie_ep_ops = {
> > .ep_init = ls_pcie_ep_init,
> > .raise_irq = ls_pcie_ep_raise_irq,
> > .get_features = ls_pcie_ep_get_features,
> > +   .func_conf_select = ls_pcie_ep_func_conf_select, };
> > +
> > +static const struct ls_pcie_ep_drvdata ls1_ep_drvdata = {
> > +   .ops = &ls_pcie_ep_ops,
> > +   .dw_pcie_ops = &dw_ls_pcie_ep_ops,
> > +};
> > +
> > +static const struct ls_pcie_ep_drvdata ls2_ep_drvdata = {
> > +   .func_offset = 0x2,
> > +   .ops = &ls_pcie_ep_ops,
> > +   .dw_pcie_ops = &dw_ls_pcie_ep_ops,
> > +};
> > +
> > +static const struct of_device_id ls_pcie_ep_of_match[] = {
> > +   { .compatible = "fsl,ls1046a-pcie-ep", .data = &ls1_ep_drvda

RE: [PATCH 3/3] nvme: complete request in work queue on CPU with flooded interrupts

2019-08-23 Thread Long Li
>>>Subject: Re: [PATCH 3/3] nvme: complete request in work queue on CPU
>>>with flooded interrupts
>>>
>>>
 Sagi,

 Here are the test results.

 Benchmark command:
 fio --bs=4k --ioengine=libaio --iodepth=64
 --filename=/dev/nvme0n1:/dev/nvme1n1:/dev/nvme2n1:/dev/nvme3n1:/dev/nvme4n1:/dev/nvme5n1:/dev/nvme6n1:/dev/nvme7n1:/dev/nvme8n1:/dev/nvme9n1
 --direct=1 --runtime=90 --numjobs=80 --rw=randread --name=test
 --group_reporting --gtod_reduce=1

 With your patch: 1720k IOPS
 With threaded interrupts: 1320k IOPS
 With just interrupts: 3720k IOPS

 Interrupts are the fastest but we need to find a way to throttle it.
>>>
>>>This is the workload that generates the flood?
>>>If so I did not expect that this would be the perf difference..
>>>
>>>If completions keep pounding on the cpu, I would expect irqpoll to simply
>>>keep running forever and poll the cqs. There is no fundamental reason why
>>>polling would be faster in an interrupt, what matters could be:
>>>1. we reschedule more than we need to
>>>2. we don't reap enough completions in every poll round, which will trigger
>>>rearming the interrupt and then when it fires reschedule another softirq...
>>>

Yes, I think it's the rescheduling that takes its toll. With the patch there is
a lot of ksoftirqd activity (compared to nearly none without the patch).
A 90-second FIO run shows a big difference in context switches across all CPUs:
With patch: 5755849
Without patch: 1462931

>>>Maybe we need to take care of some irq_poll optimizations?
>>>
>>>Does this (untested) patch make any difference?
>>>--
>>>diff --git a/lib/irq_poll.c b/lib/irq_poll.c index 2f17b488d58e..0e94183eba15
>>>100644
>>>--- a/lib/irq_poll.c
>>>+++ b/lib/irq_poll.c
>>>@@ -12,7 +12,8 @@
>>>  #include 
>>>  #include 
>>>
>>>-static unsigned int irq_poll_budget __read_mostly = 256;
>>>+static unsigned int irq_poll_budget __read_mostly = 3000; unsigned int
>>>+__read_mostly irqpoll_budget_usecs = 2000;
>>>
>>>  static DEFINE_PER_CPU(struct list_head, blk_cpu_iopoll);
>>>
>>>@@ -77,32 +78,26 @@ EXPORT_SYMBOL(irq_poll_complete);
>>>
>>>  static void __latent_entropy irq_poll_softirq(struct softirq_action *h)
>>>  {
>>>-   struct list_head *list = this_cpu_ptr(&blk_cpu_iopoll);
>>>-   int rearm = 0, budget = irq_poll_budget;
>>>-   unsigned long start_time = jiffies;
>>>+   struct list_head *irqpoll_list = this_cpu_ptr(&blk_cpu_iopoll);
>>>+   unsigned int budget = irq_poll_budget;
>>>+   unsigned long time_limit =
>>>+   jiffies + usecs_to_jiffies(irqpoll_budget_usecs);
>>>+   LIST_HEAD(list);
>>>
>>> local_irq_disable();
>>>+   list_splice_init(irqpoll_list, &list);
>>>+   local_irq_enable();
>>>
>>>-   while (!list_empty(list)) {
>>>+   while (!list_empty(&list)) {
>>> struct irq_poll *iop;
>>> int work, weight;
>>>
>>>-   /*
>>>-* If softirq window is exhausted then punt.
>>>-*/
>>>-   if (budget <= 0 || time_after(jiffies, start_time)) {
>>>-   rearm = 1;
>>>-   break;
>>>-   }
>>>-
>>>-   local_irq_enable();
>>>-
>>> /* Even though interrupts have been re-enabled, this
>>>  * access is safe because interrupts can only add new
>>>  * entries to the tail of this list, and only ->poll()
>>>  * calls can remove this head entry from the list.
>>>  */
>>>-   iop = list_entry(list->next, struct irq_poll, list);
>>>+   iop = list_first_entry(&list, struct irq_poll, list);
>>>
>>> weight = iop->weight;
>>> work = 0;
>>>@@ -111,8 +106,6 @@ static void __latent_entropy irq_poll_softirq(struct
>>>softirq_action *h)
>>>
>>> budget -= work;
>>>
>>>-   local_irq_disable();
>>>-
>>> /*
>>>  * Drivers must not modify the iopoll state, if they
>>>  * consume their assigned weight (or more, some drivers 
>>> can't @@
>>>-125,11 +118,21 @@ static void __latent_entropy irq_poll_softirq(struct
>>>softirq_action *h)
>>> if (test_bit(IRQ_POLL_F_DISABLE, &iop->state))
>>> __irq_poll_complete(iop);
>>> else
>>>-   list_move_tail(&iop->list, list);
>>>+   list_move_tail(&iop->list, &list);
>>> }
>>>+
>>>+   /*
>>>+* If softirq window is exhausted then punt.
>>>+*/
>>>+   if (budget <= 0 || time_after_eq(jiffies, time_limit))
>>>+   break;
>>> }
>>>
>>>-   if (rearm)
>>>+   local_irq_disable();
>>>+
>>>+   list_splice_tail_init(irqpoll_list, &list);
>>>+   list_splice(&list, i

Re: [PATCH] uprobes/x86: fix detection of 32-bit user mode

2019-08-23 Thread Andy Lutomirski



> On Aug 23, 2019, at 5:03 PM, Thomas Gleixner  wrote:
> 
>> On Sat, 24 Aug 2019, Thomas Gleixner wrote:
>> On Fri, 23 Aug 2019, Andy Lutomirski wrote:
 On Aug 23, 2019, at 4:44 PM, Thomas Gleixner  wrote:
 
>> On Sat, 24 Aug 2019, Thomas Gleixner wrote:
>> On Sun, 28 Jul 2019, Sebastian Mayr wrote:
>> 
>> -static inline int sizeof_long(void)
>> +static inline int sizeof_long(struct pt_regs *regs)
>> {
>> -return in_ia32_syscall() ? 4 : 8;
> 
> This wants a comment.
> 
>> +return user_64bit_mode(regs) ? 8 : 4;
 
 The simpler one-liner is to check
 
   test_thread_flag(TIF_IA32)
>>> 
>>> I still want to finish killing TIF_IA32 some day.  Let’s please not add new 
>>> users.
>> 
>> Well, yes and no. This needs to be backported 
> 
> And TBH the magic in user_64bit_mode() is not pretty either.
> 
> 

It’s only magic on Xen. I should probably stick a 
cpu_feature_enabled(X86_FEATURE_XENPV) in there instead.

Re: [RFC PATCH v2 00/19] RDMA/FS DAX truncate proposal V1,000,002 ;-)

2019-08-23 Thread Dave Chinner
On Fri, Aug 23, 2019 at 09:04:29AM -0300, Jason Gunthorpe wrote:
> On Fri, Aug 23, 2019 at 01:23:45PM +1000, Dave Chinner wrote:
> 
> > > But the fact that RDMA, and potentially others, can "pass the
> > > pins" to other processes is something I spent a lot of time trying to 
> > > work out.
> > 
> > There's nothing in file layout lease architecture that says you
> > can't "pass the pins" to another process.  All the file layout lease
> > requirements say is that if you are going to pass a resource for
> > which the layout lease guarantees access for to another process,
> > then the destination process already have a valid, active layout
> > lease that covers the range of the pins being passed to it via the
> > RDMA handle.
> 
> How would the kernel detect and enforce this? There are many ways to
> pass a FD.

AFAIC, that's not really a kernel problem. It's more of an
application design constraint than anything else. i.e. if the app
passes the IB context to another process without a lease, then the
original process is still responsible for recalling the lease and
has to tell that other process to release the IB handle and it's
resources.

> IMHO it is wrong to try and create a model where the file lease exists
> independently from the kernel object relying on it. In other words the
> IB MR object itself should hold a reference to the lease it relies
> upon to function properly.

That still doesn't work. Leases are not individually trackable or
reference-counted objects - they are attached to a struct
file but, in reality, they are far more restricted than a struct
file.

That is, a lease specifically tracks the pid and the _open fd_ it
was obtained for, so it is essentially owned by a specific process
context. Hence a lease is not able to be passed to a separate
process context and have it still work correctly for lease break
notifications.  i.e. the layout break signal gets delivered to
original process that created the struct file, if it still exists
and has the original fd still open. It does not get sent to the
process that currently holds a reference to the IB context.

So while a struct file passed to another process might still have
an active lease, and you can change the owner of the struct file
via fcntl(F_SETOWN), you can't associate the existing lease with
the new fd in the new process, and so layout break signals can't be
directed at the lease fd.

This really means that a lease can only be owned by a single process
context - it can't be shared across multiple processes (so I was
wrong about dup/pass as being a possible way of passing them)
because there's only one process that can "own" a struct file, and
that is where signals are sent when the lease needs to be broken.

So, fundamentally, if you want to pass a resource that pins a file
layout between processes, both processes need to hold a layout lease
on that file range. And that means exclusive leases and passing
layouts between processes are fundamentally incompatible because you
can't hold two exclusive leases on the same file range.

Cheers,

Dave.
-- 
Dave Chinner
da...@fromorbit.com


RE: [PATCH v2 07/10] PCI: layerscape: Modify the MSIX to the doorbell way

2019-08-23 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: August 23, 2019 21:58
> To: Xiaowei Bao 
> Cc: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.co; a...@arndb.de; gre...@linuxfoundation.org; M.h.
> Lian ; Mingkai Hu ; Roy
> Zang ; jingooh...@gmail.com;
> gustavo.pimen...@synopsys.com; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-...@lists.ozlabs.org
> Subject: Re: [PATCH v2 07/10] PCI: layerscape: Modify the MSIX to the
> doorbell way
> 
> On Thu, Aug 22, 2019 at 07:22:39PM +0800, Xiaowei Bao wrote:
> > The layerscape platform use the doorbell way to trigger MSIX interrupt
> > in EP mode.
> >
> 
> I have no problems with this patch, however...
> 
> Are you able to add to this message a reason why you are making this
> change? Did dw_pcie_ep_raise_msix_irq not work when func_no != 0? Or did
> it work, but dw_pcie_ep_raise_msix_irq_doorbell is more efficient?

The fact is that this driver was previously verified on NXP's ls1046a
platform, and ls1046a doesn't support the MSI-X feature, so I set the
msix_capable field of the pci_epc_features struct to false. Other platforms,
e.g. ls1088a, do support the MSI-X feature. I verified MSI-X on ls1088a and
it did not work, so I changed to the doorbell way. Thanks.

> 
> Thanks,
> 
> Andrew Murray
> 
> > Signed-off-by: Xiaowei Bao 
> > ---
> > v2:
> >  - No change.
> >
> >  drivers/pci/controller/dwc/pci-layerscape-ep.c | 3 ++-
> >  1 file changed, 2 insertions(+), 1 deletion(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > index 8461f62..7ca5fe8 100644
> > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > @@ -74,7 +74,8 @@ static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep,
> u8 func_no,
> > case PCI_EPC_IRQ_MSI:
> > return dw_pcie_ep_raise_msi_irq(ep, func_no, interrupt_num);
> > case PCI_EPC_IRQ_MSIX:
> > -   return dw_pcie_ep_raise_msix_irq(ep, func_no, interrupt_num);
> > +   return dw_pcie_ep_raise_msix_irq_doorbell(ep, func_no,
> > + interrupt_num);
> > default:
> > dev_err(pci->dev, "UNKNOWN IRQ type\n");
> > return -EINVAL;
> > --
> > 2.9.5
> >


Re: [PATCH 01/15] sched: introduce task_se_h_load helper

2019-08-23 Thread Rik van Riel
On Fri, 2019-08-23 at 20:13 +0200, Dietmar Eggemann wrote:
> 
> 
> > @@ -1668,7 +1668,7 @@ static void task_numa_compare(struct
> > task_numa_env *env,
> > /*
> >  * In the overloaded case, try and keep the load balanced.
> >  */
> > -   load = task_h_load(env->p) - task_h_load(cur);
> > +   load = task_se_h_load(env->p->se) - task_se_h_load(cur->se);
> 
> Shouldn't this be:
> 
> load = task_se_h_load(&env->p->se) - task_se_h_load(&cur->se);

Yes indeed. Sorry I forgot to fix these after you
pointed them out last time.

They are now fixed in my tree.

-- 
All Rights Reversed.




Re: [PATCH] uprobes/x86: fix detection of 32-bit user mode

2019-08-23 Thread Thomas Gleixner
On Sat, 24 Aug 2019, Thomas Gleixner wrote:
> On Fri, 23 Aug 2019, Andy Lutomirski wrote:
> > > On Aug 23, 2019, at 4:44 PM, Thomas Gleixner  wrote:
> > > 
> > >> On Sat, 24 Aug 2019, Thomas Gleixner wrote:
> > >>> On Sun, 28 Jul 2019, Sebastian Mayr wrote:
> > >>> 
> > >>> -static inline int sizeof_long(void)
> > >>> +static inline int sizeof_long(struct pt_regs *regs)
> > >>> {
> > >>> -return in_ia32_syscall() ? 4 : 8;
> > >> 
> > >>  This wants a comment.
> > >> 
> > >>> +return user_64bit_mode(regs) ? 8 : 4;
> > > 
> > > The simpler one-liner is to check
> > > 
> > >test_thread_flag(TIF_IA32)
> > 
> > I still want to finish killing TIF_IA32 some day.  Let’s please not add new 
> > users.
> 
> Well, yes and no. This needs to be backported 

And TBH the magic in user_64bit_mode() is not pretty either.

Thanks,

tglx

Re: [PATCH] uprobes/x86: fix detection of 32-bit user mode

2019-08-23 Thread Thomas Gleixner
On Fri, 23 Aug 2019, Andy Lutomirski wrote:
> > On Aug 23, 2019, at 4:44 PM, Thomas Gleixner  wrote:
> > 
> >> On Sat, 24 Aug 2019, Thomas Gleixner wrote:
> >>> On Sun, 28 Jul 2019, Sebastian Mayr wrote:
> >>> 
> >>> -static inline int sizeof_long(void)
> >>> +static inline int sizeof_long(struct pt_regs *regs)
> >>> {
> >>> -return in_ia32_syscall() ? 4 : 8;
> >> 
> >>  This wants a comment.
> >> 
> >>> +return user_64bit_mode(regs) ? 8 : 4;
> > 
> > The simpler one-liner is to check
> > 
> >test_thread_flag(TIF_IA32)
> 
> I still want to finish killing TIF_IA32 some day.  Let’s please not add new 
> users.

Well, yes and no. This needs to be backported 

RE: [PATCH v2 05/10] PCI: layerscape: Fix some format issue of the code

2019-08-23 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: 23 August 2019 21:45
> To: Xiaowei Bao 
> Cc: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.co; a...@arndb.de; gre...@linuxfoundation.org; M.h.
> Lian ; Mingkai Hu ; Roy
> Zang ; jingooh...@gmail.com;
> gustavo.pimen...@synopsys.com; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-...@lists.ozlabs.org
> Subject: Re: [PATCH v2 05/10] PCI: layerscape: Fix some format issue of the
> code
> 
> On Thu, Aug 22, 2019 at 07:22:37PM +0800, Xiaowei Bao wrote:
> > Fix some format issues of the code in the EP driver.
> >
> > Signed-off-by: Xiaowei Bao 
> 
> Reviewed-by: Andrew Murray 

Thanks.

> 
> 
> > ---
> > v2:
> >  - No change.
> >
> >  drivers/pci/controller/dwc/pci-layerscape-ep.c | 4 ++--
> >  1 file changed, 2 insertions(+), 2 deletions(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > index be61d96..4e92a95 100644
> > --- a/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > +++ b/drivers/pci/controller/dwc/pci-layerscape-ep.c
> > @@ -62,7 +62,7 @@ static void ls_pcie_ep_init(struct dw_pcie_ep *ep)
> > }
> >
> >  static int ls_pcie_ep_raise_irq(struct dw_pcie_ep *ep, u8 func_no,
> > - enum pci_epc_irq_type type, u16 interrupt_num)
> > +   enum pci_epc_irq_type type, u16 interrupt_num)
> >  {
> > struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> >
> > @@ -86,7 +86,7 @@ static const struct dw_pcie_ep_ops pcie_ep_ops = {
> > };
> >
> >  static int __init ls_add_pcie_ep(struct ls_pcie_ep *pcie,
> > -   struct platform_device *pdev)
> > +struct platform_device *pdev)
> >  {
> > struct dw_pcie *pci = pcie->pci;
> > struct device *dev = pci->dev;
> > --
> > 2.9.5
> >


Re: [PATCH] uprobes/x86: fix detection of 32-bit user mode

2019-08-23 Thread Andy Lutomirski



> On Aug 23, 2019, at 4:44 PM, Thomas Gleixner  wrote:
> 
>> On Sat, 24 Aug 2019, Thomas Gleixner wrote:
>>> On Sun, 28 Jul 2019, Sebastian Mayr wrote:
>>> 
>>> -static inline int sizeof_long(void)
>>> +static inline int sizeof_long(struct pt_regs *regs)
>>> {
>>> -return in_ia32_syscall() ? 4 : 8;
>> 
>>  This wants a comment.
>> 
>>> +return user_64bit_mode(regs) ? 8 : 4;
> 
> The simpler one-liner is to check
> 
>test_thread_flag(TIF_IA32)

I still want to finish killing TIF_IA32 some day.  Let’s please not add new 
users.



RE: [PATCH v2 02/10] PCI: designware-ep: Add the doorbell mode of MSI-X in EP mode

2019-08-23 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: 23 August 2019 21:36
> To: Xiaowei Bao 
> Cc: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.co; a...@arndb.de; gre...@linuxfoundation.org; M.h.
> Lian ; Mingkai Hu ; Roy
> Zang ; jingooh...@gmail.com;
> gustavo.pimen...@synopsys.com; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-...@lists.ozlabs.org
> Subject: Re: [PATCH v2 02/10] PCI: designware-ep: Add the doorbell mode of
> MSI-X in EP mode
> 
> On Thu, Aug 22, 2019 at 07:22:34PM +0800, Xiaowei Bao wrote:
> > Add the doorbell mode of MSI-X in EP mode.
> >
> > Signed-off-by: Xiaowei Bao 
> > ---
> > v2:
> >  - Remove the unused macro.
> >
> >  drivers/pci/controller/dwc/pcie-designware-ep.c | 14 ++
> >  drivers/pci/controller/dwc/pcie-designware.h| 12 
> >  2 files changed, 26 insertions(+)
> >
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > index 3e2b740..b8388f8 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > @@ -480,6 +480,20 @@ int dw_pcie_ep_raise_msi_irq(struct dw_pcie_ep
> *ep, u8 func_no,
> > return 0;
> >  }
> >
> > +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8
> func_no,
> > +  u16 interrupt_num)
> > +{
> > +   struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > +   u32 msg_data;
> > +
> > +   msg_data = (func_no << PCIE_MSIX_DOORBELL_PF_SHIFT) |
> > +  (interrupt_num - 1);
> > +
> > +   dw_pcie_writel_dbi(pci, PCIE_MSIX_DOORBELL, msg_data);
> > +
> > +   return 0;
> > +}
> > +
> >  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
> >   u16 interrupt_num)
> >  {
> > diff --git a/drivers/pci/controller/dwc/pcie-designware.h
> > b/drivers/pci/controller/dwc/pcie-designware.h
> > index a0fdbf7..895a9ef 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware.h
> > +++ b/drivers/pci/controller/dwc/pcie-designware.h
> > @@ -88,6 +88,9 @@
> >  #define PCIE_MISC_CONTROL_1_OFF0x8BC
> >  #define PCIE_DBI_RO_WR_EN  BIT(0)
> >
> > +#define PCIE_MSIX_DOORBELL 0x948
> > +#define PCIE_MSIX_DOORBELL_PF_SHIFT24
> > +
> >  /*
> >   * iATU Unroll-specific register definitions
> >   * From 4.80 core version the address translation will be made by
> > unroll @@ -400,6 +403,8 @@ int dw_pcie_ep_raise_msi_irq(struct
> dw_pcie_ep *ep, u8 func_no,
> >  u8 interrupt_num);
> >  int dw_pcie_ep_raise_msix_irq(struct dw_pcie_ep *ep, u8 func_no,
> >  u16 interrupt_num);
> > +int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep *ep, u8
> func_no,
> > +  u16 interrupt_num);
> >  void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno bar);
> > #else  static inline void dw_pcie_ep_linkup(struct dw_pcie_ep *ep) @@
> > -432,6 +437,13 @@ static inline int dw_pcie_ep_raise_msix_irq(struct
> dw_pcie_ep *ep, u8 func_no,
> > return 0;
> >  }
> >
> > +static inline int dw_pcie_ep_raise_msix_irq_doorbell(struct dw_pcie_ep
> *ep,
> > +u8 func_no,
> > +u16 interrupt_num)
> > +{
> > +   return 0;
> > +}
> > +
> 
> Looks OK to me.
> 
> Reviewed-by: Andrew Murray 

Thanks a lot.

> 
> >  static inline void dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum
> > pci_barno bar)  {  }
> > --
> > 2.9.5
> >


RE: [PATCH v2 01/10] PCI: designware-ep: Add multiple PFs support for DWC

2019-08-23 Thread Xiaowei Bao


> -Original Message-
> From: Andrew Murray 
> Sent: 23 August 2019 21:25
> To: Xiaowei Bao 
> Cc: bhelg...@google.com; robh...@kernel.org; mark.rutl...@arm.com;
> shawn...@kernel.org; Leo Li ; kis...@ti.com;
> lorenzo.pieral...@arm.co; a...@arndb.de; gre...@linuxfoundation.org; M.h.
> Lian ; Mingkai Hu ; Roy
> Zang ; jingooh...@gmail.com;
> gustavo.pimen...@synopsys.com; linux-...@vger.kernel.org;
> devicet...@vger.kernel.org; linux-kernel@vger.kernel.org;
> linux-arm-ker...@lists.infradead.org; linuxppc-...@lists.ozlabs.org
> Subject: Re: [PATCH v2 01/10] PCI: designware-ep: Add multiple PFs support
> for DWC
> 
> On Thu, Aug 22, 2019 at 07:22:33PM +0800, Xiaowei Bao wrote:
> > Add multiple PFs support for DWC. Different PFs have different config
> > spaces; we use the pf-offset property, obtained from the DTS, to
> > access the different PF config spaces.
> 
> It looks like you're missing a --cover-letter again.
> 
> >
> > Signed-off-by: Xiaowei Bao 
> > ---
> > v2:
> >  - Remove duplicated code.
> >  - Reimplement the PF config space access way.
> >
> >  drivers/pci/controller/dwc/pcie-designware-ep.c | 122
> 
> >  drivers/pci/controller/dwc/pcie-designware.c|  59 
> >  drivers/pci/controller/dwc/pcie-designware.h|  11 ++-
> >  3 files changed, 134 insertions(+), 58 deletions(-)
> >
> > diff --git a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > index 2bf5a35..3e2b740 100644
> > --- a/drivers/pci/controller/dwc/pcie-designware-ep.c
> > +++ b/drivers/pci/controller/dwc/pcie-designware-ep.c
> > @@ -19,12 +19,17 @@ void dw_pcie_ep_linkup(struct dw_pcie_ep *ep)
> > pci_epc_linkup(epc);
> >  }
> >
> > -static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, enum pci_barno
> bar,
> > -  int flags)
> > +static void __dw_pcie_ep_reset_bar(struct dw_pcie *pci, u8 func_no,
> > +  enum pci_barno bar, int flags)
> >  {
> > u32 reg;
> > +   unsigned int func_offset = 0;
> > +   struct dw_pcie_ep *ep = &pci->ep;
> >
> > -   reg = PCI_BASE_ADDRESS_0 + (4 * bar);
> > +   if (ep->ops->func_conf_select)
> > +   func_offset = ep->ops->func_conf_select(ep, func_no);
> > +
> > +   reg = func_offset + PCI_BASE_ADDRESS_0 + (4 * bar);
> 
> This pattern of checking if func_conf_select exists and using it to get an 
> offset
> is repeated a lot throughout this file. You could move this functionality 
> into a
> new function (similar to dw_pcie_read_dbi etc). Or perhaps a new variant of
> dw_pcie_writel_ should be created that takes a func_no argument.

Thanks for your comments. I thought about this method before, but there
is an issue with how different platforms access the config space of the
different functions. Our platform uses the method where each function
has a different offset from dbi_base for its config space, but other
platforms may instead write a register to select the function's config
space. So I think it is better to reserve a callback that each platform
can implement with its own method. My point is that, with the register
method, a platform can do the register write inside this function and
return an offset of 0; with the offset method, it can return the offset
value, which can then be used by the dw_pcie_ep driver.
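
As a hypothetical sketch of the register-write style, just to
illustrate the callback's contract (FOO_FUNC_SELECT is a made-up
register name, not from any real platform):

#define FOO_FUNC_SELECT	0x900	/* made-up vendor register offset */

/* Select the target function through a vendor-specific register and
 * return an offset of 0, so the generic dw_pcie_ep code keeps
 * accessing the config space through dbi_base directly. */
static unsigned int foo_pcie_ep_func_conf_select(struct dw_pcie_ep *ep,
						 u8 func_no)
{
	struct dw_pcie *pci = to_dw_pcie_from_ep(ep);

	dw_pcie_writel_dbi(pci, FOO_FUNC_SELECT, func_no);
	return 0;
}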
 
> 
> 
> > dw_pcie_dbi_ro_wr_en(pci);
> > dw_pcie_writel_dbi2(pci, reg, 0x0);
> > dw_pcie_writel_dbi(pci, reg, 0x0);
> 
> 
> > @@ -235,7 +257,7 @@ static int dw_pcie_ep_map_addr(struct pci_epc
> *epc, u8 func_no,
> > struct dw_pcie_ep *ep = epc_get_drvdata(epc);
> > struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> >
> > -   ret = dw_pcie_ep_outbound_atu(ep, addr, pci_addr, size);
> > +   ret = dw_pcie_ep_outbound_atu(ep, func_no, addr, pci_addr, size);
> > if (ret) {
> > dev_err(pci->dev, "Failed to enable address\n");
> > return ret;
> > @@ -249,11 +271,15 @@ static int dw_pcie_ep_get_msi(struct pci_epc
> *epc, u8 func_no)
> > struct dw_pcie_ep *ep = epc_get_drvdata(epc);
> > struct dw_pcie *pci = to_dw_pcie_from_ep(ep);
> > u32 val, reg;
> > +   unsigned int func_offset = 0;
> > +
> > +   if (ep->ops->func_conf_select)
> > +   func_offset = ep->ops->func_conf_select(ep, func_no);
> >
> > if (!ep->msi_cap)
> > return -EINVAL;
> >
> > -   reg = ep->msi_cap + PCI_MSI_FLAGS;
> > +   reg = ep->msi_cap + func_offset + PCI_MSI_FLAGS;
> 
> This makes me nervous.
> 
> From a PCI viewpoint, each function has its own capability structure and
> within each function there may exist a MSI capability. Yet what we're doing
> here is using dw_pcie_ep_find_capability to get the list of capabilities for
> function 0, and then applying offsets from that for subsequent functions. I.e.
> we're applying DW specific knowledge to find the correct capability, rather
> than following the general PCI approach.
> 
> I think the above hunk shouldn't be required - but

Re: [PATCH] uprobes/x86: fix detection of 32-bit user mode

2019-08-23 Thread Thomas Gleixner
On Sat, 24 Aug 2019, Thomas Gleixner wrote:
> On Sun, 28 Jul 2019, Sebastian Mayr wrote:
> 
> > -static inline int sizeof_long(void)
> > +static inline int sizeof_long(struct pt_regs *regs)
> >  {
> > -   return in_ia32_syscall() ? 4 : 8;
> 
>   This wants a comment.
> 
> > +   return user_64bit_mode(regs) ? 8 : 4;

The simpler one-liner is to check

test_thread_flag(TIF_IA32)

which is only true for IA32 and independent of syscalls, exceptions ...

Thanks,

tglx


Re: [PATCH] uprobes/x86: fix detection of 32-bit user mode

2019-08-23 Thread Thomas Gleixner
Sebastian,

On Sun, 28 Jul 2019, Sebastian Mayr wrote:

sorry for the delay...

> 32-bit processes running on a 64-bit kernel are not always detected
> correctly, causing the process to crash when uretprobes are installed.
> The reason for the crash is that in_ia32_syscall() is used to determine
> the process's mode, which only works correctly when called from a
> syscall. In the case of uretprobes, however, the function is called from
> a software interrupt and always returns 'false' (on a 64-bit kernel). In

s/software interrupt/exception/

> consequence this leads to corruption of the process's return address.

Nice detective work and well written changelog!

> This can be fixed by using user_64bit_mode(), which should always be
> correct.

This wants to be:

  Fix this by using user_64bit_mode() which is always correct.

Be imperative, even if not entirely sure. You nicely put a disclaimer into
the discard section.

This also wants a Fixes tag and cc stable. Interestingly enough this should
have been detected by the rename with

  abfb9498ee13 ("x86/entry: Rename is_{ia32,x32}_task() to in_{ia32,x32}_syscall()")

which states in the changelog:

The is_ia32_task()/is_x32_task() function names are a big misnomer: they
suggest that the compat-ness of a system call is a task property, which
is not true: the compat-ness of a system call purely depends on how it
was invoked through the system call layer.
.

and then it went and blindly renamed every call site 

And sadly this was already mentioned here:

   8faaed1b9f50 ("uprobes/x86: Introduce sizeof_long(), cleanup adjust_ret_addr() and arch_uretprobe_hijack_return_addr()")

where the changelog says:

   TODO: is_ia32_task() is not what we actually want, TS_COMPAT does
   not necessarily mean 32bit. Fortunately syscall-like insns can't be
   probed so it actually works, but it would be better to rename and
   use is_ia32_frame().

and goes all the way back to:

0326f5a94dde ("uprobes/core: Handle breakpoint and singlestep exceptions")

Oh well. 7 years until someone actually tried a uretprobe on a 32bit
process on a 64bit kernel

> -static inline int sizeof_long(void)
> +static inline int sizeof_long(struct pt_regs *regs)
>  {
> - return in_ia32_syscall() ? 4 : 8;

  This wants a comment.

> + return user_64bit_mode(regs) ? 8 : 4;
>  }

No need to resend, I'll fix this up while applying.

Thank you very much for digging into this!

  tglx




Re: [PATCH] Partially revert "mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones"

2019-08-23 Thread Roman Gushchin
On Fri, Aug 16, 2019 at 05:47:26PM -0700, Roman Gushchin wrote:
> Commit 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync
> with the hierarchical ones") effectively decreased the precision of
> per-memcg vmstats_local and per-memcg-per-node lruvec percpu counters.
> 
> That's good for displaying in memory.stat, but brings a serious regression
> into the reclaim process.
> 
> One issue I've discovered and debugged is the following:
> lruvec_lru_size() can return 0 instead of the actual number of pages
> in the lru list, preventing the kernel from reclaiming the last
> remaining pages. The result is yet another flood of dying memory
> cgroups. The opposite is also happening: scanning an empty lru list
> is a waste of cpu time.
> 
> Also, inactive_list_is_low() can return incorrect values, preventing
> the active lru from being scanned and freed. It can fail both because
> the sizes of the active and inactive lists are inaccurate, and because
> the number of workingset refaults isn't precise. In other words,
> the result is pretty random.
> 
> I'm not sure if using the approximate number of slab pages in
> count_shadow_number() is acceptable, but the issues described above
> are enough to partially revert the patch.
> 
> Let's keep per-memcg vmstat_local batched (they are only used for
> displaying stats to the userspace), but keep lruvec stats precise.
> This change fixes the dead memcg flooding on my setup.
> 
> Fixes: 766a4c19d880 ("mm/memcontrol.c: keep local VM counters in sync with the hierarchical ones")
> Signed-off-by: Roman Gushchin 
> Cc: Yafang Shao 
> Cc: Johannes Weiner 

Any other concerns/comments here?

I'd prefer to fix the regression: we're likely leaking several pages
of memory for each created and destroyed memory cgroup. Plus
all internal structures, which are measured in hundreds of kb.

Thanks!


[PATCH v7 4/8] PCI/DPC: Add Error Disconnect Recover (EDR) support

2019-08-23 Thread sathyanarayanan . kuppuswamy
From: Kuppuswamy Sathyanarayanan 

As per ACPI specification r6.3, sec 5.6.6, when firmware owns Downstream
Port Containment (DPC), it is expected to use the "Error Disconnect
Recover" (EDR) notification to alert OSPM of a DPC event. If the OS
supports EDR, it is expected to handle the software state invalidation
and port recovery in the OS, and also to let firmware know the recovery
status via the _OST ACPI call. Related _OST status codes can be found in
ACPI specification r6.3, sec 6.3.5.2.

Also, as per PCI firmware specification r3.2 Downstream Port Containment
Related Enhancements ECN, sec 4.5.1, table 4-6, if DPC is controlled by
firmware (firmware first mode), firmware is responsible for configuring
DPC, and the OS is responsible for error recovery. Also, the OS is
allowed to modify DPC registers only during the EDR notification window.
So with EDR support, the OS should provide DPC port services even in
firmware first mode.

Cc: Keith Busch 
Signed-off-by: Kuppuswamy Sathyanarayanan 

Acked-by: Keith Busch 
Tested-by: Huong Nguyen 
Tested-by: Austin Bolen 
---
 drivers/pci/pci-acpi.c   |  91 
 drivers/pci/pcie/Kconfig |  10 
 drivers/pci/pcie/dpc.c   | 125 ++-
 include/linux/pci-acpi.h |  11 
 4 files changed, 236 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pci-acpi.c b/drivers/pci/pci-acpi.c
index 45049f558860..82abab25daf2 100644
--- a/drivers/pci/pci-acpi.c
+++ b/drivers/pci/pci-acpi.c
@@ -104,6 +104,97 @@ int acpi_get_rc_resources(struct device *dev, const char *hid, u16 segment,
 }
 #endif
 
+#if defined(CONFIG_PCIE_DPC) && defined(CONFIG_ACPI)
+
+/*
+ * _DSM wrapper function to enable/disable the DPC port.
+ * @pdev, @handle: PCI device and its ACPI handle
+ * @enable: state of the DPC port (0 or 1)
+ *
+ * Returns 0 on success or errno on failure.
+ */
+int acpi_enable_dpc_port(struct pci_dev *pdev, acpi_handle handle, bool enable)
+{
+   union acpi_object *obj;
+   int status = 0;
+   union acpi_object argv4;
+
+   argv4.type = ACPI_TYPE_INTEGER;
+   argv4.integer.value = enable;
+
+   obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 1,
+   EDR_PORT_ENABLE_DSM, &argv4);
+   if (!obj)
+   return -EIO;
+
+   if (obj->type != ACPI_TYPE_INTEGER || obj->integer.value != enable)
+   status = -EIO;
+
+   ACPI_FREE(obj);
+
+   return status;
+}
+
+/*
+ * _DSM wrapper function to locate the DPC port.
+ * @pdev, @handle: PCI device and its ACPI handle
+ *
+ * Returns the DPC port's pci_dev, or NULL on failure.
+ */
+struct pci_dev *acpi_locate_dpc_port(struct pci_dev *pdev, acpi_handle handle)
+{
+   union acpi_object *obj;
+   u16 port;
+
+   obj = acpi_evaluate_dsm(handle, &pci_acpi_dsm_guid, 1,
+   EDR_PORT_LOCATE_DSM, NULL);
+   if (!obj)
+   return pci_dev_get(pdev);
+
+   if (obj->type != ACPI_TYPE_INTEGER) {
+   ACPI_FREE(obj);
+   return NULL;
+   }
+
+   /*
+* Firmware returns DPC port BDF details in following format:
+*  15:8 = bus
+*  7:3 = device
+*  2:0 = function
+*/
+   port = obj->integer.value;
+
+   ACPI_FREE(obj);
+
+   return pci_get_domain_bus_and_slot(pci_domain_nr(pdev->bus),
+  PCI_BUS_NUM(port), port & 0xff);
+}
+
+/*
+ * _OST wrapper function to let firmware know the status of the EDR event.
+ * @pdev, @handle: PCI device and its ACPI handle
+ * @status: status of the EDR event
+ */
+int acpi_send_edr_status(struct pci_dev *pdev,  acpi_handle handle, u16 status)
+{
+   u32 ost_status;
+
+   pci_dbg(pdev, "Sending EDR status :%x\n", status);
+
+   ost_status =  PCI_DEVID(pdev->bus->number, pdev->devfn);
+   ost_status = (ost_status << 16) | status;
+
+   status = acpi_evaluate_ost(handle,
+  ACPI_NOTIFY_DISCONNECT_RECOVER,
+  ost_status, NULL);
+   if (ACPI_FAILURE(status))
+   return -EINVAL;
+
+   return 0;
+}
+
+#endif /* CONFIG_PCIE_DPC && CONFIG_ACPI */
+
 phys_addr_t acpi_pci_root_get_mcfg_addr(acpi_handle handle)
 {
acpi_status status = AE_NOT_EXIST;
diff --git a/drivers/pci/pcie/Kconfig b/drivers/pci/pcie/Kconfig
index 362eb8cfa53b..9a05015af7cd 100644
--- a/drivers/pci/pcie/Kconfig
+++ b/drivers/pci/pcie/Kconfig
@@ -150,3 +150,13 @@ config PCIE_BW
  This enables PCI Express Bandwidth Change Notification.  If
  you know link width or rate changes occur only to correct
  unreliable links, you may answer Y.
+
+config PCIE_EDR
+   bool "PCI Express Error Disconnect Recover support"
+   default n
+   depends on PCIE_DPC && ACPI
+   help
+ This option adds Error Disconnect Recover support as specified
+ in PCI firmware specification v3.2 Downstream Port Containment
+ Related Enhancements ECN. Enable this if you want to support hybrid
+ DPC 

[PATCH v7 2/8] PCI/DPC: Allow dpc_probe() even if firmware first mode is enabled

2019-08-23 Thread sathyanarayanan . kuppuswamy
From: Kuppuswamy Sathyanarayanan 

As per ACPI specification v6.3, sec 5.6.6, the Error Disconnect Recover
(EDR) notification is used by firmware to let the OS know about a DPC
event and to permit the OS to perform error recovery when processing the
EDR notification. Also, as per PCI firmware specification r3.2 Downstream
Port Containment Related Enhancements ECN, sec 4.5.1, table 4-6, if DPC
is controlled by firmware (firmware first mode), firmware is responsible
for initializing the Downstream Port Containment Extended Capability
Structures per firmware policy. And the OS is permitted to read or write
the DPC Control and Status registers of a port while processing an Error
Disconnect Recover (EDR) notification from firmware on that port.

Currently, if firmware controls DPC (firmware first mode), the OS will
not create/enumerate the DPC PCIe port service. But if the OS supports
the EDR feature, then, as mentioned in the spec references above, it
should permit enumeration of the DPC driver and also support handling
the ACPI EDR notification. So, as a first step, allow dpc_probe() to
continue even if firmware first mode is enabled. Also add appropriate
checks to ensure device registers are not modified outside the EDR
notification window in firmware first mode. This is a preparatory patch
for adding EDR support.

Signed-off-by: Kuppuswamy Sathyanarayanan 

Acked-by: Keith Busch 
---
 drivers/pci/pcie/dpc.c | 49 +++---
 1 file changed, 36 insertions(+), 13 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index a32ec3487a8d..9717fda012f8 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -22,6 +22,8 @@ struct dpc_dev {
u16 cap_pos;
bool    rp_extensions;
u8  rp_log_size;
+   /* Set to true if DPC is controlled by firmware */
+   bool    firmware_dpc;
 };
 
 static const char * const rp_pio_error_string[] = {
@@ -69,6 +71,9 @@ void pci_save_dpc_state(struct pci_dev *dev)
if (!dpc)
return;
 
+   if (dpc->firmware_dpc)
+   return;
+
save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC);
if (!save_state)
return;
@@ -90,6 +95,9 @@ void pci_restore_dpc_state(struct pci_dev *dev)
if (!dpc)
return;
 
+   if (dpc->firmware_dpc)
+   return;
+
save_state = pci_find_saved_ext_cap(dev, PCI_EXT_CAP_ID_DPC);
if (!save_state)
return;
@@ -291,9 +299,6 @@ static int dpc_probe(struct pcie_device *dev)
int status;
u16 ctl, cap;
 
-   if (pcie_aer_get_firmware_first(pdev))
-   return -ENOTSUPP;
-
dpc = devm_kzalloc(device, sizeof(*dpc), GFP_KERNEL);
if (!dpc)
return -ENOMEM;
@@ -302,13 +307,25 @@ static int dpc_probe(struct pcie_device *dev)
dpc->dev = dev;
set_service_data(dev, dpc);
 
-   status = devm_request_threaded_irq(device, dev->irq, dpc_irq,
-  dpc_handler, IRQF_SHARED,
-  "pcie-dpc", dpc);
-   if (status) {
-   pci_warn(pdev, "request IRQ%d failed: %d\n", dev->irq,
-status);
-   return status;
+   if (pcie_aer_get_firmware_first(pdev))
+   dpc->firmware_dpc = 1;
+
+   /*
+* If DPC is handled in firmware and ACPI support is not enabled
+* in the OS, skip the probe and return an error.
+*/
+   if (dpc->firmware_dpc && !IS_ENABLED(CONFIG_ACPI))
+   return -ENODEV;
+
+   if (!dpc->firmware_dpc) {
+   status = devm_request_threaded_irq(device, dev->irq, dpc_irq,
+  dpc_handler, IRQF_SHARED,
+  "pcie-dpc", dpc);
+   if (status) {
+   pci_warn(pdev, "request IRQ%d failed: %d\n", dev->irq,
+status);
+   return status;
+   }
}
 
pci_read_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CAP, &cap);
@@ -323,9 +340,12 @@ static int dpc_probe(struct pcie_device *dev)
dpc->rp_log_size = 0;
}
}
-
-   ctl = (ctl & 0xfff4) | PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN;
-   pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL, ctl);
+   if (!dpc->firmware_dpc) {
+   ctl = (ctl & 0xfff4) |
+   (PCI_EXP_DPC_CTL_EN_FATAL | PCI_EXP_DPC_CTL_INT_EN);
+   pci_write_config_word(pdev, dpc->cap_pos + PCI_EXP_DPC_CTL,
+ ctl);
+   }
 
pci_info(pdev, "error containment capabilities: Int Msg #%d, RPExt%c PoisonedTLP%c SwTrigger%c RP PIO Log %d, DL_ActiveErr%c\n",
 cap & PCI_EXP_DPC_IRQ, FLAG(cap, PCI_EXP_DPC_CAP_RP_EXT),
@@ -343,6 +363,9
