date:20211004

Re: [PATCH v3 6/8] PCI/AER: Clear error device AER registers in aer_irq()

2021-10-04 Thread kernel test robot

Hi Naveen,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on helgaas-pci/next]
[also build test WARNING on linux/master linus/master v5.15-rc3 next-20210921]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Naveen-Naidu/Fix-long-standing-AER-Error-Handling-Issues/20211004-223758
base:   https://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git next
config: x86_64-randconfig-a006-20211004 (attached as .config)
compiler: clang version 14.0.0 (https://github.com/llvm/llvm-project 
c0039de2953d15815448b4b3c3bafb45607781e0)
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/ab727f8771a49bb5f4c7bbf10dc9ba9b90a454cf
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Naveen-Naidu/Fix-long-standing-AER-Error-Handling-Issues/20211004-223758
git checkout ab727f8771a49bb5f4c7bbf10dc9ba9b90a454cf
# save the attached .config to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross W=1 
O=build_dir ARCH=x86_64 SHELL=/bin/bash drivers/pci/pcie/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

   drivers/pci/pcie/aer.c:822: warning: Function parameter or member 'e_dev' 
not described in 'add_error_device'
   drivers/pci/pcie/aer.c:822: warning: Excess function parameter 'e_info' 
description in 'add_error_device'
   drivers/pci/pcie/aer.c:923: warning: Function parameter or member 'e_dev' 
not described in 'find_source_device'
   drivers/pci/pcie/aer.c:923: warning: Excess function parameter 'e_info' 
description in 'find_source_device'
>> drivers/pci/pcie/aer.c:1172: warning: expecting prototype for 
>> aer_find_corr_error_source_device(). Prototype was for 
>> aer_find_uncorr_error_source_device() instead


vim +1172 drivers/pci/pcie/aer.c

  1156  
  1157  /**
  1158   * aer_find_corr_error_source_device - find the error source which 
detected the uncorrected error
  1159   * @rp: pointer to Root Port pci_dev data structure
  1160   * @e_src: pointer to an error source
  1161   * @e_info: including detailed error information such like id
  1162   *
  1163   * Return true if found.
  1164   *
  1165   * Process the error information received at the Root Port, set these 
values
  1166   * in the aer_devices_err_info and find all the devices that are 
related to
  1167   * the error.
  1168   */
  1169  static bool aer_find_uncorr_error_source_device(struct pci_dev *rp,
  1170  struct aer_err_source *e_src,
  1171  struct aer_devices_err_info 
*e_info)
> 1172  {
  1173  if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
  1174  e_info->err_info.id = ERR_UNCOR_ID(e_src->id);
  1175  
  1176  if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
  1177  e_info->err_info.severity = AER_FATAL;
  1178  else
  1179  e_info->err_info.severity = AER_NONFATAL;
  1180  
  1181  if (e_src->status & PCI_ERR_ROOT_MULTI_UNCOR_RCV)
  1182  e_info->err_info.multi_error_valid = 1;
  1183  else
  1184  e_info->err_info.multi_error_valid = 0;
  1185  
  1186  if (!find_source_device(rp, e_info))
  1187  return false;
  1188  }
  1189  
  1190  return true;
  1191  }
  1192  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip

Re: [PATCH 2/3] Remove 256MB limit restriction for boot cpu paca allocation

2021-10-04 Thread kernel test robot

Hi Sourabh,

Thank you for the patch! Yet something to improve:

[auto build test ERROR on powerpc/next]
[also build test ERROR on linux/master linus/master v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Sourabh-Jain/Update-crashkernel-offset-to-allow-kernel-to-boot-on-large-config-LPARs/20211004-233345
base:   https://git.kernel.org/pub/scm/linux/kernel/git/powerpc/linux.git next
config: powerpc-buildonly-randconfig-r004-20211004 (attached as .config)
compiler: powerpc64-linux-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/563e715d022b3fab0f1791f64c3944aa34d20f04
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Sourabh-Jain/Update-crashkernel-offset-to-allow-kernel-to-boot-on-large-config-LPARs/20211004-233345
git checkout 563e715d022b3fab0f1791f64c3944aa34d20f04
# save the attached .config to linux build tree
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross 
ARCH=powerpc 

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All errors (new ones prefixed by >>):

   arch/powerpc/kernel/prom.c: In function 'early_init_dt_scan_cpus':
>> arch/powerpc/kernel/prom.c:389:17: error: implicit declaration of function 
>> 'hash__early_detect_seg_size' [-Werror=implicit-function-declaration]
 389 | hash__early_detect_seg_size();
 | ^~~
   cc1: some warnings being treated as errors


vim +/hash__early_detect_seg_size +389 arch/powerpc/kernel/prom.c

   307  
   308  static int __init early_init_dt_scan_cpus(unsigned long node,
   309const char *uname, int depth,
   310void *data)
   311  {
   312  const char *type = of_get_flat_dt_prop(node, "device_type", 
NULL);
   313  const __be32 *prop;
   314  const __be32 *intserv;
   315  int i, nthreads;
   316  int len;
   317  int found = -1;
   318  int found_thread = 0;
   319  
   320  /* We are scanning "cpu" nodes only */
   321  if (type == NULL || strcmp(type, "cpu") != 0)
   322  return 0;
   323  
   324  /* Get physical cpuid */
   325  intserv = of_get_flat_dt_prop(node, 
"ibm,ppc-interrupt-server#s", &len);
   326  if (!intserv)
   327  intserv = of_get_flat_dt_prop(node, "reg", &len);
   328  
   329  nthreads = len / sizeof(int);
   330  
   331  /*
   332   * Now see if any of these threads match our boot cpu.
   333   * NOTE: This must match the parsing done in smp_setup_cpu_maps.
   334   */
   335  for (i = 0; i < nthreads; i++) {
   336  if (be32_to_cpu(intserv[i]) ==
   337  fdt_boot_cpuid_phys(initial_boot_params)) {
   338  found = boot_cpu_count;
   339  found_thread = i;
   340  }
   341  #ifdef CONFIG_SMP
   342  /* logical cpu id is always 0 on UP kernels */
   343  boot_cpu_count++;
   344  #endif
   345  }
   346  
   347  /* Not the boot CPU */
   348  if (found < 0)
   349  return 0;
   350  
   351  DBG("boot cpu: logical %d physical %d\n", found,
   352  be32_to_cpu(intserv[found_thread]));
   353  boot_cpuid = found;
   354  
   355  /*
   356   * PAPR defines "logical" PVR values for cpus that
   357   * meet various levels of the architecture:
   358   * 0x0f01   Architecture version 2.04
   359   * 0x0f02   Architecture version 2.05
   360   * If the cpu-version property in the cpu node contains
   361   * such a value, we call identify_cpu again with the
   362   * logical PVR value in order to use the cpu feature
   363   * bits appropriate for the architecture level.
   364   *
   365   * A POWER6 partition in "POWER6 architected" mode
   366   * uses the 0x0f02 PVR value; in POWER5+ mode
   367   * it uses 0x0f01.
   368   *
   369   * If we're using device tree CPU feature discovery then we 
don't
   370   * support the cpu-version property, and it's the 
responsib

Re: [PATCH 1/3] fixup mmu_features immediately after getting cpu pa features.

2021-10-04 Thread Mahesh J Salgaonkar

On 2021-10-04 21:02:21 Mon, Aneesh Kumar K.V wrote:
> On 10/4/21 20:41, Sourabh Jain wrote:
> > From: Mahesh Salgaonkar 
> > 
> > On system with radix support available, early_radix_enabled() starts
> > returning true for a small window (until mmu_early_init_devtree() is
> > called) even when radix mode disabled on kernel command line. This causes
> > ppc64_bolted_size() to return ULONG_MAX in HPT mode instead of supported
> > segment size, during boot cpu paca allocation.
> > 
> > With kernel command line = "... disable_radix":
> > 
> > early_init_devtree:   <- early_radix_enabled() = false
> >early_init_dt_scan_cpus:   <- early_radix_enabled() = false
> >...
> >check_cpu_pa_features: <- early_radix_enabled() = 
> > false
> >...  ^ <- early_radix_enabled() = TRUE
> >allocate_paca:   | <- early_radix_enabled() = 
> > TRUE
> >...   |
> >ppc64_bolted_size:   | <- early_radix_enabled() = 
> > TRUE
> >if (early_radix_enabled())| <- early_radix_enabled() = TRUE
> >return ULONG_MAX; |
> >...   |
> >...  | <- early_radix_enabled() = 
> > TRUE
> >...  | <- early_radix_enabled() = 
> > TRUE
> >mmu_early_init_devtree()  V
> >...<- early_radix_enabled() = 
> > false
> > 
> > So far we have not seen any issue because allocate_paca() takes minimum of
> > ppc64_bolted_size and rma_size while allocating paca. However it is better
> > to close this window by fixing up the mmu features as early as possible.
> > This fixes early_radix_enabled() and ppc64_bolted_size() to return valid
> > values in radix disable mode. This patch will help subsequent patch to
> > depend on early_radix_enabled() check while detecting supported segment
> > size in HPT mode.
> > 
> > Signed-off-by: Mahesh Salgaonkar 
> > Signed-off-by: Sourabh Jain 
> > Reported-and-tested-by: Abdul haleem 
> > ---
> >   arch/powerpc/include/asm/book3s/64/mmu.h | 1 +
> >   arch/powerpc/include/asm/mmu.h   | 1 +
> >   arch/powerpc/kernel/prom.c   | 1 +
> >   arch/powerpc/mm/init_64.c| 5 -
> >   4 files changed, 7 insertions(+), 1 deletion(-)
> > 
> > diff --git a/arch/powerpc/include/asm/book3s/64/mmu.h 
> > b/arch/powerpc/include/asm/book3s/64/mmu.h
> > index c02f42d1031e..69a89fa1330d 100644
> > --- a/arch/powerpc/include/asm/book3s/64/mmu.h
> > +++ b/arch/powerpc/include/asm/book3s/64/mmu.h
> > @@ -197,6 +197,7 @@ extern int mmu_vmemmap_psize;
> >   extern int mmu_io_psize;
> >   /* MMU initialization */
> > +void mmu_cpu_feature_fixup(void);
> >   void mmu_early_init_devtree(void);
> >   void hash__early_init_devtree(void);
> >   void radix__early_init_devtree(void);
> > diff --git a/arch/powerpc/include/asm/mmu.h b/arch/powerpc/include/asm/mmu.h
> > index 8abe8e42e045..c8eafd401fe9 100644
> > --- a/arch/powerpc/include/asm/mmu.h
> > +++ b/arch/powerpc/include/asm/mmu.h
> > @@ -401,6 +401,7 @@ extern void early_init_mmu(void);
> >   extern void early_init_mmu_secondary(void);
> >   extern void setup_initial_memory_limit(phys_addr_t first_memblock_base,
> >phys_addr_t first_memblock_size);
> > +static inline void mmu_cpu_feature_fixup(void) { }
> >   static inline void mmu_early_init_devtree(void) { }
> >   static inline void pkey_early_init_devtree(void) {}
> > diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
> > index 2e67588f6f6e..1727a3abe6c1 100644
> > --- a/arch/powerpc/kernel/prom.c
> > +++ b/arch/powerpc/kernel/prom.c
> > @@ -380,6 +380,7 @@ static int __init early_init_dt_scan_cpus(unsigned long 
> > node,
> > check_cpu_pa_features(node);
> > }
> > +   mmu_cpu_feature_fixup();
> 
> can you do that call inside check_cpu_pa_features? or is it because we have
> the same issue with baremetal platforms?

Yup same issue exist on baremetal as well in case of dt_cpu_ftrs_in_use
is true. Hence calling it after the if (!dt_cpu_ftrs_in_use) code block
takes care of both pseries and baremetal platforms.

> 
> Can we also rename this to indicate we are sanitizing the feature flag based
> on kernel command line.  Something like
> 
> /* Update cpu features based on kernel command line */
> update_cpu_features();

Sure will do.

Thanks for your review.
-Mahesh.

Re: [V2 1/4] powerpc/perf: Refactor the code definition of perf reg extended mask

2021-10-04 Thread Michael Ellerman

Athira Rajeev  writes:
> PERF_REG_PMU_MASK_300 and PERF_REG_PMU_MASK_31 defines the mask
> value for extended registers. Current definition of these mask values
> uses hex constant and does not use registers by name, making it less
> readable. Patch refactor the macro values by or'ing together the actual
> register value constants. Also include PERF_REG_EXTENDED_MAX as
> part of enum definition.
>
> Suggested-by: Michael Ellerman 
> Signed-off-by: Athira Rajeev 
> ---
>  arch/powerpc/include/uapi/asm/perf_regs.h | 21 +
>  1 file changed, 13 insertions(+), 8 deletions(-)
>
> diff --git a/arch/powerpc/include/uapi/asm/perf_regs.h 
> b/arch/powerpc/include/uapi/asm/perf_regs.h
> index 578b3ee86105..fb1d8a9b4393 100644
> --- a/arch/powerpc/include/uapi/asm/perf_regs.h
> +++ b/arch/powerpc/include/uapi/asm/perf_regs.h
> @@ -61,27 +61,32 @@ enum perf_event_powerpc_regs {
>   PERF_REG_POWERPC_PMC4,
>   PERF_REG_POWERPC_PMC5,
>   PERF_REG_POWERPC_PMC6,
> - /* Max regs without the extended regs */
> + /* Max mask value for interrupt regs w/o extended regs */
>   PERF_REG_POWERPC_MAX = PERF_REG_POWERPC_MMCRA + 1,
> + /* Max mask value for interrupt regs including extended regs */
> + PERF_REG_EXTENDED_MAX = PERF_REG_POWERPC_PMC6 + 1,
>  };
>  
>  #define PERF_REG_PMU_MASK((1ULL << PERF_REG_POWERPC_MAX) - 1)
>  
> -/* Exclude MMCR3, SIER2, SIER3 for CPU_FTR_ARCH_300 */
> -#define  PERF_EXCLUDE_REG_EXT_300(7ULL << PERF_REG_POWERPC_MMCR3)
> -
>  /*
>   * PERF_REG_EXTENDED_MASK value for CPU_FTR_ARCH_300
>   * includes 9 SPRS from MMCR0 to PMC6 excluding the
> - * unsupported SPRS in PERF_EXCLUDE_REG_EXT_300.
> + * unsupported SPRS MMCR3, SIER2 and SIER3.
>   */
> -#define PERF_REG_PMU_MASK_300   ((0xfffULL << PERF_REG_POWERPC_MMCR0) - 
> PERF_EXCLUDE_REG_EXT_300)
> +#define PERF_REG_PMU_MASK_300\
> + ((1ul << PERF_REG_POWERPC_MMCR0) | (1ul << PERF_REG_POWERPC_MMCR1) | \
> + (1ul << PERF_REG_POWERPC_MMCR2) | (1ul << PERF_REG_POWERPC_PMC1) | \
> + (1ul << PERF_REG_POWERPC_PMC2) | (1ul << PERF_REG_POWERPC_PMC3) | \
> + (1ul << PERF_REG_POWERPC_PMC4) | (1ul << PERF_REG_POWERPC_PMC5) | \
> + (1ul << PERF_REG_POWERPC_PMC6))

These all need to be unsigned long long. Otherwise when building on big
endian (which defaults to 32-bit), we see errors such as:

  In file included from 
/home/michael/linux/tools/perf/arch/powerpc/include/perf_regs.h:7:0,
   from arch/powerpc/util/../../../util/perf_regs.h:30,
   from arch/powerpc/util/perf_regs.c:7:
  arch/powerpc/util/perf_regs.c: In function ‘arch__intr_reg_mask’:
  /home/michael/linux/tools/arch/powerpc/include/uapi/asm/perf_regs.h:78:8: 
error: left shift count >= width of type [-Werror=shift-count-overflow]
((1ul << PERF_REG_POWERPC_MMCR0) | (1ul << PERF_REG_POWERPC_MMCR1) | \
  ^
  arch/powerpc/util/perf_regs.c:206:19: note: in expansion of macro 
‘PERF_REG_PMU_MASK_300’
 extended_mask = PERF_REG_PMU_MASK_300;
 ^

cheers

Re: [PATCH 3/9] powerpc/bpf: Remove unused SEEN_STACK

2021-10-04 Thread Christophe Leroy





Le 04/10/2021 à 20:11, Naveen N. Rao a écrit :

Christophe Leroy wrote:



Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :

From: Ravi Bangoria 

SEEN_STACK is unused on PowerPC. Remove it. Also, have
SEEN_TAILCALL use 0x4000.


Why change SEEN_TAILCALL ? Would it be a problem to leave it as is ?



Signed-off-by: Ravi Bangoria 
Reviewed-by: Christophe Leroy 


I prefer the bit usage to be contiguous. Changing SEEN_TAILCALL isn't a 
problem either.




Well you are adding SEEN_BIG_PROG in following patch so it would still 
be contiguous at the end.


I don't really mind but I thought it would be less churn to just leave 
SEEN_TAILCALL as is and re-use 0x4000 for SEEN_BIG_PROG.


Anyway

Reviewed-by: Christophe Leroy

Re: [PATCH 4/9] powerpc/bpf: Handle large branch ranges with BPF_EXIT

2021-10-04 Thread Christophe Leroy





Le 04/10/2021 à 20:24, Naveen N. Rao a écrit :

Christophe Leroy wrote:



Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :

In some scenarios, it is possible that the program epilogue is outside
the branch range for a BPF_EXIT instruction. Instead of rejecting such
programs, emit an indirect branch. We track the size of the bpf program
emitted after the initial run and do a second pass since BPF_EXIT can
end up emitting different number of instructions depending on the
program size.

Suggested-by: Jordan Niethe 
Signed-off-by: Naveen N. Rao 
---
  arch/powerpc/net/bpf_jit.h    |  3 +++
  arch/powerpc/net/bpf_jit_comp.c   | 22 +-
  arch/powerpc/net/bpf_jit_comp32.c |  2 +-
  arch/powerpc/net/bpf_jit_comp64.c |  2 +-
  4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 89bd744c2bffd4..4023de1698b9f5 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -126,6 +126,7 @@
  #define SEEN_FUNC    0x2000 /* might call external helpers */
  #define SEEN_TAILCALL    0x4000 /* uses tail calls */
+#define SEEN_BIG_PROG    0x8000 /* large prog, >32MB */
  #define SEEN_VREG_MASK    0x1ff8 /* Volatile registers r3-r12 */
  #define SEEN_NVREG_MASK    0x0003 /* Non volatile registers 
r14-r31 */
@@ -179,6 +180,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image, struct codegen_context *

  void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
  void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
  void bpf_jit_realloc_regs(struct codegen_context *ctx);
+int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx,
+    int tmp_reg, unsigned long exit_addr);
  #endif
diff --git a/arch/powerpc/net/bpf_jit_comp.c 
b/arch/powerpc/net/bpf_jit_comp.c

index fcbf7a917c566e..3204872fbf2738 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -72,6 +72,21 @@ static int bpf_jit_fixup_subprog_calls(struct 
bpf_prog *fp, u32 *image,

  return 0;
  }
+int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx,
+    int tmp_reg, unsigned long exit_addr)
+{
+    if (!(ctx->seen & SEEN_BIG_PROG) && 
is_offset_in_branch_range(exit_addr)) {

+    PPC_JMP(exit_addr);
+    } else {
+    ctx->seen |= SEEN_BIG_PROG;
+    PPC_FUNC_ADDR(tmp_reg, (unsigned long)image + exit_addr);
+    EMIT(PPC_RAW_MTCTR(tmp_reg));
+    EMIT(PPC_RAW_BCTR());
+    }
+
+    return 0;
+}
+
  struct powerpc64_jit_data {
  struct bpf_binary_header *header;
  u32 *addrs;
@@ -155,12 +170,17 @@ struct bpf_prog *bpf_int_jit_compile(struct 
bpf_prog *fp)

  goto out_addrs;
  }
+    if (!is_offset_in_branch_range((long)cgctx.idx * 4))
+    cgctx.seen |= SEEN_BIG_PROG;
+
  /*
   * If we have seen a tail call, we need a second pass.
   * This is because bpf_jit_emit_common_epilogue() is called
   * from bpf_jit_emit_tail_call() with a not yet stable ctx->seen.
+ * We also need a second pass if we ended up with too large
+ * a program so as to fix branches.
   */
-    if (cgctx.seen & SEEN_TAILCALL) {
+    if (cgctx.seen & (SEEN_TAILCALL | SEEN_BIG_PROG)) {
  cgctx.idx = 0;
  if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
  fp = org_fp;
diff --git a/arch/powerpc/net/bpf_jit_comp32.c 
b/arch/powerpc/net/bpf_jit_comp32.c

index a74d52204f8da2..d2a67574a23066 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -852,7 +852,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image, struct codegen_context *

   * we'll just fall through to the epilogue.
   */
  if (i != flen - 1)
-    PPC_JMP(exit_addr);
+    bpf_jit_emit_exit_insn(image, ctx, tmp_reg, exit_addr);


On ppc32, if you use tmp_reg you must flag it. But I think you could 
use r0 instead.


Indeed. Can we drop tracking of the temp registers and using them while
remapping registers? Are you seeing significant benefits with re-use of 
those temp registers?




I'm not sure to follow you.

On ppc32, all volatile registers are used for function arguments, so 
temp registers are necessarily non-volatile so we track them as all 
non-volatile registers we use.


I think saving on stack only the non-volatile registers we use provides 
real benefit, otherwise you wouldn't have implemented it would you ?


Anyway here you should use _R0 instead of tmp_reg.

Christophe

Re: [PATCH 6/9] powerpc/bpf: Fix BPF_SUB when imm == 0x80000000

2021-10-04 Thread Christophe Leroy





Le 04/10/2021 à 20:18, Naveen N. Rao a écrit :

Christophe Leroy wrote:



Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :

We aren't handling subtraction involving an immediate value of
0x8000 properly. Fix the same.

Fixes: 156d0e290e969c ("powerpc/ebpf/jit: Implement JIT compiler for 
extended BPF")

Signed-off-by: Naveen N. Rao 
---
  arch/powerpc/net/bpf_jit_comp64.c | 16 
  1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c

index ffb7a2877a8469..4641a50e82d50d 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -333,15 +333,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image, struct codegen_context *

  case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */
  case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */
  case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */
-    if (BPF_OP(code) == BPF_SUB)
-    imm = -imm;
-    if (imm) {
-    if (imm >= -32768 && imm < 32768)
-    EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, IMM_L(imm)));
-    else {
-    PPC_LI32(b2p[TMP_REG_1], imm);
+    if (imm > -32768 && imm < 32768) {
+    EMIT(PPC_RAW_ADDI(dst_reg, dst_reg,
+    BPF_OP(code) == BPF_SUB ? IMM_L(-imm) : 
IMM_L(imm)));

+    } else {
+    PPC_LI32(b2p[TMP_REG_1], imm);
+    if (BPF_OP(code) == BPF_SUB)
+    EMIT(PPC_RAW_SUB(dst_reg, dst_reg, 
b2p[TMP_REG_1]));

+    else
  EMIT(PPC_RAW_ADD(dst_reg, dst_reg, 
b2p[TMP_REG_1]));

-    }
  }
  goto bpf_alu32_trunc;


There is now so few code common to both BPF_ADD and BPF_SUB that you 
should make them different cases.


While at it, why not also use ADDIS if imm is 32 bits ? That would be 
an ADDIS/ADDI instead of LIS/ORI/ADD


Sure. I wanted to limit the change for this fix. We can do a separate 
patch to optimize code generation for BPF_ADD.




Sure, this second part was just a thought, I agree it should be another 
patch.


My main comment here is to split stuff and make it a different case, I 
don't think it increases the change much, and IMO it is easier to read:


diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c

index ffb7a2877a84..39226d88c558 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -330,11 +330,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image, struct codegen_context *

EMIT(PPC_RAW_SUB(dst_reg, dst_reg, src_reg));
goto bpf_alu32_trunc;
case BPF_ALU | BPF_ADD | BPF_K: /* (u32) dst += (u32) imm */
-   case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */
case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */
-   case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */
-   if (BPF_OP(code) == BPF_SUB)
-   imm = -imm;
if (imm) {
if (imm >= -32768 && imm < 32768)
EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, 
IMM_L(imm)));
@@ -344,6 +340,17 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 
*image, struct codegen_context *

}
}
goto bpf_alu32_trunc;
+   case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */
+   case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */
+   if (imm) {
+   if (-imm >= -32768 && -imm < 32768) {
+   EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, 
IMM_L(-imm)));
+   } else {
+   PPC_LI32(b2p[TMP_REG_1], imm);
+   EMIT(PPC_RAW_SUB(dst_reg, dst_reg, 
b2p[TMP_REG_1]));
+   }
+   }
+   goto bpf_alu32_trunc;
case BPF_ALU | BPF_MUL | BPF_X: /* (u32) dst *= (u32) src */
case BPF_ALU64 | BPF_MUL | BPF_X: /* dst *= src */
if (BPF_CLASS(code) == BPF_ALU)

Re: [PATCH v3 6/8] PCI/AER: Clear error device AER registers in aer_irq()

2021-10-04 Thread kernel test robot

Hi Naveen,

Thank you for the patch! Perhaps something to improve:

[auto build test WARNING on helgaas-pci/next]
[also build test WARNING on linux/master linus/master v5.15-rc3 next-20210922]
[If your patch is applied to the wrong git tree, kindly drop us a note.
And when submitting patch, we suggest to use '--base' as documented in
https://git-scm.com/docs/git-format-patch]

url:
https://github.com/0day-ci/linux/commits/Naveen-Naidu/Fix-long-standing-AER-Error-Handling-Issues/20211004-223758
base:   https://git.kernel.org/pub/scm/linux/kernel/git/helgaas/pci.git next
config: arc-allyesconfig (attached as .config)
compiler: arceb-elf-gcc (GCC) 11.2.0
reproduce (this is a W=1 build):
wget 
https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O 
~/bin/make.cross
chmod +x ~/bin/make.cross
# 
https://github.com/0day-ci/linux/commit/ab727f8771a49bb5f4c7bbf10dc9ba9b90a454cf
git remote add linux-review https://github.com/0day-ci/linux
git fetch --no-tags linux-review 
Naveen-Naidu/Fix-long-standing-AER-Error-Handling-Issues/20211004-223758
git checkout ab727f8771a49bb5f4c7bbf10dc9ba9b90a454cf
# save the attached .config to linux build tree
mkdir build_dir
COMPILER_INSTALL_PATH=$HOME/0day COMPILER=gcc-11.2.0 make.cross 
O=build_dir ARCH=arc SHELL=/bin/bash drivers/pci/pcie/ drivers/usb/dwc3/

If you fix the issue, kindly add following tag as appropriate
Reported-by: kernel test robot 

All warnings (new ones prefixed by >>):

   drivers/pci/pcie/aer.c:822: warning: Function parameter or member 'e_dev' 
not described in 'add_error_device'
   drivers/pci/pcie/aer.c:822: warning: Excess function parameter 'e_info' 
description in 'add_error_device'
   drivers/pci/pcie/aer.c:923: warning: Function parameter or member 'e_dev' 
not described in 'find_source_device'
   drivers/pci/pcie/aer.c:923: warning: Excess function parameter 'e_info' 
description in 'find_source_device'
>> drivers/pci/pcie/aer.c:1172: warning: expecting prototype for 
>> aer_find_corr_error_source_device(). Prototype was for 
>> aer_find_uncorr_error_source_device() instead


vim +1172 drivers/pci/pcie/aer.c

  1156  
  1157  /**
  1158   * aer_find_corr_error_source_device - find the error source which 
detected the uncorrected error
  1159   * @rp: pointer to Root Port pci_dev data structure
  1160   * @e_src: pointer to an error source
  1161   * @e_info: including detailed error information such like id
  1162   *
  1163   * Return true if found.
  1164   *
  1165   * Process the error information received at the Root Port, set these 
values
  1166   * in the aer_devices_err_info and find all the devices that are 
related to
  1167   * the error.
  1168   */
  1169  static bool aer_find_uncorr_error_source_device(struct pci_dev *rp,
  1170  struct aer_err_source *e_src,
  1171  struct aer_devices_err_info 
*e_info)
> 1172  {
  1173  if (e_src->status & PCI_ERR_ROOT_UNCOR_RCV) {
  1174  e_info->err_info.id = ERR_UNCOR_ID(e_src->id);
  1175  
  1176  if (e_src->status & PCI_ERR_ROOT_FATAL_RCV)
  1177  e_info->err_info.severity = AER_FATAL;
  1178  else
  1179  e_info->err_info.severity = AER_NONFATAL;
  1180  
  1181  if (e_src->status & PCI_ERR_ROOT_MULTI_UNCOR_RCV)
  1182  e_info->err_info.multi_error_valid = 1;
  1183  else
  1184  e_info->err_info.multi_error_valid = 0;
  1185  
  1186  if (!find_source_device(rp, e_info))
  1187  return false;
  1188  }
  1189  
  1190  return true;
  1191  }
  1192  

---
0-DAY CI Kernel Test Service, Intel Corporation
https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org


.config.gz
Description: application/gzip

Re: [RFC PATCH 4/8] powerpc: add CPU field to struct thread_info

2021-10-04 Thread Michael Ellerman

Kees Cook  writes:
> On Thu, Sep 30, 2021 at 08:46:04AM +1000, Michael Ellerman wrote:
>> Ard Biesheuvel  writes:
>> > On Tue, 28 Sept 2021 at 02:16, Michael Ellerman  
>> > wrote:
>> >>
>> >> Michael Ellerman  writes:
>> >> > Ard Biesheuvel  writes:
>> >> >> On Tue, 14 Sept 2021 at 14:11, Ard Biesheuvel  wrote:
>> >> >>>
>> >> >>> The CPU field will be moved back into thread_info even when
>> >> >>> THREAD_INFO_IN_TASK is enabled, so add it back to powerpc's definition
>> >> >>> of struct thread_info.
>> >> >>>
>> >> >>> Signed-off-by: Ard Biesheuvel 
>> >> >>
>> >> >> Michael,
>> >> >>
>> >> >> Do you have any objections or issues with this patch or the subsequent
>> >> >> ones cleaning up the task CPU kludge for ppc32? Christophe indicated
>> >> >> that he was happy with it.
>> >> >
>> >> > No objections, it looks good to me, thanks for cleaning up that horror 
>> >> > :)
>> >> >
>> >> > It didn't apply cleanly to master so I haven't tested it at all, if you 
>> >> > can point me at a
>> >> > git tree with the dependencies I'd be happy to run some tests over it.
>> >>
>> >> Actually I realised I can just drop the last patch.
>> >>
>> >> So that looks fine, passes my standard quick build & boot on qemu tests,
>> >> and builds with/without stack protector enabled.
>> >>
>> >
>> > Thanks.
>> >
>> > Do you have any opinion on how this series should be merged? Kees Cook
>> > is willing to take them via his cross-arch tree, or you could carry
>> > them if you prefer. Taking it via multiple trees at the same time is
>> > going to be tricky, or take two cycles, with I'd prefer to avoid.
>> 
>> I don't really mind. If Kees is happy to take it then that's OK by me.
>> 
>> If Kees put the series in a topic branch based off rc2 then I could
>> merge that, and avoid any conflicts.
>
> I've created:
>
> git://git.kernel.org/pub/scm/linux/kernel/git/kees/linux.git 
> for-next/thread_info/cpu
>
> it includes a --no-ff merge commit, which I'm not sure is desirable? Let
> me know if I should adjust this, or if Linus will yell about this if I
> send him a PR containing a merge commit? I'm not sure what's right here.

It looks good to me.

I don't think Linus will be bothered about that merge. It has useful
information, ie. explains why you're merging it and that arch
maintainers have acked it, and quotes Ard's cover letter.

cheers

Re: [PATCH v6 07/11] PCI: Replace pci_dev::driver usage that gets the driver name

2021-10-04 Thread Ido Schimmel

On Mon, Oct 04, 2021 at 02:59:31PM +0200, Uwe Kleine-König wrote:
> struct pci_dev::driver holds (apart from a constant offset) the same
> data as struct pci_dev::dev->driver. With the goal to remove struct
> pci_dev::driver to get rid of data duplication replace getting the
> driver name by dev_driver_string() which implicitly makes use of struct
> pci_dev::dev->driver.
> 
> Acked-by: Simon Horman  (for NFP)
> Signed-off-by: Uwe Kleine-König 

For mlxsw:

Reviewed-by: Ido Schimmel 
Tested-by: Ido Schimmel 

Tested with the kexec flow that I mentioned last time. Works fine now.

Thanks

[PATCH 1/6] PCI/AER: Enable COR/UNCOR error reporting in set_device_error_reporting()

2021-10-04 Thread Naveen Naidu

The (PCIe r5.0, sec 7.6.4.3, Table 7-101) and  (PCIe r5.0, sec 7.8.4.6,
Table 7-104) states that the default values for the Uncorrectable Error
Mask and Correctable Error Mask should be 0b. But the current code does
not set the default value of these registers when the PCIe bus loads the
AER service driver.

Enable reporting of all correctable and uncorrectable errors during
aer_probe()

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/aer.c | 13 -
 1 file changed, 12 insertions(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9784fdcf3006..88c4ca6098fb 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -1212,6 +1212,7 @@ static int set_device_error_reporting(struct pci_dev 
*dev, void *data)
 {
bool enable = *((bool *)data);
int type = pci_pcie_type(dev);
+   int aer = dev->aer_cap;
 
if ((type == PCI_EXP_TYPE_ROOT_PORT) ||
(type == PCI_EXP_TYPE_RC_EC) ||
@@ -1223,8 +1224,18 @@ static int set_device_error_reporting(struct pci_dev 
*dev, void *data)
pci_disable_pcie_error_reporting(dev);
}
 
-   if (enable)
+   if (enable) {
+
+   /* Enable reporting of all uncorrectable errors */
+   /* Uncorrectable Error Mask - turned on bits disable errors */
+   pci_write_config_dword(dev, aer + PCI_ERR_UNCOR_MASK, 0);
+
+   /* Enable reporting of all correctable errors */
+   /* Correctable Error Mask - turned on bits disable errors */
+   pci_write_config_dword(dev, aer + PCI_ERR_COR_MASK, 0);
+
pcie_set_ecrc_checking(dev);
+   }
 
return 0;
 }
-- 
2.25.1

[PATCH 0/6] MIPS: OCTEON: Remove redundant AER code

2021-10-04 Thread Naveen Naidu

e8635b484f64 ("MIPS: Add Cavium OCTEON PCI support.") added MIPS
specific code to enable PCIe and AER error reporting (*irrespective
of CONFIG_PCIEAER value*) because PCI core didn't do that at the time.

But currently, the PCI core clears and enables the AER status registers.
So it's redundant for octeon code to do so. This patch series removes
the redundant code from the pci-octeon.c

Currently, the correctable and uncorrectable AER mask registers are not
set to their default value when AER service driver is loaded. This
defect is also fixed in the "[PATCH 1/6]" in the series.

Please note that "Patch 4/6" is dependent on "Patch 1/6".

Thanks,
Naveen Naidu

Naveen Naidu (6):
 [PATCH 1/6] PCI/AER: Enable COR/UNCOR error reporting in 
set_device_error_reporting()
 [PATCH 2/6] MIPS: OCTEON: Remove redundant clearing of AER status registers
 [PATCH 3/6] MIPS: OCTEON: Remove redundant enable of PCIe normal error 
reporting
 [PATCH 4/6] MIPS: OCTEON: Remove redundant enable of COR/UNCOR error
 [PATCH 5/6] MIPS: OCTEON: Remove redundant ECRC Generation Enable
 [PATCH 6/6] MIPS: OCTEON: Remove redundant enable of RP error reporting

 arch/mips/pci/pci-octeon.c | 50 --
 drivers/pci/pcie/aer.c | 13 +-
 2 files changed, 12 insertions(+), 51 deletions(-)

-- 
2.25.1

[PATCH v3 8/8] PCI/AER: Include DEVCTL in aer_print_error()

2021-10-04 Thread Naveen Naidu

Print the contents of Device Control Register of the device which
detected the error. This might help in faster error diagnosis.

Sample output from dummy error injected by aer-inject:

  pcieport :00:03.0: AER: Corrected error received: :00:03.0
  pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, (Receiver)
  pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000, devctl=0x000f
  pcieport :00:03.0:[ 6] BadTLP

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pci.h  |  2 ++
 drivers/pci/pcie/aer.c | 10 --
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index eb88d8bfeaf7..48ed7f91113b 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -437,6 +437,8 @@ struct aer_err_info {
u32 status; /* COR/UNCOR Error Status */
u32 mask;   /* COR/UNCOR Error Mask */
struct aer_header_log_regs tlp; /* TLP Header */
+
+   u16 devctl;
 };
 
 /* Preliminary AER error information processed from Root port */
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 91f91d6ab052..42cae01b6887 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -729,8 +729,8 @@ void aer_print_error(struct pci_dev *dev, struct 
aer_err_info *info)
   aer_error_severity_string[info->severity],
   aer_error_layer[layer], aer_agent_string[agent]);
 
-   pci_printk(level, dev, "  device [%04x:%04x] error 
status/mask=%08x/%08x\n",
-  dev->vendor, dev->device, info->status, info->mask);
+   pci_printk(level, dev, "  device [%04x:%04x] error 
status/mask=%08x/%08x, devctl=%#06x\n",
+  dev->vendor, dev->device, info->status, info->mask, 
info->devctl);
 
__aer_print_error(dev, info);
 
@@ -1083,6 +1083,12 @@ int aer_get_device_error_info(struct pci_dev *dev, 
struct aer_err_info *info)
if (!aer)
return 0;
 
+   /*
+* Cache the value of Device Control Register now, because later the
+* device might not be available
+*/
+   pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &info->devctl);
+
if (info->severity == AER_CORRECTABLE) {
pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS,
&info->status);
-- 
2.25.1

[PATCH v3 7/8] PCI/ERR: Remove redundant clearing of AER register in pcie_do_recovery()

2021-10-04 Thread Naveen Naidu

pcie_do_recovery() is shared across the following paths:
 - ACPI APEI
 - Native AER path
 - EDR
 - DPC

ACPI APEI
==

  ghes_handle_aer()
aer_recover_queue()
  kfifo_in_spinlocked(aer_recover_ring)

  aer_recover_work_func()
while (kfifo_get(aer_recover_ring))
  pcie_do_recovery()

In this path the system firmware clears the AER registers before
handing off the record to the OS in ghes_handle_aer()

Native AER
==

 aer_irq()
   aer_add_err_devices_to_queue()
 kfifo_put(&rpc->aer_fifo, *e_dev)
 clear_error_source_aer_registers()   < AER registers are cleard

 aer_isr()
   aer_isr_one_error()
handle_error_source()
  pcie_do_recovery()

The AER registers are cleared during the handling of IRQ, i.e before we
the recovery starts.

DPC
=

  dpc_handler()
dpc_process_error()
pci_aer_clear_status()   < AER registers are cleared
pcie_do_recovery()

EDR


  edr_handle_event()
dpc_process_error()
pci_aer_raw_clear_status()  < AER registers are cleared
pcie_do_recovery()

In all the above paths, the AER registers are cleared before
pcie_do_recovery(). The non fatal status AER registers are again cleared
in pcie_do_recovery(). This is redundant.

Remove redundant clearing of AER register in pcie_do_recovery()

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/err.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index b576aa890c76..fe04b0ae22f4 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -231,14 +231,11 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
 
/*
 * If we have native control of AER, clear error status in the device
-* that detected the error.  If the platform retained control of AER,
-* it is responsible for clearing this status.  In that case, the
-* signaling device may not even be visible to the OS.
+* that detected the error.
 */
-   if (host->native_aer || pcie_ports_native) {
+   if (host->native_aer || pcie_ports_native)
pcie_clear_device_status(dev);
-   pci_aer_clear_nonfatal_status(dev);
-   }
+
pci_info(bridge, "device recovery successful\n");
return status;
 
-- 
2.25.1

[PATCH v3 6/8] PCI/AER: Clear error device AER registers in aer_irq()

2021-10-04 Thread Naveen Naidu

Converge the APEI path and native AER path of clearing the AER registers
of the error device.

In APEI path, the system firmware clears the AER registers before
handing off the record to OS. But in "native AER" path, the execution
path of clearing the AER register is as follows:

  aer_isr_one_error
aer_print_port_info
  if (find_source_device())
aer_process_err_devices
  handle_error_source
pci_write_config_dword(dev, PCI_ERR_COR_STATUS, ...)

The above path has a bug, if the find_source_device() fails, AER
registers are not cleared from the error device. This means, the error
device will keep reporting the error again and again and would lead
to message spew.

Related Bug Report:
  https://lore.kernel.org/linux-pci/20151229155822.GA17321@localhost/
  https://bugzilla.kernel.org/show_bug.cgi?id=109691
  https://bugzilla.kernel.org/show_bug.cgi?id=109691
  https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1521173

The above bug could be avoided, if the AER registers are cleared during
the AER IRQ handler aer_irq(), which would provide guarantee that the AER
error registers are always cleared. This is similar to how APEI handles
these errors.

The main aim is that:

  When an interrupt handler deals with a interrupt, it must *always*
  clear the source of the interrupt.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pci.h  |  13 ++-
 drivers/pci/pcie/aer.c | 245 -
 2 files changed, 182 insertions(+), 76 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9be7a966fda7..eb88d8bfeaf7 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -424,7 +424,6 @@ static inline bool pci_dev_is_added(const struct pci_dev 
*dev)
 #define AER_MAX_MULTI_ERR_DEVICES  5   /* Not likely to have more */
 
 struct aer_err_info {
-   struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
int error_dev_num;
 
u16 id;
@@ -440,6 +439,18 @@ struct aer_err_info {
struct aer_header_log_regs tlp; /* TLP Header */
 };
 
+/* Preliminary AER error information processed from Root port */
+struct aer_devices_err_info {
+   struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
+   struct aer_err_info err_info;
+};
+
+/* AER information associated with each error device */
+struct aer_dev_err_info {
+   struct pci_dev *dev;
+   struct aer_err_info err_info;
+};
+
 int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
 #endif /* CONFIG_PCIEAER */
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 241ff361b43c..91f91d6ab052 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -36,6 +36,18 @@
 
 #define AER_ERROR_SOURCES_MAX  128
 
+/*
+ * There can be 128 maximum error sources (AER_ERROR_SOURCES_MAX) and each
+ * error source can have maximum of 5 error devices (AER_MAX_MULTI_ERR_DEVICES)
+ * so the maximum error devices we can report is:
+ *
+ * AER_ERROR_DEVICES_MAX = AER_ERROR_SOURCES_MAX * AER_MAX_MULTI_ERR_DEVICES 
== (128 * 5) == 640
+ *
+ * But since, the size in KFIFO should be a power of two, the closest value
+ * to 640 is 1024
+ */
+# define AER_ERROR_DEVICES_MAX 1024
+
 #define AER_MAX_TYPEOF_COR_ERRS16  /* as per 
PCI_ERR_COR_STATUS */
 #define AER_MAX_TYPEOF_UNCOR_ERRS  27  /* as per PCI_ERR_UNCOR_STATUS*/
 
@@ -46,7 +58,7 @@ struct aer_err_source {
 
 struct aer_rpc {
struct pci_dev *rpd;/* Root Port device */
-   DECLARE_KFIFO(aer_fifo, struct aer_err_source, AER_ERROR_SOURCES_MAX);
+   DECLARE_KFIFO(aer_fifo, struct aer_dev_err_info, AER_ERROR_DEVICES_MAX);
 };
 
 /* AER stats for the device */
@@ -806,11 +818,11 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
  * @e_info: pointer to error info
  * @dev: pointer to pci_dev to be added
  */
-static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
+static int add_error_device(struct aer_devices_err_info *e_dev, struct pci_dev 
*dev)
 {
-   if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
-   e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
-   e_info->error_dev_num++;
+   if (e_dev->err_info.error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
+   e_dev->dev[e_dev->err_info.error_dev_num] = pci_dev_get(dev);
+   e_dev->err_info.error_dev_num++;
return 0;
}
return -ENOSPC;
@@ -877,18 +889,18 @@ static bool is_error_source(struct pci_dev *dev, struct 
aer_err_info *e_info)
 
 static int find_device_iter(struct pci_dev *dev, void *data)
 {
-   struct aer_err_info *e_info = (struct aer_err_info *)data;
+   struct aer_devices_err_info *e_dev = (struct aer_devices_err_info 
*)data;
 
-   if (is_error_source(dev, e_info)) {
+   if (is_error_source(dev, &e_dev->err_info)) {
/* List this de

[PATCH v3 5/8] PCI/DPC: Converge EDR and DPC Path of clearing AER registers

2021-10-04 Thread Naveen Naidu

In the EDR path, AER registers are cleared *after* DPC error event is
processed. The process stack in EDR is:

  edr_handle_event()
dpc_process_error()
pci_aer_raw_clear_status()
pcie_do_recovery()

But in DPC path, AER status registers are cleared *while* processing
the error. The process stack in DPC is:

  dpc_handler()
dpc_process_error()
  pci_aer_clear_status()
pcie_do_recovery()

In EDR path, AER status registers are cleared irrespective of whether
the error was an RP PIO or unmasked uncorrectable error. But in DPC, the
AER status registers are cleared only when it's an unmasked uncorrectable
error.

This leads to two different behaviours for the same task (handling of
DPC errors) in FFS systems and when native OS has control.

Bring the same semantics for clearing the AER status register in EDR
path and DPC path.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/dpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index faf4a1e77fab..68899a3db126 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -288,7 +288,6 @@ void dpc_process_error(struct pci_dev *pdev)
 dpc_get_aer_uncorrect_severity(pdev, &info) &&
 aer_get_device_error_info(pdev, &info)) {
aer_print_error(pdev, &info);
-   pci_aer_clear_status(pdev);
}
 }
 
@@ -297,6 +296,7 @@ static irqreturn_t dpc_handler(int irq, void *context)
struct pci_dev *pdev = context;
 
dpc_process_error(pdev);
+   pci_aer_clear_status(pdev);
 
/* We configure DPC so it only triggers on ERR_FATAL */
pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link);
-- 
2.25.1

[PATCH v3 4/8] PCI/DPC: Use pci_aer_clear_status() in dpc_process_error()

2021-10-04 Thread Naveen Naidu

dpc_process_error() clears both AER fatal and non fatal status
registers. Instead of clearing each status registers via a different
function call use pci_aer_clear_status().

This helps clean up the code a bit.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/dpc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index df3f3a10f8bc..faf4a1e77fab 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -288,8 +288,7 @@ void dpc_process_error(struct pci_dev *pdev)
 dpc_get_aer_uncorrect_severity(pdev, &info) &&
 aer_get_device_error_info(pdev, &info)) {
aer_print_error(pdev, &info);
-   pci_aer_clear_nonfatal_status(pdev);
-   pci_aer_clear_fatal_status(pdev);
+   pci_aer_clear_status(pdev);
}
 }
 
-- 
2.25.1

[PATCH v3 3/8] PCI/DPC: Initialize info->id in dpc_process_error()

2021-10-04 Thread Naveen Naidu

In the dpc_process_error() path, info->id isn't initialized before being
passed to aer_print_error(). In the corresponding AER path, it is
initialized in aer_isr_one_error().

The error message shown during Coverity Scan is:

  Coverity #1461602
  CID 1461602 (#1 of 1): Uninitialized scalar variable (UNINIT)
  8. uninit_use_in_call: Using uninitialized value info.id when calling 
aer_print_error.

Initialize the "info->id" before passing it to aer_print_error()

Fixes: 8aefa9b0d910 ("PCI/DPC: Print AER status in DPC event handling")
Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/dpc.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index c556e7beafe3..df3f3a10f8bc 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -262,14 +262,14 @@ static int dpc_get_aer_uncorrect_severity(struct pci_dev 
*dev,
 
 void dpc_process_error(struct pci_dev *pdev)
 {
-   u16 cap = pdev->dpc_cap, status, source, reason, ext_reason;
+   u16 cap = pdev->dpc_cap, status, reason, ext_reason;
struct aer_err_info info;
 
pci_read_config_word(pdev, cap + PCI_EXP_DPC_STATUS, &status);
-   pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &source);
+   pci_read_config_word(pdev, cap + PCI_EXP_DPC_SOURCE_ID, &info.id);
 
pci_info(pdev, "containment event, status:%#06x source:%#06x\n",
-status, source);
+status, info.id);
 
reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN) >> 1;
ext_reason = (status & PCI_EXP_DPC_STATUS_TRIGGER_RSN_EXT) >> 5;
-- 
2.25.1

[PATCH v3 2/8] PCI: Cleanup struct aer_err_info

2021-10-04 Thread Naveen Naidu

The id, status and the mask fields of the struct aer_err_info comes
directly from the registers, hence their sizes should be explicit.

The length of these registers are:
  - id: 16 bits - Represents the Error Source Requester ID
  - status: 32 bits - COR/UNCOR Error Status
  - mask: 32 bits - COR/UNCOR Error Mask

Since the length of the above registers are even, use u16 and u32
to represent their values.

Also remove the __pad fields.

"pahole" was run on the modified struct aer_err_info and the size
remains unchanged.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pci.h | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 1cce56c2aea0..9be7a966fda7 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -427,18 +427,16 @@ struct aer_err_info {
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
int error_dev_num;
 
-   unsigned int id:16;
+   u16 id;
 
unsigned int severity:2;/* 0:NONFATAL | 1:FATAL | 2:COR */
-   unsigned int __pad1:5;
unsigned int multi_error_valid:1;
 
unsigned int first_error:5;
-   unsigned int __pad2:2;
unsigned int tlp_header_valid:1;
 
-   unsigned int status;/* COR/UNCOR Error Status */
-   unsigned int mask;  /* COR/UNCOR Error Mask */
+   u32 status; /* COR/UNCOR Error Status */
+   u32 mask;   /* COR/UNCOR Error Mask */
struct aer_header_log_regs tlp; /* TLP Header */
 };
 
-- 
2.25.1

[PATCH v3 1/8] PCI/AER: Remove ID from aer_agent_string[]

2021-10-04 Thread Naveen Naidu

Before 010caed4ccb6 ("PCI/AER: Decode Error Source RequesterID")
the AER error logs looked like:

  pcieport :00:03.0: AER: Corrected error received: id=0018
  pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, id=0018 (Receiver ID)
  pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
  pcieport :00:03.0:[ 6] BadTLP

In 010caed4ccb6 ("PCI/AER: Decode Error Source Requester ID"),
the "id" field was removed from the AER error logs, so currently AER
logs look like:

  pcieport :00:03.0: AER: Corrected error received: :00:03:0
  pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, (Receiver ID)
  pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
  pcieport :00:03.0:[ 6] BadTLP

The second line in the above logs prints "(Receiver ID)", even when
there is no "id" in the log line. This is confusing.

Remove the "ID" from the aer_agent_string[]. The error logs will
look as follows (Sample from dummy error injected by aer-inject):

  pcieport :00:03.0: AER: Corrected error received: :00:03.0
  pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, (Receiver)
  pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
  pcieport :00:03.0:[ 6] BadTLP

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/aer.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9784fdcf3006..241ff361b43c 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -516,10 +516,10 @@ static const char *aer_uncorrectable_error_string[] = {
 };
 
 static const char *aer_agent_string[] = {
-   "Receiver ID",
-   "Requester ID",
-   "Completer ID",
-   "Transmitter ID"
+   "Receiver",
+   "Requester",
+   "Completer",
+   "Transmitter"
 };
 
 #define aer_stats_dev_attr(name, stats_array, strings_array,   \
@@ -703,7 +703,7 @@ void aer_print_error(struct pci_dev *dev, struct 
aer_err_info *info)
const char *level;
 
if (!info->status) {
-   pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, 
(Unregistered Agent ID)\n",
+   pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, 
(Unregistered Agent)\n",
aer_error_severity_string[info->severity]);
goto out;
}
-- 
2.25.1

[PATCH v3 0/8] Fix long standing AER Error Handling Issues

2021-10-04 Thread Naveen Naidu

This patch series aims at fixing some of the AER error handling issues
we have.

Currently we have the following issues:
 - Confusing message in aer_print_error()
 - aer_err_info not being initialized completely in DPC path before
   we print the AER logs
 - A bug [1] in clearing of AER registers in the native AER path

[1] https://lore.kernel.org/linux-pci/20151229155822.GA17321@localhost/

The primary aim of this patch series is to converge the APEI path and the
native AER error handling paths. In our current code, we find that we
have two different behaviours (especially when it comes to clearing of
the AER registers) for the same functionality.

This patch series, tries to bring the same semantics and hence more
commonanlity between the APEI part of code and the native OS
handling of AER errors.

PATCH 1:
  - Fixes the first issue

PATCH 2 - 4:
  - Fixes the second issue
  - "Patch 3/8" is dependent on "Patch 2/3" in the series

PATCH 5 - 7
  - Deals with converging the various paths and to bring more
commonality between them
  - "Patch 6/8" depends on "Patch 1/8"

PATCH 8:
  -  Adds extra information in AER error logs.

Thanks,
Naveen Naidu

Changelog
=

v3:
 - Fix up mail formatting and resend the patches again.
   Really sorry for all the spam. I messed up in the first try and
   instead of fixing it well in v2, I messed up again. I have fixed
   everything now. Apologies for the inconvenience caused. I'll make
   sure to not repeat it again.

v2:
  - Apologies for the mistake, I forgot to cc the linux-pci mailing 
list.Resent the email with cc to linux-pci

Naveen Naidu (8):
 [PATCH v3 1/8] PCI/AER: Remove ID from aer_agent_string[]
 [PATCH v3 2/8] PCI: Cleanup struct aer_err_info
 [PATCH v3 3/8] PCI/DPC: Initialize info->id in dpc_process_error()
 [PATCH v3 4/8] PCI/DPC: Use pci_aer_clear_status() in dpc_process_error()
 [PATCH v3 5/8] PCI/DPC: Converge EDR and DPC Path of clearing AER registers
 [PATCH v3 6/8] PCI/AER: Clear error device AER registers in aer_irq()
 [PATCH v3 7/8] PCI/ERR: Remove redundant clearing of AER register in 
pcie_do_recovery()
 [PATCH v3 8/8] PCI/AER: Include DEVCTL in aer_print_error()

 drivers/pci/pci.h  |  23 +++-
 drivers/pci/pcie/aer.c | 265 -
 drivers/pci/pcie/dpc.c |   9 +-
 drivers/pci/pcie/err.c |   9 +-
 4 files changed, 207 insertions(+), 99 deletions(-)

-- 
2.25.1

[PATCH 8/8] PCI/AER: Include DEVCTL in aer_print_error()

2021-10-04 Thread Naveen Naidu

Print the contents of Device Control Register of the device which
detected the error. This might help in faster error diagnosis.

Sample output from dummy error injected by aer-inject:

  pcieport :00:03.0: AER: Corrected error received: :00:03.0
  pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, (Receiver)
  pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000, devctl=0x000f
  pcieport :00:03.0:[ 6] BadTLP

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pci.h  |  2 ++
 drivers/pci/pcie/aer.c | 10 --
 2 files changed, 10 insertions(+), 2 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index eb88d8bfeaf7..48ed7f91113b 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -437,6 +437,8 @@ struct aer_err_info {
u32 status; /* COR/UNCOR Error Status */
u32 mask;   /* COR/UNCOR Error Mask */
struct aer_header_log_regs tlp; /* TLP Header */
+
+   u16 devctl;
 };
 
 /* Preliminary AER error information processed from Root port */
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 91f91d6ab052..42cae01b6887 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -729,8 +729,8 @@ void aer_print_error(struct pci_dev *dev, struct 
aer_err_info *info)
   aer_error_severity_string[info->severity],
   aer_error_layer[layer], aer_agent_string[agent]);
 
-   pci_printk(level, dev, "  device [%04x:%04x] error 
status/mask=%08x/%08x\n",
-  dev->vendor, dev->device, info->status, info->mask);
+   pci_printk(level, dev, "  device [%04x:%04x] error 
status/mask=%08x/%08x, devctl=%#06x\n",
+  dev->vendor, dev->device, info->status, info->mask, 
info->devctl);
 
__aer_print_error(dev, info);
 
@@ -1083,6 +1083,12 @@ int aer_get_device_error_info(struct pci_dev *dev, 
struct aer_err_info *info)
if (!aer)
return 0;
 
+   /*
+* Cache the value of Device Control Register now, because later the
+* device might not be available
+*/
+   pcie_capability_read_word(dev, PCI_EXP_DEVCTL, &info->devctl);
+
if (info->severity == AER_CORRECTABLE) {
pci_read_config_dword(dev, aer + PCI_ERR_COR_STATUS,
&info->status);
-- 
2.25.1

[PATCH 7/8] PCI/ERR: Remove redundant clearing of AER register in pcie_do_recovery()

2021-10-04 Thread Naveen Naidu

pcie_do_recovery() is shared across the following paths:
 - ACPI APEI
 - Native AER path
 - EDR
 - DPC

ACPI APEI
==

  ghes_handle_aer()
aer_recover_queue()
  kfifo_in_spinlocked(aer_recover_ring)

  aer_recover_work_func()
while (kfifo_get(aer_recover_ring))
  pcie_do_recovery()

In this path the system firmware clears the AER registers before
handing off the record to the OS in ghes_handle_aer()

Native AER
==

 aer_irq()
   aer_add_err_devices_to_queue()
 kfifo_put(&rpc->aer_fifo, *e_dev)
 clear_error_source_aer_registers()   < AER registers are cleard

 aer_isr()
   aer_isr_one_error()
handle_error_source()
  pcie_do_recovery()

The AER registers are cleared during the handling of IRQ, i.e before we
the recovery starts.

DPC
=

  dpc_handler()
dpc_process_error()
pci_aer_clear_status()   < AER registers are cleared
pcie_do_recovery()

EDR


  edr_handle_event()
dpc_process_error()
pci_aer_raw_clear_status()  < AER registers are cleared
pcie_do_recovery()

In all the above paths, the AER registers are cleared before
pcie_do_recovery(). The non fatal status AER registers are again cleared
in pcie_do_recovery(). This is redundant.

Remove redundant clearing of AER register in pcie_do_recovery()

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/err.c | 9 +++--
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/drivers/pci/pcie/err.c b/drivers/pci/pcie/err.c
index b576aa890c76..fe04b0ae22f4 100644
--- a/drivers/pci/pcie/err.c
+++ b/drivers/pci/pcie/err.c
@@ -231,14 +231,11 @@ pci_ers_result_t pcie_do_recovery(struct pci_dev *dev,
 
/*
 * If we have native control of AER, clear error status in the device
-* that detected the error.  If the platform retained control of AER,
-* it is responsible for clearing this status.  In that case, the
-* signaling device may not even be visible to the OS.
+* that detected the error.
 */
-   if (host->native_aer || pcie_ports_native) {
+   if (host->native_aer || pcie_ports_native)
pcie_clear_device_status(dev);
-   pci_aer_clear_nonfatal_status(dev);
-   }
+
pci_info(bridge, "device recovery successful\n");
return status;
 
-- 
2.25.1

[PATCH 6/8] PCI/AER: Clear error device AER registers in aer_irq()

2021-10-04 Thread Naveen Naidu

Converge the APEI path and native AER path of clearing the AER registers
of the error device.

In APEI path, the system firmware clears the AER registers before
handing off the record to OS. But in "native AER" path, the execution
path of clearing the AER register is as follows:

  aer_isr_one_error
aer_print_port_info
  if (find_source_device())
aer_process_err_devices
  handle_error_source
pci_write_config_dword(dev, PCI_ERR_COR_STATUS, ...)

The above path has a bug, if the find_source_device() fails, AER
registers are not cleared from the error device. This means, the error
device will keep reporting the error again and again and would lead
to message spew.

Related Bug Report:
  https://lore.kernel.org/linux-pci/20151229155822.GA17321@localhost/
  https://bugzilla.kernel.org/show_bug.cgi?id=109691
  https://bugs.launchpad.net/ubuntu/+source/linux/+bug/1521173

The above bug could be avoided, if the AER registers are cleared during
the AER IRQ handler aer_irq(), which would provide guarantee that the AER
error registers are always cleared. This is similar to how APEI handles
these errors.

The main aim is that:

  When an interrupt handler deals with a interrupt, it must *always*
  clear the source of the interrupt.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pci.h  |  13 ++-
 drivers/pci/pcie/aer.c | 245 -
 2 files changed, 182 insertions(+), 76 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 9be7a966fda7..eb88d8bfeaf7 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -424,7 +424,6 @@ static inline bool pci_dev_is_added(const struct pci_dev 
*dev)
 #define AER_MAX_MULTI_ERR_DEVICES  5   /* Not likely to have more */
 
 struct aer_err_info {
-   struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
int error_dev_num;
 
u16 id;
@@ -440,6 +439,18 @@ struct aer_err_info {
struct aer_header_log_regs tlp; /* TLP Header */
 };
 
+/* Preliminary AER error information processed from Root port */
+struct aer_devices_err_info {
+   struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
+   struct aer_err_info err_info;
+};
+
+/* AER information associated with each error device */
+struct aer_dev_err_info {
+   struct pci_dev *dev;
+   struct aer_err_info err_info;
+};
+
 int aer_get_device_error_info(struct pci_dev *dev, struct aer_err_info *info);
 void aer_print_error(struct pci_dev *dev, struct aer_err_info *info);
 #endif /* CONFIG_PCIEAER */
diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 241ff361b43c..91f91d6ab052 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -36,6 +36,18 @@
 
 #define AER_ERROR_SOURCES_MAX  128
 
+/*
+ * There can be 128 maximum error sources (AER_ERROR_SOURCES_MAX) and each
+ * error source can have maximum of 5 error devices (AER_MAX_MULTI_ERR_DEVICES)
+ * so the maximum error devices we can report is:
+ *
+ * AER_ERROR_DEVICES_MAX = AER_ERROR_SOURCES_MAX * AER_MAX_MULTI_ERR_DEVICES 
== (128 * 5) == 640
+ *
+ * But since, the size in KFIFO should be a power of two, the closest value
+ * to 640 is 1024
+ */
+# define AER_ERROR_DEVICES_MAX 1024
+
 #define AER_MAX_TYPEOF_COR_ERRS16  /* as per 
PCI_ERR_COR_STATUS */
 #define AER_MAX_TYPEOF_UNCOR_ERRS  27  /* as per PCI_ERR_UNCOR_STATUS*/
 
@@ -46,7 +58,7 @@ struct aer_err_source {
 
 struct aer_rpc {
struct pci_dev *rpd;/* Root Port device */
-   DECLARE_KFIFO(aer_fifo, struct aer_err_source, AER_ERROR_SOURCES_MAX);
+   DECLARE_KFIFO(aer_fifo, struct aer_dev_err_info, AER_ERROR_DEVICES_MAX);
 };
 
 /* AER stats for the device */
@@ -806,11 +818,11 @@ void cper_print_aer(struct pci_dev *dev, int aer_severity,
  * @e_info: pointer to error info
  * @dev: pointer to pci_dev to be added
  */
-static int add_error_device(struct aer_err_info *e_info, struct pci_dev *dev)
+static int add_error_device(struct aer_devices_err_info *e_dev, struct pci_dev 
*dev)
 {
-   if (e_info->error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
-   e_info->dev[e_info->error_dev_num] = pci_dev_get(dev);
-   e_info->error_dev_num++;
+   if (e_dev->err_info.error_dev_num < AER_MAX_MULTI_ERR_DEVICES) {
+   e_dev->dev[e_dev->err_info.error_dev_num] = pci_dev_get(dev);
+   e_dev->err_info.error_dev_num++;
return 0;
}
return -ENOSPC;
@@ -877,18 +889,18 @@ static bool is_error_source(struct pci_dev *dev, struct 
aer_err_info *e_info)
 
 static int find_device_iter(struct pci_dev *dev, void *data)
 {
-   struct aer_err_info *e_info = (struct aer_err_info *)data;
+   struct aer_devices_err_info *e_dev = (struct aer_devices_err_info 
*)data;
 
-   if (is_error_source(dev, e_info)) {
+   if (is_error_source(dev, &e_dev->err_info)) {
/* List this device */
-   if (add_error_device(e_info,

[PATCH 5/8] PCI/DPC: Converge EDR and DPC Path of clearing AER registers

2021-10-04 Thread Naveen Naidu

In the EDR path, AER registers are cleared *after* DPC error event is
processed. The process stack in EDR is:

  edr_handle_event()
dpc_process_error()
pci_aer_raw_clear_status()
pcie_do_recovery()

But in DPC path, AER status registers are cleared *while* processing
the error. The process stack in DPC is:

  dpc_handler()
dpc_process_error()
  pci_aer_clear_status()
pcie_do_recovery()

In EDR path, AER status registers are cleared irrespective of whether
the error was an RP PIO or unmasked uncorrectable error. But in DPC, the
AER status registers are cleared only when it's an unmasked uncorrectable
error.

This leads to two different behaviours for the same task (handling of
DPC errors) in FFS systems and when native OS has control.

Bring the same semantics for clearing the AER status register in EDR
path and DPC path.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/dpc.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index faf4a1e77fab..68899a3db126 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -288,7 +288,6 @@ void dpc_process_error(struct pci_dev *pdev)
 dpc_get_aer_uncorrect_severity(pdev, &info) &&
 aer_get_device_error_info(pdev, &info)) {
aer_print_error(pdev, &info);
-   pci_aer_clear_status(pdev);
}
 }
 
@@ -297,6 +296,7 @@ static irqreturn_t dpc_handler(int irq, void *context)
struct pci_dev *pdev = context;
 
dpc_process_error(pdev);
+   pci_aer_clear_status(pdev);
 
/* We configure DPC so it only triggers on ERR_FATAL */
pcie_do_recovery(pdev, pci_channel_io_frozen, dpc_reset_link);
-- 
2.25.1

[PATCH 4/8] PCI/DPC: Use pci_aer_clear_status() in dpc_process_error()

2021-10-04 Thread Naveen Naidu

dpc_process_error() clears both AER fatal and non fatal status
registers. Instead of clearing each status registers via a different
function call use pci_aer_clear_status().

This helps clean up the code a bit.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/dpc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/pci/pcie/dpc.c b/drivers/pci/pcie/dpc.c
index df3f3a10f8bc..faf4a1e77fab 100644
--- a/drivers/pci/pcie/dpc.c
+++ b/drivers/pci/pcie/dpc.c
@@ -288,8 +288,7 @@ void dpc_process_error(struct pci_dev *pdev)
 dpc_get_aer_uncorrect_severity(pdev, &info) &&
 aer_get_device_error_info(pdev, &info)) {
aer_print_error(pdev, &info);
-   pci_aer_clear_nonfatal_status(pdev);
-   pci_aer_clear_fatal_status(pdev);
+   pci_aer_clear_status(pdev);
}
 }
 
-- 
2.25.1

[PATCH 2/8] PCI: Cleanup struct aer_err_info

2021-10-04 Thread Naveen Naidu

The id, status and the mask fields of the struct aer_err_info comes
directly from the registers, hence their sizes should be explicit.

The length of these registers are:
  - id: 16 bits - Represents the Error Source Requester ID
  - status: 32 bits - COR/UNCOR Error Status
  - mask: 32 bits - COR/UNCOR Error Mask

Since the length of the above registers are even, use u16 and u32
to represent their values.

Also remove the __pad fields.

"pahole" was run on the modified struct aer_err_info and the size
remains unchanged.

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pci.h | 8 +++-
 1 file changed, 3 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pci.h b/drivers/pci/pci.h
index 1cce56c2aea0..9be7a966fda7 100644
--- a/drivers/pci/pci.h
+++ b/drivers/pci/pci.h
@@ -427,18 +427,16 @@ struct aer_err_info {
struct pci_dev *dev[AER_MAX_MULTI_ERR_DEVICES];
int error_dev_num;
 
-   unsigned int id:16;
+   u16 id;
 
unsigned int severity:2;/* 0:NONFATAL | 1:FATAL | 2:COR */
-   unsigned int __pad1:5;
unsigned int multi_error_valid:1;
 
unsigned int first_error:5;
-   unsigned int __pad2:2;
unsigned int tlp_header_valid:1;
 
-   unsigned int status;/* COR/UNCOR Error Status */
-   unsigned int mask;  /* COR/UNCOR Error Mask */
+   u32 status; /* COR/UNCOR Error Status */
+   u32 mask;   /* COR/UNCOR Error Mask */
struct aer_header_log_regs tlp; /* TLP Header */
 };
 
-- 
2.25.1

[PATCH 1/8] PCI/AER: Remove ID from aer_agent_string[]

2021-10-04 Thread Naveen Naidu

Before 010caed4ccb6 ("PCI/AER: Decode Error Source RequesterID")
the AER error logs looked like:

  pcieport :00:03.0: AER: Corrected error received: id=0018
  pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, id=0018 (Receiver ID)
  pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
  pcieport :00:03.0:[ 6] BadTLP

In 010caed4ccb6 ("PCI/AER: Decode Error Source Requester ID"),
the "id" field was removed from the AER error logs, so currently AER
logs look like:

  pcieport :00:03.0: AER: Corrected error received: :00:03:0
  pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, (Receiver ID)
  pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
  pcieport :00:03.0:[ 6] BadTLP

The second line in the above logs prints "(Receiver ID)", even when
there is no "id" in the log line. This is confusing.

Remove the "ID" from the aer_agent_string[]. The error logs will
look as follows (Sample from dummy error injected by aer-inject):

  pcieport :00:03.0: AER: Corrected error received: :00:03.0
  pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, (Receiver)
  pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
  pcieport :00:03.0:[ 6] BadTLP

Signed-off-by: Naveen Naidu 
---
 drivers/pci/pcie/aer.c | 10 +-
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9784fdcf3006..241ff361b43c 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -516,10 +516,10 @@ static const char *aer_uncorrectable_error_string[] = {
 };
 
 static const char *aer_agent_string[] = {
-   "Receiver ID",
-   "Requester ID",
-   "Completer ID",
-   "Transmitter ID"
+   "Receiver",
+   "Requester",
+   "Completer",
+   "Transmitter"
 };
 
 #define aer_stats_dev_attr(name, stats_array, strings_array,   \
@@ -703,7 +703,7 @@ void aer_print_error(struct pci_dev *dev, struct 
aer_err_info *info)
const char *level;
 
if (!info->status) {
-   pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, 
(Unregistered Agent ID)\n",
+   pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, 
(Unregistered Agent)\n",
aer_error_severity_string[info->severity]);
goto out;
}
-- 
2.25.1

[PATCH 0/8] Fix long standing AER Error Handling Issues

2021-10-04 Thread Naveen Naidu

This patch series aims at fixing some of the AER error handling issues
we have.

Currently we have the following issues:
 - Confusing message in aer_print_error()
 - aer_err_info not being initialized completely in DPC path before 
   we print the AER logs
 - A bug [1] in clearing of AER registers in the native AER path

[1] https://lore.kernel.org/linux-pci/20151229155822.GA17321@localhost/

The primary aim of this patch series is to converge the APEI path and the
native AER error handling paths. In our current code, we find that we
have two different behaviours (especially when it comes to clearing of
the AER registers) for the same functionality.

This patch series, tries to bring the same semantics and hence more 
commonanlity between the APEI part of code and the native OS 
handling of AER errors.

PATCH 1: 
  - Fixes the first issue

PATCH 2 - 4:
  - Fixes the second issue
  - "Patch 3/8" is dependent on "Patch 2/3" in the series

PATCH 5 - 7
  - Deals with converging the various paths and to bring more
commonality between them
  - "Patch 6/8" depends on "Patch 1/8"

PATCH 8:
  -  Adds extra information in AER error logs.

Thanks,
Naveen Naidu

Naveen Naidu (8):
 [PATCH 1/8] PCI/AER: Remove ID from aer_agent_string[]
 [PATCH 2/8] PCI: Cleanup struct aer_err_info
 [PATCH 3/8] PCI/DPC: Initialize info->id in dpc_process_error()
 [PATCH 4/8] PCI/DPC: Use pci_aer_clear_status() in dpc_process_error()
 [PATCH 5/8] PCI/DPC: Converge EDR and DPC Path of clearing AER registers
 [PATCH 6/8] PCI/AER: Clear error device AER registers in aer_irq()
 [PATCH 7/8] PCI/ERR: Remove redundant clearing of AER register in 
pcie_do_recovery()
 [PATCH 8/8] PCI/AER: Include DEVCTL in aer_print_error()

 drivers/pci/pci.h  |  23 +++-
 drivers/pci/pcie/aer.c | 265 -
 drivers/pci/pcie/dpc.c |   9 +-
 drivers/pci/pcie/err.c |   9 +-
 4 files changed, 207 insertions(+), 99 deletions(-)

-- 
2.25.1

[PATCH v6 00/11] PCI: Drop duplicated tracking of a pci_dev's bound driver

2021-10-04 Thread Uwe Kleine-König

Hello,

this is v6 of the quest to drop the "driver" member from struct pci_dev
which tracks the same data (apart from a constant offset) as dev.driver.

Changes since v5:
 - Some Acks added
 - Some fixes in "PCI: Replace pci_dev::driver usage by
   pci_dev::dev.driver" to properly handle that
   to_pci_driver(X) is wrong if X is NULL.
   This should fix the problem reported by Ido Schimmel.

Full range diff below.

This patch stack survived an allmodconfig build on arm64, m68k, powerpc,
riscv, s390, sparc64 and x86_64 on top of v5.15-rc3.

Best regards
Uwe

Uwe Kleine-König (11):
  PCI: Simplify pci_device_remove()
  PCI: Drop useless check from pci_device_probe()
  xen/pci: Drop some checks that are always true
  bcma: simplify reference to the driver's name
  powerpc/eeh: Don't use driver member of struct pci_dev and further
cleanups
  ssb: Simplify determination of driver name
  PCI: Replace pci_dev::driver usage that gets the driver name
  scsi: message: fusion: Remove unused parameter of mpt_pci driver's
probe()
  crypto: qat - simplify adf_enable_aer()
  PCI: Replace pci_dev::driver usage by pci_dev::dev.driver
  PCI: Drop duplicated tracking of a pci_dev's bound driver

 arch/powerpc/include/asm/ppc-pci.h|  5 -
 arch/powerpc/kernel/eeh.c |  8 ++
 arch/powerpc/kernel/eeh_driver.c  | 10 +-
 arch/x86/events/intel/uncore.c|  2 +-
 arch/x86/kernel/probe_roms.c  | 10 +-
 drivers/bcma/host_pci.c   |  6 +-
 drivers/crypto/hisilicon/qm.c |  2 +-
 drivers/crypto/qat/qat_4xxx/adf_drv.c |  7 +-
 drivers/crypto/qat/qat_c3xxx/adf_drv.c|  7 +-
 drivers/crypto/qat/qat_c62x/adf_drv.c |  7 +-
 drivers/crypto/qat/qat_common/adf_aer.c   | 10 +-
 .../crypto/qat/qat_common/adf_common_drv.h|  3 +-
 drivers/crypto/qat/qat_dh895xcc/adf_drv.c |  7 +-
 drivers/message/fusion/mptbase.c  |  7 +-
 drivers/message/fusion/mptbase.h  |  2 +-
 drivers/message/fusion/mptctl.c   |  4 +-
 drivers/message/fusion/mptlan.c   |  2 +-
 drivers/misc/cxl/guest.c  | 24 +++--
 drivers/misc/cxl/pci.c| 30 +++---
 .../ethernet/hisilicon/hns3/hns3_ethtool.c|  2 +-
 .../ethernet/marvell/prestera/prestera_pci.c  |  2 +-
 drivers/net/ethernet/mellanox/mlxsw/pci.c |  2 +-
 .../ethernet/netronome/nfp/nfp_net_ethtool.c  |  3 +-
 drivers/pci/iov.c | 33 +--
 drivers/pci/pci-driver.c  | 96 ++-
 drivers/pci/pci.c |  4 +-
 drivers/pci/pcie/err.c| 36 +++
 drivers/pci/xen-pcifront.c| 63 ++--
 drivers/ssb/pcihost_wrapper.c |  6 +-
 drivers/usb/host/xhci-pci.c   |  2 +-
 include/linux/pci.h   |  1 -
 31 files changed, 208 insertions(+), 195 deletions(-)

Range-diff against v5:
 -:   >  1:  c2b53ab26a6b PCI: Simplify pci_device_remove()
 -:   >  2:  2c733e1d5186 PCI: Drop useless check from 
pci_device_probe()
 -:   >  3:  547ca5a7aa16 xen/pci: Drop some checks that are always 
true
 -:   >  4:  40eb07353844 bcma: simplify reference to the driver's 
name
 -:   >  5:  bab59c1dff6d powerpc/eeh: Don't use driver member of 
struct pci_dev and further cleanups
 1:  abd70de9782d !  6:  92f4d61bbac3 ssb: Simplify determination of driver name
@@ Commit message
 This has the upside of not requiring the driver member of struct 
pci_dev
 which is about to be removed and being simpler.
 
+Acked-by: Michael Büsch 
 Signed-off-by: Uwe Kleine-König 
 
  ## drivers/ssb/pcihost_wrapper.c ##
 2:  735845bd26b9 !  7:  6303f03ab2aa PCI: Replace pci_dev::driver usage that 
gets the driver name
@@ Commit message
 driver name by dev_driver_string() which implicitly makes use of struct
 pci_dev::dev->driver.
 
+Acked-by: Simon Horman  (for NFP)
 Signed-off-by: Uwe Kleine-König 
 
  ## drivers/crypto/hisilicon/qm.c ##
 3:  1e58019165b9 =  8:  658a6c00ec96 scsi: message: fusion: Remove unused 
parameter of mpt_pci driver's probe()
 4:  dea72a470141 =  9:  aceaf5321603 crypto: qat - simplify adf_enable_aer()
 5:  b4165dda38ea ! 10:  80648d85 PCI: Replace pci_dev::driver usage by 
pci_dev::dev.driver
@@ arch/x86/kernel/probe_roms.c: static struct resource video_rom_resource 
= {
  static bool match_id(struct pci_dev *pdev, unsigned short vendor, 
unsigned short device)
  {
 -  struct pci_driver *drv = pdev->driver;
-+  struct pci_driver *drv = to_pci_driver(pdev->dev.driver);
const struct pci_device_id *id;
  
if (pdev->vendor == vendor && pdev->device == device)
+   return true;
+ 
+-  for (id = drv ? drv->id_table : NU

Re: Add Apple M1 support to PASemi i2c driver

2021-10-04 Thread Christian Zigotzky



> On 3. Oct 2021, at 16:36, Sven Peter  wrote:
> 
> Hi,
> 
> 
>> On Fri, Oct 1, 2021, at 06:47, Christian Zigotzky wrote:
>>> On 27 September 2021 at 07:39 am, Sven Peter wrote:
>>> Hi Christian,
>>> 
>>> Thanks already for volunteering to test this!
>>> 
>> Hello Sven,
>> 
>> Damien (Hypex) has successfully tested the RC3 of kernel 5.15 with your 
>> modified i2c driver on his Nemo board yesterday. [1]
> 
> Thanks a lot, that's great to hear!
> If he wants to I can credit him with a Tested-by tag in the commit message,
> see e.g. 
> https://www.kernel.org/doc/html/latest/process/submitting-patches.html#using-reported-by-tested-by-reviewed-by-suggested-by-and-fixes.
> 
> 
> Best,
> 
> 
> Sven

Hi Sven,

Unfortunately Damien has found an issue. [1]

Output of i2cdetect -l with the default RC3 of kernel 5.15 without your 
modifications:

2c-0i2c Radeon i2c bit bus 0x90 I2C adapter
i2c-1   i2c Radeon i2c bit bus 0x91 I2C adapter
i2c-2   i2c Radeon i2c bit bus 0x92 I2C adapter
i2c-3   i2c Radeon i2c bit bus 0x93 I2C adapter
i2c-4   i2c Radeon i2c bit bus 0x94 I2C adapter
i2c-5   i2c Radeon i2c bit bus 0x95 I2C adapter
i2c-6   i2c Radeon i2c bit bus 0x96 I2C adapter
i2c-7   i2c Radeon i2c bit bus 0x97 I2C adapter
i2c-8   i2c PA Semi SMBus adapter at 0x800200   I2C adapter
i2c-9   i2c PA Semi SMBus adapter at 0x800240   I2C adapter
i2c-10  i2c PA Semi SMBus adapter at 0x800280   I2C adapter

Output of i2cdetect -l with your modifications:

i2c-0   i2c Radeon i2c bit bus 0x90 I2C adapter
i2c-1   i2c Radeon i2c bit bus 0x91 I2C adapter
i2c-2   i2c Radeon i2c bit bus 0x92 I2C adapter
i2c-3   i2c Radeon i2c bit bus 0x93 I2C adapter
i2c-4   i2c Radeon i2c bit bus 0x94 I2C adapter
i2c-5   i2c Radeon i2c bit bus 0x95 I2C adapter
i2c-6   i2c Radeon i2c bit bus 0x96 I2C adapter
i2c-7   i2c Radeon i2c bit bus 0x97 I2C adapter
i2c-8   i2c PA Semi SMBus adapter at 0x(ptrval) I2C 
adapter
i2c-9   i2c PA Semi SMBus adapter at 0x(ptrval) I2C 
adapter
i2c-10  i2c PA Semi SMBus adapter at 0x(ptrval) I2C 
adapter

Please check the outputs.

Thanks,
Christian

[1] https://forum.hyperion-entertainment.com/viewtopic.php?p=54165#p54165

Re: [PATCH v3 1/8] PCI/AER: Remove ID from aer_agent_string[]

2021-10-04 Thread Shuah Khan


On 10/4/21 8:29 AM, Naveen Naidu wrote:

Before 010caed4ccb6 ("PCI/AER: Decode Error Source RequesterID")
the AER error logs looked like:

   pcieport :00:03.0: AER: Corrected error received: id=0018
   pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, id=0018 (Receiver ID)
   pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
   pcieport :00:03.0:[ 6] BadTLP

In 010caed4ccb6 ("PCI/AER: Decode Error Source Requester ID"),
the "id" field was removed from the AER error logs, so currently AER
logs look like:

   pcieport :00:03.0: AER: Corrected error received: :00:03:0
   pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, (Receiver ID)
   pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
   pcieport :00:03.0:[ 6] BadTLP

The second line in the above logs prints "(Receiver ID)", even when
there is no "id" in the log line. This is confusing.



Starting your commit log to say that message are confusing and then talk
about why will make it easier to understand why the change is needed.


Remove the "ID" from the aer_agent_string[]. The error logs will
look as follows (Sample from dummy error injected by aer-inject):

   pcieport :00:03.0: AER: Corrected error received: :00:03.0
   pcieport :00:03.0: PCIe Bus Error: severity=Corrected, type=Data Link 
Layer, (Receiver)
   pcieport :00:03.0:   device [1b36:000c] error 
status/mask=0040/e000
   pcieport :00:03.0:[ 6] BadTLP



It is good to see before and after messages. However, it will be helpful
to know why this change is necessary. It isn't very clear why in this
commit log.


Signed-off-by: Naveen Naidu 


Extra signed-off-by?


---
  drivers/pci/pcie/aer.c | 10 +-
  1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/drivers/pci/pcie/aer.c b/drivers/pci/pcie/aer.c
index 9784fdcf3006..241ff361b43c 100644
--- a/drivers/pci/pcie/aer.c
+++ b/drivers/pci/pcie/aer.c
@@ -516,10 +516,10 @@ static const char *aer_uncorrectable_error_string[] = {
  };
  
  static const char *aer_agent_string[] = {

-   "Receiver ID",
-   "Requester ID",
-   "Completer ID",
-   "Transmitter ID"
+   "Receiver",
+   "Requester",
+   "Completer",
+   "Transmitter"
  };
  
  #define aer_stats_dev_attr(name, stats_array, strings_array,		\

@@ -703,7 +703,7 @@ void aer_print_error(struct pci_dev *dev, struct 
aer_err_info *info)
const char *level;
  
  	if (!info->status) {

-   pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, 
(Unregistered Agent ID)\n",
+   pci_err(dev, "PCIe Bus Error: severity=%s, type=Inaccessible, 
(Unregistered Agent)\n",
aer_error_severity_string[info->severity]);
goto out;
}



thanks,
-- Shuah

Re: [PATCH v5 00/14] PCI: Add support for Apple M1

2021-10-04 Thread Linus Walleij

On Mon, Oct 4, 2021 at 9:52 PM Rob Herring  wrote:

> FYI, I pushed patches 1-3 to kernelCI and didn't see any regressions.
> I am a bit worried about changes to the DT interrupt parsing and
> ancient platforms (such as PowerMacs). Most likely there wouldn't be
> any report until -rc1 or months later on those old systems.

Lets page the PPC lists to see if someone can test on some powermac.

Linus Walleij

Re: [PATCH 4/9] powerpc/bpf: Handle large branch ranges with BPF_EXIT

2021-10-04 Thread Naveen N. Rao


Christophe Leroy wrote:



Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :

In some scenarios, it is possible that the program epilogue is outside
the branch range for a BPF_EXIT instruction. Instead of rejecting such
programs, emit an indirect branch. We track the size of the bpf program
emitted after the initial run and do a second pass since BPF_EXIT can
end up emitting different number of instructions depending on the
program size.

Suggested-by: Jordan Niethe 
Signed-off-by: Naveen N. Rao 
---
  arch/powerpc/net/bpf_jit.h|  3 +++
  arch/powerpc/net/bpf_jit_comp.c   | 22 +-
  arch/powerpc/net/bpf_jit_comp32.c |  2 +-
  arch/powerpc/net/bpf_jit_comp64.c |  2 +-
  4 files changed, 26 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 89bd744c2bffd4..4023de1698b9f5 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -126,6 +126,7 @@
  
  #define SEEN_FUNC	0x2000 /* might call external helpers */

  #define SEEN_TAILCALL 0x4000 /* uses tail calls */
+#define SEEN_BIG_PROG  0x8000 /* large prog, >32MB */
  
  #define SEEN_VREG_MASK	0x1ff8 /* Volatile registers r3-r12 */

  #define SEEN_NVREG_MASK   0x0003 /* Non volatile registers r14-r31 */
@@ -179,6 +180,8 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
  void bpf_jit_build_prologue(u32 *image, struct codegen_context *ctx);
  void bpf_jit_build_epilogue(u32 *image, struct codegen_context *ctx);
  void bpf_jit_realloc_regs(struct codegen_context *ctx);
+int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx,
+   int tmp_reg, unsigned long exit_addr);
  
  #endif
  
diff --git a/arch/powerpc/net/bpf_jit_comp.c b/arch/powerpc/net/bpf_jit_comp.c

index fcbf7a917c566e..3204872fbf2738 100644
--- a/arch/powerpc/net/bpf_jit_comp.c
+++ b/arch/powerpc/net/bpf_jit_comp.c
@@ -72,6 +72,21 @@ static int bpf_jit_fixup_subprog_calls(struct bpf_prog *fp, 
u32 *image,
return 0;
  }
  
+int bpf_jit_emit_exit_insn(u32 *image, struct codegen_context *ctx,

+   int tmp_reg, unsigned long exit_addr)
+{
+   if (!(ctx->seen & SEEN_BIG_PROG) && 
is_offset_in_branch_range(exit_addr)) {
+   PPC_JMP(exit_addr);
+   } else {
+   ctx->seen |= SEEN_BIG_PROG;
+   PPC_FUNC_ADDR(tmp_reg, (unsigned long)image + exit_addr);
+   EMIT(PPC_RAW_MTCTR(tmp_reg));
+   EMIT(PPC_RAW_BCTR());
+   }
+
+   return 0;
+}
+
  struct powerpc64_jit_data {
struct bpf_binary_header *header;
u32 *addrs;
@@ -155,12 +170,17 @@ struct bpf_prog *bpf_int_jit_compile(struct bpf_prog *fp)
goto out_addrs;
}
  
+	if (!is_offset_in_branch_range((long)cgctx.idx * 4))

+   cgctx.seen |= SEEN_BIG_PROG;
+
/*
 * If we have seen a tail call, we need a second pass.
 * This is because bpf_jit_emit_common_epilogue() is called
 * from bpf_jit_emit_tail_call() with a not yet stable ctx->seen.
+* We also need a second pass if we ended up with too large
+* a program so as to fix branches.
 */
-   if (cgctx.seen & SEEN_TAILCALL) {
+   if (cgctx.seen & (SEEN_TAILCALL | SEEN_BIG_PROG)) {
cgctx.idx = 0;
if (bpf_jit_build_body(fp, 0, &cgctx, addrs, false)) {
fp = org_fp;
diff --git a/arch/powerpc/net/bpf_jit_comp32.c 
b/arch/powerpc/net/bpf_jit_comp32.c
index a74d52204f8da2..d2a67574a23066 100644
--- a/arch/powerpc/net/bpf_jit_comp32.c
+++ b/arch/powerpc/net/bpf_jit_comp32.c
@@ -852,7 +852,7 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
 * we'll just fall through to the epilogue.
 */
if (i != flen - 1)
-   PPC_JMP(exit_addr);
+   bpf_jit_emit_exit_insn(image, ctx, tmp_reg, 
exit_addr);


On ppc32, if you use tmp_reg you must flag it. But I think you could use 
r0 instead.


Indeed. Can we drop tracking of the temp registers and using them while
remapping registers? Are you seeing significant benefits with re-use of 
those temp registers?


- Naveen

Re: [PATCH 0/9] powerpc/bpf: Various fixes

2021-10-04 Thread Naveen N. Rao


Hi Johan,

Johan Almbladh wrote:

On Fri, Oct 1, 2021 at 11:15 PM Naveen N. Rao
 wrote:


Various fixes to the eBPF JIT for powerpc, thanks to some new tests
added by Johan. This series fixes all failures in test_bpf on powerpc64.
There are still some failures on powerpc32 to be looked into.


Great work! I have tested it on powerpc64 in QEMU, which is the same
setup that previously triggered an illegal instruction, and all tests
pass now. On powerpc32 there are still some issues left as you say.


Thanks for the review, and the test!


- Naveen

Re: [PATCH 6/9] powerpc/bpf: Fix BPF_SUB when imm == 0x80000000

2021-10-04 Thread Naveen N. Rao


Christophe Leroy wrote:



Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :

We aren't handling subtraction involving an immediate value of
0x8000 properly. Fix the same.

Fixes: 156d0e290e969c ("powerpc/ebpf/jit: Implement JIT compiler for extended 
BPF")
Signed-off-by: Naveen N. Rao 
---
  arch/powerpc/net/bpf_jit_comp64.c | 16 
  1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit_comp64.c 
b/arch/powerpc/net/bpf_jit_comp64.c
index ffb7a2877a8469..4641a50e82d50d 100644
--- a/arch/powerpc/net/bpf_jit_comp64.c
+++ b/arch/powerpc/net/bpf_jit_comp64.c
@@ -333,15 +333,15 @@ int bpf_jit_build_body(struct bpf_prog *fp, u32 *image, 
struct codegen_context *
case BPF_ALU | BPF_SUB | BPF_K: /* (u32) dst -= (u32) imm */
case BPF_ALU64 | BPF_ADD | BPF_K: /* dst += imm */
case BPF_ALU64 | BPF_SUB | BPF_K: /* dst -= imm */
-   if (BPF_OP(code) == BPF_SUB)
-   imm = -imm;
-   if (imm) {
-   if (imm >= -32768 && imm < 32768)
-   EMIT(PPC_RAW_ADDI(dst_reg, dst_reg, 
IMM_L(imm)));
-   else {
-   PPC_LI32(b2p[TMP_REG_1], imm);
+   if (imm > -32768 && imm < 32768) {
+   EMIT(PPC_RAW_ADDI(dst_reg, dst_reg,
+   BPF_OP(code) == BPF_SUB ? IMM_L(-imm) : 
IMM_L(imm)));
+   } else {
+   PPC_LI32(b2p[TMP_REG_1], imm);
+   if (BPF_OP(code) == BPF_SUB)
+   EMIT(PPC_RAW_SUB(dst_reg, dst_reg, 
b2p[TMP_REG_1]));
+   else
EMIT(PPC_RAW_ADD(dst_reg, dst_reg, 
b2p[TMP_REG_1]));
-   }
}
goto bpf_alu32_trunc;


There is now so few code common to both BPF_ADD and BPF_SUB that you 
should make them different cases.


While at it, why not also use ADDIS if imm is 32 bits ? That would be an 
ADDIS/ADDI instead of LIS/ORI/ADD


Sure. I wanted to limit the change for this fix. We can do a separate 
patch to optimize code generation for BPF_ADD.



- Naveen

Re: [PATCH 3/9] powerpc/bpf: Remove unused SEEN_STACK

2021-10-04 Thread Naveen N. Rao


Christophe Leroy wrote:



Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :

From: Ravi Bangoria 

SEEN_STACK is unused on PowerPC. Remove it. Also, have
SEEN_TAILCALL use 0x4000.


Why change SEEN_TAILCALL ? Would it be a problem to leave it as is ?



Signed-off-by: Ravi Bangoria 
Reviewed-by: Christophe Leroy 


I prefer the bit usage to be contiguous. Changing SEEN_TAILCALL isn't a 
problem either.



- Naveen

Re: [PATCH 2/9] powerpc/bpf: Validate branch ranges

2021-10-04 Thread Naveen N. Rao


Christophe Leroy wrote:



Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :

Add checks to ensure that we never emit branch instructions with
truncated branch offsets.

Suggested-by: Michael Ellerman 
Signed-off-by: Naveen N. Rao 
---
  arch/powerpc/net/bpf_jit.h| 26 --
  arch/powerpc/net/bpf_jit_comp.c   |  6 +-
  arch/powerpc/net/bpf_jit_comp32.c |  8 ++--
  arch/powerpc/net/bpf_jit_comp64.c |  8 ++--
  4 files changed, 37 insertions(+), 11 deletions(-)

diff --git a/arch/powerpc/net/bpf_jit.h b/arch/powerpc/net/bpf_jit.h
index 935ea95b66359e..7e9b978b768ed9 100644
--- a/arch/powerpc/net/bpf_jit.h
+++ b/arch/powerpc/net/bpf_jit.h
@@ -24,16 +24,30 @@
  #define EMIT(instr)   PLANT_INSTR(image, ctx->idx, instr)
  
  /* Long jump; (unconditional 'branch') */

-#define PPC_JMP(dest)  EMIT(PPC_INST_BRANCH |\
-(((dest) - (ctx->idx * 4)) & 0x03fc))
+#define PPC_JMP(dest)\
+   do {  \
+   long offset = (long)(dest) - (ctx->idx * 4);   \
+   if (!is_offset_in_branch_range(offset)) { \
+   pr_err_ratelimited("Branch offset 0x%lx (@%u) out of 
range\n", offset, ctx->idx);  \


Does it really deserves a KERN_ERR ?


The intent is to ensure that we handle this when JIT'ing the BPF
instruction. One of the subsequent patches fixes the only scenario where 
we can hit this today. In practice, we should never hit this and if we 
do see this, then it is a bug with the JIT.



Isn't that something that can trigger with a userland request ?


This can't be triggered by unprivileged BPF programs since those are 
limited to 4096 BPF instructions. You need root privileges to load large 
enough BPF programs that can trigger out of range branches.



- Naveen

Re: [PATCH 1/9] powerpc/lib: Add helper to check if offset is within conditional branch range

2021-10-04 Thread Naveen N. Rao


Hi Christophe,
Thanks for the reviews.


Christophe Leroy wrote:



Le 01/10/2021 à 23:14, Naveen N. Rao a écrit :

Add a helper to check if a given offset is within the branch range for a
powerpc conditional branch instruction, and update some sites to use the
new helper.

Signed-off-by: Naveen N. Rao 
---
  arch/powerpc/include/asm/code-patching.h | 1 +
  arch/powerpc/lib/code-patching.c | 7 ++-
  arch/powerpc/net/bpf_jit.h   | 7 +--
  3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index a95f63788c6b14..4ba834599c4d4c 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -23,6 +23,7 @@
  #define BRANCH_ABSOLUTE   0x2
  
  bool is_offset_in_branch_range(long offset);

+bool is_offset_in_cond_branch_range(long offset);
  int create_branch(struct ppc_inst *instr, const u32 *addr,
  unsigned long target, int flags);
  int create_cond_branch(struct ppc_inst *instr, const u32 *addr,
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index f9a3019e37b43c..e2342b9a1ab9c9 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -228,6 +228,11 @@ bool is_offset_in_branch_range(long offset)
return (offset >= -0x200 && offset <= 0x1fc && !(offset & 0x3));
  }
  
+bool is_offset_in_cond_branch_range(long offset)

+{
+   return offset >= -0x8000 && offset <= 0x7FFF && !(offset & 0x3);
+}


Would be better without capital letters in numbers, in extenso 0x7fff 
instead of 0x7FFF


Ack.

- Naveen

Re: [PATCH 1/9] powerpc/lib: Add helper to check if offset is within conditional branch range

2021-10-04 Thread Naveen N. Rao


Hi Song,
Thanks for the reviews.


Song Liu wrote:

On Fri, Oct 1, 2021 at 2:16 PM Naveen N. Rao
 wrote:


Add a helper to check if a given offset is within the branch range for a
powerpc conditional branch instruction, and update some sites to use the
new helper.

Signed-off-by: Naveen N. Rao 


Acked-by: Song Liu 

With one nitpick:


---
 arch/powerpc/include/asm/code-patching.h | 1 +
 arch/powerpc/lib/code-patching.c | 7 ++-
 arch/powerpc/net/bpf_jit.h   | 7 +--
 3 files changed, 8 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/code-patching.h 
b/arch/powerpc/include/asm/code-patching.h
index a95f63788c6b14..4ba834599c4d4c 100644
--- a/arch/powerpc/include/asm/code-patching.h
+++ b/arch/powerpc/include/asm/code-patching.h
@@ -23,6 +23,7 @@
 #define BRANCH_ABSOLUTE0x2

 bool is_offset_in_branch_range(long offset);
+bool is_offset_in_cond_branch_range(long offset);
 int create_branch(struct ppc_inst *instr, const u32 *addr,
  unsigned long target, int flags);
 int create_cond_branch(struct ppc_inst *instr, const u32 *addr,
diff --git a/arch/powerpc/lib/code-patching.c b/arch/powerpc/lib/code-patching.c
index f9a3019e37b43c..e2342b9a1ab9c9 100644
--- a/arch/powerpc/lib/code-patching.c
+++ b/arch/powerpc/lib/code-patching.c
@@ -228,6 +228,11 @@ bool is_offset_in_branch_range(long offset)
return (offset >= -0x200 && offset <= 0x1fc && !(offset & 0x3));
 }

+bool is_offset_in_cond_branch_range(long offset)
+{
+   return offset >= -0x8000 && offset <= 0x7FFF && !(offset & 0x3);
+}


Why not inline this one?


Good point. This was modeled after the existing 
is_offset_in_branch_range(), and I guess both of those helpers can be 
inlined. I'll do a separate patch for that.



- Naveen

Re: [PATCH 01/10] dt-bindings: i2c: Add Apple I2C controller bindings

2021-10-04 Thread Rob Herring

On Sun, 26 Sep 2021 11:58:38 +0200, Sven Peter wrote:
> The Apple I2C controller is based on the PASemi I2C controller.
> It is present on Apple SoCs such as the M1.
> 
> Signed-off-by: Sven Peter 
> ---
>  .../devicetree/bindings/i2c/apple,i2c.yaml| 61 +++
>  MAINTAINERS   |  1 +
>  2 files changed, 62 insertions(+)
>  create mode 100644 Documentation/devicetree/bindings/i2c/apple,i2c.yaml
> 

Reviewed-by: Rob Herring

Re: [PATCH 3/3] powerpc: Set crashkernel offset to mid of RMA region

2021-10-04 Thread Sourabh Jain


Hello Aneesh,

@@ -1235,6 +1235,9 @@ int __init early_init_dt_scan_rtas(unsigned long 
node,

  entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL);
  sizep  = of_get_flat_dt_prop(node, "rtas-size", NULL);
  +    if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL))
+    powerpc_firmware_features |= FW_FEATURE_LPAR;
+


The equivalent check that we currently do more than checking 
ibm,hypertas-functions.


if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) {
    prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions",
   &len);
    if (prop) {
    powerpc_firmware_features |= FW_FEATURE_LPAR;
    fw_hypertas_feature_init(prop, len);
}

If ibm,hypertas-functions prop has to be part of rtas or rtas@0 node to 
decide we are on LPAR then how about splitting the probe_fw_features 
functions into two functions, one to detect FW_FEATURE_LPAR and another 
function to do the rest?


also do we expect other firmware features to be set along with 
FW_FEATURE_LPAR?



No only FW_FEATURE_LPAR feature so that kernel can decide the 
crashkernel offset accordingly.



Thanks for the review.

- Sourabh Jain

Re: [PATCH v3 2/2] powerpc/powermac: constify device_node in of_irq_parse_oldworld()

2021-10-04 Thread Rob Herring

On Fri, 24 Sep 2021 12:56:53 +0200, Krzysztof Kozlowski wrote:
> The of_irq_parse_oldworld() does not modify passed device_node so make
> it a pointer to const for safety.  Drop the extern while modifying the
> line.
> 
> Signed-off-by: Krzysztof Kozlowski 
> 
> ---
> 
> Changes since v1:
> 1. Drop extern.
> ---
>  arch/powerpc/platforms/powermac/pic.c | 2 +-
>  include/linux/of_irq.h| 6 +++---
>  2 files changed, 4 insertions(+), 4 deletions(-)
> 

Acked-by: Rob Herring

Re: [PATCH 2/5] memory: fsl_ifc: populate child devices without relying on simple-bus

2021-10-04 Thread Rob Herring

On Thu, Sep 30, 2021 at 07:09:21PM -0500, Li Yang wrote:
> After we update the binding to not use simple-bus compatible for the
> controller, we need the driver to populate the child devices explicitly.
> 
> Signed-off-by: Li Yang 
> ---
>  drivers/memory/fsl_ifc.c | 9 +
>  1 file changed, 9 insertions(+)
> 
> diff --git a/drivers/memory/fsl_ifc.c b/drivers/memory/fsl_ifc.c
> index d062c2f8250f..251d713cd50b 100644
> --- a/drivers/memory/fsl_ifc.c
> +++ b/drivers/memory/fsl_ifc.c
> @@ -88,6 +88,7 @@ static int fsl_ifc_ctrl_remove(struct platform_device *dev)
>  {
>   struct fsl_ifc_ctrl *ctrl = dev_get_drvdata(&dev->dev);
>  
> + of_platform_depopulate(&dev->dev);
>   free_irq(ctrl->nand_irq, ctrl);
>   free_irq(ctrl->irq, ctrl);
>  
> @@ -285,6 +286,14 @@ static int fsl_ifc_ctrl_probe(struct platform_device 
> *dev)
>   }
>   }
>  
> + /* legacy dts may still use "simple-bus" compatible */
> + if (!of_device_is_compatible(dev->dev.of_node, "simple-bus")) {
> + ret = of_platform_populate(dev->dev.of_node, NULL, NULL,
> + &dev->dev);

There's no need to make this conditional. of_platform_populate() is safe 
to call multiple times. If that doesn't work, it's a bug.

Rob

Re: [PATCH 3/3] powerpc: Set crashkernel offset to mid of RMA region

2021-10-04 Thread Aneesh Kumar K.V


On 10/4/21 20:41, Sourabh Jain wrote:

On large config LPARs (having 192 and more cores), Linux fails to boot
due to insufficient memory in the first memory block. It is due to the
reserve crashkernel area starts at 128MB offset by default and which
doesn't leave enough space in the first memory block to accommodate
memory for other essential system resources.

Given that the RMA region size can be 512MB or more, setting the
crashkernel offset to mid of RMA size will leave enough space to
kernel to allocate memory for other system resources in the first
memory block.

Signed-off-by: Sourabh Jain 
Reported-and-tested-by: Abdul haleem 
---
  arch/powerpc/kernel/rtas.c |  3 +++
  arch/powerpc/kexec/core.c  | 13 +
  2 files changed, 12 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kernel/rtas.c b/arch/powerpc/kernel/rtas.c
index ff80bbad22a5..ce5e62bb4d8e 100644
--- a/arch/powerpc/kernel/rtas.c
+++ b/arch/powerpc/kernel/rtas.c
@@ -1235,6 +1235,9 @@ int __init early_init_dt_scan_rtas(unsigned long node,
entryp = of_get_flat_dt_prop(node, "linux,rtas-entry", NULL);
sizep  = of_get_flat_dt_prop(node, "rtas-size", NULL);
  
+	if (of_get_flat_dt_prop(node, "ibm,hypertas-functions", NULL))

+   powerpc_firmware_features |= FW_FEATURE_LPAR;
+


The equivalent check that we currently do more than checking 
ibm,hypertas-functions.


if (!strcmp(uname, "rtas") || !strcmp(uname, "rtas@0")) {
prop = of_get_flat_dt_prop(node, "ibm,hypertas-functions",
   &len);
if (prop) {
powerpc_firmware_features |= FW_FEATURE_LPAR;
fw_hypertas_feature_init(prop, len);
}


also do we expect other firmware features to be set along with 
FW_FEATURE_LPAR?



if (basep && entryp && sizep) {
rtas.base = *basep;
rtas.entry = *entryp;
diff --git a/arch/powerpc/kexec/core.c b/arch/powerpc/kexec/core.c
index 48525e8b5730..f69cf3e370ec 100644
--- a/arch/powerpc/kexec/core.c
+++ b/arch/powerpc/kexec/core.c
@@ -147,11 +147,16 @@ void __init reserve_crashkernel(void)
if (!crashk_res.start) {
  #ifdef CONFIG_PPC64
/*
-* On 64bit we split the RMO in half but cap it at half of
-* a small SLB (128MB) since the crash kernel needs to place
-* itself and some stacks to be in the first segment.
+* crash kernel needs to placed in the first segment. On LPAR
+* setting crash kernel start to mid of RMA size (512MB or more)
+* would help primary kernel to boot properly on large config
+* LPAR (with core count 192 or more) and for the reset keep
+* cap the crash kernel start at 128MB offse.
 */
-   crashk_res.start = min(0x800ULL, (ppc64_rma_size / 2));
+   if (firmware_has_feature(FW_FEATURE_LPAR))
+   crashk_res.start = ppc64_rma_size / 2;
+   else
+   crashk_res.start = min(0x800ULL, (ppc64_rma_size / 
2));
  #else
crashk_res.start = KDUMP_KERNELBASE;
  #endif

[RFC PATCH] KVM: PPC: Book3S HV P9: Move H_CEDE logic mostly to one place

2021-10-04 Thread Nicholas Piggin

Move the vcpu->arch.ceded, hrtimer, and blocking handling to one place,
except the xive escalation rearm case. The only special case is the
xive handling, as it is to be done before the xive context is pulled.

This means the P9 path does not run with ceded==1 or the hrtimer armed
except in the kvmhv_handle_cede function, and hopefully cede handling is
a bit more understandable.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_ppc.h|   4 +-
 arch/powerpc/kvm/book3s_hv.c  | 137 +-
 arch/powerpc/kvm/book3s_hv_p9_entry.c |   3 +-
 arch/powerpc/kvm/book3s_xive.c|  26 ++---
 4 files changed, 88 insertions(+), 82 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 70ffcb3c91bf..e2e6cee9dddf 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -674,7 +674,7 @@ extern int kvmppc_xive_set_irq(struct kvm *kvm, int 
irq_source_id, u32 irq,
   int level, bool line_status);
 extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu);
 extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu);
-extern void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu);
+extern bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu);
 
 static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
 {
@@ -712,7 +712,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int 
irq_source_id, u32 ir
  int level, bool line_status) { return 
-ENODEV; }
 static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { }
 static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { }
-static inline void kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { }
+static inline bool kvmppc_xive_rearm_escalation(struct kvm_vcpu *vcpu) { 
return false; }
 
 static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 36c54f483a02..230f10b67f98 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1019,6 +1019,8 @@ static long kvmppc_h_rpt_invalidate(struct kvm_vcpu *vcpu,
return H_SUCCESS;
 }
 
+static int kvmhv_handle_cede(struct kvm_vcpu *vcpu);
+
 int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
 {
struct kvm *kvm = vcpu->kvm;
@@ -1080,7 +1082,9 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
break;
 
case H_CEDE:
+   ret = kvmhv_handle_cede(vcpu);
break;
+
case H_PROD:
target = kvmppc_get_gpr(vcpu, 4);
tvcpu = kvmppc_find_vcpu(kvm, target);
@@ -1292,25 +1296,6 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
 }
 
-/*
- * Handle H_CEDE in the P9 path where we don't call the real-mode hcall
- * handlers in book3s_hv_rmhandlers.S.
- *
- * This has to be done early, not in kvmppc_pseries_do_hcall(), so
- * that the cede logic in kvmppc_run_single_vcpu() works properly.
- */
-static void kvmppc_cede(struct kvm_vcpu *vcpu)
-{
-   vcpu->arch.shregs.msr |= MSR_EE;
-   vcpu->arch.ceded = 1;
-   smp_mb();
-   if (vcpu->arch.prodded) {
-   vcpu->arch.prodded = 0;
-   smp_mb();
-   vcpu->arch.ceded = 0;
-   }
-}
-
 static int kvmppc_hcall_impl_hv(unsigned long cmd)
 {
switch (cmd) {
@@ -2971,7 +2956,7 @@ static int kvmppc_core_check_requests_hv(struct kvm_vcpu 
*vcpu)
return 1;
 }
 
-static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
+static bool kvmppc_set_timer(struct kvm_vcpu *vcpu)
 {
unsigned long dec_nsec, now;
 
@@ -2980,11 +2965,12 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
/* decrementer has already gone negative */
kvmppc_core_queue_dec(vcpu);
kvmppc_core_prepare_to_enter(vcpu);
-   return;
+   return false;
}
dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
vcpu->arch.timer_running = 1;
+   return true;
 }
 
 extern int __kvmppc_vcore_entry(void);
@@ -4015,21 +4001,11 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
else if (*tb >= time_limit) /* nested time limit */
return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
 
-   vcpu->arch.ceded = 0;
-
vcpu_vpa_increment_dispatch(vcpu);
 
if (kvmhv_on_pseries()) {
trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
 
-   /* H_CEDE has to be handled now, not later */
-   if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested &&
-   kvmppc_get_gpr(vcpu, 3) == H_CEDE) {
-   kvmppc_cede(vcpu);
-   kvmppc_set_gpr(vcpu, 3, 0);
-   trap = 0;
-

[PATCH v3 52/52] KVM: PPC: Book3S HV P9: Remove subcore HMI handling

2021-10-04 Thread Nicholas Piggin

On POWER9 and newer, rather than the complex HMI synchronisation and
subcore state, have each thread un-apply the guest TB offset before
calling into the early HMI handler.

This allows the subcore state to be avoided, including subcore enter
/ exit guest, which includes an expensive divide that shows up
slightly in profiles.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_ppc.h|  1 +
 arch/powerpc/kvm/book3s_hv.c  | 12 +++---
 arch/powerpc/kvm/book3s_hv_hmi.c  |  7 +++-
 arch/powerpc/kvm/book3s_hv_p9_entry.c |  2 +-
 arch/powerpc/kvm/book3s_hv_ras.c  | 54 +++
 5 files changed, 67 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 671fbd1a765e..70ffcb3c91bf 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -760,6 +760,7 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu);
 void kvmppc_subcore_enter_guest(void);
 void kvmppc_subcore_exit_guest(void);
 long kvmppc_realmode_hmi_handler(void);
+long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu);
 long kvmppc_h_enter(struct kvm_vcpu *vcpu, unsigned long flags,
 long pte_index, unsigned long pteh, unsigned long ptel);
 long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 351018f617fb..449ac0a19ceb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4014,8 +4014,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
vcpu->arch.ceded = 0;
 
-   kvmppc_subcore_enter_guest();
-
vcpu_vpa_increment_dispatch(vcpu);
 
if (kvmhv_on_pseries()) {
@@ -4068,8 +4066,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
vcpu_vpa_increment_dispatch(vcpu);
 
-   kvmppc_subcore_exit_guest();
-
return trap;
 }
 
@@ -6069,9 +6065,11 @@ static int kvmppc_book3s_init_hv(void)
if (r)
return r;
 
-   r = kvm_init_subcore_bitmap();
-   if (r)
-   return r;
+   if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+   r = kvm_init_subcore_bitmap();
+   if (r)
+   return r;
+   }
 
/*
 * We need a way of accessing the XICS interrupt controller,
diff --git a/arch/powerpc/kvm/book3s_hv_hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
index 9af660476314..1ec50c69678b 100644
--- a/arch/powerpc/kvm/book3s_hv_hmi.c
+++ b/arch/powerpc/kvm/book3s_hv_hmi.c
@@ -20,10 +20,15 @@ void wait_for_subcore_guest_exit(void)
 
/*
 * NULL bitmap pointer indicates that KVM module hasn't
-* been loaded yet and hence no guests are running.
+* been loaded yet and hence no guests are running, or running
+* on POWER9 or newer CPU.
+*
 * If no KVM is in use, no need to co-ordinate among threads
 * as all of them will always be in host and no one is going
 * to modify TB other than the opal hmi handler.
+*
+* POWER9 and newer don't need this synchronisation.
+*
 * Hence, just return from here.
 */
if (!local_paca->sibling_subcore_state)
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index fbecbdc42c26..86a222f97e8e 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -938,7 +938,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
kvmppc_realmode_machine_check(vcpu);
 
} else if (unlikely(trap == BOOK3S_INTERRUPT_HMI)) {
-   kvmppc_realmode_hmi_handler();
+   kvmppc_p9_realmode_hmi_handler(vcpu);
 
} else if (trap == BOOK3S_INTERRUPT_H_EMUL_ASSIST) {
vcpu->arch.emul_inst = mfspr(SPRN_HEIR);
diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c
index d4bca93b79f6..ccfd96965630 100644
--- a/arch/powerpc/kvm/book3s_hv_ras.c
+++ b/arch/powerpc/kvm/book3s_hv_ras.c
@@ -136,6 +136,60 @@ void kvmppc_realmode_machine_check(struct kvm_vcpu *vcpu)
vcpu->arch.mce_evt = mce_evt;
 }
 
+
+long kvmppc_p9_realmode_hmi_handler(struct kvm_vcpu *vcpu)
+{
+   struct kvmppc_vcore *vc = vcpu->arch.vcore;
+   long ret = 0;
+
+   /*
+* Unapply and clear the offset first. That way, if the TB was not
+* resynced then it will remain in host-offset, and if it was resynced
+* then it is brought into host-offset. Then the tb offset is
+* re-applied before continuing with the KVM exit.
+*
+* This way, we don't need to actually know whether not OPAL resynced
+* the timebase or do any of the complicated dance that the P7/8
+* path requires.
+*/
+   if (vc->tb_offset_applied) {
+   u64 new_tb = m

[PATCH v3 51/52] KVM: PPC: Book3S HV P9: Stop using vc->dpdes

2021-10-04 Thread Nicholas Piggin

The P9 path uses vc->dpdes only for msgsndp / SMT emulation. This adds
an ordering requirement between vcpu->doorbell_request and vc->dpdes for
no real benefit. Use vcpu->doorbell_request directly.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  | 18 ++
 arch/powerpc/kvm/book3s_hv_builtin.c  |  2 ++
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 14 ++
 3 files changed, 22 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 57bf49c90e73..351018f617fb 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -761,6 +761,8 @@ static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
 
if (vcpu->arch.doorbell_request)
return true;
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   return false;
/*
 * Ensure that the read of vcore->dpdes comes after the read
 * of vcpu->doorbell_request.  This barrier matches the
@@ -2185,8 +2187,10 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
 * either vcore->dpdes or doorbell_request.
 * On POWER8, doorbell_request is 0.
 */
-   *val = get_reg_val(id, vcpu->arch.vcore->dpdes |
-  vcpu->arch.doorbell_request);
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   *val = get_reg_val(id, vcpu->arch.doorbell_request);
+   else
+   *val = get_reg_val(id, vcpu->arch.vcore->dpdes);
break;
case KVM_REG_PPC_VTB:
*val = get_reg_val(id, vcpu->arch.vcore->vtb);
@@ -2423,7 +2427,10 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
vcpu->arch.pspb = set_reg_val(id, *val);
break;
case KVM_REG_PPC_DPDES:
-   vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   vcpu->arch.doorbell_request = set_reg_val(id, *val) & 1;
+   else
+   vcpu->arch.vcore->dpdes = set_reg_val(id, *val);
break;
case KVM_REG_PPC_VTB:
vcpu->arch.vcore->vtb = set_reg_val(id, *val);
@@ -4472,11 +4479,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
 
if (!nested) {
kvmppc_core_prepare_to_enter(vcpu);
-   if (vcpu->arch.doorbell_request) {
-   vc->dpdes = 1;
-   smp_wmb();
-   vcpu->arch.doorbell_request = 0;
-   }
if (test_bit(BOOK3S_IRQPRIO_EXTERNAL,
 &vcpu->arch.pending_exceptions))
lpcr |= LPCR_MER;
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c 
b/arch/powerpc/kvm/book3s_hv_builtin.c
index fcf4760a3a0e..a4fc4b2d3806 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -649,6 +649,8 @@ void kvmppc_guest_entry_inject_int(struct kvm_vcpu *vcpu)
int ext;
unsigned long lpcr;
 
+   WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
/* Insert EXTERNAL bit into LPCR at the MER bit position */
ext = (vcpu->arch.pending_exceptions >> BOOK3S_IRQPRIO_EXTERNAL) & 1;
lpcr = mfspr(SPRN_LPCR);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 5a71532a3adf..fbecbdc42c26 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -705,6 +705,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
unsigned long host_pidr;
unsigned long host_dawr1;
unsigned long host_dawrx1;
+   unsigned long dpdes;
 
hdec = time_limit - *tb;
if (hdec < 0)
@@ -767,8 +768,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
if (vc->pcr)
mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
-   if (vc->dpdes)
-   mtspr(SPRN_DPDES, vc->dpdes);
+   if (vcpu->arch.doorbell_request) {
+   vcpu->arch.doorbell_request = 0;
+   mtspr(SPRN_DPDES, 1);
+   }
 
if (dawr_enabled()) {
if (vcpu->arch.dawr0 != host_dawr0)
@@ -999,7 +1002,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
 
-   vc->dpdes = mfspr(SPRN_DPDES);
+   dpdes = mfspr(SPRN_DPDES);
+   if (dpdes)
+   vcpu->arch.doorbell_request = 1;
+
vc->vtb = mfspr(SPRN_VTB);
 
dec = mfspr(SPRN_DEC);
@@ -1061,7 +1067,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
}
}
 
-   if (vc->dpdes)
+   if (dpdes

[PATCH v3 50/52] KVM: PPC: Book3S HV P9: Tidy kvmppc_create_dtl_entry

2021-10-04 Thread Nicholas Piggin

This goes further to removing vcores from the P9 path. Also avoid the
memset in favour of explicitly initialising all fields.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 61 +---
 1 file changed, 35 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d614f83c7b3f..57bf49c90e73 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -698,41 +698,30 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 
now)
return p;
 }
 
-static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
-   struct kvmppc_vcore *vc, u64 tb)
+static void __kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+   unsigned int pcpu, u64 now,
+   unsigned long stolen)
 {
struct dtl_entry *dt;
struct lppaca *vpa;
-   unsigned long stolen;
-   unsigned long core_stolen;
-   u64 now;
-   unsigned long flags;
 
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
-   now = tb;
-
-   if (cpu_has_feature(CPU_FTR_ARCH_300)) {
-   stolen = 0;
-   } else {
-   core_stolen = vcore_stolen_time(vc, now);
-   stolen = core_stolen - vcpu->arch.stolen_logged;
-   vcpu->arch.stolen_logged = core_stolen;
-   spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
-   stolen += vcpu->arch.busy_stolen;
-   vcpu->arch.busy_stolen = 0;
-   spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
-   }
 
if (!dt || !vpa)
return;
-   memset(dt, 0, sizeof(struct dtl_entry));
+
dt->dispatch_reason = 7;
-   dt->processor_id = cpu_to_be16(vc->pcpu + vcpu->arch.ptid);
-   dt->timebase = cpu_to_be64(now + vc->tb_offset);
+   dt->preempt_reason = 0;
+   dt->processor_id = cpu_to_be16(pcpu + vcpu->arch.ptid);
dt->enqueue_to_dispatch_time = cpu_to_be32(stolen);
+   dt->ready_to_enqueue_time = 0;
+   dt->waiting_to_ready_time = 0;
+   dt->timebase = cpu_to_be64(now);
+   dt->fault_addr = 0;
dt->srr0 = cpu_to_be64(kvmppc_get_pc(vcpu));
dt->srr1 = cpu_to_be64(vcpu->arch.shregs.msr);
+
++dt;
if (dt == vcpu->arch.dtl.pinned_end)
dt = vcpu->arch.dtl.pinned_addr;
@@ -743,6 +732,27 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
vcpu->arch.dtl.dirty = true;
 }
 
+static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
+   struct kvmppc_vcore *vc)
+{
+   unsigned long stolen;
+   unsigned long core_stolen;
+   u64 now;
+   unsigned long flags;
+
+   now = mftb();
+
+   core_stolen = vcore_stolen_time(vc, now);
+   stolen = core_stolen - vcpu->arch.stolen_logged;
+   vcpu->arch.stolen_logged = core_stolen;
+   spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+   stolen += vcpu->arch.busy_stolen;
+   vcpu->arch.busy_stolen = 0;
+   spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+
+   __kvmppc_create_dtl_entry(vcpu, vc->pcpu, now + vc->tb_offset, stolen);
+}
+
 /* See if there is a doorbell interrupt pending for a vcpu */
 static bool kvmppc_doorbell_pending(struct kvm_vcpu *vcpu)
 {
@@ -3750,7 +3760,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore 
*vc)
pvc->pcpu = pcpu + thr;
for_each_runnable_thread(i, vcpu, pvc) {
kvmppc_start_thread(vcpu, pvc);
-   kvmppc_create_dtl_entry(vcpu, pvc, mftb());
+   kvmppc_create_dtl_entry(vcpu, pvc);
trace_kvm_guest_enter(vcpu);
if (!vcpu->arch.ptid)
thr0_done = true;
@@ -4313,7 +4323,7 @@ static int kvmppc_run_vcpu(struct kvm_vcpu *vcpu)
if ((vc->vcore_state == VCORE_PIGGYBACK ||
 vc->vcore_state == VCORE_RUNNING) &&
   !VCORE_IS_EXITING(vc)) {
-   kvmppc_create_dtl_entry(vcpu, vc, mftb());
+   kvmppc_create_dtl_entry(vcpu, vc);
kvmppc_start_thread(vcpu, vc);
trace_kvm_guest_enter(vcpu);
} else if (vc->vcore_state == VCORE_SLEEPING) {
@@ -4490,8 +4500,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
local_paca->kvm_hstate.ptid = 0;
local_paca->kvm_hstate.fake_suspend = 0;
 
-   vc->pcpu = pcpu; // for kvmppc_create_dtl_entry
-   kvmppc_create_dtl_entry(vcpu, vc, tb);
+   __kvmppc_create_dtl_entry(vcpu, pcpu, tb + vc->tb_offset, 0);
 
trace_kvm_guest_enter(vcpu);
 
-- 
2.23.0

[PATCH v3 49/52] KVM: PPC: Book3S HV P9: Remove most of the vcore logic

2021-10-04 Thread Nicholas Piggin

The P9 path always uses one vcpu per vcore, so none of the vcore, locks,
stolen time, blocking logic, shared waitq, etc., is required.

Remove most of it.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 147 ---
 1 file changed, 85 insertions(+), 62 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6574e8a3731e..d614f83c7b3f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -276,6 +276,8 @@ static void kvmppc_core_start_stolen(struct kvmppc_vcore 
*vc, u64 tb)
 {
unsigned long flags;
 
+   WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
spin_lock_irqsave(&vc->stoltb_lock, flags);
vc->preempt_tb = tb;
spin_unlock_irqrestore(&vc->stoltb_lock, flags);
@@ -285,6 +287,8 @@ static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, 
u64 tb)
 {
unsigned long flags;
 
+   WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
spin_lock_irqsave(&vc->stoltb_lock, flags);
if (vc->preempt_tb != TB_NIL) {
vc->stolen_tb += tb - vc->preempt_tb;
@@ -297,7 +301,12 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu 
*vcpu, int cpu)
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long flags;
-   u64 now = mftb();
+   u64 now;
+
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   return;
+
+   now = mftb();
 
/*
 * We can test vc->runner without taking the vcore lock,
@@ -321,7 +330,12 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long flags;
-   u64 now = mftb();
+   u64 now;
+
+   if (cpu_has_feature(CPU_FTR_ARCH_300))
+   return;
+
+   now = mftb();
 
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
kvmppc_core_start_stolen(vc, now);
@@ -673,6 +687,8 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 
now)
u64 p;
unsigned long flags;
 
+   WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
spin_lock_irqsave(&vc->stoltb_lock, flags);
p = vc->stolen_tb;
if (vc->vcore_state != VCORE_INACTIVE &&
@@ -695,13 +711,19 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
now = tb;
-   core_stolen = vcore_stolen_time(vc, now);
-   stolen = core_stolen - vcpu->arch.stolen_logged;
-   vcpu->arch.stolen_logged = core_stolen;
-   spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
-   stolen += vcpu->arch.busy_stolen;
-   vcpu->arch.busy_stolen = 0;
-   spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+
+   if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+   stolen = 0;
+   } else {
+   core_stolen = vcore_stolen_time(vc, now);
+   stolen = core_stolen - vcpu->arch.stolen_logged;
+   vcpu->arch.stolen_logged = core_stolen;
+   spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
+   stolen += vcpu->arch.busy_stolen;
+   vcpu->arch.busy_stolen = 0;
+   spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
+   }
+
if (!dt || !vpa)
return;
memset(dt, 0, sizeof(struct dtl_entry));
@@ -898,13 +920,14 @@ static int kvm_arch_vcpu_yield_to(struct kvm_vcpu *target)
 * mode handler is not called but no other threads are in the
 * source vcore.
 */
-
-   spin_lock(&vcore->lock);
-   if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
-   vcore->vcore_state != VCORE_INACTIVE &&
-   vcore->runner)
-   target = vcore->runner;
-   spin_unlock(&vcore->lock);
+   if (!cpu_has_feature(CPU_FTR_ARCH_300)) {
+   spin_lock(&vcore->lock);
+   if (target->arch.state == KVMPPC_VCPU_RUNNABLE &&
+   vcore->vcore_state != VCORE_INACTIVE &&
+   vcore->runner)
+   target = vcore->runner;
+   spin_unlock(&vcore->lock);
+   }
 
return kvm_vcpu_yield_to(target);
 }
@@ -3125,13 +3148,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, 
struct kvmppc_vcore *vc)
kvmppc_ipi_thread(cpu);
 }
 
-/* Old path does this in asm */
-static void kvmppc_stop_thread(struct kvm_vcpu *vcpu)
-{
-   vcpu->cpu = -1;
-   vcpu->arch.thread_cpu = -1;
-}
-
 static void kvmppc_wait_for_nap(int n_threads)
 {
int cpu = smp_processor_id();
@@ -3220,6 +3236,8 @@ static void kvmppc_vcore_preempt(struct kvmppc_vcore *vc)
 {
struct preempted_vcore_list *lp = this_cpu_ptr(&preempted_vcores);
 
+   WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300));
+
vc->vcore_state = VCORE_PREEMPT;
vc->pcpu = smp_processor_id();
if (vc->num_threads < threads_

[PATCH v3 48/52] KVM: PPC: Book3S HV P9: Avoid cpu_in_guest atomics on entry and exit

2021-10-04 Thread Nicholas Piggin

cpu_in_guest is set to determine if a CPU needs to be IPI'ed to exit
the guest and notice the need_tlb_flush bit.

This can be implemented as a global per-CPU pointer to the currently
running guest instead of per-guest cpumasks, saving 2 atomics per
entry/exit. P7/8 doesn't require cpu_in_guest, nor does a nested HV
(only the L0 does), so move it to the P9 HV path.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_book3s_64.h |  1 -
 arch/powerpc/include/asm/kvm_host.h  |  1 -
 arch/powerpc/kvm/book3s_hv.c | 38 +---
 3 files changed, 21 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 96f0fda50a07..fe07558173ef 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -44,7 +44,6 @@ struct kvm_nested_guest {
struct mutex tlb_lock;  /* serialize page faults and tlbies */
struct kvm_nested_guest *next;
cpumask_t need_tlb_flush;
-   cpumask_t cpu_in_guest;
short prev_cpu[NR_CPUS];
u8 radix;   /* is this nested guest radix */
 };
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 92925f82a1e3..4de418f6c0a2 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -287,7 +287,6 @@ struct kvm_arch {
u32 online_vcores;
atomic_t hpte_mod_interest;
cpumask_t need_tlb_flush;
-   cpumask_t cpu_in_guest;
u8 radix;
u8 fwnmi_enabled;
u8 secure_guest;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6e072e2e130a..6574e8a3731e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3009,30 +3009,33 @@ static void kvmppc_release_hwthread(int cpu)
tpaca->kvm_hstate.kvm_split_mode = NULL;
 }
 
+static DEFINE_PER_CPU(struct kvm *, cpu_in_guest);
+
 static void radix_flush_cpu(struct kvm *kvm, int cpu, struct kvm_vcpu *vcpu)
 {
struct kvm_nested_guest *nested = vcpu->arch.nested;
-   cpumask_t *cpu_in_guest;
int i;
 
cpu = cpu_first_tlb_thread_sibling(cpu);
-   if (nested) {
+   if (nested)
cpumask_set_cpu(cpu, &nested->need_tlb_flush);
-   cpu_in_guest = &nested->cpu_in_guest;
-   } else {
+   else
cpumask_set_cpu(cpu, &kvm->arch.need_tlb_flush);
-   cpu_in_guest = &kvm->arch.cpu_in_guest;
-   }
/*
-* Make sure setting of bit in need_tlb_flush precedes
-* testing of cpu_in_guest bits.  The matching barrier on
-* the other side is the first smp_mb() in kvmppc_run_core().
+* Make sure setting of bit in need_tlb_flush precedes testing of
+* cpu_in_guest. The matching barrier on the other side is hwsync
+* when switching to guest MMU mode, which happens between
+* cpu_in_guest being set to the guest kvm, and need_tlb_flush bit
+* being tested.
 */
smp_mb();
for (i = cpu; i <= cpu_last_tlb_thread_sibling(cpu);
-   i += cpu_tlb_thread_sibling_step())
-   if (cpumask_test_cpu(i, cpu_in_guest))
+   i += cpu_tlb_thread_sibling_step()) {
+   struct kvm *running = *per_cpu_ptr(&cpu_in_guest, i);
+
+   if (running == kvm)
smp_call_function_single(i, do_nothing, NULL, 1);
+   }
 }
 
 static void do_migrate_away_vcpu(void *arg)
@@ -3100,7 +3103,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, 
struct kvmppc_vcore *vc)
 {
int cpu;
struct paca_struct *tpaca;
-   struct kvm *kvm = vc->kvm;
 
cpu = vc->pcpu;
if (vcpu) {
@@ -3111,7 +3113,6 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, 
struct kvmppc_vcore *vc)
cpu += vcpu->arch.ptid;
vcpu->cpu = vc->pcpu;
vcpu->arch.thread_cpu = cpu;
-   cpumask_set_cpu(cpu, &kvm->arch.cpu_in_guest);
}
tpaca = paca_ptrs[cpu];
tpaca->kvm_hstate.kvm_vcpu = vcpu;
@@ -3829,7 +3830,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore 
*vc)
kvmppc_release_hwthread(pcpu + i);
if (sip && sip->napped[i])
kvmppc_ipi_thread(pcpu + i);
-   cpumask_clear_cpu(pcpu + i, &vc->kvm->arch.cpu_in_guest);
}
 
spin_unlock(&vc->lock);
@@ -3997,8 +3997,14 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
}
 
} else {
+   struct kvm *kvm = vcpu->kvm;
+
kvmppc_xive_push_vcpu(vcpu);
+
+   __this_cpu_write(cpu_in_guest, kvm);
trap = kvmhv_vcpu_entry_p9(vcpu, time_limit, lpcr, tb);
+   __this_cpu_write(cpu_in_guest, NULL

[PATCH v3 47/52] KVM: PPC: Book3S HV P9: Add unlikely annotation for !mmu_ready

2021-10-04 Thread Nicholas Piggin

The mmu will almost always be ready.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0bbef4587f41..6e072e2e130a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4408,7 +4408,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
vc->runner = vcpu;
 
/* See if the MMU is ready to go */
-   if (!kvm->arch.mmu_ready) {
+   if (unlikely(!kvm->arch.mmu_ready)) {
r = kvmhv_setup_mmu(vcpu);
if (r) {
run->exit_reason = KVM_EXIT_FAIL_ENTRY;
-- 
2.23.0

[PATCH v3 46/52] KVM: PPC: Book3S HV P9: Avoid changing MSR[RI] in entry and exit

2021-10-04 Thread Nicholas Piggin

kvm_hstate.in_guest provides the equivalent of MSR[RI]=0 protection,
and it covers the existing MSR[RI]=0 section in late entry and early
exit, so clearing and setting MSR[RI] in those cases does not
actually do anything useful.

Remove the RI manipulation and replace it with comments. Make the
in_guest memory accesses a bit closer to a proper critical section
pattern. This speeds up guest entry/exit performance.

This also removes the MSR[RI] warnings which aren't very interesting
and would cause crashes if they hit due to causing an interrupt in
non-recoverable code.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 50 ---
 1 file changed, 23 insertions(+), 27 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 99ce5805ea28..5a71532a3adf 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -829,7 +829,15 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 * But TM could be split out if this would be a significant benefit.
 */
 
-   local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_P9;
+   /*
+* MSR[RI] does not need to be cleared (and is not, for radix guests
+* with no prefetch bug), because in_guest is set. If we take a SRESET
+* or MCE with in_guest set but still in HV mode, then
+* kvmppc_p9_bad_interrupt handles the interrupt, which effectively
+* clears MSR[RI] and doesn't return.
+*/
+   WRITE_ONCE(local_paca->kvm_hstate.in_guest, KVM_GUEST_MODE_HV_P9);
+   barrier(); /* Open in_guest critical section */
 
/*
 * Hash host, hash guest, or radix guest with prefetch bug, all have
@@ -841,14 +849,10 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
save_clear_host_mmu(kvm);
 
-   if (kvm_is_radix(kvm)) {
+   if (kvm_is_radix(kvm))
switch_mmu_to_guest_radix(kvm, vcpu, lpcr);
-   if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
-   __mtmsrd(0, 1); /* clear RI */
-
-   } else {
+   else
switch_mmu_to_guest_hpt(kvm, vcpu, lpcr);
-   }
 
/* TLBIEL uses LPID=LPIDR, so run this after setting guest LPID */
kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested);
@@ -903,19 +907,16 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2;
 
/*
-* Only set RI after reading machine check regs (DAR, DSISR, SRR0/1)
-* and hstate scratch (which we need to move into exsave to make
-* re-entrant vs SRESET/MCE)
+* After reading machine check regs (DAR, DSISR, SRR0/1) and hstate
+* scratch (which we need to move into exsave to make re-entrant vs
+* SRESET/MCE), register state is protected from reentrancy. However
+* timebase, MMU, among other state is still set to guest, so don't
+* enable MSR[RI] here. It gets enabled at the end, after in_guest
+* is cleared.
+*
+* It is possible an NMI could come in here, which is why it is
+* important to save the above state early so it can be debugged.
 */
-   if (ri_set) {
-   if (unlikely(!(mfmsr() & MSR_RI))) {
-   __mtmsrd(MSR_RI, 1);
-   WARN_ON_ONCE(1);
-   }
-   } else {
-   WARN_ON_ONCE(mfmsr() & MSR_RI);
-   __mtmsrd(MSR_RI, 1);
-   }
 
vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)];
vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)];
@@ -973,13 +974,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 */
mtspr(SPRN_HSRR0, vcpu->arch.regs.nip);
mtspr(SPRN_HSRR1, vcpu->arch.shregs.msr);
-
-   /*
-* tm_return_to_guest re-loads SRR0/1, DAR,
-* DSISR after RI is cleared, in case they had
-* been clobbered by a MCE.
-*/
-   __mtmsrd(0, 1); /* clear RI */
goto tm_return_to_guest;
}
}
@@ -1079,7 +1073,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
 
-   local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
+   barrier(); /* Close in_guest critical section */
+   WRITE_ONCE(local_paca->kvm_hstate.in_guest, KVM_GUEST_MODE_NONE);
+   /* Interrupts are recoverable at this point */
 
/*
 * cp_abort is required if the processor suppo

[PATCH v3 45/52] KVM: PPC: Book3S HV P9: Optimise hash guest SLB saving

2021-10-04 Thread Nicholas Piggin

slbmfee/slbmfev instructions are very expensive, moreso than a regular
mfspr instruction, so minimising them significantly improves hash guest
exit performance. The slbmfev is only required if slbmfee found a valid
SLB entry.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 22 ++
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 646f487ebf97..99ce5805ea28 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -487,10 +487,22 @@ static void __accumulate_time(struct kvm_vcpu *vcpu, 
struct kvmhv_tb_accumulator
 #define accumulate_time(vcpu, next) do {} while (0)
 #endif
 
-static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev)
+static inline u64 mfslbv(unsigned int idx)
 {
-   asm volatile("slbmfev  %0,%1" : "=r" (*slbev) : "r" (idx));
-   asm volatile("slbmfee  %0,%1" : "=r" (*slbee) : "r" (idx));
+   u64 slbev;
+
+   asm volatile("slbmfev  %0,%1" : "=r" (slbev) : "r" (idx));
+
+   return slbev;
+}
+
+static inline u64 mfslbe(unsigned int idx)
+{
+   u64 slbee;
+
+   asm volatile("slbmfee  %0,%1" : "=r" (slbee) : "r" (idx));
+
+   return slbee;
 }
 
 static inline void mtslb(u64 slbee, u64 slbev)
@@ -620,8 +632,10 @@ static void save_clear_guest_mmu(struct kvm *kvm, struct 
kvm_vcpu *vcpu)
 */
for (i = 0; i < vcpu->arch.slb_nr; i++) {
u64 slbee, slbev;
-   mfslb(i, &slbee, &slbev);
+
+   slbee = mfslbe(i);
if (slbee & SLB_ESID_V) {
+   slbev = mfslbv(i);
vcpu->arch.slb[nr].orige = slbee | i;
vcpu->arch.slb[nr].origv = slbev;
nr++;
-- 
2.23.0

[PATCH v3 44/52] KVM: PPC: Book3S HV P9: Improve mfmsr performance on entry

2021-10-04 Thread Nicholas Piggin

Rearrange the MSR saving on entry so it does not follow the mtmsrd to
disable interrupts, avoiding a possible RAW scoreboard stall.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_book3s_64.h |  2 +
 arch/powerpc/kvm/book3s_hv.c | 18 ++-
 arch/powerpc/kvm/book3s_hv_p9_entry.c| 66 +++-
 3 files changed, 47 insertions(+), 39 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index 0a319ed9c2fd..96f0fda50a07 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -154,6 +154,8 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu 
*vcpu)
return radix;
 }
 
+unsigned long kvmppc_msr_hard_disable_set_facilities(struct kvm_vcpu *vcpu, 
unsigned long msr);
+
 int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long 
lpcr, u64 *tb);
 
 #define KVM_DEFAULT_HPT_ORDER  24  /* 16MB HPT by default */
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 342b4f125d03..0bbef4587f41 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3878,6 +3878,8 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
s64 dec;
int trap;
 
+   msr = mfmsr();
+
save_p9_host_os_sprs(&host_os_sprs);
 
/*
@@ -3888,24 +3890,10 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
 */
host_psscr = mfspr(SPRN_PSSCR_PR);
 
-   hard_irq_disable();
+   kvmppc_msr_hard_disable_set_facilities(vcpu, msr);
if (lazy_irq_pending())
return 0;
 
-   /* MSR bits may have been cleared by context switch */
-   msr = 0;
-   if (IS_ENABLED(CONFIG_PPC_FPU))
-   msr |= MSR_FP;
-   if (cpu_has_feature(CPU_FTR_ALTIVEC))
-   msr |= MSR_VEC;
-   if (cpu_has_feature(CPU_FTR_VSX))
-   msr |= MSR_VSX;
-   if ((cpu_has_feature(CPU_FTR_TM) ||
-   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
-   (vcpu->arch.hfscr & HFSCR_TM))
-   msr |= MSR_TM;
-   msr = msr_check_and_set(msr);
-
if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
msr = mfmsr(); /* TM restore can update msr */
 
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index a45ba584c734..646f487ebf97 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -632,6 +632,44 @@ static void save_clear_guest_mmu(struct kvm *kvm, struct 
kvm_vcpu *vcpu)
}
 }
 
+unsigned long kvmppc_msr_hard_disable_set_facilities(struct kvm_vcpu *vcpu, 
unsigned long msr)
+{
+   unsigned long msr_needed = 0;
+
+   msr &= ~MSR_EE;
+
+   /* MSR bits may have been cleared by context switch so must recheck */
+   if (IS_ENABLED(CONFIG_PPC_FPU))
+   msr_needed |= MSR_FP;
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   msr_needed |= MSR_VEC;
+   if (cpu_has_feature(CPU_FTR_VSX))
+   msr_needed |= MSR_VSX;
+   if ((cpu_has_feature(CPU_FTR_TM) ||
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
+   (vcpu->arch.hfscr & HFSCR_TM))
+   msr_needed |= MSR_TM;
+
+   /*
+* This could be combined with MSR[RI] clearing, but that expands
+* the unrecoverable window. It would be better to cover unrecoverable
+* with KVM bad interrupt handling rather than use MSR[RI] at all.
+*
+* Much more difficult and less worthwhile to combine with IR/DR
+* disable.
+*/
+   if ((msr & msr_needed) != msr_needed) {
+   msr |= msr_needed;
+   __mtmsrd(msr, 0);
+   } else {
+   __hard_irq_disable();
+   }
+   local_paca->irq_happened |= PACA_IRQ_HARD_DIS;
+
+   return msr;
+}
+EXPORT_SYMBOL_GPL(kvmppc_msr_hard_disable_set_facilities);
+
 int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long 
lpcr, u64 *tb)
 {
struct p9_host_os_sprs host_os_sprs;
@@ -665,6 +703,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
vcpu->arch.ceded = 0;
 
+   /* Save MSR for restore, with EE clear. */
+   msr = mfmsr() & ~MSR_EE;
+
host_hfscr = mfspr(SPRN_HFSCR);
host_ciabr = mfspr(SPRN_CIABR);
host_psscr = mfspr(SPRN_PSSCR_PR);
@@ -686,35 +727,12 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
save_p9_host_os_sprs(&host_os_sprs);
 
-   /*
-* This could be combined with MSR[RI] clearing, but that expands
-* the unrecoverable window. It would be better to cover unrecoverable
-* with KVM bad interrupt handling rather than use MSR[RI] at all.
-*
-* Much more difficult and less

[PATCH v3 43/52] KVM: PPC: Book3S HV Nested: Avoid extra mftb() in nested entry

2021-10-04 Thread Nicholas Piggin

mftb() is expensive and one can be avoided on nested guest dispatch.

If the time checking code distinguishes between the L0 timer and the
nested HV timer, then both can be tested in the same place with the
same mftb() value.

This also nicely illustrates the relationship between the L0 and nested
HV timers.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_asm.h  |  1 +
 arch/powerpc/kvm/book3s_hv.c| 12 
 arch/powerpc/kvm/book3s_hv_nested.c |  5 -
 3 files changed, 13 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_asm.h 
b/arch/powerpc/include/asm/kvm_asm.h
index fbbf3cec92e9..d68d71987d5c 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -79,6 +79,7 @@
 #define BOOK3S_INTERRUPT_FP_UNAVAIL0x800
 #define BOOK3S_INTERRUPT_DECREMENTER   0x900
 #define BOOK3S_INTERRUPT_HV_DECREMENTER0x980
+#define BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER 0x1980
 #define BOOK3S_INTERRUPT_DOORBELL  0xa00
 #define BOOK3S_INTERRUPT_SYSCALL   0xc00
 #define BOOK3S_INTERRUPT_TRACE 0xd00
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index d2a9c930f33e..342b4f125d03 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1486,6 +1486,10 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
run->ready_for_interrupt_injection = 1;
switch (vcpu->arch.trap) {
/* We're good on these - the host merely wanted to get our attention */
+   case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
+   WARN_ON_ONCE(1); /* Should never happen */
+   vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+   fallthrough;
case BOOK3S_INTERRUPT_HV_DECREMENTER:
vcpu->stat.dec_exits++;
r = RESUME_GUEST;
@@ -1814,6 +1818,12 @@ static int kvmppc_handle_nested_exit(struct kvm_vcpu 
*vcpu)
vcpu->stat.ext_intr_exits++;
r = RESUME_GUEST;
break;
+   /* These need to go to the nested HV */
+   case BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER:
+   vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
+   vcpu->stat.dec_exits++;
+   r = RESUME_HOST;
+   break;
/* SR/HMI/PMI are HV interrupts that host has handled. Resume guest.*/
case BOOK3S_INTERRUPT_HMI:
case BOOK3S_INTERRUPT_PERFMON:
@@ -3975,6 +3985,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
return BOOK3S_INTERRUPT_HV_DECREMENTER;
if (next_timer < time_limit)
time_limit = next_timer;
+   else if (*tb >= time_limit) /* nested time limit */
+   return BOOK3S_INTERRUPT_NESTED_HV_DECREMENTER;
 
vcpu->arch.ceded = 0;
 
diff --git a/arch/powerpc/kvm/book3s_hv_nested.c 
b/arch/powerpc/kvm/book3s_hv_nested.c
index 7bed0b91245e..e57c08b968c0 100644
--- a/arch/powerpc/kvm/book3s_hv_nested.c
+++ b/arch/powerpc/kvm/book3s_hv_nested.c
@@ -375,11 +375,6 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu)
vcpu->arch.ret = RESUME_GUEST;
vcpu->arch.trap = 0;
do {
-   if (mftb() >= hdec_exp) {
-   vcpu->arch.trap = BOOK3S_INTERRUPT_HV_DECREMENTER;
-   r = RESUME_HOST;
-   break;
-   }
r = kvmhv_run_single_vcpu(vcpu, hdec_exp, lpcr);
} while (is_kvmppc_resume_guest(r));
 
-- 
2.23.0

[PATCH v3 42/52] KVM: PPC: Book3S HV P9: Avoid tlbsync sequence on radix guest exit

2021-10-04 Thread Nicholas Piggin

Use the existing TLB flushing logic to IPI the previous CPU and run the
necessary barriers before running a guest vCPU on a new physical CPU,
to do the necessary radix GTSE barriers for handling the case of an
interrupted guest tlbie sequence.

This results in more IPIs than the TLB flush logic requires, but it's
a significant win for common case scheduling when the vCPU remains on
the same physical CPU.

This saves about 520 cycles (nearly 10%) on a guest entry+exit micro
benchmark on a POWER9.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  | 31 +++
 arch/powerpc/kvm/book3s_hv_p9_entry.c |  9 
 2 files changed, 27 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f10cb4167549..d2a9c930f33e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3025,6 +3025,25 @@ static void radix_flush_cpu(struct kvm *kvm, int cpu, 
struct kvm_vcpu *vcpu)
smp_call_function_single(i, do_nothing, NULL, 1);
 }
 
+static void do_migrate_away_vcpu(void *arg)
+{
+   struct kvm_vcpu *vcpu = arg;
+   struct kvm *kvm = vcpu->kvm;
+
+   /*
+* If the guest has GTSE, it may execute tlbie, so do a eieio; tlbsync;
+* ptesync sequence on the old CPU before migrating to a new one, in
+* case we interrupted the guest between a tlbie ; eieio ;
+* tlbsync; ptesync sequence.
+*
+* Otherwise, ptesync is sufficient.
+*/
+   if (kvm->arch.lpcr & LPCR_GTSE)
+   asm volatile("eieio; tlbsync; ptesync");
+   else
+   asm volatile("ptesync");
+}
+
 static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu *vcpu, int pcpu)
 {
struct kvm_nested_guest *nested = vcpu->arch.nested;
@@ -3052,10 +3071,14 @@ static void kvmppc_prepare_radix_vcpu(struct kvm_vcpu 
*vcpu, int pcpu)
 * so we use a single bit in .need_tlb_flush for all 4 threads.
 */
if (prev_cpu != pcpu) {
-   if (prev_cpu >= 0 &&
-   cpu_first_tlb_thread_sibling(prev_cpu) !=
-   cpu_first_tlb_thread_sibling(pcpu))
-   radix_flush_cpu(kvm, prev_cpu, vcpu);
+   if (prev_cpu >= 0) {
+   if (cpu_first_tlb_thread_sibling(prev_cpu) !=
+   cpu_first_tlb_thread_sibling(pcpu))
+   radix_flush_cpu(kvm, prev_cpu, vcpu);
+
+   smp_call_function_single(prev_cpu,
+   do_migrate_away_vcpu, vcpu, 1);
+   }
if (nested)
nested->prev_cpu[vcpu->arch.nested_vcpu_id] = pcpu;
else
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index eae9d806d704..a45ba584c734 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -1049,15 +1049,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_NONE;
 
-   if (kvm_is_radix(kvm)) {
-   /*
-* Since this is radix, do a eieio; tlbsync; ptesync sequence
-* in case we interrupted the guest between a tlbie and a
-* ptesync.
-*/
-   asm volatile("eieio; tlbsync; ptesync");
-   }
-
/*
 * cp_abort is required if the processor supports local copy-paste
 * to clear the copy buffer that was under control of the guest.
-- 
2.23.0

[PATCH v3 41/52] KVM: PPC: Book3S HV P9: Don't restore PSSCR if not needed

2021-10-04 Thread Nicholas Piggin

This also moves the PSSCR update in nested entry to avoid a SPR
scoreboard stall.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  |  7 +--
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 26 +++---
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f4445cc5a29a..f10cb4167549 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3876,7 +3876,9 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
msr = mfmsr(); /* TM restore can update msr */
 
-   mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+   if (vcpu->arch.psscr != host_psscr)
+   mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+
kvmhv_save_hv_regs(vcpu, &hvregs);
hvregs.lpcr = lpcr;
vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
@@ -3917,7 +3919,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
-   mtspr(SPRN_PSSCR_PR, host_psscr);
 
store_vcpu_state(vcpu);
 
@@ -3930,6 +3931,8 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
timer_rearm_host_dec(*tb);
 
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
+   if (vcpu->arch.psscr != host_psscr)
+   mtspr(SPRN_PSSCR_PR, host_psscr);
 
return trap;
 }
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 0f341011816c..eae9d806d704 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -649,6 +649,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
unsigned long host_dawr0;
unsigned long host_dawrx0;
unsigned long host_psscr;
+   unsigned long host_hpsscr;
unsigned long host_pidr;
unsigned long host_dawr1;
unsigned long host_dawrx1;
@@ -666,7 +667,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
host_hfscr = mfspr(SPRN_HFSCR);
host_ciabr = mfspr(SPRN_CIABR);
-   host_psscr = mfspr(SPRN_PSSCR);
+   host_psscr = mfspr(SPRN_PSSCR_PR);
+   if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+   host_hpsscr = mfspr(SPRN_PSSCR);
host_pidr = mfspr(SPRN_PID);
 
if (dawr_enabled()) {
@@ -750,8 +753,14 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
if (vcpu->arch.ciabr != host_ciabr)
mtspr(SPRN_CIABR, vcpu->arch.ciabr);
 
-   mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
- (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+
+   if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+   mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
+ (local_paca->kvm_hstate.fake_suspend << 
PSSCR_FAKE_SUSPEND_LG));
+   } else {
+   if (vcpu->arch.psscr != host_psscr)
+   mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+   }
 
mtspr(SPRN_HFSCR, vcpu->arch.hfscr);
 
@@ -957,7 +966,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
vcpu->arch.ic = mfspr(SPRN_IC);
vcpu->arch.pid = mfspr(SPRN_PID);
-   vcpu->arch.psscr = mfspr(SPRN_PSSCR) & PSSCR_GUEST_VIS;
+   vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
 
vcpu->arch.shregs.sprg0 = mfspr(SPRN_SPRG0);
vcpu->arch.shregs.sprg1 = mfspr(SPRN_SPRG1);
@@ -1003,9 +1012,12 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr);
mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr);
 
-   /* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
-   mtspr(SPRN_PSSCR, host_psscr |
- (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
+   if (cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+   /* Preserve PSSCR[FAKE_SUSPEND] until we've called 
kvmppc_save_tm_hv */
+   mtspr(SPRN_PSSCR, host_hpsscr |
+ (local_paca->kvm_hstate.fake_suspend << 
PSSCR_FAKE_SUSPEND_LG));
+   }
+
mtspr(SPRN_HFSCR, host_hfscr);
if (vcpu->arch.ciabr != host_ciabr)
mtspr(SPRN_CIABR, host_ciabr);
-- 
2.23.0

[PATCH v3 40/52] KVM: PPC: Book3S HV P9: Test dawr_enabled() before saving host DAWR SPRs

2021-10-04 Thread Nicholas Piggin

Some of the DAWR SPR access is already predicated on dawr_enabled(),
apply this to the remainder of the accesses.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 34 ---
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 323b692bbfe2..0f341011816c 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -666,13 +666,16 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
host_hfscr = mfspr(SPRN_HFSCR);
host_ciabr = mfspr(SPRN_CIABR);
-   host_dawr0 = mfspr(SPRN_DAWR0);
-   host_dawrx0 = mfspr(SPRN_DAWRX0);
host_psscr = mfspr(SPRN_PSSCR);
host_pidr = mfspr(SPRN_PID);
-   if (cpu_has_feature(CPU_FTR_DAWR1)) {
-   host_dawr1 = mfspr(SPRN_DAWR1);
-   host_dawrx1 = mfspr(SPRN_DAWRX1);
+
+   if (dawr_enabled()) {
+   host_dawr0 = mfspr(SPRN_DAWR0);
+   host_dawrx0 = mfspr(SPRN_DAWRX0);
+   if (cpu_has_feature(CPU_FTR_DAWR1)) {
+   host_dawr1 = mfspr(SPRN_DAWR1);
+   host_dawrx1 = mfspr(SPRN_DAWRX1);
+   }
}
 
local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
@@ -1006,15 +1009,18 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
mtspr(SPRN_HFSCR, host_hfscr);
if (vcpu->arch.ciabr != host_ciabr)
mtspr(SPRN_CIABR, host_ciabr);
-   if (vcpu->arch.dawr0 != host_dawr0)
-   mtspr(SPRN_DAWR0, host_dawr0);
-   if (vcpu->arch.dawrx0 != host_dawrx0)
-   mtspr(SPRN_DAWRX0, host_dawrx0);
-   if (cpu_has_feature(CPU_FTR_DAWR1)) {
-   if (vcpu->arch.dawr1 != host_dawr1)
-   mtspr(SPRN_DAWR1, host_dawr1);
-   if (vcpu->arch.dawrx1 != host_dawrx1)
-   mtspr(SPRN_DAWRX1, host_dawrx1);
+
+   if (dawr_enabled()) {
+   if (vcpu->arch.dawr0 != host_dawr0)
+   mtspr(SPRN_DAWR0, host_dawr0);
+   if (vcpu->arch.dawrx0 != host_dawrx0)
+   mtspr(SPRN_DAWRX0, host_dawrx0);
+   if (cpu_has_feature(CPU_FTR_DAWR1)) {
+   if (vcpu->arch.dawr1 != host_dawr1)
+   mtspr(SPRN_DAWR1, host_dawr1);
+   if (vcpu->arch.dawrx1 != host_dawrx1)
+   mtspr(SPRN_DAWRX1, host_dawrx1);
+   }
}
 
if (vc->dpdes)
-- 
2.23.0

[PATCH v3 39/52] KVM: PPC: Book3S HV P9: Comment and fix MMU context switching code

2021-10-04 Thread Nicholas Piggin

Tighten up partition switching code synchronisation and comments.

In particular, hwsync ; isync is required after the last access that is
performed in the context of a partition, before the partition is
switched away from.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_64_entry.S | 11 +--
 arch/powerpc/kvm/book3s_64_mmu_radix.c |  4 +++
 arch/powerpc/kvm/book3s_hv_p9_entry.c  | 40 +++---
 3 files changed, 42 insertions(+), 13 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_64_entry.S 
b/arch/powerpc/kvm/book3s_64_entry.S
index 983b8c18bc31..05e003eb5d90 100644
--- a/arch/powerpc/kvm/book3s_64_entry.S
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -374,11 +374,16 @@ END_MMU_FTR_SECTION_IFCLR(MMU_FTR_TYPE_RADIX)
 BEGIN_FTR_SECTION
mtspr   SPRN_DAWRX1,r10
 END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
-   mtspr   SPRN_PID,r10
 
/*
-* Switch to host MMU mode
+* Switch to host MMU mode (don't have the real host PID but we aren't
+* going back to userspace).
 */
+   hwsync
+   isync
+
+   mtspr   SPRN_PID,r10
+
ld  r10, HSTATE_KVM_VCPU(r13)
ld  r10, VCPU_KVM(r10)
lwz r10, KVM_HOST_LPID(r10)
@@ -389,6 +394,8 @@ END_FTR_SECTION_IFSET(CPU_FTR_DAWR1)
ld  r10, KVM_HOST_LPCR(r10)
mtspr   SPRN_LPCR,r10
 
+   isync
+
/*
 * Set GUEST_MODE_NONE so the handler won't branch to KVM, and clear
 * MSR_RI in r12 ([H]SRR1) so the handler won't try to return.
diff --git a/arch/powerpc/kvm/book3s_64_mmu_radix.c 
b/arch/powerpc/kvm/book3s_64_mmu_radix.c
index 16359525a40f..8cebe5542256 100644
--- a/arch/powerpc/kvm/book3s_64_mmu_radix.c
+++ b/arch/powerpc/kvm/book3s_64_mmu_radix.c
@@ -57,6 +57,8 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int 
pid,
 
preempt_disable();
 
+   asm volatile("hwsync" ::: "memory");
+   isync();
/* switch the lpid first to avoid running host with unallocated pid */
old_lpid = mfspr(SPRN_LPID);
if (old_lpid != lpid)
@@ -75,6 +77,8 @@ unsigned long __kvmhv_copy_tofrom_guest_radix(int lpid, int 
pid,
ret = __copy_to_user_inatomic((void __user *)to, from, n);
pagefault_enable();
 
+   asm volatile("hwsync" ::: "memory");
+   isync();
/* switch the pid first to avoid running host with unallocated pid */
if (quadrant == 1 && pid != old_pid)
mtspr(SPRN_PID, old_pid);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 093ac0453d91..323b692bbfe2 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -531,17 +531,19 @@ static void switch_mmu_to_guest_radix(struct kvm *kvm, 
struct kvm_vcpu *vcpu, u6
lpid = nested ? nested->shadow_lpid : kvm->arch.lpid;
 
/*
-* All the isync()s are overkill but trivially follow the ISA
-* requirements. Some can likely be replaced with justification
-* comment for why they are not needed.
+* Prior memory accesses to host PID Q3 must be completed before we
+* start switching, and stores must be drained to avoid not-my-LPAR
+* logic (see switch_mmu_to_host).
 */
+   asm volatile("hwsync" ::: "memory");
isync();
mtspr(SPRN_LPID, lpid);
-   isync();
mtspr(SPRN_LPCR, lpcr);
-   isync();
mtspr(SPRN_PID, vcpu->arch.pid);
-   isync();
+   /*
+* isync not required here because we are HRFID'ing to guest before
+* any guest context access, which is context synchronising.
+*/
 }
 
 static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, 
u64 lpcr)
@@ -551,25 +553,41 @@ static void switch_mmu_to_guest_hpt(struct kvm *kvm, 
struct kvm_vcpu *vcpu, u64
 
lpid = kvm->arch.lpid;
 
+   /*
+* See switch_mmu_to_guest_radix. ptesync should not be required here
+* even if the host is in HPT mode because speculative accesses would
+* not cause RC updates (we are in real mode).
+*/
+   asm volatile("hwsync" ::: "memory");
+   isync();
mtspr(SPRN_LPID, lpid);
mtspr(SPRN_LPCR, lpcr);
mtspr(SPRN_PID, vcpu->arch.pid);
 
for (i = 0; i < vcpu->arch.slb_max; i++)
mtslb(vcpu->arch.slb[i].orige, vcpu->arch.slb[i].origv);
-
-   isync();
+   /*
+* isync not required here, see switch_mmu_to_guest_radix.
+*/
 }
 
 static void switch_mmu_to_host(struct kvm *kvm, u32 pid)
 {
+   /*
+* The guest has exited, so guest MMU context is no longer being
+* non-speculatively accessed, but a hwsync is needed before the
+* mtLPIDR / mtPIDR switch, in order to ensure all stores are drained,
+* so the not-my-LPAR tlbie logic does not overlook them.
+*/
+   asm volatile("hwsync" ::: "memory");
isync();

[PATCH v3 38/52] KVM: PPC: Book3S HV P9: Use Linux SPR save/restore to manage some host SPRs

2021-10-04 Thread Nicholas Piggin

Linux implements SPR save/restore including storage space for registers
in the task struct for process context switching. Make use of this
similarly to the way we make use of the context switching fp/vec save
restore.

This improves code reuse, allows some stack space to be saved, and helps
with avoiding VRSAVE updates if they are not required.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/switch_to.h  |  1 +
 arch/powerpc/kernel/process.c |  6 ++
 arch/powerpc/kvm/book3s_hv.c  | 21 +-
 arch/powerpc/kvm/book3s_hv.h  |  3 -
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 93 +++
 5 files changed, 73 insertions(+), 51 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index e8013cd6b646..1f43ef696033 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -113,6 +113,7 @@ static inline void clear_task_ebb(struct task_struct *t)
 }
 
 void kvmppc_save_user_regs(void);
+void kvmppc_save_current_sprs(void);
 
 extern int set_thread_tidr(struct task_struct *t);
 
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 3fca321b820d..b2d191a8fbf9 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1182,6 +1182,12 @@ void kvmppc_save_user_regs(void)
 #endif
 }
 EXPORT_SYMBOL_GPL(kvmppc_save_user_regs);
+
+void kvmppc_save_current_sprs(void)
+{
+   save_sprs(¤t->thread);
+}
+EXPORT_SYMBOL_GPL(kvmppc_save_current_sprs);
 #endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
 
 static inline void restore_sprs(struct thread_struct *old_thread,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e9037f7a3737..f4445cc5a29a 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4540,9 +4540,6 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
struct kvm_run *run = vcpu->run;
int r;
int srcu_idx;
-   unsigned long ebb_regs[3] = {}; /* shut up GCC */
-   unsigned long user_tar = 0;
-   unsigned int user_vrsave;
struct kvm *kvm;
unsigned long msr;
 
@@ -4603,14 +4600,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
 
kvmppc_save_user_regs();
 
-   /* Save userspace EBB and other register values */
-   if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
-   ebb_regs[0] = mfspr(SPRN_EBBHR);
-   ebb_regs[1] = mfspr(SPRN_EBBRR);
-   ebb_regs[2] = mfspr(SPRN_BESCR);
-   user_tar = mfspr(SPRN_TAR);
-   }
-   user_vrsave = mfspr(SPRN_VRSAVE);
+   kvmppc_save_current_sprs();
 
vcpu->arch.waitp = &vcpu->arch.vcore->wait;
vcpu->arch.pgdir = kvm->mm->pgd;
@@ -4651,15 +4641,6 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
}
} while (is_kvmppc_resume_guest(r));
 
-   /* Restore userspace EBB and other register values */
-   if (cpu_has_feature(CPU_FTR_ARCH_207S)) {
-   mtspr(SPRN_EBBHR, ebb_regs[0]);
-   mtspr(SPRN_EBBRR, ebb_regs[1]);
-   mtspr(SPRN_BESCR, ebb_regs[2]);
-   mtspr(SPRN_TAR, user_tar);
-   }
-   mtspr(SPRN_VRSAVE, user_vrsave);
-
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
atomic_dec(&kvm->arch.vcpus_running);
 
diff --git a/arch/powerpc/kvm/book3s_hv.h b/arch/powerpc/kvm/book3s_hv.h
index d7485b9e9762..6b7f07d9026b 100644
--- a/arch/powerpc/kvm/book3s_hv.h
+++ b/arch/powerpc/kvm/book3s_hv.h
@@ -4,11 +4,8 @@
  * Privileged (non-hypervisor) host registers to save.
  */
 struct p9_host_os_sprs {
-   unsigned long dscr;
-   unsigned long tidr;
unsigned long iamr;
unsigned long amr;
-   unsigned long fscr;
 
unsigned int pmc1;
unsigned int pmc2;
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 8499e8a9ca8f..093ac0453d91 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -231,15 +231,26 @@ EXPORT_SYMBOL_GPL(switch_pmu_to_host);
 static void load_spr_state(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs)
 {
+   /* TAR is very fast */
mtspr(SPRN_TAR, vcpu->arch.tar);
 
+#ifdef CONFIG_ALTIVEC
+   if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
+   current->thread.vrsave != vcpu->arch.vrsave)
+   mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
+#endif
+
if (vcpu->arch.hfscr & HFSCR_EBB) {
-   mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
-   mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
-   mtspr(SPRN_BESCR, vcpu->arch.bescr);
+   if (current->thread.ebbhr != vcpu->arch.ebbhr)
+   mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+   if (current->thread.ebbrr != vcpu->arch.ebbrr)
+   mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+   if (current->thr

[PATCH v3 37/52] KVM: PPC: Book3S HV P9: Demand fault TM facility registers

2021-10-04 Thread Nicholas Piggin

Use HFSCR facility disabling to implement demand faulting for TM, with
a hysteresis counter similar to the load_fp etc counters in context
switching that implement the equivalent demand faulting for userspace
facilities.

This speeds up guest entry/exit by avoiding the register save/restore
when a guest is not frequently using them. When a guest does use them
often, there will be some additional demand fault overhead, but these
are not commonly used facilities.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_host.h   |  3 +++
 arch/powerpc/kvm/book3s_hv.c  | 26 --
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 15 +++
 3 files changed, 34 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 9c63eff35812..92925f82a1e3 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -580,6 +580,9 @@ struct kvm_vcpu_arch {
ulong ppr;
u32 pspb;
u8 load_ebb;
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   u8 load_tm;
+#endif
ulong fscr;
ulong shadow_fscr;
ulong ebbhr;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 070469867bf5..e9037f7a3737 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1446,6 +1446,16 @@ static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
 }
 
+static int kvmppc_tm_unavailable(struct kvm_vcpu *vcpu)
+{
+   if (!(vcpu->arch.hfscr_permitted & HFSCR_TM))
+   return EMULATE_FAIL;
+
+   vcpu->arch.hfscr |= HFSCR_TM;
+
+   return RESUME_GUEST;
+}
+
 static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
 struct task_struct *tsk)
 {
@@ -1739,6 +1749,8 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
r = kvmppc_pmu_unavailable(vcpu);
if (cause == FSCR_EBB_LG)
r = kvmppc_ebb_unavailable(vcpu);
+   if (cause == FSCR_TM_LG)
+   r = kvmppc_tm_unavailable(vcpu);
}
if (r == EMULATE_FAIL) {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
@@ -2783,9 +2795,9 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu 
*vcpu)
vcpu->arch.hfscr_permitted = vcpu->arch.hfscr;
 
/*
-* PM, EBB is demand-faulted so start with it clear.
+* PM, EBB, TM are demand-faulted so start with it clear.
 */
-   vcpu->arch.hfscr &= ~(HFSCR_PM | HFSCR_EBB);
+   vcpu->arch.hfscr &= ~(HFSCR_PM | HFSCR_EBB | HFSCR_TM);
 
kvmppc_mmu_book3s_hv_init(vcpu);
 
@@ -3855,8 +3867,9 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
msr |= MSR_VEC;
if (cpu_has_feature(CPU_FTR_VSX))
msr |= MSR_VSX;
-   if (cpu_has_feature(CPU_FTR_TM) ||
-   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+   if ((cpu_has_feature(CPU_FTR_TM) ||
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
+   (vcpu->arch.hfscr & HFSCR_TM))
msr |= MSR_TM;
msr = msr_check_and_set(msr);
 
@@ -4582,8 +4595,9 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
msr |= MSR_VEC;
if (cpu_has_feature(CPU_FTR_VSX))
msr |= MSR_VSX;
-   if (cpu_has_feature(CPU_FTR_TM) ||
-   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+   if ((cpu_has_feature(CPU_FTR_TM) ||
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) &&
+   (vcpu->arch.hfscr & HFSCR_TM))
msr |= MSR_TM;
msr = msr_check_and_set(msr);
 
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 929a7c336b09..8499e8a9ca8f 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -310,7 +310,7 @@ bool load_vcpu_state(struct kvm_vcpu *vcpu,
if (MSR_TM_ACTIVE(guest_msr)) {
kvmppc_restore_tm_hv(vcpu, guest_msr, true);
ret = true;
-   } else {
+   } else if (vcpu->arch.hfscr & HFSCR_TM) {
mtspr(SPRN_TEXASR, vcpu->arch.texasr);
mtspr(SPRN_TFHAR, vcpu->arch.tfhar);
mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
@@ -346,10 +346,16 @@ void store_vcpu_state(struct kvm_vcpu *vcpu)
unsigned long guest_msr = vcpu->arch.shregs.msr;
if (MSR_TM_ACTIVE(guest_msr)) {
kvmppc_save_tm_hv(vcpu, guest_msr, true);
-   } else {
+   } else if (vcpu->arch.hfscr & HFSCR_TM) {
vcpu->arch.texasr = mfspr(SPRN_TEXASR);
vcpu->arch.tfhar = mfspr(SPRN_

[PATCH v3 36/52] KVM: PPC: Book3S HV P9: Demand fault EBB facility registers

2021-10-04 Thread Nicholas Piggin

Use HFSCR facility disabling to implement demand faulting for EBB, with
a hysteresis counter similar to the load_fp etc counters in context
switching that implement the equivalent demand faulting for userspace
facilities.

This speeds up guest entry/exit by avoiding the register save/restore
when a guest is not frequently using them. When a guest does use them
often, there will be some additional demand fault overhead, but these
are not commonly used facilities.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_host.h   |  1 +
 arch/powerpc/kvm/book3s_hv.c  | 16 +--
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 28 +--
 3 files changed, 37 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index c5fc4d016695..9c63eff35812 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -579,6 +579,7 @@ struct kvm_vcpu_arch {
ulong cfar;
ulong ppr;
u32 pspb;
+   u8 load_ebb;
ulong fscr;
ulong shadow_fscr;
ulong ebbhr;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6fb941aa77f1..070469867bf5 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1436,6 +1436,16 @@ static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
return RESUME_GUEST;
 }
 
+static int kvmppc_ebb_unavailable(struct kvm_vcpu *vcpu)
+{
+   if (!(vcpu->arch.hfscr_permitted & HFSCR_EBB))
+   return EMULATE_FAIL;
+
+   vcpu->arch.hfscr |= HFSCR_EBB;
+
+   return RESUME_GUEST;
+}
+
 static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
 struct task_struct *tsk)
 {
@@ -1727,6 +1737,8 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
r = kvmppc_emulate_doorbell_instr(vcpu);
if (cause == FSCR_PM_LG)
r = kvmppc_pmu_unavailable(vcpu);
+   if (cause == FSCR_EBB_LG)
+   r = kvmppc_ebb_unavailable(vcpu);
}
if (r == EMULATE_FAIL) {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
@@ -2771,9 +2783,9 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu 
*vcpu)
vcpu->arch.hfscr_permitted = vcpu->arch.hfscr;
 
/*
-* PM is demand-faulted so start with it clear.
+* PM, EBB is demand-faulted so start with it clear.
 */
-   vcpu->arch.hfscr &= ~HFSCR_PM;
+   vcpu->arch.hfscr &= ~(HFSCR_PM | HFSCR_EBB);
 
kvmppc_mmu_book3s_hv_init(vcpu);
 
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index a23f09fa7d2d..929a7c336b09 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -232,9 +232,12 @@ static void load_spr_state(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs)
 {
mtspr(SPRN_TAR, vcpu->arch.tar);
-   mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
-   mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
-   mtspr(SPRN_BESCR, vcpu->arch.bescr);
+
+   if (vcpu->arch.hfscr & HFSCR_EBB) {
+   mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
+   mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
+   mtspr(SPRN_BESCR, vcpu->arch.bescr);
+   }
 
if (cpu_has_feature(CPU_FTR_P9_TIDR))
mtspr(SPRN_TIDR, vcpu->arch.tid);
@@ -265,9 +268,22 @@ static void load_spr_state(struct kvm_vcpu *vcpu,
 static void store_spr_state(struct kvm_vcpu *vcpu)
 {
vcpu->arch.tar = mfspr(SPRN_TAR);
-   vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
-   vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
-   vcpu->arch.bescr = mfspr(SPRN_BESCR);
+
+   if (vcpu->arch.hfscr & HFSCR_EBB) {
+   vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
+   vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
+   vcpu->arch.bescr = mfspr(SPRN_BESCR);
+   /*
+* This is like load_fp in context switching, turn off the
+* facility after it wraps the u8 to try avoiding saving
+* and restoring the registers each partition switch.
+*/
+   if (!vcpu->arch.nested) {
+   vcpu->arch.load_ebb++;
+   if (!vcpu->arch.load_ebb)
+   vcpu->arch.hfscr &= ~HFSCR_EBB;
+   }
+   }
 
if (cpu_has_feature(CPU_FTR_P9_TIDR))
vcpu->arch.tid = mfspr(SPRN_TIDR);
-- 
2.23.0

[PATCH v3 35/52] KVM: PPC: Book3S HV P9: More SPR speed improvements

2021-10-04 Thread Nicholas Piggin

This avoids more scoreboard stalls and reduces mtSPRs.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 73 ---
 1 file changed, 43 insertions(+), 30 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 67f57b03a896..a23f09fa7d2d 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -645,24 +645,29 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vc->tb_offset_applied = vc->tb_offset;
}
 
-   if (vc->pcr)
-   mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
-   mtspr(SPRN_DPDES, vc->dpdes);
mtspr(SPRN_VTB, vc->vtb);
-
mtspr(SPRN_PURR, vcpu->arch.purr);
mtspr(SPRN_SPURR, vcpu->arch.spurr);
 
+   if (vc->pcr)
+   mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
+   if (vc->dpdes)
+   mtspr(SPRN_DPDES, vc->dpdes);
+
if (dawr_enabled()) {
-   mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
-   mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
+   if (vcpu->arch.dawr0 != host_dawr0)
+   mtspr(SPRN_DAWR0, vcpu->arch.dawr0);
+   if (vcpu->arch.dawrx0 != host_dawrx0)
+   mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0);
if (cpu_has_feature(CPU_FTR_DAWR1)) {
-   mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
-   mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
+   if (vcpu->arch.dawr1 != host_dawr1)
+   mtspr(SPRN_DAWR1, vcpu->arch.dawr1);
+   if (vcpu->arch.dawrx1 != host_dawrx1)
+   mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1);
}
}
-   mtspr(SPRN_CIABR, vcpu->arch.ciabr);
-   mtspr(SPRN_IC, vcpu->arch.ic);
+   if (vcpu->arch.ciabr != host_ciabr)
+   mtspr(SPRN_CIABR, vcpu->arch.ciabr);
 
mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC |
  (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
@@ -881,20 +886,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vc->dpdes = mfspr(SPRN_DPDES);
vc->vtb = mfspr(SPRN_VTB);
 
-   save_clear_guest_mmu(kvm, vcpu);
-   switch_mmu_to_host(kvm, host_pidr);
-
-   /*
-* If we are in real mode, only switch MMU on after the MMU is
-* switched to host, to avoid the P9_RADIX_PREFETCH_BUG.
-*/
-   if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
-   vcpu->arch.shregs.msr & MSR_TS_MASK)
-   msr |= MSR_TS_S;
-   __mtmsrd(msr, 0);
-
-   store_vcpu_state(vcpu);
-
dec = mfspr(SPRN_DEC);
if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
dec = (s32) dec;
@@ -912,6 +903,22 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vc->tb_offset_applied = 0;
}
 
+   save_clear_guest_mmu(kvm, vcpu);
+   switch_mmu_to_host(kvm, host_pidr);
+
+   /*
+* Enable MSR here in order to have facilities enabled to save
+* guest registers. This enables MMU (if we were in realmode), so
+* only switch MMU on after the MMU is switched to host, to avoid
+* the P9_RADIX_PREFETCH_BUG or hash guest context.
+*/
+   if (IS_ENABLED(CONFIG_PPC_TRANSACTIONAL_MEM) &&
+   vcpu->arch.shregs.msr & MSR_TS_MASK)
+   msr |= MSR_TS_S;
+   __mtmsrd(msr, 0);
+
+   store_vcpu_state(vcpu);
+
mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr);
mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr);
 
@@ -919,15 +926,21 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
mtspr(SPRN_PSSCR, host_psscr |
  (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
mtspr(SPRN_HFSCR, host_hfscr);
-   mtspr(SPRN_CIABR, host_ciabr);
-   mtspr(SPRN_DAWR0, host_dawr0);
-   mtspr(SPRN_DAWRX0, host_dawrx0);
+   if (vcpu->arch.ciabr != host_ciabr)
+   mtspr(SPRN_CIABR, host_ciabr);
+   if (vcpu->arch.dawr0 != host_dawr0)
+   mtspr(SPRN_DAWR0, host_dawr0);
+   if (vcpu->arch.dawrx0 != host_dawrx0)
+   mtspr(SPRN_DAWRX0, host_dawrx0);
if (cpu_has_feature(CPU_FTR_DAWR1)) {
-   mtspr(SPRN_DAWR1, host_dawr1);
-   mtspr(SPRN_DAWRX1, host_dawrx1);
+   if (vcpu->arch.dawr1 != host_dawr1)
+   mtspr(SPRN_DAWR1, host_dawr1);
+   if (vcpu->arch.dawrx1 != host_dawrx1)
+   mtspr(SPRN_DAWRX1, host_dawrx1);
}
 
-   mtspr(SPRN_DPDES, 0);
+   if (vc->dpdes)
+   mtspr(SPRN_DPDES, 0);
if (vc->pcr)
mtspr(SPRN_PCR, PCR_MASK);
 
-- 
2

[PATCH v3 34/52] KVM: PPC: Book3S HV P9: Restrict DSISR canary workaround to processors that require it

2021-10-04 Thread Nicholas Piggin

Use CPU_FTR_P9_RADIX_PREFETCH_BUG to apply the workaround, to test for
DD2.1 and below processors. This saves a mtSPR in guest entry.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  | 3 ++-
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 6 --
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 5a1859311b3e..6fb941aa77f1 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1590,7 +1590,8 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
unsigned long vsid;
long err;
 
-   if (vcpu->arch.fault_dsisr == HDSISR_CANARY) {
+   if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) &&
+   unlikely(vcpu->arch.fault_dsisr == HDSISR_CANARY)) {
r = RESUME_GUEST; /* Just retry if it's the canary */
break;
}
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 619bbcd47b92..67f57b03a896 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -683,9 +683,11 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 * HDSI which should correctly update the HDSISR the second time HDSI
 * entry.
 *
-* Just do this on all p9 processors for now.
+* The "radix prefetch bug" test can be used to test for this bug, as
+* it also exists fo DD2.1 and below.
 */
-   mtspr(SPRN_HDSISR, HDSISR_CANARY);
+   if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG))
+   mtspr(SPRN_HDSISR, HDSISR_CANARY);
 
mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0);
mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1);
-- 
2.23.0

[PATCH v3 33/52] KVM: PPC: Book3S HV P9: Switch PMU to guest as late as possible

2021-10-04 Thread Nicholas Piggin

This moves PMU switch to guest as late as possible in entry, and switch
back to host as early as possible at exit. This helps the host get the
most perf coverage of KVM entry/exit code as possible.

This is slightly suboptimal for SPR scheduling point of view when the
PMU is enabled, but when perf is disabled there is no real difference.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  | 6 ++
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 6 ++
 2 files changed, 4 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index db42eeb27c15..5a1859311b3e 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3820,8 +3820,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
s64 dec;
int trap;
 
-   switch_pmu_to_guest(vcpu, &host_os_sprs);
-
save_p9_host_os_sprs(&host_os_sprs);
 
/*
@@ -3884,9 +3882,11 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
 
mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+   switch_pmu_to_guest(vcpu, &host_os_sprs);
trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
  __pa(&vcpu->arch.regs));
kvmhv_restore_hv_return_state(vcpu, &hvregs);
+   switch_pmu_to_host(vcpu, &host_os_sprs);
vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
@@ -3905,8 +3905,6 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
 
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
 
-   switch_pmu_to_host(vcpu, &host_os_sprs);
-
return trap;
 }
 
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 6bef509bccb8..619bbcd47b92 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -601,8 +601,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
 
-   switch_pmu_to_guest(vcpu, &host_os_sprs);
-
save_p9_host_os_sprs(&host_os_sprs);
 
/*
@@ -744,7 +742,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
accumulate_time(vcpu, &vcpu->arch.guest_time);
 
+   switch_pmu_to_guest(vcpu, &host_os_sprs);
kvmppc_p9_enter_guest(vcpu);
+   switch_pmu_to_host(vcpu, &host_os_sprs);
 
accumulate_time(vcpu, &vcpu->arch.rm_intr);
 
@@ -955,8 +955,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
asm volatile(PPC_CP_ABORT);
 
 out:
-   switch_pmu_to_host(vcpu, &host_os_sprs);
-
end_timing(vcpu);
 
return trap;
-- 
2.23.0

[PATCH v3 32/52] KVM: PPC: Book3S HV P9: Implement TM fastpath for guest entry/exit

2021-10-04 Thread Nicholas Piggin

If TM is not active, only TM register state needs to be saved and
restored, avoiding several mfmsr/mtmsrd instructions and improving
performance.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 27 +++
 1 file changed, 23 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index fa080533bd8d..6bef509bccb8 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -287,11 +287,20 @@ bool load_vcpu_state(struct kvm_vcpu *vcpu,
 {
bool ret = false;
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (cpu_has_feature(CPU_FTR_TM) ||
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
-   kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
-   ret = true;
+   unsigned long guest_msr = vcpu->arch.shregs.msr;
+   if (MSR_TM_ACTIVE(guest_msr)) {
+   kvmppc_restore_tm_hv(vcpu, guest_msr, true);
+   ret = true;
+   } else {
+   mtspr(SPRN_TEXASR, vcpu->arch.texasr);
+   mtspr(SPRN_TFHAR, vcpu->arch.tfhar);
+   mtspr(SPRN_TFIAR, vcpu->arch.tfiar);
+   }
}
+#endif
 
load_spr_state(vcpu, host_os_sprs);
 
@@ -315,9 +324,19 @@ void store_vcpu_state(struct kvm_vcpu *vcpu)
 #endif
vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
 
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
if (cpu_has_feature(CPU_FTR_TM) ||
-   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
-   kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+   unsigned long guest_msr = vcpu->arch.shregs.msr;
+   if (MSR_TM_ACTIVE(guest_msr)) {
+   kvmppc_save_tm_hv(vcpu, guest_msr, true);
+   } else {
+   vcpu->arch.texasr = mfspr(SPRN_TEXASR);
+   vcpu->arch.tfhar = mfspr(SPRN_TFHAR);
+   vcpu->arch.tfiar = mfspr(SPRN_TFIAR);
+   }
+   }
+#endif
 }
 EXPORT_SYMBOL_GPL(store_vcpu_state);
 
-- 
2.23.0

[PATCH v3 31/52] KVM: PPC: Book3S HV P9: Move remaining SPR and MSR access into low level entry

2021-10-04 Thread Nicholas Piggin

Move register saving and loading from kvmhv_p9_guest_entry() into the HV
and nested entry handlers.

Accesses are scheduled to reduce mtSPR / mfSPR interleaving which
reduces SPR scoreboard stalls.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  | 79 ++
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 96 ---
 2 files changed, 109 insertions(+), 66 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index a57727463980..db42eeb27c15 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3814,9 +3814,15 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long host_psscr;
+   unsigned long msr;
struct hv_guest_state hvregs;
-   int trap;
+   struct p9_host_os_sprs host_os_sprs;
s64 dec;
+   int trap;
+
+   switch_pmu_to_guest(vcpu, &host_os_sprs);
+
+   save_p9_host_os_sprs(&host_os_sprs);
 
/*
 * We need to save and restore the guest visible part of the
@@ -3825,6 +3831,27 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
 * this is done in kvmhv_vcpu_entry_p9() below otherwise.
 */
host_psscr = mfspr(SPRN_PSSCR_PR);
+
+   hard_irq_disable();
+   if (lazy_irq_pending())
+   return 0;
+
+   /* MSR bits may have been cleared by context switch */
+   msr = 0;
+   if (IS_ENABLED(CONFIG_PPC_FPU))
+   msr |= MSR_FP;
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   msr |= MSR_VEC;
+   if (cpu_has_feature(CPU_FTR_VSX))
+   msr |= MSR_VSX;
+   if (cpu_has_feature(CPU_FTR_TM) ||
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+   msr |= MSR_TM;
+   msr = msr_check_and_set(msr);
+
+   if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
+   msr = mfmsr(); /* TM restore can update msr */
+
mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
kvmhv_save_hv_regs(vcpu, &hvregs);
hvregs.lpcr = lpcr;
@@ -3866,12 +3893,20 @@ static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu 
*vcpu, u64 time_limit, uns
vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
mtspr(SPRN_PSSCR_PR, host_psscr);
 
+   store_vcpu_state(vcpu);
+
dec = mfspr(SPRN_DEC);
if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
dec = (s32) dec;
*tb = mftb();
vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset);
 
+   timer_rearm_host_dec(*tb);
+
+   restore_p9_host_os_sprs(vcpu, &host_os_sprs);
+
+   switch_pmu_to_host(vcpu, &host_os_sprs);
+
return trap;
 }
 
@@ -3882,9 +3917,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 unsigned long lpcr, u64 *tb)
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
-   struct p9_host_os_sprs host_os_sprs;
u64 next_timer;
-   unsigned long msr;
int trap;
 
next_timer = timer_get_next_tb();
@@ -3895,33 +3928,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
vcpu->arch.ceded = 0;
 
-   save_p9_host_os_sprs(&host_os_sprs);
-
-   /*
-* This could be combined with MSR[RI] clearing, but that expands
-* the unrecoverable window. It would be better to cover unrecoverable
-* with KVM bad interrupt handling rather than use MSR[RI] at all.
-*
-* Much more difficult and less worthwhile to combine with IR/DR
-* disable.
-*/
-   hard_irq_disable();
-   if (lazy_irq_pending())
-   return 0;
-
-   /* MSR bits may have been cleared by context switch */
-   msr = 0;
-   if (IS_ENABLED(CONFIG_PPC_FPU))
-   msr |= MSR_FP;
-   if (cpu_has_feature(CPU_FTR_ALTIVEC))
-   msr |= MSR_VEC;
-   if (cpu_has_feature(CPU_FTR_VSX))
-   msr |= MSR_VSX;
-   if (cpu_has_feature(CPU_FTR_TM) ||
-   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
-   msr |= MSR_TM;
-   msr = msr_check_and_set(msr);
-
kvmppc_subcore_enter_guest();
 
vc->entry_exit_map = 1;
@@ -3929,11 +3935,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
vcpu_vpa_increment_dispatch(vcpu);
 
-   if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
-   msr = mfmsr(); /* MSR may have been updated */
-
-   switch_pmu_to_guest(vcpu, &host_os_sprs);
-
if (kvmhv_on_pseries()) {
trap = kvmhv_vcpu_entry_p9_nested(vcpu, time_limit, lpcr, tb);
 
@@ -3976,16 +3977,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vcpu->arch.slb_max = 0;
}
 
-   switch_pmu_to_host(vcpu, &host_os_sprs);
-
-   store_vcpu_stat

[PATCH v3 30/52] KVM: PPC: Book3S HV P9: Move nested guest entry into its own function

2021-10-04 Thread Nicholas Piggin

Move the part of the guest entry which is specific to nested HV into its
own function. This is just refactoring.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 125 +++
 1 file changed, 67 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 580bac4753f6..a57727463980 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3809,6 +3809,72 @@ static void vcpu_vpa_increment_dispatch(struct kvm_vcpu 
*vcpu)
}
 }
 
+/* call our hypervisor to load up HV regs and go */
+static int kvmhv_vcpu_entry_p9_nested(struct kvm_vcpu *vcpu, u64 time_limit, 
unsigned long lpcr, u64 *tb)
+{
+   struct kvmppc_vcore *vc = vcpu->arch.vcore;
+   unsigned long host_psscr;
+   struct hv_guest_state hvregs;
+   int trap;
+   s64 dec;
+
+   /*
+* We need to save and restore the guest visible part of the
+* psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
+* doesn't do this for us. Note only required if pseries since
+* this is done in kvmhv_vcpu_entry_p9() below otherwise.
+*/
+   host_psscr = mfspr(SPRN_PSSCR_PR);
+   mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
+   kvmhv_save_hv_regs(vcpu, &hvregs);
+   hvregs.lpcr = lpcr;
+   vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
+   hvregs.version = HV_GUEST_STATE_VERSION;
+   if (vcpu->arch.nested) {
+   hvregs.lpid = vcpu->arch.nested->shadow_lpid;
+   hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
+   } else {
+   hvregs.lpid = vcpu->kvm->arch.lpid;
+   hvregs.vcpu_token = vcpu->vcpu_id;
+   }
+   hvregs.hdec_expiry = time_limit;
+
+   /*
+* When setting DEC, we must always deal with irq_work_raise
+* via NMI vs setting DEC. The problem occurs right as we
+* switch into guest mode if a NMI hits and sets pending work
+* and sets DEC, then that will apply to the guest and not
+* bring us back to the host.
+*
+* irq_work_raise could check a flag (or possibly LPCR[HDICE]
+* for example) and set HDEC to 1? That wouldn't solve the
+* nested hv case which needs to abort the hcall or zero the
+* time limit.
+*
+* XXX: Another day's problem.
+*/
+   mtspr(SPRN_DEC, kvmppc_dec_expires_host_tb(vcpu) - *tb);
+
+   mtspr(SPRN_DAR, vcpu->arch.shregs.dar);
+   mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr);
+   trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs),
+ __pa(&vcpu->arch.regs));
+   kvmhv_restore_hv_return_state(vcpu, &hvregs);
+   vcpu->arch.shregs.msr = vcpu->arch.regs.msr;
+   vcpu->arch.shregs.dar = mfspr(SPRN_DAR);
+   vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR);
+   vcpu->arch.psscr = mfspr(SPRN_PSSCR_PR);
+   mtspr(SPRN_PSSCR_PR, host_psscr);
+
+   dec = mfspr(SPRN_DEC);
+   if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
+   dec = (s32) dec;
+   *tb = mftb();
+   vcpu->arch.dec_expires = dec + (*tb + vc->tb_offset);
+
+   return trap;
+}
+
 /*
  * Guest entry for POWER9 and later CPUs.
  */
@@ -3817,7 +3883,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
struct p9_host_os_sprs host_os_sprs;
-   s64 dec;
u64 next_timer;
unsigned long msr;
int trap;
@@ -3870,63 +3935,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
switch_pmu_to_guest(vcpu, &host_os_sprs);
 
if (kvmhv_on_pseries()) {
-   /*
-* We need to save and restore the guest visible part of the
-* psscr (i.e. using SPRN_PSSCR_PR) since the hypervisor
-* doesn't do this for us. Note only required if pseries since
-* this is done in kvmhv_vcpu_entry_p9() below otherwise.
-*/
-   unsigned long host_psscr;
-   /* call our hypervisor to load up HV regs and go */
-   struct hv_guest_state hvregs;
-
-   host_psscr = mfspr(SPRN_PSSCR_PR);
-   mtspr(SPRN_PSSCR_PR, vcpu->arch.psscr);
-   kvmhv_save_hv_regs(vcpu, &hvregs);
-   hvregs.lpcr = lpcr;
-   vcpu->arch.regs.msr = vcpu->arch.shregs.msr;
-   hvregs.version = HV_GUEST_STATE_VERSION;
-   if (vcpu->arch.nested) {
-   hvregs.lpid = vcpu->arch.nested->shadow_lpid;
-   hvregs.vcpu_token = vcpu->arch.nested_vcpu_id;
-   } else {
-   hvregs.lpid = vcpu->kvm->arch.lpid;
-   hvregs.vcpu_token = vcpu->vcpu_id;
-   }
-   hvregs.hdec_expiry = time_limit;
-
-   /*
-

[PATCH v3 29/52] KVM: PPC: Book3S HV P9: Move host OS save/restore functions to built-in

2021-10-04 Thread Nicholas Piggin

Move the P9 guest/host register switching functions to the built-in
P9 entry code, and export it for nested to use as well.

This allows more flexibility in scheduling these supervisor privileged
SPR accesses with the HV privileged and PR SPR accesses in the low level
entry code.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  | 379 +-
 arch/powerpc/kvm/book3s_hv.h  |  45 +++
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 353 
 3 files changed, 399 insertions(+), 378 deletions(-)
 create mode 100644 arch/powerpc/kvm/book3s_hv.h

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 8d721baf8c6b..580bac4753f6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -80,6 +80,7 @@
 #include 
 
 #include "book3s.h"
+#include "book3s_hv.h"
 
 #define CREATE_TRACE_POINTS
 #include "trace_hv.h"
@@ -127,11 +128,6 @@ static bool nested = true;
 module_param(nested, bool, S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(nested, "Enable nested virtualization (only on POWER9)");
 
-static inline bool nesting_enabled(struct kvm *kvm)
-{
-   return kvm->arch.nested_enable && kvm_is_radix(kvm);
-}
-
 static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu);
 
 /*
@@ -3797,379 +3793,6 @@ static noinline void kvmppc_run_core(struct 
kvmppc_vcore *vc)
trace_kvmppc_run_core(vc, 1);
 }
 
-/*
- * Privileged (non-hypervisor) host registers to save.
- */
-struct p9_host_os_sprs {
-   unsigned long dscr;
-   unsigned long tidr;
-   unsigned long iamr;
-   unsigned long amr;
-   unsigned long fscr;
-
-   unsigned int pmc1;
-   unsigned int pmc2;
-   unsigned int pmc3;
-   unsigned int pmc4;
-   unsigned int pmc5;
-   unsigned int pmc6;
-   unsigned long mmcr0;
-   unsigned long mmcr1;
-   unsigned long mmcr2;
-   unsigned long mmcr3;
-   unsigned long mmcra;
-   unsigned long siar;
-   unsigned long sier1;
-   unsigned long sier2;
-   unsigned long sier3;
-   unsigned long sdar;
-};
-
-static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra)
-{
-   if (!(mmcr0 & MMCR0_FC))
-   goto do_freeze;
-   if (mmcra & MMCRA_SAMPLE_ENABLE)
-   goto do_freeze;
-   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
-   if (!(mmcr0 & MMCR0_PMCCEXT))
-   goto do_freeze;
-   if (!(mmcra & MMCRA_BHRB_DISABLE))
-   goto do_freeze;
-   }
-   return;
-
-do_freeze:
-   mmcr0 = MMCR0_FC;
-   mmcra = 0;
-   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
-   mmcr0 |= MMCR0_PMCCEXT;
-   mmcra = MMCRA_BHRB_DISABLE;
-   }
-
-   mtspr(SPRN_MMCR0, mmcr0);
-   mtspr(SPRN_MMCRA, mmcra);
-   isync();
-}
-
-static void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
-   struct p9_host_os_sprs *host_os_sprs)
-{
-   struct lppaca *lp;
-   int load_pmu = 1;
-
-   lp = vcpu->arch.vpa.pinned_addr;
-   if (lp)
-   load_pmu = lp->pmcregs_in_use;
-
-   /* Save host */
-   if (ppc_get_pmu_inuse()) {
-   /*
-* It might be better to put PMU handling (at least for the
-* host) in the perf subsystem because it knows more about what
-* is being used.
-*/
-
-   /* POWER9, POWER10 do not implement HPMC or SPMC */
-
-   host_os_sprs->mmcr0 = mfspr(SPRN_MMCR0);
-   host_os_sprs->mmcra = mfspr(SPRN_MMCRA);
-
-   freeze_pmu(host_os_sprs->mmcr0, host_os_sprs->mmcra);
-
-   host_os_sprs->pmc1 = mfspr(SPRN_PMC1);
-   host_os_sprs->pmc2 = mfspr(SPRN_PMC2);
-   host_os_sprs->pmc3 = mfspr(SPRN_PMC3);
-   host_os_sprs->pmc4 = mfspr(SPRN_PMC4);
-   host_os_sprs->pmc5 = mfspr(SPRN_PMC5);
-   host_os_sprs->pmc6 = mfspr(SPRN_PMC6);
-   host_os_sprs->mmcr1 = mfspr(SPRN_MMCR1);
-   host_os_sprs->mmcr2 = mfspr(SPRN_MMCR2);
-   host_os_sprs->sdar = mfspr(SPRN_SDAR);
-   host_os_sprs->siar = mfspr(SPRN_SIAR);
-   host_os_sprs->sier1 = mfspr(SPRN_SIER);
-
-   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
-   host_os_sprs->mmcr3 = mfspr(SPRN_MMCR3);
-   host_os_sprs->sier2 = mfspr(SPRN_SIER2);
-   host_os_sprs->sier3 = mfspr(SPRN_SIER3);
-   }
-   }
-
-#ifdef CONFIG_PPC_PSERIES
-   /* After saving PMU, before loading guest PMU, flip pmcregs_in_use */
-   if (kvmhv_on_pseries()) {
-   barrier();
-   get_lppaca()->pmcregs_in_use = load_pmu;
-   barrier();
-   }
-#endif
-
-   /*
-* Load guest. If the VPA said the PMCs are not in use but the guest
-* tried to access them anyway, HFSCR[PM] wi

[PATCH v3 28/52] KVM: PPC: Book3S HV P9: Move vcpu register save/restore into functions

2021-10-04 Thread Nicholas Piggin

This should be no functional difference but makes the caller easier
to read.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 65 +++-
 1 file changed, 41 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e817159cd53f..8d721baf8c6b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4095,6 +4095,44 @@ static void store_spr_state(struct kvm_vcpu *vcpu)
vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
 }
 
+/* Returns true if current MSR and/or guest MSR may have changed */
+static bool load_vcpu_state(struct kvm_vcpu *vcpu,
+  struct p9_host_os_sprs *host_os_sprs)
+{
+   bool ret = false;
+
+   if (cpu_has_feature(CPU_FTR_TM) ||
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
+   kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+   ret = true;
+   }
+
+   load_spr_state(vcpu, host_os_sprs);
+
+   load_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+   load_vr_state(&vcpu->arch.vr);
+#endif
+   mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
+
+   return ret;
+}
+
+static void store_vcpu_state(struct kvm_vcpu *vcpu)
+{
+   store_spr_state(vcpu);
+
+   store_fp_state(&vcpu->arch.fp);
+#ifdef CONFIG_ALTIVEC
+   store_vr_state(&vcpu->arch.vr);
+#endif
+   vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
+
+   if (cpu_has_feature(CPU_FTR_TM) ||
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+   kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+}
+
 static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
 {
host_os_sprs->dscr = mfspr(SPRN_DSCR);
@@ -4203,19 +4241,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
vcpu_vpa_increment_dispatch(vcpu);
 
-   if (cpu_has_feature(CPU_FTR_TM) ||
-   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
-   kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
-   msr = mfmsr(); /* TM restore can update msr */
-   }
-
-   load_spr_state(vcpu, &host_os_sprs);
-
-   load_fp_state(&vcpu->arch.fp);
-#ifdef CONFIG_ALTIVEC
-   load_vr_state(&vcpu->arch.vr);
-#endif
-   mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
+   if (unlikely(load_vcpu_state(vcpu, &host_os_sprs)))
+   msr = mfmsr(); /* MSR may have been updated */
 
switch_pmu_to_guest(vcpu, &host_os_sprs);
 
@@ -4319,17 +4346,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
switch_pmu_to_host(vcpu, &host_os_sprs);
 
-   store_spr_state(vcpu);
-
-   store_fp_state(&vcpu->arch.fp);
-#ifdef CONFIG_ALTIVEC
-   store_vr_state(&vcpu->arch.vr);
-#endif
-   vcpu->arch.vrsave = mfspr(SPRN_VRSAVE);
-
-   if (cpu_has_feature(CPU_FTR_TM) ||
-   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
-   kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+   store_vcpu_state(vcpu);
 
vcpu_vpa_increment_dispatch(vcpu);
 
-- 
2.23.0

[PATCH v3 27/52] KVM: PPC: Book3S HV P9: Juggle SPR switching around

2021-10-04 Thread Nicholas Piggin

This juggles SPR switching on the entry and exit sides to be more
symmetric, which makes the next refactoring patch possible with no
functional change.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 8 
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 460290cc79af..e817159cd53f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4209,7 +4209,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
msr = mfmsr(); /* TM restore can update msr */
}
 
-   switch_pmu_to_guest(vcpu, &host_os_sprs);
+   load_spr_state(vcpu, &host_os_sprs);
 
load_fp_state(&vcpu->arch.fp);
 #ifdef CONFIG_ALTIVEC
@@ -4217,7 +4217,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 #endif
mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 
-   load_spr_state(vcpu, &host_os_sprs);
+   switch_pmu_to_guest(vcpu, &host_os_sprs);
 
if (kvmhv_on_pseries()) {
/*
@@ -4317,6 +4317,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vcpu->arch.slb_max = 0;
}
 
+   switch_pmu_to_host(vcpu, &host_os_sprs);
+
store_spr_state(vcpu);
 
store_fp_state(&vcpu->arch.fp);
@@ -4331,8 +4333,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
vcpu_vpa_increment_dispatch(vcpu);
 
-   switch_pmu_to_host(vcpu, &host_os_sprs);
-
timer_rearm_host_dec(*tb);
 
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
-- 
2.23.0

[PATCH v3 26/52] KVM: PPC: Book3S HV P9: Only execute mtSPR if the value changed

2021-10-04 Thread Nicholas Piggin

Keep better track of the current SPR value in places where
they are to be loaded with a new context, to reduce expensive
mtSPR operations.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 51 ++--
 1 file changed, 31 insertions(+), 20 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 823d64047d01..460290cc79af 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4042,20 +4042,28 @@ static void switch_pmu_to_host(struct kvm_vcpu *vcpu,
}
 }
 
-static void load_spr_state(struct kvm_vcpu *vcpu)
+static void load_spr_state(struct kvm_vcpu *vcpu,
+   struct p9_host_os_sprs *host_os_sprs)
 {
-   mtspr(SPRN_DSCR, vcpu->arch.dscr);
-   mtspr(SPRN_IAMR, vcpu->arch.iamr);
-   mtspr(SPRN_PSPB, vcpu->arch.pspb);
-   mtspr(SPRN_FSCR, vcpu->arch.fscr);
mtspr(SPRN_TAR, vcpu->arch.tar);
mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
mtspr(SPRN_BESCR, vcpu->arch.bescr);
+
if (cpu_has_feature(CPU_FTR_P9_TIDR))
mtspr(SPRN_TIDR, vcpu->arch.tid);
-   mtspr(SPRN_AMR, vcpu->arch.amr);
-   mtspr(SPRN_UAMOR, vcpu->arch.uamor);
+   if (host_os_sprs->iamr != vcpu->arch.iamr)
+   mtspr(SPRN_IAMR, vcpu->arch.iamr);
+   if (host_os_sprs->amr != vcpu->arch.amr)
+   mtspr(SPRN_AMR, vcpu->arch.amr);
+   if (vcpu->arch.uamor != 0)
+   mtspr(SPRN_UAMOR, vcpu->arch.uamor);
+   if (host_os_sprs->fscr != vcpu->arch.fscr)
+   mtspr(SPRN_FSCR, vcpu->arch.fscr);
+   if (host_os_sprs->dscr != vcpu->arch.dscr)
+   mtspr(SPRN_DSCR, vcpu->arch.dscr);
+   if (vcpu->arch.pspb != 0)
+   mtspr(SPRN_PSPB, vcpu->arch.pspb);
 
/*
 * DAR, DSISR, and for nested HV, SPRGs must be set with MSR[RI]
@@ -4070,20 +4078,21 @@ static void load_spr_state(struct kvm_vcpu *vcpu)
 
 static void store_spr_state(struct kvm_vcpu *vcpu)
 {
-   vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
-
-   vcpu->arch.iamr = mfspr(SPRN_IAMR);
-   vcpu->arch.pspb = mfspr(SPRN_PSPB);
-   vcpu->arch.fscr = mfspr(SPRN_FSCR);
vcpu->arch.tar = mfspr(SPRN_TAR);
vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
vcpu->arch.bescr = mfspr(SPRN_BESCR);
+
if (cpu_has_feature(CPU_FTR_P9_TIDR))
vcpu->arch.tid = mfspr(SPRN_TIDR);
+   vcpu->arch.iamr = mfspr(SPRN_IAMR);
vcpu->arch.amr = mfspr(SPRN_AMR);
vcpu->arch.uamor = mfspr(SPRN_UAMOR);
+   vcpu->arch.fscr = mfspr(SPRN_FSCR);
vcpu->arch.dscr = mfspr(SPRN_DSCR);
+   vcpu->arch.pspb = mfspr(SPRN_PSPB);
+
+   vcpu->arch.ctrl = mfspr(SPRN_CTRLF);
 }
 
 static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
@@ -4094,6 +4103,7 @@ static void save_p9_host_os_sprs(struct p9_host_os_sprs 
*host_os_sprs)
host_os_sprs->iamr = mfspr(SPRN_IAMR);
host_os_sprs->amr = mfspr(SPRN_AMR);
host_os_sprs->fscr = mfspr(SPRN_FSCR);
+   host_os_sprs->dscr = mfspr(SPRN_DSCR);
 }
 
 /* vcpu guest regs must already be saved */
@@ -4102,19 +4112,20 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu 
*vcpu,
 {
mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
-   mtspr(SPRN_PSPB, 0);
-   mtspr(SPRN_UAMOR, 0);
-
-   mtspr(SPRN_DSCR, host_os_sprs->dscr);
if (cpu_has_feature(CPU_FTR_P9_TIDR))
mtspr(SPRN_TIDR, host_os_sprs->tidr);
-   mtspr(SPRN_IAMR, host_os_sprs->iamr);
-
+   if (host_os_sprs->iamr != vcpu->arch.iamr)
+   mtspr(SPRN_IAMR, host_os_sprs->iamr);
+   if (vcpu->arch.uamor != 0)
+   mtspr(SPRN_UAMOR, 0);
if (host_os_sprs->amr != vcpu->arch.amr)
mtspr(SPRN_AMR, host_os_sprs->amr);
-
if (host_os_sprs->fscr != vcpu->arch.fscr)
mtspr(SPRN_FSCR, host_os_sprs->fscr);
+   if (host_os_sprs->dscr != vcpu->arch.dscr)
+   mtspr(SPRN_DSCR, host_os_sprs->dscr);
+   if (vcpu->arch.pspb != 0)
+   mtspr(SPRN_PSPB, 0);
 
/* Save guest CTRL register, set runlatch to 1 */
if (!(vcpu->arch.ctrl & 1))
@@ -4206,7 +4217,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 #endif
mtspr(SPRN_VRSAVE, vcpu->arch.vrsave);
 
-   load_spr_state(vcpu);
+   load_spr_state(vcpu, &host_os_sprs);
 
if (kvmhv_on_pseries()) {
/*
-- 
2.23.0

[PATCH v3 25/52] KVM: PPC: Book3S HV P9: Avoid SPR scoreboard stalls

2021-10-04 Thread Nicholas Piggin

Avoid interleaving mfSPR and mtSPR to reduce SPR scoreboard stalls.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  |  8 
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 19 +++
 2 files changed, 15 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f3c052b8b7ee..823d64047d01 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4308,10 +4308,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
store_spr_state(vcpu);
 
-   timer_rearm_host_dec(*tb);
-
-   restore_p9_host_os_sprs(vcpu, &host_os_sprs);
-
store_fp_state(&vcpu->arch.fp);
 #ifdef CONFIG_ALTIVEC
store_vr_state(&vcpu->arch.vr);
@@ -4326,6 +4322,10 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
switch_pmu_to_host(vcpu, &host_os_sprs);
 
+   timer_rearm_host_dec(*tb);
+
+   restore_p9_host_os_sprs(vcpu, &host_os_sprs);
+
vc->entry_exit_map = 0x101;
vc->in_guest = 0;
 
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 2bd96d8256d1..bd0021cd3a67 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -228,6 +228,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
host_dawrx1 = mfspr(SPRN_DAWRX1);
}
 
+   local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
+   local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
+
if (vc->tb_offset) {
u64 new_tb = *tb + vc->tb_offset;
mtspr(SPRN_TBU40, new_tb);
@@ -244,8 +247,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
mtspr(SPRN_DPDES, vc->dpdes);
mtspr(SPRN_VTB, vc->vtb);
 
-   local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR);
-   local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR);
mtspr(SPRN_PURR, vcpu->arch.purr);
mtspr(SPRN_SPURR, vcpu->arch.spurr);
 
@@ -448,10 +449,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
/* Advance host PURR/SPURR by the amount used by guest */
purr = mfspr(SPRN_PURR);
spurr = mfspr(SPRN_SPURR);
-   mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr +
- purr - vcpu->arch.purr);
-   mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr +
- spurr - vcpu->arch.spurr);
+   local_paca->kvm_hstate.host_purr += purr - vcpu->arch.purr;
+   local_paca->kvm_hstate.host_spurr += spurr - vcpu->arch.spurr;
vcpu->arch.purr = purr;
vcpu->arch.spurr = spurr;
 
@@ -464,6 +463,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vcpu->arch.shregs.sprg2 = mfspr(SPRN_SPRG2);
vcpu->arch.shregs.sprg3 = mfspr(SPRN_SPRG3);
 
+   vc->dpdes = mfspr(SPRN_DPDES);
+   vc->vtb = mfspr(SPRN_VTB);
+
dec = mfspr(SPRN_DEC);
if (!(lpcr & LPCR_LD)) /* Sign extend if not using large decrementer */
dec = (s32) dec;
@@ -481,6 +483,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vc->tb_offset_applied = 0;
}
 
+   mtspr(SPRN_PURR, local_paca->kvm_hstate.host_purr);
+   mtspr(SPRN_SPURR, local_paca->kvm_hstate.host_spurr);
+
/* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
mtspr(SPRN_PSSCR, host_psscr |
  (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
@@ -509,8 +514,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
if (cpu_has_feature(CPU_FTR_ARCH_31))
asm volatile(PPC_CP_ABORT);
 
-   vc->dpdes = mfspr(SPRN_DPDES);
-   vc->vtb = mfspr(SPRN_VTB);
mtspr(SPRN_DPDES, 0);
if (vc->pcr)
mtspr(SPRN_PCR, PCR_MASK);
-- 
2.23.0

[PATCH v3 24/52] KVM: PPC: Book3S HV P9: Optimise timebase reads

2021-10-04 Thread Nicholas Piggin

Reduce the number of mfTB executed by passing the current timebase
around entry and exit code rather than read it multiple times.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_book3s_64.h |  2 +-
 arch/powerpc/kvm/book3s_hv.c | 88 +---
 arch/powerpc/kvm/book3s_hv_p9_entry.c| 33 +
 3 files changed, 65 insertions(+), 58 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h 
b/arch/powerpc/include/asm/kvm_book3s_64.h
index fff391b9b97b..0a319ed9c2fd 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -154,7 +154,7 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu 
*vcpu)
return radix;
 }
 
-int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long 
lpcr);
+int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long 
lpcr, u64 *tb);
 
 #define KVM_DEFAULT_HPT_ORDER  24  /* 16MB HPT by default */
 #endif
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 4abe4a24e5e7..f3c052b8b7ee 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -276,22 +276,22 @@ static void kvmppc_fast_vcpu_kick_hv(struct kvm_vcpu 
*vcpu)
  * they should never fail.)
  */
 
-static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc)
+static void kvmppc_core_start_stolen(struct kvmppc_vcore *vc, u64 tb)
 {
unsigned long flags;
 
spin_lock_irqsave(&vc->stoltb_lock, flags);
-   vc->preempt_tb = mftb();
+   vc->preempt_tb = tb;
spin_unlock_irqrestore(&vc->stoltb_lock, flags);
 }
 
-static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc)
+static void kvmppc_core_end_stolen(struct kvmppc_vcore *vc, u64 tb)
 {
unsigned long flags;
 
spin_lock_irqsave(&vc->stoltb_lock, flags);
if (vc->preempt_tb != TB_NIL) {
-   vc->stolen_tb += mftb() - vc->preempt_tb;
+   vc->stolen_tb += tb - vc->preempt_tb;
vc->preempt_tb = TB_NIL;
}
spin_unlock_irqrestore(&vc->stoltb_lock, flags);
@@ -301,6 +301,7 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu *vcpu, 
int cpu)
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long flags;
+   u64 now = mftb();
 
/*
 * We can test vc->runner without taking the vcore lock,
@@ -309,12 +310,12 @@ static void kvmppc_core_vcpu_load_hv(struct kvm_vcpu 
*vcpu, int cpu)
 * ever sets it to NULL.
 */
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
-   kvmppc_core_end_stolen(vc);
+   kvmppc_core_end_stolen(vc, now);
 
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST &&
vcpu->arch.busy_preempt != TB_NIL) {
-   vcpu->arch.busy_stolen += mftb() - vcpu->arch.busy_preempt;
+   vcpu->arch.busy_stolen += now - vcpu->arch.busy_preempt;
vcpu->arch.busy_preempt = TB_NIL;
}
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
@@ -324,13 +325,14 @@ static void kvmppc_core_vcpu_put_hv(struct kvm_vcpu *vcpu)
 {
struct kvmppc_vcore *vc = vcpu->arch.vcore;
unsigned long flags;
+   u64 now = mftb();
 
if (vc->runner == vcpu && vc->vcore_state >= VCORE_SLEEPING)
-   kvmppc_core_start_stolen(vc);
+   kvmppc_core_start_stolen(vc, now);
 
spin_lock_irqsave(&vcpu->arch.tbacct_lock, flags);
if (vcpu->arch.state == KVMPPC_VCPU_BUSY_IN_HOST)
-   vcpu->arch.busy_preempt = mftb();
+   vcpu->arch.busy_preempt = now;
spin_unlock_irqrestore(&vcpu->arch.tbacct_lock, flags);
 }
 
@@ -685,7 +687,7 @@ static u64 vcore_stolen_time(struct kvmppc_vcore *vc, u64 
now)
 }
 
 static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
-   struct kvmppc_vcore *vc)
+   struct kvmppc_vcore *vc, u64 tb)
 {
struct dtl_entry *dt;
struct lppaca *vpa;
@@ -696,7 +698,7 @@ static void kvmppc_create_dtl_entry(struct kvm_vcpu *vcpu,
 
dt = vcpu->arch.dtl_ptr;
vpa = vcpu->arch.vpa.pinned_addr;
-   now = mftb();
+   now = tb;
core_stolen = vcore_stolen_time(vc, now);
stolen = core_stolen - vcpu->arch.stolen_logged;
vcpu->arch.stolen_logged = core_stolen;
@@ -2914,14 +2916,14 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
 extern int __kvmppc_vcore_entry(void);
 
 static void kvmppc_remove_runnable(struct kvmppc_vcore *vc,
-  struct kvm_vcpu *vcpu)
+  struct kvm_vcpu *vcpu, u64 tb)
 {
u64 now;
 
if (vcpu->arch.state != KVMPPC_VCPU_RUNNABLE)
return;
spin_lock_irq(&vcpu->arch.tbacct_lock);
-   now = mftb();
+   now = tb;
vcpu->arch.busy_stolen +=

[PATCH v3 23/52] KVM: PPC: Book3S HV P9: Move TB updates

2021-10-04 Thread Nicholas Piggin

Move the TB updates between saving and loading guest and host SPRs,
to improve scheduling by keeping issue-NTC operations together as
much as possible.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 36 +--
 1 file changed, 18 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 814b0dfd590f..e7793bb806eb 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -215,15 +215,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
 
vcpu->arch.ceded = 0;
 
-   if (vc->tb_offset) {
-   u64 new_tb = tb + vc->tb_offset;
-   mtspr(SPRN_TBU40, new_tb);
-   tb = mftb();
-   if ((tb & 0xff) < (new_tb & 0xff))
-   mtspr(SPRN_TBU40, new_tb + 0x100);
-   vc->tb_offset_applied = vc->tb_offset;
-   }
-
/* Could avoid mfmsr by passing around, but probably no big deal */
msr = mfmsr();
 
@@ -238,6 +229,15 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
host_dawrx1 = mfspr(SPRN_DAWRX1);
}
 
+   if (vc->tb_offset) {
+   u64 new_tb = tb + vc->tb_offset;
+   mtspr(SPRN_TBU40, new_tb);
+   tb = mftb();
+   if ((tb & 0xff) < (new_tb & 0xff))
+   mtspr(SPRN_TBU40, new_tb + 0x100);
+   vc->tb_offset_applied = vc->tb_offset;
+   }
+
if (vc->pcr)
mtspr(SPRN_PCR, vc->pcr | PCR_MASK);
mtspr(SPRN_DPDES, vc->dpdes);
@@ -469,6 +469,15 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
tb = mftb();
vcpu->arch.dec_expires = dec + tb;
 
+   if (vc->tb_offset_applied) {
+   u64 new_tb = tb - vc->tb_offset_applied;
+   mtspr(SPRN_TBU40, new_tb);
+   tb = mftb();
+   if ((tb & 0xff) < (new_tb & 0xff))
+   mtspr(SPRN_TBU40, new_tb + 0x100);
+   vc->tb_offset_applied = 0;
+   }
+
/* Preserve PSSCR[FAKE_SUSPEND] until we've called kvmppc_save_tm_hv */
mtspr(SPRN_PSSCR, host_psscr |
  (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG));
@@ -503,15 +512,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
if (vc->pcr)
mtspr(SPRN_PCR, PCR_MASK);
 
-   if (vc->tb_offset_applied) {
-   u64 new_tb = mftb() - vc->tb_offset_applied;
-   mtspr(SPRN_TBU40, new_tb);
-   tb = mftb();
-   if ((tb & 0xff) < (new_tb & 0xff))
-   mtspr(SPRN_TBU40, new_tb + 0x100);
-   vc->tb_offset_applied = 0;
-   }
-
/* HDEC must be at least as large as DEC, so decrementer_max fits */
mtspr(SPRN_HDEC, decrementer_max);
 
-- 
2.23.0

[PATCH v3 22/52] KVM: PPC: Book3S HV: Change dec_expires to be relative to guest timebase

2021-10-04 Thread Nicholas Piggin

Change dec_expires to be relative to the guest timebase, and allow
it to be moved into low level P9 guest entry functions, to improve
SPR access scheduling.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/kvm_book3s.h   |  6 +++
 arch/powerpc/include/asm/kvm_host.h |  2 +-
 arch/powerpc/kvm/book3s_hv.c| 58 +
 arch/powerpc/kvm/book3s_hv_nested.c |  3 ++
 arch/powerpc/kvm/book3s_hv_p9_entry.c   | 10 -
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 13 --
 6 files changed, 49 insertions(+), 43 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index caaa0f592d8e..15b573671f99 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -406,6 +406,12 @@ static inline ulong kvmppc_get_fault_dar(struct kvm_vcpu 
*vcpu)
return vcpu->arch.fault_dar;
 }
 
+/* Expiry time of vcpu DEC relative to host TB */
+static inline u64 kvmppc_dec_expires_host_tb(struct kvm_vcpu *vcpu)
+{
+   return vcpu->arch.dec_expires - vcpu->arch.vcore->tb_offset;
+}
+
 static inline bool is_kvmppc_resume_guest(int r)
 {
return (r == RESUME_GUEST || r == RESUME_GUEST_NV);
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 080a7feb7731..c5fc4d016695 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -741,7 +741,7 @@ struct kvm_vcpu_arch {
 
struct hrtimer dec_timer;
u64 dec_jiffies;
-   u64 dec_expires;
+   u64 dec_expires;/* Relative to guest timebase. */
unsigned long pending_exceptions;
u8 ceded;
u8 prodded;
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 0a711d929531..4abe4a24e5e7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2261,8 +2261,7 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
*val = get_reg_val(id, vcpu->arch.vcore->arch_compat);
break;
case KVM_REG_PPC_DEC_EXPIRY:
-   *val = get_reg_val(id, vcpu->arch.dec_expires +
-  vcpu->arch.vcore->tb_offset);
+   *val = get_reg_val(id, vcpu->arch.dec_expires);
break;
case KVM_REG_PPC_ONLINE:
*val = get_reg_val(id, vcpu->arch.online);
@@ -2514,8 +2513,7 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
r = kvmppc_set_arch_compat(vcpu, set_reg_val(id, *val));
break;
case KVM_REG_PPC_DEC_EXPIRY:
-   vcpu->arch.dec_expires = set_reg_val(id, *val) -
-   vcpu->arch.vcore->tb_offset;
+   vcpu->arch.dec_expires = set_reg_val(id, *val);
break;
case KVM_REG_PPC_ONLINE:
i = set_reg_val(id, *val);
@@ -2902,13 +2900,13 @@ static void kvmppc_set_timer(struct kvm_vcpu *vcpu)
unsigned long dec_nsec, now;
 
now = get_tb();
-   if (now > vcpu->arch.dec_expires) {
+   if (now > kvmppc_dec_expires_host_tb(vcpu)) {
/* decrementer has already gone negative */
kvmppc_core_queue_dec(vcpu);
kvmppc_core_prepare_to_enter(vcpu);
return;
}
-   dec_nsec = tb_to_ns(vcpu->arch.dec_expires - now);
+   dec_nsec = tb_to_ns(kvmppc_dec_expires_host_tb(vcpu) - now);
hrtimer_start(&vcpu->arch.dec_timer, dec_nsec, HRTIMER_MODE_REL);
vcpu->arch.timer_running = 1;
 }
@@ -3380,7 +3378,7 @@ static void post_guest_process(struct kvmppc_vcore *vc, 
bool is_master)
 */
spin_unlock(&vc->lock);
/* cancel pending dec exception if dec is positive */
-   if (now < vcpu->arch.dec_expires &&
+   if (now < kvmppc_dec_expires_host_tb(vcpu) &&
kvmppc_core_pending_dec(vcpu))
kvmppc_core_dequeue_dec(vcpu);
 
@@ -4211,20 +4209,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
load_spr_state(vcpu);
 
-   /*
-* When setting DEC, we must always deal with irq_work_raise via NMI vs
-* setting DEC. The problem occurs right as we switch into guest mode
-* if a NMI hits and sets pending work and sets DEC, then that will
-* apply to the guest and not bring us back to the host.
-*
-* irq_work_raise could check a flag (or possibly LPCR[HDICE] for
-* example) and set HDEC to 1? That wouldn't solve the nested hv
-* case which needs to abort the hcall or zero the time limit.
-*
-* XXX: Another day's problem.
-*/
-   mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb);
-
if (kvmhv_on_pseries()) {
/*
 * We need to save and restore the guest visible part of the
@@ -4250,6 +4234,23 @@ static int kvmh

[PATCH v3 21/52] KVM: PPC: Book3S HV P9: Add kvmppc_stop_thread to match kvmppc_start_thread

2021-10-04 Thread Nicholas Piggin

Small cleanup makes it a bit easier to match up entry and exit
operations.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 11 +--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 7e8ddffd61c7..0a711d929531 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3070,6 +3070,13 @@ static void kvmppc_start_thread(struct kvm_vcpu *vcpu, 
struct kvmppc_vcore *vc)
kvmppc_ipi_thread(cpu);
 }
 
+/* Old path does this in asm */
+static void kvmppc_stop_thread(struct kvm_vcpu *vcpu)
+{
+   vcpu->cpu = -1;
+   vcpu->arch.thread_cpu = -1;
+}
+
 static void kvmppc_wait_for_nap(int n_threads)
 {
int cpu = smp_processor_id();
@@ -4297,8 +4304,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
dec = (s32) dec;
tb = mftb();
vcpu->arch.dec_expires = dec + tb;
-   vcpu->cpu = -1;
-   vcpu->arch.thread_cpu = -1;
 
store_spr_state(vcpu);
 
@@ -4782,6 +4787,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
 
guest_exit_irqoff();
 
+   kvmppc_stop_thread(vcpu);
+
powerpc_local_irq_pmu_restore(flags);
 
cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
-- 
2.23.0

[PATCH v3 20/52] KVM: PPC: Book3S HV P9: Improve mtmsrd scheduling by delaying MSR[EE] disable

2021-10-04 Thread Nicholas Piggin

Moving the mtmsrd after the host SPRs are saved and before the guest
SPRs start to be loaded can prevent an SPR scoreboard stall (because
the mtmsrd is L=1 type which does not cause context synchronisation.

This is also now more convenient to combined with the mtmsrd L=0
instruction to enable facilities just below, but that is not done yet.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 23 ++-
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 16365c0e9872..7e8ddffd61c7 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4156,6 +4156,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
save_p9_host_os_sprs(&host_os_sprs);
 
+   /*
+* This could be combined with MSR[RI] clearing, but that expands
+* the unrecoverable window. It would be better to cover unrecoverable
+* with KVM bad interrupt handling rather than use MSR[RI] at all.
+*
+* Much more difficult and less worthwhile to combine with IR/DR
+* disable.
+*/
+   hard_irq_disable();
+   if (lazy_irq_pending())
+   return 0;
+
/* MSR bits may have been cleared by context switch */
msr = 0;
if (IS_ENABLED(CONFIG_PPC_FPU))
@@ -4667,6 +4679,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
struct kvmppc_vcore *vc;
struct kvm *kvm = vcpu->kvm;
struct kvm_nested_guest *nested = vcpu->arch.nested;
+   unsigned long flags;
 
trace_kvmppc_run_vcpu_enter(vcpu);
 
@@ -4710,11 +4723,11 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
if (kvm_is_radix(kvm))
kvmppc_prepare_radix_vcpu(vcpu, pcpu);
 
-   local_irq_disable();
-   hard_irq_disable();
+   /* flags save not required, but irq_pmu has no disable/enable API */
+   powerpc_local_irq_pmu_save(flags);
if (signal_pending(current))
goto sigpend;
-   if (lazy_irq_pending() || need_resched() || !kvm->arch.mmu_ready)
+   if (need_resched() || !kvm->arch.mmu_ready)
goto out;
 
if (!nested) {
@@ -4769,7 +4782,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
 
guest_exit_irqoff();
 
-   local_irq_enable();
+   powerpc_local_irq_pmu_restore(flags);
 
cpumask_clear_cpu(pcpu, &kvm->arch.cpu_in_guest);
 
@@ -4827,7 +4840,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 
time_limit,
run->exit_reason = KVM_EXIT_INTR;
vcpu->arch.ret = -EINTR;
  out:
-   local_irq_enable();
+   powerpc_local_irq_pmu_restore(flags);
preempt_enable();
goto done;
 }
-- 
2.23.0

[PATCH v3 19/52] KVM: PPC: Book3S HV P9: Reduce mtmsrd instructions required to save host SPRs

2021-10-04 Thread Nicholas Piggin

This reduces the number of mtmsrd required to enable facility bits when
saving/restoring registers, by having the KVM code set all bits up front
rather than using individual facility functions that set their particular
MSR bits.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/switch_to.h  |  2 +
 arch/powerpc/kernel/process.c | 28 +
 arch/powerpc/kvm/book3s_hv.c  | 59 ++-
 arch/powerpc/kvm/book3s_hv_p9_entry.c |  1 +
 4 files changed, 71 insertions(+), 19 deletions(-)

diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index 9d1fbd8be1c7..e8013cd6b646 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -112,6 +112,8 @@ static inline void clear_task_ebb(struct task_struct *t)
 #endif
 }
 
+void kvmppc_save_user_regs(void);
+
 extern int set_thread_tidr(struct task_struct *t);
 
 #endif /* _ASM_POWERPC_SWITCH_TO_H */
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index 50436b52c213..3fca321b820d 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -1156,6 +1156,34 @@ static inline void save_sprs(struct thread_struct *t)
 #endif
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void kvmppc_save_user_regs(void)
+{
+   unsigned long usermsr;
+
+   if (!current->thread.regs)
+   return;
+
+   usermsr = current->thread.regs->msr;
+
+   if (usermsr & MSR_FP)
+   save_fpu(current);
+
+   if (usermsr & MSR_VEC)
+   save_altivec(current);
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
+   if (usermsr & MSR_TM) {
+   current->thread.tm_tfhar = mfspr(SPRN_TFHAR);
+   current->thread.tm_tfiar = mfspr(SPRN_TFIAR);
+   current->thread.tm_texasr = mfspr(SPRN_TEXASR);
+   current->thread.regs->msr &= ~MSR_TM;
+   }
+#endif
+}
+EXPORT_SYMBOL_GPL(kvmppc_save_user_regs);
+#endif /* CONFIG_KVM_BOOK3S_HV_POSSIBLE */
+
 static inline void restore_sprs(struct thread_struct *old_thread,
struct thread_struct *new_thread)
 {
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index fca89ed2244f..16365c0e9872 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4140,6 +4140,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
struct p9_host_os_sprs host_os_sprs;
s64 dec;
u64 tb, next_timer;
+   unsigned long msr;
int trap;
 
WARN_ON_ONCE(vcpu->arch.ceded);
@@ -4151,8 +4152,23 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
if (next_timer < time_limit)
time_limit = next_timer;
 
+   vcpu->arch.ceded = 0;
+
save_p9_host_os_sprs(&host_os_sprs);
 
+   /* MSR bits may have been cleared by context switch */
+   msr = 0;
+   if (IS_ENABLED(CONFIG_PPC_FPU))
+   msr |= MSR_FP;
+   if (cpu_has_feature(CPU_FTR_ALTIVEC))
+   msr |= MSR_VEC;
+   if (cpu_has_feature(CPU_FTR_VSX))
+   msr |= MSR_VSX;
+   if (cpu_has_feature(CPU_FTR_TM) ||
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+   msr |= MSR_TM;
+   msr = msr_check_and_set(msr);
+
kvmppc_subcore_enter_guest();
 
vc->entry_exit_map = 1;
@@ -4161,12 +4177,13 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vcpu_vpa_increment_dispatch(vcpu);
 
if (cpu_has_feature(CPU_FTR_TM) ||
-   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
+   cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST)) {
kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
+   msr = mfmsr(); /* TM restore can update msr */
+   }
 
switch_pmu_to_guest(vcpu, &host_os_sprs);
 
-   msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
load_fp_state(&vcpu->arch.fp);
 #ifdef CONFIG_ALTIVEC
load_vr_state(&vcpu->arch.vr);
@@ -4275,7 +4292,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
restore_p9_host_os_sprs(vcpu, &host_os_sprs);
 
-   msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
store_fp_state(&vcpu->arch.fp);
 #ifdef CONFIG_ALTIVEC
store_vr_state(&vcpu->arch.vr);
@@ -4825,19 +4841,24 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu)
unsigned long user_tar = 0;
unsigned int user_vrsave;
struct kvm *kvm;
+   unsigned long msr;
 
if (!vcpu->arch.sane) {
run->exit_reason = KVM_EXIT_INTERNAL_ERROR;
return -EINVAL;
}
 
+   /* No need to go into the guest when all we'll do is come back out */
+   if (signal_pending(current)) {
+   run->exit_reason = KVM_EXIT_INTR;
+   return -EINTR;
+   }
+
+#ifdef CONFIG_PPC_TRANSACTIONAL_MEM
/*
 * Don't allow entr

[PATCH v3 18/52] KVM: PPC: Book3S HV P9: Move SPRG restore to restore_p9_host_os_sprs

2021-10-04 Thread Nicholas Piggin

Move the SPR update into its relevant helper function. This will
help with SPR scheduling improvements in later changes.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 1c5b81bd02c1..fca89ed2244f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4093,6 +4093,8 @@ static void save_p9_host_os_sprs(struct p9_host_os_sprs 
*host_os_sprs)
 static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs)
 {
+   mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
+
mtspr(SPRN_PSPB, 0);
mtspr(SPRN_UAMOR, 0);
 
@@ -4293,8 +4295,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
timer_rearm_host_dec(tb);
 
-   mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
-
kvmppc_subcore_exit_guest();
 
return trap;
-- 
2.23.0

[PATCH v3 17/52] KVM: PPC: Book3S HV: CTRL SPR does not require read-modify-write

2021-10-04 Thread Nicholas Piggin

Processors that support KVM HV do not require read-modify-write of
the CTRL SPR to set/clear their thread's runlatch. Just write 1 or 0
to it.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c|  2 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 15 ++-
 2 files changed, 7 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f0ad3fb2eabd..1c5b81bd02c1 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4058,7 +4058,7 @@ static void load_spr_state(struct kvm_vcpu *vcpu)
 */
 
if (!(vcpu->arch.ctrl & 1))
-   mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1);
+   mtspr(SPRN_CTRLT, 0);
 }
 
 static void store_spr_state(struct kvm_vcpu *vcpu)
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 7fa0df632f89..070e228b3c20 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -775,12 +775,11 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S)
mtspr   SPRN_AMR,r5
mtspr   SPRN_UAMOR,r6
 
-   /* Restore state of CTRL run bit; assume 1 on entry */
+   /* Restore state of CTRL run bit; the host currently has it set to 1 */
lwz r5,VCPU_CTRL(r4)
andi.   r5,r5,1
bne 4f
-   mfspr   r6,SPRN_CTRLF
-   clrrdi  r6,r6,1
+   li  r6,0
mtspr   SPRN_CTRLT,r6
 4:
/* Secondary threads wait for primary to have done partition switch */
@@ -1203,12 +1202,12 @@ guest_bypass:
stw r0, VCPU_CPU(r9)
stw r0, VCPU_THREAD_CPU(r9)
 
-   /* Save guest CTRL register, set runlatch to 1 */
+   /* Save guest CTRL register, set runlatch to 1 if it was clear */
mfspr   r6,SPRN_CTRLF
stw r6,VCPU_CTRL(r9)
andi.   r0,r6,1
bne 4f
-   ori r6,r6,1
+   li  r6,1
mtspr   SPRN_CTRLT,r6
 4:
/*
@@ -2178,8 +2177,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_TM)
 * Also clear the runlatch bit before napping.
 */
 kvm_do_nap:
-   mfspr   r0, SPRN_CTRLF
-   clrrdi  r0, r0, 1
+   li  r0,0
mtspr   SPRN_CTRLT, r0
 
li  r0,1
@@ -2198,8 +2196,7 @@ kvm_nap_sequence: /* desired LPCR value in r5 */
 
bl  isa206_idle_insn_mayloss
 
-   mfspr   r0, SPRN_CTRLF
-   ori r0, r0, 1
+   li  r0,1
mtspr   SPRN_CTRLT, r0
 
mtspr   SPRN_SRR1, r3
-- 
2.23.0

[PATCH v3 16/52] KVM: PPC: Book3S HV P9: Factor out yield_count increment

2021-10-04 Thread Nicholas Piggin

Factor duplicated code into a helper function.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 24 
 1 file changed, 12 insertions(+), 12 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6bbd670658b9..f0ad3fb2eabd 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4118,6 +4118,16 @@ static inline bool hcall_is_xics(unsigned long req)
req == H_IPOLL || req == H_XIRR || req == H_XIRR_X;
 }
 
+static void vcpu_vpa_increment_dispatch(struct kvm_vcpu *vcpu)
+{
+   struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+   if (lp) {
+   u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
+   lp->yield_count = cpu_to_be32(yield_count);
+   vcpu->arch.vpa.dirty = 1;
+   }
+}
+
 /*
  * Guest entry for POWER9 and later CPUs.
  */
@@ -4146,12 +4156,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vc->entry_exit_map = 1;
vc->in_guest = 1;
 
-   if (vcpu->arch.vpa.pinned_addr) {
-   struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
-   u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
-   lp->yield_count = cpu_to_be32(yield_count);
-   vcpu->arch.vpa.dirty = 1;
-   }
+   vcpu_vpa_increment_dispatch(vcpu);
 
if (cpu_has_feature(CPU_FTR_TM) ||
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
@@ -4279,12 +4284,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
kvmppc_save_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
 
-   if (vcpu->arch.vpa.pinned_addr) {
-   struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
-   u32 yield_count = be32_to_cpu(lp->yield_count) + 1;
-   lp->yield_count = cpu_to_be32(yield_count);
-   vcpu->arch.vpa.dirty = 1;
-   }
+   vcpu_vpa_increment_dispatch(vcpu);
 
switch_pmu_to_host(vcpu, &host_os_sprs);
 
-- 
2.23.0

[PATCH v3 15/52] KVM: PPC: Book3S HV P9: Demand fault PMU SPRs when marked not inuse

2021-10-04 Thread Nicholas Piggin

The pmcregs_in_use field in the guest VPA can not be trusted to reflect
what the guest is doing with PMU SPRs, so the PMU must always be managed
(stopped) when exiting the guest, and SPR values set when entering the
guest to ensure it can't cause a covert channel or otherwise cause other
guests or the host to misbehave.

So prevent guest access to the PMU with HFSCR[PM] if pmcregs_in_use is
clear, and avoid the PMU SPR access on every partition switch. Guests
that set pmcregs_in_use incorrectly or when first setting it and using
the PMU will take a hypervisor facility unavailable interrupt that will
bring in the PMU SPRs.

Reviewed-by: Athira Jajeev 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 131 ++-
 1 file changed, 98 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 29a8c770c4a6..6bbd670658b9 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -1421,6 +1421,23 @@ static int kvmppc_emulate_doorbell_instr(struct kvm_vcpu 
*vcpu)
return RESUME_GUEST;
 }
 
+/*
+ * If the lppaca had pmcregs_in_use clear when we exited the guest, then
+ * HFSCR_PM is cleared for next entry. If the guest then tries to access
+ * the PMU SPRs, we get this facility unavailable interrupt. Putting HFSCR_PM
+ * back in the guest HFSCR will cause the next entry to load the PMU SPRs and
+ * allow the guest access to continue.
+ */
+static int kvmppc_pmu_unavailable(struct kvm_vcpu *vcpu)
+{
+   if (!(vcpu->arch.hfscr_permitted & HFSCR_PM))
+   return EMULATE_FAIL;
+
+   vcpu->arch.hfscr |= HFSCR_PM;
+
+   return RESUME_GUEST;
+}
+
 static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
 struct task_struct *tsk)
 {
@@ -1702,16 +1719,22 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu,
 * to emulate.
 * Otherwise, we just generate a program interrupt to the guest.
 */
-   case BOOK3S_INTERRUPT_H_FAC_UNAVAIL:
+   case BOOK3S_INTERRUPT_H_FAC_UNAVAIL: {
+   u64 cause = vcpu->arch.hfscr >> 56;
+
r = EMULATE_FAIL;
-   if (((vcpu->arch.hfscr >> 56) == FSCR_MSGP_LG) &&
-   cpu_has_feature(CPU_FTR_ARCH_300))
-   r = kvmppc_emulate_doorbell_instr(vcpu);
+   if (cpu_has_feature(CPU_FTR_ARCH_300)) {
+   if (cause == FSCR_MSGP_LG)
+   r = kvmppc_emulate_doorbell_instr(vcpu);
+   if (cause == FSCR_PM_LG)
+   r = kvmppc_pmu_unavailable(vcpu);
+   }
if (r == EMULATE_FAIL) {
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
}
break;
+   }
 
case BOOK3S_INTERRUPT_HV_RM_HARD:
r = RESUME_PASSTHROUGH;
@@ -2750,6 +2773,11 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu 
*vcpu)
 
vcpu->arch.hfscr_permitted = vcpu->arch.hfscr;
 
+   /*
+* PM is demand-faulted so start with it clear.
+*/
+   vcpu->arch.hfscr &= ~HFSCR_PM;
+
kvmppc_mmu_book3s_hv_init(vcpu);
 
vcpu->arch.state = KVMPPC_VCPU_NOTREADY;
@@ -3820,6 +3848,14 @@ static void freeze_pmu(unsigned long mmcr0, unsigned 
long mmcra)
 static void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
struct p9_host_os_sprs *host_os_sprs)
 {
+   struct lppaca *lp;
+   int load_pmu = 1;
+
+   lp = vcpu->arch.vpa.pinned_addr;
+   if (lp)
+   load_pmu = lp->pmcregs_in_use;
+
+   /* Save host */
if (ppc_get_pmu_inuse()) {
/*
 * It might be better to put PMU handling (at least for the
@@ -3854,41 +3890,47 @@ static void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
}
 
 #ifdef CONFIG_PPC_PSERIES
+   /* After saving PMU, before loading guest PMU, flip pmcregs_in_use */
if (kvmhv_on_pseries()) {
barrier();
-   if (vcpu->arch.vpa.pinned_addr) {
-   struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
-   get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use;
-   } else {
-   get_lppaca()->pmcregs_in_use = 1;
-   }
+   get_lppaca()->pmcregs_in_use = load_pmu;
barrier();
}
 #endif
 
-   /* load guest */
-   mtspr(SPRN_PMC1, vcpu->arch.pmc[0]);
-   mtspr(SPRN_PMC2, vcpu->arch.pmc[1]);
-   mtspr(SPRN_PMC3, vcpu->arch.pmc[2]);
-   mtspr(SPRN_PMC4, vcpu->arch.pmc[3]);
-   mtspr(SPRN_PMC5, vcpu->arch.pmc[4]);
-   mtspr(SPRN_PMC6, vcpu->arch.pmc[5]);
-   mtspr(SPRN_MMCR1, vcpu->arch.mmcr[1]);
-   mtspr(SPRN_MMCR2, vcpu->arch.mmcr[2]);
-   mtspr(SPRN_SDAR, vcpu->arch.sdar);
-   mtspr(SPRN_SIAR, vcpu->arch.s

[PATCH v3 14/52] KVM: PPC: Book3S HV P9: Factor PMU save/load into context switch functions

2021-10-04 Thread Nicholas Piggin

Rather than guest/host save/retsore functions, implement context switch
functions that take care of details like the VPA update for nested.

The reason to split these kind of helpers into explicit save/load
functions is mainly to schedule SPR access nicely, but PMU is a special
case where the load requires mtSPR (to stop counters) and other
difficulties, so there's less possibility to schedule those nicely. The
SPR accesses also have side-effects if the PMU is running, and in later
changes we keep the host PMU running as long as possible so this code
can be better profiled, which also complicates scheduling.

Reviewed-by: Athira Jajeev 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 61 +---
 1 file changed, 28 insertions(+), 33 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 211184544599..29a8c770c4a6 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3817,7 +3817,8 @@ static void freeze_pmu(unsigned long mmcr0, unsigned long 
mmcra)
isync();
 }
 
-static void save_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
+static void switch_pmu_to_guest(struct kvm_vcpu *vcpu,
+   struct p9_host_os_sprs *host_os_sprs)
 {
if (ppc_get_pmu_inuse()) {
/*
@@ -3851,10 +3852,21 @@ static void save_p9_host_pmu(struct p9_host_os_sprs 
*host_os_sprs)
host_os_sprs->sier3 = mfspr(SPRN_SIER3);
}
}
-}
 
-static void load_p9_guest_pmu(struct kvm_vcpu *vcpu)
-{
+#ifdef CONFIG_PPC_PSERIES
+   if (kvmhv_on_pseries()) {
+   barrier();
+   if (vcpu->arch.vpa.pinned_addr) {
+   struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
+   get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use;
+   } else {
+   get_lppaca()->pmcregs_in_use = 1;
+   }
+   barrier();
+   }
+#endif
+
+   /* load guest */
mtspr(SPRN_PMC1, vcpu->arch.pmc[0]);
mtspr(SPRN_PMC2, vcpu->arch.pmc[1]);
mtspr(SPRN_PMC3, vcpu->arch.pmc[2]);
@@ -3879,7 +3891,8 @@ static void load_p9_guest_pmu(struct kvm_vcpu *vcpu)
/* No isync necessary because we're starting counters */
 }
 
-static void save_p9_guest_pmu(struct kvm_vcpu *vcpu)
+static void switch_pmu_to_host(struct kvm_vcpu *vcpu,
+   struct p9_host_os_sprs *host_os_sprs)
 {
struct lppaca *lp;
int save_pmu = 1;
@@ -3922,10 +3935,15 @@ static void save_p9_guest_pmu(struct kvm_vcpu *vcpu)
} else {
freeze_pmu(mfspr(SPRN_MMCR0), mfspr(SPRN_MMCRA));
}
-}
 
-static void load_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
-{
+#ifdef CONFIG_PPC_PSERIES
+   if (kvmhv_on_pseries()) {
+   barrier();
+   get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse();
+   barrier();
+   }
+#endif
+
if (ppc_get_pmu_inuse()) {
mtspr(SPRN_PMC1, host_os_sprs->pmc1);
mtspr(SPRN_PMC2, host_os_sprs->pmc2);
@@ -4058,8 +4076,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
save_p9_host_os_sprs(&host_os_sprs);
 
-   save_p9_host_pmu(&host_os_sprs);
-
kvmppc_subcore_enter_guest();
 
vc->entry_exit_map = 1;
@@ -4076,19 +4092,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
cpu_has_feature(CPU_FTR_P9_TM_HV_ASSIST))
kvmppc_restore_tm_hv(vcpu, vcpu->arch.shregs.msr, true);
 
-#ifdef CONFIG_PPC_PSERIES
-   if (kvmhv_on_pseries()) {
-   barrier();
-   if (vcpu->arch.vpa.pinned_addr) {
-   struct lppaca *lp = vcpu->arch.vpa.pinned_addr;
-   get_lppaca()->pmcregs_in_use = lp->pmcregs_in_use;
-   } else {
-   get_lppaca()->pmcregs_in_use = 1;
-   }
-   barrier();
-   }
-#endif
-   load_p9_guest_pmu(vcpu);
+   switch_pmu_to_guest(vcpu, &host_os_sprs);
 
msr_check_and_set(MSR_FP | MSR_VEC | MSR_VSX);
load_fp_state(&vcpu->arch.fp);
@@ -4217,14 +4221,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vcpu->arch.vpa.dirty = 1;
}
 
-   save_p9_guest_pmu(vcpu);
-#ifdef CONFIG_PPC_PSERIES
-   if (kvmhv_on_pseries()) {
-   barrier();
-   get_lppaca()->pmcregs_in_use = ppc_get_pmu_inuse();
-   barrier();
-   }
-#endif
+   switch_pmu_to_host(vcpu, &host_os_sprs);
 
vc->entry_exit_map = 0x101;
vc->in_guest = 0;
@@ -4233,8 +4230,6 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 
mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso);
 
-   load_p9_host_pmu(&host_os_sprs);
-
kvmppc_subcore_exit_guest();
 
re

[PATCH v3 13/52] KVM: PPC: Book3S HV P9: Implement PMU save/restore in C

2021-10-04 Thread Nicholas Piggin

Implement the P9 path PMU save/restore code in C, and remove the
POWER9/10 code from the P7/8 path assembly.

Cc: Madhavan Srinivasan 
Reviewed-by: Athira Jajeev 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/asm-prototypes.h |   5 -
 arch/powerpc/kvm/book3s_hv.c  | 221 +++---
 arch/powerpc/kvm/book3s_hv_interrupts.S   |  13 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |  43 +
 4 files changed, 208 insertions(+), 74 deletions(-)

diff --git a/arch/powerpc/include/asm/asm-prototypes.h 
b/arch/powerpc/include/asm/asm-prototypes.h
index 222823861a67..41b8a1e1144a 100644
--- a/arch/powerpc/include/asm/asm-prototypes.h
+++ b/arch/powerpc/include/asm/asm-prototypes.h
@@ -141,11 +141,6 @@ static inline void kvmppc_restore_tm_hv(struct kvm_vcpu 
*vcpu, u64 msr,
bool preserve_nv) { }
 #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
 
-void kvmhv_save_host_pmu(void);
-void kvmhv_load_host_pmu(void);
-void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use);
-void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu);
-
 void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu);
 
 long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr);
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index b069209b49b2..211184544599 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3762,6 +3762,196 @@ static noinline void kvmppc_run_core(struct 
kvmppc_vcore *vc)
trace_kvmppc_run_core(vc, 1);
 }
 
+/*
+ * Privileged (non-hypervisor) host registers to save.
+ */
+struct p9_host_os_sprs {
+   unsigned long dscr;
+   unsigned long tidr;
+   unsigned long iamr;
+   unsigned long amr;
+   unsigned long fscr;
+
+   unsigned int pmc1;
+   unsigned int pmc2;
+   unsigned int pmc3;
+   unsigned int pmc4;
+   unsigned int pmc5;
+   unsigned int pmc6;
+   unsigned long mmcr0;
+   unsigned long mmcr1;
+   unsigned long mmcr2;
+   unsigned long mmcr3;
+   unsigned long mmcra;
+   unsigned long siar;
+   unsigned long sier1;
+   unsigned long sier2;
+   unsigned long sier3;
+   unsigned long sdar;
+};
+
+static void freeze_pmu(unsigned long mmcr0, unsigned long mmcra)
+{
+   if (!(mmcr0 & MMCR0_FC))
+   goto do_freeze;
+   if (mmcra & MMCRA_SAMPLE_ENABLE)
+   goto do_freeze;
+   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+   if (!(mmcr0 & MMCR0_PMCCEXT))
+   goto do_freeze;
+   if (!(mmcra & MMCRA_BHRB_DISABLE))
+   goto do_freeze;
+   }
+   return;
+
+do_freeze:
+   mmcr0 = MMCR0_FC;
+   mmcra = 0;
+   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+   mmcr0 |= MMCR0_PMCCEXT;
+   mmcra = MMCRA_BHRB_DISABLE;
+   }
+
+   mtspr(SPRN_MMCR0, mmcr0);
+   mtspr(SPRN_MMCRA, mmcra);
+   isync();
+}
+
+static void save_p9_host_pmu(struct p9_host_os_sprs *host_os_sprs)
+{
+   if (ppc_get_pmu_inuse()) {
+   /*
+* It might be better to put PMU handling (at least for the
+* host) in the perf subsystem because it knows more about what
+* is being used.
+*/
+
+   /* POWER9, POWER10 do not implement HPMC or SPMC */
+
+   host_os_sprs->mmcr0 = mfspr(SPRN_MMCR0);
+   host_os_sprs->mmcra = mfspr(SPRN_MMCRA);
+
+   freeze_pmu(host_os_sprs->mmcr0, host_os_sprs->mmcra);
+
+   host_os_sprs->pmc1 = mfspr(SPRN_PMC1);
+   host_os_sprs->pmc2 = mfspr(SPRN_PMC2);
+   host_os_sprs->pmc3 = mfspr(SPRN_PMC3);
+   host_os_sprs->pmc4 = mfspr(SPRN_PMC4);
+   host_os_sprs->pmc5 = mfspr(SPRN_PMC5);
+   host_os_sprs->pmc6 = mfspr(SPRN_PMC6);
+   host_os_sprs->mmcr1 = mfspr(SPRN_MMCR1);
+   host_os_sprs->mmcr2 = mfspr(SPRN_MMCR2);
+   host_os_sprs->sdar = mfspr(SPRN_SDAR);
+   host_os_sprs->siar = mfspr(SPRN_SIAR);
+   host_os_sprs->sier1 = mfspr(SPRN_SIER);
+
+   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+   host_os_sprs->mmcr3 = mfspr(SPRN_MMCR3);
+   host_os_sprs->sier2 = mfspr(SPRN_SIER2);
+   host_os_sprs->sier3 = mfspr(SPRN_SIER3);
+   }
+   }
+}
+
+static void load_p9_guest_pmu(struct kvm_vcpu *vcpu)
+{
+   mtspr(SPRN_PMC1, vcpu->arch.pmc[0]);
+   mtspr(SPRN_PMC2, vcpu->arch.pmc[1]);
+   mtspr(SPRN_PMC3, vcpu->arch.pmc[2]);
+   mtspr(SPRN_PMC4, vcpu->arch.pmc[3]);
+   mtspr(SPRN_PMC5, vcpu->arch.pmc[4]);
+   mtspr(SPRN_PMC6, vcpu->arch.pmc[5]);
+   mtspr(SPRN_MMCR1, vcpu->arch.mmcr[1]);
+   mtspr(SPRN_MMCR2, vcpu->arch.mmcr[2]);
+   mtspr(SPRN_SDAR, vcpu->arch.sdar);
+   mtspr(SPRN_SIAR, vcpu->arch.siar);
+   mtspr(SPRN_S

[PATCH v3 12/52] powerpc/64s: Implement PMU override command line option

2021-10-04 Thread Nicholas Piggin

It can be useful in simulators (with very constrained environments)
to allow some PMCs to run from boot so they can be sampled directly
by a test harness, rather than having to run perf.

A previous change freezes counters at boot by default, so provide
a boot time option to un-freeze (plus a bit more flexibility).

Cc: Madhavan Srinivasan 
Reviewed-by: Athira Jajeev 
Signed-off-by: Nicholas Piggin 
---
 .../admin-guide/kernel-parameters.txt |  8 +
 arch/powerpc/perf/core-book3s.c   | 35 +++
 2 files changed, 43 insertions(+)

diff --git a/Documentation/admin-guide/kernel-parameters.txt 
b/Documentation/admin-guide/kernel-parameters.txt
index 91ba391f9b32..02a80c02a713 100644
--- a/Documentation/admin-guide/kernel-parameters.txt
+++ b/Documentation/admin-guide/kernel-parameters.txt
@@ -4120,6 +4120,14 @@
Override pmtimer IOPort with a hex value.
e.g. pmtmr=0x508
 
+   pmu_override=   [PPC] Override the PMU.
+   This option takes over the PMU facility, so it is no
+   longer usable by perf. Setting this option starts the
+   PMU counters by setting MMCR0 to 0 (the FC bit is
+   cleared). If a number is given, then MMCR1 is set to
+   that number, otherwise (e.g., 'pmu_override=on'), MMCR1
+   remains 0.
+
pm_debug_messages   [SUSPEND,KNL]
Enable suspend/resume debug messages during boot up.
 
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 73e62e9b179b..8d4ff93462fb 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -2419,8 +2419,24 @@ int register_power_pmu(struct power_pmu *pmu)
 }
 
 #ifdef CONFIG_PPC64
+static bool pmu_override = false;
+static unsigned long pmu_override_val;
+static void do_pmu_override(void *data)
+{
+   ppc_set_pmu_inuse(1);
+   if (pmu_override_val)
+   mtspr(SPRN_MMCR1, pmu_override_val);
+   mtspr(SPRN_MMCR0, mfspr(SPRN_MMCR0) & ~MMCR0_FC);
+}
+
 static int __init init_ppc64_pmu(void)
 {
+   if (cpu_has_feature(CPU_FTR_HVMODE) && pmu_override) {
+   pr_warn("disabling perf due to pmu_override= command line 
option.\n");
+   on_each_cpu(do_pmu_override, NULL, 1);
+   return 0;
+   }
+
/* run through all the pmu drivers one at a time */
if (!init_power5_pmu())
return 0;
@@ -2442,4 +2458,23 @@ static int __init init_ppc64_pmu(void)
return init_generic_compat_pmu();
 }
 early_initcall(init_ppc64_pmu);
+
+static int __init pmu_setup(char *str)
+{
+   unsigned long val;
+
+   if (!early_cpu_has_feature(CPU_FTR_HVMODE))
+   return 0;
+
+   pmu_override = true;
+
+   if (kstrtoul(str, 0, &val))
+   val = 0;
+
+   pmu_override_val = val;
+
+   return 1;
+}
+__setup("pmu_override=", pmu_setup);
+
 #endif
-- 
2.23.0

[PATCH v3 11/52] powerpc/64s: Always set PMU control registers to frozen/disabled when not in use

2021-10-04 Thread Nicholas Piggin

KVM PMU management code looks for particular frozen/disabled bits in
the PMU registers so it knows whether it must clear them when coming
out of a guest or not. Setting this up helps KVM make these optimisations
without getting confused. Longer term the better approach might be to
move guest/host PMU switching to the perf subsystem.

Cc: Madhavan Srinivasan 
Reviewed-by: Athira Jajeev 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/cpu_setup_power.c | 4 ++--
 arch/powerpc/kernel/dt_cpu_ftrs.c | 6 +++---
 arch/powerpc/kvm/book3s_hv.c  | 5 +
 3 files changed, 10 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/kernel/cpu_setup_power.c 
b/arch/powerpc/kernel/cpu_setup_power.c
index a29dc8326622..3dc61e203f37 100644
--- a/arch/powerpc/kernel/cpu_setup_power.c
+++ b/arch/powerpc/kernel/cpu_setup_power.c
@@ -109,7 +109,7 @@ static void init_PMU_HV_ISA207(void)
 static void init_PMU(void)
 {
mtspr(SPRN_MMCRA, 0);
-   mtspr(SPRN_MMCR0, 0);
+   mtspr(SPRN_MMCR0, MMCR0_FC);
mtspr(SPRN_MMCR1, 0);
mtspr(SPRN_MMCR2, 0);
 }
@@ -123,7 +123,7 @@ static void init_PMU_ISA31(void)
 {
mtspr(SPRN_MMCR3, 0);
mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
-   mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);
+   mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
 }
 
 /*
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 0a6b36b4bda8..06a089fbeaa7 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -353,7 +353,7 @@ static void init_pmu_power8(void)
}
 
mtspr(SPRN_MMCRA, 0);
-   mtspr(SPRN_MMCR0, 0);
+   mtspr(SPRN_MMCR0, MMCR0_FC);
mtspr(SPRN_MMCR1, 0);
mtspr(SPRN_MMCR2, 0);
mtspr(SPRN_MMCRS, 0);
@@ -392,7 +392,7 @@ static void init_pmu_power9(void)
mtspr(SPRN_MMCRC, 0);
 
mtspr(SPRN_MMCRA, 0);
-   mtspr(SPRN_MMCR0, 0);
+   mtspr(SPRN_MMCR0, MMCR0_FC);
mtspr(SPRN_MMCR1, 0);
mtspr(SPRN_MMCR2, 0);
 }
@@ -428,7 +428,7 @@ static void init_pmu_power10(void)
 
mtspr(SPRN_MMCR3, 0);
mtspr(SPRN_MMCRA, MMCRA_BHRB_DISABLE);
-   mtspr(SPRN_MMCR0, MMCR0_PMCCEXT);
+   mtspr(SPRN_MMCR0, MMCR0_FC | MMCR0_PMCCEXT);
 }
 
 static int __init feat_enable_pmu_power10(struct dt_cpu_feature *f)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 945fc9a96439..b069209b49b2 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -2715,6 +2715,11 @@ static int kvmppc_core_vcpu_create_hv(struct kvm_vcpu 
*vcpu)
 #endif
 #endif
vcpu->arch.mmcr[0] = MMCR0_FC;
+   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+   vcpu->arch.mmcr[0] |= MMCR0_PMCCEXT;
+   vcpu->arch.mmcra = MMCRA_BHRB_DISABLE;
+   }
+
vcpu->arch.ctrl = CTRL_RUNLATCH;
/* default to host PVR, since we can't spoof it */
kvmppc_set_pvr_hv(vcpu, mfspr(SPRN_PVR));
-- 
2.23.0

[PATCH v3 10/52] KVM: PPC: Book3S HV: Don't always save PMU for guest capable of nesting

2021-10-04 Thread Nicholas Piggin

Provide a config option that controls the workaround added by commit
63279eeb7f93 ("KVM: PPC: Book3S HV: Always save guest pmu for guest
capable of nesting"). The option defaults to y for now, but is expected
to go away within a few releases.

Nested capable guests running with the earlier commit ("KVM: PPC: Book3S
HV Nested: Indicate guest PMU in-use in VPA") will now indicate the PMU
in-use status of their guests, which means the parent does not need to
unconditionally save the PMU for nested capable guests.

After this latest round of performance optimisations, this option costs
about 540 cycles or 10% entry/exit performance on a POWER9 nested-capable
guest.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/Kconfig | 15 +++
 arch/powerpc/kvm/book3s_hv.c | 10 --
 2 files changed, 23 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index ff581d70f20c..1e7aae522be8 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -130,6 +130,21 @@ config KVM_BOOK3S_HV_EXIT_TIMING
 
  If unsure, say N.
 
+config KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND
+   bool "Nested L0 host workaround for L1 KVM host PMU handling bug" if 
EXPERT
+   depends on KVM_BOOK3S_HV_POSSIBLE
+   default !EXPERT
+   help
+ Old nested HV capable Linux guests have a bug where the don't
+ reflect the PMU in-use status of their L2 guest to the L0 host
+ while the L2 PMU registers are live. This can result in loss
+  of L2 PMU register state, causing perf to not work correctly in
+ L2 guests.
+
+ Selecting this option for the L0 host implements a workaround for
+ those buggy L1s which saves the L2 state, at the cost of performance
+ in all nested-capable guest entry/exit.
+
 config KVM_BOOKE_HV
bool
 
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 463534402107..945fc9a96439 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4034,8 +4034,14 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vcpu->arch.vpa.dirty = 1;
save_pmu = lp->pmcregs_in_use;
}
-   /* Must save pmu if this guest is capable of running nested guests */
-   save_pmu |= nesting_enabled(vcpu->kvm);
+   if (IS_ENABLED(CONFIG_KVM_BOOK3S_HV_NESTED_PMU_WORKAROUND)) {
+   /*
+* Save pmu if this guest is capable of running nested guests.
+* This is option is for old L1s that do not set their
+* lppaca->pmcregs_in_use properly when entering their L2.
+*/
+   save_pmu |= nesting_enabled(vcpu->kvm);
+   }
 
kvmhv_save_guest_pmu(vcpu, save_pmu);
 #ifdef CONFIG_PPC_PSERIES
-- 
2.23.0

[PATCH v3 09/52] powerpc/64s: Keep AMOR SPR a constant ~0 at runtime

2021-10-04 Thread Nicholas Piggin

This register controls supervisor SPR modifications, and as such is only
relevant for KVM. KVM always sets AMOR to ~0 on guest entry, and never
restores it coming back out to the host, so it can be kept constant and
avoid the mtSPR in KVM guest entry.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kernel/cpu_setup_power.c|  8 
 arch/powerpc/kernel/dt_cpu_ftrs.c|  2 ++
 arch/powerpc/kvm/book3s_hv_p9_entry.c|  2 --
 arch/powerpc/kvm/book3s_hv_rmhandlers.S  |  2 --
 arch/powerpc/mm/book3s64/radix_pgtable.c | 15 ---
 arch/powerpc/platforms/powernv/idle.c|  8 +++-
 6 files changed, 13 insertions(+), 24 deletions(-)

diff --git a/arch/powerpc/kernel/cpu_setup_power.c 
b/arch/powerpc/kernel/cpu_setup_power.c
index 3cca88ee96d7..a29dc8326622 100644
--- a/arch/powerpc/kernel/cpu_setup_power.c
+++ b/arch/powerpc/kernel/cpu_setup_power.c
@@ -137,6 +137,7 @@ void __setup_cpu_power7(unsigned long offset, struct 
cpu_spec *t)
return;
 
mtspr(SPRN_LPID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA206(mfspr(SPRN_LPCR), LPCR_LPES1 >> LPCR_LPES_SH);
 }
@@ -150,6 +151,7 @@ void __restore_cpu_power7(void)
return;
 
mtspr(SPRN_LPID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA206(mfspr(SPRN_LPCR), LPCR_LPES1 >> LPCR_LPES_SH);
 }
@@ -164,6 +166,7 @@ void __setup_cpu_power8(unsigned long offset, struct 
cpu_spec *t)
return;
 
mtspr(SPRN_LPID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA206(mfspr(SPRN_LPCR) | LPCR_PECEDH, 0); /* LPES = 0 */
init_HFSCR();
@@ -184,6 +187,7 @@ void __restore_cpu_power8(void)
return;
 
mtspr(SPRN_LPID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA206(mfspr(SPRN_LPCR) | LPCR_PECEDH, 0); /* LPES = 0 */
init_HFSCR();
@@ -202,6 +206,7 @@ void __setup_cpu_power9(unsigned long offset, struct 
cpu_spec *t)
mtspr(SPRN_PSSCR, 0);
mtspr(SPRN_LPID, 0);
mtspr(SPRN_PID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
 LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
@@ -223,6 +228,7 @@ void __restore_cpu_power9(void)
mtspr(SPRN_PSSCR, 0);
mtspr(SPRN_LPID, 0);
mtspr(SPRN_PID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
 LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
@@ -242,6 +248,7 @@ void __setup_cpu_power10(unsigned long offset, struct 
cpu_spec *t)
mtspr(SPRN_PSSCR, 0);
mtspr(SPRN_LPID, 0);
mtspr(SPRN_PID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
 LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
@@ -264,6 +271,7 @@ void __restore_cpu_power10(void)
mtspr(SPRN_PSSCR, 0);
mtspr(SPRN_LPID, 0);
mtspr(SPRN_PID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_PCR, PCR_MASK);
init_LPCR_ISA300((mfspr(SPRN_LPCR) | LPCR_PECEDH | LPCR_PECE_HVEE |\
 LPCR_HVICE | LPCR_HEIC) & ~(LPCR_UPRT | LPCR_HR), 0);
diff --git a/arch/powerpc/kernel/dt_cpu_ftrs.c 
b/arch/powerpc/kernel/dt_cpu_ftrs.c
index 358aee7c2d79..0a6b36b4bda8 100644
--- a/arch/powerpc/kernel/dt_cpu_ftrs.c
+++ b/arch/powerpc/kernel/dt_cpu_ftrs.c
@@ -80,6 +80,7 @@ static void __restore_cpu_cpufeatures(void)
mtspr(SPRN_LPCR, system_registers.lpcr);
if (hv_mode) {
mtspr(SPRN_LPID, 0);
+   mtspr(SPRN_AMOR, ~0);
mtspr(SPRN_HFSCR, system_registers.hfscr);
mtspr(SPRN_PCR, system_registers.pcr);
}
@@ -216,6 +217,7 @@ static int __init feat_enable_hv(struct dt_cpu_feature *f)
}
 
mtspr(SPRN_LPID, 0);
+   mtspr(SPRN_AMOR, ~0);
 
lpcr = mfspr(SPRN_LPCR);
lpcr &=  ~LPCR_LPES0; /* HV external interrupts */
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index bd8cf0a65ce8..a7f63082b4e3 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -286,8 +286,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2);
mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3);
 
-   mtspr(SPRN_AMOR, ~0UL);
-
local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_HV_P9;
 
/*
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 90484425a1e6..a5a2ef1c70ec 100644
--- a/arch/p

[PATCH v3 08/52] KVM: PPC: Book3S HV: POWER10 enable HAIL when running radix guests

2021-10-04 Thread Nicholas Piggin

HV interrupts may be taken with the MMU enabled when radix guests are
running. Enable LPCR[HAIL] on ISA v3.1 processors for radix guests.
Make this depend on the host LPCR[HAIL] being enabled. Currently that is
always enabled, but having this test means any issue that might require
LPCR[HAIL] to be disabled in the host will not have to be duplicated in
KVM.

This optimisation takes 1380 cycles off a NULL hcall entry+exit micro
benchmark on a POWER10.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 29 +
 1 file changed, 25 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e83c7aa7dbba..463534402107 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -5047,6 +5047,8 @@ static int kvmppc_hv_setup_htab_rma(struct kvm_vcpu *vcpu)
  */
 int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
 {
+   unsigned long lpcr, lpcr_mask;
+
if (nesting_enabled(kvm))
kvmhv_release_all_nested(kvm);
kvmppc_rmap_reset(kvm);
@@ -5056,8 +5058,13 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
kvm->arch.radix = 0;
spin_unlock(&kvm->mmu_lock);
kvmppc_free_radix(kvm);
-   kvmppc_update_lpcr(kvm, LPCR_VPM1,
-  LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+
+   lpcr = LPCR_VPM1;
+   lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+   if (cpu_has_feature(CPU_FTR_ARCH_31))
+   lpcr_mask |= LPCR_HAIL;
+   kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
+
return 0;
 }
 
@@ -5067,6 +5074,7 @@ int kvmppc_switch_mmu_to_hpt(struct kvm *kvm)
  */
 int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
 {
+   unsigned long lpcr, lpcr_mask;
int err;
 
err = kvmppc_init_vm_radix(kvm);
@@ -5078,8 +5086,17 @@ int kvmppc_switch_mmu_to_radix(struct kvm *kvm)
kvm->arch.radix = 1;
spin_unlock(&kvm->mmu_lock);
kvmppc_free_hpt(&kvm->arch.hpt);
-   kvmppc_update_lpcr(kvm, LPCR_UPRT | LPCR_GTSE | LPCR_HR,
-  LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR);
+
+   lpcr = LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+   lpcr_mask = LPCR_VPM1 | LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+   if (cpu_has_feature(CPU_FTR_ARCH_31)) {
+   lpcr_mask |= LPCR_HAIL;
+   if (cpu_has_feature(CPU_FTR_HVMODE) &&
+   (kvm->arch.host_lpcr & LPCR_HAIL))
+   lpcr |= LPCR_HAIL;
+   }
+   kvmppc_update_lpcr(kvm, lpcr, lpcr_mask);
+
return 0;
 }
 
@@ -5243,6 +5260,10 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm)
kvm->arch.mmu_ready = 1;
lpcr &= ~LPCR_VPM1;
lpcr |= LPCR_UPRT | LPCR_GTSE | LPCR_HR;
+   if (cpu_has_feature(CPU_FTR_HVMODE) &&
+   cpu_has_feature(CPU_FTR_ARCH_31) &&
+   (kvm->arch.host_lpcr & LPCR_HAIL))
+   lpcr |= LPCR_HAIL;
ret = kvmppc_init_vm_radix(kvm);
if (ret) {
kvmppc_free_lpid(kvm->arch.lpid);
-- 
2.23.0

[PATCH v3 07/52] powerpc/time: add API for KVM to re-arm the host timer/decrementer

2021-10-04 Thread Nicholas Piggin

Rather than have KVM look up the host timer and fiddle with the
irq-work internal details, have the powerpc/time.c code provide a
function for KVM to re-arm the Linux timer code when exiting a
guest.

This is implementation has an improvement over existing code of
marking a decrementer interrupt as soft-pending if a timer has
expired, rather than setting DEC to a -ve value, which tended to
cause host timers to take two interrupts (first hdec to exit the
guest, then the immediate dec).

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/time.h | 16 +++---
 arch/powerpc/kernel/time.c  | 52 +++--
 arch/powerpc/kvm/book3s_hv.c|  7 ++---
 3 files changed, 49 insertions(+), 26 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 69b6be617772..924b2157882f 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -99,18 +99,6 @@ extern void div128_by_32(u64 dividend_high, u64 dividend_low,
 extern void secondary_cpu_time_init(void);
 extern void __init time_init(void);
 
-#ifdef CONFIG_PPC64
-static inline unsigned long test_irq_work_pending(void)
-{
-   unsigned long x;
-
-   asm volatile("lbz %0,%1(13)"
-   : "=r" (x)
-   : "i" (offsetof(struct paca_struct, irq_work_pending)));
-   return x;
-}
-#endif
-
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
 static inline u64 timer_get_next_tb(void)
@@ -118,6 +106,10 @@ static inline u64 timer_get_next_tb(void)
return __this_cpu_read(decrementers_next_tb);
 }
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now);
+#endif
+
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 6ce40d2ac201..2a6c118a43fb 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -498,6 +498,16 @@ EXPORT_SYMBOL(profile_pc);
  * 64-bit uses a byte in the PACA, 32-bit uses a per-cpu variable...
  */
 #ifdef CONFIG_PPC64
+static inline unsigned long test_irq_work_pending(void)
+{
+   unsigned long x;
+
+   asm volatile("lbz %0,%1(13)"
+   : "=r" (x)
+   : "i" (offsetof(struct paca_struct, irq_work_pending)));
+   return x;
+}
+
 static inline void set_irq_work_pending_flag(void)
 {
asm volatile("stb %0,%1(13)" : :
@@ -541,13 +551,44 @@ void arch_irq_work_raise(void)
preempt_enable();
 }
 
+static void set_dec_or_work(u64 val)
+{
+   set_dec(val);
+   /* We may have raced with new irq work */
+   if (unlikely(test_irq_work_pending()))
+   set_dec(1);
+}
+
 #else  /* CONFIG_IRQ_WORK */
 
 #define test_irq_work_pending()0
 #define clear_irq_work_pending()
 
+static void set_dec_or_work(u64 val)
+{
+   set_dec(val);
+}
 #endif /* CONFIG_IRQ_WORK */
 
+#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
+void timer_rearm_host_dec(u64 now)
+{
+   u64 *next_tb = this_cpu_ptr(&decrementers_next_tb);
+
+   WARN_ON_ONCE(!arch_irqs_disabled());
+   WARN_ON_ONCE(mfmsr() & MSR_EE);
+
+   if (now >= *next_tb) {
+   local_paca->irq_happened |= PACA_IRQ_DEC;
+   } else {
+   now = *next_tb - now;
+   if (now <= decrementer_max)
+   set_dec_or_work(now);
+   }
+}
+EXPORT_SYMBOL_GPL(timer_rearm_host_dec);
+#endif
+
 /*
  * timer_interrupt - gets called when the decrementer overflows,
  * with interrupts disabled.
@@ -608,10 +649,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt)
} else {
now = *next_tb - now;
if (now <= decrementer_max)
-   set_dec(now);
-   /* We may have raced with new irq work */
-   if (test_irq_work_pending())
-   set_dec(1);
+   set_dec_or_work(now);
__this_cpu_inc(irq_stat.timer_irqs_others);
}
 
@@ -853,11 +891,7 @@ static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev)
 {
__this_cpu_write(decrementers_next_tb, get_tb() + evt);
-   set_dec(evt);
-
-   /* We may have raced with new irq work */
-   if (test_irq_work_pending())
-   set_dec(1);
+   set_dec_or_work(evt);
 
return 0;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index e4482bf546ed..e83c7aa7dbba 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4049,11 +4049,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vc->entry_exit_map = 0x101;
vc->in_guest = 0;
 
-   next_timer = timer_get_next_tb();
-   set_dec(next_timer - tb);
-   /* We may have raced with new irq work */
-   if (test_irq_work_pending())
-   set_dec(1);
+   timer_rearm_host_dec(tb);
+

[PATCH v3 06/52] KVM: PPC: Book3S HV P9: Reduce mftb per guest entry/exit

2021-10-04 Thread Nicholas Piggin

mftb is serialising (dispatch next-to-complete) so it is heavy weight
for a mfspr. Avoid reading it multiple times in the entry or exit paths.
A small number of cycles delay to timers is tolerable.

Reviewed-by: Fabiano Rosas 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c  | 4 ++--
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 5 +++--
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 30d400bf161b..e4482bf546ed 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3927,7 +3927,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
 *
 * XXX: Another day's problem.
 */
-   mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb());
+   mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb);
 
if (kvmhv_on_pseries()) {
/*
@@ -4050,7 +4050,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vc->in_guest = 0;
 
next_timer = timer_get_next_tb();
-   set_dec(next_timer - mftb());
+   set_dec(next_timer - tb);
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 0ff9ddb5e7ca..bd8cf0a65ce8 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -203,7 +203,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
unsigned long host_dawr1;
unsigned long host_dawrx1;
 
-   hdec = time_limit - mftb();
+   tb = mftb();
+   hdec = time_limit - tb;
if (hdec < 0)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
 
@@ -215,7 +216,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vcpu->arch.ceded = 0;
 
if (vc->tb_offset) {
-   u64 new_tb = mftb() + vc->tb_offset;
+   u64 new_tb = tb + vc->tb_offset;
mtspr(SPRN_TBU40, new_tb);
tb = mftb();
if ((tb & 0xff) < (new_tb & 0xff))
-- 
2.23.0

[PATCH v3 05/52] KVM: PPC: Book3S HV P9: Use large decrementer for HDEC

2021-10-04 Thread Nicholas Piggin

On processors that don't suppress the HDEC exceptions when LPCR[HDICE]=0,
this could help reduce needless guest exits due to leftover exceptions on
entering the guest.

Reviewed-by: Alexey Kardashevskiy 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/time.h   | 2 ++
 arch/powerpc/kernel/time.c| 1 +
 arch/powerpc/kvm/book3s_hv_p9_entry.c | 3 ++-
 3 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index fd09b4797fd7..69b6be617772 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -18,6 +18,8 @@
 #include 
 
 /* time.c */
+extern u64 decrementer_max;
+
 extern unsigned long tb_ticks_per_jiffy;
 extern unsigned long tb_ticks_per_usec;
 extern unsigned long tb_ticks_per_sec;
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index e84a087223ce..6ce40d2ac201 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -88,6 +88,7 @@ static struct clocksource clocksource_timebase = {
 
 #define DECREMENTER_DEFAULT_MAX 0x7FFF
 u64 decrementer_max = DECREMENTER_DEFAULT_MAX;
+EXPORT_SYMBOL_GPL(decrementer_max); /* for KVM HDEC */
 
 static int decrementer_set_next_event(unsigned long evt,
  struct clock_event_device *dev);
diff --git a/arch/powerpc/kvm/book3s_hv_p9_entry.c 
b/arch/powerpc/kvm/book3s_hv_p9_entry.c
index 961b3d70483c..0ff9ddb5e7ca 100644
--- a/arch/powerpc/kvm/book3s_hv_p9_entry.c
+++ b/arch/powerpc/kvm/book3s_hv_p9_entry.c
@@ -504,7 +504,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 
time_limit, unsigned long lpc
vc->tb_offset_applied = 0;
}
 
-   mtspr(SPRN_HDEC, 0x7fff);
+   /* HDEC must be at least as large as DEC, so decrementer_max fits */
+   mtspr(SPRN_HDEC, decrementer_max);
 
save_clear_guest_mmu(kvm, vcpu);
switch_mmu_to_host(kvm, host_pidr);
-- 
2.23.0

[PATCH v3 04/52] KVM: PPC: Book3S HV P9: Use host timer accounting to avoid decrementer read

2021-10-04 Thread Nicholas Piggin

There is no need to save away the host DEC value, as it is derived
from the host timer subsystem which maintains the next timer time,
so it can be restored from there.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/include/asm/time.h |  5 +
 arch/powerpc/kernel/time.c  |  1 +
 arch/powerpc/kvm/book3s_hv.c| 14 +++---
 3 files changed, 13 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 8c2c3dd4ddba..fd09b4797fd7 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -111,6 +111,11 @@ static inline unsigned long test_irq_work_pending(void)
 
 DECLARE_PER_CPU(u64, decrementers_next_tb);
 
+static inline u64 timer_get_next_tb(void)
+{
+   return __this_cpu_read(decrementers_next_tb);
+}
+
 /* Convert timebase ticks to nanoseconds */
 unsigned long long tb_to_ns(unsigned long long tb_ticks);
 
diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c
index 934d8ae66cc6..e84a087223ce 100644
--- a/arch/powerpc/kernel/time.c
+++ b/arch/powerpc/kernel/time.c
@@ -107,6 +107,7 @@ struct clock_event_device decrementer_clockevent = {
 EXPORT_SYMBOL(decrementer_clockevent);
 
 DEFINE_PER_CPU(u64, decrementers_next_tb);
+EXPORT_SYMBOL_GPL(decrementers_next_tb);
 static DEFINE_PER_CPU(struct clock_event_device, decrementers);
 
 #define XSEC_PER_SEC (1024*1024)
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 6a07a79f07d8..30d400bf161b 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3860,18 +3860,17 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
struct kvmppc_vcore *vc = vcpu->arch.vcore;
struct p9_host_os_sprs host_os_sprs;
s64 dec;
-   u64 tb;
+   u64 tb, next_timer;
int trap, save_pmu;
 
WARN_ON_ONCE(vcpu->arch.ceded);
 
-   dec = mfspr(SPRN_DEC);
tb = mftb();
-   if (dec < 0)
+   next_timer = timer_get_next_tb();
+   if (tb >= next_timer)
return BOOK3S_INTERRUPT_HV_DECREMENTER;
-   local_paca->kvm_hstate.dec_expires = dec + tb;
-   if (local_paca->kvm_hstate.dec_expires < time_limit)
-   time_limit = local_paca->kvm_hstate.dec_expires;
+   if (next_timer < time_limit)
+   time_limit = next_timer;
 
save_p9_host_os_sprs(&host_os_sprs);
 
@@ -4050,7 +4049,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vc->entry_exit_map = 0x101;
vc->in_guest = 0;
 
-   set_dec(local_paca->kvm_hstate.dec_expires - mftb());
+   next_timer = timer_get_next_tb();
+   set_dec(next_timer - mftb());
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
-- 
2.23.0

[PATCH v3 03/52] KMV: PPC: Book3S HV P9: Use set_dec to set decrementer to host

2021-10-04 Thread Nicholas Piggin

The host Linux timer code arms the decrementer with the value
'decrementers_next_tb - current_tb' using set_dec(), which stores
val - 1 on Book3S-64, which is not quite the same as what KVM does
to re-arm the host decrementer when exiting the guest.

This shouldn't be a significant change, but it makes the logic match
and avoids this small extra change being brought into the next patch.

Suggested-by: Alexey Kardashevskiy 
Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f4a779fffd18..6a07a79f07d8 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -4050,7 +4050,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, 
u64 time_limit,
vc->entry_exit_map = 0x101;
vc->in_guest = 0;
 
-   mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb());
+   set_dec(local_paca->kvm_hstate.dec_expires - mftb());
/* We may have raced with new irq work */
if (test_irq_work_pending())
set_dec(1);
-- 
2.23.0

[PATCH v3 02/52] powerpc/64s: guard optional TIDR SPR with CPU ftr test

2021-10-04 Thread Nicholas Piggin

The TIDR SPR only exists on POWER9. Avoid accessing it when the
feature bit for it is not set.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/kvm/book3s_hv.c | 12 
 arch/powerpc/xmon/xmon.c | 10 --
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2acb1c96cfaf..f4a779fffd18 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3767,7 +3767,8 @@ static void load_spr_state(struct kvm_vcpu *vcpu)
mtspr(SPRN_EBBHR, vcpu->arch.ebbhr);
mtspr(SPRN_EBBRR, vcpu->arch.ebbrr);
mtspr(SPRN_BESCR, vcpu->arch.bescr);
-   mtspr(SPRN_TIDR, vcpu->arch.tid);
+   if (cpu_has_feature(CPU_FTR_P9_TIDR))
+   mtspr(SPRN_TIDR, vcpu->arch.tid);
mtspr(SPRN_AMR, vcpu->arch.amr);
mtspr(SPRN_UAMOR, vcpu->arch.uamor);
 
@@ -3793,7 +3794,8 @@ static void store_spr_state(struct kvm_vcpu *vcpu)
vcpu->arch.ebbhr = mfspr(SPRN_EBBHR);
vcpu->arch.ebbrr = mfspr(SPRN_EBBRR);
vcpu->arch.bescr = mfspr(SPRN_BESCR);
-   vcpu->arch.tid = mfspr(SPRN_TIDR);
+   if (cpu_has_feature(CPU_FTR_P9_TIDR))
+   vcpu->arch.tid = mfspr(SPRN_TIDR);
vcpu->arch.amr = mfspr(SPRN_AMR);
vcpu->arch.uamor = mfspr(SPRN_UAMOR);
vcpu->arch.dscr = mfspr(SPRN_DSCR);
@@ -3813,7 +3815,8 @@ struct p9_host_os_sprs {
 static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs)
 {
host_os_sprs->dscr = mfspr(SPRN_DSCR);
-   host_os_sprs->tidr = mfspr(SPRN_TIDR);
+   if (cpu_has_feature(CPU_FTR_P9_TIDR))
+   host_os_sprs->tidr = mfspr(SPRN_TIDR);
host_os_sprs->iamr = mfspr(SPRN_IAMR);
host_os_sprs->amr = mfspr(SPRN_AMR);
host_os_sprs->fscr = mfspr(SPRN_FSCR);
@@ -3827,7 +3830,8 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu,
mtspr(SPRN_UAMOR, 0);
 
mtspr(SPRN_DSCR, host_os_sprs->dscr);
-   mtspr(SPRN_TIDR, host_os_sprs->tidr);
+   if (cpu_has_feature(CPU_FTR_P9_TIDR))
+   mtspr(SPRN_TIDR, host_os_sprs->tidr);
mtspr(SPRN_IAMR, host_os_sprs->iamr);
 
if (host_os_sprs->amr != vcpu->arch.amr)
diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c
index dd8241c009e5..7958e5aae844 100644
--- a/arch/powerpc/xmon/xmon.c
+++ b/arch/powerpc/xmon/xmon.c
@@ -2107,8 +2107,14 @@ static void dump_300_sprs(void)
if (!cpu_has_feature(CPU_FTR_ARCH_300))
return;
 
-   printf("pidr   = %.16lx  tidr  = %.16lx\n",
-   mfspr(SPRN_PID), mfspr(SPRN_TIDR));
+   if (cpu_has_feature(CPU_FTR_P9_TIDR)) {
+   printf("pidr   = %.16lx  tidr  = %.16lx\n",
+   mfspr(SPRN_PID), mfspr(SPRN_TIDR));
+   } else {
+   printf("pidr   = %.16lx\n",
+   mfspr(SPRN_PID));
+   }
+
printf("psscr  = %.16lx\n",
hv ? mfspr(SPRN_PSSCR) : mfspr(SPRN_PSSCR_PR));
 
-- 
2.23.0

[PATCH v3 01/52] powerpc/64s: Remove WORT SPR from POWER9/10 (take 2)

2021-10-04 Thread Nicholas Piggin

This removes a missed remnant of the WORT SPR.

Signed-off-by: Nicholas Piggin 
---
 arch/powerpc/platforms/powernv/idle.c | 1 -
 1 file changed, 1 deletion(-)

diff --git a/arch/powerpc/platforms/powernv/idle.c 
b/arch/powerpc/platforms/powernv/idle.c
index e3ffdc8e8567..86e787502e42 100644
--- a/arch/powerpc/platforms/powernv/idle.c
+++ b/arch/powerpc/platforms/powernv/idle.c
@@ -589,7 +589,6 @@ struct p9_sprs {
u64 purr;
u64 spurr;
u64 dscr;
-   u64 wort;
u64 ciabr;
 
u64 mmcra;
-- 
2.23.0

[PATCH v3 00/52] KVM: PPC: Book3S HV P9: entry/exit optimisations

2021-10-04 Thread Nicholas Piggin

This reduces radix guest full entry/exit latency on POWER9 and POWER10
by 2x.

Nested HV guests should see smaller improvements in their L1 entry/exit,
but this is also combined with most L0 speedups also applying to nested
entry. nginx localhost throughput test in a SMP nested guest is improved
about 10% (in a direct guest it doesn't change much because it uses XIVE
for IPIs) when L0 and L1 are patched.

It does this in several main ways:

- Rearrange code to optimise SPR accesses. Mainly, avoid scoreboard
  stalls.

- Test SPR values to avoid mtSPRs where possible. mtSPRs are expensive.

- Reduce mftb. mftb is expensive.

- Demand fault certain facilities to avoid saving and/or restoring them
  (at the cost of fault when they are used, but this is mitigated over
  a number of entries, like the facilities when context switching 
  processes). PM, TM, and EBB so far.

- Defer some sequences that are made just in case a guest is interrupted
  in the middle of a critical section to the case where the guest is
  scheduled on a different CPU, rather than every time (at the cost of
  an extra IPI in this case). Namely the tlbsync sequence for radix with
  GTSE, which is very expensive.

- Reduce locking, barriers, atomics related to the vcpus-per-vcore > 1
  handling that the P9 path does not require.

Changes since v2:
- Rebased, several patches from the series were merged in the previous
  merge window.
- Fixed some compile errors noticed by kernel test robot.
- Added RB from Athira for the PMU stuff (thanks!)
- Split TIDR ftr check (patch 2) out into its own patch.
- Added a missed license tag on new file.

Changes since v1:
- Verified DPDES changes still work with msgsndp SMT emulation.
- Fixed HMI handling bug.
- Split softpatch handling fixes into smaller pieces.
- Rebased with Fabiano's latest HV sanitising patches.
- Fix TM demand faulting bug causing nested guest TM tests to TM Bad
  Thing the host in rare cases.
- Re-name new "pmu=" command line option to "pmu_override=" and update
  documentation wording.
- Add default=y config option rather than unconditionally removing the
  L0 nested PMU workaround.
- Remove unnecessary MSR[RI] updates in entry/exit. Down to about 4700
  cycles now.
- Another bugfix from Alexey's testing.

Changes since RFC:
- Rebased with Fabiano's HV sanitising patches at the front.
- Several demand faulting bug fixes mostly relating to nested guests.
- Removed facility demand-faulting from L0 nested entry/exit handler.
  Demand faulting is still done in the L1, but not the L0. The reason
  is to reduce complexity (although it's only a small amount of
  complexity), reduce demand faulting overhead that may require several

Thanks,
Nick

Nicholas Piggin (52):
  powerpc/64s: Remove WORT SPR from POWER9/10 (take 2)
  powerpc/64s: guard optional TIDR SPR with CPU ftr test
  KMV: PPC: Book3S HV P9: Use set_dec to set decrementer to host
  KVM: PPC: Book3S HV P9: Use host timer accounting to avoid decrementer
read
  KVM: PPC: Book3S HV P9: Use large decrementer for HDEC
  KVM: PPC: Book3S HV P9: Reduce mftb per guest entry/exit
  powerpc/time: add API for KVM to re-arm the host timer/decrementer
  KVM: PPC: Book3S HV: POWER10 enable HAIL when running radix guests
  powerpc/64s: Keep AMOR SPR a constant ~0 at runtime
  KVM: PPC: Book3S HV: Don't always save PMU for guest capable of
nesting
  powerpc/64s: Always set PMU control registers to frozen/disabled when
not in use
  powerpc/64s: Implement PMU override command line option
  KVM: PPC: Book3S HV P9: Implement PMU save/restore in C
  KVM: PPC: Book3S HV P9: Factor PMU save/load into context switch
functions
  KVM: PPC: Book3S HV P9: Demand fault PMU SPRs when marked not inuse
  KVM: PPC: Book3S HV P9: Factor out yield_count increment
  KVM: PPC: Book3S HV: CTRL SPR does not require read-modify-write
  KVM: PPC: Book3S HV P9: Move SPRG restore to restore_p9_host_os_sprs
  KVM: PPC: Book3S HV P9: Reduce mtmsrd instructions required to save
host SPRs
  KVM: PPC: Book3S HV P9: Improve mtmsrd scheduling by delaying MSR[EE]
disable
  KVM: PPC: Book3S HV P9: Add kvmppc_stop_thread to match
kvmppc_start_thread
  KVM: PPC: Book3S HV: Change dec_expires to be relative to guest
timebase
  KVM: PPC: Book3S HV P9: Move TB updates
  KVM: PPC: Book3S HV P9: Optimise timebase reads
  KVM: PPC: Book3S HV P9: Avoid SPR scoreboard stalls
  KVM: PPC: Book3S HV P9: Only execute mtSPR if the value changed
  KVM: PPC: Book3S HV P9: Juggle SPR switching around
  KVM: PPC: Book3S HV P9: Move vcpu register save/restore into functions
  KVM: PPC: Book3S HV P9: Move host OS save/restore functions to
built-in
  KVM: PPC: Book3S HV P9: Move nested guest entry into its own function
  KVM: PPC: Book3S HV P9: Move remaining SPR and MSR access into low
level entry
  KVM: PPC: Book3S HV P9: Implement TM fastpath for guest entry/exit
  KVM: PPC: Book3S HV P9: Switch PMU to guest as late as possible
  KVM: PPC: Book3S HV P9: Rest

Re: [PATCH 5/5] powerpc/64s: Fix unrecoverable MCE calling async handler from NMI

2021-10-04 Thread Cédric Le Goater


On 10/4/21 16:56, Nicholas Piggin wrote:

The machine check handler is not considered NMI on 64s. The early
handler is the true NMI handler, and then it schedules the
machine_check_exception handler to run when interrupts are enabled.

This works fine except the case of an unrecoverable MCE, where the true
NMI is taken when MSR[RI] is clear, it can not recover, so it calls
machine_check_exception directly so something might be done about it.

Calling an async handler from NMI context can result in irq state and
other things getting corrupted. This can also trigger the BUG at
   arch/powerpc/include/asm/interrupt.h:168
   BUG_ON(!arch_irq_disabled_regs(regs) && !(regs->msr & MSR_EE));


I was hitting this problem when I rebooted a P8 tuleta system and
this series fixes it.

Tested-by: Cédric Le Goater 

Thanks,

C.
 

Fix this by making an _async version of the handler which is called
in the normal case, and a NMI version that is called for unrecoverable
interrupts.

Fixes: 2b43dd7653cc ("powerpc/64: enable MSR[EE] in irq replay pt_regs")
Signed-off-by: Nicholas Piggin > ---
  arch/powerpc/include/asm/interrupt.h |  5 ++---
  arch/powerpc/kernel/exceptions-64s.S |  8 +--
  arch/powerpc/kernel/traps.c  | 31 
  3 files changed, 26 insertions(+), 18 deletions(-)

diff --git a/arch/powerpc/include/asm/interrupt.h 
b/arch/powerpc/include/asm/interrupt.h
index b894b7169706..a1d238255f07 100644
--- a/arch/powerpc/include/asm/interrupt.h
+++ b/arch/powerpc/include/asm/interrupt.h
@@ -528,10 +528,9 @@ static __always_inline long ##func(struct pt_regs 
*regs)
  /* kernel/traps.c */
  DECLARE_INTERRUPT_HANDLER_NMI(system_reset_exception);
  #ifdef CONFIG_PPC_BOOK3S_64
-DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception);
-#else
-DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
+DECLARE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async);
  #endif
+DECLARE_INTERRUPT_HANDLER_NMI(machine_check_exception);
  DECLARE_INTERRUPT_HANDLER(SMIException);
  DECLARE_INTERRUPT_HANDLER(handle_hmi_exception);
  DECLARE_INTERRUPT_HANDLER(unknown_exception);
diff --git a/arch/powerpc/kernel/exceptions-64s.S 
b/arch/powerpc/kernel/exceptions-64s.S
index 024d9231f88c..eaf1f72131a1 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1243,7 +1243,7 @@ EXC_COMMON_BEGIN(machine_check_common)
li  r10,MSR_RI
mtmsrd  r10,1
addir3,r1,STACK_FRAME_OVERHEAD
-   bl  machine_check_exception
+   bl  machine_check_exception_async
b   interrupt_return_srr
  
  
@@ -1303,7 +1303,11 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE)

subir12,r12,1
sth r12,PACA_IN_MCE(r13)
  
-	/* Invoke machine_check_exception to print MCE event and panic. */

+   /*
+* Invoke machine_check_exception to print MCE event and panic.
+* This is the NMI version of the handler because we are called from
+* the early handler which is a true NMI.
+*/
addir3,r1,STACK_FRAME_OVERHEAD
bl  machine_check_exception
  
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c

index e453b13b..11741703d26e 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -796,24 +796,22 @@ void die_mce(const char *str, struct pt_regs *regs, long 
err)
 * do_exit() checks for in_interrupt() and panics in that case, so
 * exit the irq/nmi before calling die.
 */
-   if (IS_ENABLED(CONFIG_PPC_BOOK3S_64))
-   irq_exit();
-   else
+   if (in_nmi())
nmi_exit();
+   else
+   irq_exit();
die(str, regs, err);
  }
  
  /*

- * BOOK3S_64 does not call this handler as a non-maskable interrupt
+ * BOOK3S_64 does not usually call this handler as a non-maskable interrupt
   * (it uses its own early real-mode handler to handle the MCE proper
   * and then raises irq_work to call this handler when interrupts are
- * enabled).
+ * enabled). The only time when this is not true is if the early handler
+ * is unrecoverable, then it does call this directly to try to get a
+ * message out.
   */
-#ifdef CONFIG_PPC_BOOK3S_64
-DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception)
-#else
-DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
-#endif
+static void __machine_check_exception(struct pt_regs *regs)
  {
int recover = 0;
  
@@ -847,12 +845,19 @@ DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)

/* Must die if the interrupt is not recoverable */
if (regs_is_unrecoverable(regs))
die_mce("Unrecoverable Machine check", regs, SIGBUS);
+}
  
  #ifdef CONFIG_PPC_BOOK3S_64

-   return;
-#else
-   return 0;
+DEFINE_INTERRUPT_HANDLER_ASYNC(machine_check_exception_async)
+{
+   __machine_check_exception(regs);
+}
  #endif
+DEFINE_INTERRUPT_HANDLER_NMI(machine_check_exception)
+{
+   __machine_check_excep

1 2 >

1 - 100 of 123 matches

Mail list logo