Re: [PATCH next v3 12/15] printk: introduce a kmsg_dump iterator
Hello,

Thank you kernel test robot! Despite all of my efforts to carefully
construct and test this series, somehow I managed to miss a compile
test with CONFIG_MTD_OOPS. That kmsg_dumper does require the dumper
parameter so that it can use container_of().

I will discuss this with the printk team. But most likely we will just
re-instate the dumper parameter in the callback.

I apologize for the lack of care on my part.

John Ogness

On 2021-02-26, kernel test robot wrote:
> Hi John,
>
> I love your patch! Yet something to improve:
>
> [auto build test ERROR on next-20210225]
>
> url: https://github.com/0day-ci/linux/commits/John-Ogness/printk-remove-logbuf_lock/20210226-043457
> base: 7f206cf3ec2bee4621325cfacb2588e5085c07f5
> config: arm-randconfig-r024-20210225 (attached as .config)
> compiler: clang version 13.0.0 (https://github.com/llvm/llvm-project a921aaf789912d981cbb2036bdc91ad7289e1523)
> reproduce (this is a W=1 build):
>         wget https://raw.githubusercontent.com/intel/lkp-tests/master/sbin/make.cross -O ~/bin/make.cross
>         chmod +x ~/bin/make.cross
>         # install arm cross compiling tool for clang build
>         # apt-get install binutils-arm-linux-gnueabi
>         # https://github.com/0day-ci/linux/commit/fc7f655cded40fc98ba5304c200e3a01e8291fb4
>         git remote add linux-review https://github.com/0day-ci/linux
>         git fetch --no-tags linux-review John-Ogness/printk-remove-logbuf_lock/20210226-043457
>         git checkout fc7f655cded40fc98ba5304c200e3a01e8291fb4
>         # save the attached .config to linux build tree
>         COMPILER_INSTALL_PATH=$HOME/0day COMPILER=clang make.cross ARCH=arm
>
> If you fix the issue, kindly add following tag as appropriate
> Reported-by: kernel test robot
>
> All errors (new ones prefixed by >>):
>
> >> drivers/mtd/mtdoops.c:277:45: error: use of undeclared identifier 'dumper'
>            struct mtdoops_context *cxt = container_of(dumper,
>                                                       ^
> >> drivers/mtd/mtdoops.c:277:45: error: use of undeclared identifier 'dumper'
> >> drivers/mtd/mtdoops.c:277:45: error: use of undeclared identifier 'dumper'
>    3 errors generated.
> vim +/dumper +277 drivers/mtd/mtdoops.c
>
> 4b23aff083649e Richard Purdie 2007-05-29  274
> fc7f655cded40f John Ogness    2021-02-25  275  static void mtdoops_do_dump(enum kmsg_dump_reason reason)
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  276  {
> 2e386e4bac9055 Simon Kagstrom 2009-11-03 @277  	struct mtdoops_context *cxt = container_of(dumper,
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  278  			struct mtdoops_context, dump);
> fc7f655cded40f John Ogness    2021-02-25  279  	struct kmsg_dump_iter iter;
> fc2d557c74dc58 Seiji Aguchi   2011-01-12  280
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  281  	/* Only dump oopses if dump_oops is set */
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  282  	if (reason == KMSG_DUMP_OOPS && !dump_oops)
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  283  		return;
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  284
> fc7f655cded40f John Ogness    2021-02-25  285  	kmsg_dump_rewind(&iter);
> fc7f655cded40f John Ogness    2021-02-25  286
> df92cad8a03e83 John Ogness    2021-02-25  287  	if (test_and_set_bit(0, &cxt->oops_buf_busy))
> df92cad8a03e83 John Ogness    2021-02-25  288  		return;
> fc7f655cded40f John Ogness    2021-02-25  289  	kmsg_dump_get_buffer(&iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE,
> e2ae715d66bf4b Kay Sievers    2012-06-15  290  			     record_size - MTDOOPS_HEADER_SIZE, NULL);
> df92cad8a03e83 John Ogness    2021-02-25  291  	clear_bit(0, &cxt->oops_buf_busy);
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  292
> c1cf1d57d14922 Mark Tomlinson 2020-09-03  293  	if (reason != KMSG_DUMP_OOPS) {
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  294  		/* Panics must be written immediately */
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  295  		mtdoops_write(cxt, 1);
> c1cf1d57d14922 Mark Tomlinson 2020-09-03  296  	} else {
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  297  		/* For other cases, schedule work to write it "nicely" */
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  298  		schedule_work(&cxt->work_write);
> 2e386e4bac9055 Simon Kagstrom 2009-11-03  299  	}
> c1cf1d57d14922 Mark Tomlinson 2020-09-03  300  }
> 4b23aff083649e Richard Purdie 2007-05-29  301
>
> ---
> 0-DAY CI Kernel Test Service, Intel Corporation
> https://lists.01.org/hyperkitty/list/kbuild-...@lists.01.org
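For reference, a sketch of what re-instating the dumper parameter could
look like for mtdoops, assuming the iterator-on-stack model from this
series is kept. The body follows the blame listing above; this is only
one plausible shape, not the final fix:

	static void mtdoops_do_dump(struct kmsg_dumper *dumper,
				    enum kmsg_dump_reason reason)
	{
		/* dumper is back, so container_of() can recover the context */
		struct mtdoops_context *cxt = container_of(dumper,
				struct mtdoops_context, dump);
		struct kmsg_dump_iter iter;	/* iterator still lives on the stack */

		/* Only dump oopses if dump_oops is set */
		if (reason == KMSG_DUMP_OOPS && !dump_oops)
			return;

		kmsg_dump_rewind(&iter);

		if (test_and_set_bit(0, &cxt->oops_buf_busy))
			return;
		kmsg_dump_get_buffer(&iter, true, cxt->oops_buf + MTDOOPS_HEADER_SIZE,
				     record_size - MTDOOPS_HEADER_SIZE, NULL);
		clear_bit(0, &cxt->oops_buf_busy);

		if (reason != KMSG_DUMP_OOPS) {
			/* Panics must be written immediately */
			mtdoops_write(cxt, 1);
		} else {
			/* For other cases, schedule work to write it "nicely" */
			schedule_work(&cxt->work_write);
		}
	}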
[PATCH V2 2/2] powerpc/perf: Add platform specific check_attr_config
Add platform-specific attr.config value checks. The patch includes
checks for both power9 and power10.

Signed-off-by: Madhavan Srinivasan
---
Changelog v1:
- No changes.

 arch/powerpc/perf/isa207-common.c | 41 +++++++++++++++++++++++++++++++
 arch/powerpc/perf/isa207-common.h |  2 ++
 arch/powerpc/perf/power10-pmu.c   | 13 +++++++++++
 arch/powerpc/perf/power9-pmu.c    | 13 +++++++++++
 4 files changed, 69 insertions(+)

diff --git a/arch/powerpc/perf/isa207-common.c b/arch/powerpc/perf/isa207-common.c
index e4f577da33d8..b255799f5b51 100644
--- a/arch/powerpc/perf/isa207-common.c
+++ b/arch/powerpc/perf/isa207-common.c
@@ -694,3 +694,44 @@ int isa207_get_alternatives(u64 event, u64 alt[], int size, unsigned int flags,
 
 	return num_alt;
 }
+
+int isa3_X_check_attr_config(struct perf_event *ev)
+{
+	u64 val, sample_mode;
+	u64 event = ev->attr.config;
+
+	val = (event >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK;
+	sample_mode = val & 0x3;
+
+	/*
+	 * MMCRA[61:62] is Random Sampling Mode (SM).
+	 * value of 0b11 is reserved.
+	 */
+	if (sample_mode == 0x3)
+		return -1;
+
+	/*
+	 * Check for all reserved values
+	 */
+	switch (val) {
+	case 0x5:
+	case 0x9:
+	case 0xD:
+	case 0x19:
+	case 0x1D:
+	case 0x1A:
+	case 0x1E:
+		return -1;
+	}
+
+	/*
+	 * MMCRA[48:51]/[52:55]) Threshold Start/Stop
+	 * Events Selection.
+	 * 0b11110000/0b00001111 is reserved.
+	 */
+	val = (event >> EVENT_THR_CTL_SHIFT) & EVENT_THR_CTL_MASK;
+	if (((val & 0xF0) == 0xF0) || ((val & 0xF) == 0xF))
+		return -1;
+
+	return 0;
+}
diff --git a/arch/powerpc/perf/isa207-common.h b/arch/powerpc/perf/isa207-common.h
index 1af0e8c97ac7..ae8eaf05efd1 100644
--- a/arch/powerpc/perf/isa207-common.h
+++ b/arch/powerpc/perf/isa207-common.h
@@ -280,4 +280,6 @@ void isa207_get_mem_data_src(union perf_mem_data_src *dsrc, u32 flags,
 							struct pt_regs *regs);
 void isa207_get_mem_weight(u64 *weight);
 
+int isa3_X_check_attr_config(struct perf_event *ev);
+
 #endif
diff --git a/arch/powerpc/perf/power10-pmu.c b/arch/powerpc/perf/power10-pmu.c
index a901c1348cad..bc64354cab6a 100644
--- a/arch/powerpc/perf/power10-pmu.c
+++ b/arch/powerpc/perf/power10-pmu.c
@@ -106,6 +106,18 @@ static int power10_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 	return num_alt;
 }
 
+static int power10_check_attr_config(struct perf_event *ev)
+{
+	u64 val;
+	u64 event = ev->attr.config;
+
+	val = (event >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK;
+	if (val == 0x10 || isa3_X_check_attr_config(ev))
+		return -1;
+
+	return 0;
+}
+
 GENERIC_EVENT_ATTR(cpu-cycles,			PM_RUN_CYC);
 GENERIC_EVENT_ATTR(instructions,		PM_RUN_INST_CMPL);
 GENERIC_EVENT_ATTR(branch-instructions,	PM_BR_CMPL);
@@ -559,6 +571,7 @@ static struct power_pmu power10_pmu = {
 	.attr_groups		= power10_pmu_attr_groups,
 	.bhrb_nr		= 32,
 	.capabilities		= PERF_PMU_CAP_EXTENDED_REGS,
+	.check_attr_config	= power10_check_attr_config,
 };
 
 int init_power10_pmu(void)
diff --git a/arch/powerpc/perf/power9-pmu.c b/arch/powerpc/perf/power9-pmu.c
index 2a57e93a79dc..b3b9b226d053 100644
--- a/arch/powerpc/perf/power9-pmu.c
+++ b/arch/powerpc/perf/power9-pmu.c
@@ -151,6 +151,18 @@ static int power9_get_alternatives(u64 event, unsigned int flags, u64 alt[])
 	return num_alt;
 }
 
+static int power9_check_attr_config(struct perf_event *ev)
+{
+	u64 val;
+	u64 event = ev->attr.config;
+
+	val = (event >> EVENT_SAMPLE_SHIFT) & EVENT_SAMPLE_MASK;
+	if (val == 0xC || isa3_X_check_attr_config(ev))
+		return -1;
+
+	return 0;
+}
+
 GENERIC_EVENT_ATTR(cpu-cycles,			PM_CYC);
 GENERIC_EVENT_ATTR(stalled-cycles-frontend,	PM_ICT_NOSLOT_CYC);
 GENERIC_EVENT_ATTR(stalled-cycles-backend,	PM_CMPLU_STALL);
@@ -437,6 +449,7 @@ static struct power_pmu power9_pmu = {
 	.attr_groups		= power9_pmu_attr_groups,
 	.bhrb_nr		= 32,
 	.capabilities		= PERF_PMU_CAP_EXTENDED_REGS,
+	.check_attr_config	= power9_check_attr_config,
 };
 
 int init_power9_pmu(void)
-- 
2.26.2
[PATCH V2 1/2] powerpc/perf: Infrastructure to support checking of attr.config*
Introduce code to support the checking of attr.config* for values
which are reserved for a given platform. Performance Monitoring Unit
(PMU) configuration registers have fields that are reserved, and some
specific values for bit fields are reserved. For example, MMCRA[61:62]
is Random Sampling Mode (SM) and the value 0b11 for this field is
reserved.

Writing non-zero values in these fields, or writing invalid values to
bit fields, will have unknown behaviours.

Patch adds a generic call-back function "check_attr_config" in
"struct power_pmu", to be called in event_init to check attr.config*
values for a given platform. "check_attr_config" is valid only for the
raw event type.

Signed-off-by: Madhavan Srinivasan
---
Changelog v1:
- Fixed commit message and in-code comments

 arch/powerpc/include/asm/perf_event_server.h |  6 ++++++
 arch/powerpc/perf/core-book3s.c              | 14 ++++++++++++++
 2 files changed, 20 insertions(+)

diff --git a/arch/powerpc/include/asm/perf_event_server.h b/arch/powerpc/include/asm/perf_event_server.h
index 00e7e671bb4b..dde97d7d9253 100644
--- a/arch/powerpc/include/asm/perf_event_server.h
+++ b/arch/powerpc/include/asm/perf_event_server.h
@@ -67,6 +67,12 @@ struct power_pmu {
 	 * the pmu supports extended perf regs capability
 	 */
 	int		capabilities;
+	/*
+	 * Function to check event code for values which are
+	 * reserved. Function takes struct perf_event as input,
+	 * since event code could be spread in attr.config*
+	 */
+	int		(*check_attr_config)(struct perf_event *ev);
 };
 
 /*
diff --git a/arch/powerpc/perf/core-book3s.c b/arch/powerpc/perf/core-book3s.c
index 6817331e22ff..c6eeb4fdc5fd 100644
--- a/arch/powerpc/perf/core-book3s.c
+++ b/arch/powerpc/perf/core-book3s.c
@@ -1958,6 +1958,20 @@ static int power_pmu_event_init(struct perf_event *event)
 
 		if (ppmu->blacklist_ev && is_event_blacklisted(ev))
 			return -EINVAL;
+
+		/*
+		 * PMU config registers have fields that are
+		 * reserved and some specific values for bit fields
+		 * are reserved. For ex., MMCRA[61:62] is Random
+		 * Sampling Mode (SM) and value of 0b11 to this
+		 * field is reserved.
+		 *
+		 * This check is needed only for raw event type,
+		 * since tools like fuzzer use raw event type to
+		 * provide randomized event code values for test.
+		 */
+		if (ppmu->check_attr_config &&
+		    ppmu->check_attr_config(event))
+			return -EINVAL;
 		break;
 	default:
 		return -ENOENT;
-- 
2.26.2
Re: [PATCH v2 05/37] KVM: PPC: Book3S HV: Ensure MSR[ME] is always set in guest MSR
Hi Nick,

> void kvmppc_set_msr_hv(struct kvm_vcpu *vcpu, u64 msr)
> {
> +	/*
> +	 * Guest must always run with machine check interrupt
> +	 * enabled.
> +	 */
> +	if (!(msr & MSR_ME))
> +		msr |= MSR_ME;

This 'if' is technically redundant, but you mention a future patch
warning on !(msr & MSR_ME), so I'm holding off on any judgement about
the 'if' until I get to that patch :)

The patch seems sane to me. I agree that we don't want guests running
with MSR_ME=0, and kvmppc_set_msr_hv already ensures that the
transactional state is sane, so this is another sanity enforcement in
the same vein.

All up:
Reviewed-by: Daniel Axtens

Kind regards,
Daniel

> +
>  	/*
>  	 * Check for illegal transactional state bit combination
>  	 * and if we find it, force the TS field to a safe state.
> --
> 2.23.0
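For reference, since OR-ing in an already-set bit is a no-op, the hunk
above is equivalent to the single unconditional line below; the 'if'
only becomes meaningful once the later patch adds a warning on
!(msr & MSR_ME):

	msr |= MSR_ME;	/* same effect, with or without the 'if' */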
Re: [PATCH v2 04/37] powerpc/64s: remove KVM SKIP test from instruction breakpoint handler
Nicholas Piggin writes:

> The code being executed in KVM_GUEST_MODE_SKIP is hypervisor code with
> MSR[IR]=0, so the faults of concern are the d-side ones caused by
> access to guest context by the hypervisor.
>
> Instruction breakpoint interrupts are not a concern here. It's
> unlikely any good would come of causing breaks in this code, but
> skipping the instruction that caused it won't help matters (e.g., skip
> the mtmsr that sets MSR[DR]=0 or clears KVM_GUEST_MODE_SKIP).

I'm not entirely clear on the example here, but the patch makes sense
and I can follow your logic for removing the IKVM_SKIP handler from
the instruction breakpoint exception. On that basis:

Reviewed-by: Daniel Axtens

Kind regards,
Daniel

> Reviewed-by: Fabiano Rosas
> Signed-off-by: Nicholas Piggin
> ---
>  arch/powerpc/kernel/exceptions-64s.S | 1 -
>  1 file changed, 1 deletion(-)
>
> diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
> index a027600beeb1..0097e0676ed7 100644
> --- a/arch/powerpc/kernel/exceptions-64s.S
> +++ b/arch/powerpc/kernel/exceptions-64s.S
> @@ -2553,7 +2553,6 @@ EXC_VIRT_NONE(0x5200, 0x100)
>  INT_DEFINE_BEGIN(instruction_breakpoint)
>  	IVEC=0x1300
>  #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE
> -	IKVM_SKIP=1
>  	IKVM_REAL=1
>  #endif
>  INT_DEFINE_END(instruction_breakpoint)
> --
> 2.23.0
Re: [PATCH v4 12/14] swiotlb: Add restricted DMA alloc/free support.
On Fri, Feb 26, 2021 at 12:17:50PM +0800, Claire Chang wrote:
> Do you think I should fix this and rebase on the latest linux-next
> now? I wonder if there is more refactoring and cleanup coming and
> whether I should wait for that.

Here is my preferred plan:

 1) wait for my series to support the min alignment in swiotlb to land
    in Linus' tree
 2) I'll resend my series with the further swiotlb cleanup and
    refactoring, which includes a slightly rebased version of your
    patch to add the io_tlb_mem structure
 3) resend your series on top of that as a baseline

This is my current WIP tree for 2:

    http://git.infradead.org/users/hch/misc.git/shortlog/refs/heads/swiotlb-struct
Re: [PATCH v2 02/37] KVM: PPC: Book3S HV: Fix CONFIG_SPAPR_TCE_IOMMU=n default hcalls
Hi Nick,

> This config option causes the warning in init_default_hcalls to fire
> because the TCE handlers are in the default hcall list but not
> implemented.

I checked that the TCE handlers are indeed not defined unless
CONFIG_SPAPR_TCE_IOMMU=y, and so I can see how you would hit the
warning.

This seems like the right solution to me.

Reviewed-by: Daniel Axtens

Kind regards,
Daniel

> Signed-off-by: Nicholas Piggin
> ---
>  arch/powerpc/kvm/book3s_hv.c | 2 ++
>  1 file changed, 2 insertions(+)
>
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 13bad6bf4c95..895090636295 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -5369,8 +5369,10 @@ static unsigned int default_hcall_list[] = {
>  	H_READ,
>  	H_PROTECT,
>  	H_BULK_REMOVE,
> +#ifdef CONFIG_SPAPR_TCE_IOMMU
>  	H_GET_TCE,
>  	H_PUT_TCE,
> +#endif
>  	H_SET_DABR,
>  	H_SET_XDABR,
>  	H_CEDE,
> --
> 2.23.0
Re: [PATCH v2 01/37] KVM: PPC: Book3S 64: remove unused kvmppc_h_protect argument
Hi Nick,

> The va argument is not used in the function or set by its asm caller,
> so remove it to be safe.

Huh, so it isn't. I tracked the original implementation down to commit
a8606e20e41a ("KVM: PPC: Handle some PAPR hcalls in the kernel") where
paulus first added the ability to handle it in the kernel - there it
takes a va argument but even then doesn't do anything with it.

ajd also pointed out that we don't pass a va when linux is running as
a guest, and LoPAR does not mention va as an argument.

One small nit: checkpatch is complaining about spaces vs tabs:

ERROR: code indent should use tabs where possible
#25: FILE: arch/powerpc/include/asm/kvm_ppc.h:770:
+                      unsigned long pte_index, unsigned long avpn);$

WARNING: please, no spaces at the start of a line
#25: FILE: arch/powerpc/include/asm/kvm_ppc.h:770:
+                      unsigned long pte_index, unsigned long avpn);$

Once that is resolved,

Reviewed-by: Daniel Axtens

Kind regards,
Daniel Axtens

> Signed-off-by: Nicholas Piggin
> ---
>  arch/powerpc/include/asm/kvm_ppc.h  | 3 +--
>  arch/powerpc/kvm/book3s_hv_rm_mmu.c | 3 +--
>  2 files changed, 2 insertions(+), 4 deletions(-)
>
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h
> index 8aacd76bb702..9531b1c1b190 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -767,8 +767,7 @@ long kvmppc_h_remove(struct kvm_vcpu *vcpu, unsigned long flags,
>  		     unsigned long pte_index, unsigned long avpn);
>  long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu);
>  long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
> -		      unsigned long pte_index, unsigned long avpn,
> -		      unsigned long va);
> +		      unsigned long pte_index, unsigned long avpn);
>  long kvmppc_h_read(struct kvm_vcpu *vcpu, unsigned long flags,
>  		   unsigned long pte_index);
>  long kvmppc_h_clear_ref(struct kvm_vcpu *vcpu, unsigned long flags,
> diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> index 88da2764c1bb..7af7c70f1468 100644
> --- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> +++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
> @@ -673,8 +673,7 @@ long kvmppc_h_bulk_remove(struct kvm_vcpu *vcpu)
>  }
>  
>  long kvmppc_h_protect(struct kvm_vcpu *vcpu, unsigned long flags,
> -		      unsigned long pte_index, unsigned long avpn,
> -		      unsigned long va)
> +		      unsigned long pte_index, unsigned long avpn)
>  {
>  	struct kvm *kvm = vcpu->kvm;
>  	__be64 *hpte;
> --
> 2.23.0
Re: [PATCH v4 12/14] swiotlb: Add restricted DMA alloc/free support.
> diff --git a/kernel/dma/swiotlb.c b/kernel/dma/swiotlb.c
> index fd9c1bd183ac..8b77fd64199e 100644
> --- a/kernel/dma/swiotlb.c
> +++ b/kernel/dma/swiotlb.c
> @@ -836,6 +836,40 @@ late_initcall(swiotlb_create_default_debugfs);
>  #endif
>  
>  #ifdef CONFIG_DMA_RESTRICTED_POOL
> +struct page *dev_swiotlb_alloc(struct device *dev, size_t size, gfp_t gfp)
> +{
> +	struct swiotlb *swiotlb;
> +	phys_addr_t tlb_addr;
> +	unsigned int index;
> +
> +	/* dev_swiotlb_alloc can be used only in the context which permits sleeping. */
> +	if (!dev->dev_swiotlb || !gfpflags_allow_blocking(gfp))

Just noticed that !gfpflags_allow_blocking(gfp) shouldn't be here.

Hi Christoph,

Do you think I should fix this and rebase on the latest linux-next
now? I wonder if there is more refactoring and cleanup coming and
whether I should wait for that.

Thanks,
Claire
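A sketch of the corrected check, per the note above: only a missing
restricted pool should make the allocation bail out, and the gfp-flags
condition is dropped (returning NULL on failure is an assumption about
the surrounding function):

	/* Bail out only when the device has no restricted DMA pool;
	 * whether the context allows blocking is no longer checked here. */
	if (!dev->dev_swiotlb)
		return NULL;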
[PATCH v2] crypto/nx: add missing call to of_node_put()
In one of the error paths of the for_each_child_of_node() loop, add
missing call to of_node_put().

Fix the following coccicheck warning:
./drivers/crypto/nx/nx-common-powernv.c:927:1-23: WARNING: Function
"for_each_child_of_node" should have of_node_put() before return
around line 936.

Reported-by: Abaci Robot
Signed-off-by: Yang Li
---
Changes in v2:
- add braces for if

 drivers/crypto/nx/nx-common-powernv.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/crypto/nx/nx-common-powernv.c b/drivers/crypto/nx/nx-common-powernv.c
index 13c65de..446f611 100644
--- a/drivers/crypto/nx/nx-common-powernv.c
+++ b/drivers/crypto/nx/nx-common-powernv.c
@@ -932,8 +932,10 @@ static int __init nx_powernv_probe_vas(struct device_node *pn)
 
 		ret = find_nx_device_tree(dn, chip_id, vasid,
 			NX_CT_GZIP, "ibm,p9-nx-gzip", &ct_gzip);
-		if (ret)
+		if (ret) {
+			of_node_put(dn);
 			return ret;
+		}
 	}
 
 	if (!ct_842 || !ct_gzip) {
-- 
1.8.3.1
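For context, a minimal sketch of the general pattern this fixes (the
names parent/child/do_something are illustrative, not from the driver):
for_each_child_of_node() takes a reference on each child it visits and
drops it on the next iteration, so any early return must drop the
current child's reference by hand:

	struct device_node *child;
	int ret;

	for_each_child_of_node(parent, child) {
		ret = do_something(child);
		if (ret) {
			/* the iterator still holds a reference on 'child';
			 * drop it before leaving the loop early */
			of_node_put(child);
			return ret;
		}
	}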
Re: [RFC PATCH 8/8] powerpc/64/asm: don't reassign labels
Segher Boessenkool writes:

> On Thu, Feb 25, 2021 at 02:10:06PM +1100, Daniel Axtens wrote:
>> The assembler really does not like us reassigning things to the same
>> label:
>>
>> <instantiation>:7:9: error: invalid reassignment of non-absolute variable 'fs_label'
>>
>> This happens across a bunch of platforms:
>> https://github.com/ClangBuiltLinux/linux/issues/1043
>> https://github.com/ClangBuiltLinux/linux/issues/1008
>> https://github.com/ClangBuiltLinux/linux/issues/920
>> https://github.com/ClangBuiltLinux/linux/issues/1050
>>
>> There is no hope of getting this fixed in LLVM, so if we want to build
>> with LLVM_IAS, we need to hack around it ourselves.
>>
>> For us the big problem comes from this:
>>
>> #define USE_FIXED_SECTION(sname)	\
>> 	fs_label = start_##sname;	\
>> 	fs_start = sname##_start;	\
>> 	use_ftsec sname;
>>
>> #define USE_TEXT_SECTION()		\
>> 	fs_label = start_text;		\
>> 	fs_start = text_start;		\
>> 	.text
>>
>> and in particular fs_label.
>
> The "Setting Symbols" super short chapter reads:
>
> "A symbol can be given an arbitrary value by writing a symbol, followed
> by an equals sign '=', followed by an expression. This is equivalent
> to using the '.set' directive."
>
> And ".set" has
>
> "Set the value of SYMBOL to EXPRESSION. This changes SYMBOL's value and
> type to conform to EXPRESSION. If SYMBOL was flagged as external, it
> remains flagged.
>
> You may '.set' a symbol many times in the same assembly provided that
> the values given to the symbol are constants. Values that are based on
> expressions involving other symbols are allowed, but some targets may
> restrict this to only being done once per assembly. This is because
> those targets do not set the addresses of symbols at assembly time, but
> rather delay the assignment until a final link is performed. This
> allows the linker a chance to change the code in the files, changing the
> location of, and the relative distance between, various different
> symbols.
>
> If you '.set' a global symbol, the value stored in the object file is
> the last value stored into it."
>
> So this really should be fixed in clang: it is basic assembler syntax.

No doubt I have explained this poorly.

LLVM does allow some things; this builds fine, for example:

	.set foo, 8192
	addi %r3, %r3, foo
	.set foo, 1234
	addi %r3, %r3, foo

However, this does not:

	a:
	.set foo, a
	addi %r3, %r3, foo@l
	b:
	.set foo, b
	addi %r3, %r3, foo-a

$ clang -target ppc64le -integrated-as foo.s -o foo.o -c
foo.s:5:11: error: invalid reassignment of non-absolute variable 'foo' in '.set' directive
.set foo, b
          ^

gas, otoh, has no issues with reassignment:

$ powerpc64-linux-gnu-as foo.s -c -o foo.o
$ powerpc64-linux-gnu-objdump -dr foo.o

foo.o:     file format elf64-powerpc

Disassembly of section .text:

0000000000000000 <a>:
   0:	38 63 00 00 	addi	r3,r3,0
			2: R_PPC64_ADDR16_LO	.text
0000000000000004 <b>:
   4:	38 63 00 04 	addi	r3,r3,4

It seems the llvm assembler only does a single pass, so they're not
keen on trying to support reassigning labels with non-absolute values.

Kind regards,
Daniel

> Segher
Re: [RFC PATCH 7/8] powerpc/purgatory: drop .machine specifier
Segher Boessenkool writes:

> On Thu, Feb 25, 2021 at 02:10:05PM +1100, Daniel Axtens wrote:
>> It's ignored by future versions of llvm's integrated assembler (but
>> not by -11). I'm not sure what it does for us in gas.
>
> It enables all insns that exist on 620 (the first 64-bit PowerPC CPU).
>
>> --- a/arch/powerpc/purgatory/trampoline_64.S
>> +++ b/arch/powerpc/purgatory/trampoline_64.S
>> @@ -12,7 +12,7 @@
>>  #include
>>  #include
>>  
>> -.machine ppc64
>> +//upgrade clang, gets ignored .machine ppc64
>
> Why delete it if it is ignored? Why add a cryptic comment?

Sorry, poor form on my part. I think I will give up on having llvm-11
work and target llvm HEAD, which means I can drop this.

> Segher
Re: [RFC PATCH 5/8] powerpc/lib/quad: Provide macros for lq/stq
Segher Boessenkool writes:

> On Thu, Feb 25, 2021 at 02:10:03PM +1100, Daniel Axtens wrote:
>> +#define PPC_RAW_LQ(t, a, dq)	(0xe0000000 | ___PPC_RT(t) | ___PPC_RA(a) | (((dq) & 0xfff) << 3))
>
> Please keep the operand order the same as for the assembler insns? So
> t,dq,a here.
>
> It should be ((dq) & 0x0fff) << 4) .
>
>> +#define PPC_RAW_STQ(t, a, ds)	(0xf8000002 | ___PPC_RT(t) | ___PPC_RA(a) | (((ds) & 0xfff) << 3))
>
> And t,ds,a here. (But it should use "s" instead of "t" preferably, and
> use ___PPC_RS, because it is a source field, not a target).
>
> It should be ((ds) & 0x3fff) << 2) as well.

Ah, thank you. I'll fix this up.

Kind regards,
Daniel

> Segher
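Folding Segher's corrections in, the macros would plausibly end up as
below. This is a sketch, assuming the usual ___PPC_RT/___PPC_RS/___PPC_RA
field helpers from ppc-opcode.h; the primary opcodes are 56 (0xe0000000)
for the DQ-form lq and 62 with XO=2 (0xf8000002) for the DS-form stq:

	/* operand order matches the assembler: lq RT,DQ(RA) / stq RS,DS(RA) */
	#define PPC_RAW_LQ(t, dq, a)	(0xe0000000 | ___PPC_RT(t) | ___PPC_RA(a) | (((dq) & 0x0fff) << 4))
	#define PPC_RAW_STQ(s, ds, a)	(0xf8000002 | ___PPC_RS(s) | ___PPC_RA(a) | (((ds) & 0x3fff) << 2))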
Re: [RFC PATCH 4/8] powerpc/ppc_asm: use plain numbers for registers
Segher Boessenkool writes:

> On Thu, Feb 25, 2021 at 02:10:02PM +1100, Daniel Axtens wrote:
>> This is dumb but makes the llvm integrated assembler happy.
>> https://github.com/ClangBuiltLinux/linux/issues/764
>
>> -#define r0	%r0
>
>> +#define r0	0
>
> This is a big step back (compare 9a13a524ba37).
>
> If you use a new enough GAS, you can use the -mregnames option and just
> say "r0" directly (so not define it at all, or define it to itself).
>
> ===
> addi 3,3,3
> addi r3,r3,3
> addi %r3,%r3,3
>
> addi 3,3,3
> addi r3,r3,r3
> addi %r3,%r3,%r3
> ===
>
> $ as t.s -o t.o -mregnames
> t.s: Assembler messages:
> t.s:6: Warning: invalid register expression
> t.s:7: Warning: invalid register expression
>
> Many people do not like bare numbers. It is a bit like not wearing
> seatbelts (but so is all assembler code really: you just have to pay
> attention). A better argument is that it is harder to read for people
> not used to assembler code like this.
>
> We used to have "#define r0 0" etc., and that was quite problematic.
> Like that "addi r3,r3,r3" example, but also, people wrote "r0" where
> only a plain 0 is allowed (like in "lwzx r3,0,r3": "r0" would be
> misleading there!)

So an overarching comment on all of these patches is that they're not
intended to be ready to merge, nor are they necessarily what I think
is the best solution. I'm just swinging a big hammer to see how far
towards LLVM_IAS=1 I can get on powerpc, and I accept I'm going to
have to come back and clean things up.

Anyway, noted, I'll push harder on trying to get llvm to accept %rN:
there was a patch that went in after llvm-11 that should help.

Kind regards,
Daniel

> Segher
[PATCH v2] vio: make remove callback return void
The driver core ignores the return value of struct bus_type::remove()
because there is only little that can be done. To simplify the quest
to make this function return void, let struct vio_driver::remove()
return void, too. All users already unconditionally return 0; this
commit makes it obvious that returning an error code is a bad idea.

Note there are two nominally different implementations for a vio bus:
one in arch/sparc/kernel/vio.c and the other in
arch/powerpc/platforms/pseries/vio.c. This patch only adapts the
powerpc one.

Before this patch, for a device that was bound to a driver without a
remove callback, vio_cmo_bus_remove(viodev) wasn't called. As the
device core still considers the device unbound after vio_bus_remove()
returns, calling this unconditionally is the consistent behaviour
which is implemented here.

Reviewed-by: Tyrel Datwyler
Acked-by: Lijun Pan
Acked-by: Greg Kroah-Hartman
Signed-off-by: Uwe Kleine-König
---
Hello,

I dropped the sparc specific files (i.e. all that Michael Ellerman
didn't characterize as powerpc specific and verified that they are
indeed sparc-only). The commit log is adapted accordingly.

Best regards
Uwe

 arch/powerpc/include/asm/vio.h           | 2 +-
 arch/powerpc/platforms/pseries/vio.c     | 7 +++----
 drivers/char/hw_random/pseries-rng.c     | 3 +--
 drivers/char/tpm/tpm_ibmvtpm.c           | 4 +---
 drivers/crypto/nx/nx-842-pseries.c       | 4 +---
 drivers/crypto/nx/nx.c                   | 4 +---
 drivers/misc/ibmvmc.c                    | 4 +---
 drivers/net/ethernet/ibm/ibmveth.c       | 4 +---
 drivers/net/ethernet/ibm/ibmvnic.c       | 4 +---
 drivers/scsi/ibmvscsi/ibmvfc.c           | 3 +--
 drivers/scsi/ibmvscsi/ibmvscsi.c         | 4 +---
 drivers/scsi/ibmvscsi_tgt/ibmvscsi_tgt.c | 4 +---
 drivers/tty/hvc/hvcs.c                   | 3 +--
 13 files changed, 15 insertions(+), 35 deletions(-)

diff --git a/arch/powerpc/include/asm/vio.h b/arch/powerpc/include/asm/vio.h
index 0cf52746531b..721c0d6715ac 100644
--- a/arch/powerpc/include/asm/vio.h
+++ b/arch/powerpc/include/asm/vio.h
@@ -113,7 +113,7 @@ struct vio_driver {
 	const char *name;
 	const struct vio_device_id *id_table;
 	int (*probe)(struct vio_dev *dev, const struct vio_device_id *id);
-	int (*remove)(struct vio_dev *dev);
+	void (*remove)(struct vio_dev *dev);
 	/* A driver must have a get_desired_dma() function to
 	 * be loaded in a CMO environment if it uses DMA.
 	 */
diff --git a/arch/powerpc/platforms/pseries/vio.c b/arch/powerpc/platforms/pseries/vio.c
index b2797cfe4e2b..9cb4fc839fd5 100644
--- a/arch/powerpc/platforms/pseries/vio.c
+++ b/arch/powerpc/platforms/pseries/vio.c
@@ -1261,7 +1261,6 @@ static int vio_bus_remove(struct device *dev)
 	struct vio_dev *viodev = to_vio_dev(dev);
 	struct vio_driver *viodrv = to_vio_driver(dev->driver);
 	struct device *devptr;
-	int ret = 1;
 
 	/*
 	 * Hold a reference to the device after the remove function is called
@@ -1270,13 +1269,13 @@ static int vio_bus_remove(struct device *dev)
 	devptr = get_device(dev);
 
 	if (viodrv->remove)
-		ret = viodrv->remove(viodev);
+		viodrv->remove(viodev);
 
-	if (!ret && firmware_has_feature(FW_FEATURE_CMO))
+	if (firmware_has_feature(FW_FEATURE_CMO))
 		vio_cmo_bus_remove(viodev);
 
 	put_device(devptr);
-	return ret;
+	return 0;
 }
 
 /**
diff --git a/drivers/char/hw_random/pseries-rng.c b/drivers/char/hw_random/pseries-rng.c
index 8038a8a9fb58..f4949b689bd5 100644
--- a/drivers/char/hw_random/pseries-rng.c
+++ b/drivers/char/hw_random/pseries-rng.c
@@ -54,10 +54,9 @@ static int pseries_rng_probe(struct vio_dev *dev,
 	return hwrng_register(&pseries_rng);
 }
 
-static int pseries_rng_remove(struct vio_dev *dev)
+static void pseries_rng_remove(struct vio_dev *dev)
 {
 	hwrng_unregister(&pseries_rng);
-	return 0;
 }
 
 static const struct vio_device_id pseries_rng_driver_ids[] = {
diff --git a/drivers/char/tpm/tpm_ibmvtpm.c b/drivers/char/tpm/tpm_ibmvtpm.c
index 994385bf37c0..903604769de9 100644
--- a/drivers/char/tpm/tpm_ibmvtpm.c
+++ b/drivers/char/tpm/tpm_ibmvtpm.c
@@ -343,7 +343,7 @@ static int ibmvtpm_crq_send_init_complete(struct ibmvtpm_dev *ibmvtpm)
  *
  * Return: Always 0.
  */
-static int tpm_ibmvtpm_remove(struct vio_dev *vdev)
+static void tpm_ibmvtpm_remove(struct vio_dev *vdev)
 {
 	struct tpm_chip *chip = dev_get_drvdata(&vdev->dev);
 	struct ibmvtpm_dev *ibmvtpm = dev_get_drvdata(&chip->dev);
@@ -372,8 +372,6 @@ static int tpm_ibmvtpm_remove(struct vio_dev *vdev)
 	kfree(ibmvtpm);
 	/* For tpm_ibmvtpm_get_desired_dma */
 	dev_set_drvdata(&vdev->dev, NULL);
-
-	return 0;
 }
 
 /**
diff --git a/drivers/crypto/nx/nx-842-pseries.c b/drivers/crypto/nx/nx-842-pseries.c
index 2de5e3672e42..cc8dd3072b8b 100644
--- a/drivers/crypto/nx/nx-842-pseries.c
+++ b/drivers/crypto/nx/nx-842-pseries.c
Re: [PATCH v3 2/5] ibmvfc: fix invalid sub-CRQ handles after hard reset
On 2/25/21 1:42 PM, Tyrel Datwyler wrote:
> A hard reset results in a complete transport disconnect such that the
> CRQ connection with the partner VIOS is broken. This has the side effect
> of also invalidating the associated sub-CRQs. The current code assumes
> that the sub-CRQs are preserved, resulting in a protocol violation after
> trying to reconnect them with the VIOS. This introduces an infinite loop
> such that the VIOS forces a disconnect after each subsequent attempt to
> re-register with invalid handles.
>
> Avoid the aforementioned issue by releasing the sub-CRQs prior to CRQ
> disconnect, and driving a reinitialization of the sub-CRQs once a new
> CRQ is registered with the hypervisor.
>
> Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels")
> Signed-off-by: Tyrel Datwyler
> Reviewed-by: Brian King
> ---
>  drivers/scsi/ibmvscsi/ibmvfc.c | 21 +++++++++------------
>  1 file changed, 9 insertions(+), 12 deletions(-)
>
> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
> index 384960036f8b..2cca55f2e464 100644
> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
> @@ -158,6 +158,9 @@ static void ibmvfc_npiv_logout(struct ibmvfc_host *);
>  static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *);
>  static void ibmvfc_tgt_move_login(struct ibmvfc_target *);
>  
> +static void ibmvfc_release_sub_crqs(struct ibmvfc_host *);
> +static void ibmvfc_init_sub_crqs(struct ibmvfc_host *);
> +
>  static const char *unknown_error = "unknown error";
>  
>  static long h_reg_sub_crq(unsigned long unit_address, unsigned long ioba,
> @@ -926,8 +929,8 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost)
>  	unsigned long flags;
>  	struct vio_dev *vdev = to_vio_dev(vhost->dev);
>  	struct ibmvfc_queue *crq = &vhost->crq;
> -	struct ibmvfc_queue *scrq;
> -	int i;
> +
> +	ibmvfc_release_sub_crqs(vhost);
>  
>  	/* Close the CRQ */
>  	do {
> @@ -936,6 +939,8 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost)
>  		rc = plpar_hcall_norets(H_FREE_CRQ, vdev->unit_address);
>  	} while (rc == H_BUSY || H_IS_LONG_BUSY(rc));
>  
> +	ibmvfc_init_sub_crqs(vhost);

This has the same issue as patch 5, in that if we fail to set up the
sub-CRQs, do_enquiry will be set to zero, but the locked code region
below will then flip it back to one, which we don't want.

-T

> +
>  	spin_lock_irqsave(vhost->host->host_lock, flags);
>  	spin_lock(vhost->crq.q_lock);
>  	vhost->state = IBMVFC_NO_CRQ;
> @@ -947,16 +952,6 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost)
>  	memset(crq->msgs.crq, 0, PAGE_SIZE);
>  	crq->cur = 0;
>  
> -	if (vhost->scsi_scrqs.scrqs) {
> -		for (i = 0; i < nr_scsi_hw_queues; i++) {
> -			scrq = &vhost->scsi_scrqs.scrqs[i];
> -			spin_lock(scrq->q_lock);
> -			memset(scrq->msgs.scrq, 0, PAGE_SIZE);
> -			scrq->cur = 0;
> -			spin_unlock(scrq->q_lock);
> -		}
> -	}
> -
>  	/* And re-open it again */
>  	rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address,
>  				crq->msg_token, PAGE_SIZE);
> @@ -966,6 +961,7 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost)
>  		dev_warn(vhost->dev, "Partner adapter not ready\n");
>  	else if (rc != 0)
>  		dev_warn(vhost->dev, "Couldn't register crq (rc=%d)\n", rc);
> +
>  	spin_unlock(vhost->crq.q_lock);
>  	spin_unlock_irqrestore(vhost->host->host_lock, flags);
>  
> @@ -5692,6 +5688,7 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index)
>  
>  	free_irq(scrq->irq, scrq);
>  	irq_dispose_mapping(scrq->irq);
> +	scrq->irq = 0;
>  
>  	do {
>  		rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address,
>
Re: [PATCH v3 5/5] ibmvfc: reinitialize sub-CRQs and perform channel enquiry after LPM
On 2/25/21 1:42 PM, Tyrel Datwyler wrote:
> A live partition migration (LPM) results in a CRQ disconnect similar to
> a hard reset. In this LPM case the hypervisor mostly preserves the CRQ
> transport such that it simply needs to be reenabled. However, the
> capabilities may have changed, such as fewer channels, or no channels at
> all. Further, it's possible that there may be sub-CRQ support, but no
> channel support. The CRQ reenable path currently doesn't take any of
> this into consideration.
>
> For simplicity, release and reinitialize sub-CRQs during reenable, and
> set do_enquiry and using_channels with the appropriate values to trigger
> channel renegotiation.
>
> Signed-off-by: Tyrel Datwyler
> ---
>  drivers/scsi/ibmvscsi/ibmvfc.c | 12 ++++++++++++
>  1 file changed, 12 insertions(+)
>
> diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
> index 1bb08e5f3674..6bbc2697ad5a 100644
> --- a/drivers/scsi/ibmvscsi/ibmvfc.c
> +++ b/drivers/scsi/ibmvscsi/ibmvfc.c
> @@ -903,6 +903,9 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost)
>  {
>  	int rc = 0;
>  	struct vio_dev *vdev = to_vio_dev(vhost->dev);
> +	unsigned long flags;
> +
> +	ibmvfc_release_sub_crqs(vhost);
>  
>  	/* Re-enable the CRQ */
>  	do {
> @@ -914,6 +917,15 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost)
>  	if (rc)
>  		dev_err(vhost->dev, "Error enabling adapter (rc=%d)\n", rc);
>  
> +	ibmvfc_init_sub_crqs(vhost);

Realized that if this fails it sets the do_enquiry flag to zero, which
the locked region below will then flip back to one. Need to move
sub-CRQ init to after the locked region.

-T

> +
> +	spin_lock_irqsave(vhost->host->host_lock, flags);
> +	spin_lock(vhost->crq.q_lock);
> +	vhost->do_enquiry = 1;
> +	vhost->using_channels = 0;
> +	spin_unlock(vhost->crq.q_lock);
> +	spin_unlock_irqrestore(vhost->host->host_lock, flags);
> +
>  	return rc;
> }
>
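The v4 posting later in this digest does exactly that; the corrected
ordering looks like this (a sketch, trimmed to the relevant tail of
ibmvfc_reenable_crq_queue()):

	/* set the flags under the locks first ... */
	spin_lock_irqsave(vhost->host->host_lock, flags);
	spin_lock(vhost->crq.q_lock);
	vhost->do_enquiry = 1;
	vhost->using_channels = 0;
	spin_unlock(vhost->crq.q_lock);
	spin_unlock_irqrestore(vhost->host->host_lock, flags);

	/* ... then reinit the sub-CRQs, so a setup failure can clear
	 * do_enquiry without being overwritten by the locked region */
	ibmvfc_init_sub_crqs(vhost);

	return rc;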
[PATCH v1] powerpc: low_i2c: change @lock to raw_spinlock_t
i2c transfers are occurring with local interrupts disabled:

  smp_core99_give_timebase()
    local_irq_save();
    smp_core99_cypress_tb_freeze()
      pmac_i2c_xfer()
        kw_i2c_xfer()
          spin_lock_irqsave(&host->lock, flags)

This is a problem because with PREEMPT_RT a spinlock_t can sleep,
causing the system to hang.

Convert the spinlock_t to the non-sleeping raw_spinlock_t.

Signed-off-by: John Ogness
---
 arch/powerpc/platforms/powermac/low_i2c.c | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/low_i2c.c b/arch/powerpc/platforms/powermac/low_i2c.c
index f77a59b5c2e1..ba89c95ef290 100644
--- a/arch/powerpc/platforms/powermac/low_i2c.c
+++ b/arch/powerpc/platforms/powermac/low_i2c.c
@@ -116,7 +116,7 @@ struct pmac_i2c_host_kw
 	int			polled;
 	int			result;
 	struct completion	complete;
-	spinlock_t		lock;
+	raw_spinlock_t		lock;
 	struct timer_list	timeout_timer;
 };
 
@@ -346,14 +346,14 @@ static irqreturn_t kw_i2c_irq(int irq, void *dev_id)
 	struct pmac_i2c_host_kw *host = dev_id;
 	unsigned long flags;
 
-	spin_lock_irqsave(&host->lock, flags);
+	raw_spin_lock_irqsave(&host->lock, flags);
 	del_timer(&host->timeout_timer);
 	kw_i2c_handle_interrupt(host, kw_read_reg(reg_isr));
 	if (host->state != state_idle) {
 		host->timeout_timer.expires = jiffies + KW_POLL_TIMEOUT;
 		add_timer(&host->timeout_timer);
 	}
-	spin_unlock_irqrestore(&host->lock, flags);
+	raw_spin_unlock_irqrestore(&host->lock, flags);
 	return IRQ_HANDLED;
 }
 
@@ -362,7 +362,7 @@ static void kw_i2c_timeout(struct timer_list *t)
 	struct pmac_i2c_host_kw *host = from_timer(host, t, timeout_timer);
 	unsigned long flags;
 
-	spin_lock_irqsave(&host->lock, flags);
+	raw_spin_lock_irqsave(&host->lock, flags);
 
 	/*
 	 * If the timer is pending, that means we raced with the
@@ -377,7 +377,7 @@ static void kw_i2c_timeout(struct timer_list *t)
 		add_timer(&host->timeout_timer);
 	}
  skip:
-	spin_unlock_irqrestore(&host->lock, flags);
+	raw_spin_unlock_irqrestore(&host->lock, flags);
 }
 
 static int kw_i2c_open(struct pmac_i2c_bus *bus)
@@ -470,9 +470,9 @@ static int kw_i2c_xfer(struct pmac_i2c_bus *bus, u8 addrdir, int subsize,
 			unsigned long flags;
 
 			u8 isr = kw_i2c_wait_interrupt(host);
 
-			spin_lock_irqsave(&host->lock, flags);
+			raw_spin_lock_irqsave(&host->lock, flags);
 			kw_i2c_handle_interrupt(host, isr);
-			spin_unlock_irqrestore(&host->lock, flags);
+			raw_spin_unlock_irqrestore(&host->lock, flags);
 		}
 	}
 
@@ -508,7 +508,7 @@ static struct pmac_i2c_host_kw *__init kw_i2c_host_init(struct device_node *np)
 	}
 	mutex_init(&host->mutex);
 	init_completion(&host->complete);
-	spin_lock_init(&host->lock);
+	raw_spin_lock_init(&host->lock);
 	timer_setup(&host->timeout_timer, kw_i2c_timeout, 0);
 
 	psteps = of_get_property(np, "AAPL,address-step", NULL);
-- 
2.20.1
Re: [PATCH next v3 12/15] printk: introduce a kmsg_dump iterator
On Thu, Feb 25, 2021 at 09:24:35PM +0100, John Ogness wrote: > Rather than storing the iterator information in the registered > kmsg_dumper structure, create a separate iterator structure. The > kmsg_dump_iter structure can reside on the stack of the caller, thus > allowing lockless use of the kmsg_dump functions. > > This change also means that the kmsg_dumper dump() callback no > longer needs to pass in the kmsg_dumper as an argument. If > kmsg_dumpers want to access the kernel logs, they can use the new > iterator. > > Update the kmsg_dumper callback prototype. Update code that accesses > the kernel logs using the kmsg_dumper structure to use the new > kmsg_dump_iter structure. For kmsg_dumpers, this also means adding a > call to kmsg_dump_rewind() to initialize the iterator. > > All this is in preparation for removal of @logbuf_lock. > > Signed-off-by: John Ogness > --- > arch/powerpc/kernel/nvram_64.c | 14 +++--- > arch/powerpc/platforms/powernv/opal-kmsg.c | 3 +- > arch/powerpc/xmon/xmon.c | 6 +-- > arch/um/kernel/kmsg_dump.c | 8 +-- > drivers/hv/vmbus_drv.c | 7 +-- > drivers/mtd/mtdoops.c | 8 +-- > fs/pstore/platform.c | 8 +-- Reviewed-by: Kees Cook # pstore -Kees > include/linux/kmsg_dump.h | 38 --- > kernel/debug/kdb/kdb_main.c| 10 ++-- > kernel/printk/printk.c | 57 ++ > 10 files changed, 81 insertions(+), 78 deletions(-) > > diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c > index 532f22637783..5a64b24a91c2 100644 > --- a/arch/powerpc/kernel/nvram_64.c > +++ b/arch/powerpc/kernel/nvram_64.c > @@ -72,8 +72,7 @@ static const char *nvram_os_partitions[] = { > NULL > }; > > -static void oops_to_nvram(struct kmsg_dumper *dumper, > - enum kmsg_dump_reason reason); > +static void oops_to_nvram(enum kmsg_dump_reason reason); > > static struct kmsg_dumper nvram_kmsg_dumper = { > .dump = oops_to_nvram > @@ -642,11 +641,11 @@ void __init nvram_init_oops_partition(int > rtas_partition_exists) > * that we think will compress sufficiently to fit in the lnx,oops-log > * partition. If that's too much, go back and capture uncompressed text. > */ > -static void oops_to_nvram(struct kmsg_dumper *dumper, > - enum kmsg_dump_reason reason) > +static void oops_to_nvram(enum kmsg_dump_reason reason) > { > struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; > static unsigned int oops_count = 0; > + static struct kmsg_dump_iter iter; > static bool panicking = false; > static DEFINE_SPINLOCK(lock); > unsigned long flags; > @@ -681,13 +680,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, > return; > > if (big_oops_buf) { > - kmsg_dump_get_buffer(dumper, false, > + kmsg_dump_rewind(&iter); > + kmsg_dump_get_buffer(&iter, false, >big_oops_buf, big_oops_buf_sz, &text_len); > rc = zip_oops(text_len); > } > if (rc != 0) { > - kmsg_dump_rewind(dumper); > - kmsg_dump_get_buffer(dumper, false, > + kmsg_dump_rewind(&iter); > + kmsg_dump_get_buffer(&iter, false, >oops_data, oops_data_sz, &text_len); > err_type = ERR_TYPE_KERNEL_PANIC; > oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); > diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c > b/arch/powerpc/platforms/powernv/opal-kmsg.c > index 6c3bc4b4da98..a7bd6ac681f4 100644 > --- a/arch/powerpc/platforms/powernv/opal-kmsg.c > +++ b/arch/powerpc/platforms/powernv/opal-kmsg.c > @@ -19,8 +19,7 @@ > * may not be completely printed. This function does not actually dump the > * message, it just ensures that OPAL completely flushes the console buffer. 
> */ > -static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper, > - enum kmsg_dump_reason reason) > +static void kmsg_dump_opal_console_flush(enum kmsg_dump_reason reason) > { > /* >* Outside of a panic context the pollers will continue to run, > diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c > index 80ed3e1becf9..5978b90a885f 100644 > --- a/arch/powerpc/xmon/xmon.c > +++ b/arch/powerpc/xmon/xmon.c > @@ -3001,7 +3001,7 @@ print_address(unsigned long addr) > static void > dump_log_buf(void) > { > - struct kmsg_dumper dumper; > + struct kmsg_dump_iter iter; > unsigned char buf[128]; > size_t len; > > @@ -3013,9 +3013,9 @@ dump_log_buf(void) > catch_memory_errors = 1; > sync(); > > - kmsg_dump_rewind_nolock(&dumper); > + kmsg_dump_rewind_nolock(&iter); > xmon_start_pagination(); > - while (kmsg_dum
[PATCH v4 2/5] ibmvfc: fix invalid sub-CRQ handles after hard reset
A hard reset results in a complete transport disconnect such that the
CRQ connection with the partner VIOS is broken. This has the side
effect of also invalidating the associated sub-CRQs. The current code
assumes that the sub-CRQs are preserved, resulting in a protocol
violation after trying to reconnect them with the VIOS. This
introduces an infinite loop such that the VIOS forces a disconnect
after each subsequent attempt to re-register with invalid handles.

Avoid the aforementioned issue by releasing the sub-CRQs prior to CRQ
disconnect, and driving a reinitialization of the sub-CRQs once a new
CRQ is registered with the hypervisor.

Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels")
Signed-off-by: Tyrel Datwyler
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 21 +++++++++------------
 1 file changed, 9 insertions(+), 12 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 384960036f8b..d34e1a4f74d9 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -158,6 +158,9 @@ static void ibmvfc_npiv_logout(struct ibmvfc_host *);
 static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *);
 static void ibmvfc_tgt_move_login(struct ibmvfc_target *);
 
+static void ibmvfc_release_sub_crqs(struct ibmvfc_host *);
+static void ibmvfc_init_sub_crqs(struct ibmvfc_host *);
+
 static const char *unknown_error = "unknown error";
 
 static long h_reg_sub_crq(unsigned long unit_address, unsigned long ioba,
@@ -926,8 +929,8 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost)
 	unsigned long flags;
 	struct vio_dev *vdev = to_vio_dev(vhost->dev);
 	struct ibmvfc_queue *crq = &vhost->crq;
-	struct ibmvfc_queue *scrq;
-	int i;
+
+	ibmvfc_release_sub_crqs(vhost);
 
 	/* Close the CRQ */
 	do {
@@ -947,16 +950,6 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost)
 	memset(crq->msgs.crq, 0, PAGE_SIZE);
 	crq->cur = 0;
 
-	if (vhost->scsi_scrqs.scrqs) {
-		for (i = 0; i < nr_scsi_hw_queues; i++) {
-			scrq = &vhost->scsi_scrqs.scrqs[i];
-			spin_lock(scrq->q_lock);
-			memset(scrq->msgs.scrq, 0, PAGE_SIZE);
-			scrq->cur = 0;
-			spin_unlock(scrq->q_lock);
-		}
-	}
-
 	/* And re-open it again */
 	rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address,
 				crq->msg_token, PAGE_SIZE);
@@ -966,9 +959,12 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost)
 		dev_warn(vhost->dev, "Partner adapter not ready\n");
 	else if (rc != 0)
 		dev_warn(vhost->dev, "Couldn't register crq (rc=%d)\n", rc);
+
 	spin_unlock(vhost->crq.q_lock);
 	spin_unlock_irqrestore(vhost->host->host_lock, flags);
 
+	ibmvfc_init_sub_crqs(vhost);
+
 	return rc;
 }
 
@@ -5692,6 +5688,7 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index)
 
 	free_irq(scrq->irq, scrq);
 	irq_dispose_mapping(scrq->irq);
+	scrq->irq = 0;
 
 	do {
 		rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address,
-- 
2.27.0
[PATCH v4 5/5] ibmvfc: reinitialize sub-CRQs and perform channel enquiry after LPM
A live partition migration (LPM) results in a CRQ disconnect similar
to a hard reset. In this LPM case the hypervisor mostly preserves the
CRQ transport such that it simply needs to be reenabled. However, the
capabilities may have changed, such as fewer channels, or no channels
at all. Further, it's possible that there may be sub-CRQ support, but
no channel support. The CRQ reenable path currently doesn't take any
of this into consideration.

For simplicity, release and reinitialize sub-CRQs during reenable, and
set do_enquiry and using_channels with the appropriate values to
trigger channel renegotiation.

Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels")
Signed-off-by: Tyrel Datwyler
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index ef03fa559433..1e2ea21713ad 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -903,6 +903,9 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost)
 {
 	int rc = 0;
 	struct vio_dev *vdev = to_vio_dev(vhost->dev);
+	unsigned long flags;
+
+	ibmvfc_release_sub_crqs(vhost);
 
 	/* Re-enable the CRQ */
 	do {
@@ -914,6 +917,15 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost)
 	if (rc)
 		dev_err(vhost->dev, "Error enabling adapter (rc=%d)\n", rc);
 
+	spin_lock_irqsave(vhost->host->host_lock, flags);
+	spin_lock(vhost->crq.q_lock);
+	vhost->do_enquiry = 1;
+	vhost->using_channels = 0;
+	spin_unlock(vhost->crq.q_lock);
+	spin_unlock_irqrestore(vhost->host->host_lock, flags);
+
+	ibmvfc_init_sub_crqs(vhost);
+
 	return rc;
 }
-- 
2.27.0
[PATCH v4 0/5] ibmvfc: hard reset fixes
This series contains a minor simplification of ibmvfc_init_sub_crqs()
followed by a couple of fixes for sub-CRQ handling which affect hard
reset of the client/host adapter CRQ pair.

changes in v4:
* Patch 2: dropped Reviewed-by tag and moved sub-CRQ init to after locked region
* Patch 5: moved sub-CRQ init to after locked region

changes in v3:
* Patch 1 & 5: moved ibmvfc_init_sub_crqs out of locked path

changes in v2:
* added Reviewed-by tags for patches 1-3
* Patch 4: use rtas_busy_delay to test rc and delay correct amount of time
* Patch 5: (new) similar fix for LPM case where CRQ pair needs re-enablement

Tyrel Datwyler (5):
  powerpc/pseries: extract host bridge from pci_bus prior to bus removal
  ibmvfc: simplify handling of sub-CRQ initialization
  ibmvfc: fix invalid sub-CRQ handles after hard reset
  ibmvfc: treat H_CLOSED as success during sub-CRQ registration
  ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup

 arch/powerpc/platforms/pseries/pci_dlpar.c |  4 +-
 drivers/scsi/ibmvscsi/ibmvfc.c             | 49 ++++++++++++----------
 2 files changed, 26 insertions(+), 27 deletions(-)

-- 
2.27.0
[PATCH v4 3/5] ibmvfc: treat H_CLOSED as success during sub-CRQ registration
A non-zero return code for H_REG_SUB_CRQ is currently treated as a
failure, resulting in failing sub-CRQ setup. The case of H_CLOSED
should not be treated as a failure. This return code translates to a
successful sub-CRQ registration by the hypervisor, and is meant to
communicate back that there is currently no partner VIOS CRQ
connection established as of yet. This is a common occurrence during a
disconnect where the client adapter can possibly come back up prior to
the partner adapter.

For a non-zero return code from H_REG_SUB_CRQ, treat H_CLOSED as
success so that sub-CRQs are successfully set up.

Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels")
Signed-off-by: Tyrel Datwyler
Reviewed-by: Brian King
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index d34e1a4f74d9..1d9f961715ca 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -5636,7 +5636,8 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost,
 	rc = h_reg_sub_crq(vdev->unit_address, scrq->msg_token, PAGE_SIZE,
 			   &scrq->cookie, &scrq->hw_irq);
 
-	if (rc) {
+	/* H_CLOSED indicates successful register, but no CRQ partner */
+	if (rc && rc != H_CLOSED) {
 		dev_warn(dev, "Error registering sub-crq: %d\n", rc);
 		if (rc == H_PARAMETER)
 			dev_warn_once(dev, "Firmware may not support MQ\n");
-- 
2.27.0
[PATCH v4 4/5] ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup
The H_FREE_SUB_CRQ hypercall can return a retry delay return code that
indicates the call needs to be retried after a specific delay. The
error path to free a sub-CRQ in case of a failure during channel
registration fails to capture the return code of H_FREE_SUB_CRQ, which
will result in the delay loop being skipped in the case of a retry
delay return code.

Store the return code result of the H_FREE_SUB_CRQ call such that the
return code check in the delay loop evaluates a meaningful value.
Also, use rtas_busy_delay() to check the rc value and delay for the
appropriate amount of time.

Fixes: 9288d35d70b5 ("ibmvfc: map/request irq and register Sub-CRQ interrupt handler")
Signed-off-by: Tyrel Datwyler
Reviewed-by: Brian King
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 1d9f961715ca..ef03fa559433 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -5670,8 +5671,8 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost,
 
 irq_failed:
 	do {
-		plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie);
-	} while (rc == H_BUSY || H_IS_LONG_BUSY(rc));
+		rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie);
+	} while (rtas_busy_delay(rc));
 reg_failed:
 	ibmvfc_free_queue(vhost, scrq);
 	LEAVE;
-- 
2.27.0
[PATCH v4 1/5] ibmvfc: simplify handling of sub-CRQ initialization
If ibmvfc_init_sub_crqs() fails, ibmvfc_probe() simply parrots a
registration failure reported elsewhere, and further,
vhost->scsi_scrq.scrq == NULL is indication enough to the driver that
it has no sub-CRQs available. The mq_enabled check can also be moved
into ibmvfc_init_sub_crqs() such that each caller doesn't have to gate
the call with a mq_enabled check. Finally, in the case of sub-CRQ
setup failure, setting do_enquiry can be turned off, putting the
driver into single queue fallback mode.

The aforementioned changes also simplify the next patch in the series,
which fixes a hard reset issue, by tying a sub-CRQ setup failure and
do_enquiry logic into ibmvfc_init_sub_crqs().

Signed-off-by: Tyrel Datwyler
Reviewed-by: Brian King
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 21 ++++++++++-----------
 1 file changed, 10 insertions(+), 11 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 7097028d4cb6..384960036f8b 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -5705,17 +5705,21 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index)
 	LEAVE;
 }
 
-static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost)
+static void ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost)
 {
 	int i, j;
 
 	ENTER;
+	if (!vhost->mq_enabled)
+		return;
 
 	vhost->scsi_scrqs.scrqs = kcalloc(nr_scsi_hw_queues,
 					  sizeof(*vhost->scsi_scrqs.scrqs),
 					  GFP_KERNEL);
-	if (!vhost->scsi_scrqs.scrqs)
-		return -1;
+	if (!vhost->scsi_scrqs.scrqs) {
+		vhost->do_enquiry = 0;
+		return;
+	}
 
 	for (i = 0; i < nr_scsi_hw_queues; i++) {
 		if (ibmvfc_register_scsi_channel(vhost, i)) {
@@ -5724,13 +5728,12 @@ static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost)
 			kfree(vhost->scsi_scrqs.scrqs);
 			vhost->scsi_scrqs.scrqs = NULL;
 			vhost->scsi_scrqs.active_queues = 0;
-			LEAVE;
-			return -1;
+			vhost->do_enquiry = 0;
+			break;
 		}
 	}
 
 	LEAVE;
-	return 0;
 }
 
 static void ibmvfc_release_sub_crqs(struct ibmvfc_host *vhost)
@@ -5997,11 +6000,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id)
 		goto remove_shost;
 	}
 
-	if (vhost->mq_enabled) {
-		rc = ibmvfc_init_sub_crqs(vhost);
-		if (rc)
-			dev_warn(dev, "Failed to allocate Sub-CRQs. rc=%d\n", rc);
-	}
+	ibmvfc_init_sub_crqs(vhost);
 
 	if (shost_to_fc_host(shost)->rqst_q)
 		blk_queue_max_segments(shost_to_fc_host(shost)->rqst_q, 1);
-- 
2.27.0
[PATCH v3 5/5] ibmvfc: reinitialize sub-CRQs and perform channel enquiry after LPM
A live partition migration (LPM) results in a CRQ disconnect similar
to a hard reset. In this LPM case the hypervisor mostly preserves the
CRQ transport such that it simply needs to be reenabled. However, the
capabilities may have changed, such as fewer channels, or no channels
at all. Further, it's possible that there may be sub-CRQ support, but
no channel support. The CRQ reenable path currently doesn't take any
of this into consideration.

For simplicity, release and reinitialize sub-CRQs during reenable, and
set do_enquiry and using_channels with the appropriate values to
trigger channel renegotiation.

Signed-off-by: Tyrel Datwyler
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 1bb08e5f3674..6bbc2697ad5a 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -903,6 +903,9 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost)
 {
 	int rc = 0;
 	struct vio_dev *vdev = to_vio_dev(vhost->dev);
+	unsigned long flags;
+
+	ibmvfc_release_sub_crqs(vhost);
 
 	/* Re-enable the CRQ */
 	do {
@@ -914,6 +917,15 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost)
 	if (rc)
 		dev_err(vhost->dev, "Error enabling adapter (rc=%d)\n", rc);
 
+	ibmvfc_init_sub_crqs(vhost);
+
+	spin_lock_irqsave(vhost->host->host_lock, flags);
+	spin_lock(vhost->crq.q_lock);
+	vhost->do_enquiry = 1;
+	vhost->using_channels = 0;
+	spin_unlock(vhost->crq.q_lock);
+	spin_unlock_irqrestore(vhost->host->host_lock, flags);
+
 	return rc;
 }
-- 
2.27.0
[PATCH v3 4/5] ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup
The H_FREE_SUB_CRQ hypercall can return a retry delay return code that
indicates the call needs to be retried after a specific delay. The
error path to free a sub-CRQ in case of a failure during channel
registration fails to capture the return code of H_FREE_SUB_CRQ, which
will result in the delay loop being skipped in the case of a retry
delay return code.

Store the return code result of the H_FREE_SUB_CRQ call such that the
return code check in the delay loop evaluates a meaningful value.
Also, use rtas_busy_delay() to check the rc value and delay for the
appropriate amount of time.

Fixes: 9288d35d70b5 ("ibmvfc: map/request irq and register Sub-CRQ interrupt handler")
Signed-off-by: Tyrel Datwyler
Reviewed-by: Brian King
---
 drivers/scsi/ibmvscsi/ibmvfc.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c
index 274c5a1fac9c..1bb08e5f3674 100644
--- a/drivers/scsi/ibmvscsi/ibmvfc.c
+++ b/drivers/scsi/ibmvscsi/ibmvfc.c
@@ -21,6 +21,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -5670,8 +5671,8 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost,
 
 irq_failed:
 	do {
-		plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie);
-	} while (rc == H_BUSY || H_IS_LONG_BUSY(rc));
+		rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie);
+	} while (rtas_busy_delay(rc));
 reg_failed:
 	ibmvfc_free_queue(vhost, scrq);
 	LEAVE;
-- 
2.27.0
[PATCH v3 3/5] ibmvfc: treat H_CLOSED as success during sub-CRQ registration
A non-zero return code for H_REG_SUB_CRQ is currently treated as a failure, causing sub-CRQ setup to fail. The case of H_CLOSED should not be treated as a failure. This return code translates to a successful sub-CRQ registration by the hypervisor, and is meant to communicate back that there is currently no partner VIOS CRQ connection established as of yet. This is a common occurrence during a disconnect where the client adapter can possibly come back up prior to the partner adapter. For a non-zero return code from H_REG_SUB_CRQ, treat H_CLOSED as success so that sub-CRQs are successfully set up. Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels") Signed-off-by: Tyrel Datwyler Reviewed-by: Brian King --- drivers/scsi/ibmvscsi/ibmvfc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 2cca55f2e464..274c5a1fac9c 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -5636,7 +5636,8 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost, rc = h_reg_sub_crq(vdev->unit_address, scrq->msg_token, PAGE_SIZE, &scrq->cookie, &scrq->hw_irq); - if (rc) { + /* H_CLOSED indicates successful register, but no CRQ partner */ + if (rc && rc != H_CLOSED) { dev_warn(dev, "Error registering sub-crq: %d\n", rc); if (rc == H_PARAMETER) dev_warn_once(dev, "Firmware may not support MQ\n"); -- 2.27.0
[PATCH v3 0/5] ibmvfc: hard reset fixes
This series contains a minor simplification of ibmvfc_init_sub_crqs() followed by a couple of fixes for sub-CRQ handling which affect hard reset of the client/host adapter CRQ pair. changes in v3: * Patch 1 & 5: moved ibmvfc_init_sub_crqs out of the locked path changes in v2: * added Reviewed-by tags for patches 1-3 * Patch 4: use rtas_busy_delay to test rc and delay the correct amount of time * Patch 5: (new) similar fix for LPM case where CRQ pair needs re-enablement Tyrel Datwyler (5): powerpc/pseries: extract host bridge from pci_bus prior to bus removal ibmvfc: simplify handling of sub-CRQ initialization ibmvfc: fix invalid sub-CRQ handles after hard reset ibmvfc: treat H_CLOSED as success during sub-CRQ registration ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup arch/powerpc/platforms/pseries/pci_dlpar.c | 4 ++-- drivers/scsi/ibmvscsi/ibmvfc.c | 49 ++++++++++++++++++++++++-------------------------- 2 files changed, 26 insertions(+), 27 deletions(-) -- 2.27.0
[PATCH v3 1/5] ibmvfc: simplify handling of sub-CRQ initialization
If ibmvfc_init_sub_crqs() fails ibmvfc_probe() simply parrots the registration failure reported elsewhere, and further vhost->scsi_scrqs.scrqs == NULL is indication enough to the driver that it has no sub-CRQs available. The mq_enabled check can also be moved into ibmvfc_init_sub_crqs() such that each caller doesn't have to gate the call with a mq_enabled check. Finally, in the case of sub-CRQ setup failure, do_enquiry can be turned off, putting the driver into single-queue fallback mode. The aforementioned changes also simplify the next patch in the series, which fixes a hard reset issue, by tying the sub-CRQ setup failure and do_enquiry logic into ibmvfc_init_sub_crqs(). Signed-off-by: Tyrel Datwyler Reviewed-by: Brian King --- drivers/scsi/ibmvscsi/ibmvfc.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 7097028d4cb6..384960036f8b 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -5705,17 +5705,21 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index) LEAVE; } -static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) +static void ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) { int i, j; ENTER; + if (!vhost->mq_enabled) + return; vhost->scsi_scrqs.scrqs = kcalloc(nr_scsi_hw_queues, sizeof(*vhost->scsi_scrqs.scrqs), GFP_KERNEL); - if (!vhost->scsi_scrqs.scrqs) - return -1; + if (!vhost->scsi_scrqs.scrqs) { + vhost->do_enquiry = 0; + return; + } for (i = 0; i < nr_scsi_hw_queues; i++) { if (ibmvfc_register_scsi_channel(vhost, i)) { @@ -5724,13 +5728,12 @@ static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) kfree(vhost->scsi_scrqs.scrqs); vhost->scsi_scrqs.scrqs = NULL; vhost->scsi_scrqs.active_queues = 0; - LEAVE; - return -1; + vhost->do_enquiry = 0; + break; } } LEAVE; - return 0; } static void ibmvfc_release_sub_crqs(struct ibmvfc_host *vhost) @@ -5997,11 +6000,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id) goto remove_shost; } - if (vhost->mq_enabled) { - rc = ibmvfc_init_sub_crqs(vhost); - if (rc) - dev_warn(dev, "Failed to allocate Sub-CRQs. rc=%d\n", rc); - } + ibmvfc_init_sub_crqs(vhost); if (shost_to_fc_host(shost)->rqst_q) blk_queue_max_segments(shost_to_fc_host(shost)->rqst_q, 1); -- 2.27.0
[PATCH v3 2/5] ibmvfc: fix invalid sub-CRQ handles after hard reset
A hard reset results in a complete transport disconnect such that the CRQ connection with the partner VIOS is broken. This has the side effect of also invalidating the associated sub-CRQs. The current code assumes that the sub-CRQs are preserved, resulting in a protocol violation after trying to reconnect them with the VIOS. This introduces an infinite loop such that the VIOS forces a disconnect after each subsequent attempt to re-register with invalid handles. Avoid the aforementioned issue by releasing the sub-CRQs prior to CRQ disconnect, and driving a reinitialization of the sub-CRQs once a new CRQ is registered with the hypervisor. Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels") Signed-off-by: Tyrel Datwyler Reviewed-by: Brian King --- drivers/scsi/ibmvscsi/ibmvfc.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 384960036f8b..2cca55f2e464 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -158,6 +158,9 @@ static void ibmvfc_npiv_logout(struct ibmvfc_host *); static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *); static void ibmvfc_tgt_move_login(struct ibmvfc_target *); +static void ibmvfc_release_sub_crqs(struct ibmvfc_host *); +static void ibmvfc_init_sub_crqs(struct ibmvfc_host *); + static const char *unknown_error = "unknown error"; static long h_reg_sub_crq(unsigned long unit_address, unsigned long ioba, @@ -926,8 +929,8 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) unsigned long flags; struct vio_dev *vdev = to_vio_dev(vhost->dev); struct ibmvfc_queue *crq = &vhost->crq; - struct ibmvfc_queue *scrq; - int i; + + ibmvfc_release_sub_crqs(vhost); /* Close the CRQ */ do { @@ -936,6 +939,8 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) rc = plpar_hcall_norets(H_FREE_CRQ, vdev->unit_address); } while (rc == H_BUSY || H_IS_LONG_BUSY(rc)); + ibmvfc_init_sub_crqs(vhost); + spin_lock_irqsave(vhost->host->host_lock, flags); spin_lock(vhost->crq.q_lock); vhost->state = IBMVFC_NO_CRQ; @@ -947,16 +952,6 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) memset(crq->msgs.crq, 0, PAGE_SIZE); crq->cur = 0; - if (vhost->scsi_scrqs.scrqs) { - for (i = 0; i < nr_scsi_hw_queues; i++) { - scrq = &vhost->scsi_scrqs.scrqs[i]; - spin_lock(scrq->q_lock); - memset(scrq->msgs.scrq, 0, PAGE_SIZE); - scrq->cur = 0; - spin_unlock(scrq->q_lock); - } - } - /* And re-open it again */ rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address, crq->msg_token, PAGE_SIZE); @@ -966,6 +961,7 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) dev_warn(vhost->dev, "Partner adapter not ready\n"); else if (rc != 0) dev_warn(vhost->dev, "Couldn't register crq (rc=%d)\n", rc); + spin_unlock(vhost->crq.q_lock); spin_unlock_irqrestore(vhost->host->host_lock, flags); @@ -5692,6 +5688,7 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index) free_irq(scrq->irq, scrq); irq_dispose_mapping(scrq->irq); + scrq->irq = 0; do { rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, -- 2.27.0
Re: [PATCH v2 5/5] ibmvfc: reinitialize sub-CRQs and perform channel enquiry after LPM
On 2/25/21 2:48 PM, Tyrel Datwyler wrote: > A live partition migration (LPM) results in a CRQ disconnect similar to > a hard reset. In this LPM case the hypervisor mostly preserves the CRQ > transport such that it simply needs to be re-enabled. However, the > capabilities may have changed, such as fewer channels or no channels at > all. Further, it's possible that there may be sub-CRQ support, but no > channel support. The CRQ re-enable path currently doesn't take any of > this into consideration. > > For simplicity, release and reinitialize sub-CRQs during re-enable, and > set do_enquiry and using_channels with the appropriate values to trigger > channel renegotiation. > > Signed-off-by: Tyrel Datwyler > --- > drivers/scsi/ibmvscsi/ibmvfc.c | 13 +++++++++++++ > 1 file changed, 13 insertions(+) > > diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c > index 4ac2c442e1e2..9ae6be56e375 100644 > --- a/drivers/scsi/ibmvscsi/ibmvfc.c > +++ b/drivers/scsi/ibmvscsi/ibmvfc.c > @@ -903,6 +903,9 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host > *vhost) > { > int rc = 0; > struct vio_dev *vdev = to_vio_dev(vhost->dev); > + unsigned long flags; > + > + ibmvfc_release_sub_crqs(vhost); > > /* Re-enable the CRQ */ > do { > @@ -914,6 +917,16 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host > *vhost) > if (rc) > dev_err(vhost->dev, "Error enabling adapter (rc=%d)\n", rc); > > + spin_lock_irqsave(vhost->host->host_lock, flags); > + spin_lock(vhost->crq.q_lock); > + vhost->do_enquiry = 1; > + vhost->using_channels = 0; > + > + ibmvfc_init_sub_crqs(vhost); > + > + spin_unlock(vhost->crq.q_lock); > + spin_unlock_irqrestore(vhost->host->host_lock, flags); ibmvfc_init_sub_crqs can sleep, for multiple reasons, so you can't hold a lock when you call it. There is a GFP_KERNEL allocation in it, and the patch before this one adds an msleep in an error path. Thanks, Brian -- Brian King Power Linux I/O IBM Linux Technology Center
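To illustrate the constraint Brian points out, here is a minimal sketch (hypothetical helpers, not the driver code): anything that can sleep (a GFP_KERNEL allocation, an msleep()) must run before the spinlock is taken, with only non-sleeping state updates left under the lock:

#include <linux/spinlock.h>

static DEFINE_SPINLOCK(example_lock);

/* Hypothetical stand-ins for ibmvfc_init_sub_crqs() and the
 * do_enquiry/using_channels updates; sketch only. */
void do_sleeping_setup(void);		/* may sleep: GFP_KERNEL alloc, msleep() */
void update_state_nonsleeping(void);

static void reenable_example(void)
{
	unsigned long flags;

	do_sleeping_setup();		/* sleepable work first, no locks held */

	spin_lock_irqsave(&example_lock, flags);
	update_state_nonsleeping();	/* only atomic-context-safe updates here */
	spin_unlock_irqrestore(&example_lock, flags);
}

This is the ordering v3 of the patch adopts: ibmvfc_init_sub_crqs() is called before host_lock and crq.q_lock are taken.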
Re: [PATCH v2 4/5] ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup
Reviewed-by: Brian King -- Brian King Power Linux I/O IBM Linux Technology Center
Freeing page tables through RCU
In order to walk the page tables without the mmap semaphore, it must be possible to prevent them from being freed and reused (e.g. if munmap() races with viewing /proc/$pid/smaps). There is various commentary within the mm code on how to prevent this. One way is to disable interrupts, relying on that to block rcu_sched or IPIs. I don't think the RT people are terribly happy about reading a proc file disabling interrupts, and it doesn't work for architectures that free page tables directly instead of batching them into an rcu_sched (because the IPI may not be sent to this CPU if the task has never run on it). See "Fast GUP" in mm/gup.c Ideally, I'd like rcu_read_lock() to delay page table reuse. This is close to trivial for architectures which use entire pages or multiple pages for levels of their page tables, as we can use the rcu_head embedded in struct page to queue the page for RCU. s390 and powerpc are the only two architectures I know of that have levels of their page table that are smaller than their PAGE_SIZE. I'd like to discuss options. There may be a complicated scheme that allows partial pages to be freed via RCU, but I have something simpler in mind. For powerpc in particular, it can have a PAGE_SIZE of 64kB and then the MMU wants to see 4kB entries in the PMD. I suggest that instead of allocating each 4kB entry individually, we allocate a 64kB page and fill in 16 consecutive PMDs. This could cost a bit more memory (although if you've asked for a CONFIG_PAGE_SIZE of 64kB, you presumably don't care too much about it), but it'll make future page faults cheaper (as the PMDs will already be present, assuming you have good locality of reference). I'd like to hear better ideas than this.
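To make the two ideas concrete, a rough sketch (purely illustrative; pud_populate_at() and HYP_PMD_TABLE_SIZE are placeholders, not existing interfaces):

#include <linux/mm.h>
#include <linux/rcupdate.h>
#include <linux/sizes.h>

/* Idea 1: a table level that occupies a whole page can ride the rcu_head
 * embedded in struct page, so walkers under rcu_read_lock() are safe. */
static void table_free_rcu(struct rcu_head *head)
{
	__free_page(container_of(head, struct page, rcu_head));
}

static void free_table_page_rcu(struct page *table)
{
	call_rcu(&table->rcu_head, table_free_rcu);
}

/* Idea 2 (powerpc, 64kB PAGE_SIZE, 4kB PMD tables): back 16 consecutive
 * PMD-level tables with a single 64kB page instead of allocating each
 * 4kB table individually. */
#define HYP_PMD_TABLE_SIZE	SZ_4K				/* placeholder */
#define PMDS_PER_PAGE		(PAGE_SIZE / HYP_PMD_TABLE_SIZE)

void pud_populate_at(pud_t *pud, int idx, void *pmd_table);	/* placeholder */

static int fill_pmd_group(pud_t *pud, gfp_t gfp)
{
	struct page *page = alloc_page(gfp);
	int i;

	if (!page)
		return -ENOMEM;
	for (i = 0; i < PMDS_PER_PAGE; i++)
		pud_populate_at(pud, i,
				page_address(page) + i * HYP_PMD_TABLE_SIZE);
	return 0;
}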
Re: [PATCH v2 5/5] ibmvfc: reinitialize sub-CRQs and perform channel enquiry after LPM
On 2/25/21 12:48 PM, Tyrel Datwyler wrote: > A live partition migration (LPM) results in a CRQ disconnect similar to > a hard reset. In this LPM case the hypervisor mostly preserves the CRQ > transport such that it simply needs to be re-enabled. However, the > capabilities may have changed, such as fewer channels or no channels at > all. Further, it's possible that there may be sub-CRQ support, but no > channel support. The CRQ re-enable path currently doesn't take any of > this into consideration. > > For simplicity, release and reinitialize sub-CRQs during re-enable, and > set do_enquiry and using_channels with the appropriate values to trigger > channel renegotiation. > This should have had the same Fixes tag as patch 2. Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels") > Signed-off-by: Tyrel Datwyler > --- > drivers/scsi/ibmvscsi/ibmvfc.c | 13 +++++++++++++ > 1 file changed, 13 insertions(+) > > diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c > index 4ac2c442e1e2..9ae6be56e375 100644 > --- a/drivers/scsi/ibmvscsi/ibmvfc.c > +++ b/drivers/scsi/ibmvscsi/ibmvfc.c > @@ -903,6 +903,9 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host > *vhost) > { > int rc = 0; > struct vio_dev *vdev = to_vio_dev(vhost->dev); > + unsigned long flags; > + > + ibmvfc_release_sub_crqs(vhost); > > /* Re-enable the CRQ */ > do { > @@ -914,6 +917,16 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host > *vhost) > if (rc) > dev_err(vhost->dev, "Error enabling adapter (rc=%d)\n", rc); > > + spin_lock_irqsave(vhost->host->host_lock, flags); > + spin_lock(vhost->crq.q_lock); > + vhost->do_enquiry = 1; > + vhost->using_channels = 0; > + > + ibmvfc_init_sub_crqs(vhost); > + > + spin_unlock(vhost->crq.q_lock); > + spin_unlock_irqrestore(vhost->host->host_lock, flags); > + > return rc; > } > >
[PATCH v2 4/5] ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup
The H_FREE_SUB_CRQ hypercall can return a retry delay return code that indicates the call needs to be retried after a specific time delay. The error path to free a sub-CRQ in case of a failure during channel registration fails to capture the return code of H_FREE_SUB_CRQ, which results in the delay loop being skipped whenever a retry delay code is returned. Store the return code result of the H_FREE_SUB_CRQ call such that the return code check in the delay loop evaluates a meaningful value. Also, use rtas_busy_delay() to check the rc value and delay for the appropriate amount of time. Fixes: 9288d35d70b5 ("ibmvfc: map/request irq and register Sub-CRQ interrupt handler") Signed-off-by: Tyrel Datwyler --- drivers/scsi/ibmvscsi/ibmvfc.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index ba6fcf9cbc57..4ac2c442e1e2 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -5670,8 +5671,8 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost, irq_failed: do { - plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie); - } while (rc == H_BUSY || H_IS_LONG_BUSY(rc)); + rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, scrq->cookie); + } while (rtas_busy_delay(rc)); reg_failed: ibmvfc_free_queue(vhost, scrq); LEAVE; -- 2.27.0
[PATCH v2 0/5] ibmvfc: hard reset fixes
This series contains a minor simplification of ibmvfc_init_sub_crqs() followed by a couple of fixes for sub-CRQ handling which affect hard reset of the client/host adapter CRQ pair. changes in v2: * added Reviewed-by tags for patches 1-3 * Patch 4: use rtas_busy_delay to test rc and delay the correct amount of time * Patch 5: (new) similar fix for LPM case where CRQ pair needs re-enablement Tyrel Datwyler (5): powerpc/pseries: extract host bridge from pci_bus prior to bus removal ibmvfc: simplify handling of sub-CRQ initialization ibmvfc: fix invalid sub-CRQ handles after hard reset ibmvfc: treat H_CLOSED as success during sub-CRQ registration ibmvfc: store return code of H_FREE_SUB_CRQ during cleanup arch/powerpc/platforms/pseries/pci_dlpar.c | 4 ++-- drivers/scsi/ibmvscsi/ibmvfc.c | 49 ++++++++++++++++++++++++-------------------------- 2 files changed, 26 insertions(+), 27 deletions(-) -- 2.27.0
[PATCH v2 1/5] ibmvfc: simplify handling of sub-CRQ initialization
If ibmvfc_init_sub_crqs() fails ibmvfc_probe() simply parrots the registration failure reported elsewhere, and further vhost->scsi_scrqs.scrqs == NULL is indication enough to the driver that it has no sub-CRQs available. The mq_enabled check can also be moved into ibmvfc_init_sub_crqs() such that each caller doesn't have to gate the call with a mq_enabled check. Finally, in the case of sub-CRQ setup failure, do_enquiry can be turned off, putting the driver into single-queue fallback mode. The aforementioned changes also simplify the next patch in the series, which fixes a hard reset issue, by tying the sub-CRQ setup failure and do_enquiry logic into ibmvfc_init_sub_crqs(). Signed-off-by: Tyrel Datwyler Reviewed-by: Brian King --- drivers/scsi/ibmvscsi/ibmvfc.c | 21 ++++++++++----------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 7097028d4cb6..384960036f8b 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -5705,17 +5705,21 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index) LEAVE; } -static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) +static void ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) { int i, j; ENTER; + if (!vhost->mq_enabled) + return; vhost->scsi_scrqs.scrqs = kcalloc(nr_scsi_hw_queues, sizeof(*vhost->scsi_scrqs.scrqs), GFP_KERNEL); - if (!vhost->scsi_scrqs.scrqs) - return -1; + if (!vhost->scsi_scrqs.scrqs) { + vhost->do_enquiry = 0; + return; + } for (i = 0; i < nr_scsi_hw_queues; i++) { if (ibmvfc_register_scsi_channel(vhost, i)) { @@ -5724,13 +5728,12 @@ static int ibmvfc_init_sub_crqs(struct ibmvfc_host *vhost) kfree(vhost->scsi_scrqs.scrqs); vhost->scsi_scrqs.scrqs = NULL; vhost->scsi_scrqs.active_queues = 0; - LEAVE; - return -1; + vhost->do_enquiry = 0; + break; } } LEAVE; - return 0; } static void ibmvfc_release_sub_crqs(struct ibmvfc_host *vhost) @@ -5997,11 +6000,7 @@ static int ibmvfc_probe(struct vio_dev *vdev, const struct vio_device_id *id) goto remove_shost; } - if (vhost->mq_enabled) { - rc = ibmvfc_init_sub_crqs(vhost); - if (rc) - dev_warn(dev, "Failed to allocate Sub-CRQs. rc=%d\n", rc); - } + ibmvfc_init_sub_crqs(vhost); if (shost_to_fc_host(shost)->rqst_q) blk_queue_max_segments(shost_to_fc_host(shost)->rqst_q, 1); -- 2.27.0
[PATCH v2 5/5] ibmvfc: reinitialize sub-CRQs and perform channel enquiry after LPM
A live partition migration (LPM) results in a CRQ disconnect similar to a hard reset. In this LPM case the hypervisor mostly preserves the CRQ transport such that it simply needs to be re-enabled. However, the capabilities may have changed, such as fewer channels or no channels at all. Further, it's possible that there may be sub-CRQ support, but no channel support. The CRQ re-enable path currently doesn't take any of this into consideration. For simplicity, release and reinitialize sub-CRQs during re-enable, and set do_enquiry and using_channels to the appropriate values to trigger channel renegotiation. Signed-off-by: Tyrel Datwyler --- drivers/scsi/ibmvscsi/ibmvfc.c | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 4ac2c442e1e2..9ae6be56e375 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -903,6 +903,9 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost) { int rc = 0; struct vio_dev *vdev = to_vio_dev(vhost->dev); + unsigned long flags; + + ibmvfc_release_sub_crqs(vhost); /* Re-enable the CRQ */ do { @@ -914,6 +917,16 @@ static int ibmvfc_reenable_crq_queue(struct ibmvfc_host *vhost) if (rc) dev_err(vhost->dev, "Error enabling adapter (rc=%d)\n", rc); + spin_lock_irqsave(vhost->host->host_lock, flags); + spin_lock(vhost->crq.q_lock); + vhost->do_enquiry = 1; + vhost->using_channels = 0; + + ibmvfc_init_sub_crqs(vhost); + + spin_unlock(vhost->crq.q_lock); + spin_unlock_irqrestore(vhost->host->host_lock, flags); + return rc; } -- 2.27.0
[PATCH v2 3/5] ibmvfc: treat H_CLOSED as success during sub-CRQ registration
A non-zero return code for H_REG_SUB_CRQ is currently treated as a failure, causing sub-CRQ setup to fail. The case of H_CLOSED should not be treated as a failure. This return code translates to a successful sub-CRQ registration by the hypervisor, and is meant to communicate back that there is currently no partner VIOS CRQ connection established as of yet. This is a common occurrence during a disconnect where the client adapter can possibly come back up prior to the partner adapter. For a non-zero return code from H_REG_SUB_CRQ, treat H_CLOSED as success so that sub-CRQs are successfully set up. Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels") Signed-off-by: Tyrel Datwyler Reviewed-by: Brian King --- drivers/scsi/ibmvscsi/ibmvfc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 3dd20f383453..ba6fcf9cbc57 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -5636,7 +5636,8 @@ static int ibmvfc_register_scsi_channel(struct ibmvfc_host *vhost, rc = h_reg_sub_crq(vdev->unit_address, scrq->msg_token, PAGE_SIZE, &scrq->cookie, &scrq->hw_irq); - if (rc) { + /* H_CLOSED indicates successful register, but no CRQ partner */ + if (rc && rc != H_CLOSED) { dev_warn(dev, "Error registering sub-crq: %d\n", rc); if (rc == H_PARAMETER) dev_warn_once(dev, "Firmware may not support MQ\n"); -- 2.27.0
[PATCH v2 2/5] ibmvfc: fix invalid sub-CRQ handles after hard reset
A hard reset results in a complete transport disconnect such that the CRQ connection with the partner VIOS is broken. This has the side effect of also invalidating the associated sub-CRQs. The current code assumes that the sub-CRQs are preserved, resulting in a protocol violation after trying to reconnect them with the VIOS. This introduces an infinite loop such that the VIOS forces a disconnect after each subsequent attempt to re-register with invalid handles. Avoid the aforementioned issue by releasing the sub-CRQs prior to CRQ disconnect, and driving a reinitialization of the sub-CRQs once a new CRQ is registered with the hypervisor. Fixes: faacf8c5f1d5 ("ibmvfc: add alloc/dealloc routines for SCSI Sub-CRQ Channels") Signed-off-by: Tyrel Datwyler Reviewed-by: Brian King --- drivers/scsi/ibmvscsi/ibmvfc.c | 21 +++++++++------------ 1 file changed, 9 insertions(+), 12 deletions(-) diff --git a/drivers/scsi/ibmvscsi/ibmvfc.c b/drivers/scsi/ibmvscsi/ibmvfc.c index 384960036f8b..3dd20f383453 100644 --- a/drivers/scsi/ibmvscsi/ibmvfc.c +++ b/drivers/scsi/ibmvscsi/ibmvfc.c @@ -158,6 +158,9 @@ static void ibmvfc_npiv_logout(struct ibmvfc_host *); static void ibmvfc_tgt_implicit_logout_and_del(struct ibmvfc_target *); static void ibmvfc_tgt_move_login(struct ibmvfc_target *); +static void ibmvfc_release_sub_crqs(struct ibmvfc_host *); +static void ibmvfc_init_sub_crqs(struct ibmvfc_host *); + static const char *unknown_error = "unknown error"; static long h_reg_sub_crq(unsigned long unit_address, unsigned long ioba, @@ -926,8 +929,8 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) unsigned long flags; struct vio_dev *vdev = to_vio_dev(vhost->dev); struct ibmvfc_queue *crq = &vhost->crq; - struct ibmvfc_queue *scrq; - int i; + + ibmvfc_release_sub_crqs(vhost); /* Close the CRQ */ do { @@ -947,16 +950,6 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) memset(crq->msgs.crq, 0, PAGE_SIZE); crq->cur = 0; - if (vhost->scsi_scrqs.scrqs) { - for (i = 0; i < nr_scsi_hw_queues; i++) { - scrq = &vhost->scsi_scrqs.scrqs[i]; - spin_lock(scrq->q_lock); - memset(scrq->msgs.scrq, 0, PAGE_SIZE); - scrq->cur = 0; - spin_unlock(scrq->q_lock); - } - } - /* And re-open it again */ rc = plpar_hcall_norets(H_REG_CRQ, vdev->unit_address, crq->msg_token, PAGE_SIZE); @@ -966,6 +959,9 @@ static int ibmvfc_reset_crq(struct ibmvfc_host *vhost) dev_warn(vhost->dev, "Partner adapter not ready\n"); else if (rc != 0) dev_warn(vhost->dev, "Couldn't register crq (rc=%d)\n", rc); + + ibmvfc_init_sub_crqs(vhost); + spin_unlock(vhost->crq.q_lock); spin_unlock_irqrestore(vhost->host->host_lock, flags); @@ -5692,6 +5688,7 @@ static void ibmvfc_deregister_scsi_channel(struct ibmvfc_host *vhost, int index) free_irq(scrq->irq, scrq); irq_dispose_mapping(scrq->irq); + scrq->irq = 0; do { rc = plpar_hcall_norets(H_FREE_SUB_CRQ, vdev->unit_address, -- 2.27.0
[PATCH next v3 12/15] printk: introduce a kmsg_dump iterator
Rather than storing the iterator information in the registered kmsg_dumper structure, create a separate iterator structure. The kmsg_dump_iter structure can reside on the stack of the caller, thus allowing lockless use of the kmsg_dump functions. This change also means that the kmsg_dumper dump() callback no longer needs to pass in the kmsg_dumper as an argument. If kmsg_dumpers want to access the kernel logs, they can use the new iterator. Update the kmsg_dumper callback prototype. Update code that accesses the kernel logs using the kmsg_dumper structure to use the new kmsg_dump_iter structure. For kmsg_dumpers, this also means adding a call to kmsg_dump_rewind() to initialize the iterator. All this is in preparation for removal of @logbuf_lock. Signed-off-by: John Ogness --- arch/powerpc/kernel/nvram_64.c | 14 +++--- arch/powerpc/platforms/powernv/opal-kmsg.c | 3 +- arch/powerpc/xmon/xmon.c | 6 +-- arch/um/kernel/kmsg_dump.c | 8 +-- drivers/hv/vmbus_drv.c | 7 +-- drivers/mtd/mtdoops.c | 8 +-- fs/pstore/platform.c | 8 +-- include/linux/kmsg_dump.h | 38 --- kernel/debug/kdb/kdb_main.c| 10 ++-- kernel/printk/printk.c | 57 ++ 10 files changed, 81 insertions(+), 78 deletions(-) diff --git a/arch/powerpc/kernel/nvram_64.c b/arch/powerpc/kernel/nvram_64.c index 532f22637783..5a64b24a91c2 100644 --- a/arch/powerpc/kernel/nvram_64.c +++ b/arch/powerpc/kernel/nvram_64.c @@ -72,8 +72,7 @@ static const char *nvram_os_partitions[] = { NULL }; -static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason); +static void oops_to_nvram(enum kmsg_dump_reason reason); static struct kmsg_dumper nvram_kmsg_dumper = { .dump = oops_to_nvram @@ -642,11 +641,11 @@ void __init nvram_init_oops_partition(int rtas_partition_exists) * that we think will compress sufficiently to fit in the lnx,oops-log * partition. If that's too much, go back and capture uncompressed text. */ -static void oops_to_nvram(struct kmsg_dumper *dumper, - enum kmsg_dump_reason reason) +static void oops_to_nvram(enum kmsg_dump_reason reason) { struct oops_log_info *oops_hdr = (struct oops_log_info *)oops_buf; static unsigned int oops_count = 0; + static struct kmsg_dump_iter iter; static bool panicking = false; static DEFINE_SPINLOCK(lock); unsigned long flags; @@ -681,13 +680,14 @@ static void oops_to_nvram(struct kmsg_dumper *dumper, return; if (big_oops_buf) { - kmsg_dump_get_buffer(dumper, false, + kmsg_dump_rewind(&iter); + kmsg_dump_get_buffer(&iter, false, big_oops_buf, big_oops_buf_sz, &text_len); rc = zip_oops(text_len); } if (rc != 0) { - kmsg_dump_rewind(dumper); - kmsg_dump_get_buffer(dumper, false, + kmsg_dump_rewind(&iter); + kmsg_dump_get_buffer(&iter, false, oops_data, oops_data_sz, &text_len); err_type = ERR_TYPE_KERNEL_PANIC; oops_hdr->version = cpu_to_be16(OOPS_HDR_VERSION); diff --git a/arch/powerpc/platforms/powernv/opal-kmsg.c b/arch/powerpc/platforms/powernv/opal-kmsg.c index 6c3bc4b4da98..a7bd6ac681f4 100644 --- a/arch/powerpc/platforms/powernv/opal-kmsg.c +++ b/arch/powerpc/platforms/powernv/opal-kmsg.c @@ -19,8 +19,7 @@ * may not be completely printed. This function does not actually dump the * message, it just ensures that OPAL completely flushes the console buffer. 
*/ -static void kmsg_dump_opal_console_flush(struct kmsg_dumper *dumper, -enum kmsg_dump_reason reason) +static void kmsg_dump_opal_console_flush(enum kmsg_dump_reason reason) { /* * Outside of a panic context the pollers will continue to run, diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 80ed3e1becf9..5978b90a885f 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3001,7 +3001,7 @@ print_address(unsigned long addr) static void dump_log_buf(void) { - struct kmsg_dumper dumper; + struct kmsg_dump_iter iter; unsigned char buf[128]; size_t len; @@ -3013,9 +3013,9 @@ dump_log_buf(void) catch_memory_errors = 1; sync(); - kmsg_dump_rewind_nolock(&dumper); + kmsg_dump_rewind_nolock(&iter); xmon_start_pagination(); - while (kmsg_dump_get_line_nolock(&dumper, false, buf, sizeof(buf), &len)) { + while (kmsg_dump_get_line_nolock(&iter, false, buf, sizeof(buf), &len)) { buf[len] = '\0'; printf("%s", buf); } diff --git a/arch/um/kernel/km
[PATCH next v3 00/15] printk: remove logbuf_lock
Hello, Here is v3 of a series to remove @logbuf_lock, exposing the ringbuffer locklessly to both readers and writers. v2 is here [0]. Since @logbuf_lock was protecting much more than just the ringbuffer, this series clarifies and cleans up the various protections using comments, lockless accessors, atomic types, and a new finer-grained @syslog_lock. Removing @logbuf_lock required changing the semantics of the kmsg_dumper callback in order to work locklessly. Since this involved touching all the kmsg_dump users, we also decided [1] to use this opportunity to clean up and clarify the kmsg_dump semantics in general. This series is based on next-20210225. Changes since v2: - use get_maintainer.pl to get the full list of developers that should at least see the changes in their respective areas - do not disable interrupts in arch/um kmsg_dumper (because there is no need to) - protect the mtd/mtdoops kmsg_dumper buffer against concurrent dumps - update kerneldoc for kmsg_dump_get_line() (@len_out) - remove kmsg_dump's @active flag - change kmsg_dumper callback to: void (*dump)(enum kmsg_dump_reason reason); - rename kmsg_dumper_iter to kmsg_dump_iter - update kmsg_dumpers to use their own kmsg_dump_iter (and initialize it with kmsg_dump_rewind() if necessary) John Ogness [0] https://lkml.kernel.org/r/20210218081817.28849-1-john.ogn...@linutronix.de [1] https://lkml.kernel.org/r/YDeZAA08NKCHa4s%2F@alley John Ogness (15): um: synchronize kmsg_dumper mtd: mtdoops: synchronize kmsg_dumper printk: limit second loop of syslog_print_all printk: kmsg_dump: remove unused fields printk: refactor kmsg_dump_get_buffer() printk: consolidate kmsg_dump_get_buffer/syslog_print_all code printk: introduce CONSOLE_LOG_MAX for improved multi-line support printk: use seqcount_latch for clear_seq printk: use atomic64_t for devkmsg_user.seq printk: add syslog_lock printk: kmsg_dumper: remove @active field printk: introduce a kmsg_dump iterator printk: remove logbuf_lock printk: kmsg_dump: remove _nolock() variants printk: console: remove unnecessary safe buffer usage arch/powerpc/kernel/nvram_64.c | 14 +++--- arch/powerpc/platforms/powernv/opal-kmsg.c | 3 +- arch/powerpc/xmon/xmon.c | 6 +-- arch/um/kernel/kmsg_dump.c | 15 +- drivers/hv/vmbus_drv.c | 7 +- drivers/mtd/mtdoops.c | 20 +- fs/pstore/platform.c | 8 +- include/linux/kmsg_dump.h | 49 +-- kernel/debug/kdb/kdb_main.c | 10 +- kernel/printk/internal.h | 4 +- kernel/printk/printk.c | 456 ++--- kernel/printk/printk_safe.c | 27 +- 12 files changed, 309 insertions(+), 310 deletions(-) -- 2.20.1
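For reference, under the reworked semantics described above, a minimal dumper might look like the following sketch (the dumper itself is hypothetical; the kmsg_dump_* calls match the prototypes introduced by this series):

#include <linux/kmsg_dump.h>

static char example_buf[4096];

/* The dump() callback no longer receives the kmsg_dumper; a reader
 * brings its own iterator, which may live on the stack. */
static void example_dump(enum kmsg_dump_reason reason)
{
	struct kmsg_dump_iter iter;
	size_t len;

	kmsg_dump_rewind(&iter);	/* initialize the iterator */
	kmsg_dump_get_buffer(&iter, true, example_buf,
			     sizeof(example_buf), &len);
	/* ... persist example_buf[0..len) somewhere ... */
}

static struct kmsg_dumper example_dumper = {
	.dump = example_dump,
	.max_reason = KMSG_DUMP_PANIC,
};

/* registered at init time with kmsg_dump_register(&example_dumper) */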
[PATCH next v3 14/15] printk: kmsg_dump: remove _nolock() variants
kmsg_dump_rewind() and kmsg_dump_get_line() are lockless, so there is no need for _nolock() variants. Remove these functions and switch all callers of the _nolock() variants. The functions without _nolock() were chosen because they are already exported to kernel modules. Signed-off-by: John Ogness Reviewed-by: Petr Mladek --- arch/powerpc/xmon/xmon.c| 4 +-- include/linux/kmsg_dump.h | 16 -- kernel/debug/kdb/kdb_main.c | 8 ++--- kernel/printk/printk.c | 60 + 4 files changed, 14 insertions(+), 74 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 5978b90a885f..bf7d69625a2e 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3013,9 +3013,9 @@ dump_log_buf(void) catch_memory_errors = 1; sync(); - kmsg_dump_rewind_nolock(&iter); + kmsg_dump_rewind(&iter); xmon_start_pagination(); - while (kmsg_dump_get_line_nolock(&iter, false, buf, sizeof(buf), &len)) { + while (kmsg_dump_get_line(&iter, false, buf, sizeof(buf), &len)) { buf[len] = '\0'; printf("%s", buf); } diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 5d3bf20f9f0a..532673b6570a 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -57,17 +57,12 @@ struct kmsg_dumper { #ifdef CONFIG_PRINTK void kmsg_dump(enum kmsg_dump_reason reason); -bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter, bool syslog, - char *line, size_t size, size_t *len); - bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, char *line, size_t size, size_t *len); bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog, char *buf, size_t size, size_t *len_out); -void kmsg_dump_rewind_nolock(struct kmsg_dump_iter *iter); - void kmsg_dump_rewind(struct kmsg_dump_iter *iter); int kmsg_dump_register(struct kmsg_dumper *dumper); @@ -80,13 +75,6 @@ static inline void kmsg_dump(enum kmsg_dump_reason reason) { } -static inline bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter, -bool syslog, const char *line, -size_t size, size_t *len) -{ - return false; -} - static inline bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, const char *line, size_t size, size_t *len) { @@ -99,10 +87,6 @@ static inline bool kmsg_dump_get_buffer(struct kmsg_dump_iter *iter, bool syslog return false; } -static inline void kmsg_dump_rewind_nolock(struct kmsg_dump_iter *iter) -{ -} - static inline void kmsg_dump_rewind(struct kmsg_dump_iter *iter) { } diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 8544d7a55a57..67d9f2403b52 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2126,8 +2126,8 @@ static int kdb_dmesg(int argc, const char **argv) kdb_set(2, setargs); } - kmsg_dump_rewind_nolock(&iter); - while (kmsg_dump_get_line_nolock(&iter, 1, NULL, 0, NULL)) + kmsg_dump_rewind(&iter); + while (kmsg_dump_get_line(&iter, 1, NULL, 0, NULL)) n++; if (lines < 0) { @@ -2159,8 +2159,8 @@ static int kdb_dmesg(int argc, const char **argv) if (skip >= n || skip < 0) return 0; - kmsg_dump_rewind_nolock(&iter); - while (kmsg_dump_get_line_nolock(&iter, 1, buf, sizeof(buf), &len)) { + kmsg_dump_rewind(&iter); + while (kmsg_dump_get_line(&iter, 1, buf, sizeof(buf), &len)) { if (skip) { skip--; continue; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index 01385ea92e7c..15a9bc409e0a 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3373,7 +3373,7 @@ void kmsg_dump(enum kmsg_dump_reason reason) } /** - * kmsg_dump_get_line_nolock - retrieve one kmsg log line (unlocked version) + * 
kmsg_dump_get_line - retrieve one kmsg log line * @iter: kmsg dump iterator * @syslog: include the "<4>" prefixes * @line: buffer to copy the line to @@ -3388,18 +3388,18 @@ void kmsg_dump(enum kmsg_dump_reason reason) * * A return value of FALSE indicates that there are no more records to * read. - * - * The function is similar to kmsg_dump_get_line(), but grabs no locks. */ -bool kmsg_dump_get_line_nolock(struct kmsg_dump_iter *iter, bool syslog, - char *line, size_t size, size_t *len) +bool kmsg_dump_get_line(struct kmsg_dump_iter *iter, bool syslog, + char *line, size_t size, size_t *len) { struct printk_info info; unsigned int line_count; struct printk_record r; + unsigned long flags; size_t l = 0; bool ret = false; +
[PATCH next v3 11/15] printk: kmsg_dumper: remove @active field
None of the 6 kmsg_dumpers benefits from the @active flag: (these provide their own synchronization) - arch/powerpc/kernel/nvram_64.c - arch/um/kernel/kmsg_dump.c - drivers/mtd/mtdoops.c - fs/pstore/platform.c (these only dump on KMSG_DUMP_PANIC, which does not require synchronization) - arch/powerpc/platforms/powernv/opal-kmsg.c - drivers/hv/vmbus_drv.c The other 2 kmsg_dump users also do not rely on @active: (they hard-code @active to always be true) - arch/powerpc/xmon/xmon.c - kernel/debug/kdb/kdb_main.c Therefore, @active can be removed. Signed-off-by: John Ogness --- arch/powerpc/xmon/xmon.c | 2 +- include/linux/kmsg_dump.h | 2 -- kernel/debug/kdb/kdb_main.c | 2 +- kernel/printk/printk.c | 10 +- 4 files changed, 3 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/xmon/xmon.c b/arch/powerpc/xmon/xmon.c index 3fe37495f63d..80ed3e1becf9 100644 --- a/arch/powerpc/xmon/xmon.c +++ b/arch/powerpc/xmon/xmon.c @@ -3001,7 +3001,7 @@ print_address(unsigned long addr) static void dump_log_buf(void) { - struct kmsg_dumper dumper = { .active = 1 }; + struct kmsg_dumper dumper; unsigned char buf[128]; size_t len; diff --git a/include/linux/kmsg_dump.h b/include/linux/kmsg_dump.h index 070c994ff19f..84eaa2090efa 100644 --- a/include/linux/kmsg_dump.h +++ b/include/linux/kmsg_dump.h @@ -36,7 +36,6 @@ enum kmsg_dump_reason { * through the record iterator * @max_reason:filter for highest reason number that should be dumped * @registered:Flag that specifies if this is already registered - * @active:Flag that specifies if this is currently dumping * @cur_seq: Points to the oldest message to dump * @next_seq: Points after the newest message to dump */ @@ -44,7 +43,6 @@ struct kmsg_dumper { struct list_head list; void (*dump)(struct kmsg_dumper *dumper, enum kmsg_dump_reason reason); enum kmsg_dump_reason max_reason; - bool active; bool registered; /* private state of the kmsg iterator */ diff --git a/kernel/debug/kdb/kdb_main.c b/kernel/debug/kdb/kdb_main.c index 930ac1b25ec7..315169d5e119 100644 --- a/kernel/debug/kdb/kdb_main.c +++ b/kernel/debug/kdb/kdb_main.c @@ -2101,7 +2101,7 @@ static int kdb_dmesg(int argc, const char **argv) int adjust = 0; int n = 0; int skip = 0; - struct kmsg_dumper dumper = { .active = 1 }; + struct kmsg_dumper dumper; size_t len; char buf[201]; diff --git a/kernel/printk/printk.c b/kernel/printk/printk.c index c2ed7db8930b..45cb3e9c62c5 100644 --- a/kernel/printk/printk.c +++ b/kernel/printk/printk.c @@ -3408,8 +3408,6 @@ void kmsg_dump(enum kmsg_dump_reason reason) continue; /* initialize iterator with data about the stored records */ - dumper->active = true; - logbuf_lock_irqsave(flags); dumper->cur_seq = latched_seq_read_nolock(&clear_seq); dumper->next_seq = prb_next_seq(prb); @@ -3417,9 +3415,6 @@ void kmsg_dump(enum kmsg_dump_reason reason) /* invoke dumper which will iterate over records */ dumper->dump(dumper, reason); - - /* reset iterator */ - dumper->active = false; } rcu_read_unlock(); } @@ -3454,9 +3449,6 @@ bool kmsg_dump_get_line_nolock(struct kmsg_dumper *dumper, bool syslog, prb_rec_init_rd(&r, &info, line, size); - if (!dumper->active) - goto out; - /* Read text or count text lines? */ if (line) { if (!prb_read_valid(prb, dumper->cur_seq, &r)) @@ -3542,7 +3534,7 @@ bool kmsg_dump_get_buffer(struct kmsg_dumper *dumper, bool syslog, bool ret = false; bool time = printk_time; - if (!dumper->active || !buf || !size) + if (!buf || !size) goto out; logbuf_lock_irqsave(flags); -- 2.20.1
Re: [PATCH] docs: powerpc: Fix tables in syscall64-abi.rst
Andrew Donnellan writes: > Commit 209b44c804c ("docs: powerpc: syscall64-abi.rst: fix a malformed > table") attempted to fix the formatting of tables in syscall64-abi.rst, but > inadvertently changed some register names. > > Redo the tables with the correct register names, and while we're here, > clean things up to separate the registers into different rows and add > headings. > > Fixes: 209b44c804c ("docs: powerpc: syscall64-abi.rst: fix a malformed table") > Signed-off-by: Andrew Donnellan > --- > Documentation/powerpc/syscall64-abi.rst | 51 - > 1 file changed, 32 insertions(+), 19 deletions(-) Applied, thanks. jon
[PATCH v1 12/15] powerpc/uaccess: Refactor get/put_user() and __get/put_user()
Make get_user() do the access_ok() check then call __get_user(). Make put_user() do the access_ok() check then call __put_user(). Then embed __get_user_size() and __put_user_size() in __get_user() and __put_user(). Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 66 +++--- 1 file changed, 23 insertions(+), 43 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 616a3a7928c2..671c083f2f2f 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -43,21 +43,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) * exception handling means that it's no longer "just"...) * */ -#define __put_user_size(x, ptr, size, retval) \ -do { \ - __label__ __pu_failed; \ - \ - retval = 0; \ - allow_write_to_user(ptr, size); \ - __put_user_size_goto(x, ptr, size, __pu_failed);\ - prevent_write_to_user(ptr, size); \ - break; \ - \ -__pu_failed: \ - retval = -EFAULT; \ - prevent_write_to_user(ptr, size); \ -} while (0) - #define __put_user(x, ptr) \ ({ \ long __pu_err; \ @@ -66,23 +51,29 @@ __pu_failed: \ __typeof__(sizeof(*(ptr))) __pu_size = sizeof(*(ptr)); \ \ might_fault(); \ - __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ + do {\ + __label__ __pu_failed; \ + \ + allow_write_to_user(__pu_addr, __pu_size); \ + __put_user_size_goto(__pu_val, __pu_addr, __pu_size, __pu_failed); \ + prevent_write_to_user(__pu_addr, __pu_size);\ + __pu_err = 0; \ + break; \ + \ +__pu_failed: \ + prevent_write_to_user(__pu_addr, __pu_size);\ + __pu_err = -EFAULT; \ + } while (0);\ \ __pu_err; \ }) #define put_user(x, ptr) \ ({ \ - long __pu_err = -EFAULT;\ - __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __typeof__(*(ptr)) __pu_val = (__typeof__(*(ptr)))(x); \ - __typeof__(sizeof(*(ptr))) __pu_size = sizeof(*(ptr)); \ + __typeof__(*(ptr)) __user *_pu_addr = (ptr);\ \ - might_fault(); \ - if (access_ok(__pu_addr, __pu_size))\ - __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ - \ - __pu_err; \ + access_ok(_pu_addr, sizeof(*(ptr))) ? \ + __put_user(x, _pu_addr) : -EFAULT;\ }) /* @@ -192,13 +183,6 @@ do { \ } \ } while (0) -#define __get_user_size(x, ptr, size, retval) \ -do { \ - allow_read_from_user(ptr, size);\ - __get_user_size_allowed(x, ptr, size, retval); \ - prevent_read_from_user(ptr, size); \ -} while (0) - /* * This is a type: either unsigned long, if the argument fits into * that type, or otherwise unsigned long long. @@ -214,7 +198,9 @@ do {
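As a usage-level sketch (hypothetical caller, not from the patch): the refactor keeps the external contract intact, in that get_user() still performs the access_ok() check and returns 0 on success or -EFAULT on failure:

#include <linux/uaccess.h>

static int read_user_int(int __user *uptr, int *out)
{
	int val;

	if (get_user(val, uptr))	/* access_ok() check + __get_user() */
		return -EFAULT;

	*out = val;
	return 0;
}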
[PATCH v1 14/15] powerpc/uaccess: Also perform 64 bits copies in unsafe_copy_to_user() on ppc32
ppc32 has an efficient 64-bit __put_user(), so also use it in order to unroll loops more. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index b3bd1fb42242..e831653db51a 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -371,9 +371,9 @@ do { \ size_t _len = (l); \ int _i; \ \ - for (_i = 0; _i < (_len & ~(sizeof(long) - 1)); _i += sizeof(long)) \ - unsafe_put_user(*(long*)(_src + _i), (long __user *)(_dst + _i), e); \ - if (IS_ENABLED(CONFIG_PPC64) && (_len & 4)) { \ + for (_i = 0; _i < (_len & ~(sizeof(u64) - 1)); _i += sizeof(u64)) \ + unsafe_put_user(*(u64 *)(_src + _i), (u64 __user *)(_dst + _i), e); \ + if (_len & 4) { \ unsafe_put_user(*(u32*)(_src + _i), (u32 __user *)(_dst + _i), e); \ _i += 4;\ } \ -- 2.25.0
[PATCH v1 15/15] powerpc/uaccess: Move copy_mc_xxx() functions down
The copy_mc_xxx() functions sit in the middle of the raw_copy functions. For clarity, move them out of the raw_copy functions block. They use access_ok(), so they need to come after the general functions in order to eventually allow the inclusion of asm-generic/uaccess.h at some point in the future. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 52 +++++++++++++++--------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index e831653db51a..f1f9237ed857 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -220,32 +220,6 @@ do { \ extern unsigned long __copy_tofrom_user(void __user *to, const void __user *from, unsigned long size); -#ifdef CONFIG_ARCH_HAS_COPY_MC -unsigned long __must_check -copy_mc_generic(void *to, const void *from, unsigned long size); - -static inline unsigned long __must_check -copy_mc_to_kernel(void *to, const void *from, unsigned long size) -{ - return copy_mc_generic(to, from, size); -} -#define copy_mc_to_kernel copy_mc_to_kernel - -static inline unsigned long __must_check -copy_mc_to_user(void __user *to, const void *from, unsigned long n) -{ - if (likely(check_copy_size(from, n, true))) { - if (access_ok(to, n)) { - allow_write_to_user(to, n); - n = copy_mc_generic((void *)to, from, n); - prevent_write_to_user(to, n); - } - } - - return n; -} -#endif - #ifdef __powerpc64__ static inline unsigned long raw_copy_in_user(void __user *to, const void __user *from, unsigned long n) @@ -302,6 +276,32 @@ static inline unsigned long clear_user(void __user *addr, unsigned long size) extern long strncpy_from_user(char *dst, const char __user *src, long count); extern __must_check long strnlen_user(const char __user *str, long n); +#ifdef CONFIG_ARCH_HAS_COPY_MC +unsigned long __must_check +copy_mc_generic(void *to, const void *from, unsigned long size); + +static inline unsigned long __must_check +copy_mc_to_kernel(void *to, const void *from, unsigned long size) +{ + return copy_mc_generic(to, from, size); +} +#define copy_mc_to_kernel copy_mc_to_kernel + +static inline unsigned long __must_check +copy_mc_to_user(void __user *to, const void *from, unsigned long n) +{ + if (likely(check_copy_size(from, n, true))) { + if (access_ok(to, n)) { + allow_write_to_user(to, n); + n = copy_mc_generic((void *)to, from, n); + prevent_write_to_user(to, n); + } + } + + return n; +} +#endif + extern long __copy_from_user_flushcache(void *dst, const void __user *src, unsigned size); extern void memcpy_page_flushcache(char *to, struct page *page, size_t offset, -- 2.25.0
[PATCH v1 13/15] powerpc/uaccess: Swap clear_user() and __clear_user()
It is clear_user() which is expected to call __clear_user(), not the reverse. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 17 - 1 file changed, 8 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 671c083f2f2f..b3bd1fb42242 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -283,21 +283,20 @@ raw_copy_to_user(void __user *to, const void *from, unsigned long n) unsigned long __arch_clear_user(void __user *addr, unsigned long size); -static inline unsigned long clear_user(void __user *addr, unsigned long size) +static inline unsigned long __clear_user(void __user *addr, unsigned long size) { - unsigned long ret = size; + unsigned long ret; + might_fault(); - if (likely(access_ok(addr, size))) { - allow_write_to_user(addr, size); - ret = __arch_clear_user(addr, size); - prevent_write_to_user(addr, size); - } + allow_write_to_user(addr, size); + ret = __arch_clear_user(addr, size); + prevent_write_to_user(addr, size); return ret; } -static inline unsigned long __clear_user(void __user *addr, unsigned long size) +static inline unsigned long clear_user(void __user *addr, unsigned long size) { - return clear_user(addr, size); + return likely(access_ok(addr, size)) ? __clear_user(addr, size) : size; } extern long strncpy_from_user(char *dst, const char __user *src, long count); -- 2.25.0
[PATCH v1 11/15] powerpc/uaccess: Rename __get/put_user_check/nocheck
__get_user_check() becomes get_user() __put_user_check() becomes put_user() __get_user_nocheck() becomes __get_user() __put_user_nocheck() becomes __put_user() Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 30 ++ 1 file changed, 10 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 678651a615c3..616a3a7928c2 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -43,16 +43,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) * exception handling means that it's no longer "just"...) * */ -#define get_user(x, ptr) \ - __get_user_check((x), (ptr), sizeof(*(ptr))) -#define put_user(x, ptr) \ - __put_user_check((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - -#define __get_user(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr))) -#define __put_user(x, ptr) \ - __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - #define __put_user_size(x, ptr, size, retval) \ do { \ __label__ __pu_failed; \ @@ -68,12 +58,12 @@ __pu_failed: \ prevent_write_to_user(ptr, size); \ } while (0) -#define __put_user_nocheck(x, ptr, size) \ +#define __put_user(x, ptr) \ ({ \ long __pu_err; \ __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __typeof__(*(ptr)) __pu_val = (x); \ - __typeof__(size) __pu_size = (size);\ + __typeof__(*(ptr)) __pu_val = (__typeof__(*(ptr)))(x); \ + __typeof__(sizeof(*(ptr))) __pu_size = sizeof(*(ptr)); \ \ might_fault(); \ __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ @@ -81,12 +71,12 @@ __pu_failed: \ __pu_err; \ }) -#define __put_user_check(x, ptr, size) \ +#define put_user(x, ptr) \ ({ \ long __pu_err = -EFAULT;\ __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __typeof__(*(ptr)) __pu_val = (x); \ - __typeof__(size) __pu_size = (size);\ + __typeof__(*(ptr)) __pu_val = (__typeof__(*(ptr)))(x); \ + __typeof__(sizeof(*(ptr))) __pu_size = sizeof(*(ptr)); \ \ might_fault(); \ if (access_ok(__pu_addr, __pu_size))\ @@ -216,12 +206,12 @@ do { \ #define __long_type(x) \ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) -#define __get_user_nocheck(x, ptr, size) \ +#define __get_user(x, ptr) \ ({ \ long __gu_err; \ __long_type(*(ptr)) __gu_val; \ __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ - __typeof__(size) __gu_size = (size);\ + __typeof__(sizeof(*(ptr))) __gu_size = sizeof(*(ptr)); \ \ might_fault(); \ __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ @@ -230,12 +220,12 @@ do { \ __gu_err; \ }) -#define __get_user_check(x, ptr, size) \ +#define get_user(x, ptr) \ ({ \ long __gu_err = -EFAULT;\ __long_type(*(ptr)) __gu_val = 0; \ __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ - __typeof__(size) __gu_size = (size);\ + __typeof__(sizeof(*(ptr))) __gu_size = sizeof(*(ptr)); \
[PATCH v1 09/15] powerpc/uaccess: Remove calls to __get_user_bad() and __put_user_bad()
__get_user_bad() and __put_user_bad() are functions that are declared but not defined, in order to make the build fail if they are ever called. Nowadays, we have BUILD_BUG() and BUILD_BUG_ON() for that. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 11 +++-------- 1 file changed, 3 insertions(+), 8 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index a9f2639ca3a8..a8c683695ec7 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -53,8 +53,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -extern long __put_user_bad(void); - #define __put_user_size(x, ptr, size, retval) \ do { \ __label__ __pu_failed; \ @@ -136,12 +134,10 @@ do { \ case 2: __put_user_asm_goto(x, __pus_addr, label, "sth"); break; \ case 4: __put_user_asm_goto(x, __pus_addr, label, "stw"); break; \ case 8: __put_user_asm2_goto(x, __pus_addr, label); break; \ - default: __put_user_bad(); \ + default: BUILD_BUG(); \ } \ } while (0) -extern long __get_user_bad(void); - /* * This does an atomic 128 byte aligned load from userspace. * Upto caller to do enable_kernel_vmx() before calling! */ @@ -196,14 +192,13 @@ extern long __get_user_bad(void); #define __get_user_size_allowed(x, ptr, size, retval) \ do { \ retval = 0; \ - if (size > sizeof(x)) \ - (x) = __get_user_bad(); \ + BUILD_BUG_ON(size > sizeof(x)); \ switch (size) { \ case 1: __get_user_asm(x, (u8 __user *)ptr, retval, "lbz"); break; \ case 2: __get_user_asm(x, (u16 __user *)ptr, retval, "lhz"); break; \ case 4: __get_user_asm(x, (u32 __user *)ptr, retval, "lwz"); break; \ case 8: __get_user_asm2(x, (u64 __user *)ptr, retval); break; \ - default: (x) = __get_user_bad();\ + default: BUILD_BUG(); \ } \ } while (0) -- 2.25.0
[PATCH v1 10/15] powerpc/uaccess: Split out __get_user_nocheck()
One part of __get_user_nocheck() is used for __get_user(), the other part for unsafe_get_user(). Split it up: move the part dedicated to unsafe_get_user() directly into unsafe_get_user() itself. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 22 ++++++++++++---------- 1 file changed, 12 insertions(+), 10 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index a8c683695ec7..678651a615c3 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -49,7 +49,7 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) __put_user_check((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) #define __get_user(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr)), true) + __get_user_nocheck((x), (ptr), sizeof(*(ptr))) #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) @@ -216,19 +216,15 @@ do { \ #define __long_type(x) \ __typeof__(__builtin_choose_expr(sizeof(x) > sizeof(0UL), 0ULL, 0UL)) -#define __get_user_nocheck(x, ptr, size, do_allow) \ +#define __get_user_nocheck(x, ptr, size) \ ({ \ long __gu_err; \ __long_type(*(ptr)) __gu_val; \ __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __typeof__(size) __gu_size = (size);\ \ - if (do_allow) { \ - might_fault(); \ - __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ - } else { \ - __get_user_size_allowed(__gu_val, __gu_addr, __gu_size, __gu_err); \ - } \ + might_fault(); \ + __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ (x) = (__typeof__(*(ptr)))__gu_val; \ \ __gu_err; \ @@ -386,8 +382,14 @@ user_write_access_begin(const void __user *ptr, size_t len) #define user_write_access_end prevent_current_write_to_user #define unsafe_get_user(x, p, e) do { \ - if (unlikely(__get_user_nocheck((x), (p), sizeof(*(p)), false)))\ - goto e; \ + long __gu_err; \ + __long_type(*(p)) __gu_val; \ + __typeof__(*(p)) __user *__gu_addr = (p); \ + \ + __get_user_size_allowed(__gu_val, __gu_addr, sizeof(*(p)), __gu_err); \ + if (__gu_err) \ + goto e; \ + (x) = (__typeof__(*(p)))__gu_val; \ } while (0) #define unsafe_put_user(x, p, e) \ -- 2.25.0
[PATCH v1 08/15] powerpc/uaccess: Remove __chk_user_ptr() in __get/put_user
Commit d02f6b7dab82 ("powerpc/uaccess: Evaluate macro arguments once, before user access is allowed") changed the __chk_user_ptr() argument from the passed ptr pointer to the locally declared __gu_addr. But __gu_addr is locally declared as __user, so the check is pointless. During a kernel build __chk_user_ptr() expands to nothing and is only evaluated during sparse checks, so it would have been harmless to leave the original pointer check there. Nevertheless, this check is indeed redundant with the assignment above, which casts the ptr pointer to the local __user __gu_addr. In case of a mismatch, sparse will detect it there, so __chk_user_ptr() is not needed anywhere else than in access_ok(). Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index a6d3563cf3ee..a9f2639ca3a8 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -78,7 +78,6 @@ __pu_failed: \ __typeof__(size) __pu_size = (size);\ \ might_fault(); \ - __chk_user_ptr(__pu_addr); \ __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ \ __pu_err; \ @@ -197,7 +196,6 @@ extern long __get_user_bad(void); #define __get_user_size_allowed(x, ptr, size, retval) \ do { \ retval = 0; \ - __chk_user_ptr(ptr);\ if (size > sizeof(x)) \ (x) = __get_user_bad(); \ switch (size) { \ @@ -230,7 +228,6 @@ do { \ __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ __typeof__(size) __gu_size = (size);\ \ - __chk_user_ptr(__gu_addr); \ if (do_allow) { \ might_fault(); \ __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ -- 2.25.0
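As an illustration (hypothetical example, not from the patch) of why the local assignment already covers the sparse check: passing a plain kernel pointer where a __user pointer is expected is flagged at the __gu_addr/__pu_addr initialization when building with sparse (make C=1):

#include <linux/uaccess.h>

static void sparse_demo(void)
{
	int kernel_val = 0;
	int *kptr = &kernel_val;	/* plain kernel pointer, no __user */

	/* sparse warns here: incorrect type in initializer (different
	 * address spaces), caught by the __pu_addr assignment inside
	 * put_user(), with no need for an extra __chk_user_ptr(). */
	put_user(1, kptr);
}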
[PATCH v1 07/15] powerpc/uaccess: Remove __unsafe_put_user_goto()
__unsafe_put_user_goto() is just an intermediate layer to __put_user_size_goto() without added value other than doing the __user pointer type checking. Do the __user pointer type checking in __put_user_size_goto() and remove __unsafe_put_user_goto(). Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 20 +++- 1 file changed, 7 insertions(+), 13 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index c4bbc64758a0..a6d3563cf3ee 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -130,23 +130,17 @@ __pu_failed: \ #define __put_user_size_goto(x, ptr, size, label) \ do { \ + __typeof__(*(ptr)) __user *__pus_addr = (ptr); \ + \ switch (size) { \ - case 1: __put_user_asm_goto(x, ptr, label, "stb"); break; \ - case 2: __put_user_asm_goto(x, ptr, label, "sth"); break; \ - case 4: __put_user_asm_goto(x, ptr, label, "stw"); break; \ - case 8: __put_user_asm2_goto(x, ptr, label); break; \ + case 1: __put_user_asm_goto(x, __pus_addr, label, "stb"); break; \ + case 2: __put_user_asm_goto(x, __pus_addr, label, "sth"); break; \ + case 4: __put_user_asm_goto(x, __pus_addr, label, "stw"); break; \ + case 8: __put_user_asm2_goto(x, __pus_addr, label); break; \ default: __put_user_bad(); \ } \ } while (0) -#define __unsafe_put_user_goto(x, ptr, size, label)\ -do { \ - __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __chk_user_ptr(ptr);\ - __put_user_size_goto((x), __pu_addr, (size), label);\ -} while (0) - - extern long __get_user_bad(void); /* @@ -405,7 +399,7 @@ user_write_access_begin(const void __user *ptr, size_t len) } while (0) #define unsafe_put_user(x, p, e) \ - __unsafe_put_user_goto((__typeof__(*(p)))(x), (p), sizeof(*(p)), e) + __put_user_size_goto((__typeof__(*(p)))(x), (p), sizeof(*(p)), e) #define unsafe_copy_to_user(d, s, l, e) \ do { \ -- 2.25.0
[PATCH v1 05/15] powerpc/align: Don't use __get_user_instr() on kernel addresses
In the old days, when we didn't have kernel userspace access protection and had set_fs(), it was wise to use __get_user() and friends to read kernel memory. Nowadays, get_user() grants userspace access and is exclusively for userspace accesses. In the alignment exception handler, use probe_kernel_read_inst() instead of __get_user_instr() for reading instructions in the kernel. This will allow removing the is_kernel_addr() check in __get/put_user() in a following patch. Signed-off-by: Christophe Leroy --- arch/powerpc/kernel/align.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index 83b199026a1e..55e262627b53 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -305,7 +305,11 @@ int fix_alignment(struct pt_regs *regs) */ CHECK_FULL_REGS(regs); - if (unlikely(__get_user_instr(instr, (void __user *)regs->nip))) + if (is_kernel_addr(regs->nip)) + r = probe_kernel_read_inst(&instr, (void *)regs->nip); + else + r = __get_user_instr(instr, (void __user *)regs->nip); + if (unlikely(r)) return -EFAULT; if ((regs->msr & MSR_LE) != (MSR_KERNEL & MSR_LE)) { /* We don't handle PPC little-endian any more... */ -- 2.25.0
[PATCH v1 06/15] powerpc/uaccess: Call might_fault() unconditionally
Commit 6bfd93c32a50 ("powerpc: Fix incorrect might_sleep in __get_user/__put_user on kernel addresses") added a check to not call might_sleep() on kernel addresses. This was to enable the use of __get_user() in the alignment exception handler for any address. Then commit 95156f0051cb ("lockdep, mm: fix might_fault() annotation") added a check of the address space in might_fault(), based on set_fs() logic. But this didn't solve the powerpc alignment exception case as it didn't call set_fs(KERNEL_DS). Nowadays, set_fs() is gone, the previous patch fixed the alignment exception handler, and __get_user/__put_user are no longer supposed to be used to read kernel memory. Therefore the is_kernel_addr() check has become useless and can be removed. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 9 - 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index eaa828a6a419..c4bbc64758a0 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -77,8 +77,7 @@ __pu_failed: \ __typeof__(*(ptr)) __pu_val = (x); \ __typeof__(size) __pu_size = (size);\ \ - if (!is_kernel_addr((unsigned long)__pu_addr)) \ - might_fault(); \ + might_fault(); \ __chk_user_ptr(__pu_addr); \ __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ \ @@ -238,12 +237,12 @@ do { \ __typeof__(size) __gu_size = (size);\ \ __chk_user_ptr(__gu_addr); \ - if (do_allow && !is_kernel_addr((unsigned long)__gu_addr)) \ + if (do_allow) { \ might_fault(); \ - if (do_allow) \ __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ - else \ + } else { \ __get_user_size_allowed(__gu_val, __gu_addr, __gu_size, __gu_err); \ + } \ (x) = (__typeof__(*(ptr)))__gu_val; \ \ __gu_err; \ -- 2.25.0
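For reference, a rough paraphrase of what might_fault() boils down to in mm/memory.c once set_fs() is gone (not a verbatim copy); it already bails out in contexts where sleeping is forbidden, which is what makes the unconditional call safe:

void might_fault(void)
{
	if (pagefault_disabled())	/* atomic user access, may not sleep anyway */
		return;
	might_sleep();			/* resolving a fault may sleep */
}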
[PATCH v1 02/15] powerpc/uaccess: Define ___get_user_instr() for ppc32
Define simple ___get_user_instr() for ppc32 instead of defining ppc32 versions of the three get_user_instr() helpers. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 16 1 file changed, 4 insertions(+), 12 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 8cbf3e3874f1..a08c482b1315 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -81,6 +81,10 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) } \ __gui_ret; \ }) +#else /* !CONFIG_PPC64 */ +#define ___get_user_instr(gu_op, dest, ptr)\ + gu_op((dest).val, (u32 __user *)(ptr)) +#endif /* CONFIG_PPC64 */ #define get_user_instr(x, ptr) \ ___get_user_instr(get_user, x, ptr) @@ -91,18 +95,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) #define __get_user_instr_inatomic(x, ptr) \ ___get_user_instr(__get_user_inatomic, x, ptr) -#else /* !CONFIG_PPC64 */ -#define get_user_instr(x, ptr) \ - get_user((x).val, (u32 __user *)(ptr)) - -#define __get_user_instr(x, ptr) \ - __get_user_nocheck((x).val, (u32 __user *)(ptr), sizeof(u32), true) - -#define __get_user_instr_inatomic(x, ptr) \ - __get_user_nosleep((x).val, (u32 __user *)(ptr), sizeof(u32)) - -#endif /* CONFIG_PPC64 */ - extern long __put_user_bad(void); #define __put_user_size(x, ptr, size, retval) \ -- 2.25.0
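A short usage sketch (illustrative only): callers are unchanged by this patch, and on ppc32 the helper now reduces to a single 32-bit get_user() into the .val member of struct ppc_inst.

	struct ppc_inst instr;

	if (get_user_instr(instr, (void __user *)regs->nip))
		return -EFAULT;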
[PATCH v1 04/15] powerpc/uaccess: Move get_user_instr helpers in asm/inst.h
Those helpers use the get_user helpers but do not participate in their implementation, so they do not belong in asm/uaccess.h. Move them to asm/inst.h. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/inst.h| 34 ++ arch/powerpc/include/asm/uaccess.h | 34 -- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/arch/powerpc/include/asm/inst.h b/arch/powerpc/include/asm/inst.h index cc73c1267572..19e18af2fac9 100644 --- a/arch/powerpc/include/asm/inst.h +++ b/arch/powerpc/include/asm/inst.h @@ -4,6 +4,40 @@ #include +#ifdef CONFIG_PPC64 + +#define ___get_user_instr(gu_op, dest, ptr)\ +({ \ + long __gui_ret = 0; \ + unsigned long __gui_ptr = (unsigned long)ptr; \ + struct ppc_inst __gui_inst; \ + unsigned int __prefix, __suffix;\ + __gui_ret = gu_op(__prefix, (unsigned int __user *)__gui_ptr); \ + if (__gui_ret == 0) { \ + if ((__prefix >> 26) == OP_PREFIX) {\ + __gui_ret = gu_op(__suffix, \ + (unsigned int __user *)__gui_ptr + 1); \ + __gui_inst = ppc_inst_prefix(__prefix, \ +__suffix); \ + } else {\ + __gui_inst = ppc_inst(__prefix);\ + } \ + if (__gui_ret == 0) \ + (dest) = __gui_inst;\ + } \ + __gui_ret; \ +}) +#else /* !CONFIG_PPC64 */ +#define ___get_user_instr(gu_op, dest, ptr)\ + gu_op((dest).val, (u32 __user *)(ptr)) +#endif /* CONFIG_PPC64 */ + +#define get_user_instr(x, ptr) \ + ___get_user_instr(get_user, x, ptr) + +#define __get_user_instr(x, ptr) \ + ___get_user_instr(__get_user, x, ptr) + /* * Instruction data type for POWER */ diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 01aea0df4dd0..eaa828a6a419 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -53,40 +53,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#ifdef CONFIG_PPC64 - -#define ___get_user_instr(gu_op, dest, ptr)\ -({ \ - long __gui_ret = 0; \ - unsigned long __gui_ptr = (unsigned long)ptr; \ - struct ppc_inst __gui_inst; \ - unsigned int __prefix, __suffix;\ - __gui_ret = gu_op(__prefix, (unsigned int __user *)__gui_ptr); \ - if (__gui_ret == 0) { \ - if ((__prefix >> 26) == OP_PREFIX) {\ - __gui_ret = gu_op(__suffix, \ - (unsigned int __user *)__gui_ptr + 1); \ - __gui_inst = ppc_inst_prefix(__prefix, \ -__suffix); \ - } else {\ - __gui_inst = ppc_inst(__prefix);\ - } \ - if (__gui_ret == 0) \ - (dest) = __gui_inst;\ - } \ - __gui_ret; \ -}) -#else /* !CONFIG_PPC64 */ -#define ___get_user_instr(gu_op, dest, ptr)\ - gu_op((dest).val, (u32 __user *)(ptr)) -#endif /* CONFIG_PPC64 */ - -#define get_user_instr(x, ptr) \ - ___get_user_instr(get_user, x, ptr) - -#define __get_user_instr(x, ptr) \ - ___get_user_instr(__get_user, x, ptr) - extern long __put_user_bad(void); #define __put_user_size(x, ptr, size, retval) \ -- 2.25.0
[PATCH v1 03/15] powerpc/uaccess: Remove __get/put_user_inatomic()
Since commit 662bbcb2747c ("mm, sched: Allow uaccess in atomic with pagefault_disable()"), __get/put_user() can be used in atomic parts of the code, therefore the __get/put_user_inatomic() introduced by commit e68c825bb016 ("[POWERPC] Add inatomic versions of __get_user and __put_user") have become useless. powerpc is the only one having such functions. There is a real intention not to have to provide such _inatomic() helpers, see the comment in might_fault() in mm/memory.c introduced by commit 3ee1afa308f2 ("x86: some lock annotations for user copy paths, v2"): /* * it would be nicer only to annotate paths which are not under * pagefault_disable, however that requires a larger audit and * providing helpers like get_user_atomic. */ So remove __get_user_inatomic() and __put_user_inatomic(). Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h| 37 --- arch/powerpc/kernel/align.c | 32 .../kernel/hw_breakpoint_constraints.c| 2 +- arch/powerpc/kernel/traps.c | 2 +- 4 files changed, 18 insertions(+), 55 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index a08c482b1315..01aea0df4dd0 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -53,11 +53,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#define __get_user_inatomic(x, ptr) \ - __get_user_nosleep((x), (ptr), sizeof(*(ptr))) -#define __put_user_inatomic(x, ptr) \ - __put_user_nosleep((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) - #ifdef CONFIG_PPC64 #define ___get_user_instr(gu_op, dest, ptr)\ @@ -92,9 +87,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) #define __get_user_instr(x, ptr) \ ___get_user_instr(__get_user, x, ptr) -#define __get_user_instr_inatomic(x, ptr) \ - ___get_user_instr(__get_user_inatomic, x, ptr) - extern long __put_user_bad(void); #define __put_user_size(x, ptr, size, retval) \ @@ -141,20 +133,6 @@ __pu_failed: \ __pu_err; \ }) -#define __put_user_nosleep(x, ptr, size) \ -({ \ - long __pu_err; \ - __typeof__(*(ptr)) __user *__pu_addr = (ptr); \ - __typeof__(*(ptr)) __pu_val = (x); \ - __typeof__(size) __pu_size = (size);\ - \ - __chk_user_ptr(__pu_addr); \ - __put_user_size(__pu_val, __pu_addr, __pu_size, __pu_err); \ - \ - __pu_err; \ -}) - - /* * We don't tell gcc that we are accessing memory, but this is OK * because we do not write to any memory gcc knows about, so there @@ -320,21 +298,6 @@ do { \ __gu_err; \ }) -#define __get_user_nosleep(x, ptr, size) \ -({ \ - long __gu_err; \ - __long_type(*(ptr)) __gu_val; \ - __typeof__(*(ptr)) __user *__gu_addr = (ptr); \ - __typeof__(size) __gu_size = (size);\ - \ - __chk_user_ptr(__gu_addr); \ - __get_user_size(__gu_val, __gu_addr, __gu_size, __gu_err); \ - (x) = (__force __typeof__(*(ptr)))__gu_val; \ - \ - __gu_err; \ -}) - - /* more complex routines */ extern unsigned long __copy_tofrom_user(void __user *to, diff --git a/arch/powerpc/kernel/align.c b/arch/powerpc/kernel/align.c index c7797eb958c7..83b199026a1e 100644 --- a/arch/powerpc/kernel/align.c +++ b/arch/powerpc/kernel/align.c @@ -174,18 +174,18 @@ static int emulate_spe(struct pt_regs *regs, unsigned int reg, switch (nb) { case 8: - ret |= __get_user_inatomic(temp.v[0], p++); - ret |= __get_user_inatomic(temp.v[1], p++); - ret |= __get_user_inatomic(temp.v[2], p++); - ret |= __get_user_inatomic(temp.v[3], p++); +
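For context, the pattern that commit 662bbcb2747c enables and that makes the _inatomic() variants redundant (a sketch, not code from this patch): with page faults disabled, a plain __get_user() fails fast instead of sleeping.

	u32 val;
	long ret;

	pagefault_disable();
	ret = __get_user(val, user_ptr);	/* returns -EFAULT rather than faulting the page in */
	pagefault_enable();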
[PATCH v1 00/15] powerpc: Cleanup of uaccess.h
This series cleans up uaccess.h. Christophe Leroy (15): powerpc/uaccess: Remove __get_user_allowed() and unsafe_op_wrap() powerpc/uaccess: Define ___get_user_instr() for ppc32 powerpc/uaccess: Remove __get/put_user_inatomic() powerpc/uaccess: Move get_user_instr helpers in asm/inst.h powerpc/align: Don't use __get_user_instr() on kernel addresses powerpc/uaccess: Call might_fault() unconditionally powerpc/uaccess: Remove __unsafe_put_user_goto() powerpc/uaccess: Remove __chk_user_ptr() in __get/put_user powerpc/uaccess: Remove calls to __get_user_bad() and __put_user_bad() powerpc/uaccess: Split out __get_user_nocheck() powerpc/uaccess: Rename __get/put_user_check/nocheck powerpc/uaccess: Refactor get/put_user() and __get/put_user() powerpc/uaccess: Swap clear_user() and __clear_user() powerpc/uaccess: Also perform 64 bits copies in unsafe_copy_to_user() on ppc32 powerpc/uaccess: Move copy_mc_xxx() functions down arch/powerpc/include/asm/inst.h | 34 ++ arch/powerpc/include/asm/uaccess.h| 303 ++ arch/powerpc/kernel/align.c | 38 ++- .../kernel/hw_breakpoint_constraints.c| 2 +- arch/powerpc/kernel/traps.c | 2 +- 5 files changed, 147 insertions(+), 232 deletions(-) -- 2.25.0
[PATCH v1 01/15] powerpc/uaccess: Remove __get_user_allowed() and unsafe_op_wrap()
Those two macros have only one user which is unsafe_get_user(). Put everything in one place and remove them. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/uaccess.h | 10 +- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/uaccess.h b/arch/powerpc/include/asm/uaccess.h index 78e2a3990eab..8cbf3e3874f1 100644 --- a/arch/powerpc/include/asm/uaccess.h +++ b/arch/powerpc/include/asm/uaccess.h @@ -53,9 +53,6 @@ static inline bool __access_ok(unsigned long addr, unsigned long size) #define __put_user(x, ptr) \ __put_user_nocheck((__typeof__(*(ptr)))(x), (ptr), sizeof(*(ptr))) -#define __get_user_allowed(x, ptr) \ - __get_user_nocheck((x), (ptr), sizeof(*(ptr)), false) - #define __get_user_inatomic(x, ptr) \ __get_user_nosleep((x), (ptr), sizeof(*(ptr))) #define __put_user_inatomic(x, ptr) \ @@ -482,8 +479,11 @@ user_write_access_begin(const void __user *ptr, size_t len) #define user_write_access_beginuser_write_access_begin #define user_write_access_end prevent_current_write_to_user -#define unsafe_op_wrap(op, err) do { if (unlikely(op)) goto err; } while (0) -#define unsafe_get_user(x, p, e) unsafe_op_wrap(__get_user_allowed(x, p), e) +#define unsafe_get_user(x, p, e) do { \ + if (unlikely(__get_user_nocheck((x), (p), sizeof(*(p)), false)))\ + goto e; \ +} while (0) + #define unsafe_put_user(x, p, e) \ __unsafe_put_user_goto((__typeof__(*(p)))(x), (p), sizeof(*(p)), e) -- 2.25.0
[PATCH] perf bench numa: Fix the condition checks for max number of numa nodes
In systems having higher node numbers available, like node 255, perf numa bench will fail with SIGABRT. <<>> perf: bench/numa.c:1416: init: Assertion `!(g->p.nr_nodes > 64 || g->p.nr_nodes < 0)' failed. Aborted (core dumped) <<>> Snippet from 'numactl -H' below on a powerpc system where the highest node number available is 255. available: 6 nodes (0,8,252-255) node 0 cpus: node 0 size: 519587 MB node 0 free: 516659 MB node 8 cpus: node 8 size: 523607 MB node 8 free: 486757 MB node 252 cpus: node 252 size: 0 MB node 252 free: 0 MB node 253 cpus: node 253 size: 0 MB node 253 free: 0 MB node 254 cpus: node 254 size: 0 MB node 254 free: 0 MB node 255 cpus: node 255 size: 0 MB node 255 free: 0 MB node distances: node 0 8 252 253 254 255 Note: the cpu lists are elided from the output above. These nodes 252-255 represent the memory on GPUs and are valid nodes. The perf numa bench init code has a condition check to see if the number of numa nodes (nr_nodes) exceeds MAX_NR_NODES. The value of MAX_NR_NODES defined in the perf code is 64, and 'nr_nodes' is the value from numa_max_node(), which represents the highest node number available in the system. On systems where we can have numa node 255, this condition check fails and results in SIGABRT. The numa benchmark uses the static value of MAX_NR_NODES as the size of two numa node arrays and of the node bitmask used for setting the memory policy. This patch fixes the issue by dynamically allocating the two arrays and the bitmask based on the nodes available in the system. With the fix, the perf numa benchmark works with the node configuration of any system, and the static MAX_NR_NODES value is removed. Signed-off-by: Athira Rajeev --- tools/perf/bench/numa.c | 42 +- 1 file changed, 29 insertions(+), 13 deletions(-) diff --git a/tools/perf/bench/numa.c b/tools/perf/bench/numa.c index 11726ec..20b87e2 100644 --- a/tools/perf/bench/numa.c +++ b/tools/perf/bench/numa.c @@ -344,18 +344,22 @@ static void mempol_restore(void) static void bind_to_memnode(int node) { - unsigned long nodemask; + struct bitmask *node_mask; int ret; if (node == NUMA_NO_NODE) return; - BUG_ON(g->p.nr_nodes > (int)sizeof(nodemask)*8); - nodemask = 1L << node; + node_mask = numa_allocate_nodemask(); + BUG_ON(!node_mask); - ret = set_mempolicy(MPOL_BIND, &nodemask, sizeof(nodemask)*8); - dprintf("binding to node %d, mask: %016lx => %d\n", node, nodemask, ret); + numa_bitmask_clearall(node_mask); + numa_bitmask_setbit(node_mask, node); + ret = set_mempolicy(MPOL_BIND, node_mask->maskp, node_mask->size + 1); + dprintf("binding to node %d, mask: %016lx => %d\n", node, *node_mask->maskp, ret); + + numa_bitmask_free(node_mask); BUG_ON(ret); } @@ -876,8 +880,6 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked) prctl(0, bytes_worked); } -#define MAX_NR_NODES 64 - /* * Count the number of nodes a process's threads * are spread out on. 
@@ -888,10 +890,15 @@ static void update_curr_cpu(int task_nr, unsigned long bytes_worked) */ static int count_process_nodes(int process_nr) { - char node_present[MAX_NR_NODES] = { 0, }; + char *node_present; int nodes; int n, t; + node_present = (char *)malloc(g->p.nr_nodes * sizeof(char)); + BUG_ON(!node_present); + for (nodes = 0; nodes < g->p.nr_nodes; nodes++) + node_present[nodes] = 0; + for (t = 0; t < g->p.nr_threads; t++) { struct thread_data *td; int task_nr; @@ -901,17 +908,20 @@ static int count_process_nodes(int process_nr) td = g->threads + task_nr; node = numa_node_of_cpu(td->curr_cpu); - if (node < 0) /* curr_cpu was likely still -1 */ + if (node < 0) /* curr_cpu was likely still -1 */ { + free(node_present); return 0; + } node_present[node] = 1; } nodes = 0; - for (n = 0; n < MAX_NR_NODES; n++) + for (n = 0; n < g->p.nr_nodes; n++) nodes += node_present[n]; + free(node_present); return nodes; } @@ -980,7 +990,7 @@ static void calc_convergence(double runtime_ns_max, double *convergence) { unsigned int loops_done_min, loops_done_max; int process_groups; - int nodes[MAX_NR_NODES]; + int *nodes; int distance; int nr_min; int nr_max; @@ -994,6 +1004,8 @@ static void calc_convergence(double runtime_ns_max, double *convergence) if (!g->p.show_convergence && !g->p.measure_convergence) return; + nodes = (int *)malloc(g->p.nr_nodes * sizeof(int)); + BUG_ON(!nodes); for (node = 0; node < g->p.nr_nodes; node++
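As a self-contained illustration of the approach (assuming libnuma; the function below is hypothetical, not code from the patch), binding to a single node with a mask sized for the running system looks like this:

#include <numa.h>
#include <numaif.h>

static int bind_current_to_node(int node)
{
	struct bitmask *mask = numa_allocate_nodemask();	/* sized from the system's node count */
	int ret;

	if (!mask)
		return -1;
	numa_bitmask_setbit(mask, node);
	ret = set_mempolicy(MPOL_BIND, mask->maskp, mask->size + 1);
	numa_bitmask_free(mask);
	return ret;
}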
Re: [RFC PATCH 8/8] powerpc/64/asm: don't reassign labels
On Thu, Feb 25, 2021 at 02:10:06PM +1100, Daniel Axtens wrote: > The assembler really does not like us reassigning things to the same > label: > > :7:9: error: invalid reassignment of non-absolute variable > 'fs_label' > > This happens across a bunch of platforms: > https://github.com/ClangBuiltLinux/linux/issues/1043 > https://github.com/ClangBuiltLinux/linux/issues/1008 > https://github.com/ClangBuiltLinux/linux/issues/920 > https://github.com/ClangBuiltLinux/linux/issues/1050 > > There is no hope of getting this fixed in LLVM, so if we want to build > with LLVM_IAS, we need to hack around it ourselves. > > For us the big problem comes from this: > > #define USE_FIXED_SECTION(sname) \ > fs_label = start_##sname; \ > fs_start = sname##_start; \ > use_ftsec sname; > > #define USE_TEXT_SECTION() \ > fs_label = start_text; \ > fs_start = text_start; \ > .text > > and in particular fs_label. The "Setting Symbols" super short chapter reads: "A symbol can be given an arbitrary value by writing a symbol, followed by an equals sign '=', followed by an expression. This is equivalent to using the '.set' directive." And ".set" has "Set the value of SYMBOL to EXPRESSION. This changes SYMBOL's value and type to conform to EXPRESSION. If SYMBOL was flagged as external, it remains flagged. You may '.set' a symbol many times in the same assembly provided that the values given to the symbol are constants. Values that are based on expressions involving other symbols are allowed, but some targets may restrict this to only being done once per assembly. This is because those targets do not set the addresses of symbols at assembly time, but rather delay the assignment until a final link is performed. This allows the linker a chance to change the code in the files, changing the location of, and the relative distance between, various different symbols. If you '.set' a global symbol, the value stored in the object file is the last value stored into it." So this really should be fixed in clang: it is basic assembler syntax. Segher
Re: [RFC PATCH 7/8] powerpc/purgatory: drop .machine specifier
On Thu, Feb 25, 2021 at 02:10:05PM +1100, Daniel Axtens wrote: > It's ignored by future versions of llvm's integrated assembler (by not -11). > I'm not sure what it does for us in gas. It enables all insns that exist on 620 (the first 64-bit PowerPC CPU). > --- a/arch/powerpc/purgatory/trampoline_64.S > +++ b/arch/powerpc/purgatory/trampoline_64.S > @@ -12,7 +12,7 @@ > #include > #include > > - .machine ppc64 > +//upgrade clang, gets ignored.machine ppc64 Why delete it if it is ignored? Why add a cryptic comment? Segher
Re: [RFC PATCH 5/8] poweprc/lib/quad: Provide macros for lq/stq
On Thu, Feb 25, 2021 at 02:10:03PM +1100, Daniel Axtens wrote: > +#define PPC_RAW_LQ(t, a, dq) (0xe0000000 | ___PPC_RT(t) | > ___PPC_RA(a) | (((dq) & 0xfff) << 3)) Please keep the operand order the same as for the assembler insns? So t,dq,a here. It should be (((dq) & 0x0fff) << 4). > +#define PPC_RAW_STQ(t, a, ds) (0xf8000002 | ___PPC_RT(t) | > ___PPC_RA(a) | (((ds) & 0xfff) << 3)) And t,ds,a here. (But it should use "s" instead of "t" preferably, and use ___PPC_RS, because it is a source field, not a target). It should be (((ds) & 0x3fff) << 2) as well. Segher
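Putting Segher's corrections together, the macros would plausibly end up as below (a sketch only, untested; ___PPC_RS per his suggestion since stq's first operand is a source field):

#define PPC_RAW_LQ(t, dq, a)	(0xe0000000 | ___PPC_RT(t) | ___PPC_RA(a) | (((dq) & 0x0fff) << 4))
#define PPC_RAW_STQ(s, ds, a)	(0xf8000002 | ___PPC_RS(s) | ___PPC_RA(a) | (((ds) & 0x3fff) << 2))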
Re: [RFC PATCH 4/8] powerpc/ppc_asm: use plain numbers for registers
On Thu, Feb 25, 2021 at 02:10:02PM +1100, Daniel Axtens wrote: > This is dumb but makes the llvm integrated assembler happy. > https://github.com/ClangBuiltLinux/linux/issues/764 > -#define r0 %r0 > +#define r0 0 This is a big step back (compare 9a13a524ba37). If you use a new enough GAS, you can use the -mregnames option and just say "r0" directly (so not define it at all, or define it to itself).
===
	addi 3,3,3
	addi r3,r3,3
	addi %r3,%r3,3

	addi 3,3,3
	addi r3,r3,r3
	addi %r3,%r3,%r3
===
$ as t.s -o t.o -mregnames
t.s: Assembler messages:
t.s:6: Warning: invalid register expression
t.s:7: Warning: invalid register expression
Many people do not like bare numbers. It is a bit like not wearing seatbelts (but so is all assembler code really: you just have to pay attention). A better argument is that it is harder to read for people not used to assembler code like this. We used to have "#define r0 0" etc., and that was quite problematic. Like that "addi r3,r3,r3" example, but also, people wrote "r0" where only a plain 0 is allowed (like in "lwzx r3,0,r3": "r0" would be misleading there!) Segher
Re: [PATCH v2 16/37] KVM: PPC: Book3S HV P9: Stop handling hcalls in real-mode in the P9 path
On 2/25/21 2:46 PM, Nicholas Piggin wrote: > In the interest of minimising the amount of code that is run in > "real-mode", don't handle hcalls in real mode in the P9 path. > > POWER8 and earlier are much more expensive to exit from HV real mode > and switch to host mode, because on those processors HV interrupts get > to the hypervisor with the MMU off, and the other threads in the core > need to be pulled out of the guest, and SLBs all need to be saved, > ERATs invalidated, and host SLB reloaded before the MMU is re-enabled > in host mode. Hash guests also require a lot of hcalls to run. The > XICS interrupt controller requires hcalls to run. > > By contrast, POWER9 has independent thread switching, and in radix mode > the hypervisor is already in a host virtual memory mode when the HV > interrupt is taken. Radix + xive guests don't need hcalls to handle > interrupts or manage translations. > > So it's much less important to handle hcalls in real mode in P9. > > Signed-off-by: Nicholas Piggin > --- > arch/powerpc/include/asm/kvm_ppc.h | 5 + > arch/powerpc/kvm/book3s_hv.c| 25 ++--- > arch/powerpc/kvm/book3s_hv_rmhandlers.S | 5 + > arch/powerpc/kvm/book3s_xive.c | 25 + > 4 files changed, 57 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/include/asm/kvm_ppc.h > b/arch/powerpc/include/asm/kvm_ppc.h > index 73b1ca5a6471..db6646c2ade2 100644 > --- a/arch/powerpc/include/asm/kvm_ppc.h > +++ b/arch/powerpc/include/asm/kvm_ppc.h > @@ -607,6 +607,7 @@ extern void kvmppc_free_pimap(struct kvm *kvm); > extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall); > extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); > extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd); > +extern int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req); > extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); > extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); > extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, > @@ -639,6 +640,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu > *vcpu) > static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } > static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) > { return 0; } > +static inline int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req) > + { return 0; } > #endif > > #ifdef CONFIG_KVM_XIVE > @@ -673,6 +676,7 @@ extern int kvmppc_xive_set_irq(struct kvm *kvm, int > irq_source_id, u32 irq, > int level, bool line_status); > extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); > extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu); > +extern void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu); I can not find this routine. Is it missing or coming later in the patchset ? C. 
> > static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) > { > @@ -714,6 +718,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, > int irq_source_id, u32 ir > int level, bool line_status) { return > -ENODEV; } > static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } > static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { } > +static inline void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu) { } > > static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) > { return 0; } > diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c > index 7e23838b7f9b..d4770b222d7e 100644 > --- a/arch/powerpc/kvm/book3s_hv.c > +++ b/arch/powerpc/kvm/book3s_hv.c > @@ -1144,7 +1144,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) > * This has to be done early, not in kvmppc_pseries_do_hcall(), so > * that the cede logic in kvmppc_run_single_vcpu() works properly. > */ > -static void kvmppc_nested_cede(struct kvm_vcpu *vcpu) > +static void kvmppc_cede(struct kvm_vcpu *vcpu) > { > vcpu->arch.shregs.msr |= MSR_EE; > vcpu->arch.ceded = 1; > @@ -3731,15 +3731,34 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu > *vcpu, u64 time_limit, > /* H_CEDE has to be handled now, not later */ > if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && > kvmppc_get_gpr(vcpu, 3) == H_CEDE) { > - kvmppc_nested_cede(vcpu); > + kvmppc_cede(vcpu); > kvmppc_set_gpr(vcpu, 3, 0); > trap = 0; > } > } else { > kvmppc_xive_push_vcpu(vcpu); > trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr); > - kvmppc_xive_pull_vcpu(vcpu); > + /* H_CEDE has to be handled now, not later */ > + /* XICS hcalls must be handled before xive is pulled */ > + if (trap == BOOK3S_INTERRUPT_SYSCALL && > + !(vcpu->arch.shregs.msr & MSR_PR)) { > +
Re: [PATCH v2] vio: make remove callback return void
On Thu, Feb 25, 2021 at 12:52 PM Michael Ellerman wrote: > > Uwe Kleine-König writes: > > The driver core ignores the return value of struct bus_type::remove() > > because there is only little that can be done. To simplify the quest to > > make this function return void, let struct vio_driver::remove() return > > void, too. All users already unconditionally return 0, this commit makes > > it obvious that returning an error code is a bad idea and makes it > > obvious for future driver authors that returning an error code isn't > > intended. > > > > Note there are two nominally different implementations for a vio bus: > > one in arch/sparc/kernel/vio.c and the other in > > arch/powerpc/platforms/pseries/vio.c. I didn't care to check which > > driver is using which of these busses (or if even some of them can be > > used with both) and simply adapt all drivers and the two bus codes in > > one go. > > I'm 99% sure there's no connection between the two implementations, > other than the name. > > So splitting the patch by arch would make it easier to merge. I'm > reluctant to merge changes to sparc code. The sparc subsystem clearly started out as a copy of the powerpc version, and serves roughly the same purpose, but the communication with the hypervisor is quite different. As there are only four drivers for the sparc vio subsystem: drivers/block/sunvdc.c drivers/net/ethernet/sun/ldmvsw.c drivers/net/ethernet/sun/sunvnet.c drivers/tty/vcc.c maybe it would make sense to rename those to use distinct identifiers now? Arnd
[PATCH v2 37/37] KVM: PPC: Book3S HV: remove POWER9 support from P7/8 paths
This is dead code now. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c| 27 +- arch/powerpc/kvm/book3s_hv_interrupts.S | 9 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 404 +--- arch/powerpc/platforms/powernv/idle.c | 52 +-- 4 files changed, 23 insertions(+), 469 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 10d5c7ea80ca..483a1a821ea4 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3040,8 +3040,7 @@ static void prepare_threads(struct kvmppc_vcore *vc) for_each_runnable_thread(i, vcpu, vc) { if (signal_pending(vcpu->arch.run_task)) vcpu->arch.ret = -EINTR; - else if (no_mixing_hpt_and_radix && -kvm_is_radix(vc->kvm) != radix_enabled()) + else if (kvm_is_radix(vc->kvm)) vcpu->arch.ret = -EINVAL; else if (vcpu->arch.vpa.update_pending || vcpu->arch.slb_shadow.update_pending || @@ -3249,6 +3248,9 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) int trap; bool is_power8; + if (WARN_ON_ONCE(cpu_has_feature(CPU_FTR_ARCH_300))) + return; + /* * Remove from the list any threads that have a signal pending * or need a VPA update done @@ -3276,9 +3278,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) * Make sure we are running on primary threads, and that secondary * threads are offline. Also check if the number of threads in this * guest are greater than the current system threads per guest. -* On POWER9, we need to be not in independent-threads mode if -* this is a HPT guest on a radix host machine where the -* CPU threads may not be in different MMU modes. */ if ((controlled_threads > 1) && ((vc->num_threads > threads_per_subcore) || !on_primary_thread())) { @@ -3302,18 +3301,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) if (vc->num_threads < target_threads) collect_piggybacks(&core_info, target_threads); - /* -* On radix, arrange for TLB flushing if necessary. -* This has to be done before disabling interrupts since -* it uses smp_call_function(). -*/ - pcpu = smp_processor_id(); - if (kvm_is_radix(vc->kvm)) { - for (sub = 0; sub < core_info.n_subcores; ++sub) - for_each_runnable_thread(i, vcpu, core_info.vc[sub]) - kvmppc_prepare_radix_vcpu(vcpu, pcpu); - } - /* * Hard-disable interrupts, and check resched flag and signals. * If we need to reschedule or deliver a signal, clean up @@ -3346,8 +,7 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) cmd_bit = stat_bit = 0; split = core_info.n_subcores; sip = NULL; - is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S) - && !cpu_has_feature(CPU_FTR_ARCH_300); + is_power8 = cpu_has_feature(CPU_FTR_ARCH_207S); if (split > 1) { sip = &split_info; @@ -3630,8 +3616,7 @@ static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, } /* - * Virtual-mode guest entry for POWER9 and later when the host and - * guest are both using the radix MMU. The LPIDR has already been set. + * Guest entry for POWER9 and later CPUs. */ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) diff --git a/arch/powerpc/kvm/book3s_hv_interrupts.S b/arch/powerpc/kvm/book3s_hv_interrupts.S index 327417d79eac..f83cb133 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupts.S +++ b/arch/powerpc/kvm/book3s_hv_interrupts.S @@ -58,7 +58,7 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) /* * Put whatever is in the decrementer into the * hypervisor decrementer. -* Because of a hardware deviation in P8 and P9, +* Because of a hardware deviation in P8, * we need to set LPCR[HDICE] before writing HDEC. 
*/ ld r5, HSTATE_KVM_VCORE(r13) @@ -67,15 +67,10 @@ END_FTR_SECTION_IFCLR(CPU_FTR_ARCH_207S) ori r8, r9, LPCR_HDICE mtspr SPRN_LPCR, r8 isync - andis. r0, r9, LPCR_LD@h mfspr r8,SPRN_DEC mftbr7 -BEGIN_FTR_SECTION - /* On POWER9, don't sign-extend if host LPCR[LD] bit is set */ - bne 32f -END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) extsw r8,r8 -32:mtspr SPRN_HDEC,r8 + mtspr SPRN_HDEC,r8 add r8,r8,r7 std r8,HSTATE_DECEXP(r13) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index be5742640780..4e9808a2c3a9 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +
[PATCH v2 36/37] KVM: PPC: Book3S HV P9: implement hash host / hash guest support
Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 2 +- arch/powerpc/kvm/book3s_hv_interrupt.c | 75 -- 2 files changed, 47 insertions(+), 30 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 97320531f37c..10d5c7ea80ca 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -4403,7 +4403,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu * path, which also handles hash and dependent threads mode. */ - if (radix_enabled()) + if (cpu_has_feature(CPU_FTR_ARCH_300)) r = kvmhv_run_single_vcpu(vcpu, ~(u64)0, vcpu->arch.vcore->lpcr); else diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c index d79c6f4f330c..af4772755e5d 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupt.c +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c @@ -140,12 +140,51 @@ static void switch_mmu_to_guest_hpt(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 } -static void switch_mmu_to_host_radix(struct kvm *kvm, u32 pid) +static void switch_mmu_to_host(struct kvm *kvm, u32 pid) { mtspr(SPRN_PID, pid); mtspr(SPRN_LPID, kvm->arch.host_lpid); mtspr(SPRN_LPCR, kvm->arch.host_lpcr); isync(); + + /* XXX: could save and restore host SLBs to reduce SLB faults */ + if (!radix_enabled()) + slb_restore_bolted_realmode(); +} + +static void save_host_mmu(struct kvm *kvm) +{ + if (!radix_enabled()) { + mtslb(0, 0, 0); + slb_invalidate(6); + } +} + +static void save_guest_mmu(struct kvm *kvm, struct kvm_vcpu *vcpu) +{ + if (kvm_is_radix(kvm)) { + radix_clear_slb(); + } else { + int i; + int nr = 0; + + /* +* This must run before switching to host (radix host can't +* access all SLBs). +*/ + for (i = 0; i < vcpu->arch.slb_nr; i++) { + u64 slbee, slbev; + mfslb(i, &slbee, &slbev); + if (slbee & SLB_ESID_V) { + vcpu->arch.slb[nr].orige = slbee | i; + vcpu->arch.slb[nr].origv = slbev; + nr++; + } + } + vcpu->arch.slb_max = nr; + mtslb(0, 0, 0); + slb_invalidate(6); + } } int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) @@ -252,15 +291,16 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc mtspr(SPRN_AMOR, ~0UL); + if (!radix_enabled() || !kvm_is_radix(kvm) || cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) + __mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0); + + save_host_mmu(kvm); if (kvm_is_radix(kvm)) { - if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) - __mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0); switch_mmu_to_guest_radix(kvm, vcpu, lpcr); if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) __mtmsrd(0, 1); /* clear RI */ } else { - __mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0); switch_mmu_to_guest_hpt(kvm, vcpu, lpcr); } @@ -437,31 +477,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc /* HDEC must be at least as large as DEC, so decrementer_max fits */ mtspr(SPRN_HDEC, decrementer_max); - if (kvm_is_radix(kvm)) { - radix_clear_slb(); - } else { - int i; - int nr = 0; - - /* -* This must run before switching to host (radix host can't -* access all SLBs). 
-*/ - for (i = 0; i < vcpu->arch.slb_nr; i++) { - u64 slbee, slbev; - mfslb(i, &slbee, &slbev); - if (slbee & SLB_ESID_V) { - vcpu->arch.slb[nr].orige = slbee | i; - vcpu->arch.slb[nr].origv = slbev; - nr++; - } - } - vcpu->arch.slb_max = nr; - mtslb(0, 0, 0); - slb_invalidate(6); - } - - switch_mmu_to_host_radix(kvm, host_pidr); + save_guest_mmu(kvm, vcpu); + switch_mmu_to_host(kvm, host_pidr); /* * If we are in real mode, don't switch MMU on until the MMU is -- 2.23.0
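Schematically, the entry/exit ordering this patch establishes (names taken from the diff above; a summary, not literal code):

	save_host_mmu(kvm);			/* stash host SLBs on a hash host */
	switch_mmu_to_guest_hpt(kvm, vcpu, lpcr);
	/* ... guest runs ... */
	save_guest_mmu(kvm, vcpu);		/* must precede the host switch: a radix
						   host can't access all guest SLBs */
	switch_mmu_to_host(kvm, host_pidr);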
[PATCH v2 35/37] KVM: PPC: Book3S HV P9: implement hash guest support
Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c| 20 ++-- arch/powerpc/kvm/book3s_hv_interrupt.c | 123 +--- arch/powerpc/kvm/book3s_hv_rm_mmu.c | 4 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 14 +-- 4 files changed, 109 insertions(+), 52 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1bbc46f2cfbf..97320531f37c 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3765,7 +3765,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, } kvmppc_xive_pull_vcpu(vcpu); - vcpu->arch.slb_max = 0; + if (kvm_is_radix(vcpu->kvm)) + vcpu->arch.slb_max = 0; } dec = mfspr(SPRN_DEC); @@ -3998,7 +3999,6 @@ static void kvmppc_vcore_blocked(struct kvmppc_vcore *vc) /* * This never fails for a radix guest, as none of the operations it does * for a radix guest can fail or have a way to report failure. - * kvmhv_run_single_vcpu() relies on this fact. */ static int kvmhv_setup_mmu(struct kvm_vcpu *vcpu) { @@ -4177,8 +4177,15 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, vc->runner = vcpu; /* See if the MMU is ready to go */ - if (!kvm->arch.mmu_ready) - kvmhv_setup_mmu(vcpu); + if (!kvm->arch.mmu_ready) { + r = kvmhv_setup_mmu(vcpu); + if (r) { + run->exit_reason = KVM_EXIT_FAIL_ENTRY; + run->fail_entry.hardware_entry_failure_reason = 0; + vcpu->arch.ret = r; + return r; + } + } if (need_resched()) cond_resched(); @@ -4191,7 +4198,8 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, preempt_disable(); pcpu = smp_processor_id(); vc->pcpu = pcpu; - kvmppc_prepare_radix_vcpu(vcpu, pcpu); + if (kvm_is_radix(kvm)) + kvmppc_prepare_radix_vcpu(vcpu, pcpu); local_irq_disable(); hard_irq_disable(); @@ -4395,7 +4403,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu * path, which also handles hash and dependent threads mode. */ - if (kvm_is_radix(kvm)) + if (radix_enabled()) r = kvmhv_run_single_vcpu(vcpu, ~(u64)0, vcpu->arch.vcore->lpcr); else diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c index 9784da3f8565..d79c6f4f330c 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupt.c +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c @@ -55,44 +55,25 @@ static void __accumulate_time(struct kvm_vcpu *vcpu, struct kvmhv_tb_accumulator #define accumulate_time(vcpu, next) do {} while (0) #endif -static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr) -{ - struct kvmppc_vcore *vc = vcpu->arch.vcore; - struct kvm_nested_guest *nested = vcpu->arch.nested; - u32 lpid; - - lpid = nested ? nested->shadow_lpid : kvm->arch.lpid; - - mtspr(SPRN_LPID, lpid); - mtspr(SPRN_LPCR, lpcr); - mtspr(SPRN_PID, vcpu->arch.pid); - isync(); - - /* TLBIEL must have LPIDR set, so set guest LPID before flushing. 
*/ - kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested); -} - -static void switch_mmu_to_host_radix(struct kvm *kvm, u32 pid) -{ - mtspr(SPRN_PID, pid); - mtspr(SPRN_LPID, kvm->arch.host_lpid); - mtspr(SPRN_LPCR, kvm->arch.host_lpcr); - isync(); -} - static inline void mfslb(unsigned int idx, u64 *slbee, u64 *slbev) { asm volatile("slbmfev %0,%1" : "=r" (*slbev) : "r" (idx)); asm volatile("slbmfee %0,%1" : "=r" (*slbee) : "r" (idx)); } +static inline void __mtslb(u64 slbee, u64 slbev) +{ + asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee)); +} + static inline void mtslb(unsigned int idx, u64 slbee, u64 slbev) { BUG_ON((slbee & 0xfff) != idx); - asm volatile("slbmte %0,%1" :: "r" (slbev), "r" (slbee)); + __mtslb(slbee, slbev); } + static inline void slb_invalidate(unsigned int ih) { asm volatile("slbia %0" :: "i"(ih)); @@ -119,6 +100,54 @@ static void radix_clear_slb(void) } } +static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + struct kvm_nested_guest *nested = vcpu->arch.nested; + u32 lpid; + + lpid = nested ? nested->shadow_lpid : kvm->arch.lpid; + + mtspr(SPRN_LPID, lpid); + mtspr(SPRN_LPCR, lpcr); + mtspr(SPRN_PID, vcpu->arch.pid); + isync(); + + /* TLBIEL must have LPIDR set, so set
[PATCH v2 34/37] KVM: PPC: Book3S HV: add virtual mode handlers for HPT hcalls and page faults
In order to support hash guests in the P9 path (which does not do real mode hcalls or page fault handling), these real-mode hash specific interrupts need to be implemented in virt mode. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 118 +-- 1 file changed, 113 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 9d2fa21201c1..1bbc46f2cfbf 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -935,6 +935,52 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) return RESUME_HOST; switch (req) { + case H_REMOVE: + ret = kvmppc_h_remove(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_ENTER: + ret = kvmppc_h_enter(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6), + kvmppc_get_gpr(vcpu, 7)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_READ: + ret = kvmppc_h_read(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_CLEAR_MOD: + ret = kvmppc_h_clear_mod(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_CLEAR_REF: + ret = kvmppc_h_clear_ref(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_PROTECT: + ret = kvmppc_h_protect(vcpu, kvmppc_get_gpr(vcpu, 4), + kvmppc_get_gpr(vcpu, 5), + kvmppc_get_gpr(vcpu, 6)); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_BULK_REMOVE: + ret = kvmppc_h_bulk_remove(vcpu); + if (ret == H_TOO_HARD) + return RESUME_HOST; + break; + case H_CEDE: break; case H_PROD: @@ -1134,6 +1180,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) default: return RESUME_HOST; } + WARN_ON_ONCE(ret == H_TOO_HARD); kvmppc_set_gpr(vcpu, 3, ret); vcpu->arch.hcall_needed = 0; return RESUME_GUEST; @@ -1420,19 +1467,80 @@ static int kvmppc_handle_exit_hv(struct kvm_vcpu *vcpu, * host page has been paged out. Any other HDSI/HISI interrupts * have been handled already. */ - case BOOK3S_INTERRUPT_H_DATA_STORAGE: - r = RESUME_PAGE_FAULT; - if (vcpu->arch.fault_dsisr == HDSISR_CANARY) + case BOOK3S_INTERRUPT_H_DATA_STORAGE: { + unsigned long vsid; + long err; + + if (vcpu->arch.fault_dsisr == HDSISR_CANARY) { r = RESUME_GUEST; /* Just retry if it's the canary */ + break; + } + + if (kvm_is_radix(vcpu->kvm)) { + r = RESUME_PAGE_FAULT; + break; + } + + if (!(vcpu->arch.fault_dsisr & (DSISR_NOHPTE | DSISR_PROTFAULT))) { + kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dar, vcpu->arch.fault_dsisr); + r = RESUME_GUEST; + break; + } + if (!(vcpu->arch.shregs.msr & MSR_DR)) { + vsid = vcpu->kvm->arch.vrma_slb_v; + } else { + vsid = vcpu->arch.fault_gpa; + } + err = kvmppc_hpte_hv_fault(vcpu, vcpu->arch.fault_dar, + vsid, vcpu->arch.fault_dsisr, true); + if (err == 0) { + r = RESUME_GUEST; + } else if (err == -1 || err == -2) { + r = RESUME_PAGE_FAULT; + } else { + kvmppc_core_queue_data_storage(vcpu, vcpu->arch.fault_dar, err); + r = RESUME_GUEST; + } break; - case BOOK3S_INTERRUPT_H_INST_STORAGE: + } + case BOOK3S_INTERRUPT_H_INST_STORAGE: { + unsigned long vsid; +
[PATCH v2 33/37] KVM: PPC: Book3S HV: small pseries_do_hcall cleanup
Functionality should not be changed. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 29 +++-- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 1f27187ff1e7..9d2fa21201c1 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -923,6 +923,7 @@ static int kvmppc_get_yield_count(struct kvm_vcpu *vcpu) int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) { + struct kvm *kvm = vcpu->kvm; unsigned long req = kvmppc_get_gpr(vcpu, 3); unsigned long target, ret = H_SUCCESS; int yield_count; @@ -938,7 +939,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; case H_PROD: target = kvmppc_get_gpr(vcpu, 4); - tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + tvcpu = kvmppc_find_vcpu(kvm, target); if (!tvcpu) { ret = H_PARAMETER; break; @@ -952,7 +953,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) target = kvmppc_get_gpr(vcpu, 4); if (target == -1) break; - tvcpu = kvmppc_find_vcpu(vcpu->kvm, target); + tvcpu = kvmppc_find_vcpu(kvm, target); if (!tvcpu) { ret = H_PARAMETER; break; @@ -968,12 +969,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) kvmppc_get_gpr(vcpu, 6)); break; case H_RTAS: - if (list_empty(&vcpu->kvm->arch.rtas_tokens)) + if (list_empty(&kvm->arch.rtas_tokens)) return RESUME_HOST; - idx = srcu_read_lock(&vcpu->kvm->srcu); + idx = srcu_read_lock(&kvm->srcu); rc = kvmppc_rtas_hcall(vcpu); - srcu_read_unlock(&vcpu->kvm->srcu, idx); + srcu_read_unlock(&kvm->srcu, idx); if (rc == -ENOENT) return RESUME_HOST; @@ -1060,12 +1061,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_SET_PARTITION_TABLE: ret = H_FUNCTION; - if (nesting_enabled(vcpu->kvm)) + if (nesting_enabled(kvm)) ret = kvmhv_set_partition_table(vcpu); break; case H_ENTER_NESTED: ret = H_FUNCTION; - if (!nesting_enabled(vcpu->kvm)) + if (!nesting_enabled(kvm)) break; ret = kvmhv_enter_nested_guest(vcpu); if (ret == H_INTERRUPT) { @@ -1080,12 +1081,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) break; case H_TLB_INVALIDATE: ret = H_FUNCTION; - if (nesting_enabled(vcpu->kvm)) + if (nesting_enabled(kvm)) ret = kvmhv_do_nested_tlbie(vcpu); break; case H_COPY_TOFROM_GUEST: ret = H_FUNCTION; - if (nesting_enabled(vcpu->kvm)) + if (nesting_enabled(kvm)) ret = kvmhv_copy_tofrom_guest_nested(vcpu); break; case H_PAGE_INIT: @@ -1096,7 +1097,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_SVM_PAGE_IN: ret = H_UNSUPPORTED; if (kvmppc_get_srr1(vcpu) & MSR_S) - ret = kvmppc_h_svm_page_in(vcpu->kvm, + ret = kvmppc_h_svm_page_in(kvm, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); @@ -1104,7 +1105,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_SVM_PAGE_OUT: ret = H_UNSUPPORTED; if (kvmppc_get_srr1(vcpu) & MSR_S) - ret = kvmppc_h_svm_page_out(vcpu->kvm, + ret = kvmppc_h_svm_page_out(kvm, kvmppc_get_gpr(vcpu, 4), kvmppc_get_gpr(vcpu, 5), kvmppc_get_gpr(vcpu, 6)); @@ -1112,12 +1113,12 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) case H_SVM_INIT_START: ret = H_UNSUPPORTED; if (kvmppc_get_srr1(vcpu) & MSR_S) - ret = kvmppc_h_svm_init_start(vcpu->kvm); + ret = kvmppc_h_svm_init_start(kvm); break; case H_SVM_INIT_DONE: ret = H_UNSUPPORTED; if (kvmppc_get_srr1(vcpu) & MSR_S) - ret = kvmppc_h_svm_init_done(vcpu->kvm); + ret = k
[PATCH v2 32/37] KVM: PPC: Book3S HV: Remove radix guest support from P7/8 path
The P9 path will run all supported radix guest combinations now, so remove support from the old path. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 65 ++--- 1 file changed, 3 insertions(+), 62 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 61f71a7df238..a8ce68eed13e 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -899,11 +899,6 @@ ALT_FTR_SECTION_END_IFCLR(CPU_FTR_ARCH_300) cmpdi r3, 512 /* 1 microsecond */ blt hdec_soon - ld r6, VCPU_KVM(r4) - lbz r0, KVM_RADIX(r6) - cmpwi r0, 0 - bne 9f - /* For hash guest, clear out and reload the SLB */ BEGIN_MMU_FTR_SECTION /* Radix host won't have populated the SLB, so no need to clear */ @@ -1389,11 +1384,7 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ patch_site 1b patch__call_kvm_flush_link_stack /* For hash guest, read the guest SLB and save it away */ - ld r5, VCPU_KVM(r9) - lbz r0, KVM_RADIX(r5) li r5, 0 - cmpwi r0, 0 - bne 0f /* for radix, save 0 entries */ lwz r0,VCPU_SLB_NR(r9) /* number of entries in SLB */ mtctr r0 li r6,0 @@ -1432,23 +1423,6 @@ END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) slbmte r6,r5 1: addir8,r8,16 .endr - b guest_bypass - -0: /* -* Malicious or buggy radix guests may have inserted SLB entries -* (only 0..3 because radix always runs with UPRT=1), so these must -* be cleared here to avoid side-channels. slbmte is used rather -* than slbia, as it won't clear cached translations. -*/ - li r0,0 - stw r0,VCPU_SLB_MAX(r9) - slbmte r0,r0 - li r4,1 - slbmte r0,r4 - li r4,2 - slbmte r0,r4 - li r4,3 - slbmte r0,r4 guest_bypass: stw r12, STACK_SLOT_TRAP(r1) @@ -1694,24 +1668,6 @@ BEGIN_FTR_SECTION mtspr SPRN_PID, r7 END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) -#ifdef CONFIG_PPC_RADIX_MMU - /* -* Are we running hash or radix ? -*/ - ld r5, VCPU_KVM(r9) - lbz r0, KVM_RADIX(r5) - cmpwi cr2, r0, 0 - beq cr2, 2f - - /* -* Radix: do eieio; tlbsync; ptesync sequence in case we -* interrupted the guest between a tlbie and a ptesync. -*/ - eieio - tlbsync - ptesync -#endif /* CONFIG_PPC_RADIX_MMU */ - /* * cp_abort is required if the processor supports local copy-paste * to clear the copy buffer that was under control of the guest. @@ -1970,8 +1926,6 @@ kvmppc_tm_emul: * reflect the HDSI to the guest as a DSI. */ kvmppc_hdsi: - ld r3, VCPU_KVM(r9) - lbz r0, KVM_RADIX(r3) mfspr r4, SPRN_HDAR mfspr r6, SPRN_HDSISR BEGIN_FTR_SECTION @@ -1979,8 +1933,6 @@ BEGIN_FTR_SECTION cmpdi r6, 0x7fff beq 6f END_FTR_SECTION_IFSET(CPU_FTR_ARCH_300) - cmpwi r0, 0 - bne .Lradix_hdsi/* on radix, just save DAR/DSISR/ASDR */ /* HPTE not found fault or protection fault? */ andis. r0, r6, (DSISR_NOHPTE | DSISR_PROTFAULT)@h beq 1f /* if not, send it to the guest */ @@ -2057,23 +2009,11 @@ fast_interrupt_c_return: stb r0, HSTATE_IN_GUEST(r13) b guest_exit_cont -.Lradix_hdsi: - std r4, VCPU_FAULT_DAR(r9) - stw r6, VCPU_FAULT_DSISR(r9) -.Lradix_hisi: - mfspr r5, SPRN_ASDR - std r5, VCPU_FAULT_GPA(r9) - b guest_exit_cont - /* * Similarly for an HISI, reflect it to the guest as an ISI unless * it is an HPTE not found fault for a page that we have paged out. */ kvmppc_hisi: - ld r3, VCPU_KVM(r9) - lbz r0, KVM_RADIX(r3) - cmpwi r0, 0 - bne .Lradix_hisi/* for radix, just save ASDR */ andis. r0, r11, SRR1_ISI_NOPT@h beq 1f andi. r0, r11, MSR_IR /* instruction relocation enabled? 
*/ @@ -3217,15 +3157,16 @@ BEGIN_FTR_SECTION mtspr SPRN_DAWRX1, r0 END_FTR_SECTION_IFSET(CPU_FTR_DAWR1) - /* Clear hash and radix guest SLB. */ + /* Clear guest SLB. */ slbmte r0, r0 PPC_SLBIA(6) + ptesync BEGIN_MMU_FTR_SECTION b 4f END_MMU_FTR_SECTION_IFSET(MMU_FTR_TYPE_RADIX) - ptesync + /* load host SLB entries */ ld r8, PACA_SLBSHADOWPTR(r13) .rept SLB_NUM_BOLTED li r3, SLBSHADOW_SAVEAREA -- 2.23.0
[PATCH v2 31/37] KVM: PPC: Book3S HV: Remove support for dependent threads mode on P9
Radix guest support will be removed from the P7/8 path, so disallow dependent threads mode on P9. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/kvm_host.h | 1 - arch/powerpc/kvm/book3s_hv.c| 27 +-- 2 files changed, 5 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 05fb00d37609..dd017dfa4e65 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -304,7 +304,6 @@ struct kvm_arch { u8 fwnmi_enabled; u8 secure_guest; u8 svm_enabled; - bool threads_indep; bool nested_enable; bool dawr1_enabled; pgd_t *pgtable; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index c3064075f1d7..1f27187ff1e7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -103,13 +103,9 @@ static int target_smt_mode; module_param(target_smt_mode, int, 0644); MODULE_PARM_DESC(target_smt_mode, "Target threads per core (0 = max)"); -static bool indep_threads_mode = true; -module_param(indep_threads_mode, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(indep_threads_mode, "Independent-threads mode (only on POWER9)"); - static bool one_vm_per_core; module_param(one_vm_per_core, bool, S_IRUGO | S_IWUSR); -MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires indep_threads_mode=N)"); +MODULE_PARM_DESC(one_vm_per_core, "Only run vCPUs from the same VM on a core (requires POWER8 or older)"); #ifdef CONFIG_KVM_XICS static const struct kernel_param_ops module_param_ops = { @@ -2201,7 +2197,7 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, */ static int threads_per_vcore(struct kvm *kvm) { - if (kvm->arch.threads_indep) + if (cpu_has_feature(CPU_FTR_ARCH_300)) return 1; return threads_per_subcore; } @@ -4290,7 +4286,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu * path, which also handles hash and dependent threads mode. */ - if (kvm->arch.threads_indep && kvm_is_radix(kvm)) + if (kvm_is_radix(kvm)) r = kvmhv_run_single_vcpu(vcpu, ~(u64)0, vcpu->arch.vcore->lpcr); else @@ -4910,21 +4906,8 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) /* * Track that we now have a HV mode VM active. This blocks secondary * CPU threads from coming online. -* On POWER9, we only need to do this if the "indep_threads_mode" -* module parameter has been set to N. */ - if (cpu_has_feature(CPU_FTR_ARCH_300)) { - if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) { - pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n"); - kvm->arch.threads_indep = true; - } else if (!indep_threads_mode && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { - pr_warn("KVM: Ignoring indep_threads_mode=N on pre-DD2.2 POWER9\n"); - kvm->arch.threads_indep = true; - } else { - kvm->arch.threads_indep = indep_threads_mode; - } - } - if (!kvm->arch.threads_indep) + if (!cpu_has_feature(CPU_FTR_ARCH_300)) kvm_hv_vm_activated(); /* @@ -4965,7 +4948,7 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) { debugfs_remove_recursive(kvm->arch.debugfs_dir); - if (!kvm->arch.threads_indep) + if (!cpu_has_feature(CPU_FTR_ARCH_300)) kvm_hv_vm_deactivated(); kvmppc_free_vcores(kvm); -- 2.23.0
[PATCH v2 30/37] KVM: PPC: Book3S HV: Implement radix prefetch workaround by disabling MMU
Rather than partition the guest PID space and catch and flush a rogue guest, instead work around this issue by ensuring the MMU is always disabled in HV mode while the guest MMU context is switched in. This may be a bit less efficient, but it is a lot less complicated and allows the P9 path to trivially implement the workaround too. Newer CPUs are not subject to this issue. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/mmu_context.h | 6 arch/powerpc/kvm/book3s_hv.c | 10 -- arch/powerpc/kvm/book3s_hv_interrupt.c | 14 ++-- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 34 -- arch/powerpc/mm/book3s64/radix_pgtable.c | 27 +- arch/powerpc/mm/book3s64/radix_tlb.c | 46 arch/powerpc/mm/mmu_context.c| 4 +-- 7 files changed, 28 insertions(+), 113 deletions(-) diff --git a/arch/powerpc/include/asm/mmu_context.h b/arch/powerpc/include/asm/mmu_context.h index 652ce85f9410..bb5c7e5e142e 100644 --- a/arch/powerpc/include/asm/mmu_context.h +++ b/arch/powerpc/include/asm/mmu_context.h @@ -122,12 +122,6 @@ static inline bool need_extra_context(struct mm_struct *mm, unsigned long ea) } #endif -#if defined(CONFIG_KVM_BOOK3S_HV_POSSIBLE) && defined(CONFIG_PPC_RADIX_MMU) -extern void radix_kvm_prefetch_workaround(struct mm_struct *mm); -#else -static inline void radix_kvm_prefetch_workaround(struct mm_struct *mm) { } -#endif - extern void switch_cop(struct mm_struct *next); extern int use_cop(unsigned long acop, struct mm_struct *mm); extern void drop_cop(unsigned long acop, struct mm_struct *mm); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index ad16331c3370..c3064075f1d7 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -806,6 +806,10 @@ static int kvmppc_h_set_mode(struct kvm_vcpu *vcpu, unsigned long mflags, /* KVM does not support mflags=2 (AIL=2) */ if (mflags != 0 && mflags != 3) return H_UNSUPPORTED_FLAG_START; + /* Prefetch bug */ + if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG) && + kvmhv_vcpu_is_radix(vcpu) && mflags == 3) + return H_UNSUPPORTED_FLAG_START; return H_TOO_HARD; default: return H_TOO_HARD; @@ -4286,8 +4290,7 @@ static int kvmppc_vcpu_run_hv(struct kvm_vcpu *vcpu) * The TLB prefetch bug fixup is only in the kvmppc_run_vcpu * path, which also handles hash and dependent threads mode.
*/ - if (kvm->arch.threads_indep && kvm_is_radix(kvm) && - !cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) + if (kvm->arch.threads_indep && kvm_is_radix(kvm)) r = kvmhv_run_single_vcpu(vcpu, ~(u64)0, vcpu->arch.vcore->lpcr); else @@ -4914,6 +4917,9 @@ static int kvmppc_core_init_vm_hv(struct kvm *kvm) if (!indep_threads_mode && !cpu_has_feature(CPU_FTR_HVMODE)) { pr_warn("KVM: Ignoring indep_threads_mode=N in nested hypervisor\n"); kvm->arch.threads_indep = true; + } else if (!indep_threads_mode && cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) { + pr_warn("KVM: Ignoring indep_threads_mode=N on pre-DD2.2 POWER9\n"); + kvm->arch.threads_indep = true; } else { kvm->arch.threads_indep = indep_threads_mode; } diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c index b93d861d8538..9784da3f8565 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupt.c +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c @@ -223,6 +223,9 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc mtspr(SPRN_AMOR, ~0UL); + if (cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) + __mtmsrd(msr & ~(MSR_IR|MSR_DR|MSR_RI), 0); + switch_mmu_to_guest_radix(kvm, vcpu, lpcr); /* @@ -231,7 +234,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc */ mtspr(SPRN_HDEC, hdec); - __mtmsrd(0, 1); /* clear RI */ + if (!cpu_has_feature(CPU_FTR_P9_RADIX_PREFETCH_BUG)) + __mtmsrd(0, 1); /* clear RI */ mtspr(SPRN_DAR, vcpu->arch.shregs.dar); mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); @@ -338,8 +342,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc radix_clear_slb(); - __mtmsrd(msr, 0); - accumulate_time(vcpu, &vcpu->arch.rm_exit); /* Advance host PURR/SPURR by the amount used by guest */ @@ -406,6 +408,12 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned
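For readers following the series, the MSR ordering this patch enforces around the guest MMU switch can be modelled outside the kernel. In the sketch below every name (set_msr, switch_mmu_to_guest, has_prefetch_bug) is a stand-in for the real primitives in kvmhv_vcpu_entry_p9(); only the ordering is the point.

/* Compilable model of the workaround's ordering; not kernel code. */
#define MSR_IR 0x20UL   /* instruction relocate */
#define MSR_DR 0x10UL   /* data relocate */
#define MSR_RI 0x02UL   /* recoverable interrupt */

static int has_prefetch_bug;    /* stands in for CPU_FTR_P9_RADIX_PREFETCH_BUG */

static void set_msr(unsigned long m) { (void)m; /* models __mtmsrd */ }
static void switch_mmu_to_guest(void) { /* models the LPIDR/LPCR/PID switch */ }

static void guest_entry_mmu_sequence(unsigned long host_msr)
{
    if (has_prefetch_bug)
        /* MMU fully off (IR/DR clear) before the guest LPID becomes
         * visible, so no prefetch can install a translation under a
         * mismatched LPID. */
        set_msr(host_msr & ~(MSR_IR | MSR_DR | MSR_RI));

    switch_mmu_to_guest();

    if (!has_prefetch_bug)
        set_msr(host_msr & ~MSR_RI);    /* only RI must be clear here */
}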
[PATCH v2 29/37] KVM: PPC: Book3S HV P9: Switch to guest MMU context as late as possible
Move WARN_ON traps early so they are less likely to get tangled on CPU switching to guest. Move MMU context switch as late as reasonably possible to minimise code running with guest context switched in. This becomes more important when this code may run in real-mode, with later changes. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv_interrupt.c | 40 +- arch/powerpc/kvm/book3s_hv_nested.c| 1 + 2 files changed, 21 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c index dd0a78a69f49..b93d861d8538 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupt.c +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c @@ -143,8 +143,13 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc if (hdec < 0) return BOOK3S_INTERRUPT_HV_DECREMENTER; + WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV); + WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME)); + start_timing(vcpu, &vcpu->arch.rm_entry); + vcpu->arch.ceded = 0; + if (vc->tb_offset) { u64 new_tb = tb + vc->tb_offset; mtspr(SPRN_TBU40, new_tb); @@ -193,26 +198,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc mtspr(SPRN_HFSCR, vcpu->arch.hfscr); - mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); - mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); - mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2); - mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); - - mtspr(SPRN_AMOR, ~0UL); - - switch_mmu_to_guest_radix(kvm, vcpu, lpcr); - - /* -* P9 suppresses the HDEC exception when LPCR[HDICE] = 0, -* so set guest LPCR (with HDICE) before writing HDEC. -*/ - mtspr(SPRN_HDEC, hdec); - - vcpu->arch.ceded = 0; - - WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV); - WARN_ON_ONCE(!(vcpu->arch.shregs.msr & MSR_ME)); - mtspr(SPRN_HSRR0, vcpu->arch.regs.nip); mtspr(SPRN_HSRR1, (vcpu->arch.shregs.msr & ~MSR_HV) | MSR_ME); @@ -231,6 +216,21 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc */ mtspr(SPRN_HDSISR, HDSISR_CANARY); + mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); + mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); + mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2); + mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); + + mtspr(SPRN_AMOR, ~0UL); + + switch_mmu_to_guest_radix(kvm, vcpu, lpcr); + + /* +* P9 suppresses the HDEC exception when LPCR[HDICE] = 0, +* so set guest LPCR (with HDICE) before writing HDEC. +*/ + mtspr(SPRN_HDEC, hdec); + __mtmsrd(0, 1); /* clear RI */ mtspr(SPRN_DAR, vcpu->arch.shregs.dar); diff --git a/arch/powerpc/kvm/book3s_hv_nested.c b/arch/powerpc/kvm/book3s_hv_nested.c index 0cd0e7aad588..cdf3ee2145ab 100644 --- a/arch/powerpc/kvm/book3s_hv_nested.c +++ b/arch/powerpc/kvm/book3s_hv_nested.c @@ -323,6 +323,7 @@ long kvmhv_enter_nested_guest(struct kvm_vcpu *vcpu) vcpu->arch.shregs.msr = vcpu->arch.regs.msr; mask = LPCR_DPFD | LPCR_ILE | LPCR_TC | LPCR_AIL | LPCR_LD | LPCR_LPES | LPCR_MER; + /* XXX: set lpcr in sanitise hv regs? Why is it plumbed through? */ lpcr = (vc->lpcr & ~mask) | (l2_hv.lpcr & mask); sanitise_hv_regs(vcpu, &l2_hv); restore_hv_regs(vcpu, &l2_hv); -- 2.23.0
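The resulting shape of the entry path, reduced to stubs (all function names below are illustrative, not kernel APIs; the real code is kvmhv_vcpu_entry_p9() in book3s_hv_interrupt.c):

static void warn_on_bad_guest_msr(void) { }  /* the hoisted WARN_ONs */
static void load_guest_hv_sprs(void) { }     /* HFSCR, CIABR, DAWRs... */
static void load_guest_sprgs(void) { }       /* SPRG0-3, AMOR */
static void switch_mmu_to_guest(void) { }
static void write_hdec(void) { }

static void p9_entry_shape(void)
{
    warn_on_bad_guest_msr();   /* sanity checks first, while it is
                                  still cheap to back out */
    load_guest_hv_sprs();      /* still in host MMU context */
    load_guest_sprgs();
    switch_mmu_to_guest();     /* as late as possible... */
    write_hdec();              /* ...then HDEC, once the guest LPCR
                                  (with HDICE) is live */
}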
[PATCH v2 28/37] KVM: PPC: Book3S HV P9: Add helpers for OS SPR handling
This is a first step to wrapping supervisor and user SPR saving and loading up into helpers, which will then be called independently in bare metal and nested HV cases in order to optimise SPR access. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 131 ++- 1 file changed, 84 insertions(+), 47 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 94989fe2fdfe..ad16331c3370 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3442,6 +3442,84 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) trace_kvmppc_run_core(vc, 1); } +static void load_spr_state(struct kvm_vcpu *vcpu) +{ + mtspr(SPRN_DSCR, vcpu->arch.dscr); + mtspr(SPRN_IAMR, vcpu->arch.iamr); + mtspr(SPRN_PSPB, vcpu->arch.pspb); + mtspr(SPRN_FSCR, vcpu->arch.fscr); + mtspr(SPRN_TAR, vcpu->arch.tar); + mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); + mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); + mtspr(SPRN_BESCR, vcpu->arch.bescr); + mtspr(SPRN_WORT, vcpu->arch.wort); + mtspr(SPRN_TIDR, vcpu->arch.tid); + /* XXX: DAR, DSISR must be set with MSR[RI] clear (or hstate as appropriate) */ + mtspr(SPRN_AMR, vcpu->arch.amr); + mtspr(SPRN_UAMOR, vcpu->arch.uamor); + + if (!(vcpu->arch.ctrl & 1)) + mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); +} + +static void store_spr_state(struct kvm_vcpu *vcpu) +{ + vcpu->arch.ctrl = mfspr(SPRN_CTRLF); + + vcpu->arch.iamr = mfspr(SPRN_IAMR); + vcpu->arch.pspb = mfspr(SPRN_PSPB); + vcpu->arch.fscr = mfspr(SPRN_FSCR); + vcpu->arch.tar = mfspr(SPRN_TAR); + vcpu->arch.ebbhr = mfspr(SPRN_EBBHR); + vcpu->arch.ebbrr = mfspr(SPRN_EBBRR); + vcpu->arch.bescr = mfspr(SPRN_BESCR); + vcpu->arch.wort = mfspr(SPRN_WORT); + vcpu->arch.tid = mfspr(SPRN_TIDR); + vcpu->arch.amr = mfspr(SPRN_AMR); + vcpu->arch.uamor = mfspr(SPRN_UAMOR); + vcpu->arch.dscr = mfspr(SPRN_DSCR); +} + +/* + * Privileged (non-hypervisor) host registers to save. + */ +struct p9_host_os_sprs { + unsigned long dscr; + unsigned long tidr; + unsigned long iamr; + unsigned long amr; + unsigned long fscr; +}; + +static void save_p9_host_os_sprs(struct p9_host_os_sprs *host_os_sprs) +{ + host_os_sprs->dscr = mfspr(SPRN_DSCR); + host_os_sprs->tidr = mfspr(SPRN_TIDR); + host_os_sprs->iamr = mfspr(SPRN_IAMR); + host_os_sprs->amr = mfspr(SPRN_AMR); + host_os_sprs->fscr = mfspr(SPRN_FSCR); +} + +/* vcpu guest regs must already be saved */ +static void restore_p9_host_os_sprs(struct kvm_vcpu *vcpu, + struct p9_host_os_sprs *host_os_sprs) +{ + mtspr(SPRN_PSPB, 0); + mtspr(SPRN_WORT, 0); + mtspr(SPRN_UAMOR, 0); + mtspr(SPRN_PSPB, 0); + + mtspr(SPRN_DSCR, host_os_sprs->dscr); + mtspr(SPRN_TIDR, host_os_sprs->tidr); + mtspr(SPRN_IAMR, host_os_sprs->iamr); + + if (host_os_sprs->amr != vcpu->arch.amr) + mtspr(SPRN_AMR, host_os_sprs->amr); + + if (host_os_sprs->fscr != vcpu->arch.fscr) + mtspr(SPRN_FSCR, host_os_sprs->fscr); +} + /* * Virtual-mode guest entry for POWER9 and later when the host and * guest are both using the radix MMU. The LPIDR has already been set. 
@@ -3450,11 +3528,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) { struct kvmppc_vcore *vc = vcpu->arch.vcore; - unsigned long host_dscr = mfspr(SPRN_DSCR); - unsigned long host_tidr = mfspr(SPRN_TIDR); - unsigned long host_iamr = mfspr(SPRN_IAMR); - unsigned long host_amr = mfspr(SPRN_AMR); - unsigned long host_fscr = mfspr(SPRN_FSCR); + struct p9_host_os_sprs host_os_sprs; s64 dec; u64 tb, next_timer; int trap, save_pmu; @@ -3469,6 +3543,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vcpu->arch.ceded = 0; + save_p9_host_os_sprs(&host_os_sprs); + kvmhv_save_host_pmu(); /* saves it to PACA kvm_hstate */ kvmppc_subcore_enter_guest(); @@ -3496,22 +3572,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, #endif mtspr(SPRN_VRSAVE, vcpu->arch.vrsave); - mtspr(SPRN_DSCR, vcpu->arch.dscr); - mtspr(SPRN_IAMR, vcpu->arch.iamr); - mtspr(SPRN_PSPB, vcpu->arch.pspb); - mtspr(SPRN_FSCR, vcpu->arch.fscr); - mtspr(SPRN_TAR, vcpu->arch.tar); - mtspr(SPRN_EBBHR, vcpu->arch.ebbhr); - mtspr(SPRN_EBBRR, vcpu->arch.ebbrr); - mtspr(SPRN_BESCR, vcpu->arch.bescr); - mtspr(SPRN_WORT, vcpu->arch.wort); - mtspr(SPRN_TIDR, vcpu->arch.tid); - /* XXX: DAR, DSISR must be set with MSR[RI] clear (or hstate as appropriate) */ - mtspr(SPRN_AMR, vcpu->arch.amr); -
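How the four helpers pair up around a guest run, roughly. In this sketch run_guest() is a placeholder for the entry path; the other names are the patch's own:

struct kvm_vcpu;           /* opaque in this sketch */
struct p9_host_os_sprs;    /* defined in the patch */

extern void save_p9_host_os_sprs(struct p9_host_os_sprs *s);
extern void load_spr_state(struct kvm_vcpu *v);
extern void store_spr_state(struct kvm_vcpu *v);
extern void restore_p9_host_os_sprs(struct kvm_vcpu *v,
                                    struct p9_host_os_sprs *s);
extern int run_guest(struct kvm_vcpu *v);   /* placeholder */

int guest_run_with_sprs(struct kvm_vcpu *vcpu, struct p9_host_os_sprs *host)
{
    int trap;

    save_p9_host_os_sprs(host);          /* host DSCR/TIDR/IAMR/AMR/FSCR */
    load_spr_state(vcpu);                /* guest OS-level SPRs in */
    trap = run_guest(vcpu);
    store_spr_state(vcpu);               /* guest values back to the vcpu */
    restore_p9_host_os_sprs(vcpu, host); /* host values, skipping writes
                                            that would be no-ops */
    return trap;
}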
[PATCH v2 27/37] KVM: PPC: Book3S HV P9: Move SPR loading after expiry time check
This is wasted work if the time limit is exceeded. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv_interrupt.c | 38 -- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c index 4a158c8fc0bc..dd0a78a69f49 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupt.c +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c @@ -126,22 +126,17 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc s64 hdec; u64 tb, purr, spurr; u64 *exsave; - bool ri_clear; - unsigned long msr = mfmsr(); int trap; - unsigned long host_hfscr = mfspr(SPRN_HFSCR); - unsigned long host_ciabr = mfspr(SPRN_CIABR); - unsigned long host_dawr0 = mfspr(SPRN_DAWR0); - unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0); - unsigned long host_psscr = mfspr(SPRN_PSSCR); - unsigned long host_pidr = mfspr(SPRN_PID); - unsigned long host_dawr1 = 0; - unsigned long host_dawrx1 = 0; - - if (cpu_has_feature(CPU_FTR_DAWR1)) { - host_dawr1 = mfspr(SPRN_DAWR1); - host_dawrx1 = mfspr(SPRN_DAWRX1); - } + bool ri_clear; + unsigned long msr; + unsigned long host_hfscr; + unsigned long host_ciabr; + unsigned long host_dawr0; + unsigned long host_dawrx0; + unsigned long host_psscr; + unsigned long host_pidr; + unsigned long host_dawr1; + unsigned long host_dawrx1; tb = mftb(); hdec = time_limit - tb; @@ -159,6 +154,19 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc vc->tb_offset_applied = vc->tb_offset; } + msr = mfmsr(); + + host_hfscr = mfspr(SPRN_HFSCR); + host_ciabr = mfspr(SPRN_CIABR); + host_dawr0 = mfspr(SPRN_DAWR0); + host_dawrx0 = mfspr(SPRN_DAWRX0); + host_psscr = mfspr(SPRN_PSSCR); + host_pidr = mfspr(SPRN_PID); + if (cpu_has_feature(CPU_FTR_DAWR1)) { + host_dawr1 = mfspr(SPRN_DAWR1); + host_dawrx1 = mfspr(SPRN_DAWRX1); + } + if (vc->pcr) mtspr(SPRN_PCR, vc->pcr | PCR_MASK); mtspr(SPRN_DPDES, vc->dpdes); -- 2.23.0
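The pattern in miniature (stubs, not kernel code): the cheap expiry test now runs before any of the serialising mfspr traffic.

typedef unsigned long long u64;

static u64 mftb(void) { return 0; }          /* models the timebase read */
static void snapshot_host_sprs(void) { }     /* the relocated mfspr block */

static int enter_guest_checked(u64 time_limit)
{
    u64 tb = mftb();
    long long hdec = (long long)(time_limit - tb);

    if (hdec < 0)
        return -1;           /* HV decrementer expired: bail before
                                any SPR work has been done */

    snapshot_host_sprs();    /* only now read HFSCR, CIABR, DAWRs,
                                PSSCR, PID for later restore */
    return 0;
}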
[PATCH v2 26/37] KVM: PPC: Book3S HV P9: Improve exit timing accounting coverage
The C conversion caused exit timing to become a bit cramped. Expand it to cover more of the entry and exit code. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv_interrupt.c | 8 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c index f5fef7398e37..4a158c8fc0bc 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupt.c +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c @@ -148,6 +148,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc if (hdec < 0) return BOOK3S_INTERRUPT_HV_DECREMENTER; + start_timing(vcpu, &vcpu->arch.rm_entry); + if (vc->tb_offset) { u64 new_tb = tb + vc->tb_offset; mtspr(SPRN_TBU40, new_tb); @@ -198,8 +200,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc */ mtspr(SPRN_HDEC, hdec); - start_timing(vcpu, &vcpu->arch.rm_entry); - vcpu->arch.ceded = 0; WARN_ON_ONCE(vcpu->arch.shregs.msr & MSR_HV); @@ -334,8 +334,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc accumulate_time(vcpu, &vcpu->arch.rm_exit); - end_timing(vcpu); - /* Advance host PURR/SPURR by the amount used by guest */ purr = mfspr(SPRN_PURR); spurr = mfspr(SPRN_SPURR); @@ -400,6 +398,8 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc switch_mmu_to_host_radix(kvm, host_pidr); + end_timing(vcpu); + return trap; } EXPORT_SYMBOL_GPL(kvmhv_vcpu_entry_p9); -- 2.23.0
[PATCH v2 25/37] KVM: PPC: Book3S HV P9: Read machine check registers while MSR[RI] is 0
SRR0/1, DAR, DSISR must all be protected from machine check which can clobber them. Ensure MSR[RI] is clear while they are live. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 5 +++-- arch/powerpc/kvm/book3s_hv_interrupt.c | 26 +++--- arch/powerpc/kvm/book3s_hv_ras.c | 5 + 3 files changed, 31 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index f99503acdda5..94989fe2fdfe 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3506,8 +3506,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_BESCR, vcpu->arch.bescr); mtspr(SPRN_WORT, vcpu->arch.wort); mtspr(SPRN_TIDR, vcpu->arch.tid); - mtspr(SPRN_DAR, vcpu->arch.shregs.dar); - mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); + /* XXX: DAR, DSISR must be set with MSR[RI] clear (or hstate as appropriate) */ mtspr(SPRN_AMR, vcpu->arch.amr); mtspr(SPRN_UAMOR, vcpu->arch.uamor); @@ -3553,6 +3552,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, hvregs.vcpu_token = vcpu->vcpu_id; } hvregs.hdec_expiry = time_limit; + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); + mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); trap = plpar_hcall_norets(H_ENTER_NESTED, __pa(&hvregs), __pa(&vcpu->arch.regs)); kvmhv_restore_hv_return_state(vcpu, &hvregs); diff --git a/arch/powerpc/kvm/book3s_hv_interrupt.c b/arch/powerpc/kvm/book3s_hv_interrupt.c index dea3eca3648a..f5fef7398e37 100644 --- a/arch/powerpc/kvm/book3s_hv_interrupt.c +++ b/arch/powerpc/kvm/book3s_hv_interrupt.c @@ -126,6 +126,7 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc s64 hdec; u64 tb, purr, spurr; u64 *exsave; + bool ri_clear; unsigned long msr = mfmsr(); int trap; unsigned long host_hfscr = mfspr(SPRN_HFSCR); @@ -197,9 +198,6 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc */ mtspr(SPRN_HDEC, hdec); - mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); - mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); - start_timing(vcpu, &vcpu->arch.rm_entry); vcpu->arch.ceded = 0; @@ -225,6 +223,13 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc */ mtspr(SPRN_HDSISR, HDSISR_CANARY); + __mtmsrd(0, 1); /* clear RI */ + + mtspr(SPRN_DAR, vcpu->arch.shregs.dar); + mtspr(SPRN_DSISR, vcpu->arch.shregs.dsisr); + mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); + mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); + accumulate_time(vcpu, &vcpu->arch.guest_time); local_paca->kvm_hstate.in_guest = KVM_GUEST_MODE_GUEST_HV_FAST; @@ -240,6 +245,13 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc vcpu->arch.shregs.dar = mfspr(SPRN_DAR); vcpu->arch.shregs.dsisr = mfspr(SPRN_DSISR); + /* HSRR interrupts leave MSR[RI] unchanged, SRR interrupts clear it. 
*/ + if ((local_paca->kvm_hstate.scratch0 & 0x2) && + (vcpu->arch.shregs.msr & MSR_RI)) + ri_clear = false; + else + ri_clear = true; + trap = local_paca->kvm_hstate.scratch0 & ~0x2; if (likely(trap > BOOK3S_INTERRUPT_MACHINE_CHECK)) { exsave = local_paca->exgen; @@ -251,6 +263,14 @@ int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpc vcpu->arch.regs.gpr[1] = local_paca->kvm_hstate.scratch1; vcpu->arch.regs.gpr[3] = local_paca->kvm_hstate.scratch2; + + if (ri_clear) { +/// XXX this fires maybe on syscalls on mambo WARN_ON((mfmsr() & MSR_RI)); + __mtmsrd(MSR_RI, 1); /* set RI after reading machine check regs (DAR, DSISR, SRR0/1) and hstate scratch (which we need to move into exsave) */ + } else { + WARN_ON(!(mfmsr() & MSR_RI)); + } + vcpu->arch.regs.gpr[9] = exsave[EX_R9/sizeof(u64)]; vcpu->arch.regs.gpr[10] = exsave[EX_R10/sizeof(u64)]; vcpu->arch.regs.gpr[11] = exsave[EX_R11/sizeof(u64)]; diff --git a/arch/powerpc/kvm/book3s_hv_ras.c b/arch/powerpc/kvm/book3s_hv_ras.c index d4bca93b79f6..7a645f4428c2 100644 --- a/arch/powerpc/kvm/book3s_hv_ras.c +++ b/arch/powerpc/kvm/book3s_hv_ras.c @@ -198,6 +198,7 @@ static void kvmppc_tb_resync_done(void) * value. Hence the idea is to resync the TB on every HMI, so that we * know about the exact state of the TB value. Resync TB call will * restore TB to host timebase. + * XXX: could use new opal hmi handler flags for this * * Things to consider: * - On TB error, HMI interrupt is reported on all the thread
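The RI choreography, reduced to stubs and ignoring the HSRR-vs-SRR distinction the patch tracks in ri_clear (all names below are illustrative):

static void mtmsrd_ri(int ri) { (void)ri; }  /* models __mtmsrd(x, 1) */
static void write_srr_dar_dsisr(void) { }    /* guest values in */
static void read_srr_dar_dsisr(void) { }     /* guest values out */
static void drain_scratch_to_exsave(void) { }

static void ri_protected_window(void)
{
    mtmsrd_ri(0);               /* a machine check arriving now is
                                   fatal, but cannot silently corrupt
                                   the live SRR0/1, DAR, DSISR */
    write_srr_dar_dsisr();
    /* ... guest entry, guest run, interrupt back to host ... */
    read_srr_dar_dsisr();       /* still under RI=0 protection */
    drain_scratch_to_exsave();
    mtmsrd_ri(1);               /* re-arm recoverability only once
                                   nothing live can be clobbered */
}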
[PATCH v2 24/37] KVM: PPC: Book3S HV P9: inline kvmhv_load_hv_regs_and_go into __kvmhv_vcpu_entry_p9
Now the initial C implementation is done, inline more HV code to make rearranging things easier. And rename __kvmhv_vcpu_entry_p9 to drop the leading underscores as it's now C, and is now a more complete vcpu entry. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/kvm_book3s_64.h | 2 +- arch/powerpc/kvm/book3s_hv.c | 181 +-- arch/powerpc/kvm/book3s_hv_interrupt.c | 168 - 3 files changed, 169 insertions(+), 182 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index c214bcffb441..eaf3a562bf1e 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -153,7 +153,7 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu) return radix; } -int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu); +int kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr); #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 28a2761515e3..f99503acdda5 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3442,183 +3442,6 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) trace_kvmppc_run_core(vc, 1); } -static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr) -{ - struct kvmppc_vcore *vc = vcpu->arch.vcore; - struct kvm_nested_guest *nested = vcpu->arch.nested; - u32 lpid; - - lpid = nested ? nested->shadow_lpid : kvm->arch.lpid; - - mtspr(SPRN_LPID, lpid); - mtspr(SPRN_LPCR, lpcr); - mtspr(SPRN_PID, vcpu->arch.pid); - isync(); - - /* TLBIEL must have LPIDR set, so set guest LPID before flushing. */ - kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested); -} - -static void switch_mmu_to_host_radix(struct kvm *kvm, u32 pid) -{ - mtspr(SPRN_PID, pid); - mtspr(SPRN_LPID, kvm->arch.host_lpid); - mtspr(SPRN_LPCR, kvm->arch.host_lpcr); - isync(); -} - -/* - * Load up hypervisor-mode registers on P9. 
- */ -static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, -unsigned long lpcr) -{ - struct kvm *kvm = vcpu->kvm; - struct kvmppc_vcore *vc = vcpu->arch.vcore; - s64 hdec; - u64 tb, purr, spurr; - int trap; - unsigned long host_hfscr = mfspr(SPRN_HFSCR); - unsigned long host_ciabr = mfspr(SPRN_CIABR); - unsigned long host_dawr0 = mfspr(SPRN_DAWR0); - unsigned long host_dawrx0 = mfspr(SPRN_DAWRX0); - unsigned long host_psscr = mfspr(SPRN_PSSCR); - unsigned long host_pidr = mfspr(SPRN_PID); - unsigned long host_dawr1 = 0; - unsigned long host_dawrx1 = 0; - - if (cpu_has_feature(CPU_FTR_DAWR1)) { - host_dawr1 = mfspr(SPRN_DAWR1); - host_dawrx1 = mfspr(SPRN_DAWRX1); - } - - tb = mftb(); - hdec = time_limit - tb; - if (hdec < 0) - return BOOK3S_INTERRUPT_HV_DECREMENTER; - - if (vc->tb_offset) { - u64 new_tb = tb + vc->tb_offset; - mtspr(SPRN_TBU40, new_tb); - tb = mftb(); - if ((tb & 0xff) < (new_tb & 0xff)) - mtspr(SPRN_TBU40, new_tb + 0x100); - vc->tb_offset_applied = vc->tb_offset; - } - - if (vc->pcr) - mtspr(SPRN_PCR, vc->pcr | PCR_MASK); - mtspr(SPRN_DPDES, vc->dpdes); - mtspr(SPRN_VTB, vc->vtb); - - local_paca->kvm_hstate.host_purr = mfspr(SPRN_PURR); - local_paca->kvm_hstate.host_spurr = mfspr(SPRN_SPURR); - mtspr(SPRN_PURR, vcpu->arch.purr); - mtspr(SPRN_SPURR, vcpu->arch.spurr); - - if (dawr_enabled()) { - mtspr(SPRN_DAWR0, vcpu->arch.dawr0); - mtspr(SPRN_DAWRX0, vcpu->arch.dawrx0); - if (cpu_has_feature(CPU_FTR_DAWR1)) { - mtspr(SPRN_DAWR1, vcpu->arch.dawr1); - mtspr(SPRN_DAWRX1, vcpu->arch.dawrx1); - } - } - mtspr(SPRN_CIABR, vcpu->arch.ciabr); - mtspr(SPRN_IC, vcpu->arch.ic); - - mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | - (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); - - mtspr(SPRN_HFSCR, vcpu->arch.hfscr); - - mtspr(SPRN_SPRG0, vcpu->arch.shregs.sprg0); - mtspr(SPRN_SPRG1, vcpu->arch.shregs.sprg1); - mtspr(SPRN_SPRG2, vcpu->arch.shregs.sprg2); - mtspr(SPRN_SPRG3, vcpu->arch.shregs.sprg3); - - mtspr(SPRN_AMOR, ~0UL); - - switch_mmu_to_guest_radix(kvm, vcpu, lpcr); - - /* -* P9 suppresses the HDEC exception when LPCR[HDICE] = 0, -* so set guest LPCR (with HDICE) before writing HDEC. -*/ - mtspr(SPRN_HDEC, hdec); - - mtspr(SPRN_SRR0,
[PATCH v2 23/37] KVM: PPC: Book3S HV P9: Implement the rest of the P9 path in C
Almost all logic is moved to C, by introducing a new in_guest mode that selects and branches very early in the interrupt handler to the P9 exit code. The remaining assembly is only about 160 lines of low level stack setup, with VCPU vs host register save and restore, plus a small shim to the legacy paths in the interrupt handler. There are two motivations for this: the first is just to make the code more maintainable by being in C. The second is to reduce the amount of code running in a special KVM mode, "realmode". I put that in quotes because with radix it is no longer necessarily real-mode in the MMU, but it still has to be treated specially because it may be in real-mode, and has various important registers like PID, DEC, TB, etc. set to guest. This is hostile to the rest of Linux and can't use arbitrary kernel functionality or be instrumented well. This initial patch is a reasonably faithful conversion of the asm code. It does lack any loop to return quickly back into the guest without switching out of realmode in the case of unimportant or easily handled interrupts; as explained in the previous change, handling HV interrupts in real mode is not so important for P9. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/asm-prototypes.h | 3 +- arch/powerpc/include/asm/kvm_asm.h| 3 +- arch/powerpc/include/asm/kvm_book3s_64.h | 8 + arch/powerpc/kernel/security.c| 5 +- arch/powerpc/kvm/Makefile | 3 + arch/powerpc/kvm/book3s_64_entry.S| 181 ++ arch/powerpc/kvm/book3s_hv.c | 27 ++- arch/powerpc/kvm/book3s_hv_interrupt.c| 221 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 123 +--- arch/powerpc/kvm/book3s_xive.c| 34 10 files changed, 480 insertions(+), 128 deletions(-) create mode 100644 arch/powerpc/kvm/book3s_hv_interrupt.c diff --git a/arch/powerpc/include/asm/asm-prototypes.h b/arch/powerpc/include/asm/asm-prototypes.h index 939f3c94c8f3..8677d27929fe 100644 --- a/arch/powerpc/include/asm/asm-prototypes.h +++ b/arch/powerpc/include/asm/asm-prototypes.h @@ -122,6 +122,7 @@ extern s32 patch__call_flush_branch_caches3; extern s32 patch__flush_count_cache_return; extern s32 patch__flush_link_stack_return; extern s32 patch__call_kvm_flush_link_stack; +extern s32 patch__call_kvm_flush_link_stack_2; extern s32 patch__memset_nocache, patch__memcpy_nocache; extern long flush_branch_caches; @@ -142,7 +143,7 @@ void kvmhv_load_host_pmu(void); void kvmhv_save_guest_pmu(struct kvm_vcpu *vcpu, bool pmu_in_use); void kvmhv_load_guest_pmu(struct kvm_vcpu *vcpu); -int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu); +void kvmppc_p9_enter_guest(struct kvm_vcpu *vcpu); long kvmppc_h_set_dabr(struct kvm_vcpu *vcpu, unsigned long dabr); long kvmppc_h_set_xdabr(struct kvm_vcpu *vcpu, unsigned long dabr, diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index a3633560493b..b4f9996bd331 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -146,7 +146,8 @@ #define KVM_GUEST_MODE_GUEST 1 #define KVM_GUEST_MODE_SKIP2 #define KVM_GUEST_MODE_GUEST_HV3 -#define KVM_GUEST_MODE_HOST_HV 4 +#define KVM_GUEST_MODE_GUEST_HV_FAST 4 /* ISA v3.0 with host radix mode */ +#define KVM_GUEST_MODE_HOST_HV 5 #define KVM_INST_FETCH_FAILED -1 diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h index 9bb9bb370b53..c214bcffb441 100644 --- a/arch/powerpc/include/asm/kvm_book3s_64.h +++ b/arch/powerpc/include/asm/kvm_book3s_64.h @@ -153,9 +153,17 @@ static inline bool kvmhv_vcpu_is_radix(struct kvm_vcpu *vcpu) return radix; }
+int __kvmhv_vcpu_entry_p9(struct kvm_vcpu *vcpu); + #define KVM_DEFAULT_HPT_ORDER 24 /* 16MB HPT by default */ #endif +/* + * Invalid HDSISR value which is used to indicate when HW has not set the reg. + * Used to work around an errata. + */ +#define HDSISR_CANARY 0x7fff + /* * We use a lock bit in HPTE dword 0 to synchronize updates and * accesses to each HPTE, and another bit to indicate non-present diff --git a/arch/powerpc/kernel/security.c b/arch/powerpc/kernel/security.c index e4e1a94ccf6a..6c37aeed0650 100644 --- a/arch/powerpc/kernel/security.c +++ b/arch/powerpc/kernel/security.c @@ -430,16 +430,19 @@ device_initcall(stf_barrier_debugfs_init); static void update_branch_cache_flush(void) { - u32 *site; + u32 *site, __maybe_unused *site2; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE site = &patch__call_kvm_flush_link_stack; + site2 = &patch__call_kvm_flush_link_stack_2; // This controls the branch from guest_exit_cont to kvm_flush_link_stack if (link_stack_flush_type == BRANCH_CACHE_FLUSH_NONE) { patch_instruction_site(site, ppc_inst(PPC_INST_NOP)); + patch_instruction_site(site2, ppc_inst(PPC_INST_NOP)); } else {
[PATCH v2 22/37] KVM: PPC: Book3S HV P9: Reduce irq_work vs guest decrementer races
irq_work's use of the DEC SPR is racy with guest<->host switch and guest entry which flips the DEC interrupt to guest, which could lose a host work interrupt. This patch closes one race, and attempts to comment several others. (XXX: should think a bit harder about this) Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/paca.h | 1 + arch/powerpc/kvm/book3s_hv.c| 15 ++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h index ec18ac818e3a..23c12048fbc9 100644 --- a/arch/powerpc/include/asm/paca.h +++ b/arch/powerpc/include/asm/paca.h @@ -174,6 +174,7 @@ struct paca_struct { u8 irq_happened;/* irq happened while soft-disabled */ u8 irq_work_pending;/* IRQ_WORK interrupt while soft-disable */ #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + /* Could have irq_work_using_hdec here, but what about nested HV entry modifying DEC? Could have a pointer to the hv struct time limit */ u8 pmcregs_in_use; /* pseries puts this in lppaca */ #endif u64 sprg_vdso; /* Saved user-visible sprg */ diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d98958b78830..1997cf347d3e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3689,6 +3689,18 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, if (!(vcpu->arch.ctrl & 1)) mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); + /* +* XXX: must always deal with irq_work_raise via NMI vs setting DEC. +* The problem occurs right as we switch into guest mode if a NMI +* hits and sets pending work and sets DEC, then that will apply to +* the guest and not bring us back to the host. +* +* irq_work_raise could check a flag (or possibly LPCR[HDICE] for +* example) and set HDEC to 1? That wouldn't solve the nested hv +* case which needs to abort the hcall or zero the time limit. +* +* Another day's problem. +*/ mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb); if (kvmhv_on_pseries()) { @@ -3822,7 +3834,8 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->entry_exit_map = 0x101; vc->in_guest = 0; - mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - tb); + set_dec_or_work(local_paca->kvm_hstate.dec_expires - tb); + mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); kvmhv_load_host_pmu(); -- 2.23.0
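For reference, the unsolved entry-side window described above, as a timeline (a sketch of the hazard, not of a fix):

/*
 *   host:  computes DEC value for the guest
 *   NMI:   irq_work_raise() -> set_dec(1), intended for the *host*
 *   host:  mtspr(SPRN_DEC, guest_value)  <- host work request lost
 *   ...the guest now owns DEC until the next exit...
 *
 * Closing this fully needs cooperation from irq_work_raise() itself
 * (see the XXX comments in the hunk above); the exit side is already
 * covered by switching to set_dec_or_work().
 */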
[PATCH v2 21/37] powerpc: add set_dec_or_work API for safely updating decrementer
Decrementer updates must always check for new irq work to avoid an irq work decrementer interrupt being lost. Add an API for this in the timer code so callers don't have to care about details. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/time.h | 9 + arch/powerpc/kernel/time.c | 20 +++- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 0128cd9769bc..d62bde57bf02 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -78,6 +78,15 @@ static inline void set_dec(u64 val) mtspr(SPRN_DEC, val - 1); } +#ifdef CONFIG_IRQ_WORK +void set_dec_or_work(u64 val); +#else +static inline void set_dec_or_work(u64 val) +{ + set_dec(val); +} +#endif + static inline unsigned long tb_ticks_since(unsigned long tstamp) { return mftb() - tstamp; diff --git a/arch/powerpc/kernel/time.c b/arch/powerpc/kernel/time.c index b67d93a609a2..e35156858e6e 100644 --- a/arch/powerpc/kernel/time.c +++ b/arch/powerpc/kernel/time.c @@ -561,6 +561,15 @@ void arch_irq_work_raise(void) preempt_enable(); } +void set_dec_or_work(u64 val) +{ + set_dec(val); + /* We may have raced with new irq work */ + if (unlikely(test_irq_work_pending())) + set_dec(1); +} +EXPORT_SYMBOL_GPL(set_dec_or_work); + #else /* CONFIG_IRQ_WORK */ #define test_irq_work_pending()0 @@ -628,10 +637,7 @@ DEFINE_INTERRUPT_HANDLER_ASYNC(timer_interrupt) } else { now = *next_tb - now; if (now <= decrementer_max) - set_dec(now); - /* We may have raced with new irq work */ - if (test_irq_work_pending()) - set_dec(1); + set_dec_or_work(now); __this_cpu_inc(irq_stat.timer_irqs_others); } @@ -873,11 +879,7 @@ static int decrementer_set_next_event(unsigned long evt, struct clock_event_device *dev) { __this_cpu_write(decrementers_next_tb, get_tb() + evt); - set_dec(evt); - - /* We may have raced with new irq work */ - if (test_irq_work_pending()) - set_dec(1); + set_dec_or_work(evt); return 0; } -- 2.23.0
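The race being closed is easy to model in userspace. A minimal sketch, with C11 atomics standing in for the per-CPU flag and the DEC register:

#include <stdatomic.h>

static atomic_int irq_work_pending;   /* models the per-CPU flag */
static atomic_long dec_reg;           /* models SPRN_DEC */

static void set_dec(long val)
{
    atomic_store(&dec_reg, val);
}

static void set_dec_or_work(long val)
{
    set_dec(val);
    /* An irq_work raised between the caller's decision and the store
     * above would otherwise sleep until 'val' expires; the re-check
     * converts it into a near-immediate interrupt. */
    if (atomic_load(&irq_work_pending))
        set_dec(1);
}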
[PATCH v2 20/37] KVM: PPC: Book3S HV P9: Reduce mftb per guest entry/exit
mftb is serialising (dispatch next-to-complete), so it is heavyweight for an mfspr. Avoid reading it multiple times in the entry or exit paths. A small number of cycles' delay to timers is tolerable. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 735ec40ece86..d98958b78830 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3689,7 +3689,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, if (!(vcpu->arch.ctrl & 1)) mtspr(SPRN_CTRLT, mfspr(SPRN_CTRLF) & ~1); - mtspr(SPRN_DEC, vcpu->arch.dec_expires - mftb()); + mtspr(SPRN_DEC, vcpu->arch.dec_expires - tb); if (kvmhv_on_pseries()) { /* @@ -3822,7 +3822,7 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, vc->entry_exit_map = 0x101; vc->in_guest = 0; - mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - mftb()); + mtspr(SPRN_DEC, local_paca->kvm_hstate.dec_expires - tb); mtspr(SPRN_SPRG_VDSO_WRITE, local_paca->sprg_vdso); kvmhv_load_host_pmu(); -- 2.23.0
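The change amounts to hoisting one timebase read and reusing it. Sketch only; write_dec() and the cached tb argument are illustrative:

typedef unsigned long long u64;

extern void write_dec(u64 val);   /* models mtspr(SPRN_DEC, ...) */

void program_dec(u64 dec_expires, u64 tb)
{
    /* before: write_dec(dec_expires - mftb()), a second serialising
     * read on the same path; after: reuse the tb already read there,
     * trading a few cycles of timer slack for a cheaper entry/exit */
    write_dec(dec_expires - tb);
}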
[PATCH v2 19/37] KVM: PPC: Book3S HV P9: Use host timer accounting to avoid decrementer read
There is no need to save away the host DEC value, as it is derived from the host timer subsystem, which maintains the next timer time. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/time.h | 5 + arch/powerpc/kvm/book3s_hv.c| 12 ++-- 2 files changed, 11 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 68d94711811e..0128cd9769bc 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -101,6 +101,11 @@ extern void __init time_init(void); DECLARE_PER_CPU(u64, decrementers_next_tb); +static inline u64 timer_get_next_tb(void) +{ + return __this_cpu_read(decrementers_next_tb); +} + /* Convert timebase ticks to nanoseconds */ unsigned long long tb_to_ns(unsigned long long tb_ticks); diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 913582bd848f..735ec40ece86 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3631,16 +3631,16 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long host_amr = mfspr(SPRN_AMR); unsigned long host_fscr = mfspr(SPRN_FSCR); s64 dec; - u64 tb; + u64 tb, next_timer; int trap, save_pmu; - dec = mfspr(SPRN_DEC); tb = mftb(); - if (dec < 0) + next_timer = timer_get_next_tb(); + if (tb >= next_timer) return BOOK3S_INTERRUPT_HV_DECREMENTER; - local_paca->kvm_hstate.dec_expires = dec + tb; - if (local_paca->kvm_hstate.dec_expires < time_limit) - time_limit = local_paca->kvm_hstate.dec_expires; + local_paca->kvm_hstate.dec_expires = next_timer; + if (next_timer < time_limit) + time_limit = next_timer; vcpu->arch.ceded = 0; -- 2.23.0
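The reasoning as a sketch: at any instant the host DEC equals the time remaining until the next host timer event, so it can be recomputed on exit instead of being saved on entry. timer_get_next_tb() is the patch's own helper; write_dec() is a stand-in:

typedef unsigned long long u64;

extern u64 mftb(void);
extern u64 timer_get_next_tb(void);   /* per-CPU decrementers_next_tb */
extern void write_dec(u64 val);       /* models mtspr(SPRN_DEC, ...) */

void restore_host_dec(void)
{
    /* equivalent to what an entry-time mfspr(SPRN_DEC) save would
     * give back, minus the time spent in the guest */
    write_dec(timer_get_next_tb() - mftb());
}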
[PATCH v2 18/37] KVM: PPC: Book3S HV P9: Use large decrementer for HDEC
On processors that don't suppress the HDEC exceptions when LPCR[HDICE]=0, this could help reduce needless guest exits due to leftover exceptions on entering the guest. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/time.h | 2 ++ arch/powerpc/kvm/book3s_hv.c| 3 ++- 2 files changed, 4 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 8dd3cdb25338..68d94711811e 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -18,6 +18,8 @@ #include /* time.c */ +extern u64 decrementer_max; + extern unsigned long tb_ticks_per_jiffy; extern unsigned long tb_ticks_per_usec; extern unsigned long tb_ticks_per_sec; diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 63cc92c45c5d..913582bd848f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3609,7 +3609,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, vc->tb_offset_applied = 0; } - mtspr(SPRN_HDEC, 0x7fff); + /* HDEC must be at least as large as DEC, so decrementer_max fits */ + mtspr(SPRN_HDEC, decrementer_max); switch_mmu_to_host_radix(kvm, host_pidr); -- 2.23.0
[PATCH v2 17/37] KVM: PPC: Book3S HV P9: Move setting HDEC after switching to guest LPCR
LPCR[HDICE]=0 suppresses hypervisor decrementer exceptions on some processors, so it must be enabled before HDEC is set. Rather than set it in the host LPCR then setting HDEC, move the HDEC update to after the guest MMU context (including LPCR) is loaded. There shouldn't be much concern with delaying HDEC by some 10s or 100s of nanoseconds by setting it a bit later. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 24 ++-- 1 file changed, 10 insertions(+), 14 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index d4770b222d7e..63cc92c45c5d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3490,23 +3490,13 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, host_dawrx1 = mfspr(SPRN_DAWRX1); } - /* -* P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0, -* so set HDICE before writing HDEC. -*/ - mtspr(SPRN_LPCR, kvm->arch.host_lpcr | LPCR_HDICE); - isync(); - - hdec = time_limit - mftb(); - if (hdec < 0) { - mtspr(SPRN_LPCR, kvm->arch.host_lpcr); - isync(); + tb = mftb(); + hdec = time_limit - tb; + if (hdec < 0) return BOOK3S_INTERRUPT_HV_DECREMENTER; - } - mtspr(SPRN_HDEC, hdec); if (vc->tb_offset) { - u64 new_tb = mftb() + vc->tb_offset; + u64 new_tb = tb + vc->tb_offset; mtspr(SPRN_TBU40, new_tb); tb = mftb(); if ((tb & 0xff) < (new_tb & 0xff)) @@ -3549,6 +3539,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, switch_mmu_to_guest_radix(kvm, vcpu, lpcr); + /* +* P9 suppresses the HDEC exception when LPCR[HDICE] = 0, +* so set guest LPCR (with HDICE) before writing HDEC. +*/ + mtspr(SPRN_HDEC, hdec); + mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); -- 2.23.0
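The constraint, reduced to two lines (helper names illustrative):

extern void switch_mmu_to_guest(unsigned long lpcr); /* guest LPCR has HDICE=1 */
extern void write_hdec(long hdec);                   /* models mtspr(SPRN_HDEC, ...) */

void arm_guest_hdec(unsigned long guest_lpcr, long hdec)
{
    switch_mmu_to_guest(guest_lpcr);  /* HDICE is in effect from here */
    write_hdec(hdec);                 /* safe: the exception can no
                                         longer be suppressed */
}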
[PATCH v2 16/37] KVM: PPC: Book3S HV P9: Stop handling hcalls in real-mode in the P9 path
In the interest of minimising the amount of code that is run in "real-mode", don't handle hcalls in real mode in the P9 path. POWER8 and earlier are much more expensive to exit from HV real mode and switch to host mode, because on those processors HV interrupts get to the hypervisor with the MMU off, and the other threads in the core need to be pulled out of the guest, and SLBs all need to be saved, ERATs invalidated, and host SLB reloaded before the MMU is re-enabled in host mode. Hash guests also require a lot of hcalls to run. The XICS interrupt controller requires hcalls to run. By contrast, POWER9 has independent thread switching, and in radix mode the hypervisor is already in a host virtual memory mode when the HV interrupt is taken. Radix + xive guests don't need hcalls to handle interrupts or manage translations. So it's much less important to handle hcalls in real mode in P9. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/kvm_ppc.h | 5 + arch/powerpc/kvm/book3s_hv.c| 25 ++--- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 5 + arch/powerpc/kvm/book3s_xive.c | 25 + 4 files changed, 57 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 73b1ca5a6471..db6646c2ade2 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -607,6 +607,7 @@ extern void kvmppc_free_pimap(struct kvm *kvm); extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd); +extern int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req); extern u64 kvmppc_xics_get_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, @@ -639,6 +640,8 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } static inline int kvmppc_xics_hcall(struct kvm_vcpu *vcpu, u32 cmd) { return 0; } +static inline int kvmppc_xive_xics_hcall(struct kvm_vcpu *vcpu, u32 req) + { return 0; } #endif #ifdef CONFIG_KVM_XIVE @@ -673,6 +676,7 @@ extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu); +extern void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu); static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) { @@ -714,6 +718,7 @@ static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 ir int level, bool line_status) { return -ENODEV; } static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { } +static inline void kvmppc_xive_cede_vcpu(struct kvm_vcpu *vcpu) { } static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) { return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 7e23838b7f9b..d4770b222d7e 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -1144,7 +1144,7 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu) * This has to be done early, not in kvmppc_pseries_do_hcall(), so * that the cede logic in kvmppc_run_single_vcpu() works properly. 
*/ -static void kvmppc_nested_cede(struct kvm_vcpu *vcpu) +static void kvmppc_cede(struct kvm_vcpu *vcpu) { vcpu->arch.shregs.msr |= MSR_EE; vcpu->arch.ceded = 1; @@ -3731,15 +3731,34 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, /* H_CEDE has to be handled now, not later */ if (trap == BOOK3S_INTERRUPT_SYSCALL && !vcpu->arch.nested && kvmppc_get_gpr(vcpu, 3) == H_CEDE) { - kvmppc_nested_cede(vcpu); + kvmppc_cede(vcpu); kvmppc_set_gpr(vcpu, 3, 0); trap = 0; } } else { kvmppc_xive_push_vcpu(vcpu); trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr); - kvmppc_xive_pull_vcpu(vcpu); + /* H_CEDE has to be handled now, not later */ + /* XICS hcalls must be handled before xive is pulled */ + if (trap == BOOK3S_INTERRUPT_SYSCALL && + !(vcpu->arch.shregs.msr & MSR_PR)) { + unsigned long req = kvmppc_get_gpr(vcpu, 3); + if (req == H_CEDE) { + kvmppc_cede(vcpu); + kvmppc_xive_cede_vcpu(vcpu); /* may un-cede */ + kvmppc_set_gpr(
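Pared down, the new exit-path dispatch looks like this. A sketch assembled from the hunks above; vcpu_msr() is a hypothetical accessor for vcpu->arch.shregs.msr, everything else is named as in the patch. The ordering requirement is that cede and XICS hcalls run while the vcpu is still pushed to the XIVE:

struct kvm_vcpu;
extern int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *v,
                                     unsigned long long time_limit,
                                     unsigned long lpcr);
extern unsigned long vcpu_msr(struct kvm_vcpu *v);   /* hypothetical */
extern unsigned long kvmppc_get_gpr(struct kvm_vcpu *v, int n);
extern void kvmppc_set_gpr(struct kvm_vcpu *v, int n, unsigned long val);
extern void kvmppc_cede(struct kvm_vcpu *v);
extern void kvmppc_xive_cede_vcpu(struct kvm_vcpu *v);
extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *v);

#define BOOK3S_INTERRUPT_SYSCALL 0xc00
#define MSR_PR 0x4000UL
#define H_CEDE 0xe0

int p9_exit_hcall_shape(struct kvm_vcpu *vcpu,
                        unsigned long long time_limit, unsigned long lpcr)
{
    int trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr);

    if (trap == BOOK3S_INTERRUPT_SYSCALL && !(vcpu_msr(vcpu) & MSR_PR)) {
        unsigned long req = kvmppc_get_gpr(vcpu, 3);

        if (req == H_CEDE) {                 /* must precede the pull */
            kvmppc_cede(vcpu);
            kvmppc_xive_cede_vcpu(vcpu);     /* may un-cede */
            kvmppc_set_gpr(vcpu, 3, 0);
            trap = 0;
        }
        /* XICS hcalls are likewise dispatched here, pre-pull */
    }
    kvmppc_xive_pull_vcpu(vcpu);
    return trap;
}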
[PATCH v2 15/37] KVM: PPC: Book3S HV P9: Move xive vcpu context management into kvmhv_p9_guest_entry
Move the xive management up so the low level register switching can be pushed further down in a later patch. XIVE MMIO CI operations can run in higher level code with machine checks, tracing, etc., available. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 7 +++ 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index e3344d58537d..7e23838b7f9b 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3549,15 +3549,11 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, switch_mmu_to_guest_radix(kvm, vcpu, lpcr); - kvmppc_xive_push_vcpu(vcpu); - mtspr(SPRN_SRR0, vcpu->arch.shregs.srr0); mtspr(SPRN_SRR1, vcpu->arch.shregs.srr1); trap = __kvmhv_vcpu_entry_p9(vcpu); - kvmppc_xive_pull_vcpu(vcpu); - /* Advance host PURR/SPURR by the amount used by guest */ purr = mfspr(SPRN_PURR); spurr = mfspr(SPRN_SPURR); @@ -3740,7 +3736,10 @@ static int kvmhv_p9_guest_entry(struct kvm_vcpu *vcpu, u64 time_limit, trap = 0; } } else { + kvmppc_xive_push_vcpu(vcpu); trap = kvmhv_load_hv_regs_and_go(vcpu, time_limit, lpcr); + kvmppc_xive_pull_vcpu(vcpu); + } vcpu->arch.slb_max = 0; -- 2.23.0
[PATCH v2 14/37] KVM: PPC: Book3S HV P9: implement kvmppc_xive_pull_vcpu in C
This is more symmetric with kvmppc_xive_push_vcpu. The extra test in the asm will go away in a later change. Signed-off-by: Nicholas Piggin --- arch/powerpc/include/asm/kvm_ppc.h | 2 ++ arch/powerpc/kvm/book3s_hv.c| 2 ++ arch/powerpc/kvm/book3s_hv_rmhandlers.S | 5 arch/powerpc/kvm/book3s_xive.c | 34 + 4 files changed, 43 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 9531b1c1b190..73b1ca5a6471 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -672,6 +672,7 @@ extern int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status); extern void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu); +extern void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu); static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) { @@ -712,6 +713,7 @@ static inline int kvmppc_xive_set_icp(struct kvm_vcpu *vcpu, u64 icpval) { retur static inline int kvmppc_xive_set_irq(struct kvm *kvm, int irq_source_id, u32 irq, int level, bool line_status) { return -ENODEV; } static inline void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) { } +static inline void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) { } static inline int kvmppc_xive_enabled(struct kvm_vcpu *vcpu) { return 0; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 23d6dc04b0e9..e3344d58537d 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3556,6 +3556,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, trap = __kvmhv_vcpu_entry_p9(vcpu); + kvmppc_xive_pull_vcpu(vcpu); + /* Advance host PURR/SPURR by the amount used by guest */ purr = mfspr(SPRN_PURR); spurr = mfspr(SPRN_SPURR); diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 75405ef53238..c11597f815e4 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1442,6 +1442,11 @@ guest_exit_cont: /* r9 = vcpu, r12 = trap, r13 = paca */ bl kvmhv_accumulate_time #endif #ifdef CONFIG_KVM_XICS + /* If we came in through the P9 short path, xive pull is done in C */ + lwz r0, STACK_SLOT_SHORT_PATH(r1) + cmpwi r0, 0 + bne 1f + /* We are exiting, pull the VP from the XIVE */ lbz r0, VCPU_XIVE_PUSHED(r9) cmpwi cr0, r0, 0 diff --git a/arch/powerpc/kvm/book3s_xive.c b/arch/powerpc/kvm/book3s_xive.c index e7219b6f5f9a..8632fb998a55 100644 --- a/arch/powerpc/kvm/book3s_xive.c +++ b/arch/powerpc/kvm/book3s_xive.c @@ -127,6 +127,40 @@ void kvmppc_xive_push_vcpu(struct kvm_vcpu *vcpu) } EXPORT_SYMBOL_GPL(kvmppc_xive_push_vcpu); +/* + * Pull a vcpu's context from the XIVE on guest exit. 
+ * This assumes we are in virtual mode (MMU on) + */ +void kvmppc_xive_pull_vcpu(struct kvm_vcpu *vcpu) +{ + void __iomem *tima = local_paca->kvm_hstate.xive_tima_virt; + + BUG_ON(!(mfmsr() & MSR_IR)); + BUG_ON(!(mfmsr() & MSR_DR)); + + if (!vcpu->arch.xive_pushed) + return; + + /* +* Should not have been pushed if there is no tima +*/ + if (WARN_ON(!tima)) + return; + + eieio(); + /* First load to pull the context, we ignore the value */ + __raw_readl(tima + TM_SPC_PULL_OS_CTX); + /* Second load to recover the context state (Words 0 and 1) */ + vcpu->arch.xive_saved_state.w01 = __raw_readq(tima + TM_QW1_OS); + + /* Fixup some of the state for the next load */ + vcpu->arch.xive_saved_state.lsmfb = 0; + vcpu->arch.xive_saved_state.ack = 0xff; + vcpu->arch.xive_pushed = 0; + eieio(); +} +EXPORT_SYMBOL_GPL(kvmppc_xive_pull_vcpu); + /* * This is a simple trigger for a generic XIVE IRQ. This must * only be called for interrupts that support a trigger page -- 2.23.0
[PATCH v2 13/37] KVM: PPC: Book3S HV P9: Move radix MMU switching instructions together
Switching the MMU from radix<->radix mode is tricky particularly as the MMU can remain enabled and requires a certain sequence of SPR updates. Move these together into their own functions. This also includes the radix TLB check / flush because it's tied in to MMU switching due to tlbiel getting LPID from LPIDR. (XXX: isync / hwsync synchronisation TBD) Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_hv.c | 55 +--- 1 file changed, 32 insertions(+), 23 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 895090636295..23d6dc04b0e9 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3440,12 +3440,38 @@ static noinline void kvmppc_run_core(struct kvmppc_vcore *vc) trace_kvmppc_run_core(vc, 1); } +static void switch_mmu_to_guest_radix(struct kvm *kvm, struct kvm_vcpu *vcpu, u64 lpcr) +{ + struct kvmppc_vcore *vc = vcpu->arch.vcore; + struct kvm_nested_guest *nested = vcpu->arch.nested; + u32 lpid; + + lpid = nested ? nested->shadow_lpid : kvm->arch.lpid; + + mtspr(SPRN_LPID, lpid); + mtspr(SPRN_LPCR, lpcr); + mtspr(SPRN_PID, vcpu->arch.pid); + isync(); + + /* TLBIEL must have LPIDR set, so set guest LPID before flushing. */ + kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested); +} + +static void switch_mmu_to_host_radix(struct kvm *kvm, u32 pid) +{ + mtspr(SPRN_PID, pid); + mtspr(SPRN_LPID, kvm->arch.host_lpid); + mtspr(SPRN_LPCR, kvm->arch.host_lpcr); + isync(); +} + /* * Load up hypervisor-mode registers on P9. */ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, unsigned long lpcr) { + struct kvm *kvm = vcpu->kvm; struct kvmppc_vcore *vc = vcpu->arch.vcore; s64 hdec; u64 tb, purr, spurr; @@ -3468,12 +3494,12 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, * P8 and P9 suppress the HDEC exception when LPCR[HDICE] = 0, * so set HDICE before writing HDEC. 
*/ - mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr | LPCR_HDICE); + mtspr(SPRN_LPCR, kvm->arch.host_lpcr | LPCR_HDICE); isync(); hdec = time_limit - mftb(); if (hdec < 0) { - mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + mtspr(SPRN_LPCR, kvm->arch.host_lpcr); isync(); return BOOK3S_INTERRUPT_HV_DECREMENTER; } @@ -3508,7 +3534,6 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, } mtspr(SPRN_CIABR, vcpu->arch.ciabr); mtspr(SPRN_IC, vcpu->arch.ic); - mtspr(SPRN_PID, vcpu->arch.pid); mtspr(SPRN_PSSCR, vcpu->arch.psscr | PSSCR_EC | (local_paca->kvm_hstate.fake_suspend << PSSCR_FAKE_SUSPEND_LG)); @@ -3522,8 +3547,7 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_AMOR, ~0UL); - mtspr(SPRN_LPCR, lpcr); - isync(); + switch_mmu_to_guest_radix(kvm, vcpu, lpcr); kvmppc_xive_push_vcpu(vcpu); @@ -3562,7 +3586,6 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, mtspr(SPRN_DAWR1, host_dawr1); mtspr(SPRN_DAWRX1, host_dawrx1); } - mtspr(SPRN_PID, host_pidr); /* * Since this is radix, do a eieio; tlbsync; ptesync sequence in @@ -3577,9 +3600,6 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, if (cpu_has_feature(CPU_FTR_ARCH_31)) asm volatile(PPC_CP_ABORT); - mtspr(SPRN_LPID, vcpu->kvm->arch.host_lpid);/* restore host LPID */ - isync(); - vc->dpdes = mfspr(SPRN_DPDES); vc->vtb = mfspr(SPRN_VTB); mtspr(SPRN_DPDES, 0); @@ -3596,7 +3616,8 @@ static int kvmhv_load_hv_regs_and_go(struct kvm_vcpu *vcpu, u64 time_limit, } mtspr(SPRN_HDEC, 0x7fff); - mtspr(SPRN_LPCR, vcpu->kvm->arch.host_lpcr); + + switch_mmu_to_host_radix(kvm, host_pidr); return trap; } @@ -4130,7 +4151,7 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, { struct kvm_run *run = vcpu->run; int trap, r, pcpu; - int srcu_idx, lpid; + int srcu_idx; struct kvmppc_vcore *vc; struct kvm *kvm = vcpu->kvm; struct kvm_nested_guest *nested = vcpu->arch.nested; @@ -4204,13 +4225,6 @@ int kvmhv_run_single_vcpu(struct kvm_vcpu *vcpu, u64 time_limit, vc->vcore_state = VCORE_RUNNING; trace_kvmppc_run_core(vc, 0); - if (cpu_has_feature(CPU_FTR_HVMODE)) { - lpid = nested ? nested->shadow_lpid : kvm->arch.lpid; - mtspr(SPRN_LPID, lpid); - isync(); - kvmppc_check_need_tlb_flush(kvm, pcpu, nested); - } - guest_
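The guest-side helper's ordering, distilled (this is the diff's own code, trimmed to the lines that carry the constraint):

mtspr(SPRN_LPID, lpid);      /* guest (or nested shadow) LPID first */
mtspr(SPRN_LPCR, lpcr);
mtspr(SPRN_PID, vcpu->arch.pid);
isync();
/* tlbiel takes its LPID from LPIDR, so only now does a guest TLB
 * check/flush target the right partition */
kvmppc_check_need_tlb_flush(kvm, vc->pcpu, nested);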
[PATCH v2 12/37] KVM: PPC: Book3S 64: Minimise hcall handler calling convention differences
This sets up the same calling convention from interrupt entry to KVM interrupt handler for system calls as exists for other interrupt types. This is a better API, it uses a save area rather than SPR, and it has more registers free to use. Using a single common API helps maintain it, and it becomes easier to use in C in a later patch. Signed-off-by: Nicholas Piggin --- arch/powerpc/kernel/exceptions-64s.S | 16 +++- arch/powerpc/kvm/book3s_64_entry.S | 22 +++--- 2 files changed, 18 insertions(+), 20 deletions(-) diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S index bbda628ab344..dcd71d9e7913 100644 --- a/arch/powerpc/kernel/exceptions-64s.S +++ b/arch/powerpc/kernel/exceptions-64s.S @@ -1892,8 +1892,22 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100) #ifdef CONFIG_KVM_BOOK3S_64_HANDLER TRAMP_REAL_BEGIN(kvm_hcall) + std r9,PACA_EXGEN+EX_R9(r13) + std r11,PACA_EXGEN+EX_R11(r13) + std r12,PACA_EXGEN+EX_R12(r13) + mfcrr9 mfctr r10 - SET_SCRATCH0(r10) /* Save r13 in SCRATCH0 */ + std r10,PACA_EXGEN+EX_R13(r13) + li r10,0 + std r10,PACA_EXGEN+EX_CFAR(r13) + std r10,PACA_EXGEN+EX_CTR(r13) +BEGIN_FTR_SECTION + mfspr r10,SPRN_PPR + std r10,PACA_EXGEN+EX_PPR(r13) +END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) + + HMT_MEDIUM + #ifdef CONFIG_RELOCATABLE /* * Requires __LOAD_FAR_HANDLER beause kvmppc_hcall lives diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S index 75accb1321c9..f826c8dc2e19 100644 --- a/arch/powerpc/kvm/book3s_64_entry.S +++ b/arch/powerpc/kvm/book3s_64_entry.S @@ -13,24 +13,9 @@ .globalkvmppc_hcall .balign IFETCH_ALIGN_BYTES kvmppc_hcall: - /* -* This is a hcall, so register convention is as -* Documentation/powerpc/papr_hcalls.rst, with these additions: -* R13 = PACA -* guest R13 saved in SPRN_SCRATCH0 -* R10 = free -*/ -BEGIN_FTR_SECTION - mfspr r10,SPRN_PPR - std r10,HSTATE_PPR(r13) -END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) - HMT_MEDIUM - mfcrr10 - std r12,HSTATE_SCRATCH0(r13) - sldir12,r10,32 - ori r12,r12,0xc00 - ld r10,PACA_EXGEN+EX_R10(r13) - b do_kvm_interrupt + ld r10,PACA_EXGEN+EX_R13(r13) + SET_SCRATCH0(r10) + li r10,0xc00 .globalkvmppc_interrupt .balign IFETCH_ALIGN_BYTES @@ -61,7 +46,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR) ld r10,EX_R10(r11) ld r11,EX_R11(r11) -do_kvm_interrupt: /* * Hcalls and other interrupts come here after normalising register * contents and save locations: -- 2.23.0
[PATCH v2 11/37] KVM: PPC: Book3S 64: move bad_host_intr check to HV handler
This is not used by PR KVM. Signed-off-by: Nicholas Piggin --- arch/powerpc/kvm/book3s_64_entry.S | 3 --- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 4 +++- arch/powerpc/kvm/book3s_segment.S | 7 +++ 3 files changed, 10 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S index 4603c0709ae3..75accb1321c9 100644 --- a/arch/powerpc/kvm/book3s_64_entry.S +++ b/arch/powerpc/kvm/book3s_64_entry.S @@ -77,11 +77,8 @@ do_kvm_interrupt: beq-.Lmaybe_skip .Lno_skip: #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE - cmpwi r9,KVM_GUEST_MODE_HOST_HV - beq kvmppc_bad_host_intr #ifdef CONFIG_KVM_BOOK3S_PR_POSSIBLE cmpwi r9,KVM_GUEST_MODE_GUEST - ld r9,HSTATE_SCRATCH2(r13) beq kvmppc_interrupt_pr #endif b kvmppc_interrupt_hv diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index f976efb7e4a9..75405ef53238 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -1265,6 +1265,7 @@ hdec_soon: kvmppc_interrupt_hv: /* * Register contents: +* R9 = HSTATE_IN_GUEST * R12 = (guest CR << 32) | interrupt vector * R13 = PACA * guest R12 saved in shadow VCPU SCRATCH0 @@ -1272,6 +1273,8 @@ kvmppc_interrupt_hv: * guest R9 saved in HSTATE_SCRATCH2 */ /* We're now back in the host but in guest MMU context */ + cmpwi r9,KVM_GUEST_MODE_HOST_HV + beq kvmppc_bad_host_intr li r9, KVM_GUEST_MODE_HOST_HV stb r9, HSTATE_IN_GUEST(r13) @@ -3272,7 +3275,6 @@ END_FTR_SECTION_IFCLR(CPU_FTR_P9_TM_HV_ASSIST) * cfar is saved in HSTATE_CFAR(r13) * ppr is saved in HSTATE_PPR(r13) */ -.global kvmppc_bad_host_intr kvmppc_bad_host_intr: /* * Switch to the emergency stack, but start half-way down in diff --git a/arch/powerpc/kvm/book3s_segment.S b/arch/powerpc/kvm/book3s_segment.S index 1f492aa4c8d6..ef1d88b869bf 100644 --- a/arch/powerpc/kvm/book3s_segment.S +++ b/arch/powerpc/kvm/book3s_segment.S @@ -167,8 +167,15 @@ kvmppc_interrupt_pr: * R12 = (guest CR << 32) | exit handler id * R13 = PACA * HSTATE.SCRATCH0 = guest R12 +* +* If HV is possible, additionally: +* R9 = HSTATE_IN_GUEST +* HSTATE.SCRATCH2 = guest R9 */ #ifdef CONFIG_PPC64 +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE + ld r9,HSTATE_SCRATCH2(r13) +#endif /* Match 32-bit entry */ rotldi r12, r12, 32 /* Flip R12 halves for stw */ stw r12, HSTATE_SCRATCH1(r13) /* CR is now in the low half */ -- 2.23.0
[PATCH v2 10/37] KVM: PPC: Book3S 64: Move interrupt early register setup to KVM
Like the earlier patch for hcalls, KVM interrupt entry requires a
different calling convention than the Linux interrupt handlers set up.
Move the code that converts from one to the other into KVM.

Signed-off-by: Nicholas Piggin
---
 arch/powerpc/kernel/exceptions-64s.S | 126 ---
 arch/powerpc/kvm/book3s_64_entry.S   |  34 +++-
 2 files changed, 50 insertions(+), 110 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index b7092ba87da8..bbda628ab344 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -187,7 +187,6 @@ do_define_int n
 	.endif
 .endm
 
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 /*
  * All interrupts which set HSRR registers, as well as SRESET and MCE and
  * syscall when invoked with "sc 1" switch to MSR[HV]=1 (HVMODE) to be taken,
@@ -220,54 +219,25 @@ do_define_int n
  * to KVM to handle.
  */
 
-.macro KVMTEST name
+.macro KVMTEST name handler
+#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 	lbz	r10,HSTATE_IN_GUEST(r13)
 	cmpwi	r10,0
-	bne	\name\()_kvm
-.endm
-
-.macro GEN_KVM name
-	.balign IFETCH_ALIGN_BYTES
-\name\()_kvm:
-
-BEGIN_FTR_SECTION
-	ld	r10,IAREA+EX_CFAR(r13)
-	std	r10,HSTATE_CFAR(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
-
-	ld	r10,IAREA+EX_CTR(r13)
-	mtctr	r10
-BEGIN_FTR_SECTION
-	ld	r10,IAREA+EX_PPR(r13)
-	std	r10,HSTATE_PPR(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
-	ld	r11,IAREA+EX_R11(r13)
-	ld	r12,IAREA+EX_R12(r13)
-	std	r12,HSTATE_SCRATCH0(r13)
-	sldi	r12,r9,32
-	ld	r9,IAREA+EX_R9(r13)
-	ld	r10,IAREA+EX_R10(r13)
 	/* HSRR variants have the 0x2 bit added to their trap number */
 	.if IHSRR_IF_HVMODE
 	BEGIN_FTR_SECTION
-	ori	r12,r12,(IVEC + 0x2)
+	li	r10,(IVEC + 0x2)
 	FTR_SECTION_ELSE
-	ori	r12,r12,(IVEC)
+	li	r10,(IVEC)
 	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	.elseif IHSRR
-	ori	r12,r12,(IVEC + 0x2)
+	li	r10,(IVEC + 0x2)
 	.else
-	ori	r12,r12,(IVEC)
+	li	r10,(IVEC)
 	.endif
-	b	kvmppc_interrupt
-.endm
-
-#else
-.macro KVMTEST name
-.endm
-.macro GEN_KVM name
-.endm
+	bne	\handler
 #endif
+.endm
 
 /*
  * This is the BOOK3S interrupt entry code macro.
@@ -409,7 +379,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
 DEFINE_FIXED_SYMBOL(\name\()_common_real)
 \name\()_common_real:
 	.if IKVM_REAL
-		KVMTEST \name
+		KVMTEST \name kvm_interrupt
 	.endif
 
 	ld	r10,PACAKMSR(r13)	/* get MSR value for kernel */
@@ -432,7 +402,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_virt)
 \name\()_common_virt:
 	.if IKVM_VIRT
-		KVMTEST \name
+		KVMTEST \name kvm_interrupt
 1:
 	.endif
 .endif /* IVIRT */
@@ -446,7 +416,7 @@ DEFINE_FIXED_SYMBOL(\name\()_common_virt)
 DEFINE_FIXED_SYMBOL(\name\()_common_real)
 \name\()_common_real:
 	.if IKVM_REAL
-		KVMTEST \name
+		KVMTEST \name kvm_interrupt
 	.endif
 .endm
@@ -967,8 +937,6 @@ EXC_COMMON_BEGIN(system_reset_common)
 	EXCEPTION_RESTORE_REGS
 	RFI_TO_USER_OR_KERNEL
 
-	GEN_KVM system_reset
-
 
 /**
  * Interrupt 0x200 - Machine Check Interrupt (MCE).
@@ -1132,7 +1100,7 @@ END_FTR_SECTION_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
 	/*
 	 * Check if we are coming from guest. If yes, then run the normal
 	 * exception handler which will take the
-	 * machine_check_kvm->kvmppc_interrupt branch to deliver the MC event
+	 * machine_check_kvm->kvm_interrupt branch to deliver the MC event
 	 * to guest.
 	 */
 	lbz	r11,HSTATE_IN_GUEST(r13)
@@ -1203,8 +1171,6 @@ EXC_COMMON_BEGIN(machine_check_common)
 	bl	machine_check_exception
 	b	interrupt_return
 
-	GEN_KVM machine_check
-
 
 #ifdef CONFIG_PPC_P7_NAP
 /*
@@ -1339,8 +1305,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	REST_NVGPRS(r1)
 	b	interrupt_return
 
-	GEN_KVM data_access
-
 
 /**
  * Interrupt 0x380 - Data Segment Interrupt (DSLB).
@@ -1390,8 +1354,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	bl	do_bad_slb_fault
 	b	interrupt_return
 
-	GEN_KVM data_access_slb
-
 
 /**
  * Interrupt 0x400 - Instruction Storage Interrupt (ISI).
@@ -1428,8 +1390,6 @@ MMU_FTR_SECTION_ELSE
 ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	b	interrupt_return
 
-	GEN_KVM instruction_access
-
 
 /**
  * Interrupt 0x480 - Instruction Segment Interrupt (ISLB).
@@ -1474,8 +1434,6 @@ ALT_MMU_FTR_SECTION_END_IFCLR(MMU_FTR_TYPE_RADIX)
 	bl	do_bad_slb_fault
 	b	interrupt_return
 
-
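In C-flavoured pseudocode, the reworked KVMTEST amounts to something like
the following (stand-in names; IVEC and the HSRR flag correspond to the
macro parameters in the assembly):

	#include <stdint.h>

	/* Stand-ins: the real test reads HSTATE_IN_GUEST(r13) and
	 * branches to the \handler label with the trap number in r10. */
	extern int hstate_in_guest(void);
	extern void kvm_interrupt(uint64_t trap);	/* does not return */

	static inline void kvmtest_sketch(uint64_t ivec, int hsrr)
	{
		if (hstate_in_guest() != 0) {
			/* HSRR variants add the 0x2 bit to the trap number */
			kvm_interrupt(ivec | (hsrr ? 0x2 : 0));
		}
	}

The per-interrupt register juggling that GEN_KVM used to emit is gone: the
test now only loads the trap number into r10 and branches, and the
conversion work lives in one place in KVM.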
[PATCH v2 09/37] KVM: PPC: Book3S 64: Move hcall early register setup to KVM
System calls / hcalls have a different calling convention than other
interrupts, so there is code in the KVMTEST to massage these into the
same form as other interrupt handlers. Move this work into the KVM hcall
handler.

This means teaching KVM a little more about the low-level interrupt
handler setup, PACA save areas, etc., although that's not obviously worse
than the current approach of coming up with an entirely different
interrupt register/save convention.

Signed-off-by: Nicholas Piggin
---
 arch/powerpc/include/asm/exception-64s.h | 13 +++++++++++++
 arch/powerpc/kernel/exceptions-64s.S     | 44 ++--
 arch/powerpc/kvm/book3s_64_entry.S       | 17 +
 3 files changed, 32 insertions(+), 42 deletions(-)

diff --git a/arch/powerpc/include/asm/exception-64s.h b/arch/powerpc/include/asm/exception-64s.h
index c1a8aac01cf9..bb6f78fcf981 100644
--- a/arch/powerpc/include/asm/exception-64s.h
+++ b/arch/powerpc/include/asm/exception-64s.h
@@ -35,6 +35,19 @@
 /* PACA save area size in u64 units (exgen, exmc, etc) */
 #define EX_SIZE		10
 
+/* PACA save area offsets */
+#define EX_R9		0
+#define EX_R10		8
+#define EX_R11		16
+#define EX_R12		24
+#define EX_R13		32
+#define EX_DAR		40
+#define EX_DSISR	48
+#define EX_CCR		52
+#define EX_CFAR		56
+#define EX_PPR		64
+#define EX_CTR		72
+
 /*
  * maximum recursive depth of MCE exceptions
  */
diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index 9ae463e8522b..b7092ba87da8 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -21,22 +21,6 @@
 #include <asm/feature-fixups.h>
 #include <asm/kup.h>
 
-/* PACA save area offsets (exgen, exmc, etc) */
-#define EX_R9		0
-#define EX_R10		8
-#define EX_R11		16
-#define EX_R12		24
-#define EX_R13		32
-#define EX_DAR		40
-#define EX_DSISR	48
-#define EX_CCR		52
-#define EX_CFAR		56
-#define EX_PPR		64
-#define EX_CTR		72
-
-.if EX_SIZE != 10
-	.error "EX_SIZE is wrong"
-.endif
-
 /*
  * Following are fixed section helper macros.
  *
@@ -1964,45 +1948,21 @@ EXC_VIRT_END(system_call, 0x4c00, 0x100)
 
 #ifdef CONFIG_KVM_BOOK3S_64_HANDLER
 TRAMP_REAL_BEGIN(system_call_kvm)
-	/*
-	 * This is a hcall, so register convention is as above, with these
-	 * differences:
-	 * r13 = PACA
-	 * ctr = orig r13
-	 * orig r10 saved in PACA
-	 */
-	/*
-	 * Save the PPR (on systems that support it) before changing to
-	 * HMT_MEDIUM. That allows the KVM code to save that value into the
-	 * guest state (it is the guest's PPR value).
-	 */
-BEGIN_FTR_SECTION
-	mfspr	r10,SPRN_PPR
-	std	r10,HSTATE_PPR(r13)
-END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
-	HMT_MEDIUM
 	mfctr	r10
-	SET_SCRATCH0(r10)
-	mfcr	r10
-	std	r12,HSTATE_SCRATCH0(r13)
-	sldi	r12,r10,32
-	ori	r12,r12,0xc00
+	SET_SCRATCH0(r10)	/* Save r13 in SCRATCH0 */
 
 #ifdef CONFIG_RELOCATABLE
 	/*
-	 * Requires __LOAD_FAR_HANDLER because kvmppc_interrupt lives
+	 * Requires __LOAD_FAR_HANDLER because kvmppc_hcall lives
 	 * outside the head section.
 	 */
 	__LOAD_FAR_HANDLER(r10, kvmppc_hcall)
 	mtctr	r10
-	ld	r10,PACA_EXGEN+EX_R10(r13)
 	bctr
 #else
-	ld	r10,PACA_EXGEN+EX_R10(r13)
 	b	kvmppc_hcall
 #endif
 #endif
 
-
 /**
  * Interrupt 0xd00 - Trace Interrupt.
  * This is a synchronous interrupt in response to instruction step or
diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
index 9572f759255c..1c9518ab7d96 100644
--- a/arch/powerpc/kvm/book3s_64_entry.S
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -13,6 +13,23 @@
 .global	kvmppc_hcall
 .balign	IFETCH_ALIGN_BYTES
 kvmppc_hcall:
+	/*
+	 * This is a hcall, so register convention is as
+	 * Documentation/powerpc/papr_hcalls.rst, with these additions:
+	 * R13		= PACA
+	 * guest R13 saved in SPRN_SCRATCH0
+	 * R10		= free
+	 */
+BEGIN_FTR_SECTION
+	mfspr	r10,SPRN_PPR
+	std	r10,HSTATE_PPR(r13)
+END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
+	HMT_MEDIUM
+	mfcr	r10
+	std	r12,HSTATE_SCRATCH0(r13)
+	sldi	r12,r10,32
+	ori	r12,r12,0xc00
+	ld	r10,PACA_EXGEN+EX_R10(r13)
 
 .global	kvmppc_interrupt
 .balign	IFETCH_ALIGN_BYTES
-- 
2.23.0
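The EX_* offsets moved into the header describe a fixed layout. As a
cross-check, here is the same layout written as a hypothetical C struct
(the kernel uses the raw offsets directly, not a struct like this):

	#include <stdint.h>

	struct ex_area {		/* one PACA save area (exgen, exmc, ...) */
		uint64_t r9;		/* EX_R9     0 */
		uint64_t r10;		/* EX_R10    8 */
		uint64_t r11;		/* EX_R11   16 */
		uint64_t r12;		/* EX_R12   24 */
		uint64_t r13;		/* EX_R13   32 */
		uint64_t dar;		/* EX_DAR   40 */
		uint32_t dsisr;		/* EX_DSISR 48 */
		uint32_t ccr;		/* EX_CCR   52 */
		uint64_t cfar;		/* EX_CFAR  56 */
		uint64_t ppr;		/* EX_PPR   64 */
		uint64_t ctr;		/* EX_CTR   72 */
	};

	/* 80 bytes, i.e. EX_SIZE (10) u64 units: the same invariant the
	 * removed ".if EX_SIZE != 10" assembly check used to enforce. */
	_Static_assert(sizeof(struct ex_area) == 80, "EX_SIZE * 8");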
[PATCH v2 08/37] KVM: PPC: Book3S 64: add hcall interrupt handler
Add a separate hcall entry point. This can be used to deal with the
different calling convention.

Reviewed-by: Fabiano Rosas
Signed-off-by: Nicholas Piggin
---
 arch/powerpc/kernel/exceptions-64s.S | 4 ++--
 arch/powerpc/kvm/book3s_64_entry.S   | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index d956dd9ed61f..9ae463e8522b 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -1992,13 +1992,13 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	 * Requires __LOAD_FAR_HANDLER because kvmppc_interrupt lives
 	 * outside the head section.
 	 */
-	__LOAD_FAR_HANDLER(r10, kvmppc_interrupt)
+	__LOAD_FAR_HANDLER(r10, kvmppc_hcall)
 	mtctr	r10
 	ld	r10,PACA_EXGEN+EX_R10(r13)
 	bctr
 #else
 	ld	r10,PACA_EXGEN+EX_R10(r13)
-	b	kvmppc_interrupt
+	b	kvmppc_hcall
 #endif
 #endif
 
diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
index c1276f616af4..9572f759255c 100644
--- a/arch/powerpc/kvm/book3s_64_entry.S
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -7,9 +7,13 @@
 #include <asm/reg.h>
 
 /*
- * This is branched to from interrupt handlers in exception-64s.S which set
+ * These are branched to from interrupt handlers in exception-64s.S which set
  * IKVM_REAL or IKVM_VIRT, if HSTATE_IN_GUEST was found to be non-zero.
  */
+.global	kvmppc_hcall
+.balign	IFETCH_ALIGN_BYTES
+kvmppc_hcall:
+
 .global	kvmppc_interrupt
 .balign	IFETCH_ALIGN_BYTES
 kvmppc_interrupt:
-- 
2.23.0
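At this point kvmppc_hcall has no body of its own and simply falls through
to kvmppc_interrupt; in C terms the sketch is a one-liner (hypothetical
names, illustration only):

	extern void kvmppc_interrupt(void);

	/* Separate entry point, same behaviour for now; a later patch
	 * gives the hcall path its own convention-handling code. */
	static void kvmppc_hcall_sketch(void) { kvmppc_interrupt(); }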
[PATCH v2 07/37] KVM: PPC: Book3S 64: Move GUEST_MODE_SKIP test into KVM
Move the GUEST_MODE_SKIP logic into KVM code. This is quite a KVM-internal
detail that has no real need to be in common handlers.

Also add a comment explaining why this exists.

Reviewed-by: Fabiano Rosas
Signed-off-by: Nicholas Piggin
---
 arch/powerpc/kernel/exceptions-64s.S | 60
 arch/powerpc/kvm/book3s_64_entry.S   | 51 ++-
 2 files changed, 50 insertions(+), 61 deletions(-)

diff --git a/arch/powerpc/kernel/exceptions-64s.S b/arch/powerpc/kernel/exceptions-64s.S
index ba13d749d203..d956dd9ed61f 100644
--- a/arch/powerpc/kernel/exceptions-64s.S
+++ b/arch/powerpc/kernel/exceptions-64s.S
@@ -133,7 +133,6 @@ name:
 #define IBRANCH_TO_COMMON	.L_IBRANCH_TO_COMMON_\name\() /* ENTRY branch to common */
 #define IREALMODE_COMMON	.L_IREALMODE_COMMON_\name\() /* Common runs in realmode */
 #define IMASK		.L_IMASK_\name\()	/* IRQ soft-mask bit */
-#define IKVM_SKIP	.L_IKVM_SKIP_\name\()	/* Generate KVM skip handler */
 #define IKVM_REAL	.L_IKVM_REAL_\name\()	/* Real entry tests KVM */
 #define __IKVM_REAL(name)	.L_IKVM_REAL_ ## name
 #define IKVM_VIRT	.L_IKVM_VIRT_\name\()	/* Virt entry tests KVM */
@@ -190,9 +189,6 @@ do_define_int n
 	.ifndef IMASK
 		IMASK=0
 	.endif
-	.ifndef IKVM_SKIP
-		IKVM_SKIP=0
-	.endif
 	.ifndef IKVM_REAL
 		IKVM_REAL=0
 	.endif
@@ -250,15 +246,10 @@ do_define_int n
 	.balign IFETCH_ALIGN_BYTES
 \name\()_kvm:
 
-	.if IKVM_SKIP
-	cmpwi	r10,KVM_GUEST_MODE_SKIP
-	beq	89f
-	.else
 BEGIN_FTR_SECTION
 	ld	r10,IAREA+EX_CFAR(r13)
 	std	r10,HSTATE_CFAR(r13)
 END_FTR_SECTION_IFSET(CPU_FTR_CFAR)
-	.endif
 
 	ld	r10,IAREA+EX_CTR(r13)
 	mtctr	r10
@@ -285,27 +276,6 @@ END_FTR_SECTION_IFSET(CPU_FTR_HAS_PPR)
 	ori	r12,r12,(IVEC)
 	.endif
 	b	kvmppc_interrupt
-
-	.if IKVM_SKIP
-89:	mtocrf	0x80,r9
-	ld	r10,IAREA+EX_CTR(r13)
-	mtctr	r10
-	ld	r9,IAREA+EX_R9(r13)
-	ld	r10,IAREA+EX_R10(r13)
-	ld	r11,IAREA+EX_R11(r13)
-	ld	r12,IAREA+EX_R12(r13)
-	.if IHSRR_IF_HVMODE
-	BEGIN_FTR_SECTION
-	b	kvmppc_skip_Hinterrupt
-	FTR_SECTION_ELSE
-	b	kvmppc_skip_interrupt
-	ALT_FTR_SECTION_END_IFSET(CPU_FTR_HVMODE | CPU_FTR_ARCH_206)
-	.elseif IHSRR
-	b	kvmppc_skip_Hinterrupt
-	.else
-	b	kvmppc_skip_interrupt
-	.endif
-	.endif
 .endm
@@ -1083,7 +1053,6 @@ INT_DEFINE_BEGIN(machine_check)
 	ISET_RI=0
 	IDAR=1
 	IDSISR=1
-	IKVM_SKIP=1
 	IKVM_REAL=1
 INT_DEFINE_END(machine_check)
@@ -1356,7 +1325,6 @@ INT_DEFINE_BEGIN(data_access)
 	IVEC=0x300
 	IDAR=1
 	IDSISR=1
-	IKVM_SKIP=1
 	IKVM_REAL=1
 INT_DEFINE_END(data_access)
@@ -1410,7 +1378,6 @@ INT_DEFINE_BEGIN(data_access_slb)
 	IVEC=0x380
 	IDAR=1
-	IKVM_SKIP=1
 	IKVM_REAL=1
 INT_DEFINE_END(data_access_slb)
@@ -2080,7 +2047,6 @@ INT_DEFINE_BEGIN(h_data_storage)
 	IHSRR=1
 	IDAR=1
 	IDSISR=1
-	IKVM_SKIP=1
 	IKVM_REAL=1
 	IKVM_VIRT=1
 INT_DEFINE_END(h_data_storage)
@@ -3024,32 +2990,6 @@ EXPORT_SYMBOL(do_uaccess_flush)
 	MASKED_INTERRUPT
 	MASKED_INTERRUPT hsrr=1
 
-#ifdef CONFIG_KVM_BOOK3S_64_HANDLER
-kvmppc_skip_interrupt:
-	/*
-	 * Here all GPRs are unchanged from when the interrupt happened
-	 * except for r13, which is saved in SPRG_SCRATCH0.
-	 */
-	mfspr	r13, SPRN_SRR0
-	addi	r13, r13, 4
-	mtspr	SPRN_SRR0, r13
-	GET_SCRATCH0(r13)
-	RFI_TO_KERNEL
-	b	.
-
-kvmppc_skip_Hinterrupt:
-	/*
-	 * Here all GPRs are unchanged from when the interrupt happened
-	 * except for r13, which is saved in SPRG_SCRATCH0.
-	 */
-	mfspr	r13, SPRN_HSRR0
-	addi	r13, r13, 4
-	mtspr	SPRN_HSRR0, r13
-	GET_SCRATCH0(r13)
-	HRFI_TO_KERNEL
-	b	.
-#endif
-
 /*
  * Relocation-on interrupts: A subset of the interrupts can be delivered
  * with IR=1/DR=1, if AIL==2 and MSR.HV won't be changed by delivering
diff --git a/arch/powerpc/kvm/book3s_64_entry.S b/arch/powerpc/kvm/book3s_64_entry.S
index e9a6a8fbb164..c1276f616af4 100644
--- a/arch/powerpc/kvm/book3s_64_entry.S
+++ b/arch/powerpc/kvm/book3s_64_entry.S
@@ -1,5 +1,6 @@
 #include <asm/asm-offsets.h>
 #include <asm/cache.h>
+#include <asm/exception-64s.h>
 #include <asm/kvm_asm.h>
 #include <asm/kvm_book3s_asm.h>
 #include <asm/ppc_asm.h>
@@ -19,9 +20,12 @@ kvmppc_interrupt:
 	 * guest R12 saved in shadow VCPU SCRATCH0
 	 * guest R13 saved in SPRN_SCRATCH0
 	 */
-#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
 	std	r9,HSTATE_SCRATCH2(r13)
 	lbz	r9,HSTATE_IN_GUEST(r13)
+	cmpwi	r9,KVM_GUEST_MODE_SKIP
+	beq-	.Lmaybe_skip
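For reference, what the relocated skip handlers do can be written as
C-flavoured pseudocode (the helpers below stand in for the mfspr/mtspr/RFI
instructions in the removed hunk above; they are not real kernel
functions):

	#include <stdint.h>

	extern uint64_t mfspr_srr0(void);	/* SPRN_HSRR0 for the H variant */
	extern void mtspr_srr0(uint64_t val);
	extern uint64_t get_scratch0(void);	/* the interrupted r13 */
	extern void rfi_to_kernel(uint64_t r13);/* HRFI_TO_KERNEL for H variant */

	static void skip_interrupt_sketch(void)
	{
		/* All GPRs except r13 still hold the interrupted context. */
		mtspr_srr0(mfspr_srr0() + 4);	/* step over the trapping insn */
		rfi_to_kernel(get_scratch0());	/* restore r13 and resume */
	}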