Re: [PowerPC] today's main line failed to build on PowerPC
On Thursday 18 August 2016 11:50 AM, Abdul Haleem wrote:
> Hi,
>
> The main line stable 4.8.0-rc2 failed to build on PowerPC with the following
> build errors.
>
> config : pseries_le_defconfig
> Machine Type : PowerPC Bare Metal

My mistake, the build is failing on the attached config and not for
'pseries_le_defconfig'.

> 09:34:22 00:04:59 INFO | make -j 160 vmlinux
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S: Assembler messages:
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:353: Error: missing operand
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:612: Error: missing operand
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:670: Error: missing operand
> 09:34:24 00:05:01 ERROR| [stderr] make[1]: *** [arch/powerpc/mm/hash_low_32.o] Error 1
> 09:34:24 00:05:01 ERROR| [stderr] make[1]: *** Waiting for unfinished jobs
> 09:34:25 00:05:02 ERROR| [stderr] arch/powerpc/kernel/head_32.S: Assembler messages:
> 09:34:25 00:05:02 ERROR| [stderr] arch/powerpc/kernel/head_32.S:1113: Error: missing operand
> 09:34:25 00:05:02 ERROR| [stderr] make[1]: *** [arch/powerpc/kernel/head_32.o] Error 1
> 09:34:25 00:05:02 ERROR| [stderr] make[1]: *** Waiting for unfinished jobs
> 09:34:27 00:05:04 ERROR| [stderr] make: *** [arch/powerpc/mm] Error 2
> 09:34:27 00:05:04 ERROR| [stderr] make: *** Waiting for unfinished jobs
> 09:34:42 00:05:19 ERROR| [stderr] make: *** [arch/powerpc/kernel] Error 2
>
> Regards,
> Abdul

#
# Automatically generated file; DO NOT EDIT.
# Linux/powerpc 4.7.0-rc7 Kernel Configuration
#
CONFIG_PPC64=y
#
# Processor support
#
CONFIG_PPC_BOOK3S_64=y
# CONFIG_PPC_BOOK3E_64 is not set
CONFIG_POWER7_CPU=y
# CONFIG_POWER8_CPU is not set
CONFIG_PPC_BOOK3S=y
CONFIG_PPC_FPU=y
CONFIG_ALTIVEC=y
CONFIG_VSX=y
# CONFIG_PPC_ICSWX is not set
CONFIG_PPC_STD_MMU=y
CONFIG_PPC_STD_MMU_64=y
CONFIG_PPC_RADIX_MMU=y
CONFIG_PPC_MM_SLICES=y
CONFIG_PPC_HAVE_PMU_SUPPORT=y
CONFIG_PPC_PERF_CTRS=y
CONFIG_SMP=y
CONFIG_NR_CPUS=32
CONFIG_PPC_DOORBELL=y
# CONFIG_CPU_BIG_ENDIAN is not set
CONFIG_CPU_LITTLE_ENDIAN=y
CONFIG_PPC64_BOOT_WRAPPER=y
CONFIG_64BIT=y
CONFIG_WORD_SIZE=64
CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
CONFIG_ARCH_DMA_ADDR_T_64BIT=y
CONFIG_MMU=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
CONFIG_NR_IRQS=512
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_ARCH_HAS_ILOG2_U32=y
CONFIG_ARCH_HAS_ILOG2_U64=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK=y
CONFIG_PPC=y
CONFIG_GENERIC_CSUM=y
CONFIG_EARLY_PRINTK=y
CONFIG_PANIC_TIMEOUT=180
CONFIG_COMPAT=y
CONFIG_SYSVIPC_COMPAT=y
CONFIG_SCHED_OMIT_FRAME_POINTER=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_PPC_UDBG_16550=y
# CONFIG_GENERIC_TBSYNC is not set
CONFIG_AUDIT_ARCH=y
CONFIG_GENERIC_BUG=y
CONFIG_EPAPR_BOOT=y
# CONFIG_DEFAULT_UIMAGE is not set
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
# CONFIG_PPC_DCR_NATIVE is not set
# CONFIG_PPC_DCR_MMIO is not set
# CONFIG_PPC_OF_PLATFORM_PCI is not set
CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_ARCH_SUPPORTS_UPROBES=y
CONFIG_PPC_EMULATE_SSTEP=y
CONFIG_ZONE_DMA32=y
CONFIG_PGTABLE_LEVELS=4
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
CONFIG_IRQ_WORK=y
#
# General setup
#
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_CROSS_COMPILE=""
# CONFIG_COMPILE_TEST is not set
CONFIG_LOCALVERSION=""
CONFIG_LOCALVERSION_AUTO=y
CONFIG_DEFAULT_HOSTNAME="(none)"
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_CROSS_MEMORY_ATTACH=y
CONFIG_FHANDLE=y
# CONFIG_USELIB is not set
# CONFIG_AUDIT is not set
CONFIG_HAVE_ARCH_AUDITSYSCALL=y
#
# IRQ subsystem
#
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_IRQ_SHOW_LEVEL=y
CONFIG_IRQ_DOMAIN=y
CONFIG_GENERIC_MSI_IRQ=y
CONFIG_IRQ_DOMAIN_DEBUG=y
CONFIG_IRQ_FORCED_THREADING=y
CONFIG_SPARSE_IRQ=y
CONFIG_GENERIC_TIME_VSYSCALL_OLD=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_ARCH_HAS_TICK_BROADCAST=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
CONFIG_GENERIC_CMOS_UPDATE=y
#
# Timers subsystem
#
CONFIG_TICK_ONESHOT=y
CONFIG_NO_HZ_COMMON=y
# CONFIG_HZ_PERIODIC is not set
CONFIG_NO_HZ_IDLE=y
# CONFIG_NO_HZ_FULL is not set
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y
#
# CPU/Task time and stats accounting
#
CONFIG_VIRT_CPU_ACCOUNTING=y
# CONFIG_TICK_CPU_ACCOUNTING is not set
CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y
# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set
# CONFIG_BSD_PROCESS_ACCT is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
# CONFIG_TASK_XACCT is not set
#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
# CONFIG_RCU_EXPERT is not set
CONFIG_SRCU=y
CONFIG_TASKS_RCU=y
CONFIG_RCU_STALL_COMMON=y
CONFIG_TREE_RCU_TRACE=y
# CONFIG_RCU_EXPEDITE_BOOT is not set
CONFIG_BUILD_BIN2C=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_LOG_BUF_SHIFT=17
CONFIG_LOG_CPU_MAX_BUF_SHIFT=12
CONFIG_NMI_LOG_BUF_SHIFT=13
CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y
# CONFIG_NUMA_BALANCING is not set
CONFIG_CGROUPS=y
# C
[PATCH] cxl: use pcibios_free_controller_deferred() when removing vPHBs
When cxl removes a vPHB, it's possible that the pci_controller may be freed before all references to the devices on the vPHB have been released. This in turn causes an invalid memory access when the devices are eventually released, as pcibios_release_device() attempts to call the phb's release_device hook. In cxl_pci_vphb_remove(), remove the existing call to pcibios_free_controller(). Instead, use pcibios_free_controller_deferred() to free the pci_controller after all devices have been released. Export pci_set_host_bridge_release() so we can do this. Cc: sta...@vger.kernel.org Signed-off-by: Andrew Donnellan --- This patch requires http://patchwork.ozlabs.org/patch/658324/. It should go through the powerpc tree. --- drivers/misc/cxl/vphb.c | 10 +- drivers/pci/host-bridge.c | 1 + 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c index 7ada5f1..3519ace 100644 --- a/drivers/misc/cxl/vphb.c +++ b/drivers/misc/cxl/vphb.c @@ -230,6 +230,11 @@ int cxl_pci_vphb_add(struct cxl_afu *afu) if (phb->bus == NULL) return -ENXIO; + /* Set release hook on root bus */ + pci_set_host_bridge_release(to_pci_host_bridge(phb->bus->bridge), + pcibios_free_controller_deferred, + (void *) phb); + /* Claim resources. This might need some rework as well depending * whether we are doing probe-only or not, like assigning unassigned * resources etc... @@ -256,7 +261,10 @@ void cxl_pci_vphb_remove(struct cxl_afu *afu) afu->phb = NULL; pci_remove_root_bus(phb->bus); - pcibios_free_controller(phb); + /* +* We don't free phb here - that's handled by +* pcibios_free_controller_deferred() +*/ } static bool _cxl_pci_is_vphb_device(struct pci_controller *phb) diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c index 5f4a2e0..add6623 100644 --- a/drivers/pci/host-bridge.c +++ b/drivers/pci/host-bridge.c @@ -44,6 +44,7 @@ void pci_set_host_bridge_release(struct pci_host_bridge *bridge, bridge->release_fn = release_fn; bridge->release_data = release_data; } +EXPORT_SYMBOL_GPL(pci_set_host_bridge_release); void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region *region, struct resource *res) -- Andrew Donnellan OzLabs, ADL Canberra andrew.donnel...@au1.ibm.com IBM Australia Limited
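For context, the prerequisite patch's deferred-free helper works roughly as follows (a sketch based on its description above; verify against the final version of http://patchwork.ozlabs.org/patch/658324/):

/*
 * Sketch: the PCI core invokes the host bridge's release function only
 * once the last device reference on the bus has been dropped, so freeing
 * the pci_controller here cannot race with pcibios_release_device().
 */
void pcibios_free_controller_deferred(struct pci_host_bridge *bridge)
{
	struct pci_controller *phb = (struct pci_controller *)bridge->release_data;

	pcibios_free_controller(phb);
}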
Re: [PATCH v2 2/2] kexec: extend kexec_file_load system call
Since Eric was objecting to the extension, I think you should convince him, but I will review it from a code point of view. On 08/11/16 at 08:03pm, Thiago Jung Bauermann wrote: > From: AKASHI Takahiro > > Device tree blob must be passed to a second kernel on DTB-capable > archs, like powerpc and arm64, but the current kernel interface > lacks this support. > > This patch extends kexec_file_load system call by adding an extra > argument to this syscall so that an arbitrary number of file descriptors > can be handed out from user space to the kernel. > > long sys_kexec_file_load(int kernel_fd, int initrd_fd, > unsigned long cmdline_len, > const char __user *cmdline_ptr, > unsigned long flags, > const struct kexec_fdset __user *ufdset); > > If KEXEC_FILE_EXTRA_FDS is set to the "flags" argument, the "ufdset" > argument points to the following struct buffer: > > struct kexec_fdset { > int nr_fds; > struct kexec_file_fd fds[0]; > } > > Signed-off-by: AKASHI Takahiro > Signed-off-by: Thiago Jung Bauermann > --- > include/linux/fs.h | 1 + > include/linux/kexec.h | 7 ++-- > include/linux/syscalls.h | 4 ++- > include/uapi/linux/kexec.h | 22 > kernel/kexec_file.c | 83 > ++ > 5 files changed, 108 insertions(+), 9 deletions(-) > > diff --git a/include/linux/fs.h b/include/linux/fs.h > index 3523bf62f328..847d9c31f428 100644 > --- a/include/linux/fs.h > +++ b/include/linux/fs.h > @@ -2656,6 +2656,7 @@ extern int do_pipe_flags(int *, int); > id(MODULE, kernel-module) \ > id(KEXEC_IMAGE, kexec-image)\ > id(KEXEC_INITRAMFS, kexec-initramfs)\ > + id(KEXEC_PARTIAL_DTB, kexec-partial-dtb)\ > id(POLICY, security-policy) \ > id(MAX_ID, ) > > diff --git a/include/linux/kexec.h b/include/linux/kexec.h > index 4f85d284ed0b..29202935055d 100644 > --- a/include/linux/kexec.h > +++ b/include/linux/kexec.h > @@ -148,7 +148,10 @@ struct kexec_file_ops { > kexec_verify_sig_t *verify_sig; > #endif > }; > -#endif > + > +int __weak arch_kexec_verify_buffer(enum kexec_file_type type, const void > *buf, > + unsigned long size); > +#endif /* CONFIG_KEXEC_FILE */ > > struct kimage { > kimage_entry_t head; > @@ -280,7 +283,7 @@ extern int kexec_load_disabled; > > /* List of defined/legal kexec file flags */ > #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \ > - KEXEC_FILE_NO_INITRAMFS) > + KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_EXTRA_FDS) > > #define VMCOREINFO_BYTES (4096) > #define VMCOREINFO_NOTE_NAME "VMCOREINFO" > diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h > index d02239022bd0..fc072bdb74e3 100644 > --- a/include/linux/syscalls.h > +++ b/include/linux/syscalls.h > @@ -66,6 +66,7 @@ struct perf_event_attr; > struct file_handle; > struct sigaltstack; > union bpf_attr; > +struct kexec_fdset; > > #include > #include > @@ -321,7 +322,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, > unsigned long nr_segments, > asmlinkage long sys_kexec_file_load(int kernel_fd, int initrd_fd, > unsigned long cmdline_len, > const char __user *cmdline_ptr, > - unsigned long flags); > + unsigned long flags, > + const struct kexec_fdset __user *ufdset); > > asmlinkage long sys_exit(int error_code); > asmlinkage long sys_exit_group(int error_code); > diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h > index aae5ebf2022b..6279be79efba 100644 > --- a/include/uapi/linux/kexec.h > +++ b/include/uapi/linux/kexec.h > @@ -23,6 +23,28 @@ > #define KEXEC_FILE_UNLOAD 0x0001 > #define KEXEC_FILE_ON_CRASH 0x0002 > #define KEXEC_FILE_NO_INITRAMFS 0x0004 > +#define KEXEC_FILE_EXTRA_FDS 0x0008 > + > +enum 
kexec_file_type { > + KEXEC_FILE_TYPE_KERNEL, > + KEXEC_FILE_TYPE_INITRAMFS, > + > + /* > + * Device Tree Blob containing just the nodes and properties that > + * the kexec_file_load caller wants to add or modify. > + */ > + KEXEC_FILE_TYPE_PARTIAL_DTB, > +}; > + > +struct kexec_file_fd { > + enum kexec_file_type type; > + int fd; > +}; > + > +struct kexec_fdset { > + int nr_fds; > + struct kexec_file_fd fds[0]; > +}; > > /* These values match the ELF architecture values. > * Unless there is a good reason that should continue to be the case. > diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c > index 113af2f219b9..d6803dd884e2 100644 > --- a/kernel/kexec_file.c > +++ b/
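To make the proposed interface concrete, here is a hypothetical user-space caller (a sketch, not part of the patch: there is no glibc wrapper, and KEXEC_FILE_EXTRA_FDS, struct kexec_fdset and KEXEC_FILE_TYPE_PARTIAL_DTB exist only with this series applied):

#include <stdlib.h>
#include <sys/syscall.h>
#include <unistd.h>
#include <linux/kexec.h>

static long load_with_partial_dtb(int kernel_fd, int initrd_fd, int dtb_fd)
{
	struct kexec_fdset *fdset;
	long ret;

	/* one extra descriptor: a partial DTB to merge into the base one */
	fdset = malloc(sizeof(*fdset) + sizeof(struct kexec_file_fd));
	if (!fdset)
		return -1;
	fdset->nr_fds = 1;
	fdset->fds[0].type = KEXEC_FILE_TYPE_PARTIAL_DTB;
	fdset->fds[0].fd = dtb_fd;

	ret = syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
		      0UL, NULL, KEXEC_FILE_EXTRA_FDS, fdset);
	free(fdset);
	return ret;
}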
Re: debug problems on ppc 83xx target due to changed struct task_struct
On 17/08/2016 at 17:27, Holger Brunck wrote:
> On 16/08/16 19:27, christophe leroy wrote:
>> On 15/08/2016 at 18:19, Dave Hansen wrote:
>>> On 08/15/2016 07:35 AM, Holger Brunck wrote:
>>>> I tried this but unfortunately the error only occurs while remote
>>>> debugging. Locally with gdb everything works fine. BTW we double-checked
>>>> with an 85xx ppc target which is also 32-bit and it ends up with the
>>>> same behaviour.
>>>>
>>>> I was also investigating where I have to move the line in the struct
>>>> task_struct and it turns out to be like this (diff to 4.7 kernel):
>>>>
>>>> diff --git a/include/linux/sched.h b/include/linux/sched.h
>>>> index 253538f..4868874 100644
>>>> --- a/include/linux/sched.h
>>>> +++ b/include/linux/sched.h
>>>> @@ -1655,7 +1655,9 @@ struct task_struct {
>>>>  	struct signal_struct *signal;
>>>>  	struct sighand_struct *sighand;
>>>> +	// struct thread_struct thread; // until here everything is fine
>>>>  	sigset_t blocked, real_blocked;
>>>> +	struct thread_struct thread; // from here it's broken
>>>>  	sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
>>>>  	struct sigpending pending;
>>>
>>> Wow, thanks for all the debugging here! So, we know it has to do with
>>> signals, thread_info, and probably only affects 32-bit powerpc. Seems
>>> awfully weird. Have you checked with any of the 64-bit powerpc guys to
>>> see if they have any ideas?
>>>
>>> I went grepping around for a bit. Where is the task_struct stored? Is
>>> it on-stack on ppc32 or something? The thread_info is, I assume, but I
>>> see some THREAD_INFO vs. THREAD (thread struct) math happening in here,
>>> which confuses me:
>>>
>>> 	.globl	ret_from_debug_exc
>>> ret_from_debug_exc:
>>> 	mfspr	r9,SPRN_SPRG_THREAD
>>> 	lwz	r10,SAVED_KSP_LIMIT(r1)
>>> 	stw	r10,KSP_LIMIT(r9)
>>> 	lwz	r9,THREAD_INFO-THREAD(r9)
>>> 	CURRENT_THREAD_INFO(r10, r1)
>>> 	lwz	r10,TI_PREEMPT(r10)
>>> 	stw	r10,TI_PREEMPT(r9)
>>> 	RESTORE_xSRR(SRR0,SRR1);
>>> 	RESTORE_xSRR(CSRR0,CSRR1);
>>> 	RESTORE_MMU_REGS;
>>> 	RET_FROM_EXC_LEVEL(SPRN_DSRR0, SPRN_DSRR1, PPC_RFDI)
>>>
>>> But, I'm really at a loss to explain this. It still seems like a deeply
>>> ppc-specific issue. We can obviously work around it with an #ifdef for
>>> your platform, but that's awfully hackish and hides the real bug,
>>> whatever it is. My suspicion is that there's a bug in the 32-bit ppc
>>> assembly somewhere. I don't see any references to 'blocked' or
>>> 'real_blocked' in assembly though. You could add a bunch of padding
>>> instead of moving the thread_struct and see if that does anything, but
>>> that's really a stab in the dark.
>>
>> Just to let you know, I'm not sure it is the same issue, but I also get
>> my 8xx target stuck when I try to use gdbserver.
>>
>> If I debug a very small app, it gets stuck quickly after the app has
>> stopped: indeed, the console seems ok but as soon as I try to execute
>> something simple, like a ps or top, it gets stuck. The target still
>> responds to pings, but nothing else.
>>
>> If I debug a big app, it gets stuck soon after the start of debug: I set
>> a bpoint at main(), do a 'continue', get breaked at main(), do some steps
>> with 'next' then it gets stuck.
>>
>> I have tried moving the struct thread_struct thread but it has no impact.
>
> that sounds a bit different to what I see. Is your program also
> multi-threaded?

No, my program is a simple app doing a few printf("Hello World !"); and
nothing more.

I have now identified the issue, it is most likely specific to the 8xx:
when entering single step exception, the 8xx asserts the internal Freeze
which stops the decrementer and the timebase. In order to clear the
internal Freeze, the ICR SPR has to be read.

I'll now be able to check with your program and see how it behaves.

Christophe
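As a reference for the THREAD_INFO vs. THREAD arithmetic Dave quotes above, a sketch of the relevant offsets (as defined in arch/powerpc/kernel/asm-offsets.c in this era; worth double-checking against the tree):

/* arch/powerpc/kernel/asm-offsets.c */
DEFINE(THREAD, offsetof(struct task_struct, thread));
DEFINE(THREAD_INFO, offsetof(struct task_struct, stack));

/*
 * SPRN_SPRG_THREAD holds &task->thread, so with r9 = &task->thread:
 *
 *	lwz	r9,THREAD_INFO-THREAD(r9)
 *
 * loads task->stack (where the thread_info lives on ppc32), because
 * &task->thread + (THREAD_INFO - THREAD) == (char *)task + THREAD_INFO.
 */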
Re: [PATCH v2 1/2] kexec: add dtb info to struct kimage
On 08/11/16 at 08:03pm, Thiago Jung Bauermann wrote: > From: AKASHI Takahiro > > Device tree blob must be passed to a second kernel on DTB-capable > archs, like powerpc and arm64, but the current kernel interface > lacks this support. > > This patch adds dtb buffer information to struct kimage. > When users don't specify dtb explicitly and the one used for the current > kernel can be re-used, this change will be good enough for implementing > kexec_file_load feature. > > Signed-off-by: AKASHI Takahiro > --- > include/linux/kexec.h | 3 +++ > kernel/kexec_file.c | 3 +++ > 2 files changed, 6 insertions(+) > > diff --git a/include/linux/kexec.h b/include/linux/kexec.h > index d743baaa..4f85d284ed0b 100644 > --- a/include/linux/kexec.h > +++ b/include/linux/kexec.h > @@ -192,6 +192,9 @@ struct kimage { > char *cmdline_buf; > unsigned long cmdline_buf_len; > > + void *dtb_buf; > + unsigned long dtb_buf_len; > + > /* File operations provided by image loader */ > struct kexec_file_ops *fops; > > diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c > index 503bc2d348e5..113af2f219b9 100644 > --- a/kernel/kexec_file.c > +++ b/kernel/kexec_file.c > @@ -92,6 +92,9 @@ void kimage_file_post_load_cleanup(struct kimage *image) > vfree(image->initrd_buf); > image->initrd_buf = NULL; > > + vfree(image->dtb_buf); > + image->dtb_buf = NULL; > + > kfree(image->cmdline_buf); > image->cmdline_buf = NULL; > > -- > 1.9.1 > > > ___ > kexec mailing list > ke...@lists.infradead.org > http://lists.infradead.org/mailman/listinfo/kexec Acked-by: Dave Young Thanks Dave
Re: [PATCH v2] powerpc: move hmi.c to arch/powerpc/kvm/
On 11/08/2016 15:07, Paolo Bonzini wrote: > hmi.c functions are unused unless sibling_subcore_state is nonzero, and > that in turn happens only if KVM is in use. So move the code to > arch/powerpc/kvm/, putting it under CONFIG_KVM_BOOK3S_HV_POSSIBLE > rather than CONFIG_PPC_BOOK3S_64. The sibling_subcore_state is also > included in struct paca_struct only if KVM is supported by the kernel. > > Cc: Daniel Axtens > Cc: Michael Ellerman > Cc: Mahesh Salgaonkar > Cc: Paul Mackerras > Cc: linuxppc-dev@lists.ozlabs.org > Cc: kvm-...@vger.kernel.org > Cc: k...@vger.kernel.org > Signed-off-by: Paolo Bonzini > --- > v1->v2: use CONFIG_KVM_BOOK3S_HV_POSSIBLE, not > CONFIG_KVM_BOOK3S_64_HANDLER. The former implies > the latter, but the reverse is not true. > > arch/powerpc/include/asm/hmi.h | 2 +- > arch/powerpc/include/asm/paca.h| 12 +++- > arch/powerpc/kernel/Makefile | 2 +- > arch/powerpc/kvm/Makefile | 1 + > arch/powerpc/{kernel/hmi.c => kvm/book3s_hv_hmi.c} | 0 > 5 files changed, 10 insertions(+), 7 deletions(-) > rename arch/powerpc/{kernel/hmi.c => kvm/book3s_hv_hmi.c} (100%) > > diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h > index 88b4901ac4ee..85b7a1a21e22 100644 > --- a/arch/powerpc/include/asm/hmi.h > +++ b/arch/powerpc/include/asm/hmi.h > @@ -21,7 +21,7 @@ > #ifndef __ASM_PPC64_HMI_H__ > #define __ASM_PPC64_HMI_H__ > > -#ifdef CONFIG_PPC_BOOK3S_64 > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > > #define CORE_TB_RESYNC_REQ_BIT 63 > #define MAX_SUBCORE_PER_CORE 4 > diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h > index 148303e7771f..6a6792bb39fb 100644 > --- a/arch/powerpc/include/asm/paca.h > +++ b/arch/powerpc/include/asm/paca.h > @@ -183,11 +183,6 @@ struct paca_struct { >*/ > u16 in_mce; > u8 hmi_event_available; /* HMI event is available */ > - /* > - * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for > - * more details > - */ > - struct sibling_subcore_state *sibling_subcore_state; > #endif > > /* Stuff for accurate time accounting */ > @@ -202,6 +197,13 @@ struct paca_struct { > struct kvmppc_book3s_shadow_vcpu shadow_vcpu; > #endif > struct kvmppc_host_state kvm_hstate; > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > + /* > + * Bitmap for sibling subcore status. 
See kvm/book3s_hv_ras.c for > + * more details > + */ > + struct sibling_subcore_state *sibling_subcore_state; > +#endif > #endif > }; > > diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile > index b2027a5cf508..fe4c075bcf50 100644 > --- a/arch/powerpc/kernel/Makefile > +++ b/arch/powerpc/kernel/Makefile > @@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32)+= vdso32/ > obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o > obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_ppc970.o cpu_setup_pa6t.o > obj-$(CONFIG_PPC_BOOK3S_64) += cpu_setup_power.o > -obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o hmi.o > +obj-$(CONFIG_PPC_BOOK3S_64) += mce.o mce_power.o > obj-$(CONFIG_PPC_BOOK3E_64) += exceptions-64e.o idle_book3e.o > obj-$(CONFIG_PPC64) += vdso64/ > obj-$(CONFIG_ALTIVEC)+= vecemu.o > diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile > index 1f9e5529e692..855d4b95d752 100644 > --- a/arch/powerpc/kvm/Makefile > +++ b/arch/powerpc/kvm/Makefile > @@ -78,6 +78,7 @@ kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \ > > ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ > + book3s_hv_hmi.o \ > book3s_hv_rmhandlers.o \ > book3s_hv_rm_mmu.o \ > book3s_hv_ras.o \ > diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c > similarity index 100% > rename from arch/powerpc/kernel/hmi.c > rename to arch/powerpc/kvm/book3s_hv_hmi.c > Ping? Paolo
Re: [PATCH v2 3/6] kexec_file: Allow skipping checksum calculation for some segments.
On 08/13/16 at 12:18am, Thiago Jung Bauermann wrote: > Adds checksum argument to kexec_add_buffer specifying whether the given > segment should be part of the checksum calculation. > Since it is used with kexec_add_buffer, could it be added to kbuf as a new field? Something like kbuf.no_checksum: the default value 0 would mean a checksum is needed, and 1 would mean no checksum is taken. > The next patch will add a way to update segments after a kimage is loaded. > Segments that will be updated in this way should not be checksummed, > otherwise they will cause the purgatory checksum verification to fail > when the machine is rebooted. > > As a bonus, we don't need to special-case the purgatory segment anymore > to avoid checksumming it. > > Adjust call sites for the new argument. > > Signed-off-by: Thiago Jung Bauermann > --- > arch/powerpc/kernel/kexec_elf_64.c | 6 +++--- > arch/x86/kernel/crash.c | 4 ++-- > arch/x86/kernel/kexec-bzimage64.c | 6 +++--- > include/linux/kexec.h | 10 +++--- > kernel/kexec_file.c | 23 --- > 5 files changed, 27 insertions(+), 22 deletions(-) > > diff --git a/arch/powerpc/kernel/kexec_elf_64.c > b/arch/powerpc/kernel/kexec_elf_64.c > index 22afc7b5ee73..4c528c81b076 100644 > --- a/arch/powerpc/kernel/kexec_elf_64.c > +++ b/arch/powerpc/kernel/kexec_elf_64.c > @@ -128,7 +128,7 @@ static int elf_exec_load(struct kimage *image, struct > elfhdr *ehdr, > kbuf.memsz = phdr->p_memsz; > kbuf.buf_align = phdr->p_align; > kbuf.buf_min = phdr->p_paddr + base; > - ret = kexec_add_buffer(&kbuf); > + ret = kexec_add_buffer(&kbuf, true); > if (ret) > goto out; > load_addr = kbuf.mem; > @@ -188,7 +188,7 @@ void *elf64_load(struct kimage *image, char *kernel_buf, > kbuf.bufsz = kbuf.memsz = initrd_len; > kbuf.buf_align = PAGE_SIZE; > kbuf.top_down = false; > - ret = kexec_add_buffer(&kbuf); > + ret = kexec_add_buffer(&kbuf, true); > if (ret) > goto out; > initrd_load_addr = kbuf.mem; > @@ -245,7 +245,7 @@ void *elf64_load(struct kimage *image, char *kernel_buf, > kbuf.bufsz = kbuf.memsz = fdt_size; > kbuf.buf_align = PAGE_SIZE; > kbuf.top_down = true; > - ret = kexec_add_buffer(&kbuf); > + ret = kexec_add_buffer(&kbuf, true); > if (ret) > goto out; > fdt_load_addr = kbuf.mem; > diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c > index 38a1cdf6aa05..634ab16377b1 100644 > --- a/arch/x86/kernel/crash.c > +++ b/arch/x86/kernel/crash.c > @@ -642,7 +642,7 @@ int crash_load_segments(struct kimage *image) > * copied in purgatory after crash. Just add a zero filled > * segment for now to make sure checksum logic works fine. 
>*/ > - ret = kexec_add_buffer(&kbuf); > + ret = kexec_add_buffer(&kbuf, true); > if (ret) > return ret; > image->arch.backup_load_addr = kbuf.mem; > @@ -661,7 +661,7 @@ int crash_load_segments(struct kimage *image) > > kbuf.memsz = kbuf.bufsz; > kbuf.buf_align = ELF_CORE_HEADER_ALIGN; > - ret = kexec_add_buffer(&kbuf); > + ret = kexec_add_buffer(&kbuf, true); > if (ret) { > vfree((void *)image->arch.elf_headers); > return ret; > diff --git a/arch/x86/kernel/kexec-bzimage64.c > b/arch/x86/kernel/kexec-bzimage64.c > index 4b3a75329fb6..a46e3fbb0639 100644 > --- a/arch/x86/kernel/kexec-bzimage64.c > +++ b/arch/x86/kernel/kexec-bzimage64.c > @@ -422,7 +422,7 @@ static void *bzImage64_load(struct kimage *image, char > *kernel, > kbuf.memsz = kbuf.bufsz; > kbuf.buf_align = 16; > kbuf.buf_min = MIN_BOOTPARAM_ADDR; > - ret = kexec_add_buffer(&kbuf); > + ret = kexec_add_buffer(&kbuf, true); > if (ret) > goto out_free_params; > bootparam_load_addr = kbuf.mem; > @@ -435,7 +435,7 @@ static void *bzImage64_load(struct kimage *image, char > *kernel, > kbuf.memsz = PAGE_ALIGN(header->init_size); > kbuf.buf_align = header->kernel_alignment; > kbuf.buf_min = MIN_KERNEL_LOAD_ADDR; > - ret = kexec_add_buffer(&kbuf); > + ret = kexec_add_buffer(&kbuf, true); > if (ret) > goto out_free_params; > kernel_load_addr = kbuf.mem; > @@ -449,7 +449,7 @@ static void *bzImage64_load(struct kimage *image, char > *kernel, > kbuf.bufsz = kbuf.memsz = initrd_len; > kbuf.buf_align = PAGE_SIZE; > kbuf.buf_min = MIN_INITRD_LOAD_ADDR; > - ret = kexec_add_buffer(&kbuf); > + ret = kexec_add_buffer(&kbuf, true); > if (ret) > goto out_free_params; > initrd_load_addr = kbuf.mem; > dif
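To illustrate the suggestion, a sketch of the alternative shape (the no_checksum field is hypothetical, not an existing kexec_buf member):

	struct kexec_buf kbuf = { .image = image };

	kbuf.buffer = buf;
	kbuf.bufsz = kbuf.memsz = bufsz;
	kbuf.buf_align = PAGE_SIZE;
	kbuf.no_checksum = true;	/* hypothetical: segment is updated post-load */
	ret = kexec_add_buffer(&kbuf);	/* signature stays unchanged */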
[PATCH] powerpc/8xx: fix single_step debug
SPRN_ICR must be read for clearing the internal freeze signal which is asserted by the single step exception, otherwise the timebase and decrementer remain freezed Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/reg_8xx.h | 1 + arch/powerpc/kernel/traps.c| 8 2 files changed, 9 insertions(+) diff --git a/arch/powerpc/include/asm/reg_8xx.h b/arch/powerpc/include/asm/reg_8xx.h index feaf641..6dae71f 100644 --- a/arch/powerpc/include/asm/reg_8xx.h +++ b/arch/powerpc/include/asm/reg_8xx.h @@ -17,6 +17,7 @@ #define SPRN_DC_DAT570 /* Read-only data register */ /* Misc Debug */ +#define SPRN_ICR 148 #define SPRN_DPDR 630 #define SPRN_MI_CAM816 #define SPRN_MI_RAM0 817 diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c index 2cb5892..0f1f0ce 100644 --- a/arch/powerpc/kernel/traps.c +++ b/arch/powerpc/kernel/traps.c @@ -400,8 +400,16 @@ static inline int check_io_access(struct pt_regs *regs) #define REASON_TRAP0x2 #define single_stepping(regs) ((regs)->msr & MSR_SE) +#ifdef CONFIG_PPC_8xx +static inline void clear_single_step(struct pt_regs *regs) +{ + regs->msr &= ~MSR_SE; + mfspr(SPRN_ICR); +} +#else #define clear_single_step(regs)((regs)->msr &= ~MSR_SE) #endif +#endif #if defined(CONFIG_4xx) int machine_check_4xx(struct pt_regs *regs) -- 2.1.0
Re: [PATCH] powerpc/8xx: fix single_step debug
On Thu, Aug 18, 2016 at 11:44:20AM +0200, Christophe Leroy wrote:
> SPRN_ICR must be read for clearing the internal freeze signal which
> is asserted by the single step exception, otherwise the timebase and
> decrementer remain freezed

Minor nit: s/freezed/frozen/

If the timebase and decrementer are frozen even for a few cycles, this
probably upsets timekeeping. I consider this a completely stupid design
decision, and maybe I'm not alone.

Gabriel
Re: [PATCH] powerpc/8xx: fix single_step debug
On 18/08/2016 at 11:58, Gabriel Paubert wrote:
> On Thu, Aug 18, 2016 at 11:44:20AM +0200, Christophe Leroy wrote:
>> SPRN_ICR must be read for clearing the internal freeze signal which
>> is asserted by the single step exception, otherwise the timebase and
>> decrementer remain freezed
>
> Minor nit: s/freezed/frozen/
>
> If the timebase and decrementer are frozen even for a few cycles, this
> probably upsets timekeeping. I consider this a completely stupid design
> decision, and maybe I'm not alone.
>
> Gabriel

We could also unset TBF bit (TimeBase Freeze enable) in TBSCR register
(today it is set in arch/powerpc/platforms/8xx/m8xx_setup.c) but then it
would impact debug done with an external BDM system which expects the
decrementer and TB frozen when it freezes the execution.

Christophe
Re: [PATCH] powerpc/8xx: fix single_step debug
On Thu, Aug 18, 2016 at 12:13:21PM +0200, Christophe Leroy wrote:
> On 18/08/2016 at 11:58, Gabriel Paubert wrote:
>> On Thu, Aug 18, 2016 at 11:44:20AM +0200, Christophe Leroy wrote:
>>> SPRN_ICR must be read for clearing the internal freeze signal which
>>> is asserted by the single step exception, otherwise the timebase and
>>> decrementer remain freezed
>>
>> Minor nit: s/freezed/frozen/
>>
>> If the timebase and decrementer are frozen even for a few cycles, this
>> probably upsets timekeeping. I consider this a completely stupid design
>> decision, and maybe I'm not alone.
>>
>> Gabriel
>
> We could also unset TBF bit (TimeBase Freeze enable) in TBSCR register
> (today it is set in arch/powerpc/platforms/8xx/m8xx_setup.c) but then it
> would impact debug done with an external BDM system which expects the
> decrementer and TB frozen when it freezes the execution.

Ok, I believe that systematically setting it is a mistake, but then I'm
always a bit nervous about screwing up timekeeping (it certainly is always
a very bad idea when you are driving telescopes).

Gabriel
Re: [PATCH v2 0/2] extend kexec_file_load system call
On Thu, Aug 11, 2016 at 08:03:56PM -0300, Thiago Jung Bauermann wrote: > This patch series is from AKASHI Takahiro. I will use it in my next > version of the kexec_file_load implementation for powerpc, so I am > rebasing it on top of v4.8-rc1. [...] > Original cover letter: > > Device tree blob must be passed to a second kernel on DTB-capable > archs, like powerpc and arm64, but the current kernel interface > lacks this support. > > This patch extends kexec_file_load system call by adding an extra > argument to this syscall so that an arbitrary number of file descriptors > can be handed out from user space to the kernel. > > See the background [1]. > > Please note that the new interface looks quite similar to the current > system call, but that it won't always mean that it provides the "binary > compatibility." > > [1] http://lists.infradead.org/pipermail/kexec/2016-June/016276.html As with the original posting, I have a number of concerns, and I'm really not keen on this. * For typical usecases, I do not believe that this is necessary (at least for arm64), and generally do not believe that it should be necessary for a user to manipulate the DTB (much like the user need not manipulate ACPI tables or other FW data structures). Other than (potentially) the case of Linux as a flashed-in bootloader, I don't see a compelling case for modifying the DTB that could not be accomplished in-kernel. For that case, if truly necessary, I think that we can get away with something simpler. * This series adds architecture-specific hooks, but doesn't define what the architecture code is expected to do. For example, what is the format of the partial DTB? Is it formatted as an overlay, or a regular DTB that is expected to be merged somehow? I'm afraid that the scope is unbound, and we'll see requests to whitelist/blacklist arbitrary nodes or properties in arch code. This goes against the original simple design of kexec_file_load. It also implies that we're going to have varied architecture-specific semantics, and that arch code might not consistently check all that it should. * Further, I believe that this offers a lot of scope for unintentionally allowing certain modifications to the DTB that we do not want, and avoiding that in general is very tricky. e.g. if we allow the insertion or modification of nodes, how do we prevent phandle target hijacking? I really don't think that this is a good idea. Please consider this a NAK from my end. Thanks, Mark.
Re: [PATCH] powerpc/8xx: fix single_step debug
On 18/08/2016 at 12:16, Gabriel Paubert wrote:
> On Thu, Aug 18, 2016 at 12:13:21PM +0200, Christophe Leroy wrote:
>> We could also unset TBF bit (TimeBase Freeze enable) in TBSCR register
>> (today it is set in arch/powerpc/platforms/8xx/m8xx_setup.c) but then it
>> would impact debug done with an external BDM system which expects the
>> decrementer and TB frozen when it freezes the execution.
>
> Ok, I believe that systematically setting it is a mistake, but then I'm
> always a bit nervous about screwing up timekeeping (it certainly is always
> a very bad idea when you are driving telescopes).

Indeed you are right, this should not happen. The issue is due to the fact
that the bootloader set the TRE bit in the DER register. So the fix is to
be done in the bootloader.

Christophe
[PATCH 15/16] powerpc: powermac: Convert to hotplug state machine
Install the callbacks via the state machine. I assume here that the powermac has two CPUs and so only one can go up or down at a time. The variable smp_core99_host_open is here to ensure that we do not try to open or close the i2c host twice if something goes wrong and we invoke the prepare or online callback twice due to rollback. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Sebastian Andrzej Siewior --- arch/powerpc/platforms/powermac/smp.c | 50 +-- include/linux/cpuhotplug.h| 1 + 2 files changed, 26 insertions(+), 25 deletions(-) diff --git a/arch/powerpc/platforms/powermac/smp.c b/arch/powerpc/platforms/powermac/smp.c index 834868b9fdc9..4a853323f906 100644 --- a/arch/powerpc/platforms/powermac/smp.c +++ b/arch/powerpc/platforms/powermac/smp.c @@ -852,37 +852,33 @@ static void smp_core99_setup_cpu(int cpu_nr) #ifdef CONFIG_PPC64 #ifdef CONFIG_HOTPLUG_CPU -static int smp_core99_cpu_notify(struct notifier_block *self, -unsigned long action, void *hcpu) +static unsigned int smp_core99_host_open; + +static int smp_core99_cpu_prepare(unsigned int cpu) { int rc; - switch(action & ~CPU_TASKS_FROZEN) { - case CPU_UP_PREPARE: - /* Open i2c bus if it was used for tb sync */ - if (pmac_tb_clock_chip_host) { - rc = pmac_i2c_open(pmac_tb_clock_chip_host, 1); - if (rc) { - pr_err("Failed to open i2c bus for time sync\n"); - return notifier_from_errno(rc); - } + /* Open i2c bus if it was used for tb sync */ + if (pmac_tb_clock_chip_host && !smp_core99_host_open) { + rc = pmac_i2c_open(pmac_tb_clock_chip_host, 1); + if (rc) { + pr_err("Failed to open i2c bus for time sync\n"); + return notifier_from_errno(rc); } - break; - case CPU_ONLINE: - case CPU_UP_CANCELED: - /* Close i2c bus if it was used for tb sync */ - if (pmac_tb_clock_chip_host) - pmac_i2c_close(pmac_tb_clock_chip_host); - break; - default: - break; + smp_core99_host_open = 1; } - return NOTIFY_OK; + return 0; } -static struct notifier_block smp_core99_cpu_nb = { - .notifier_call = smp_core99_cpu_notify, -}; +static int smp_core99_cpu_online(unsigned int cpu) +{ + /* Close i2c bus if it was used for tb sync */ + if (pmac_tb_clock_chip_host && smp_core99_host_open) { + pmac_i2c_close(pmac_tb_clock_chip_host); + smp_core99_host_open = 0; + } + return 0; +} #endif /* CONFIG_HOTPLUG_CPU */ static void __init smp_core99_bringup_done(void) @@ -902,7 +898,11 @@ static void __init smp_core99_bringup_done(void) g5_phy_disable_cpu1(); } #ifdef CONFIG_HOTPLUG_CPU - register_cpu_notifier(&smp_core99_cpu_nb); + cpuhp_setup_state_nocalls(CPUHP_POWER_PMAC_PREPARE, + "POWER_PMAC_PREPARE", smp_core99_cpu_prepare, + NULL); + cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "AP_POWER_PMAC_ONLINE", + smp_core99_cpu_online, NULL); #endif if (ppc_md.progress) diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 9e50a7b3bbcd..4974c9fdbf9a 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -32,6 +32,7 @@ enum cpuhp_state { CPUHP_RCUTREE_PREP, CPUHP_MD_RAID5_PREPARE, CPUHP_CPUIDLE_COUPLED_PREPARE, + CPUHP_POWER_PMAC_PREPARE, CPUHP_NOTIFY_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, -- 2.9.3
[PATCH 16/16] powerpc: mmu nohash: Convert to hotplug state machine
Install the callbacks via the state machine. Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Michael Ellerman Cc: linuxppc-dev@lists.ozlabs.org Signed-off-by: Sebastian Andrzej Siewior --- arch/powerpc/mm/mmu_context_nohash.c | 54 +++- include/linux/cpuhotplug.h | 1 + 2 files changed, 24 insertions(+), 31 deletions(-) diff --git a/arch/powerpc/mm/mmu_context_nohash.c b/arch/powerpc/mm/mmu_context_nohash.c index 7d95bc402dba..f7755fcbe1f8 100644 --- a/arch/powerpc/mm/mmu_context_nohash.c +++ b/arch/powerpc/mm/mmu_context_nohash.c @@ -369,44 +369,34 @@ void destroy_context(struct mm_struct *mm) } #ifdef CONFIG_SMP - -static int mmu_context_cpu_notify(struct notifier_block *self, - unsigned long action, void *hcpu) +static int mmu_ctx_cpu_prepare(unsigned int cpu) { - unsigned int cpu = (unsigned int)(long)hcpu; - /* We don't touch CPU 0 map, it's allocated at aboot and kept * around forever */ if (cpu == boot_cpuid) - return NOTIFY_OK; + return 0; - switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: - pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); - stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); - break; -#ifdef CONFIG_HOTPLUG_CPU - case CPU_UP_CANCELED: - case CPU_UP_CANCELED_FROZEN: - case CPU_DEAD: - case CPU_DEAD_FROZEN: - pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); - kfree(stale_map[cpu]); - stale_map[cpu] = NULL; - - /* We also clear the cpu_vm_mask bits of CPUs going away */ - clear_tasks_mm_cpumask(cpu); - break; -#endif /* CONFIG_HOTPLUG_CPU */ - } - return NOTIFY_OK; + pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu); + stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL); + return 0; } -static struct notifier_block mmu_context_cpu_nb = { - .notifier_call = mmu_context_cpu_notify, -}; +static int mmu_ctx_cpu_dead(unsigned int cpu) +{ +#ifdef CONFIG_HOTPLUG_CPU + if (cpu == boot_cpuid) + return 0; + + pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu); + kfree(stale_map[cpu]); + stale_map[cpu] = NULL; + + /* We also clear the cpu_vm_mask bits of CPUs going away */ + clear_tasks_mm_cpumask(cpu); +#endif + return 0; +} #endif /* CONFIG_SMP */ @@ -469,7 +459,9 @@ void __init mmu_context_init(void) #else stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0); - register_cpu_notifier(&mmu_context_cpu_nb); + cpuhp_setup_state_nocalls(CPUHP_POWER_MMU_CTX_PREPARE, + "POWER_MMU_CTX_PREPARE", mmu_ctx_cpu_prepare, + mmu_ctx_cpu_dead); #endif printk(KERN_INFO diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h index 4974c9fdbf9a..92b9cf3271b2 100644 --- a/include/linux/cpuhotplug.h +++ b/include/linux/cpuhotplug.h @@ -33,6 +33,7 @@ enum cpuhp_state { CPUHP_MD_RAID5_PREPARE, CPUHP_CPUIDLE_COUPLED_PREPARE, CPUHP_POWER_PMAC_PREPARE, + CPUHP_POWER_MMU_CTX_PREPARE, CPUHP_NOTIFY_PREPARE, CPUHP_TIMERS_DEAD, CPUHP_BRINGUP_CPU, -- 2.9.3
Re: [PowerPC] today's main line failed to build on PowerPC
On Thu, Aug 18, 2016 at 11:50:28AM +0530, Abdul Haleem wrote:
> Hi,
>
> The main line stable 4.8.0-rc2 failed to build on PowerPC with the following
> build errors.
>
> config : pseries_le_defconfig
> Machine Type : PowerPC Bare Metal
>
> 09:34:22 00:04:59 INFO | make -j 160 vmlinux
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S: Assembler messages:
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:353: Error: missing operand
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:612: Error: missing operand
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:670: Error: missing operand
> 09:34:24 00:05:01 ERROR| [stderr] make[1]: *** [arch/powerpc/mm/hash_low_32.o] Error 1
> 09:34:24 00:05:01 ERROR| [stderr] make[1]: *** Waiting for unfinished jobs
> 09:34:25 00:05:02 ERROR| [stderr] arch/powerpc/kernel/head_32.S: Assembler messages:
> 09:34:25 00:05:02 ERROR| [stderr] arch/powerpc/kernel/head_32.S:1113: Error: missing operand
> 09:34:25 00:05:02 ERROR| [stderr] make[1]: *** [arch/powerpc/kernel/head_32.o] Error 1
> 09:34:25 00:05:02 ERROR| [stderr] make[1]: *** Waiting for unfinished jobs
> 09:34:27 00:05:04 ERROR| [stderr] make: *** [arch/powerpc/mm] Error 2
> 09:34:27 00:05:04 ERROR| [stderr] make: *** Waiting for unfinished jobs
> 09:34:42 00:05:19 ERROR| [stderr] make: *** [arch/powerpc/kernel] Error 2
>

Sounds like the assembler could not build the 32 bit assembly files.
Has the build succeeded with the same compiler/toolchain and options before?

> Regards,
> Abdul
>
[PATCH V4 2/8] powerpc/memory: Parse new memory property to register blocks.
powerpc/memory: Add parallel routines to parse the new property "ibm,dynamic-memory-v2" property when it is present, and then to register the relevant memory blocks with the operating system. This property format is intended to provide a more compact representation of memory when communicating with the front end processor, especially when describing vast amounts of RAM. [V4: Move a couple of function prototypes from header file to a later patch where they will be used.] [V4: Add some comments.] [V4: Change a property check to scan actual device tree.] [V4: Compress some common code.] Signed-off-by: Michael Bringmann --- diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 7f436ba..b9a1534 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -69,6 +69,8 @@ struct boot_param_header { * OF address retreival & translation */ +extern int n_mem_addr_cells; + /* Parse the ibm,dma-window property of an OF node into the busno, phys and * size parameters. */ @@ -81,8 +83,9 @@ extern void of_instantiate_rtc(void); extern int of_get_ibm_chip_id(struct device_node *np); /* The of_drconf_cell struct defines the layout of the LMB array - * specified in the device tree property - * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory + * specified in the device tree properties, + * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory + * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory-v2 */ struct of_drconf_cell { u64 base_addr; @@ -92,9 +95,20 @@ struct of_drconf_cell { u32 flags; }; -#define DRCONF_MEM_ASSIGNED0x0008 -#define DRCONF_MEM_AI_INVALID 0x0040 -#define DRCONF_MEM_RESERVED0x0080 +#define DRCONF_MEM_ASSIGNED0x0008 +#define DRCONF_MEM_AI_INVALID 0x0040 +#define DRCONF_MEM_RESERVED0x0080 + +struct of_drconf_cell_v2 { + u32 num_seq_lmbs; + u64 base_addr; + u32 drc_index; + u32 aa_index; + u32 flags; +} __attribute__((packed)); + +extern void read_drconf_cell_v2(struct of_drconf_cell_v2 *drmem, + const __be32 **cellp); /* * There are two methods for telling firmware what our capabilities are. diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 669a15e..ad294ce 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -57,8 +57,10 @@ EXPORT_SYMBOL(node_data); static int min_common_depth; -static int n_mem_addr_cells, n_mem_size_cells; +int n_mem_addr_cells; +static int n_mem_size_cells; static int form1_affinity; +EXPORT_SYMBOL(n_mem_addr_cells); #define MAX_DISTANCE_REF_POINTS 4 static int distance_ref_points_depth; @@ -405,6 +405,24 @@ static void read_drconf_cell(struct of_drconf_cell *drmem, const __be32 **cellp) *cellp = cp + 4; } + + /* + * Retrieve and validate the ibm,dynamic-memory property of the device tree. + * Read the next memory block set entry from the ibm,dynamic-memory-v2 property + * and return the information in the provided of_drconf_cell_v2 structure. + */ +void read_drconf_cell_v2(struct of_drconf_cell_v2 *drmem, const __be32 **cellp) +{ + const __be32 *cp = (const __be32 *)*cellp; + drmem->num_seq_lmbs = be32_to_cpu(*cp++); + drmem->base_addr = read_n_cells(n_mem_addr_cells, &cp); + drmem->drc_index = be32_to_cpu(*cp++); + drmem->aa_index = be32_to_cpu(*cp++); + drmem->flags = be32_to_cpu(*cp++); + + *cellp = cp; +} +EXPORT_SYMBOL(read_drconf_cell_v2); /* * Retrieve and validate the ibm,dynamic-memory property of the device tree. 
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c index b0245be..51330bc 100644 --- a/arch/powerpc/kernel/prom.c +++ b/arch/powerpc/kernel/prom.c @@ -443,23 +443,34 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned long node, #ifdef CONFIG_PPC_PSERIES /* - * Interpret the ibm,dynamic-memory property in the - * /ibm,dynamic-reconfiguration-memory node. + * Retrieve and validate the ibm,lmb-size property for drconf memory + * from the flattened device tree. + */ +static u64 __init get_lmb_size(unsigned long node) +{ + const __be32 *ls; + int len; + ls = of_get_flat_dt_prop(node, "ibm,lmb-size", &len); + if (!ls || len < dt_root_size_cells * sizeof(__be32)) + return 0; + return dt_mem_next_cell(dt_root_size_cells, &ls); +} + +/* + * Interpret the ibm,dynamic-memory property/ibm,dynamic-memory-v2 + * in the /ibm,dynamic-reconfiguration-memory node. * This contains a list of memory blocks along with NUMA affinity * information. */ -static int __init early_init_dt_scan_drconf_memory(unsigned long node) +static int __init early_init_dt_scan_drconf_memory_v1(unsigned long node) { - const __be32 *dm, *ls, *usm; + const __be32 *dm, *usm; int l; unsigned long n, flags; u64 base, size, membloc
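For illustration, a minimal sketch (not part of the patch) of consuming the v2 layout with the new helper; it assumes p points just past the leading count of set entries and lmb_size comes from the ibm,lmb-size property:

static void walk_drconf_v2(const __be32 *p, u32 num_sets, u64 lmb_size)
{
	u32 i, j;

	for (i = 0; i < num_sets; i++) {
		struct of_drconf_cell_v2 drmem;

		/* decodes one set descriptor and advances p past it */
		read_drconf_cell_v2(&drmem, &p);

		/* one descriptor stands for num_seq_lmbs consecutive LMBs */
		for (j = 0; j < drmem.num_seq_lmbs; j++)
			pr_debug("LMB at 0x%llx, drc_index 0x%x\n",
				 (unsigned long long)(drmem.base_addr +
						      (u64)j * lmb_size),
				 drmem.drc_index + j);
	}
}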
[PATCH V4 3/8] powerpc/memory: Parse new memory property to initialize structures.
powerpc/memory: Add parallel routines to parse the new property "ibm,dynamic-memory-v2" property when it is present, and then to finish initialization of the relevant memory structures with the operating system. This code is shared between the boot-time initialization functions and the runtime functions for memory hotplug, so it needs to be able to handle both formats. [V4: Added external function prototype definitions to header file "prom.h" for use in other files.] [V4: Replace a firmware feature test by an actual property scan.] [V4: Delete an unused variable.] [V4: Small cleanups to comments.] Signed-off-by: Michael Bringmann --- diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h index 7f436ba..b9a1534 100644 --- a/arch/powerpc/include/asm/prom.h +++ b/arch/powerpc/include/asm/prom.h @@ -109,6 +109,18 @@ struct of_drconf_cell_v2 { extern void read_drconf_cell_v2(struct of_drconf_cell_v2 *drmem, const __be32 **cellp); + +extern void read_one_drc_info(int **info, char **drc_type, char **drc_name, + unsigned long int *fdi_p, unsigned long int *nsl_p, + unsigned long int *si_p, unsigned long int *ldi_p); + +static inline int dyn_mem_v2_len(int entries) +{ + int drconf_v2_cells = (n_mem_addr_cells + 4); + int drconf_v2_cells_len = (drconf_v2_cells * sizeof(unsigned int)); + return (((entries) * drconf_v2_cells_len) + +(1 * sizeof(unsigned int))); +} /* * There are two methods for telling firmware what our capabilities are. diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c index 669a15e..18b4ee7 100644 --- a/arch/powerpc/mm/numa.c +++ b/arch/powerpc/mm/numa.c @@ -427,30 +426,55 @@ EXPORT_SYMBOL(read_drconf_cell_v2); /* - * Retrieve and validate the ibm,dynamic-memory property of the device tree. + * Retrieve and validate the ibm,dynamic-memory[-v2] property of the + * device tree. + * + * The layout of the ibm,dynamic-memory property is a number N of memory + * block description list entries followed by N memory block description + * list entries. Each memory block description list entry contains + * information as laid out in the of_drconf_cell struct above. * - * The layout of the ibm,dynamic-memory property is a number N of memblock - * list entries followed by N memblock list entries. Each memblock list entry - * contains information as laid out in the of_drconf_cell struct above. + * The layout of the ibm,dynamic-memory-v2 property is a number N of memory + * block set description list entries, followed by N memory block set + * description set entries. */ static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm) { const __be32 *prop; u32 len, entries; - prop = of_get_property(memory, "ibm,dynamic-memory", &len); - if (!prop || len < sizeof(unsigned int)) - return 0; + if (firmware_has_feature(FW_FEATURE_DYN_MEM_V2)) { - entries = of_read_number(prop++, 1); + prop = of_get_property(memory, "ibm,dynamic-memory-v2", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; - /* Now that we know the number of entries, revalidate the size -* of the property read in to ensure we have everything -*/ - if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int)) - return 0; + entries = of_read_number(prop++, 1); + + /* Now that we know the number of set entries, revalidate the +* size of the property read in to ensure we have everything. 
+*/ + if (len < dyn_mem_v2_len(entries)) + return 0; + + *dm = prop; + } else { + prop = of_get_property(memory, "ibm,dynamic-memory", &len); + if (!prop || len < sizeof(unsigned int)) + return 0; + + entries = of_read_number(prop++, 1); + + /* Now that we know the number of entries, revalidate the size +* of the property read in to ensure we have everything +*/ + if (len < (entries * (n_mem_addr_cells + 4) + 1) * + sizeof(unsigned int)) + return 0; + + *dm = prop; + } - *dm = prop; return entries; } @@ -513,7 +537,7 @@ * This is like of_node_to_nid_single() for memory represented in the * ibm,dynamic-reconfiguration-memory node. */ -static int of_drconf_to_nid_single(struct of_drconf_cell *drmem, +static int of_drconf_to_nid_single(u32 drmem_flags, u32 drmem_aa_index, struct assoc_arrays *aa) { int default_nid = 0; @@ -521,16 +545,16 @@ int index; if (min_common_depth > 0 && min_common_depth <= aa->array_sz && - !(drmem->fl
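For concreteness, the length check above works out as follows: with n_mem_addr_cells = 2, each ibm,dynamic-memory-v2 set entry occupies (2 + 4) cells = 24 bytes, so dyn_mem_v2_len() requires a property describing N set entries to be at least N * 24 + 4 bytes, the extra 4 bytes being the leading count of entries.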
[PATCH V4 4/8] pseries/hotplug init: Convert new DRC memory property for hotplug runtime
hotplug_init: Simplify the code needed for runtime memory hotplug and maintenance with a conversion routine that transforms the compressed property "ibm,dynamic-memory-v2" to the form of "ibm,dynamic-memory" within the "ibm,dynamic-reconfiguration-memory" property. Thus only a single set of routines should be required at runtime to parse, edit, and manipulate the memory representation in the device tree. Similarly, any userspace applications that need this information will only need to recognize the older format to be able to continue to operate. [V4: Remove unneeded code braces.] [V4: Simplify allocation of a couple of loop index variables.] Signed-off-by: Michael Bringmann --- diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c b/arch/powerpc/platforms/pseries/hotplug-memory.c index 2ce1385..0c46fbc 100644 --- a/arch/powerpc/platforms/pseries/hotplug-memory.c +++ b/arch/powerpc/platforms/pseries/hotplug-memory.c @@ -24,6 +24,8 @@ #include #include "pseries.h" +#ifdef CONFIG_MEMORY_HOTPLUG + static bool rtas_hp_event; unsigned long pseries_memory_block_size(void) @@ -887,11 +889,102 @@ static int pseries_memory_notifier(struct notifier_block *nb, static struct notifier_block pseries_mem_nb = { .notifier_call = pseries_memory_notifier, }; +#endif /* CONFIG_MEMORY_HOTPLUG */ + +static int pseries_rewrite_dynamic_memory_v2(void) +{ + unsigned long memblock_size; + struct device_node *dn; + struct property *prop, *prop_v2; + __be32 *p; + struct of_drconf_cell *lmbs; + u32 num_lmb_desc_sets, num_lmbs; + int i, j, k; + + dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory"); + if (!dn) + return -EINVAL; + + prop_v2 = of_find_property(dn, "ibm,dynamic-memory-v2", NULL); + if (!prop_v2) + return -EINVAL; + + memblock_size = pseries_memory_block_size(); + if (!memblock_size) + return -EINVAL; + + /* The first int of the property is the number of lmb sets +* described by the property. +*/ + p = (__be32 *)prop_v2->value; + num_lmb_desc_sets = be32_to_cpu(*p++); + + /* Count the number of LMBs for generating the alternate format +*/ + for (i = 0, num_lmbs = 0; i < num_lmb_desc_sets; i++) { + struct of_drconf_cell_v2 drmem; + + read_drconf_cell_v2(&drmem, (const __be32 **)&p); + num_lmbs += drmem.num_seq_lmbs; + } + + /* Create an empty copy of the new 'ibm,dynamic-memory' property +*/ + prop = kzalloc(sizeof(*prop), GFP_KERNEL); + if (!prop) + return -ENOMEM; + prop->name = kstrdup("ibm,dynamic-memory", GFP_KERNEL); + prop->length = dyn_mem_v2_len(num_lmbs); + prop->value = kzalloc(prop->length, GFP_KERNEL); + + /* Copy/expand the ibm,dynamic-memory-v2 format to produce the +* ibm,dynamic-memory format. 
+*/ + p = (__be32 *)prop->value; + *p = cpu_to_be32(num_lmbs); + p++; + lmbs = (struct of_drconf_cell *)p; + + p = (__be32 *)prop_v2->value; + p++; + + for (i = 0, k = 0; i < num_lmb_desc_sets; i++) { + struct of_drconf_cell_v2 drmem; + + read_drconf_cell_v2(&drmem, (const __be32 **)&p); + + for (j = 0; j < drmem.num_seq_lmbs; j++) { + lmbs[k+j].base_addr = be64_to_cpu(drmem.base_addr); + lmbs[k+j].drc_index = be32_to_cpu(drmem.drc_index); + lmbs[k+j].reserved = 0; + lmbs[k+j].aa_index = be32_to_cpu(drmem.aa_index); + lmbs[k+j].flags = be32_to_cpu(drmem.flags); + + drmem.base_addr += memblock_size; + drmem.drc_index++; + } + + k += drmem.num_seq_lmbs; + } + + of_remove_property(dn, prop_v2); + + of_add_property(dn, prop); + + /* And disable feature flag since the property has gone away */ + powerpc_firmware_features &= ~FW_FEATURE_DYN_MEM_V2; + + return 0; +} static int __init pseries_memory_hotplug_init(void) { + if (firmware_has_feature(FW_FEATURE_DYN_MEM_V2)) + pseries_rewrite_dynamic_memory_v2(); +#ifdef CONFIG_MEMORY_HOTPLUG if (firmware_has_feature(FW_FEATURE_LPAR)) of_reconfig_notifier_register(&pseries_mem_nb); +#endif /* CONFIG_MEMORY_HOTPLUG */ return 0; } diff --git a/arch/powerpc/platforms/pseries/Makefile b/arch/powerpc/platforms/pseries/Makefile index fedc2ccf0..e74cf6c 100644 --- a/arch/powerpc/platforms/pseries/Makefile +++ b/arch/powerpc/platforms/pseries/Makefile @@ -5,14 +5,14 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \ of_helpers.o \ setup.o iommu.o event_sources.o ras.o \ firmware.o power.o dlpa
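As a worked example of the expansion (assuming a 256MB memblock_size): a single v2 set entry { num_seq_lmbs = 4, base_addr = 0x10000000, drc_index = 0x8000000a, aa_index = 1, flags = 8 } becomes four v1 LMB entries with base_addr 0x10000000, 0x20000000, 0x30000000 and 0x40000000, drc_index 0x8000000a through 0x8000000d, and the aa_index and flags copied unchanged into each entry.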
Re: [PATCH 2/6] cxlflash: Cache owning adapter within context
Acked-by: Manoj N. Kumar On 8/9/2016 6:39 PM, Matthew R. Ochs wrote: The context removal routine requires access to the owning adapter structure to reset the context within the AFU as part of the tear down sequence. In order to support kref adoption, the owning adapter must be accessible from the release handler. As the kref framework only provides the kref reference as the sole parameter, another means is needed to derive the owning adapter. As a remedy, the owning adapter reference is saved off within the context during initialization. Signed-off-by: Matthew R. Ochs --- drivers/scsi/cxlflash/superpipe.c | 1 + drivers/scsi/cxlflash/superpipe.h | 1 + 2 files changed, 2 insertions(+) diff --git a/drivers/scsi/cxlflash/superpipe.c b/drivers/scsi/cxlflash/superpipe.c index ab5c893..640c3a2 100644 --- a/drivers/scsi/cxlflash/superpipe.c +++ b/drivers/scsi/cxlflash/superpipe.c @@ -804,6 +804,7 @@ static void init_context(struct ctx_info *ctxi, struct cxlflash_cfg *cfg, ctxi->lfd = adap_fd; ctxi->pid = current->tgid; /* tgid = pid */ ctxi->ctx = ctx; + ctxi->cfg = cfg; ctxi->file = file; ctxi->initialized = true; mutex_init(&ctxi->mutex); diff --git a/drivers/scsi/cxlflash/superpipe.h b/drivers/scsi/cxlflash/superpipe.h index 5f9a091..61404f2 100644 --- a/drivers/scsi/cxlflash/superpipe.h +++ b/drivers/scsi/cxlflash/superpipe.h @@ -107,6 +107,7 @@ struct ctx_info { bool err_recovery_active; struct mutex mutex; /* Context protection */ struct cxl_context *ctx; + struct cxlflash_cfg *cfg; struct list_head luns; /* LUNs attached to this context */ const struct vm_operations_struct *cxl_mmap_vmops; struct file *file;
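For context, the kref-based release this prepares for would look roughly like the following (hypothetical function and field names; the kref itself is introduced later in the series):

/*
 * Sketch: the release handler receives only the kref, so the owning
 * adapter must be reachable from the context itself.
 */
static void remove_context(struct kref *kref)
{
	struct ctx_info *ctxi = container_of(kref, struct ctx_info, kref);
	struct cxlflash_cfg *cfg = ctxi->cfg;	/* saved in init_context() */

	/* ... reset the context within cfg->afu and tear down state ... */
}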
Re: [PowerPC] today's main line failed to build on PowerPC
On Thu, Aug 18, 2016 at 12:48:17PM +0530, Abdul Haleem wrote: > >09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S: > >Assembler messages: > >09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:353: > >Error: missing operand > >09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:612: > >Error: missing operand > >09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:670: > >Error: missing operand You're building 32-bit no-smp... > CONFIG_SMP=y > CONFIG_WORD_SIZE=64 ... but your config says otherwise. Some (re-)configuration mishap? Segher
[PATCH] powerpc/8xx: use SPRN_EIE and SPRN_EID to enable/disable interrupts
The 8xx has two special registers called EID (External Interrupt Disable) and EIE (External Interrupt Enable) for clearing/setting EE in MSR. They avoid the three-instruction sequences mfmsr/ori/mtmsr or mfmsr/rlwinm/mtmsr. We just have to write something to the register to change the MSR EE bit, so we write r0 into the register, regardless of r0's value. Signed-off-by: Christophe Leroy --- arch/powerpc/include/asm/hw_irq.h | 6 ++ arch/powerpc/include/asm/reg.h | 2 ++ arch/powerpc/include/asm/reg_8xx.h | 5 + 3 files changed, 13 insertions(+) diff --git a/arch/powerpc/include/asm/hw_irq.h b/arch/powerpc/include/asm/hw_irq.h index c7d82ff..7ffb392 100644 --- a/arch/powerpc/include/asm/hw_irq.h +++ b/arch/powerpc/include/asm/hw_irq.h @@ -155,6 +155,8 @@ static inline unsigned long arch_local_irq_save(void) unsigned long flags = arch_local_save_flags(); #ifdef CONFIG_BOOKE asm volatile("wrteei 0" : : : "memory"); +#elif defined(CONFIG_PPC_8xx) + wrtspr(SPRN_EID); #else SET_MSR_EE(flags & ~MSR_EE); #endif @@ -165,6 +167,8 @@ static inline void arch_local_irq_disable(void) { #ifdef CONFIG_BOOKE asm volatile("wrteei 0" : : : "memory"); +#elif defined(CONFIG_PPC_8xx) + wrtspr(SPRN_EID); #else arch_local_irq_save(); #endif @@ -174,6 +178,8 @@ static inline void arch_local_irq_enable(void) { #ifdef CONFIG_BOOKE asm volatile("wrteei 1" : : : "memory"); +#elif defined(CONFIG_PPC_8xx) + wrtspr(SPRN_EIE); #else unsigned long msr = mfmsr(); SET_MSR_EE(msr | MSR_EE); diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index f69f40f..4bbd9be 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -1246,6 +1246,8 @@ static inline void mtmsr_isync(unsigned long val) : "r" ((unsigned long)(v)) \ : "memory") #endif +#define wrtspr(rn) asm volatile("mtspr " __stringify(rn) ",0" : \ +: : "memory") extern void msr_check_and_set(unsigned long bits); extern bool strict_msr_control; diff --git a/arch/powerpc/include/asm/reg_8xx.h b/arch/powerpc/include/asm/reg_8xx.h index 6dae71f..d4bca3de 100644 --- a/arch/powerpc/include/asm/reg_8xx.h +++ b/arch/powerpc/include/asm/reg_8xx.h @@ -6,6 +6,11 @@ #include +/* Special MSR manipulation registers */ +#define SPRN_EIE 80 /* External interrupt enable (EE=1, RI=1) */ +#define SPRN_EID 81 /* External interrupt disable (EE=0, RI=1) */ +#define SPRN_NRI 81 /* Non Recoverable interrupt (EE=0, RI=0) */ /* Cache control on the MPC8xx is provided through some additional * special purpose registers. */ -- 2.1.0
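As a concrete illustration of the macro above: with SPRN_EID == 81, disabling interrupts compiles down to a single instruction (a sketch derived from the wrtspr() definition in this patch):

	wrtspr(SPRN_EID);
	/* expands to: asm volatile("mtspr 81,0" : : : "memory");
	 * i.e. mtspr writes r0 to EID, which clears MSR[EE] */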
[PATCH] ibmvnic: Handle backing device failover and reinitialization
An upcoming feature of IBM VNIC protocol is the ability to configure redundant backing devices for a VNIC client. In case of a failure on the current backing device, the driver will receive a signal from the hypervisor indicating that a failover will occur. The driver will then wait for a message from the backing device before establishing a new connection. Signed-off-by: Thomas Falcon --- drivers/net/ethernet/ibm/ibmvnic.c | 34 -- drivers/net/ethernet/ibm/ibmvnic.h | 2 ++ 2 files changed, 34 insertions(+), 2 deletions(-) diff --git a/drivers/net/ethernet/ibm/ibmvnic.c b/drivers/net/ethernet/ibm/ibmvnic.c index 88f3c85..b942108 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.c +++ b/drivers/net/ethernet/ibm/ibmvnic.c @@ -203,7 +203,8 @@ static void free_long_term_buff(struct ibmvnic_adapter *adapter, struct device *dev = &adapter->vdev->dev; dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr); - send_request_unmap(adapter, ltb->map_id); + if (!adapter->failover) + send_request_unmap(adapter, ltb->map_id); } static int alloc_rx_pool(struct ibmvnic_adapter *adapter, @@ -522,7 +523,8 @@ static int ibmvnic_close(struct net_device *netdev) for (i = 0; i < adapter->req_rx_queues; i++) napi_disable(&adapter->napi[i]); - netif_tx_stop_all_queues(netdev); + if (!adapter->failover) + netif_tx_stop_all_queues(netdev); if (adapter->bounce_buffer) { if (!dma_mapping_error(dev, adapter->bounce_buffer_dma)) { @@ -3280,6 +3282,10 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq, rc = ibmvnic_send_crq_init(adapter); if (rc) dev_err(dev, "Error sending init rc=%ld\n", rc); + } else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) { + dev_info(dev, "Backing device failover detected\n"); + netif_carrier_off(netdev); + adapter->failover = true; } else { /* The adapter lost the connection */ dev_err(dev, "Virtual Adapter failed (rc=%d)\n", @@ -3615,8 +3621,18 @@ static void handle_crq_init_rsp(struct work_struct *work) struct device *dev = &adapter->vdev->dev; struct net_device *netdev = adapter->netdev; unsigned long timeout = msecs_to_jiffies(3); + bool restart = false; int rc; + if (adapter->failover) { + release_sub_crqs(adapter); + if (netif_running(netdev)) { + netif_tx_disable(netdev); + ibmvnic_close(netdev); + restart = true; + } + } + send_version_xchg(adapter); reinit_completion(&adapter->init_done); if (!wait_for_completion_timeout(&adapter->init_done, timeout)) { @@ -3645,6 +3661,17 @@ static void handle_crq_init_rsp(struct work_struct *work) netdev->real_num_tx_queues = adapter->req_tx_queues; + if (adapter->failover) { + adapter->failover = false; + if (restart) { + rc = ibmvnic_open(netdev); + if (rc) + goto restart_failed; + } + netif_carrier_on(netdev); + return; + } + rc = register_netdev(netdev); if (rc) { dev_err(dev, @@ -3655,6 +3682,8 @@ static void handle_crq_init_rsp(struct work_struct *work) return; +restart_failed: + dev_err(dev, "Failed to restart ibmvnic, rc=%d\n", rc); register_failed: release_sub_crqs(adapter); task_failed: @@ -3692,6 +3721,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const struct vio_device_id *id) dev_set_drvdata(&dev->dev, netdev); adapter->vdev = dev; adapter->netdev = netdev; + adapter->failover = false; ether_addr_copy(adapter->mac_addr, mac_addr_p); ether_addr_copy(netdev->dev_addr, adapter->mac_addr); diff --git a/drivers/net/ethernet/ibm/ibmvnic.h b/drivers/net/ethernet/ibm/ibmvnic.h index e82898f..bfc84c7 100644 --- a/drivers/net/ethernet/ibm/ibmvnic.h +++ b/drivers/net/ethernet/ibm/ibmvnic.h @@ -830,6 +830,7 @@ enum ibmvfc_crq_format { 
IBMVNIC_CRQ_INIT = 0x01, IBMVNIC_CRQ_INIT_COMPLETE= 0x02, IBMVNIC_PARTITION_MIGRATED = 0x06, + IBMVNIC_DEVICE_FAILOVER = 0x08, }; struct ibmvnic_crq_queue { @@ -1047,4 +1048,5 @@ struct ibmvnic_adapter { u8 map_id; struct work_struct vnic_crq_init; + bool failover; }; -- 1.8.3.1
Re: [PATCH] powerpc/8xx: use SPRN_EIE and SPRN_EID to enable/disable interrupts
On Thu, Aug 18, 2016 at 05:56:02PM +0200, Christophe Leroy wrote: > The 8xx has two special registers called EID (External Interrupt > Disable) and EIE (External Interrupt Enable) for clearing/setting > EE in MSR. They avoid the three-instruction sequences mfmsr/ori/mtmsr or > mfmsr/rlwinm/mtmsr. All 8xx? What other models? (5xx for example). > +/* Special MSR manipulation registers */ > +#define SPRN_EIE 80 /* External interrupt enable (EE=1, RI=1) */ > +#define SPRN_EID 81 /* External interrupt disable (EE=0, RI=1) */ > +#define SPRN_NRI 81 /* Non Recoverable interrupt (EE=0, RI=0) */ This is wrong (NRI is 82). Don't write code you cannot test / don't submit code you haven't tested? :-) Segher
Re: [PATCH] powerpc/8xx: use SPRN_EIE and SPRN_EID to enable/disable interrupts
On 18/08/2016 at 18:34, Segher Boessenkool wrote: On Thu, Aug 18, 2016 at 05:56:02PM +0200, Christophe Leroy wrote: The 8xx has two special registers called EID (External Interrupt Disable) and EIE (External Interrupt Enable) for clearing/setting EE in MSR. They avoid the three-instruction sequences mfmsr/ori/mtmsr or mfmsr/rlwinm/mtmsr. All 8xx? What other models? (5xx for example). At least 823, 860, 866 and 885 have it. Looks like the 5xx have it too (at least the 565). Does Linux support that one at all? 8272 and 8323 don't have it. +/* Special MSR manipulation registers */ +#define SPRN_EIE 80 /* External interrupt enable (EE=1, RI=1) */ +#define SPRN_EID 81 /* External interrupt disable (EE=0, RI=1) */ +#define SPRN_NRI 81 /* Non Recoverable interrupt (EE=0, RI=0) */ This is wrong (NRI is 82). Don't write code you cannot test / don't submit code you haven't tested? :-) Oops. You're right, copy/paste failure. Was tested on an 885. Unfortunately SPRN_NRI is not used (yet) :-( Christophe
Re: [PATCH] cxl: use pcibios_free_controller_deferred() when removing vPHBs
> On Aug 18, 2016, at 2:35 AM, Andrew Donnellan > wrote: > > When cxl removes a vPHB, it's possible that the pci_controller may be freed > before all references to the devices on the vPHB have been released. This > in turn causes an invalid memory access when the devices are eventually > released, as pcibios_release_device() attempts to call the phb's > release_device hook. > > In cxl_pci_vphb_remove(), remove the existing call to > pcibios_free_controller(). Instead, use > pcibios_free_controller_deferred() to free the pci_controller after all > devices have been released. Export pci_set_host_bridge_release() so we can > do this. > > Cc: sta...@vger.kernel.org > Signed-off-by: Andrew Donnellan Reviewed-by: Matthew R. Ochs
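For context, a sketch of the deferred-free pattern the changelog describes. The registration point and the bridge lookup are assumptions based on the changelog, not the patch's exact code:

	/* when creating the vPHB: let the pci_controller (phb) be freed
	 * only after the last device reference is released, instead of
	 * calling pcibios_free_controller() directly at remove time */
	pci_set_host_bridge_release(to_pci_host_bridge(phb->bus->bridge),
				    pcibios_free_controller_deferred,
				    (void *)phb);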
Re: [PATCH] powerpc/8xx: use SPRN_EIE and SPRN_EID to enable/disable interrupts
On Thu, Aug 18, 2016 at 06:52:47PM +0200, Christophe Leroy wrote: > On 18/08/2016 at 18:34, Segher Boessenkool wrote: > >On Thu, Aug 18, 2016 at 05:56:02PM +0200, Christophe Leroy wrote: > >>The 8xx has two special registers called EID (External Interrupt > >>Disable) and EIE (External Interrupt Enable) for clearing/setting > >>EE in MSR. They avoid the three-instruction sequences mfmsr/ori/mtmsr or > >>mfmsr/rlwinm/mtmsr. > > > >All 8xx? What other models? (5xx for example). > > At least 823, 860, 866 and 885 have it. I haven't been able to find a manual for all 8xx. But there is AN2055, which suggests EIE etc. is for all 8xx indeed. > Looks like the 5xx have it too (at least the 565). Does Linux support > that one at all? All 5xx have it, there is a manual for *that* ("RCPU") :-) > >>+/* Special MSR manipulation registers */ > >>+#define SPRN_EIE 80 /* External interrupt enable (EE=1, RI=1) */ > >>+#define SPRN_EID 81 /* External interrupt disable (EE=0, RI=1) */ > >>+#define SPRN_NRI 81 /* Non Recoverable interrupt (EE=0, RI=0) */ Is it correct to set RI in all places you do now? > >This is wrong (NRI is 82). Don't write code you cannot test / don't submit > >code you haven't tested? :-) > > Oops. You're right, copy/paste failure. > Was tested on an 885. Unfortunately SPRN_NRI is not used (yet) :-( Well, that was my point! Segher
Re: [PATCH 3/6] cxlflash: Add kref to context
Acked-by: Manoj N. Kumar On 8/9/2016 6:39 PM, Matthew R. Ochs wrote: Currently, context user references are tracked via the list of LUNs that have attached to the context. While convenient, this is not intuitive without a deep study of the code and is inconsistent with the existing reference tracking patterns within the kernel. This design choice can lead to future bug injection. To improve code comprehension and better protect against future bugs, add explicit reference counting to contexts and migrate the context removal code to the kref release handler. Inspired-by: Al Viro Signed-off-by: Matthew R. Ochs
Re: [PATCH v2 3/6] kexec_file: Allow skipping checksum calculation for some segments.
Hello Dave, Thanks for your review! [ Trimming down Cc: list a little to try to clear the "too many recipients" mailing list restriction. ] On Thursday, 18 August 2016, 17:03:30, Dave Young wrote: > On 08/13/16 at 12:18am, Thiago Jung Bauermann wrote: > > Adds checksum argument to kexec_add_buffer specifying whether the given > > segment should be part of the checksum calculation. > > Since it is used with add buffer, could it be added to kbuf as a new > field? I was on the fence about adding it as a new argument to kexec_add_buffer or as a new field to struct kexec_buf. Both alternatives make sense to me. I implemented your suggestion in the patch below, what do you think? > Like kbuf.no_checksum, the default value is 0, which means a checksum is needed; > if it is 1, then no checksum is needed. It's an interesting idea and I implemented it that way, though in practice all current users of struct kexec_buf put it on the stack so the field needs to be initialized explicitly. -- []'s Thiago Jung Bauermann IBM Linux Technology Center Subject: [PATCH v2 3/6] kexec_file: Allow skipping checksum calculation for some segments. Add skip_checksum member to struct kexec_buf to specify whether the corresponding segment should be part of the checksum calculation. The next patch will add a way to update segments after a kimage is loaded. Segments that will be updated in this way should not be checksummed, otherwise they will cause the purgatory checksum verification to fail when the machine is rebooted. As a bonus, we don't need to special-case the purgatory segment anymore to avoid checksumming it. Adjust places using struct kexec_buf to set skip_checksum. Signed-off-by: Thiago Jung Bauermann --- arch/powerpc/kernel/kexec_elf_64.c | 5 +++-- arch/x86/kernel/crash.c| 3 ++- arch/x86/kernel/kexec-bzimage64.c | 2 +- include/linux/kexec.h | 23 ++- kernel/kexec_file.c| 15 +++ 5 files changed, 27 insertions(+), 21 deletions(-) diff --git a/arch/powerpc/kernel/kexec_elf_64.c b/arch/powerpc/kernel/kexec_elf_64.c index 22afc7b5ee73..d009f5363968 100644 --- a/arch/powerpc/kernel/kexec_elf_64.c +++ b/arch/powerpc/kernel/kexec_elf_64.c @@ -107,7 +107,7 @@ static int elf_exec_load(struct kimage *image, struct elfhdr *ehdr, int ret; size_t i; struct kexec_buf kbuf = { .image = image, .buf_max = ppc64_rma_size, - .top_down = false }; + .top_down = false, .skip_checksum = false }; /* Read in the PT_LOAD segments. */ for (i = 0; i < ehdr->e_phnum; i++) { @@ -162,7 +162,8 @@ void *elf64_load(struct kimage *image, char *kernel_buf, struct elf_info elf_info; struct fdt_reserve_entry *rsvmap; struct kexec_buf kbuf = { .image = image, .buf_min = 0, - .buf_max = ppc64_rma_size }; + .buf_max = ppc64_rma_size, + .skip_checksum = false }; ret = build_elf_exec_info(kernel_buf, kernel_len, &ehdr, &elf_info); if (ret) diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c index 38a1cdf6aa05..7b8f62c86651 100644 --- a/arch/x86/kernel/crash.c +++ b/arch/x86/kernel/crash.c @@ -617,7 +617,8 @@ int crash_load_segments(struct kimage *image) { int ret; struct kexec_buf kbuf = { .image = image, .buf_min = 0, - .buf_max = ULONG_MAX, .top_down = false }; + .buf_max = ULONG_MAX, .top_down = false, + .skip_checksum = false }; /* * Determine and load a segment for backup area.
First 640K RAM diff --git a/arch/x86/kernel/kexec-bzimage64.c b/arch/x86/kernel/kexec-bzimage64.c index 4b3a75329fb6..449f433cd225 100644 --- a/arch/x86/kernel/kexec-bzimage64.c +++ b/arch/x86/kernel/kexec-bzimage64.c @@ -341,7 +341,7 @@ static void *bzImage64_load(struct kimage *image, char *kernel, unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr); unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset; struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX, - .top_down = true }; + .top_down = true, .skip_checksum = false }; header = (struct setup_header *)(kernel + setup_hdr_offset); setup_sects = header->setup_sects; diff --git a/include/linux/kexec.h b/include/linux/kexec.h index 4559a1a01b0a..e5b3d99cbe50 100644 --- a/include/linux/kexec.h +++ b/include/linux/kexec.h @@ -100,6 +100,9 @@ struct kexec_segment { size_t bufsz; unsigned long mem; size_t memsz; + + /* Whether this segment is ignored in the checksum calculation. */ + bool skip_checksum; }; #ifdef CONFIG_COMPAT @@ -151,15 +154,16 @@ struct kexec_file_ops { /** * s
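A minimal usage sketch, assuming the series' kexec_buf-based kexec_add_buffer() calling convention: a segment that will be patched after the kimage is loaded (the case the changelog describes) opts out of the checksum:

	/* hypothetical caller loading a segment it will update post-load */
	struct kexec_buf kbuf = { .image = image, .buf_min = 0,
				  .buf_max = ULONG_MAX, .top_down = false,
				  .skip_checksum = true };

	ret = kexec_add_buffer(&kbuf);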
Re: [PATCH v2] powerpc: move hmi.c to arch/powerpc/kvm/
On Thu, 2016-08-18 at 10:53 +0200, Paolo Bonzini wrote: > > On 11/08/2016 15:07, Paolo Bonzini wrote: > > > > hmi.c functions are unused unless sibling_subcore_state is nonzero, > > and > > that in turn happens only if KVM is in use. So move the code to > > arch/powerpc/kvm/, putting it under CONFIG_KVM_BOOK3S_HV_POSSIBLE > > rather than CONFIG_PPC_BOOK3S_64. The sibling_subcore_state is > > also > > included in struct paca_struct only if KVM is supported by the > > kernel. > Mahesh, can you review this ? > > Cc: Daniel Axtens > > Cc: Michael Ellerman > > Cc: Mahesh Salgaonkar > > Cc: Paul Mackerras > > Cc: linuxppc-dev@lists.ozlabs.org > > Cc: kvm-...@vger.kernel.org > > Cc: k...@vger.kernel.org > > Signed-off-by: Paolo Bonzini > > --- > > v1->v2: use CONFIG_KVM_BOOK3S_HV_POSSIBLE, not > > CONFIG_KVM_BOOK3S_64_HANDLER. The former implies > > the latter, but the reverse is not true. > > > > arch/powerpc/include/asm/hmi.h | 2 +- > > arch/powerpc/include/asm/paca.h| 12 +++--- > > -- > > arch/powerpc/kernel/Makefile | 2 +- > > arch/powerpc/kvm/Makefile | 1 + > > arch/powerpc/{kernel/hmi.c => kvm/book3s_hv_hmi.c} | 0 > > 5 files changed, 10 insertions(+), 7 deletions(-) > > rename arch/powerpc/{kernel/hmi.c => kvm/book3s_hv_hmi.c} (100%) > > > > diff --git a/arch/powerpc/include/asm/hmi.h > > b/arch/powerpc/include/asm/hmi.h > > index 88b4901ac4ee..85b7a1a21e22 100644 > > --- a/arch/powerpc/include/asm/hmi.h > > +++ b/arch/powerpc/include/asm/hmi.h > > @@ -21,7 +21,7 @@ > > #ifndef __ASM_PPC64_HMI_H__ > > #define __ASM_PPC64_HMI_H__ > > > > -#ifdef CONFIG_PPC_BOOK3S_64 > > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > > > > #defineCORE_TB_RESYNC_REQ_BIT 63 > > #define MAX_SUBCORE_PER_CORE 4 > > diff --git a/arch/powerpc/include/asm/paca.h > > b/arch/powerpc/include/asm/paca.h > > index 148303e7771f..6a6792bb39fb 100644 > > --- a/arch/powerpc/include/asm/paca.h > > +++ b/arch/powerpc/include/asm/paca.h > > @@ -183,11 +183,6 @@ struct paca_struct { > > */ > > u16 in_mce; > > u8 hmi_event_available; /* HMI event is > > available */ > > - /* > > - * Bitmap for sibling subcore status. See > > kvm/book3s_hv_ras.c for > > - * more details > > - */ > > - struct sibling_subcore_state *sibling_subcore_state; > > #endif > > > > /* Stuff for accurate time accounting */ > > @@ -202,6 +197,13 @@ struct paca_struct { > > struct kvmppc_book3s_shadow_vcpu shadow_vcpu; > > #endif > > struct kvmppc_host_state kvm_hstate; > > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > > + /* > > + * Bitmap for sibling subcore status. 
See > > kvm/book3s_hv_ras.c for > > + * more details > > + */ > > + struct sibling_subcore_state *sibling_subcore_state; > > +#endif > > #endif > > }; > > > > diff --git a/arch/powerpc/kernel/Makefile > > b/arch/powerpc/kernel/Makefile > > index b2027a5cf508..fe4c075bcf50 100644 > > --- a/arch/powerpc/kernel/Makefile > > +++ b/arch/powerpc/kernel/Makefile > > @@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32) += vdso32/ > > obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o > > obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_ppc970.o > > cpu_setup_pa6t.o > > obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_power.o > > -obj-$(CONFIG_PPC_BOOK3S_64)+= mce.o mce_power.o hmi.o > > +obj-$(CONFIG_PPC_BOOK3S_64)+= mce.o mce_power.o > > obj-$(CONFIG_PPC_BOOK3E_64)+= exceptions-64e.o > > idle_book3e.o > > obj-$(CONFIG_PPC64)+= vdso64/ > > obj-$(CONFIG_ALTIVEC) += vecemu.o > > diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile > > index 1f9e5529e692..855d4b95d752 100644 > > --- a/arch/powerpc/kvm/Makefile > > +++ b/arch/powerpc/kvm/Makefile > > @@ -78,6 +78,7 @@ kvm-book3s_64-builtin-xics-objs- > > $(CONFIG_KVM_XICS) := \ > > > > ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE > > kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \ > > + book3s_hv_hmi.o \ > > book3s_hv_rmhandlers.o \ > > book3s_hv_rm_mmu.o \ > > book3s_hv_ras.o \ > > diff --git a/arch/powerpc/kernel/hmi.c > > b/arch/powerpc/kvm/book3s_hv_hmi.c > > similarity index 100% > > rename from arch/powerpc/kernel/hmi.c > > rename to arch/powerpc/kvm/book3s_hv_hmi.c > > > > Ping? > > Paolo
[RFC/PATCH 1/2] cpuidle: Allow idle-states to be disabled at start
From: "Gautham R. Shenoy" Currently all the idle states registered by a cpu-idle driver are enabled by default. This patch adds a mechanism which allows the driver to hint if an idle-state should start in a disabled state. The cpu-idle core will use this hint to appropriately initialize the usage->disable knob of the CPU device idle state. The state can be enabled at run time by echo'ing a zero to the sysfs "disable" control file. Signed-off-by: Gautham R. Shenoy --- drivers/cpuidle/cpuidle.c | 7 +++ include/linux/cpuidle.h | 7 ++- 2 files changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c index c73207a..b4debc7 100644 --- a/drivers/cpuidle/cpuidle.c +++ b/drivers/cpuidle/cpuidle.c @@ -439,7 +439,14 @@ static void __cpuidle_unregister_device(struct cpuidle_device *dev) static void __cpuidle_device_init(struct cpuidle_device *dev) { + struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev); + int i; + memset(dev->states_usage, 0, sizeof(dev->states_usage)); + for (i = 0; i < drv->state_count; i++) { + if (drv->states[i].disable_use_at_start) + dev->states_usage[i].disable = 1; + } dev->last_residency = 0; } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bb31373..f3fe855 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -44,7 +44,12 @@ struct cpuidle_state { int power_usage; /* in mW */ unsigned inttarget_residency; /* in US */ booldisabled; /* disabled on all CPUs */ - + /* +* disable_use_at_start: If true, then this idle state will be +* disabled by default. It can be enabled at runtime using the +* per-cpu cpuidle sysfs control file named "disable". +*/ + booldisable_use_at_start; int (*enter)(struct cpuidle_device *dev, struct cpuidle_driver *drv, int index); -- 1.9.4
[RFC/PATCH 0/2] powernv:cpuidle: Enable winkle idle state
From: "Gautham R. Shenoy" Hi, The patches in these series enable support for Winkle idle state in CPU-Idle. The first patch is a platform-independent CPU-Idle patch that allows CPU-Idle states to be disabled at start (Currently they are all enabled by default). The second patch adds the winkle enablement for powernv-cpuidle. By default, the winkle idle-state is disabled. It can be enabled by writing zero to the per-cpu cpuidle sysfs control file named "disable". This series has been lightly tested on a 2-socket POWER8 system and the machine was pretty stable while running kernbench and ebizzy. I didn't see any regressions with those. I haven't yet evaluated the impact that these patches might have on latency sensitive workloads. I hope to do that in a day or two. On the power-savings front, I could observe 6-8% additional power-savings when winkle state was enabled on an idle system with SMT=on. With SMT=off, additional idle power-savings observed with winkle enabled were greater than 15%. The numbers indicate that it might be worth the while to pursue this! Gautham R. Shenoy (2): cpuidle: Allow idle-states to be disabled at start powernv:cpuidle: Enable winkle idle state in CPU-Idle. drivers/cpuidle/cpuidle-powernv.c | 44 --- drivers/cpuidle/cpuidle.c | 7 +++ include/linux/cpuidle.h | 7 ++- 3 files changed, 49 insertions(+), 9 deletions(-) -- 1.9.4
[RFC/PATCH 2/2] powernv:cpuidle: Enable winkle idle state in CPU-Idle.
From: "Gautham R. Shenoy" cpu-idle on powernv currently has support for only snooze, nap and fastsleep states. Winkle idle state was excluded due to its large exit-latency. This patch adds winkle as a cpu-idle state for experimental purposes. This state is disabled at start by default. However, should an adventurous user want to enable it on a particular CPU(s), they can do so by echo'ing a zero into the per-cpu sysfs cpuidle control file named "disable" corresponding to this state. Signed-off-by: Gautham R. Shenoy --- drivers/cpuidle/cpuidle-powernv.c | 44 --- 1 file changed, 36 insertions(+), 8 deletions(-) diff --git a/drivers/cpuidle/cpuidle-powernv.c b/drivers/cpuidle/cpuidle-powernv.c index f7ca891..0437d8a 100644 --- a/drivers/cpuidle/cpuidle-powernv.c +++ b/drivers/cpuidle/cpuidle-powernv.c @@ -20,7 +20,6 @@ #include #include -#define POWERNV_THRESHOLD_LATENCY_NS 20 struct cpuidle_driver powernv_idle_driver = { .name = "powernv_idle", @@ -95,6 +94,30 @@ static int fastsleep_loop(struct cpuidle_device *dev, return index; } + +static int winkle_loop(struct cpuidle_device *dev, + struct cpuidle_driver *drv, + int index) +{ + unsigned long old_lpcr = mfspr(SPRN_LPCR); + unsigned long new_lpcr; + + if (unlikely(system_state < SYSTEM_RUNNING)) + return index; + + new_lpcr = old_lpcr; + /* Do not exit powersave upon decrementer as we've setup the timer +* offload. +*/ + new_lpcr &= ~LPCR_PECE1; + + mtspr(SPRN_LPCR, new_lpcr); + power7_winkle(); + + mtspr(SPRN_LPCR, old_lpcr); + + return index; +} #endif static int stop_loop(struct cpuidle_device *dev, @@ -246,13 +269,6 @@ static int powernv_add_idle_states(void) "ibm,cpu-idle-state-residency-ns", residency_ns, dt_idle_states); for (i = 0; i < dt_idle_states; i++) { - /* -* If an idle state has exit latency beyond -* POWERNV_THRESHOLD_LATENCY_NS then don't use it -* in cpu-idle. -*/ - if (latency_ns[i] > POWERNV_THRESHOLD_LATENCY_NS) - continue; /* * Cpuidle accepts exit_latency and target_residency in us. @@ -301,6 +317,18 @@ static int powernv_add_idle_states(void) powernv_states[nr_idle_states].enter = stop_loop; stop_psscr_table[nr_idle_states] = psscr_val[i]; } + + if (flags[i] & OPAL_PM_WINKLE_ENABLED) { + int state_idx = nr_idle_states; + + strcpy(powernv_states[state_idx].name, "Winkle"); + strcpy(powernv_states[state_idx].desc, "Winkle"); + powernv_states[state_idx].flags = + CPUIDLE_FLAG_TIMER_STOP; + powernv_states[state_idx].target_residency = 50; + powernv_states[state_idx].enter = winkle_loop; + powernv_states[state_idx].disable_use_at_start = true; + } #endif powernv_states[nr_idle_states].exit_latency = ((unsigned int)latency_ns[i]) / 1000; -- 1.9.4
Re: [PATCH 0/6] cxlflash: Improvements and cleanup
> "Matthew" == Matthew R Ochs writes: Matthew> This patch set contains various code improvements and cleanups Matthew> that were inspired by Al Viro upon reviewing the cxlflash Matthew> driver. The core improvement is that the driver will no longer Matthew> cache the adapter file descriptor associated with a Matthew> context. This results in a user API change that is documented Matthew> alongside the modifications. Applied patches 1-3 to 4.9/scsi-queue. The remainder await reviews. Matthew> The series is based upon 4.8-rc1, intended for 4.9, and is Matthew> bisectable. Thanks for making that explicit. Makes my life easier! -- Martin K. Petersen Oracle Linux Engineering
Re: [PATCH] cxl: use pcibios_free_controller_deferred() when removing vPHBs
Acked-by: Ian Munsie
Re: linux-next: build warnings after merge of the kbuild tree
Hi Nick, On Thu, 18 Aug 2016 11:09:48 +1000 Nicholas Piggin wrote: > > On Wed, 17 Aug 2016 14:59:59 +0200 > Michal Marek wrote: > > > On 2016-08-17 03:44, Stephen Rothwell wrote: > > > > > > After merging the kbuild tree, today's linux-next build (powerpc > > > ppc64_defconfig) produced these warnings: > > > > > > WARNING: 25 bad relocations > > > c000000000cf2570 R_PPC64_ADDR64 __crc___arch_hweight16 > > [...] > > > Introduced by commit > > > > > > 9445aa1a3062 ("ppc: move exports to definitions") > > > > > > I have reverted that commit for today. > > > > > > [cc-ing the ppc guys for clues - also involved is commit > > > > > > 22823ab419d8 ("EXPORT_SYMBOL() for asm") > > > ] > > > > FWIW, I see these warnings as well. Any help from ppc developers is > > appreciated - should the R_PPC64_ADDR64 be whitelisted for exported asm > > symbols (their CRCs actually)? > > The dangling relocation is a side effect of the linker being unable to resolve the > reference to the undefined weak symbols. So the real question is, why has > genksyms not overridden these symbols with their CRC values? > > This may not even be powerpc specific, but I'll poke at it a bit more > when I get a chance. Not sure if this is relevant, but with the commit reverted, the __crc___... symbols are absolute. f55b3b3d A __crc___arch_hweight16 -- Cheers, Stephen Rothwell
Re: [PATCH 4/6] cxlflash: Transition to application close model
Acked-by: Manoj N. Kumar On 8/9/2016 6:39 PM, Matthew R. Ochs wrote: Caching the adapter file descriptor and performing a close on behalf of an application is a poor design. This is due to the fact that once a file descriptor is installed, it is free to be altered without the knowledge of the cxlflash driver. This can lead to inconsistencies between the application and kernel. Furthermore, the nature of the former design is more exploitable and thus should be abandoned. To support applications performing a close on the adapter file that is associated with a context, a new flag is introduced to the user API to indicate to applications that they are responsible for the close following the cleanup (detach) of a context. The documentation is also updated to reflect this change in behavior. Inspired-by: Al Viro Signed-off-by: Matthew R. Ochs
Re: [PATCH v3 19/21] powerpc: tm: Always use fp_state and vr_state to store live registers
On Wed, Aug 17, 2016 at 01:43:21PM +1000, Cyril Bur wrote: > There is currently an inconsistency as to how the entire CPU register > state is saved and restored when a thread uses transactional memory > (TM). > > Using transactional memory results in the CPU having duplicated > (almost all of) its register state. This duplication results in a set > of registers which can be considered 'live', those being currently > modified by the instructions being executed, and another set that is > frozen at a point in time. > > On context switch, both sets of state have to be saved and (later) > restored. These two states are often called a variety of different > things. Common terms for the state which only exists after the CPU has entered > a transaction (performed a TBEGIN instruction) in hardware are > 'transactional' or 'speculative'. > > Between a TBEGIN and a TEND or TABORT (or an event that causes the > hardware to abort), regardless of the use of TSUSPEND, the > transactional state can be referred to as the live state. > > The second state is often referred to as the 'checkpointed' state > and is a duplication of the live state when the TBEGIN instruction is > executed. This state is kept in the hardware and will be rolled back > to on transaction failure. > > Currently all the registers stored in pt_regs are ALWAYS the live > registers, that is, when a thread has transactional registers their > values are stored in pt_regs and the checkpointed state is in > ckpt_regs. A strange opposite is true for fp_state. When a thread is > non-transactional, fp_state holds the live registers. When a thread > has initiated a transaction, fp_state holds the checkpointed state and > transact_fp becomes the structure which holds the live state (at this > point it is a transactional state). The same is true for vr_state. > > This method creates confusion as to where the live state is; in some > circumstances it requires extra work to determine where to put the > live state and prevents the use of common functions designed (probably > before TM) to save the live state. > > With this patch, pt_regs, fp_state and vr_state all represent the > same thing, and the other structures [pending rename] are for > checkpointed state. > > Signed-off-by: Cyril Bur Acked-by: Simon Guo Thanks, - Simon
Re: linux-next: build warnings after merge of the kbuild tree
Hi Nick, On Fri, 19 Aug 2016 13:38:54 +1000 Stephen Rothwell wrote: > > On Thu, 18 Aug 2016 11:09:48 +1000 Nicholas Piggin wrote: > > > > On Wed, 17 Aug 2016 14:59:59 +0200 > > Michal Marek wrote: > > > > > On 2016-08-17 03:44, Stephen Rothwell wrote: > > > > > > > > After merging the kbuild tree, today's linux-next build (powerpc > > > > ppc64_defconfig) produced these warnings: > > > > > > > > WARNING: 25 bad relocations > > > > c000000000cf2570 R_PPC64_ADDR64 __crc___arch_hweight16 > > > [...] > > > > Introduced by commit > > > > > > > > 9445aa1a3062 ("ppc: move exports to definitions") > > > > > > > > I have reverted that commit for today. > > > > > > > > [cc-ing the ppc guys for clues - also involved is commit > > > > > > > > 22823ab419d8 ("EXPORT_SYMBOL() for asm") > > > > ] > > > > > > FWIW, I see these warnings as well. Any help from ppc developers is > > > appreciated - should the R_PPC64_ADDR64 be whitelisted for exported asm > > > symbols (their CRCs actually)? > > > > The dangling relocation is a side effect of the linker being unable to resolve the > > reference to the undefined weak symbols. So the real question is, why has > > genksyms not overridden these symbols with their CRC values? > > > > This may not even be powerpc specific, but I'll poke at it a bit more > > when I get a chance. > > Not sure if this is relevant, but with the commit reverted, the > __crc___... symbols are absolute. > > f55b3b3d A __crc___arch_hweight16 Ignore that :-) I just had a look at an x86_64 allmodconfig result and it looks like the weak symbols are not resolved there either ... I may be missing something, but genksyms generates the crc's off the preprocessed C source code and we don't have any for the asm files ... -- Cheers, Stephen Rothwell
[PATCH v5 1/7] perf: Define macro for normalized arch names
Define macro for each normalized arch name and use them instead of using arch name as string. Signed-off-by: Ravi Bangoria --- Changes in v5: - No changes. tools/perf/arch/common.c | 36 ++-- tools/perf/arch/common.h | 11 +++ tools/perf/util/unwind-libunwind.c | 4 ++-- 3 files changed, 31 insertions(+), 20 deletions(-) diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c index 886dd2a..f763666 100644 --- a/tools/perf/arch/common.c +++ b/tools/perf/arch/common.c @@ -123,25 +123,25 @@ static int lookup_triplets(const char *const *triplets, const char *name) const char *normalize_arch(char *arch) { if (!strcmp(arch, "x86_64")) - return "x86"; + return NORM_X86; if (arch[0] == 'i' && arch[2] == '8' && arch[3] == '6') - return "x86"; + return NORM_X86; if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5)) - return "sparc"; + return NORM_SPARC; if (!strcmp(arch, "aarch64") || !strcmp(arch, "arm64")) - return "arm64"; + return NORM_ARM64; if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110")) - return "arm"; + return NORM_ARM; if (!strncmp(arch, "s390", 4)) - return "s390"; + return NORM_S390; if (!strncmp(arch, "parisc", 6)) - return "parisc"; + return NORM_PARISC; if (!strncmp(arch, "powerpc", 7) || !strncmp(arch, "ppc", 3)) - return "powerpc"; + return NORM_POWERPC; if (!strncmp(arch, "mips", 4)) - return "mips"; + return NORM_MIPS; if (!strncmp(arch, "sh", 2) && isdigit(arch[2])) - return "sh"; + return NORM_SH; return arch; } @@ -181,21 +181,21 @@ static int perf_env__lookup_binutils_path(struct perf_env *env, zfree(&buf); } - if (!strcmp(arch, "arm")) + if (!strcmp(arch, NORM_ARM)) path_list = arm_triplets; - else if (!strcmp(arch, "arm64")) + else if (!strcmp(arch, NORM_ARM64)) path_list = arm64_triplets; - else if (!strcmp(arch, "powerpc")) + else if (!strcmp(arch, NORM_POWERPC)) path_list = powerpc_triplets; - else if (!strcmp(arch, "sh")) + else if (!strcmp(arch, NORM_SH)) path_list = sh_triplets; - else if (!strcmp(arch, "s390")) + else if (!strcmp(arch, NORM_S390)) path_list = s390_triplets; - else if (!strcmp(arch, "sparc")) + else if (!strcmp(arch, NORM_SPARC)) path_list = sparc_triplets; - else if (!strcmp(arch, "x86")) + else if (!strcmp(arch, NORM_X86)) path_list = x86_triplets; - else if (!strcmp(arch, "mips")) + else if (!strcmp(arch, NORM_MIPS)) path_list = mips_triplets; else { ui__error("binutils for %s not supported.\n", arch); diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h index 6b01c73..14ca8ca 100644 --- a/tools/perf/arch/common.h +++ b/tools/perf/arch/common.h @@ -5,6 +5,17 @@ extern const char *objdump_path; +/* Macro for normalized arch names */ +#define NORM_X86 "x86" +#define NORM_SPARC "sparc" +#define NORM_ARM64 "arm64" +#define NORM_ARM "arm" +#define NORM_S390 "s390" +#define NORM_PARISC"parisc" +#define NORM_POWERPC "powerpc" +#define NORM_MIPS "mips" +#define NORM_SH"sh" + int perf_env__lookup_objdump(struct perf_env *env); const char *normalize_arch(char *arch); diff --git a/tools/perf/util/unwind-libunwind.c b/tools/perf/util/unwind-libunwind.c index 6d542a4..6199102 100644 --- a/tools/perf/util/unwind-libunwind.c +++ b/tools/perf/util/unwind-libunwind.c @@ -40,10 +40,10 @@ int unwind__prepare_access(struct thread *thread, struct map *map, arch = normalize_arch(thread->mg->machine->env->arch); - if (!strcmp(arch, "x86")) { + if (!strcmp(arch, NORM_X86)) { if (dso_type != DSO__TYPE_64BIT) ops = x86_32_unwind_libunwind_ops; - } else if (!strcmp(arch, "arm64") || !strcmp(arch, "arm")) { + } else if (!strcmp(arch, NORM_ARM64) || 
!strcmp(arch, NORM_ARM)) { if (dso_type == DSO__TYPE_64BIT) ops = arm64_unwind_libunwind_ops; } -- 2.5.5
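For illustration, the caller pattern this enables, based on the unwind code above:

	/* env->arch is the recorded machine's arch string, e.g. "ppc64le";
	 * normalize_arch() folds it to one of the NORM_* names */
	const char *arch = normalize_arch(thread->mg->machine->env->arch);

	if (!strcmp(arch, NORM_POWERPC)) {
		/* select powerpc-specific handling */
	}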
[PATCH v5 0/7] perf: Cross arch annotate + few miscellaneous fixes
Currently Perf annotate supports code navigation (branches and calls) only when run on the same architecture where perf.data was recorded. For example, recording on a powerpc server and annotating on a client's x86 desktop is not supported. This patchset enables cross-arch annotate. Currently I've used the x86 and arm instructions which are already available and added support for powerpc. Additionally, this patch series also contains a few other related fixes. Patches are prepared on top of acme/perf/core and tested with x86 and powerpc only. Note for arm: A few instructions were defined under #if __arm__ which I've used as a table for arm. But I'm not sure whether instructions defined outside of that also contain arm instructions. Apart from that, 'call__parse()' and 'move__parse()' contain an #ifdef __arm__ directive. I've changed it to if (!strcmp(norm_arch, arm)). I don't have an arm machine to test these changes. Example: Record on powerpc: $ ./perf record -a Report -> Annotate on x86: $ ./perf report -i perf.data.powerpc --vmlinux vmlinux.powerpc Changes in v5: - Replaced symbol__annotate with symbol__disassemble. - Removed hacks for jump and call instructions like bctr and bctrl respectively from generic patch that enables support for powerpc and made separate patch for that. - v4 was not annotating powerpc 'btar' instruction. Included that. - Added a few generic fixes. v4 link: https://lkml.org/lkml/2016/7/8/10 Naveen N. Rao (1): perf annotate: Add support for powerpc Ravi Bangoria (6): perf: Define macro for normalized arch names perf annotate: Add cross arch annotate support perf annotate: Do not ignore call instruction with indirect target perf annotate: Show raw form for jump instruction with indirect target perf annotate: Support jump instruction with target as second operand perf annotate: Fix jump target outside of function address range tools/perf/arch/common.c | 36 ++--- tools/perf/arch/common.h | 11 ++ tools/perf/builtin-top.c | 2 +- tools/perf/ui/browsers/annotate.c | 8 +- tools/perf/ui/gtk/annotate.c | 2 +- tools/perf/util/annotate.c | 276 + tools/perf/util/annotate.h | 10 +- tools/perf/util/unwind-libunwind.c | 4 +- 8 files changed, 262 insertions(+), 87 deletions(-) -- 2.5.5
[PATCH v5 2/7] perf annotate: Add cross arch annotate support
Change current data structures and function to enable cross arch annotate. Current perf implementation does not support cross arch annotate. To make it truly cross arch, instruction table of all arch should be present in perf binary. And use appropriate table based on arch where perf.data was recorded. Signed-off-by: Ravi Bangoria --- Changes in v5: - Replaced symbol__annotate with symbol__disassemble. tools/perf/builtin-top.c | 2 +- tools/perf/ui/browsers/annotate.c | 3 +- tools/perf/ui/gtk/annotate.c | 2 +- tools/perf/util/annotate.c| 133 -- tools/perf/util/annotate.h| 5 +- 5 files changed, 92 insertions(+), 53 deletions(-) diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c index a3223aa..fdd4203 100644 --- a/tools/perf/builtin-top.c +++ b/tools/perf/builtin-top.c @@ -129,7 +129,7 @@ static int perf_top__parse_source(struct perf_top *top, struct hist_entry *he) return err; } - err = symbol__disassemble(sym, map, 0); + err = symbol__disassemble(sym, map, 0, NULL); if (err == 0) { out_assign: top->sym_filter_entry = he; diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 2e2d100..21c5e10 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -1050,7 +1050,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map *map, (nr_pcnt - 1); } - err = symbol__disassemble(sym, map, sizeof_bdl); + err = symbol__disassemble(sym, map, sizeof_bdl, + perf_evsel__env_arch(evsel)); if (err) { char msg[BUFSIZ]; symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg)); diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c index 42d3199..c127aba 100644 --- a/tools/perf/ui/gtk/annotate.c +++ b/tools/perf/ui/gtk/annotate.c @@ -167,7 +167,7 @@ static int symbol__gtk_annotate(struct symbol *sym, struct map *map, if (map->dso->annotate_warned) return -1; - err = symbol__disassemble(sym, map, 0); + err = symbol__disassemble(sym, map, 0, perf_evsel__env_arch(evsel)); if (err) { char msg[BUFSIZ]; symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg)); diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 25a9259..deb9af0 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -20,12 +20,14 @@ #include #include #include +#include +#include "../arch/common.h" const char *disassembler_style; const char *objdump_path; static regex_t file_lineno; -static struct ins *ins__find(const char *name); +static struct ins *ins__find(const char *name, const char *norm_arch); static int disasm_line__parse(char *line, char **namep, char **rawp); static void ins__delete(struct ins_operands *ops) @@ -53,7 +55,7 @@ int ins__scnprintf(struct ins *ins, char *bf, size_t size, return ins__raw_scnprintf(ins, bf, size, ops); } -static int call__parse(struct ins_operands *ops) +static int call__parse(struct ins_operands *ops, const char *norm_arch) { char *endptr, *tok, *name; @@ -65,10 +67,8 @@ static int call__parse(struct ins_operands *ops) name++; -#ifdef __arm__ - if (strchr(name, '+')) + if (!strcmp(norm_arch, NORM_ARM) && strchr(name, '+')) return -1; -#endif tok = strchr(name, '>'); if (tok == NULL) @@ -117,7 +117,8 @@ bool ins__is_call(const struct ins *ins) return ins->ops == &call_ops; } -static int jump__parse(struct ins_operands *ops) +static int jump__parse(struct ins_operands *ops, + const char *norm_arch __maybe_unused) { const char *s = strchr(ops->raw, '+'); @@ -172,7 +173,7 @@ static int comment__symbol(char *raw, char *comment, u64 *addrp, char **namep) return 0; } 
-static int lock__parse(struct ins_operands *ops) +static int lock__parse(struct ins_operands *ops, const char *norm_arch) { char *name; @@ -183,7 +184,7 @@ static int lock__parse(struct ins_operands *ops) if (disasm_line__parse(ops->raw, &name, &ops->locked.ops->raw) < 0) goto out_free_ops; - ops->locked.ins = ins__find(name); + ops->locked.ins = ins__find(name, norm_arch); free(name); if (ops->locked.ins == NULL) @@ -193,7 +194,7 @@ static int lock__parse(struct ins_operands *ops) return 0; if (ops->locked.ins->ops->parse && - ops->locked.ins->ops->parse(ops->locked.ops) < 0) + ops->locked.ins->ops->parse(ops->locked.ops, norm_arch) < 0) goto out_free_ops; return 0; @@ -236,7 +237,7 @@ static struct ins_ops lock_ops = { .scnprintf = lock__scnprintf, }; -static int mov__parse(s
[PATCH v5 4/7] perf annotate: Do not ignore call instruction with indirect target
Do not ignore a call instruction with an indirect target when it's already identified as a call. This is an extension of commit e8ea1561952b ("perf annotate: Use raw form for register indirect call instructions") to generalize annotation for all instructions with indirect calls. This is needed for certain powerpc call instructions that use an address in a register (such as bctrl, btarl, ...). Apart from that, when kcore is used to disassemble a function, all call instructions were ignored. This patch fixes that as a side effect by not ignoring them. For example, Before (with kcore): mov %r13,%rdi callq 0xffffffff811a7e70 ^ jmpq 64 mov %gs:0x7ef41a6e(%rip),%al After (with kcore): mov %r13,%rdi > callq 0xffffffff811a7e70 ^ jmpq 64 mov %gs:0x7ef41a6e(%rip),%al Suggested-by: Michael Ellerman [Suggested about 'bctrl' instruction] Signed-off-by: Ravi Bangoria --- Changes in v5: - New patch, introduced to annotate all indirect call instructions. tools/perf/util/annotate.c | 8 ++-- 1 file changed, 2 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 0b64841..6368ba9 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -81,16 +81,12 @@ static int call__parse(struct ins_operands *ops, const char *norm_arch) return ops->target.name == NULL ? -1 : 0; indirect_call: - tok = strchr(endptr, '('); - if (tok != NULL) { + tok = strchr(endptr, '*'); + if (tok == NULL) { ops->target.addr = 0; return 0; } - tok = strchr(endptr, '*'); - if (tok == NULL) - return -1; - ops->target.addr = strtoull(tok + 1, NULL, 16); return 0; } -- 2.5.5
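To see what the fallback above does, here is a toy, self-contained re-implementation of just the indirect-call branch (illustrative, not perf code):

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static unsigned long long indirect_target(const char *raw)
	{
		const char *tok = strchr(raw, '*');

		if (tok == NULL)
			return 0;	/* e.g. "bctrl": keep addr 0, but do not fail */
		return strtoull(tok + 1, NULL, 16);
	}

	int main(void)
	{
		printf("%llx\n", indirect_target("*0x113c0(%rip)"));	/* 113c0 */
		printf("%llx\n", indirect_target("*%rax"));		/* 0 */
		printf("%llx\n", indirect_target(""));			/* 0 */
		return 0;
	}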
[PATCH v5 7/7] perf annotate: Fix jump target outside of function address range
If the jump target is outside of the function range, perf does not handle it correctly. Especially when the target address is less than the function start address, the target offset will be negative. But the target address, being declared unsigned, converts the negative number into its 2's complement. See the example below. Here the target of the 'jmpq' instruction at 34cf8 is 34ac0, which is less than the function start address (34cf0). 34ac0 - 34cf0 = -0x230 = 0xfffffffffffffdd0 Objdump output: 00034cf0 <__sigaction>: __GI___sigaction(): 34cf0: lea -0x20(%rdi),%eax 34cf3: cmp $0x1,%eax 34cf6: jbe 34d00 <__sigaction+0x10> 34cf8: jmpq 34ac0 <__GI___libc_sigaction> 34cfd: nopl (%rax) 34d00: mov 0x386161(%rip),%rax # 3bae68 <_DYNAMIC+0x2e8> 34d07: movl $0x16,%fs:(%rax) 34d0e: mov $0xffffffff,%eax 34d13: retq perf annotate before applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp $0x1,%eax V jbe 10 V jmpq fffffffffffffdd0 nop 10: mov _DYNAMIC+0x2e8,%rax movl $0x16,%fs:(%rax) mov $0xffffffff,%eax retq perf annotate after applying patch: __GI___sigaction /usr/lib64/libc-2.22.so lea -0x20(%rdi),%eax cmp $0x1,%eax V jbe 10 ^ jmpq 34ac0 <__GI___libc_sigaction> nop 10: mov _DYNAMIC+0x2e8,%rax movl $0x16,%fs:(%rax) mov $0xffffffff,%eax retq Signed-off-by: Ravi Bangoria --- Changes in v5: - New patch tools/perf/ui/browsers/annotate.c | 5 +++-- tools/perf/util/annotate.c| 14 +- tools/perf/util/annotate.h| 5 +++-- 3 files changed, 15 insertions(+), 9 deletions(-) diff --git a/tools/perf/ui/browsers/annotate.c b/tools/perf/ui/browsers/annotate.c index 21c5e10..c13df5b 100644 --- a/tools/perf/ui/browsers/annotate.c +++ b/tools/perf/ui/browsers/annotate.c @@ -215,7 +215,7 @@ static void annotate_browser__write(struct ui_browser *browser, void *entry, int ui_browser__set_color(browser, color); if (dl->ins && dl->ins->ops->scnprintf) { if (ins__is_jump(dl->ins)) { - bool fwd = dl->ops.target.offset > (u64)dl->offset; + bool fwd = dl->ops.target.offset > dl->offset; ui_browser__write_graph(browser, fwd ?
SLSMG_DARROW_CHAR : SLSMG_UARROW_CHAR); @@ -245,7 +245,8 @@ static bool disasm_line__is_valid_jump(struct disasm_line *dl, struct symbol *sy { if (!dl || !dl->ins || !ins__is_jump(dl->ins) || !disasm_line__has_offset(dl) - || dl->ops.target.offset >= symbol__size(sym)) + || dl->ops.target.offset < 0 + || dl->ops.target.offset >= (s64)symbol__size(sym)) return false; return true; diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 678fb81..c8b017c 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -124,10 +124,12 @@ static int jump__parse(struct ins_operands *ops, else ops->target.addr = strtoull(ops->raw, NULL, 16); - if (s++ != NULL) + if (s++ != NULL) { ops->target.offset = strtoull(s, NULL, 16); - else - ops->target.offset = UINT64_MAX; + ops->target.offset_avail = true; + } else { + ops->target.offset_avail = false; + } return 0; } @@ -135,7 +137,7 @@ static int jump__parse(struct ins_operands *ops, static int jump__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops) { - if (!ops->target.addr) + if (!ops->target.addr || ops->target.offset < 0) return ins__raw_scnprintf(ins, bf, size, ops); return scnprintf(bf, size, "%-6.6s %" PRIx64, ins->name, ops->target.offset); @@ -1228,9 +1230,11 @@ static int symbol__parse_objdump_line(struct symbol *sym, struct map *map, if (dl == NULL) return -1; - if (dl->ops.target.offset == UINT64_MAX) + if (!disasm_line__has_offset(dl)) { dl->ops.target.offset = dl->ops.target.addr - map__rip_2objdump(map, sym->start); + dl->ops.target.offset_avail = true; + } /* kcore has no symbols, so add the call target name */ if (dl->ins && ins__is_call(dl->ins) && !dl->ops.target.name) { diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h index 5cfad4e..5787ed8 100644 --- a/tools/perf/util/annotate.h +++ b/tools/perf/util/annotate.h @@ -19,7 +19,8 @@ struct ins_operands { char*raw; char*na
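A two-line demonstration of the wraparound the changelog describes, runnable standalone:

	#include <stdio.h>
	#include <stdint.h>

	int main(void)
	{
		uint64_t u = 0x34ac0 - 0x34cf0;	/* wraps to 0xfffffffffffffdd0 */
		int64_t s = (int64_t)u;		/* -0x230, usable for range checks */

		printf("unsigned: %#llx\n", (unsigned long long)u);
		printf("signed: %lld\n", (long long)s);
		return 0;
	}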
[PATCH v5 5/7] perf annotate: Show raw form for jump instruction with indirect target
For jump instructions that do not include the target address as a direct operand, use the raw form instead. This is needed for certain powerpc jump instructions that use a target address in a register (such as bctr, btar, ...). Suggested-by: Michael Ellerman Signed-off-by: Ravi Bangoria --- Changes in v5: - New patch introduced to annotate jump instruction with indirect target tools/perf/util/annotate.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 6368ba9..4a4a583 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -131,6 +131,9 @@ static int jump__scnprintf(struct ins *ins, char *bf, size_t size, struct ins_operands *ops) { + if (!ops->target.addr) + return ins__raw_scnprintf(ins, bf, size, ops); + return scnprintf(bf, size, "%-6.6s %" PRIx64, ins->name, ops->target.offset); } -- 2.5.5
[PATCH v5 6/7] perf annotate: Support jump instruction with target as second operand
Current perf is not able to parse a jump instruction when the second operand contains the target address. Arches like powerpc have such instructions, for example, 'beq cr7,10173e60'. Signed-off-by: Ravi Bangoria --- Changes in v5: - New patch tools/perf/util/annotate.c | 6 +- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index 4a4a583..678fb81 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -117,8 +117,12 @@ static int jump__parse(struct ins_operands *ops, const char *norm_arch __maybe_unused) { const char *s = strchr(ops->raw, '+'); + const char *c = strchr(ops->raw, ','); - ops->target.addr = strtoull(ops->raw, NULL, 16); + if (c++ != NULL) + ops->target.addr = strtoull(c, NULL, 16); + else + ops->target.addr = strtoull(ops->raw, NULL, 16); if (s++ != NULL) ops->target.offset = strtoull(s, NULL, 16); -- 2.5.5
[PATCH v5 3/7] perf annotate: Add support for powerpc
From: "Naveen N. Rao" Current perf can disassemble annotated function but it does not have parsing logic for powerpc instructions. So all navigation options are not available for powerpc. Apart from that, Powerpc has long list of branch instructions and hardcoding them in table appears to be error-prone. So, add function to find instruction instead of creating table. This function dynamically create table (list of 'struct ins'), and instead of creating object every time, first check if list already contain object for that instruction. Signed-off-by: Naveen N. Rao Signed-off-by: Ravi Bangoria --- Changes in v5: - Removed hacks for instructions like bctr and bctrl from this patch. tools/perf/util/annotate.c | 116 + 1 file changed, 116 insertions(+) diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c index deb9af0..0b64841 100644 --- a/tools/perf/util/annotate.c +++ b/tools/perf/util/annotate.c @@ -459,6 +459,11 @@ static struct ins instructions_arm[] = { { .name = "bne", .ops = &jump_ops, }, }; +struct instructions_powerpc { + struct ins *ins; + struct list_head list; +}; + static int ins__key_cmp(const void *name, const void *insp) { const struct ins *ins = insp; @@ -474,6 +479,115 @@ static int ins__cmp(const void *a, const void *b) return strcmp(ia->name, ib->name); } +static struct ins *list_add__ins_powerpc(struct instructions_powerpc *head, +const char *name, struct ins_ops *ops) +{ + struct instructions_powerpc *ins_powerpc; + struct ins *ins; + + ins = zalloc(sizeof(struct ins)); + if (!ins) + return NULL; + + ins_powerpc = zalloc(sizeof(struct instructions_powerpc)); + if (!ins_powerpc) + goto out_free_ins; + + ins->name = strdup(name); + if (!ins->name) + goto out_free_ins_power; + + ins->ops = ops; + ins_powerpc->ins = ins; + list_add_tail(&(ins_powerpc->list), &(head->list)); + + return ins; + +out_free_ins_power: + zfree(&ins_powerpc); +out_free_ins: + zfree(&ins); + return NULL; +} + +static struct ins *list_search__ins_powerpc(struct instructions_powerpc *head, + const char *name) +{ + struct instructions_powerpc *pos; + + list_for_each_entry(pos, &head->list, list) { + if (!strcmp(pos->ins->name, name)) + return pos->ins; + } + return NULL; +} + +static struct ins *ins__find_powerpc(const char *name) +{ + int i; + struct ins *ins; + struct ins_ops *ops; + static struct instructions_powerpc head; + static bool list_initialized; + + /* +* - Interested only if instruction starts with 'b'. +* - Few start with 'b', but aren't branch instructions. +*/ + if (name[0] != 'b' || + !strncmp(name, "bcd", 3) || + !strncmp(name, "brinc", 5) || + !strncmp(name, "bper", 4)) + return NULL; + + if (!list_initialized) { + INIT_LIST_HEAD(&head.list); + list_initialized = true; + } + + /* +* Return if we already have object of 'struct ins' for this instruction +*/ + ins = list_search__ins_powerpc(&head, name); + if (ins) + return ins; + + ops = &jump_ops; + + i = strlen(name) - 1; + if (i < 0) + return NULL; + + /* ignore optional hints at the end of the instructions */ + if (name[i] == '+' || name[i] == '-') + i--; + + if (name[i] == 'l' || (name[i] == 'a' && name[i-1] == 'l')) { + /* +* if the instruction ends up with 'l' or 'la', then +* those are considered 'calls' since they update LR. +* ... except for 'bnl' which is branch if not less than +* and the absolute form of the same. 
+*/ + if (strcmp(name, "bnl") && strcmp(name, "bnl+") && + strcmp(name, "bnl-") && strcmp(name, "bnla") && + strcmp(name, "bnla+") && strcmp(name, "bnla-")) + ops = &call_ops; + } + if (name[i] == 'r' && name[i-1] == 'l') + /* +* instructions ending with 'lr' are considered to be +* return instructions +*/ + ops = &ret_ops; + + /* +* Add instruction to list so next time no need to +* allocate memory for it. +*/ + return list_add__ins_powerpc(&head, name, ops); +} + static void ins__sort(struct ins *instructions, int nmemb) { qsort(instructions, nmemb, sizeof(struct ins), ins__cmp); @@ -509,6 +623,8 @@ static struct ins *ins__find(const char *name, const char *norm_arch) } e
Re: linux-next: build warnings after merge of the kbuild tree
On Fri, 19 Aug 2016 15:09:14 +1000 Stephen Rothwell wrote: > Hi Nick, > > On Fri, 19 Aug 2016 13:38:54 +1000 Stephen Rothwell > wrote: > > > > On Thu, 18 Aug 2016 11:09:48 +1000 Nicholas Piggin > > wrote: > > > > > > On Wed, 17 Aug 2016 14:59:59 +0200 > > > Michal Marek wrote: > > > > > > > On 2016-08-17 03:44, Stephen Rothwell wrote: > > > > > > > > > > After merging the kbuild tree, today's linux-next build (powerpc > > > > > ppc64_defconfig) produced these warnings: > > > > > > > > > > WARNING: 25 bad relocations > > > > > c000000000cf2570 R_PPC64_ADDR64 __crc___arch_hweight16 > > > > [...] > > > > > Introduced by commit > > > > > > > > > > 9445aa1a3062 ("ppc: move exports to definitions") > > > > > > > > > > I have reverted that commit for today. > > > > > > > > > > [cc-ing the ppc guys for clues - also involved is commit > > > > > > > > > > 22823ab419d8 ("EXPORT_SYMBOL() for asm") > > > > > ] > > > > > > > > FWIW, I see these warnings as well. Any help from ppc developers is > > > > appreciated - should the R_PPC64_ADDR64 be whitelisted for exported asm > > > > symbols (their CRCs actually)? > > > > > > The dangling relocation is a side effect of the linker being unable to resolve the > > > reference to the undefined weak symbols. So the real question is, why has > > > genksyms not overridden these symbols with their CRC values? > > > > > > This may not even be powerpc specific, but I'll poke at it a bit more > > > when I get a chance. > > > > Not sure if this is relevant, but with the commit reverted, the > > __crc___... symbols are absolute. > > > > f55b3b3d A __crc___arch_hweight16 > > Ignore that :-) > > I just had a look at an x86_64 allmodconfig result and it looks like the > weak symbols are not resolved there either ... > > I may be missing something, but genksyms generates the crc's off the > preprocessed C source code and we don't have any for the asm files ... Looks like you're right, good find! Thanks, Nick
[PATCH 01/13] powerpc: Add simple cache inhibited MMIO accessors
From: Suresh Warrier Add simple cache inhibited accessors for memory mapped I/O. Unlike the accessors built from the DEF_MMIO_* macros, these don't include any hardware memory barriers; callers need to manage memory barriers on their own. These can only be called in hypervisor real mode. Signed-off-by: Suresh Warrier [pau...@ozlabs.org - added line to comment] Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/io.h | 29 + 1 file changed, 29 insertions(+) diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h index 2fd1690..f6fda84 100644 --- a/arch/powerpc/include/asm/io.h +++ b/arch/powerpc/include/asm/io.h @@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, u64 val) #endif #endif /* __powerpc64__ */ + +/* + * Simple Cache inhibited accessors + * Unlike the DEF_MMIO_* macros, these don't include any h/w memory + * barriers, callers need to manage memory barriers on their own. + * These can only be used in hypervisor real mode. + */ + +static inline u32 _lwzcix(unsigned long addr) +{ + u32 ret; + + __asm__ __volatile__("lwzcix %0,0, %1" +: "=r" (ret) : "r" (addr) : "memory"); + return ret; +} + +static inline void _stbcix(u64 addr, u8 val) +{ + __asm__ __volatile__("stbcix %0,0,%1" + : : "r" (val), "r" (addr) : "memory"); +} + +static inline void _stwcix(u64 addr, u32 val) +{ + __asm__ __volatile__("stwcix %0,0,%1" + : : "r" (val), "r" (addr) : "memory"); +} + /* * Low level IO stream instructions are defined out of line for now */ -- 2.8.1
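As a usage sketch (not part of the patch), a real-mode caller would pair these accessors with explicit barriers roughly as follows. xics_phys, XICS_XIRR, XICS_MFRR and XICS_IPI are assumed to come from the caller's context, and the sketch assumes a big-endian host; see patch 02/13 for the real code including the little-endian handling:

/* Hedged sketch: poll the XICS in hypervisor real mode. */
static u32 example_real_mode_poll(unsigned long xics_phys)
{
	u32 xirr = _lwzcix(xics_phys + XICS_XIRR); /* raw CI load, no barrier */
	u32 xisr = xirr & 0xffffff;   /* low 24 bits: interrupt source */

	smp_mb();                     /* order the load before acting on it */

	if (xisr == XICS_IPI) {
		_stbcix(xics_phys + XICS_MFRR, 0xff); /* reset MFRR priority */
		_stwcix(xics_phys + XICS_XIRR, xirr); /* EOI the IPI */
		smp_mb();             /* let the stores' side effects complete */
	}
	return xirr;
}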
[PATCH 00/13] Real-mode acceleration of device interrupts in HV KVM
This patch set reduces the latency for presenting interrupts from PCI pass-through devices to a Book3S HV guest. Currently, if an interrupt arrives from a PCI pass-through device while a guest is running, it causes an exit of all threads on the core to the host, where the interrupt is handled by making an interrupt pending in the virtual XICS interrupt controller for the guest that owns the device. Furthermore, there is currently no attempt to direct PCI pass-through device interrupts to the physical core where the VCPU that they are directed to is running, so they often land on a different core and require an IPI to interrupt the VCPU. With this patch set, if the interrupt arrives on a core where the correct guest is running, it can be handled in hypervisor real mode without needing an exit to host context. If the destination VCPU is on the same core, then we can interrupt it using at most a msgsnd (message send) instruction, which is considerably faster than an IPI. Further, if an interrupt arrives on a different core, we then change the destination for the interrupt in the physical interrupt controller to point to the core where the VCPU is running. For now, we always direct the interrupt to thread 0 of the core because the other threads are offline from the point of view of the host, and the offline loop (which is where those other threads run when thread 0 is in host context) doesn't handle device interrupts. This patch set is based on a patch set from Suresh Warrier, with considerable revision by me. The data structure for mapping host interrupt numbers to guest interrupt numbers is just a flat array that is searched linearly, which works and is simple but could perform poorly with large numbers of interrupt sources. It would be simple to replace this mapping array with a more sophisticated data structure in future. To test the performance of this patch set, I used a network one-byte ping-pong test between a guest with a Mellanox CX-3 passed through to it, connected over 10Gb ethernet to another POWER8 system running bare-metal with a Chelsio 10Gb ethernet adapter. (The guest was running Ubuntu 16.04.1 under QEMU v2.7-rc2 on a POWER8.) Without this patchset, the round-trip latency was 43us, and with it the latency was 41us, a saving of 2us per round-trip. Paul. - arch/powerpc/include/asm/io.h | 29 arch/powerpc/include/asm/kvm_asm.h | 10 ++ arch/powerpc/include/asm/kvm_book3s.h | 1 + arch/powerpc/include/asm/kvm_host.h| 20 +++ arch/powerpc/include/asm/kvm_ppc.h | 28 arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/include/asm/pnv-pci.h | 3 + arch/powerpc/kvm/Kconfig | 2 + arch/powerpc/kvm/book3s.c | 3 + arch/powerpc/kvm/book3s_hv.c | 199 - arch/powerpc/kvm/book3s_hv_builtin.c | 141 ++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 120 +++ arch/powerpc/kvm/book3s_hv_rmhandlers.S| 183 +-- arch/powerpc/kvm/book3s_xics.c | 55 ++- arch/powerpc/kvm/book3s_xics.h | 2 + arch/powerpc/kvm/powerpc.c | 38 + arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + arch/powerpc/platforms/powernv/pci-ioda.c | 24 ++- 18 files changed, 773 insertions(+), 87 deletions(-)
[PATCH 02/13] KVM: PPC: Book3S HV: Convert kvmppc_read_intr to a C function
From: Suresh Warrier Modify kvmppc_read_intr to make it a C function. Because it is called from kvmppc_check_wake_reason, any of the assembler code that calls either kvmppc_read_intr or kvmppc_check_wake_reason now has to assume that the volatile registers might have been modified. This also adds in the optimization of clearing saved_xirr in the case where we completely handle and EOI an IPI. Without this, the next device interrupt will require two trips through the host interrupt handling code. [pau...@ozlabs.org - made kvmppc_check_wake_reason create a stack frame when it is calling kvmppc_read_intr, which means we can set r12 to the trap number (0x500) after the call to kvmppc_read_intr, instead of using r31. Also moved the deliver_guest_interrupt label so as to restore XER and CTR, plus other minor tweaks.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_builtin.c| 84 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 158 +++- 2 files changed, 158 insertions(+), 84 deletions(-) diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index 5f0380d..b476a6a 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -25,6 +25,7 @@ #include #include #include +#include #define KVM_CMA_CHUNK_ORDER18 @@ -286,3 +287,86 @@ void kvmhv_commence_exit(int trap) struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); + +/* + * Determine what sort of external interrupt is pending (if any). + * Returns: + * 0 if no interrupt is pending + * 1 if an interrupt is pending that needs to be handled by the host + * -1 if there was a guest wakeup IPI (which has now been cleared) + */ + +long kvmppc_read_intr(void) +{ + unsigned long xics_phys; + u32 h_xirr; + __be32 xirr; + u32 xisr; + u8 host_ipi; + + /* see if a host IPI is pending */ + host_ipi = local_paca->kvm_hstate.host_ipi; + if (host_ipi) + return 1; + + /* Now read the interrupt from the ICP */ + xics_phys = local_paca->kvm_hstate.xics_phys; + if (unlikely(!xics_phys)) + return 1; + + /* +* Save XIRR for later. Since we get control in reverse endian +* on LE systems, save it byte reversed and fetch it back in +* host endian. Note that xirr is the value read from the +* XIRR register, while h_xirr is the host endian version. +*/ + xirr = _lwzcix(xics_phys + XICS_XIRR); + h_xirr = be32_to_cpu(xirr); + local_paca->kvm_hstate.saved_xirr = h_xirr; + xisr = h_xirr & 0xff; + /* +* Ensure that the store/load complete to guarantee all side +* effects of loading from XIRR has completed +*/ + smp_mb(); + + /* if nothing pending in the ICP */ + if (!xisr) + return 0; + + /* We found something in the ICP... +* +* If it is an IPI, clear the MFRR and EOI it. +*/ + if (xisr == XICS_IPI) { + _stbcix(xics_phys + XICS_MFRR, 0xff); + _stwcix(xics_phys + XICS_XIRR, xirr); + /* +* Need to ensure side effects of above stores +* complete before proceeding. +*/ + smp_mb(); + + /* +* We need to re-check host IPI now in case it got set in the +* meantime. 
If it's clear, we bounce the interrupt to the +* guest +*/ + host_ipi = local_paca->kvm_hstate.host_ipi; + if (unlikely(host_ipi != 0)) { + /* We raced with the host, +* we need to resend that IPI, bummer +*/ + _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY); + /* Let side effects complete */ + smp_mb(); + return 1; + } + + /* OK, it's an IPI for us */ + local_paca->kvm_hstate.saved_xirr = 0; + return -1; + } + + return 1; +} diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S index 9756555..dccfa85 100644 --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S @@ -221,6 +221,13 @@ kvmppc_primary_no_guest: li r3, 0 /* Don't wake on privileged (OS) doorbell */ b kvm_do_nap +/* + * kvm_novcpu_wakeup + * Entered from kvm_start_guest if kvm_hstate.napping is set + * to NAPPING_NOVCPU + * r2 = kernel TOC + * r13 = paca + */ kvm_novcpu_wakeup: ld r1, HSTATE_HOST_R1(r13) ld r5, HSTATE_KVM_VCORE(r13) @@ -230,6 +237,13 @@ kvm_novcpu_wakeup: /* c
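The endian dance around saved_xirr in kvmppc_read_intr() can be illustrated with a small standalone program; ntohl() stands in for be32_to_cpu(), the XISR occupies the low 24 bits of the XIRR with the CPPR in the top 8, and the sample value below is made up:

#include <stdio.h>
#include <stdint.h>
#include <arpa/inet.h>	/* ntohl()/htonl() as be32_to_cpu()/cpu_to_be32() */

int main(void)
{
	uint32_t xirr = htonl(0xff000002); /* as loaded from XIRR: big-endian */
	uint32_t h_xirr = ntohl(xirr);     /* fetched back in host endianness */
	uint32_t xisr = h_xirr & 0xffffff; /* interrupt source (2 == IPI) */
	uint32_t cppr = h_xirr >> 24;      /* current processor priority */

	printf("h_xirr=%#x xisr=%#x cppr=%#x\n", h_xirr, xisr, cppr);
	return 0;
}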
[PATCH 03/13] KVM: PPC: select IRQ_BYPASS_MANAGER
From: Suresh Warrier Select IRQ_BYPASS_MANAGER for PPC when CONFIG_KVM is set. Add the PPC producer functions for add and del producer. [pau...@ozlabs.org - Moved new functions from book3s.c to powerpc.c so booke compiles; added kvm_arch_has_irq_bypass implementation.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 4 arch/powerpc/kvm/Kconfig | 2 ++ arch/powerpc/kvm/powerpc.c | 38 ++ 3 files changed, 44 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 2544eda..94715e2 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -287,6 +287,10 @@ struct kvmppc_ops { long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl, unsigned long arg); int (*hcall_implemented)(unsigned long hcall); + int (*irq_bypass_add_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); + void (*irq_bypass_del_producer)(struct irq_bypass_consumer *, + struct irq_bypass_producer *); }; extern struct kvmppc_ops *kvmppc_hv_ops; diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig index c2024ac..7ac0569 100644 --- a/arch/powerpc/kvm/Kconfig +++ b/arch/powerpc/kvm/Kconfig @@ -22,6 +22,8 @@ config KVM select ANON_INODES select HAVE_KVM_EVENTFD select SRCU + select IRQ_BYPASS_MANAGER + select HAVE_KVM_IRQ_BYPASS config KVM_BOOK3S_HANDLER bool diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c index 6ce40dd..6d51e0f 100644 --- a/arch/powerpc/kvm/powerpc.c +++ b/arch/powerpc/kvm/powerpc.c @@ -27,6 +27,8 @@ #include #include #include +#include +#include #include #include #include @@ -739,6 +741,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu) #endif } +/* + * irq_bypass_add_producer and irq_bypass_del_producer are only + * useful if the architecture supports PCI passthrough. + * irq_bypass_stop and irq_bypass_start are not needed and so + * kvm_ops are not defined for them. + */ +bool kvm_arch_has_irq_bypass(void) +{ + return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) || + (kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer)); +} + +int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons, +struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + + if (kvm->arch.kvm_ops->irq_bypass_add_producer) + return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod); + + return 0; +} + +void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons, + struct irq_bypass_producer *prod) +{ + struct kvm_kernel_irqfd *irqfd = + container_of(cons, struct kvm_kernel_irqfd, consumer); + struct kvm *kvm = irqfd->kvm; + + if (kvm->arch.kvm_ops->irq_bypass_del_producer) + kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod); +} + static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu, struct kvm_run *run) { -- 2.8.1
[PATCH 04/13] KVM: PPC: Book3S HV: Introduce kvmppc_passthru_irqmap
From: Suresh Warrier This patch introduces an IRQ mapping structure, kvmppc_passthru_irqmap, that is used to map the real hardware IRQ in the host to the virtual hardware IRQ (gsi) that is injected into a guest by KVM for passthrough adapters. Currently, we assume a separate IRQ mapping structure for each guest. Each kvmppc_passthru_irqmap has a mapping array containing all defined real<->virtual IRQs. [pau...@ozlabs.org - removed irq_chip field from struct kvmppc_passthru_irqmap; changed parameter for kvmppc_get_passthru_irqmap from struct kvm_vcpu * to struct kvm *, removed small cached array.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 17 + arch/powerpc/include/asm/kvm_ppc.h | 14 ++ arch/powerpc/kvm/book3s_hv.c| 13 + 3 files changed, 44 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index ec35af3..3eb5092 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -197,6 +197,8 @@ struct kvmppc_spapr_tce_table { struct kvmppc_xics; struct kvmppc_icp; +struct kvmppc_passthru_irqmap; + /* * The reverse mapping array has one entry for each HPTE, * which stores the guest's view of the second word of the HPTE @@ -267,6 +269,7 @@ struct kvm_arch { #endif #ifdef CONFIG_KVM_XICS struct kvmppc_xics *xics; + struct kvmppc_passthru_irqmap *pimap; #endif struct kvmppc_ops *kvm_ops; #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE @@ -397,6 +400,20 @@ struct kvmhv_tb_accumulator { u64 tb_max; /* max time */ }; +#ifdef CONFIG_PPC_BOOK3S_64 +struct kvmppc_irq_map { + u32 r_hwirq; + u32 v_hwirq; + struct irq_desc *desc; +}; + +#defineKVMPPC_PIRQ_MAPPED 1024 +struct kvmppc_passthru_irqmap { + int n_mapped; + struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED]; +}; +#endif + # ifdef CONFIG_PPC_FSL_BOOK3E #define KVMPPC_BOOKE_IAC_NUM 2 #define KVMPPC_BOOKE_DAC_NUM 2 diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 94715e2..4ca2ba3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -457,8 +457,18 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return vcpu->arch.irq_type == KVMPPC_IRQ_XICS; } + +static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( + struct kvm *kvm) +{ + if (kvm) + return kvm->arch.pimap; + return NULL; +} + extern void kvmppc_alloc_host_rm_ops(void); extern void kvmppc_free_host_rm_ops(void); +extern void kvmppc_free_pimap(struct kvm *kvm); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -470,8 +480,12 @@ extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, extern void kvmppc_xics_ipi_action(void); extern int h_ipi_redirect; #else +static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( + struct kvm *kvm) + { return NULL; } static inline void kvmppc_alloc_host_rm_ops(void) {}; static inline void kvmppc_free_host_rm_ops(void) {}; +static inline void kvmppc_free_pimap(struct kvm *kvm) {}; static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2fd5580..413b5c2f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3282,6 +3282,19 @@ static
int kvmppc_core_check_processor_compat_hv(void) return 0; } +#ifdef CONFIG_KVM_XICS + +void kvmppc_free_pimap(struct kvm *kvm) +{ + kfree(kvm->arch.pimap); +} + +struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) +{ + return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL); +} +#endif + static long kvm_arch_vm_ioctl_hv(struct file *filp, unsigned int ioctl, unsigned long arg) { -- 2.8.1
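The flat mapping array described in the cover letter can be sketched in isolation as follows. The types are simplified and userspace-testable; the real kvmppc_irq_map also carries a struct irq_desc pointer, and the real lookup runs in hypervisor real mode:

#include <stdio.h>

#define NR_MAPPED 1024

struct irq_map { unsigned int r_hwirq, v_hwirq; };

struct pt_irqmap {
	int n_mapped;
	struct irq_map mapped[NR_MAPPED];
};

/* Linear search: return the guest IRQ (gsi) for a host hw IRQ, 0 if unmapped. */
static unsigned int map_host_to_guest(struct pt_irqmap *pimap, unsigned int r_hwirq)
{
	int i;

	for (i = 0; i < pimap->n_mapped; i++)
		if (pimap->mapped[i].r_hwirq == r_hwirq)
			return pimap->mapped[i].v_hwirq;
	return 0;
}

int main(void)
{
	static struct pt_irqmap pimap = {
		.n_mapped = 2,
		.mapped = { { 0x1432, 25 }, { 0x1433, 26 } },
	};

	printf("hwirq 0x1433 -> gsi %u\n", map_host_to_guest(&pimap, 0x1433));
	return 0;
}

As the cover letter notes, this is simple but O(n); a more sophisticated structure could replace it later without changing the callers.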
[PATCH 06/13] KVM: PPC: Book3S HV: Enable IRQ bypass
From: Suresh Warrier Add the irq_bypass_add_producer and irq_bypass_del_producer functions. These functions get called whenever a GSI is being defined for a guest. They create/remove the mapping between host real IRQ numbers and the guest GSI. Add the following helper functions to manage the passthrough IRQ map. kvmppc_set_passthru_irq() Creates a mapping in the passthrough IRQ map that maps a host IRQ to a guest GSI. It allocates the structure (one per guest VM) the first time it is called. kvmppc_clr_passthru_irq() Removes the passthrough IRQ map entry given a guest GSI. The passthrough IRQ map structure is not freed even when the number of mapped entries goes to zero. It is only freed when the VM is destroyed. [pau...@ozlabs.org - modified to use is_pnv_opal_msi() rather than requiring all passed-through interrupts to use the same irq_chip; changed deletion so it zeroes out the r_hwirq field rather than copying the last entry down and decrementing the number of entries.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv.c | 160 ++- 1 file changed, 159 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 413b5c2f..aa11647 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -53,10 +53,13 @@ #include #include #include +#include #include #include #include #include +#include +#include #include #include "book3s.h" @@ -3247,6 +3250,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm) kvmppc_free_vcores(kvm); kvmppc_free_hpt(kvm); + + kvmppc_free_pimap(kvm); } /* We don't need to emulate any privileged instructions or dcbz */ @@ -3289,10 +3294,159 @@ void kvmppc_free_pimap(struct kvm *kvm) kfree(kvm->arch.pimap); } -struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) +static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void) { return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL); } + +static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) +{ + struct irq_desc *desc; + struct kvmppc_irq_map *irq_map; + struct kvmppc_passthru_irqmap *pimap; + struct irq_chip *chip; + int i; + + desc = irq_to_desc(host_irq); + if (!desc) + return -EIO; + + mutex_lock(&kvm->lock); + + pimap = kvm->arch.pimap; + if (pimap == NULL) { + /* First call, allocate structure to hold IRQ map */ + pimap = kvmppc_alloc_pimap(); + if (pimap == NULL) { + mutex_unlock(&kvm->lock); + return -ENOMEM; + } + kvm->arch.pimap = pimap; + } + + /* +* For now, we only support interrupts for which the EOI operation +* is an OPAL call followed by a write to XIRR, since that's +* what our real-mode EOI code does. +*/ + chip = irq_data_get_irq_chip(&desc->irq_data); + if (!chip || !is_pnv_opal_msi(chip)) { + pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map for (%d,%d)\n", + host_irq, guest_gsi); + mutex_unlock(&kvm->lock); + return -ENOENT; + } + + /* +* See if we already have an entry for this guest IRQ number. +* If it's mapped to a hardware IRQ number, that's an error, +* otherwise re-use this entry. 
+*/ + for (i = 0; i < pimap->n_mapped; i++) { + if (guest_gsi == pimap->mapped[i].v_hwirq) { + if (pimap->mapped[i].r_hwirq) { + mutex_unlock(&kvm->lock); + return -EINVAL; + } + break; + } + } + + if (i == KVMPPC_PIRQ_MAPPED) { + mutex_unlock(&kvm->lock); + return -EAGAIN; /* table is full */ + } + + irq_map = &pimap->mapped[i]; + + irq_map->v_hwirq = guest_gsi; + irq_map->r_hwirq = desc->irq_data.hwirq; + irq_map->desc = desc; + + if (i == pimap->n_mapped) + pimap->n_mapped++; + + mutex_unlock(&kvm->lock); + + return 0; +} + +static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) +{ + struct irq_desc *desc; + struct kvmppc_passthru_irqmap *pimap; + int i; + + desc = irq_to_desc(host_irq); + if (!desc) + return -EIO; + + mutex_lock(&kvm->lock); + + if (kvm->arch.pimap == NULL) { + mutex_unlock(&kvm->lock); + return 0; + } + pimap = kvm->arch.pimap; + + for (i = 0; i < pimap->n_mapped; i++) { + if (guest_gsi == pimap->mapped[i].v_hwirq) + break; + } + +
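The add/remove policy described above - re-use an entry whose r_hwirq was zeroed, never shrink n_mapped - can be sketched with the same simplified types as before (illustrative only; locking and the irq_desc handling are omitted):

struct irq_map { unsigned int r_hwirq, v_hwirq; };
struct pt_irqmap { int n_mapped; struct irq_map mapped[1024]; };

static int map_add(struct pt_irqmap *pimap, unsigned int r_hwirq, unsigned int v_hwirq)
{
	int i;

	/* Re-use an invalidated slot for this guest IRQ if one exists. */
	for (i = 0; i < pimap->n_mapped; i++) {
		if (pimap->mapped[i].v_hwirq == v_hwirq) {
			if (pimap->mapped[i].r_hwirq)
				return -1;	/* already mapped: error */
			break;
		}
	}
	if (i == 1024)
		return -1;			/* table is full */

	pimap->mapped[i].v_hwirq = v_hwirq;
	pimap->mapped[i].r_hwirq = r_hwirq;
	if (i == pimap->n_mapped)
		pimap->n_mapped++;		/* appended a new entry */
	return 0;
}

static void map_del(struct pt_irqmap *pimap, unsigned int v_hwirq)
{
	int i;

	/* Invalidate in place; entries never move and n_mapped never drops. */
	for (i = 0; i < pimap->n_mapped; i++)
		if (pimap->mapped[i].v_hwirq == v_hwirq)
			pimap->mapped[i].r_hwirq = 0;
}

Invalidating in place rather than compacting is what lets the lock-free real-mode reader (patch 07/13) walk the array safely.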
[PATCH 05/13] powerpc/powernv: Provide facilities for EOI, usable from real mode
From: Suresh Warrier This adds a new function pnv_opal_pci_msi_eoi() which does the part of end-of-interrupt (EOI) handling of an MSI which involves doing an OPAL call. This function can be called in real mode. This doesn't just export pnv_ioda2_msi_eoi() because that does a call to icp_native_eoi(), which does not work in real mode. This also adds a function, is_pnv_opal_msi(), which KVM can call to check whether an interrupt is one for which we should be calling pnv_opal_pci_msi_eoi() when we need to do an EOI. [pau...@ozlabs.org - split out the addition of pnv_opal_pci_msi_eoi() from Suresh's patch "KVM: PPC: Book3S HV: Handle passthrough interrupts in guest"; added is_pnv_opal_msi(); wrote description.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/pnv-pci.h| 3 +++ arch/powerpc/platforms/powernv/pci-ioda.c | 24 2 files changed, 23 insertions(+), 4 deletions(-) diff --git a/arch/powerpc/include/asm/pnv-pci.h b/arch/powerpc/include/asm/pnv-pci.h index 0cbd813..1b46b52 100644 --- a/arch/powerpc/include/asm/pnv-pci.h +++ b/arch/powerpc/include/asm/pnv-pci.h @@ -12,6 +12,7 @@ #include #include +#include #include #include @@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num); void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num); int pnv_cxl_get_irq_count(struct pci_dev *dev); struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev); +int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq); +bool is_pnv_opal_msi(struct irq_chip *chip); #ifdef CONFIG_CXL_BASE int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs, diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c index fd9444f..9ce48ae 100644 --- a/arch/powerpc/platforms/powernv/pci-ioda.c +++ b/arch/powerpc/platforms/powernv/pci-ioda.c @@ -2710,15 +2710,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb *phb, } #ifdef CONFIG_PCI_MSI -static void pnv_ioda2_msi_eoi(struct irq_data *d) +int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq) { - unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); - struct irq_chip *chip = irq_data_get_irq_chip(d); struct pnv_phb *phb = container_of(chip, struct pnv_phb, ioda.irq_chip); + + return opal_pci_msi_eoi(phb->opal_id, hw_irq); +} + +static void pnv_ioda2_msi_eoi(struct irq_data *d) +{ int64_t rc; + unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d); + struct irq_chip *chip = irq_data_get_irq_chip(d); - rc = opal_pci_msi_eoi(phb->opal_id, hw_irq); + rc = pnv_opal_pci_msi_eoi(chip, hw_irq); WARN_ON_ONCE(rc); icp_native_eoi(d); @@ -2748,6 +2754,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned int virq) irq_set_chip(virq, &phb->ioda.irq_chip); } +/* + * Returns true iff chip is something that we could call + * pnv_opal_pci_msi_eoi for. + */ +bool is_pnv_opal_msi(struct irq_chip *chip) +{ + return chip->irq_eoi == pnv_ioda2_msi_eoi; +} +EXPORT_SYMBOL_GPL(is_pnv_opal_msi); + static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev, unsigned int hwirq, unsigned int virq, unsigned int is_64, struct msi_msg *msg) -- 2.8.1
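is_pnv_opal_msi() identifies the irq_chip by comparing a callback pointer rather than matching a name string. A minimal standalone illustration of that pattern (all names below are made up):

#include <stdio.h>

struct chip { void (*eoi)(int irq); };

static void fast_eoi(int irq) { (void)irq; }
static void slow_eoi(int irq) { (void)irq; }

/* Identity test on the callback, not a string comparison. */
static int is_fast_chip(const struct chip *c)
{
	return c->eoi == fast_eoi;
}

int main(void)
{
	struct chip a = { fast_eoi }, b = { slow_eoi };

	printf("a: %d, b: %d\n", is_fast_chip(&a), is_fast_chip(&b));
	return 0;
}

The design choice here is that the function pointer uniquely identifies chips whose EOI path is known to be the OPAL-call-plus-XIRR-write sequence that the real-mode code can reproduce.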
[PATCH 07/13] KVM: PPC: Book3S HV: Handle passthrough interrupts in guest
From: Suresh Warrier Currently, KVM switches back to the host to handle any external interrupt (when the interrupt is received while running in the guest). This patch updates real-mode KVM to check if an interrupt is generated by a passthrough adapter that is owned by this guest. If so, the real mode KVM will directly inject the corresponding virtual interrupt to the guest VCPU's ICS and also EOI the interrupt in hardware. In short, the interrupt is handled entirely in real mode in the guest context without switching back to the host. In some rare cases, the interrupt cannot be completely handled in real mode, for instance, a VCPU that is sleeping needs to be woken up. In this case, KVM simply switches back to the host with trap reason set to 0x500. This works, but it is clearly not very efficient. A following patch will distinguish this case and handle it correctly in the host. Note that we can use the existing check_too_hard() routine even though we are not in a hypercall to determine if there is unfinished business that needs to be completed in host virtual mode. The patch assumes that the mapping between hardware interrupt IRQ and virtual IRQ to be injected to the guest already exists for the PCI passthrough interrupts that need to be handled in real mode. If the mapping does not exist, KVM falls back to the default existing behavior. The KVM real mode code reads mappings from the mapped array in the passthrough IRQ map without taking any lock. We carefully order the loads and stores of the fields in the kvmppc_irq_map data structure using memory barriers to avoid an inconsistent mapping being seen by the reader. Thus, although it is possible to miss a map entry, it is not possible to read a stale value. [pau...@ozlabs.org - get irq_chip from irq_map rather than pimap, pulled out powernv eoi change into a separate patch, made kvmppc_read_intr get the vcpu from the paca rather than being passed in, rewrote the logic at the end of kvmppc_read_intr to avoid deep indentation, simplified logic in book3s_hv_rmhandlers.S since we were always restoring SRR0/1 anyway, get rid of the cached array (just use the mapped array), removed the kick_all_cpus_sync() call, clear saved_xirr PACA field when we handle the interrupt in real mode.] 
Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 3 ++ arch/powerpc/kvm/book3s_hv.c| 8 - arch/powerpc/kvm/book3s_hv_builtin.c| 58 - arch/powerpc/kvm/book3s_hv_rm_xics.c| 44 + arch/powerpc/kvm/book3s_hv_rmhandlers.S | 6 5 files changed, 117 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4ca2ba3..4299a1f 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -478,6 +478,9 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xics_ipi_action(void); +extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, +struct kvmppc_irq_map *irq_map, +struct kvmppc_passthru_irqmap *pimap); extern int h_ipi_redirect; #else static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index aa11647..175bdab 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3360,9 +3360,15 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) irq_map = &pimap->mapped[i]; irq_map->v_hwirq = guest_gsi; - irq_map->r_hwirq = desc->irq_data.hwirq; irq_map->desc = desc; + /* +* Order the above two stores before the next to serialize with +* the KVM real mode handler. +*/ + smp_wmb(); + irq_map->r_hwirq = desc->irq_data.hwirq; + if (i == pimap->n_mapped) pimap->n_mapped++; diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c b/arch/powerpc/kvm/book3s_hv_builtin.c index b476a6a..fdb8aef 100644 --- a/arch/powerpc/kvm/book3s_hv_builtin.c +++ b/arch/powerpc/kvm/book3s_hv_builtin.c @@ -288,12 +288,41 @@ void kvmhv_commence_exit(int trap) struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv; EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv); +static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap, +u32 xisr) +{ + int i; + + /* +* We access the mapped array here without a lock. That +* is safe because we never reduce the number of entries +* in the array and we never change the v_hwirq field of +* an entry once it is set. +* +* We have also carefully ordered the stores
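The ordering contract described above - publish the payload fields before r_hwirq, and key the reader off r_hwirq - can be sketched standalone, with C11 release/acquire atomics standing in for smp_wmb() and the reader-side ordering (illustrative types, not the kernel code):

#include <stdatomic.h>
#include <stdio.h>

struct entry {
	unsigned int v_hwirq;             /* payload: guest IRQ */
	void *desc;                       /* payload: irq_desc stand-in */
	_Atomic unsigned int r_hwirq;     /* key; 0 means "not published" */
};

static void publish(struct entry *e, unsigned int r, unsigned int v, void *d)
{
	e->v_hwirq = v;
	e->desc = d;
	/* release: the payload stores above cannot be reordered past this */
	atomic_store_explicit(&e->r_hwirq, r, memory_order_release);
}

static int lookup(struct entry *e, unsigned int r, unsigned int *v)
{
	/* acquire: the payload loads below cannot be reordered before this */
	if (atomic_load_explicit(&e->r_hwirq, memory_order_acquire) != r)
		return 0;
	*v = e->v_hwirq;
	return 1;
}

int main(void)
{
	static struct entry e;
	unsigned int v;

	publish(&e, 0x1432, 25, NULL);
	if (lookup(&e, 0x1432, &v))
		printf("hwirq 0x1432 -> gsi %u\n", v);
	return 0;
}

A racing reader may miss an entry that is mid-publish, but it can never observe r_hwirq set while the payload is stale, which is exactly the property the commit message claims.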
[PATCH 09/13] KVM: PPC: Book3S HV: Dump irqmap in debugfs
From: Suresh Warrier Dump the passthrough irqmap structure associated with a guest as part of /sys/kernel/debug/powerpc/kvm-xics-*. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_xics.c | 17 + 1 file changed, 17 insertions(+) diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c index d528d22..b41f1d3 100644 --- a/arch/powerpc/kvm/book3s_xics.c +++ b/arch/powerpc/kvm/book3s_xics.c @@ -893,6 +893,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall); /* -- Initialisation code etc. -- */ +static void xics_debugfs_irqmap(struct seq_file *m, + struct kvmppc_passthru_irqmap *pimap) +{ + int i; + + if (!pimap) + return; + seq_printf(m, "\nPIRQ mappings: %d maps\n===\n", + pimap->n_mapped); + for (i = 0; i < pimap->n_mapped; i++) { + seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n", + pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq); + } +} + static int xics_debug_show(struct seq_file *m, void *private) { struct kvmppc_xics *xics = m->private; @@ -914,6 +929,8 @@ static int xics_debug_show(struct seq_file *m, void *private) t_check_resend = 0; t_reject = 0; + xics_debugfs_irqmap(m, kvm->arch.pimap); + seq_printf(m, "=\nICP state\n=\n"); kvm_for_each_vcpu(i, vcpu, kvm) { -- 2.8.1
[PATCH 08/13] KVM: PPC: Book3S HV: Complete passthrough interrupt in host
From: Suresh Warrier In existing real mode ICP code, when updating the virtual ICP state, if there is a required action that cannot be completely handled in real mode, as for instance, a VCPU needs to be woken up, flags are set in the ICP to indicate the required action. This is checked when returning from hypercalls to decide whether the call needs to switch back to the host, where the action can be performed in virtual mode. Note that if h_ipi_redirect is enabled, real mode code will first try to message a free host CPU to complete this job instead of returning to the host to do it ourselves. Currently, the real mode PCI passthrough interrupt handling code checks if any of these flags are set and simply returns to the host. This is not good enough as the trap value (0x500) is treated as an external interrupt by the host code. It is only when the trap value is a hypercall that the host code searches for and acts on unfinished work by calling kvmppc_xics_rm_complete. This patch introduces a special trap BOOK3S_INTERRUPT_HV_RM_HARD which is returned by KVM if there is unfinished business to be completed in host virtual mode after handling a PCI passthrough interrupt. The host checks for this special interrupt condition and calls into kvmppc_xics_rm_complete, which is made an exported function for this reason. [pau...@ozlabs.org - moved logic to set r12 to BOOK3S_INTERRUPT_HV_RM_HARD in book3s_hv_rmhandlers.S into the end of kvmppc_check_wake_reason.] Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_asm.h | 10 ++ arch/powerpc/include/asm/kvm_ppc.h | 3 +++ arch/powerpc/kvm/book3s_hv.c| 8 +++- arch/powerpc/kvm/book3s_hv_builtin.c| 1 + arch/powerpc/kvm/book3s_hv_rm_xics.c| 2 +- arch/powerpc/kvm/book3s_hv_rmhandlers.S | 25 + arch/powerpc/kvm/book3s_xics.c | 3 ++- 7 files changed, 49 insertions(+), 3 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_asm.h b/arch/powerpc/include/asm/kvm_asm.h index 5bca220..05cabed 100644 --- a/arch/powerpc/include/asm/kvm_asm.h +++ b/arch/powerpc/include/asm/kvm_asm.h @@ -105,6 +105,15 @@ #define BOOK3S_INTERRUPT_FAC_UNAVAIL 0xf60 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80 +/* book3s_hv */ + +/* + * Special trap used to indicate to host that this is a + * passthrough interrupt that could not be handled + * completely in the guest. + */ +#define BOOK3S_INTERRUPT_HV_RM_HARD0x + #define BOOK3S_IRQPRIO_SYSTEM_RESET0 #define BOOK3S_IRQPRIO_DATA_SEGMENT1 #define BOOK3S_IRQPRIO_INST_SEGMENT2 @@ -136,6 +145,7 @@ #define RESUME_FLAG_NV (1<<0) /* Reload guest nonvolatile state? */ #define RESUME_FLAG_HOST(1<<1) /* Resume host?
*/ #define RESUME_FLAG_ARCH1 (1<<2) +#define RESUME_FLAG_ARCH2 (1<<3) #define RESUME_GUEST0 #define RESUME_GUEST_NV RESUME_FLAG_NV diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4299a1f..e0ada31 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -469,6 +469,7 @@ static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( extern void kvmppc_alloc_host_rm_ops(void); extern void kvmppc_free_host_rm_ops(void); extern void kvmppc_free_pimap(struct kvm *kvm); +extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall); extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu); extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server); extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args); @@ -489,6 +490,8 @@ static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( static inline void kvmppc_alloc_host_rm_ops(void) {}; static inline void kvmppc_free_host_rm_ops(void) {}; static inline void kvmppc_free_pimap(struct kvm *kvm) {}; +static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall) + { return 0; } static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) { return 0; } static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 175bdab..cfddafa 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -73,6 +73,8 @@ /* Used to indicate that a guest page fault needs to be handled */ #define RESUME_PAGE_FAULT (RESUME_GUEST | RESUME_FLAG_ARCH1) +/* Used to indicate that a guest passthrough interrupt needs to be handled */ +#define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2) /* Used as a "null" value for timebase values */ #define TB_NIL (~(u64)0) @@ -994,6 +996,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, struct kvm_vcpu *vcpu, kvmppc_core_queue_program(vcpu, SRR1_PROGILL); r = RESUME_GUEST; break; + case BOOK3S
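How the resume codes above compose can be seen in a small standalone program; the flag values are copied from the patch, while the checks in main() are illustrative:

#include <stdio.h>

#define RESUME_FLAG_NV    (1 << 0)  /* reload guest nonvolatile state */
#define RESUME_FLAG_HOST  (1 << 1)  /* resume host */
#define RESUME_FLAG_ARCH1 (1 << 2)
#define RESUME_FLAG_ARCH2 (1 << 3)

#define RESUME_GUEST        0
#define RESUME_PAGE_FAULT   (RESUME_GUEST | RESUME_FLAG_ARCH1)
#define RESUME_PASSTHROUGH  (RESUME_GUEST | RESUME_FLAG_ARCH2)

int main(void)
{
	int r = RESUME_PASSTHROUGH;

	/* anything without RESUME_FLAG_HOST re-enters the guest */
	printf("resume host? %s\n", (r & RESUME_FLAG_HOST) ? "yes" : "no");
	printf("passthrough completion needed? %s\n",
	       (r & RESUME_FLAG_ARCH2) ? "yes" : "no");
	return 0;
}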
[PATCH 10/13] KVM: PPC: Book3S HV: Tunable to disable KVM IRQ bypass
From: Suresh Warrier Add a module parameter kvm_irq_bypass for kvm_hv.ko to disable IRQ bypass for passthrough interrupts. The default value of this tunable is 1, i.e. the feature is enabled by default. Since the tunable is used by built-in kernel code, we use the module_param_cb macro to achieve this. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_book3s.h | 1 + arch/powerpc/include/asm/kvm_ppc.h| 2 +- arch/powerpc/kvm/book3s_hv.c | 10 ++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 2 ++ 4 files changed, 14 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/kvm_book3s.h b/arch/powerpc/include/asm/kvm_book3s.h index 8f39796..8e5fac6 100644 --- a/arch/powerpc/include/asm/kvm_book3s.h +++ b/arch/powerpc/include/asm/kvm_book3s.h @@ -191,6 +191,7 @@ extern void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, struct kvm_vcpu *vcpu); extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, struct kvmppc_book3s_shadow_vcpu *svcpu); +extern int kvm_irq_bypass; static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu) { diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index e0ada31..97b9bad 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -461,7 +461,7 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu) static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap( struct kvm *kvm) { - if (kvm) + if (kvm && kvm_irq_bypass) return kvm->arch.pimap; return NULL; } diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index cfddafa..2e71518 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -94,6 +94,10 @@ static struct kernel_param_ops module_param_ops = { .get = param_get_int, }; +module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass, + S_IRUGO | S_IWUSR); +MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization"); + module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect, S_IRUGO | S_IWUSR); MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core"); @@ -3313,6 +3317,9 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct irq_chip *chip; int i; + if (!kvm_irq_bypass) + return 1; + desc = irq_to_desc(host_irq); if (!desc) return -EIO; @@ -3389,6 +3396,9 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) struct irq_desc *desc; struct kvmppc_passthru_irqmap *pimap; int i; + if (!kvm_irq_bypass) + return 0; + desc = irq_to_desc(host_irq); if (!desc) return -EIO; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 3b8d7ac..00b9dfde 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -27,6 +27,8 @@ int h_ipi_redirect = 1; EXPORT_SYMBOL(h_ipi_redirect); +int kvm_irq_bypass = 1; +EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp *icp, u32 new_irq); -- 2.8.1
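A minimal sketch of the module_param_cb() wiring used here, for a hypothetical integer tunable my_tunable (kernel code, not a standalone program; param_set_int()/param_get_int() are the stock helpers used above as well):

#include <linux/moduleparam.h>
#include <linux/stat.h>

static int my_tunable = 1;	/* default: enabled */

static struct kernel_param_ops my_tunable_ops = {
	.set = param_set_int,	/* parse and store a written value */
	.get = param_get_int,	/* format the current value when read */
};

module_param_cb(my_tunable, &my_tunable_ops, &my_tunable, S_IRUGO | S_IWUSR);
MODULE_PARM_DESC(my_tunable, "Hypothetical example tunable (1 = enabled)");

The callback form is needed because the variable itself lives in built-in code (book3s_hv_rm_xics.c) while the parameter is registered by kvm_hv.ko.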
[PATCH 11/13] KVM: PPC: Book3S HV: Update irq stats for IRQs handled in real mode
From: Suresh Warrier When a passthrough IRQ is handled completely within KVM real mode code, it has to also update the IRQ stats since this does not go through the generic IRQ handling code. However, the per CPU kstat_irqs field is an allocated (not static) field and so cannot be directly accessed in real mode safely. The function this_cpu_inc_rm() is introduced to safely increment per CPU fields (currently coded for unsigned integers only) that are allocated and could thus be vmalloced also. Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/kvm/book3s_hv_rm_xics.c | 50 1 file changed, 50 insertions(+) diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 00b9dfde..554cdfa 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -10,6 +10,7 @@ #include #include #include +#include #include #include @@ -18,6 +19,7 @@ #include #include #include +#include #include #include @@ -734,6 +736,53 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr) _stwcix(xics_phys + XICS_XIRR, xirr); } +/* + * Increment a per-CPU 32-bit unsigned integer variable. + * Safe to call in real-mode. Handles vmalloc'ed addresses + * + * ToDo: Make this work for any integral type + */ + +static inline void this_cpu_inc_rm(unsigned int __percpu *addr) +{ + unsigned long l; + unsigned int *raddr; + int cpu = smp_processor_id(); + + raddr = per_cpu_ptr(addr, cpu); + l = (unsigned long)raddr; + + if (REGION_ID(l) == VMALLOC_REGION_ID) { + l = vmalloc_to_phys(raddr); + raddr = (unsigned int *)l; + } + ++*raddr; +} + +/* + * We don't try to update the flags in the irq_desc 'istate' field in + * here as would happen in the normal IRQ handling path for several reasons: + * - state flags represent internal IRQ state and are not expected to be + *updated outside the IRQ subsystem + * - more importantly, these are useful for edge triggered interrupts, + *IRQ probing, etc., but we are only handling MSI/MSIx interrupts here + *and these states shouldn't apply to us. + * + * However, we do update irq_stats - we somewhat duplicate the code in + * kstat_incr_irqs_this_cpu() for this since this function is defined + * in irq/internal.h which we don't want to include here. + * The only difference is that desc->kstat_irqs is an allocated per CPU + * variable and could have been vmalloc'ed, so we can't directly + * call __this_cpu_inc() on it. The kstat structure is a static + * per CPU variable and it should be accessible by real-mode KVM. + * + */ +static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc) +{ + this_cpu_inc_rm(desc->kstat_irqs); + __this_cpu_inc(kstat.irqs_sum); +} + long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, struct kvmppc_irq_map *irq_map, @@ -747,6 +796,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, xics = vcpu->kvm->arch.xics; icp = vcpu->arch.icp; + kvmppc_rm_handle_irq_desc(irq_map->desc); icp_rm_deliver_irq(xics, icp, irq); /* EOI the interrupt */ -- 2.8.1
[PATCH 12/13] KVM: PPC: Book3S HV: Set server for passed-through interrupts
When a guest has a PCI pass-through device with an interrupt, it will direct the interrupt to a particular guest VCPU. In fact the physical interrupt might arrive on any CPU, where it gets delivered to the emulated XICS (guest interrupt controller) and eventually to the target VCPU. Now that we have code to handle device interrupts in real mode without exiting to the host kernel, there is an advantage to having the device interrupt arrive on the same sub(core) as the target VCPU is running on. In this situation, the interrupt can be delivered to the target VCPU without any exit to the host kernel (using a hypervisor doorbell interrupt between threads if necessary). This patch aims to get passed-through device interrupts arriving on the correct core by setting the interrupt server in the real hardware XICS for the interrupt to the first thread in the (sub)core where its target VCPU is running. We do this in the real-mode H_EOI code because the H_EOI handler already needs to look at the emulated ICS state for the interrupt (whereas the H_XIRR handler doesn't), and we know we are running in the target VCPU context at that point. We set the server CPU in hardware using an OPAL call, regardless of what the IRQ affinity mask for the interrupt says, and without updating the affinity mask. This amounts to saying that when an interrupt is passed through to a guest, as a matter of policy we allow the guest's affinity for the interrupt to override the host's. This is inspired by an earlier patch from Suresh Warrier, although none of this code came from that earlier patch. Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_ppc.h | 4 +++ arch/powerpc/include/asm/opal.h| 1 + arch/powerpc/kvm/book3s_hv.c | 4 +++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 16 arch/powerpc/kvm/book3s_xics.c | 35 ++ arch/powerpc/kvm/book3s_xics.h | 2 ++ arch/powerpc/platforms/powernv/opal-wrappers.S | 1 + 7 files changed, 63 insertions(+) diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 97b9bad..f6e4964 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -479,6 +479,10 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 icpval); extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev, struct kvm_vcpu *vcpu, u32 cpu); extern void kvmppc_xics_ipi_action(void); +extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq, + unsigned long host_irq); +extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq, + unsigned long host_irq); extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr, struct kvmppc_irq_map *irq_map, struct kvmppc_passthru_irqmap *pimap); diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h index ee05bd2..e958b70 100644 --- a/arch/powerpc/include/asm/opal.h +++ b/arch/powerpc/include/asm/opal.h @@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, uint64_t bus_dev_func, int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func, uint64_t offset, uint32_t data); int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority); +int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority); int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority); int64_t opal_register_exception_handler(uint64_t opal_exception, uint64_t handler_address, diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index 2e71518..b969abc 100644
--- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -3385,6 +3385,8 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) if (i == pimap->n_mapped) pimap->n_mapped++; + kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq); + mutex_unlock(&kvm->lock); return 0; @@ -3421,6 +3423,8 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int guest_gsi) return -ENODEV; } + kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq); + /* invalidate the entry */ pimap->mapped[i].r_hwirq = 0; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 554cdfa..5f7527e 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -22,6 +22,7 @@ #include #include #include +#include #include "book3s_xics.h" @@ -34,6 +35,7 @@ EXPORT_SYMBOL(kvm_irq_bypass); static void icp_rm_deliver_irq(struc
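The "first thread of the core" computation that the real-mode code relies on can be illustrated standalone: with a power-of-two threads_per_core, cpu_first_thread_sibling() amounts to masking off the low bits of the CPU id (the function below is a userspace stand-in; the SMT width is made up):

#include <stdio.h>

static int threads_per_core = 8;	/* e.g. POWER8 SMT8 */

/* stand-in for the kernel's cpu_first_thread_sibling() */
static int first_thread_sibling(int cpu)
{
	return cpu & ~(threads_per_core - 1);
}

int main(void)
{
	int pcpu;

	for (pcpu = 8; pcpu < 16; pcpu++)
		printf("cpu %2d -> thread 0 of its core: cpu %d\n",
		       pcpu, first_thread_sibling(pcpu));
	return 0;
}

Thread 0 is used as the server because, as the cover letter explains, the other threads of the core are offline from the host's point of view and their offline loop does not handle device interrupts.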
[PATCH 13/13] KVM: PPC: Book3S HV: Counters for passthrough IRQ stats
From: Suresh Warrier Add VCPU stat counters to track affinity for passthrough interrupts. pthru_all: Counts all passthrough interrupts whose IRQ mappings are in the kvmppc_passthru_irq_map structure. pthru_host: Counts all cached passthrough interrupts that were injected from the host through kvm_set_irq (i.e. not handled in real mode). pthru_bad_aff: Counts how many cached passthrough interrupts have bad affinity (receiving CPU is not running VCPU that is the target of the virtual interrupt in the guest). Signed-off-by: Suresh Warrier Signed-off-by: Paul Mackerras --- arch/powerpc/include/asm/kvm_host.h | 3 +++ arch/powerpc/kvm/book3s.c| 3 +++ arch/powerpc/kvm/book3s_hv_rm_xics.c | 18 +- 3 files changed, 19 insertions(+), 5 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 3eb5092..f371a23 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -131,6 +131,9 @@ struct kvm_vcpu_stat { u32 ld_slow; u32 st_slow; #endif + u32 pthru_all; + u32 pthru_host; + u32 pthru_bad_aff; }; enum kvm_exit_types { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 47018fc..6d0c45b 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -64,6 +64,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = { { "ld_slow", VCPU_STAT(ld_slow) }, { "st", VCPU_STAT(st) }, { "st_slow", VCPU_STAT(st_slow) }, + { "pthru_all", VCPU_STAT(pthru_all) }, + { "pthru_host", VCPU_STAT(pthru_host) }, + { "pthru_bad_aff", VCPU_STAT(pthru_bad_aff) }, { NULL } }; diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c b/arch/powerpc/kvm/book3s_hv_rm_xics.c index 5f7527e..82ff5de 100644 --- a/arch/powerpc/kvm/book3s_hv_rm_xics.c +++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c @@ -716,11 +716,19 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long xirr) icp->rm_eoied_irq = irq; } - if (state->host_irq && state->intr_cpu != -1) { - int pcpu = cpu_first_thread_sibling(raw_smp_processor_id()); - if (state->intr_cpu != pcpu) - xics_opal_rm_set_server(state->host_irq, pcpu); - state->intr_cpu = -1; + if (state->host_irq) { + ++vcpu->stat.pthru_all; + if (state->intr_cpu != -1) { + int pcpu = raw_smp_processor_id(); + + pcpu = cpu_first_thread_sibling(pcpu); + ++vcpu->stat.pthru_host; + if (state->intr_cpu != pcpu) { + ++vcpu->stat.pthru_bad_aff; + xics_opal_rm_set_server(state->host_irq, pcpu); + } + state->intr_cpu = -1; + } } bail: return check_too_hard(xics, icp); -- 2.8.1
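One way to read the three counters together is as a rough affinity metric; the sketch below is standalone, and both the sample values and the interpretation are illustrative only:

#include <stdio.h>

int main(void)
{
	/* made-up sample values for the three counters */
	double pthru_all = 100000;	/* mapped passthrough interrupts seen */
	double pthru_host = 2500;	/* of those, injected via the host */
	double pthru_bad_aff = 400;	/* of those, arriving on the wrong core */

	printf("handled without host injection: %.1f%%\n",
	       100.0 * (pthru_all - pthru_host) / pthru_all);
	printf("bad affinity rate: %.1f%%\n",
	       100.0 * pthru_bad_aff / pthru_all);
	return 0;
}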
Re: [PATCH v4 0/3] perf annotate: Enable cross arch annotate
I've sent the v5 series for this. Please review it. Thanks, Ravi On Wednesday 13 July 2016 03:15 PM, Ravi Bangoria wrote: > Arnaldo, Michael, > > I've tested this patchset on ppc64 BE and LE both. Please review this. > > -Ravi > > On Friday 08 July 2016 10:10 AM, Ravi Bangoria wrote: >> Perf can currently only support code navigation (branches and calls) in >> annotate when run on the same architecture where perf.data was recorded. >> But cross arch annotate is not supported. >> >> This patchset enables cross arch annotate. Currently I've used x86 >> and arm instructions which are already available and adding support >> for powerpc as well. Adding support for other arch will be easy. >> >> I've created this patch on top of acme/perf/core. And tested it with >> x86 and powerpc only. >> >> Note for arm: >> Few instructions were defined under #if __arm__ which I've used as a >> table for arm. But I'm not sure whether instructions defined outside of >> that also contain arm instructions. Apart from that, 'call__parse()' >> and 'move__parse()' contains #ifdef __arm__ directive. I've changed it >> to if (!strcmp(norm_arch, arm)). I don't have an arm machine to test >> these changes. >> >> Example: >> >>Record on powerpc: >>$ ./perf record -a >> >>Report -> Annotate on x86: >>$ ./perf report -i perf.data.powerpc --vmlinux vmlinux.powerpc >> >> Changes in v4: >>- powerpc: Added support for branch instructions that includes 'ctr' >>- __maybe_unused was misplaced at few location. Corrected it. >>- Moved position of v3 last patch that define macro for each arch name >> >> v3 link: https://lkml.org/lkml/2016/6/30/99 >> >> Naveen N. Rao (1): >>perf annotate: add powerpc support >> >> Ravi Bangoria (2): >>perf: Define macro for normalized arch names >>perf annotate: Enable cross arch annotate >> >> tools/perf/arch/common.c | 36 ++--- >> tools/perf/arch/common.h | 11 ++ >> tools/perf/builtin-top.c | 2 +- >> tools/perf/ui/browsers/annotate.c | 3 +- >> tools/perf/ui/gtk/annotate.c | 2 +- >> tools/perf/util/annotate.c | 273 >> ++--- >> tools/perf/util/annotate.h | 6 +- >> tools/perf/util/unwind-libunwind.c | 4 +- >> 8 files changed, 265 insertions(+), 72 deletions(-) >> >> -- >> 2.5.5 >> >
Re: [PATCH v3 02/21] powerpc: Always restore FPU/VEC/VSX if hardware transactional memory in use
On Wed, 2016-08-17 at 13:43 +1000, Cyril Bur wrote: > Comment from arch/powerpc/kernel/process.c:967: > If userspace is inside a transaction (whether active or > suspended) and FP/VMX/VSX instructions have ever been enabled > inside that transaction, then we have to keep them enabled > and keep the FP/VMX/VSX state loaded while ever the transaction > continues. The reason is that if we didn't, and subsequently > got a FP/VMX/VSX unavailable interrupt inside a transaction, > we don't know whether it's the same transaction, and thus we > don't know which of the checkpointed state and the transactional > state to use. > > restore_math(), restore_fp() and restore_altivec() currently may not > restore the registers. It doesn't appear that this is more serious > than a performance penalty. If the math registers aren't restored the > userspace thread will still be run with the facility disabled. > Userspace will not be able to read invalid values. On the first access > it will take a facility unavailable exception and the kernel will > detect an active transaction, at which point it will abort the > transaction. There is the possibility for a pathological case > preventing any progress by transactions, however, transactions > are never guaranteed to make progress. > > Fixes: 70fe3d9 ("powerpc: Restore FPU/VEC/VSX if previously used") > Signed-off-by: Cyril Bur > --- > arch/powerpc/kernel/process.c | 21 ++--- > 1 file changed, 18 insertions(+), 3 deletions(-) > > diff --git a/arch/powerpc/kernel/process.c > b/arch/powerpc/kernel/process.c > index 58ccf86..cdf2d20 100644 > --- a/arch/powerpc/kernel/process.c > +++ b/arch/powerpc/kernel/process.c > @@ -88,7 +88,13 @@ static void check_if_tm_restore_required(struct > task_struct *tsk) > set_thread_flag(TIF_RESTORE_TM); > } > } > + > +static inline bool msr_tm_active(unsigned long msr) > +{ > + return MSR_TM_ACTIVE(msr); > +} I'm not sure what value this function is adding. MSR_TM_ACTIVE() is used in a lot of other places and is well known so I'd prefer to just keep using it, rather than adding some other abstraction that others have to learn. Other than that, the patch seems good.
Mikey > #else > +static inline bool msr_tm_active(unsigned long msr) { return false; } > static inline void check_if_tm_restore_required(struct task_struct *tsk) > { } > #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */ > > @@ -208,7 +214,7 @@ void enable_kernel_fp(void) > EXPORT_SYMBOL(enable_kernel_fp); > > static int restore_fp(struct task_struct *tsk) { > - if (tsk->thread.load_fp) { > + if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) > { > load_fp_state(¤t->thread.fp_state); > current->thread.load_fp++; > return 1; > @@ -278,7 +284,8 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread); > > static int restore_altivec(struct task_struct *tsk) > { > - if (cpu_has_feature(CPU_FTR_ALTIVEC) && tsk->thread.load_vec) { > + if (cpu_has_feature(CPU_FTR_ALTIVEC) && > + (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs- > >msr))) { > load_vr_state(&tsk->thread.vr_state); > tsk->thread.used_vr = 1; > tsk->thread.load_vec++; > @@ -464,7 +471,8 @@ void restore_math(struct pt_regs *regs) > { > unsigned long msr; > > - if (!current->thread.load_fp && !loadvec(current->thread)) > + if (!msr_tm_active(regs->msr) && > + !current->thread.load_fp && !loadvec(current->thread)) > return; > > msr = regs->msr; > @@ -983,6 +991,13 @@ void restore_tm_state(struct pt_regs *regs) > msr_diff = current->thread.ckpt_regs.msr & ~regs->msr; > msr_diff &= MSR_FP | MSR_VEC | MSR_VSX; > > + /* Ensure that restore_math() will restore */ > + if (msr_diff & MSR_FP) > + current->thread.load_fp = 1; > +#ifdef CONFIG_ALIVEC > + if (cpu_has_feature(CPU_FTR_ALTIVEC) && msr_diff & MSR_VEC) > + current->thread.load_vec = 1; > +#endif > restore_math(regs); > > regs->msr |= msr_diff;
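For reference, a standalone sketch of what MSR_TM_ACTIVE() tests, namely the MSR transaction-state (TS) bits; the bit positions follow the usual reg.h definitions and should be treated as illustrative:

#include <stdio.h>

#define MSR_TS_S	(1UL << 33)	/* transaction state: suspended */
#define MSR_TS_T	(1UL << 34)	/* transaction state: transactional */
#define MSR_TS_MASK	(MSR_TS_T | MSR_TS_S)
#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0)

int main(void)
{
	unsigned long msr = MSR_TS_S;	/* e.g. a suspended transaction */

	/* active means transactional OR suspended, exactly as the
	 * process.c comment quoted above requires */
	printf("TM active? %s\n", MSR_TM_ACTIVE(msr) ? "yes" : "no");
	return 0;
}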
Re: [PATCH v3 03/21] powerpc: Add check_if_tm_restore_required() to giveup_all()
On Wed, 2016-08-17 at 13:43 +1000, Cyril Bur wrote: > giveup_all() causes FPU/VMX/VSX facilities to be disabled in a > thread's MSR. If this thread was transactional this should be recorded > as the reclaiming/recheckpointing code will need to know. Can you expand on this? It's not clear to me how this relates to the code. Mikey > Fixes: c208505 ("powerpc: create giveup_all()") > Signed-off-by: Cyril Bur > --- > arch/powerpc/kernel/process.c | 1 + > 1 file changed, 1 insertion(+) > > diff --git a/arch/powerpc/kernel/process.c > b/arch/powerpc/kernel/process.c > index cdf2d20..82308fd 100644 > --- a/arch/powerpc/kernel/process.c > +++ b/arch/powerpc/kernel/process.c > @@ -445,6 +445,7 @@ void giveup_all(struct task_struct *tsk) > return; > > msr_check_and_set(msr_all_available); > + check_if_tm_restore_required(tsk); > > #ifdef CONFIG_PPC_FPU > if (usermsr & MSR_FP)
Re: [PATCH v3 04/21] powerpc: Return the new MSR from msr_check_and_set()
On Wed, 2016-08-17 at 13:43 +1000, Cyril Bur wrote: > mfmsr() is a fairly expensive call and callers of msr_check_and_set() > may want to make decisions bits in the MSR that it did not change but > may not know the value of. I can't grok this. Please reword. Mikey > This patch would avoid a two calls to mfmsr(). > > Signed-off-by: Cyril Bur > --- > arch/powerpc/include/asm/reg.h | 2 +- > arch/powerpc/kernel/process.c | 4 +++- > 2 files changed, 4 insertions(+), 2 deletions(-) > > diff --git a/arch/powerpc/include/asm/reg.h > b/arch/powerpc/include/asm/reg.h > index f69f40f..0a3dde9 100644 > --- a/arch/powerpc/include/asm/reg.h > +++ b/arch/powerpc/include/asm/reg.h > @@ -1247,7 +1247,7 @@ static inline void mtmsr_isync(unsigned long val) > : "memory") > #endif > > -extern void msr_check_and_set(unsigned long bits); > +extern unsigned long msr_check_and_set(unsigned long bits); > extern bool strict_msr_control; > extern void __msr_check_and_clear(unsigned long bits); > static inline void msr_check_and_clear(unsigned long bits) > diff --git a/arch/powerpc/kernel/process.c > b/arch/powerpc/kernel/process.c > index 82308fd..c42581b 100644 > --- a/arch/powerpc/kernel/process.c > +++ b/arch/powerpc/kernel/process.c > @@ -110,7 +110,7 @@ static int __init enable_strict_msr_control(char > *str) > } > early_param("ppc_strict_facility_enable", enable_strict_msr_control); > > -void msr_check_and_set(unsigned long bits) > +unsigned long msr_check_and_set(unsigned long bits) > { > unsigned long oldmsr = mfmsr(); > unsigned long newmsr; > @@ -124,6 +124,8 @@ void msr_check_and_set(unsigned long bits) > > if (oldmsr != newmsr) > mtmsr_isync(newmsr); > + > + return newmsr; > } > > void __msr_check_and_clear(unsigned long bits)
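Presumably the intent is the following calling pattern: the caller gets the post-update MSR back and can test bits it did not change without issuing a second mfmsr(). A sketch, in kernel context; the caller and the pr_debug() message are illustrative:

static void example_enable_fp(void)
{
	unsigned long msr = msr_check_and_set(MSR_FP);

	/*
	 * msr is the MSR value now in force, so a bit we did not
	 * change (e.g. MSR_VEC) can be tested without re-reading
	 * the register.
	 */
	if (msr & MSR_VEC)
		pr_debug("VMX was already enabled\n");
}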