Re: [PowerPC] today's main line failed to build on PowerPC

2016-08-18 Thread Abdul Haleem

On Thursday 18 August 2016 11:50 AM, Abdul Haleem wrote:

Hi,

The mainline 4.8.0-rc2 kernel failed to build on PowerPC with the 
following build errors. Config: pseries_le_defconfig. Machine type: 
PowerPC bare metal.
My mistake: the build is failing with the attached config (below), not with 
'pseries_le_defconfig'.


09:34:22 00:04:59 INFO | make -j 160  vmlinux
09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S: Assembler messages:
09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:353: Error: missing operand
09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:612: Error: missing operand
09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:670: Error: missing operand
09:34:24 00:05:01 ERROR| [stderr] make[1]: *** [arch/powerpc/mm/hash_low_32.o] Error 1
09:34:24 00:05:01 ERROR| [stderr] make[1]: *** Waiting for unfinished jobs
09:34:25 00:05:02 ERROR| [stderr] arch/powerpc/kernel/head_32.S: Assembler messages:
09:34:25 00:05:02 ERROR| [stderr] arch/powerpc/kernel/head_32.S:1113: Error: missing operand
09:34:25 00:05:02 ERROR| [stderr] make[1]: *** [arch/powerpc/kernel/head_32.o] Error 1
09:34:25 00:05:02 ERROR| [stderr] make[1]: *** Waiting for unfinished jobs

09:34:27 00:05:04 ERROR| [stderr] make: *** [arch/powerpc/mm] Error 2
09:34:27 00:05:04 ERROR| [stderr] make: *** Waiting for unfinished jobs

09:34:42 00:05:19 ERROR| [stderr] make: *** [arch/powerpc/kernel] Error 2

Regards,
Abdul



#
# Automatically generated file; DO NOT EDIT.
# Linux/powerpc 4.7.0-rc7 Kernel Configuration
#
CONFIG_PPC64=y

#
# Processor support
#
CONFIG_PPC_BOOK3S_64=y
# CONFIG_PPC_BOOK3E_64 is not set
CONFIG_POWER7_CPU=y
# CONFIG_POWER8_CPU is not set
CONFIG_PPC_BOOK3S=y
CONFIG_PPC_FPU=y
CONFIG_ALTIVEC=y
CONFIG_VSX=y
# CONFIG_PPC_ICSWX is not set
CONFIG_PPC_STD_MMU=y
CONFIG_PPC_STD_MMU_64=y
CONFIG_PPC_RADIX_MMU=y
CONFIG_PPC_MM_SLICES=y
CONFIG_PPC_HAVE_PMU_SUPPORT=y
CONFIG_PPC_PERF_CTRS=y
CONFIG_SMP=y
CONFIG_NR_CPUS=32
CONFIG_PPC_DOORBELL=y
# CONFIG_CPU_BIG_ENDIAN is not set
CONFIG_CPU_LITTLE_ENDIAN=y
CONFIG_PPC64_BOOT_WRAPPER=y
CONFIG_64BIT=y
CONFIG_WORD_SIZE=64
CONFIG_ARCH_PHYS_ADDR_T_64BIT=y
CONFIG_ARCH_DMA_ADDR_T_64BIT=y
CONFIG_MMU=y
CONFIG_HAVE_SETUP_PER_CPU_AREA=y
CONFIG_NEED_PER_CPU_EMBED_FIRST_CHUNK=y
CONFIG_NR_IRQS=512
CONFIG_STACKTRACE_SUPPORT=y
CONFIG_TRACE_IRQFLAGS_SUPPORT=y
CONFIG_LOCKDEP_SUPPORT=y
CONFIG_RWSEM_XCHGADD_ALGORITHM=y
CONFIG_ARCH_HAS_ILOG2_U32=y
CONFIG_ARCH_HAS_ILOG2_U64=y
CONFIG_GENERIC_HWEIGHT=y
CONFIG_ARCH_HAS_DMA_SET_COHERENT_MASK=y
CONFIG_PPC=y
CONFIG_GENERIC_CSUM=y
CONFIG_EARLY_PRINTK=y
CONFIG_PANIC_TIMEOUT=180
CONFIG_COMPAT=y
CONFIG_SYSVIPC_COMPAT=y
CONFIG_SCHED_OMIT_FRAME_POINTER=y
CONFIG_ARCH_MAY_HAVE_PC_FDC=y
CONFIG_PPC_UDBG_16550=y
# CONFIG_GENERIC_TBSYNC is not set
CONFIG_AUDIT_ARCH=y
CONFIG_GENERIC_BUG=y
CONFIG_EPAPR_BOOT=y
# CONFIG_DEFAULT_UIMAGE is not set
CONFIG_ARCH_HIBERNATION_POSSIBLE=y
CONFIG_ARCH_SUSPEND_POSSIBLE=y
# CONFIG_PPC_DCR_NATIVE is not set
# CONFIG_PPC_DCR_MMIO is not set
# CONFIG_PPC_OF_PLATFORM_PCI is not set
CONFIG_ARCH_SUPPORTS_DEBUG_PAGEALLOC=y
CONFIG_ARCH_SUPPORTS_UPROBES=y
CONFIG_PPC_EMULATE_SSTEP=y
CONFIG_ZONE_DMA32=y
CONFIG_PGTABLE_LEVELS=4
CONFIG_DEFCONFIG_LIST="/lib/modules/$UNAME_RELEASE/.config"
CONFIG_IRQ_WORK=y

#
# General setup
#
CONFIG_INIT_ENV_ARG_LIMIT=32
CONFIG_CROSS_COMPILE=""
# CONFIG_COMPILE_TEST is not set
CONFIG_LOCALVERSION=""
CONFIG_LOCALVERSION_AUTO=y
CONFIG_DEFAULT_HOSTNAME="(none)"
CONFIG_SWAP=y
CONFIG_SYSVIPC=y
CONFIG_SYSVIPC_SYSCTL=y
CONFIG_POSIX_MQUEUE=y
CONFIG_POSIX_MQUEUE_SYSCTL=y
CONFIG_CROSS_MEMORY_ATTACH=y
CONFIG_FHANDLE=y
# CONFIG_USELIB is not set
# CONFIG_AUDIT is not set
CONFIG_HAVE_ARCH_AUDITSYSCALL=y

#
# IRQ subsystem
#
CONFIG_GENERIC_IRQ_SHOW=y
CONFIG_GENERIC_IRQ_SHOW_LEVEL=y
CONFIG_IRQ_DOMAIN=y
CONFIG_GENERIC_MSI_IRQ=y
CONFIG_IRQ_DOMAIN_DEBUG=y
CONFIG_IRQ_FORCED_THREADING=y
CONFIG_SPARSE_IRQ=y
CONFIG_GENERIC_TIME_VSYSCALL_OLD=y
CONFIG_GENERIC_CLOCKEVENTS=y
CONFIG_ARCH_HAS_TICK_BROADCAST=y
CONFIG_GENERIC_CLOCKEVENTS_BROADCAST=y
CONFIG_GENERIC_CMOS_UPDATE=y

#
# Timers subsystem
#
CONFIG_TICK_ONESHOT=y
CONFIG_NO_HZ_COMMON=y
# CONFIG_HZ_PERIODIC is not set
CONFIG_NO_HZ_IDLE=y
# CONFIG_NO_HZ_FULL is not set
CONFIG_NO_HZ=y
CONFIG_HIGH_RES_TIMERS=y

#
# CPU/Task time and stats accounting
#
CONFIG_VIRT_CPU_ACCOUNTING=y
# CONFIG_TICK_CPU_ACCOUNTING is not set
CONFIG_VIRT_CPU_ACCOUNTING_NATIVE=y
# CONFIG_VIRT_CPU_ACCOUNTING_GEN is not set
# CONFIG_BSD_PROCESS_ACCT is not set
CONFIG_TASKSTATS=y
CONFIG_TASK_DELAY_ACCT=y
# CONFIG_TASK_XACCT is not set

#
# RCU Subsystem
#
CONFIG_TREE_RCU=y
# CONFIG_RCU_EXPERT is not set
CONFIG_SRCU=y
CONFIG_TASKS_RCU=y
CONFIG_RCU_STALL_COMMON=y
CONFIG_TREE_RCU_TRACE=y
# CONFIG_RCU_EXPEDITE_BOOT is not set
CONFIG_BUILD_BIN2C=y
CONFIG_IKCONFIG=y
CONFIG_IKCONFIG_PROC=y
CONFIG_LOG_BUF_SHIFT=17
CONFIG_LOG_CPU_MAX_BUF_SHIFT=12
CONFIG_NMI_LOG_BUF_SHIFT=13
CONFIG_ARCH_SUPPORTS_NUMA_BALANCING=y
# CONFIG_NUMA_BALANCING is not set
CONFIG_CGROUPS=y
# C

[PATCH] cxl: use pcibios_free_controller_deferred() when removing vPHBs

2016-08-18 Thread Andrew Donnellan
When cxl removes a vPHB, it's possible that the pci_controller may be freed
before all references to the devices on the vPHB have been released. This
in turn causes an invalid memory access when the devices are eventually
released, as pcibios_release_device() attempts to call the phb's
release_device hook.

In cxl_pci_vphb_remove(), remove the existing call to
pcibios_free_controller(). Instead, use
pcibios_free_controller_deferred() to free the pci_controller after all
devices have been released. Export pci_set_host_bridge_release() so we can
do this.
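
For context, the PCI core only calls the host bridge's release hook once
the last reference to the bridge device is dropped; an abridged sketch of
that path in drivers/pci/host-bridge.c (simplified here, not the full
function):

	static void pci_release_host_bridge_dev(struct device *dev)
	{
		struct pci_host_bridge *bridge = to_pci_host_bridge(dev);

		/* Runs only after every child device has been released,
		 * so a release_fn installed via
		 * pci_set_host_bridge_release() is a safe place to free
		 * the pci_controller.
		 */
		if (bridge->release_fn)
			bridge->release_fn(bridge, bridge->release_data);

		kfree(bridge);
	}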

Cc: sta...@vger.kernel.org
Signed-off-by: Andrew Donnellan 

---

This patch requires http://patchwork.ozlabs.org/patch/658324/. It should go
through the powerpc tree.
---
 drivers/misc/cxl/vphb.c   | 10 +-
 drivers/pci/host-bridge.c |  1 +
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/misc/cxl/vphb.c b/drivers/misc/cxl/vphb.c
index 7ada5f1..3519ace 100644
--- a/drivers/misc/cxl/vphb.c
+++ b/drivers/misc/cxl/vphb.c
@@ -230,6 +230,11 @@ int cxl_pci_vphb_add(struct cxl_afu *afu)
if (phb->bus == NULL)
return -ENXIO;
 
+   /* Set release hook on root bus */
+   pci_set_host_bridge_release(to_pci_host_bridge(phb->bus->bridge),
+   pcibios_free_controller_deferred,
+   (void *) phb);
+
/* Claim resources. This might need some rework as well depending
 * whether we are doing probe-only or not, like assigning unassigned
 * resources etc...
@@ -256,7 +261,10 @@ void cxl_pci_vphb_remove(struct cxl_afu *afu)
afu->phb = NULL;
 
pci_remove_root_bus(phb->bus);
-   pcibios_free_controller(phb);
+   /*
+* We don't free phb here - that's handled by
+* pcibios_free_controller_deferred()
+*/
 }
 
 static bool _cxl_pci_is_vphb_device(struct pci_controller *phb)
diff --git a/drivers/pci/host-bridge.c b/drivers/pci/host-bridge.c
index 5f4a2e0..add6623 100644
--- a/drivers/pci/host-bridge.c
+++ b/drivers/pci/host-bridge.c
@@ -44,6 +44,7 @@ void pci_set_host_bridge_release(struct pci_host_bridge 
*bridge,
bridge->release_fn = release_fn;
bridge->release_data = release_data;
 }
+EXPORT_SYMBOL_GPL(pci_set_host_bridge_release);
 
 void pcibios_resource_to_bus(struct pci_bus *bus, struct pci_bus_region 
*region,
 struct resource *res)
-- 
Andrew Donnellan  OzLabs, ADL Canberra
andrew.donnel...@au1.ibm.com  IBM Australia Limited



Re: [PATCH v2 2/2] kexec: extend kexec_file_load system call

2016-08-18 Thread Dave Young
Since Eric was objecting to the extension, I think you should convince him,
but I will review it from a code point of view.

On 08/11/16 at 08:03pm, Thiago Jung Bauermann wrote:
> From: AKASHI Takahiro 
> 
> Device tree blob must be passed to a second kernel on DTB-capable
> archs, like powerpc and arm64, but the current kernel interface
> lacks this support.
> 
> This patch extends kexec_file_load system call by adding an extra
> argument to this syscall so that an arbitrary number of file descriptors
> can be handed out from user space to the kernel.
> 
>   long sys_kexec_file_load(int kernel_fd, int initrd_fd,
>unsigned long cmdline_len,
>const char __user *cmdline_ptr,
>unsigned long flags,
>const struct kexec_fdset __user *ufdset);
> 
> If KEXEC_FILE_EXTRA_FDS is set to the "flags" argument, the "ufdset"
> argument points to the following struct buffer:
> 
>   struct kexec_fdset {
>   int nr_fds;
>   struct kexec_file_fd fds[0];
>   }
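
(For illustration, a userspace caller under this proposed ABI could look
like the sketch below; the fd plumbing and the __NR_kexec_file_load macro
are assumptions made for the example, not part of the patch.)

	#include <stdlib.h>
	#include <string.h>
	#include <unistd.h>
	#include <sys/syscall.h>
	#include <linux/kexec.h>

	static long load_with_partial_dtb(int kernel_fd, int initrd_fd,
					  int dtb_fd, const char *cmdline)
	{
		struct kexec_fdset *fdset;
		long ret;

		/* one entry appended after the flexible array header */
		fdset = malloc(sizeof(*fdset) + sizeof(struct kexec_file_fd));
		if (!fdset)
			return -1;

		fdset->nr_fds = 1;
		fdset->fds[0].type = KEXEC_FILE_TYPE_PARTIAL_DTB;
		fdset->fds[0].fd = dtb_fd;

		ret = syscall(__NR_kexec_file_load, kernel_fd, initrd_fd,
			      strlen(cmdline) + 1, cmdline,
			      KEXEC_FILE_EXTRA_FDS, fdset);
		free(fdset);
		return ret;
	}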
> 
> Signed-off-by: AKASHI Takahiro 
> Signed-off-by: Thiago Jung Bauermann 
> ---
>  include/linux/fs.h |  1 +
>  include/linux/kexec.h  |  7 ++--
>  include/linux/syscalls.h   |  4 ++-
>  include/uapi/linux/kexec.h | 22 
>  kernel/kexec_file.c| 83 
> ++
>  5 files changed, 108 insertions(+), 9 deletions(-)
> 
> diff --git a/include/linux/fs.h b/include/linux/fs.h
> index 3523bf62f328..847d9c31f428 100644
> --- a/include/linux/fs.h
> +++ b/include/linux/fs.h
> @@ -2656,6 +2656,7 @@ extern int do_pipe_flags(int *, int);
>   id(MODULE, kernel-module)   \
>   id(KEXEC_IMAGE, kexec-image)\
>   id(KEXEC_INITRAMFS, kexec-initramfs)\
> + id(KEXEC_PARTIAL_DTB, kexec-partial-dtb)\
>   id(POLICY, security-policy) \
>   id(MAX_ID, )
>  
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index 4f85d284ed0b..29202935055d 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -148,7 +148,10 @@ struct kexec_file_ops {
>   kexec_verify_sig_t *verify_sig;
>  #endif
>  };
> -#endif
> +
> +int __weak arch_kexec_verify_buffer(enum kexec_file_type type, const void 
> *buf,
> + unsigned long size);
> +#endif /* CONFIG_KEXEC_FILE */
>  
>  struct kimage {
>   kimage_entry_t head;
> @@ -280,7 +283,7 @@ extern int kexec_load_disabled;
>  
>  /* List of defined/legal kexec file flags */
>  #define KEXEC_FILE_FLAGS (KEXEC_FILE_UNLOAD | KEXEC_FILE_ON_CRASH | \
> -  KEXEC_FILE_NO_INITRAMFS)
> +  KEXEC_FILE_NO_INITRAMFS | KEXEC_FILE_EXTRA_FDS)
>  
>  #define VMCOREINFO_BYTES   (4096)
>  #define VMCOREINFO_NOTE_NAME   "VMCOREINFO"
> diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h
> index d02239022bd0..fc072bdb74e3 100644
> --- a/include/linux/syscalls.h
> +++ b/include/linux/syscalls.h
> @@ -66,6 +66,7 @@ struct perf_event_attr;
>  struct file_handle;
>  struct sigaltstack;
>  union bpf_attr;
> +struct kexec_fdset;
>  
>  #include 
>  #include 
> @@ -321,7 +322,8 @@ asmlinkage long sys_kexec_load(unsigned long entry, 
> unsigned long nr_segments,
>  asmlinkage long sys_kexec_file_load(int kernel_fd, int initrd_fd,
>   unsigned long cmdline_len,
>   const char __user *cmdline_ptr,
> - unsigned long flags);
> + unsigned long flags,
> + const struct kexec_fdset __user *ufdset);
>  
>  asmlinkage long sys_exit(int error_code);
>  asmlinkage long sys_exit_group(int error_code);
> diff --git a/include/uapi/linux/kexec.h b/include/uapi/linux/kexec.h
> index aae5ebf2022b..6279be79efba 100644
> --- a/include/uapi/linux/kexec.h
> +++ b/include/uapi/linux/kexec.h
> @@ -23,6 +23,28 @@
>  #define KEXEC_FILE_UNLOAD0x0001
>  #define KEXEC_FILE_ON_CRASH  0x0002
>  #define KEXEC_FILE_NO_INITRAMFS  0x0004
> +#define KEXEC_FILE_EXTRA_FDS 0x0008
> +
> +enum kexec_file_type {
> + KEXEC_FILE_TYPE_KERNEL,
> + KEXEC_FILE_TYPE_INITRAMFS,
> +
> + /*
> +  * Device Tree Blob containing just the nodes and properties that
> +  * the kexec_file_load caller wants to add or modify.
> +  */
> + KEXEC_FILE_TYPE_PARTIAL_DTB,
> +};
> +
> +struct kexec_file_fd {
> + enum kexec_file_type type;
> + int fd;
> +};
> +
> +struct kexec_fdset {
> + int nr_fds;
> + struct kexec_file_fd fds[0];
> +};
>  
>  /* These values match the ELF architecture values.
>   * Unless there is a good reason that should continue to be the case.
> diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
> index 113af2f219b9..d6803dd884e2 100644
> --- a/kernel/kexec_file.c
> +++ b/

Re: debug problems on ppc 83xx target due to changed struct task_struct

2016-08-18 Thread Christophe Leroy



On 17/08/2016 at 17:27, Holger Brunck wrote:

On 16/08/16 19:27, christophe leroy wrote:


On 15/08/2016 at 18:19, Dave Hansen wrote:

On 08/15/2016 07:35 AM, Holger Brunck wrote:

I tried this but unfortunately the error only occurs while remote debugging.
Locally with gdb everything works fine. BTW we double-checked with a 85xx ppc
target which is also 32-bit and it ends up with the same behaviour.

I was also investigating where I have to move the line in the struct task_struct
and it turns out to be like this (diff to 4.7 kernel):

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 253538f..4868874 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1655,7 +1655,9 @@ struct task_struct {
 struct signal_struct *signal;
 struct sighand_struct *sighand;

+   // struct thread_struct thread;   // until here everything is fine
 sigset_t blocked, real_blocked;
+   struct thread_struct thread;  // from here it's broken
 sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used 
*/
 struct sigpending pending;

Wow, thanks for all the debugging here!

So, we know it has to do with signals, thread_info, and probably only
affects 32-bit powerpc.  Seems awfully weird.  Have you checked with any
of the 64-bit powerpc guys to see if they have any ideas?

I went grepping around for a bit.

Where is the task_struct stored?  Is it on-stack on ppc32 or something?
The thread_info is, I assume, but I see some THREAD_INFO vs. THREAD
(thread struct) math happening in here, which confuses me:

 .globl  ret_from_debug_exc
ret_from_debug_exc:
 mfspr   r9,SPRN_SPRG_THREAD
 lwz r10,SAVED_KSP_LIMIT(r1)
 stw r10,KSP_LIMIT(r9)
 lwz r9,THREAD_INFO-THREAD(r9)
 CURRENT_THREAD_INFO(r10, r1)
 lwz r10,TI_PREEMPT(r10)
 stw r10,TI_PREEMPT(r9)
 RESTORE_xSRR(SRR0,SRR1);
 RESTORE_xSRR(CSRR0,CSRR1);
 RESTORE_MMU_REGS;
 RET_FROM_EXC_LEVEL(SPRN_DSRR0, SPRN_DSRR1, PPC_RFDI)

But, I'm really at a loss to explain this.  It still seems like a deeply
ppc-specific issue.  We can obviously work around it with an #ifdef for
your platform, but that's awfully hackish and hides the real bug,
whatever it is.

My suspicion is that there's a bug in the 32-bit ppc assembly somewhere.
I don't see any references to 'blocked' or 'real_blocked' in assembly
though.  You could add a bunch of padding instead of moving the
thread_struct and see if that does anything, but that's really a stab in
the dark.


Just to let you know, I'm not sure it is the same issue, but I also get
my 8xx target stuck when I try to use gdbserver.

If I debug a very small app, it gets stuck quickly after the app has
stopped: indeed, the console seems ok but as soon as I try to execute
something simple, like a ps or top, it get stuck. The target still
responds to pings, but nothing else.

If I debug a big app, it gets stuck soon after the start of debug: I set
a bpoint at main(), do a 'continue', get breaked at main(), do some
steps with 'next' then it gets stuck.

I have tried moving the struct thread_struct thread but it has no impact.


That sounds a bit different to what I see. Is your program also multi-threaded?




No, my program is a simple app doing a few printf("Hello World !"); calls
and nothing more.


I have now identified the issue; it is most likely specific to the 8xx: 
when entering the single step exception, the 8xx asserts the internal Freeze 
signal, which stops the decrementer and the timebase. In order to clear the 
internal Freeze, the ICR SPR has to be read.

I'll now be able to check with your program and see how it behaves.

Christophe


Re: [PATCH v2 1/2] kexec: add dtb info to struct kimage

2016-08-18 Thread Dave Young
On 08/11/16 at 08:03pm, Thiago Jung Bauermann wrote:
> From: AKASHI Takahiro 
> 
> Device tree blob must be passed to a second kernel on DTB-capable
> archs, like powerpc and arm64, but the current kernel interface
> lacks this support.
> 
> This patch adds dtb buffer information to struct kimage.
> When users don't specify dtb explicitly and the one used for the current
> kernel can be re-used, this change will be good enough for implementing
> kexec_file_load feature.
> 
> Signed-off-by: AKASHI Takahiro 
> ---
>  include/linux/kexec.h | 3 +++
>  kernel/kexec_file.c   | 3 +++
>  2 files changed, 6 insertions(+)
> 
> diff --git a/include/linux/kexec.h b/include/linux/kexec.h
> index d743baaa..4f85d284ed0b 100644
> --- a/include/linux/kexec.h
> +++ b/include/linux/kexec.h
> @@ -192,6 +192,9 @@ struct kimage {
>   char *cmdline_buf;
>   unsigned long cmdline_buf_len;
>  
> + void *dtb_buf;
> + unsigned long dtb_buf_len;
> +
>   /* File operations provided by image loader */
>   struct kexec_file_ops *fops;
>  
> diff --git a/kernel/kexec_file.c b/kernel/kexec_file.c
> index 503bc2d348e5..113af2f219b9 100644
> --- a/kernel/kexec_file.c
> +++ b/kernel/kexec_file.c
> @@ -92,6 +92,9 @@ void kimage_file_post_load_cleanup(struct kimage *image)
>   vfree(image->initrd_buf);
>   image->initrd_buf = NULL;
>  
> + vfree(image->dtb_buf);
> + image->dtb_buf = NULL;
> +
>   kfree(image->cmdline_buf);
>   image->cmdline_buf = NULL;
>  
> -- 
> 1.9.1
> 
> 
> ___
> kexec mailing list
> ke...@lists.infradead.org
> http://lists.infradead.org/mailman/listinfo/kexec

Acked-by: Dave Young 

Thanks
Dave


Re: [PATCH v2] powerpc: move hmi.c to arch/powerpc/kvm/

2016-08-18 Thread Paolo Bonzini


On 11/08/2016 15:07, Paolo Bonzini wrote:
> hmi.c functions are unused unless sibling_subcore_state is nonzero, and
> that in turn happens only if KVM is in use.  So move the code to
> arch/powerpc/kvm/, putting it under CONFIG_KVM_BOOK3S_HV_POSSIBLE
> rather than CONFIG_PPC_BOOK3S_64.  The sibling_subcore_state is also
> included in struct paca_struct only if KVM is supported by the kernel.
> 
> Cc: Daniel Axtens 
> Cc: Michael Ellerman 
> Cc: Mahesh Salgaonkar 
> Cc: Paul Mackerras 
> Cc: linuxppc-dev@lists.ozlabs.org
> Cc: kvm-...@vger.kernel.org
> Cc: k...@vger.kernel.org
> Signed-off-by: Paolo Bonzini 
> ---
>   v1->v2: use CONFIG_KVM_BOOK3S_HV_POSSIBLE, not
>   CONFIG_KVM_BOOK3S_64_HANDLER.  The former implies
>   the latter, but the reverse is not true.
> 
>  arch/powerpc/include/asm/hmi.h |  2 +-
>  arch/powerpc/include/asm/paca.h| 12 +++-
>  arch/powerpc/kernel/Makefile   |  2 +-
>  arch/powerpc/kvm/Makefile  |  1 +
>  arch/powerpc/{kernel/hmi.c => kvm/book3s_hv_hmi.c} |  0
>  5 files changed, 10 insertions(+), 7 deletions(-)
>  rename arch/powerpc/{kernel/hmi.c => kvm/book3s_hv_hmi.c} (100%)
> 
> diff --git a/arch/powerpc/include/asm/hmi.h b/arch/powerpc/include/asm/hmi.h
> index 88b4901ac4ee..85b7a1a21e22 100644
> --- a/arch/powerpc/include/asm/hmi.h
> +++ b/arch/powerpc/include/asm/hmi.h
> @@ -21,7 +21,7 @@
>  #ifndef __ASM_PPC64_HMI_H__
>  #define __ASM_PPC64_HMI_H__
>  
> -#ifdef CONFIG_PPC_BOOK3S_64
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  
>  #define  CORE_TB_RESYNC_REQ_BIT  63
>  #define MAX_SUBCORE_PER_CORE 4
> diff --git a/arch/powerpc/include/asm/paca.h b/arch/powerpc/include/asm/paca.h
> index 148303e7771f..6a6792bb39fb 100644
> --- a/arch/powerpc/include/asm/paca.h
> +++ b/arch/powerpc/include/asm/paca.h
> @@ -183,11 +183,6 @@ struct paca_struct {
>*/
>   u16 in_mce;
>   u8 hmi_event_available;  /* HMI event is available */
> - /*
> -  * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
> -  * more details
> -  */
> - struct sibling_subcore_state *sibling_subcore_state;
>  #endif
>  
>   /* Stuff for accurate time accounting */
> @@ -202,6 +197,13 @@ struct paca_struct {
>   struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
>  #endif
>   struct kvmppc_host_state kvm_hstate;
> +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> + /*
> +  * Bitmap for sibling subcore status. See kvm/book3s_hv_ras.c for
> +  * more details
> +  */
> + struct sibling_subcore_state *sibling_subcore_state;
> +#endif
>  #endif
>  };
>  
> diff --git a/arch/powerpc/kernel/Makefile b/arch/powerpc/kernel/Makefile
> index b2027a5cf508..fe4c075bcf50 100644
> --- a/arch/powerpc/kernel/Makefile
> +++ b/arch/powerpc/kernel/Makefile
> @@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32)+= vdso32/
>  obj-$(CONFIG_HAVE_HW_BREAKPOINT) += hw_breakpoint.o
>  obj-$(CONFIG_PPC_BOOK3S_64)  += cpu_setup_ppc970.o cpu_setup_pa6t.o
>  obj-$(CONFIG_PPC_BOOK3S_64)  += cpu_setup_power.o
> -obj-$(CONFIG_PPC_BOOK3S_64)  += mce.o mce_power.o hmi.o
> +obj-$(CONFIG_PPC_BOOK3S_64)  += mce.o mce_power.o
>  obj-$(CONFIG_PPC_BOOK3E_64)  += exceptions-64e.o idle_book3e.o
>  obj-$(CONFIG_PPC64)  += vdso64/
>  obj-$(CONFIG_ALTIVEC)+= vecemu.o
> diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> index 1f9e5529e692..855d4b95d752 100644
> --- a/arch/powerpc/kvm/Makefile
> +++ b/arch/powerpc/kvm/Makefile
> @@ -78,6 +78,7 @@ kvm-book3s_64-builtin-xics-objs-$(CONFIG_KVM_XICS) := \
>  
>  ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
>  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> + book3s_hv_hmi.o \
>   book3s_hv_rmhandlers.o \
>   book3s_hv_rm_mmu.o \
>   book3s_hv_ras.o \
> diff --git a/arch/powerpc/kernel/hmi.c b/arch/powerpc/kvm/book3s_hv_hmi.c
> similarity index 100%
> rename from arch/powerpc/kernel/hmi.c
> rename to arch/powerpc/kvm/book3s_hv_hmi.c
> 

Ping?

Paolo


Re: [PATCH v2 3/6] kexec_file: Allow skipping checksum calculation for some segments.

2016-08-18 Thread Dave Young
On 08/13/16 at 12:18am, Thiago Jung Bauermann wrote:
> Adds checksum argument to kexec_add_buffer specifying whether the given
> segment should be part of the checksum calculation.
> 

Since it is used together with kexec_add_buffer, could it be added to kbuf
as a new field?

Like kbuf.no_checksum: the default value 0 would mean the segment is
checksummed, and 1 would mean no checksum is needed.
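
A minimal sketch of that alternative (the no_checksum field and the
per-segment bookkeeping below are hypothetical, not existing kernel code):

	struct kexec_buf {
		struct kimage *image;
		/* ... existing fields: buffer, bufsz, mem, memsz, ... */
		bool top_down;
		bool no_checksum;	/* default 0: segment is checksummed */
	};

	int kexec_add_buffer(struct kexec_buf *kbuf)
	{
		struct kexec_segment *ksegment;

		/* ... existing placement logic ... */
		ksegment = &kbuf->image->segment[kbuf->image->nr_segments - 1];
		/* hypothetical per-segment flag mirroring kbuf->no_checksum */
		ksegment->skip_checksum = kbuf->no_checksum;
		return 0;
	}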

> The next patch will add a way to update segments after a kimage is loaded.
> Segments that will be updated in this way should not be checksummed,
> otherwise they will cause the purgatory checksum verification to fail
> when the machine is rebooted.
> 
> As a bonus, we don't need to special-case the purgatory segment anymore
> to avoid checksumming it.
> 
> Adjust call sites for the new argument.
> 
> Signed-off-by: Thiago Jung Bauermann 
> ---
>  arch/powerpc/kernel/kexec_elf_64.c |  6 +++---
>  arch/x86/kernel/crash.c|  4 ++--
>  arch/x86/kernel/kexec-bzimage64.c  |  6 +++---
>  include/linux/kexec.h  | 10 +++---
>  kernel/kexec_file.c| 23 ---
>  5 files changed, 27 insertions(+), 22 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/kexec_elf_64.c 
> b/arch/powerpc/kernel/kexec_elf_64.c
> index 22afc7b5ee73..4c528c81b076 100644
> --- a/arch/powerpc/kernel/kexec_elf_64.c
> +++ b/arch/powerpc/kernel/kexec_elf_64.c
> @@ -128,7 +128,7 @@ static int elf_exec_load(struct kimage *image, struct 
> elfhdr *ehdr,
>   kbuf.memsz = phdr->p_memsz;
>   kbuf.buf_align = phdr->p_align;
>   kbuf.buf_min = phdr->p_paddr + base;
> - ret = kexec_add_buffer(&kbuf);
> + ret = kexec_add_buffer(&kbuf, true);
>   if (ret)
>   goto out;
>   load_addr = kbuf.mem;
> @@ -188,7 +188,7 @@ void *elf64_load(struct kimage *image, char *kernel_buf,
>   kbuf.bufsz = kbuf.memsz = initrd_len;
>   kbuf.buf_align = PAGE_SIZE;
>   kbuf.top_down = false;
> - ret = kexec_add_buffer(&kbuf);
> + ret = kexec_add_buffer(&kbuf, true);
>   if (ret)
>   goto out;
>   initrd_load_addr = kbuf.mem;
> @@ -245,7 +245,7 @@ void *elf64_load(struct kimage *image, char *kernel_buf,
>   kbuf.bufsz = kbuf.memsz = fdt_size;
>   kbuf.buf_align = PAGE_SIZE;
>   kbuf.top_down = true;
> - ret = kexec_add_buffer(&kbuf);
> + ret = kexec_add_buffer(&kbuf, true);
>   if (ret)
>   goto out;
>   fdt_load_addr = kbuf.mem;
> diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
> index 38a1cdf6aa05..634ab16377b1 100644
> --- a/arch/x86/kernel/crash.c
> +++ b/arch/x86/kernel/crash.c
> @@ -642,7 +642,7 @@ int crash_load_segments(struct kimage *image)
>* copied in purgatory after crash. Just add a zero filled
>* segment for now to make sure checksum logic works fine.
>*/
> - ret = kexec_add_buffer(&kbuf);
> + ret = kexec_add_buffer(&kbuf, true);
>   if (ret)
>   return ret;
>   image->arch.backup_load_addr = kbuf.mem;
> @@ -661,7 +661,7 @@ int crash_load_segments(struct kimage *image)
>  
>   kbuf.memsz = kbuf.bufsz;
>   kbuf.buf_align = ELF_CORE_HEADER_ALIGN;
> - ret = kexec_add_buffer(&kbuf);
> + ret = kexec_add_buffer(&kbuf, true);
>   if (ret) {
>   vfree((void *)image->arch.elf_headers);
>   return ret;
> diff --git a/arch/x86/kernel/kexec-bzimage64.c 
> b/arch/x86/kernel/kexec-bzimage64.c
> index 4b3a75329fb6..a46e3fbb0639 100644
> --- a/arch/x86/kernel/kexec-bzimage64.c
> +++ b/arch/x86/kernel/kexec-bzimage64.c
> @@ -422,7 +422,7 @@ static void *bzImage64_load(struct kimage *image, char 
> *kernel,
>   kbuf.memsz = kbuf.bufsz;
>   kbuf.buf_align = 16;
>   kbuf.buf_min = MIN_BOOTPARAM_ADDR;
> - ret = kexec_add_buffer(&kbuf);
> + ret = kexec_add_buffer(&kbuf, true);
>   if (ret)
>   goto out_free_params;
>   bootparam_load_addr = kbuf.mem;
> @@ -435,7 +435,7 @@ static void *bzImage64_load(struct kimage *image, char 
> *kernel,
>   kbuf.memsz = PAGE_ALIGN(header->init_size);
>   kbuf.buf_align = header->kernel_alignment;
>   kbuf.buf_min = MIN_KERNEL_LOAD_ADDR;
> - ret = kexec_add_buffer(&kbuf);
> + ret = kexec_add_buffer(&kbuf, true);
>   if (ret)
>   goto out_free_params;
>   kernel_load_addr = kbuf.mem;
> @@ -449,7 +449,7 @@ static void *bzImage64_load(struct kimage *image, char 
> *kernel,
>   kbuf.bufsz = kbuf.memsz = initrd_len;
>   kbuf.buf_align = PAGE_SIZE;
>   kbuf.buf_min = MIN_INITRD_LOAD_ADDR;
> - ret = kexec_add_buffer(&kbuf);
> + ret = kexec_add_buffer(&kbuf, true);
>   if (ret)
>   goto out_free_params;
>   initrd_load_addr = kbuf.mem;
> dif

[PATCH] powerpc/8xx: fix single_step debug

2016-08-18 Thread Christophe Leroy
SPRN_ICR must be read for clearing the internal freeze signal which
is asserted by the single step exception, otherwise the timebase and
decrementer remain freezed

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/reg_8xx.h | 1 +
 arch/powerpc/kernel/traps.c| 8 
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/reg_8xx.h 
b/arch/powerpc/include/asm/reg_8xx.h
index feaf641..6dae71f 100644
--- a/arch/powerpc/include/asm/reg_8xx.h
+++ b/arch/powerpc/include/asm/reg_8xx.h
@@ -17,6 +17,7 @@
 #define SPRN_DC_DAT570 /* Read-only data register */
 
 /* Misc Debug */
+#define SPRN_ICR   148
 #define SPRN_DPDR  630
 #define SPRN_MI_CAM816
 #define SPRN_MI_RAM0   817
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 2cb5892..0f1f0ce 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -400,8 +400,16 @@ static inline int check_io_access(struct pt_regs *regs)
 #define REASON_TRAP0x2
 
 #define single_stepping(regs)  ((regs)->msr & MSR_SE)
+#ifdef CONFIG_PPC_8xx
+static inline void clear_single_step(struct pt_regs *regs)
+{
+   regs->msr &= ~MSR_SE;
+   mfspr(SPRN_ICR);
+}
+#else
 #define clear_single_step(regs)((regs)->msr &= ~MSR_SE)
 #endif
+#endif
 
 #if defined(CONFIG_4xx)
 int machine_check_4xx(struct pt_regs *regs)
-- 
2.1.0



Re: [PATCH] powerpc/8xx: fix single_step debug

2016-08-18 Thread Gabriel Paubert
On Thu, Aug 18, 2016 at 11:44:20AM +0200, Christophe Leroy wrote:
> SPRN_ICR must be read for clearing the internal freeze signal which
> is asserted by the single step exception, otherwise the timebase and
> decrementer remain freezed

Minor nit: s/freezed/frozen/

If the timebase and decrementer are frozen even for a few cycles, this
probably upsets timekeeping. I consider this a completely stupid design
decision, and maybe I'm not alone.

Gabriel

> 
> Signed-off-by: Christophe Leroy 
> ---
>  arch/powerpc/include/asm/reg_8xx.h | 1 +
>  arch/powerpc/kernel/traps.c| 8 
>  2 files changed, 9 insertions(+)
> 
> diff --git a/arch/powerpc/include/asm/reg_8xx.h 
> b/arch/powerpc/include/asm/reg_8xx.h
> index feaf641..6dae71f 100644
> --- a/arch/powerpc/include/asm/reg_8xx.h
> +++ b/arch/powerpc/include/asm/reg_8xx.h
> @@ -17,6 +17,7 @@
>  #define SPRN_DC_DAT  570 /* Read-only data register */
>  
>  /* Misc Debug */
> +#define SPRN_ICR 148
>  #define SPRN_DPDR630
>  #define SPRN_MI_CAM  816
>  #define SPRN_MI_RAM0 817
> diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
> index 2cb5892..0f1f0ce 100644
> --- a/arch/powerpc/kernel/traps.c
> +++ b/arch/powerpc/kernel/traps.c
> @@ -400,8 +400,16 @@ static inline int check_io_access(struct pt_regs *regs)
>  #define REASON_TRAP  0x2
>  
>  #define single_stepping(regs)((regs)->msr & MSR_SE)
> +#ifdef CONFIG_PPC_8xx
> +static inline void clear_single_step(struct pt_regs *regs)
> +{
> + regs->msr &= ~MSR_SE;
> + mfspr(SPRN_ICR);
> +}
> +#else
>  #define clear_single_step(regs)  ((regs)->msr &= ~MSR_SE)
>  #endif
> +#endif
>  
>  #if defined(CONFIG_4xx)
>  int machine_check_4xx(struct pt_regs *regs)
> -- 
> 2.1.0


Re: [PATCH] powerpc/8xx: fix single_step debug

2016-08-18 Thread Christophe Leroy



On 18/08/2016 at 11:58, Gabriel Paubert wrote:

On Thu, Aug 18, 2016 at 11:44:20AM +0200, Christophe Leroy wrote:

SPRN_ICR must be read for clearing the internal freeze signal which
is asserted by the single step exception, otherwise the timebase and
decrementer remain freezed


Minor nit: s/freezed/frozen/

If the timebase and decrementer are frozen even for a few cycles, this
probably upsets timekeeping. I consider this a completely stupid design
decision, and maybe I'm not alone.

Gabriel


We could also unset the TBF bit (TimeBase Freeze enable) in the TBSCR 
register (today it is set in arch/powerpc/platforms/8xx/m8xx_setup.c), but 
then it would impact debugging done with an external BDM system, which 
expects the decrementer and TB to be frozen when it freezes execution.


Christophe






Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/reg_8xx.h | 1 +
 arch/powerpc/kernel/traps.c| 8 
 2 files changed, 9 insertions(+)

diff --git a/arch/powerpc/include/asm/reg_8xx.h 
b/arch/powerpc/include/asm/reg_8xx.h
index feaf641..6dae71f 100644
--- a/arch/powerpc/include/asm/reg_8xx.h
+++ b/arch/powerpc/include/asm/reg_8xx.h
@@ -17,6 +17,7 @@
 #define SPRN_DC_DAT570 /* Read-only data register */

 /* Misc Debug */
+#define SPRN_ICR   148
 #define SPRN_DPDR  630
 #define SPRN_MI_CAM816
 #define SPRN_MI_RAM0   817
diff --git a/arch/powerpc/kernel/traps.c b/arch/powerpc/kernel/traps.c
index 2cb5892..0f1f0ce 100644
--- a/arch/powerpc/kernel/traps.c
+++ b/arch/powerpc/kernel/traps.c
@@ -400,8 +400,16 @@ static inline int check_io_access(struct pt_regs *regs)
 #define REASON_TRAP0x2

 #define single_stepping(regs)  ((regs)->msr & MSR_SE)
+#ifdef CONFIG_PPC_8xx
+static inline void clear_single_step(struct pt_regs *regs)
+{
+   regs->msr &= ~MSR_SE;
+   mfspr(SPRN_ICR);
+}
+#else
 #define clear_single_step(regs)((regs)->msr &= ~MSR_SE)
 #endif
+#endif

 #if defined(CONFIG_4xx)
 int machine_check_4xx(struct pt_regs *regs)
--
2.1.0


Re: [PATCH] powerpc/8xx: fix single_step debug

2016-08-18 Thread Gabriel Paubert
On Thu, Aug 18, 2016 at 12:13:21PM +0200, Christophe Leroy wrote:
> 
> 
> On 18/08/2016 at 11:58, Gabriel Paubert wrote:
> >On Thu, Aug 18, 2016 at 11:44:20AM +0200, Christophe Leroy wrote:
> >>SPRN_ICR must be read for clearing the internal freeze signal which
> >>is asserted by the single step exception, otherwise the timebase and
> >>decrementer remain freezed
> >
> >Minor nit: s/freezed/frozen/
> >
> >If the timebase and decrementer are frozen even for a few cycles, this
> >probably upsets timekeeping. I consider this a completely stupid design
> >decision, and maybe I'm not alone.
> >
> >Gabriel
> 
> We could also unset TBF bit (TimeBase Freeze enable) in TBSCR
> register (today it is set in
> arch/powerpc/platforms/8xx/m8xx_setup.c) but then it would impact
> debug done with an external BDM system which expects the decrementer
> and TB frozen when it freezes the execution.

Ok, I believe that systematically setting it is a mistake, but then I'm
always a bit nervous about screwing up timekeeping (it certainly is
always a very bad idea when you are driving telescopes).

Gabriel


Re: [PATCH v2 0/2] extend kexec_file_load system call

2016-08-18 Thread Mark Rutland
On Thu, Aug 11, 2016 at 08:03:56PM -0300, Thiago Jung Bauermann wrote:
> This patch series is from AKASHI Takahiro. I will use it in my next
> version of the kexec_file_load implementation for powerpc, so I am
> rebasing it on top of v4.8-rc1.

[...]

> Original cover letter:
> 
> Device tree blob must be passed to a second kernel on DTB-capable
> archs, like powerpc and arm64, but the current kernel interface
> lacks this support.
> 
> This patch extends kexec_file_load system call by adding an extra
> argument to this syscall so that an arbitrary number of file descriptors
> can be handed out from user space to the kernel.
> 
> See the background [1].
> 
> Please note that the new interface looks quite similar to the current
> system call, but that it won't always mean that it provides the "binary
> compatibility."
> 
> [1] http://lists.infradead.org/pipermail/kexec/2016-June/016276.html

As with the original posting, I have a number of concerns, and I'm
really not keen on this.

* For typical usecases, I do not believe that this is necessary (at
  least for arm64), and generally do not believe that it should be
  necessary for a user to manipulate the DTB (much like the user need
  not manipulate ACPI tables or other FW data structures).

  Other than (potentially) the case of Linux as a flashed-in bootloader,
  I don't see a compelling case for modifying the DTB that could not be
  accomplished in-kernel. For that case, if truly necessary, I think
  that we can get away with something simpler.

* This series adds architecture-specific hooks, but doesn't define what
  the architecture code is expected to do. For example, what is the
  format of the partial DTB? Is it formatted as an overlay, or a regular
  DTB that is expected to be merged somehow?

  I'm afraid that the scope is unbound, and we'll see requests to
  whitelist/blacklist arbitrary nodes or properties in arch code. This
  goes against the original simple design of kexec_file_load. It also
  implies that we're going to have varied architecture-specific
  semantics, and that arch code might not consistently check all that it
  should.
  
* Further, I believe that this offers a lot of scope for unintentionally
  allowing certain modifications to the DTB that we do not want, and
  avoiding that in general is very tricky. e.g. if we allow the
  insertion or modification of nodes, how do we prevent phandle target
  hijacking?

I really don't think that this is a good idea. Please consider this a
NAK from my end.

Thanks,
Mark.


Re: [PATCH] powerpc/8xx: fix single_step debug

2016-08-18 Thread Christophe Leroy



On 18/08/2016 at 12:16, Gabriel Paubert wrote:

On Thu, Aug 18, 2016 at 12:13:21PM +0200, Christophe Leroy wrote:



On 18/08/2016 at 11:58, Gabriel Paubert wrote:

On Thu, Aug 18, 2016 at 11:44:20AM +0200, Christophe Leroy wrote:

SPRN_ICR must be read for clearing the internal freeze signal which
is asserted by the single step exception, otherwise the timebase and
decrementer remain freezed


Minor nit: s/freezed/frozen/

If the timebase and decrementer are frozen even for a few cycles, this
probably upsets timekeeping. I consider this a completely stupid design
decision, and maybe I'm not alone.

   Gabriel


We could also unset TBF bit (TimeBase Freeze enable) in TBSCR
register (today it is set in
arch/powerpc/platforms/8xx/m8xx_setup.c) but then it would impact
debug done with an external BDM system which expects the decrementer
and TB frozen when it freezes the execution.


Ok, I believe that systematically setting it is a mistake, but then I'm
always a bit nervous about screwing up timekeeping (it certainly is
always a very bad idea when you are driving telescopes).



Indeed you are right, this should not happen. The issue is due to the 
fact that the bootloader sets the TRE bit in the DER register.

So the fix is to be done in the bootloader.
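
For illustration only, such a bootloader-side fix could amount to clearing
that bit before handing control to the kernel; a hypothetical sketch (the
DER SPR number and the TRE bit mask are placeholders to be checked against
the MPC8xx reference manual, not values taken from this thread):

	#define SPRN_DER	149		/* assumed SPR number */
	#define DER_TRE		0x00000200	/* assumed trace-enable bit */

	static inline void bootloader_clear_der_tre(void)
	{
		unsigned long der;

		/* read-modify-write the Debug Enable Register */
		asm volatile("mfspr %0, 149" : "=r" (der));
		der &= ~DER_TRE;
		asm volatile("mtspr 149, %0" : : "r" (der));
	}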

Christophe


[PATCH 15/16] powerpc: powermac: Convert to hotplug state machine

2016-08-18 Thread Sebastian Andrzej Siewior
Install the callbacks via the state machine.
I assume here that the powermac has two CPUs and so only one can go up
or down at a time. The variable smp_core99_host_open is here to ensure
that we do not try to open or close the i2c host twice if something goes
wrong and we invoke the prepare or online callback twice due to
rollback.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Sebastian Andrzej Siewior 
---
 arch/powerpc/platforms/powermac/smp.c | 50 +--
 include/linux/cpuhotplug.h|  1 +
 2 files changed, 26 insertions(+), 25 deletions(-)

diff --git a/arch/powerpc/platforms/powermac/smp.c 
b/arch/powerpc/platforms/powermac/smp.c
index 834868b9fdc9..4a853323f906 100644
--- a/arch/powerpc/platforms/powermac/smp.c
+++ b/arch/powerpc/platforms/powermac/smp.c
@@ -852,37 +852,33 @@ static void smp_core99_setup_cpu(int cpu_nr)
 
 #ifdef CONFIG_PPC64
 #ifdef CONFIG_HOTPLUG_CPU
-static int smp_core99_cpu_notify(struct notifier_block *self,
-unsigned long action, void *hcpu)
+static unsigned int smp_core99_host_open;
+
+static int smp_core99_cpu_prepare(unsigned int cpu)
 {
int rc;
 
-   switch(action & ~CPU_TASKS_FROZEN) {
-   case CPU_UP_PREPARE:
-   /* Open i2c bus if it was used for tb sync */
-   if (pmac_tb_clock_chip_host) {
-   rc = pmac_i2c_open(pmac_tb_clock_chip_host, 1);
-   if (rc) {
-   pr_err("Failed to open i2c bus for time 
sync\n");
-   return notifier_from_errno(rc);
-   }
+   /* Open i2c bus if it was used for tb sync */
+   if (pmac_tb_clock_chip_host && !smp_core99_host_open) {
+   rc = pmac_i2c_open(pmac_tb_clock_chip_host, 1);
+   if (rc) {
+   pr_err("Failed to open i2c bus for time sync\n");
+   return notifier_from_errno(rc);
}
-   break;
-   case CPU_ONLINE:
-   case CPU_UP_CANCELED:
-   /* Close i2c bus if it was used for tb sync */
-   if (pmac_tb_clock_chip_host)
-   pmac_i2c_close(pmac_tb_clock_chip_host);
-   break;
-   default:
-   break;
+   smp_core99_host_open = 1;
}
-   return NOTIFY_OK;
+   return 0;
 }
 
-static struct notifier_block smp_core99_cpu_nb = {
-   .notifier_call  = smp_core99_cpu_notify,
-};
+static int smp_core99_cpu_online(unsigned int cpu)
+{
+   /* Close i2c bus if it was used for tb sync */
+   if (pmac_tb_clock_chip_host && smp_core99_host_open) {
+   pmac_i2c_close(pmac_tb_clock_chip_host);
+   smp_core99_host_open = 0;
+   }
+   return 0;
+}
 #endif /* CONFIG_HOTPLUG_CPU */
 
 static void __init smp_core99_bringup_done(void)
@@ -902,7 +898,11 @@ static void __init smp_core99_bringup_done(void)
g5_phy_disable_cpu1();
}
 #ifdef CONFIG_HOTPLUG_CPU
-   register_cpu_notifier(&smp_core99_cpu_nb);
+   cpuhp_setup_state_nocalls(CPUHP_POWER_PMAC_PREPARE,
+ "POWER_PMAC_PREPARE", smp_core99_cpu_prepare,
+ NULL);
+   cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, "AP_POWER_PMAC_ONLINE",
+ smp_core99_cpu_online, NULL);
 #endif
 
if (ppc_md.progress)
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 9e50a7b3bbcd..4974c9fdbf9a 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -32,6 +32,7 @@ enum cpuhp_state {
CPUHP_RCUTREE_PREP,
CPUHP_MD_RAID5_PREPARE,
CPUHP_CPUIDLE_COUPLED_PREPARE,
+   CPUHP_POWER_PMAC_PREPARE,
CPUHP_NOTIFY_PREPARE,
CPUHP_TIMERS_DEAD,
CPUHP_BRINGUP_CPU,
-- 
2.9.3



[PATCH 16/16] powerpc: mmu nohash: Convert to hotplug state machine

2016-08-18 Thread Sebastian Andrzej Siewior
Install the callbacks via the state machine.

Cc: Benjamin Herrenschmidt 
Cc: Paul Mackerras 
Cc: Michael Ellerman 
Cc: linuxppc-dev@lists.ozlabs.org
Signed-off-by: Sebastian Andrzej Siewior 
---
 arch/powerpc/mm/mmu_context_nohash.c | 54 +++-
 include/linux/cpuhotplug.h   |  1 +
 2 files changed, 24 insertions(+), 31 deletions(-)

diff --git a/arch/powerpc/mm/mmu_context_nohash.c 
b/arch/powerpc/mm/mmu_context_nohash.c
index 7d95bc402dba..f7755fcbe1f8 100644
--- a/arch/powerpc/mm/mmu_context_nohash.c
+++ b/arch/powerpc/mm/mmu_context_nohash.c
@@ -369,44 +369,34 @@ void destroy_context(struct mm_struct *mm)
 }
 
 #ifdef CONFIG_SMP
-
-static int mmu_context_cpu_notify(struct notifier_block *self,
- unsigned long action, void *hcpu)
+static int mmu_ctx_cpu_prepare(unsigned int cpu)
 {
-   unsigned int cpu = (unsigned int)(long)hcpu;
-
/* We don't touch CPU 0 map, it's allocated at aboot and kept
 * around forever
 */
if (cpu == boot_cpuid)
-   return NOTIFY_OK;
+   return 0;
 
-   switch (action) {
-   case CPU_UP_PREPARE:
-   case CPU_UP_PREPARE_FROZEN:
-   pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
-   stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
-   break;
-#ifdef CONFIG_HOTPLUG_CPU
-   case CPU_UP_CANCELED:
-   case CPU_UP_CANCELED_FROZEN:
-   case CPU_DEAD:
-   case CPU_DEAD_FROZEN:
-   pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
-   kfree(stale_map[cpu]);
-   stale_map[cpu] = NULL;
-
-   /* We also clear the cpu_vm_mask bits of CPUs going away */
-   clear_tasks_mm_cpumask(cpu);
-   break;
-#endif /* CONFIG_HOTPLUG_CPU */
-   }
-   return NOTIFY_OK;
+   pr_devel("MMU: Allocating stale context map for CPU %d\n", cpu);
+   stale_map[cpu] = kzalloc(CTX_MAP_SIZE, GFP_KERNEL);
+   return 0;
 }
 
-static struct notifier_block mmu_context_cpu_nb = {
-   .notifier_call  = mmu_context_cpu_notify,
-};
+static int mmu_ctx_cpu_dead(unsigned int cpu)
+{
+#ifdef CONFIG_HOTPLUG_CPU
+   if (cpu == boot_cpuid)
+   return 0;
+
+   pr_devel("MMU: Freeing stale context map for CPU %d\n", cpu);
+   kfree(stale_map[cpu]);
+   stale_map[cpu] = NULL;
+
+   /* We also clear the cpu_vm_mask bits of CPUs going away */
+   clear_tasks_mm_cpumask(cpu);
+#endif
+   return 0;
+}
 
 #endif /* CONFIG_SMP */
 
@@ -469,7 +459,9 @@ void __init mmu_context_init(void)
 #else
stale_map[boot_cpuid] = memblock_virt_alloc(CTX_MAP_SIZE, 0);
 
-   register_cpu_notifier(&mmu_context_cpu_nb);
+   cpuhp_setup_state_nocalls(CPUHP_POWER_MMU_CTX_PREPARE,
+ "POWER_MMU_CTX_PREPARE", mmu_ctx_cpu_prepare,
+ mmu_ctx_cpu_dead);
 #endif
 
printk(KERN_INFO
diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h
index 4974c9fdbf9a..92b9cf3271b2 100644
--- a/include/linux/cpuhotplug.h
+++ b/include/linux/cpuhotplug.h
@@ -33,6 +33,7 @@ enum cpuhp_state {
CPUHP_MD_RAID5_PREPARE,
CPUHP_CPUIDLE_COUPLED_PREPARE,
CPUHP_POWER_PMAC_PREPARE,
+   CPUHP_POWER_MMU_CTX_PREPARE,
CPUHP_NOTIFY_PREPARE,
CPUHP_TIMERS_DEAD,
CPUHP_BRINGUP_CPU,
-- 
2.9.3



Re: [PowerPC] today's main line failed to build on PowerPC

2016-08-18 Thread Balbir Singh
On Thu, Aug 18, 2016 at 11:50:28AM +0530, Abdul Haleem wrote:
> Hi,
> 
> The main line stable 4.8.0-rc2 failed to build on PowerPC with following
> build errors. config : pseries_le_defconfig Machine Type : PowerPC Bare
> Metal
> 
> 09:34:22 00:04:59 INFO | make -j 160  vmlinux
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S: Assembler 
> messages:
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:353: Error: 
> missing operand
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:612: Error: 
> missing operand
> 09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:670: Error: 
> missing operand
> 09:34:24 00:05:01 ERROR| [stderr] make[1]: *** 
> [arch/powerpc/mm/hash_low_32.o] Error 1
> 09:34:24 00:05:01 ERROR| [stderr] make[1]: *** Waiting for unfinished jobs
> 09:34:25 00:05:02 ERROR| [stderr] arch/powerpc/kernel/head_32.S: Assembler 
> messages:
> 09:34:25 00:05:02 ERROR| [stderr] arch/powerpc/kernel/head_32.S:1113: Error: 
> missing operand
> 09:34:25 00:05:02 ERROR| [stderr] make[1]: *** 
> [arch/powerpc/kernel/head_32.o] Error 1
> 09:34:25 00:05:02 ERROR| [stderr] make[1]: *** Waiting for unfinished jobs
> 09:34:27 00:05:04 ERROR| [stderr] make: *** [arch/powerpc/mm] Error 2
> 09:34:27 00:05:04 ERROR| [stderr] make: *** Waiting for unfinished jobs
> 09:34:42 00:05:19 ERROR| [stderr] make: *** [arch/powerpc/kernel] Error 2
>

Sounds like the assembler could not build the 32-bit assembly files. Has
the build succeeded with the same compiler/toolchain and options before?
 
> Regard's
> Abdul
> 


[PATCH V4 2/8] powerpc/memory: Parse new memory property to register blocks.

2016-08-18 Thread Michael Bringmann
powerpc/memory: Add parallel routines to parse the new
"ibm,dynamic-memory-v2" property when it is present, and then to
register the relevant memory blocks with the operating system.
This property format is intended to provide a more compact
representation of memory when communicating with the front end
processor, especially when describing vast amounts of RAM.
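
Schematically, the two encodings compare as below (an illustration based
on the structures in this patch, not the normative PAPR layout):

	ibm,dynamic-memory:     <N> <lmb_0> <lmb_1> ... <lmb_N-1>
	    one of_drconf_cell per LMB

	ibm,dynamic-memory-v2:  <M> <set_0> <set_1> ... <set_M-1>
	    one of_drconf_cell_v2 per run of LMBs whose base addresses and
	    DRC indexes increase consecutively and whose aa_index and flags
	    match, so M <= N and the property shrinks sharply when memory
	    is laid out contiguously.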

[V4: Move a couple of function prototypes from header file
to a later patch where they will be used.]
[V4: Add some comments.]
[V4: Change a property check to scan actual device tree.]
[V4: Compress some common code.]

Signed-off-by: Michael Bringmann 
---
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 7f436ba..b9a1534 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -69,6 +69,8 @@ struct boot_param_header {
  * OF address retreival & translation
  */
 
+extern int n_mem_addr_cells;
+
 /* Parse the ibm,dma-window property of an OF node into the busno, phys and
  * size parameters.
  */
@@ -81,8 +83,9 @@ extern void of_instantiate_rtc(void);
 extern int of_get_ibm_chip_id(struct device_node *np);
 
 /* The of_drconf_cell struct defines the layout of the LMB array
- * specified in the device tree property
- * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory
+ * specified in the device tree properties,
+ * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory
+ * ibm,dynamic-reconfiguration-memory/ibm,dynamic-memory-v2
  */
 struct of_drconf_cell {
u64 base_addr;
@@ -92,9 +95,20 @@ struct of_drconf_cell {
u32 flags;
 };
 
-#define DRCONF_MEM_ASSIGNED0x0008
-#define DRCONF_MEM_AI_INVALID  0x0040
-#define DRCONF_MEM_RESERVED0x0080
+#define DRCONF_MEM_ASSIGNED0x0008
+#define DRCONF_MEM_AI_INVALID  0x0040
+#define DRCONF_MEM_RESERVED0x0080
+
+struct of_drconf_cell_v2 {
+   u32 num_seq_lmbs;
+   u64 base_addr;
+   u32 drc_index;
+   u32 aa_index;
+   u32 flags;
+} __attribute__((packed));
+
+extern void read_drconf_cell_v2(struct of_drconf_cell_v2 *drmem,
+   const __be32 **cellp);
 
 /*
  * There are two methods for telling firmware what our capabilities are.
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 669a15e..ad294ce 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -57,8 +57,10 @@
 EXPORT_SYMBOL(node_data);
 
 static int min_common_depth;
-static int n_mem_addr_cells, n_mem_size_cells;
+int n_mem_addr_cells;
+static int n_mem_size_cells;
 static int form1_affinity;
+EXPORT_SYMBOL(n_mem_addr_cells);
 
 #define MAX_DISTANCE_REF_POINTS 4
 static int distance_ref_points_depth;
@@ -405,6 +405,24 @@ static void read_drconf_cell(struct of_drconf_cell *drmem, 
const __be32 **cellp)
 
*cellp = cp + 4;
 }
+
+/*
+ * Read the next memory block set entry from the ibm,dynamic-memory-v2 property
+ * and return the information in the provided of_drconf_cell_v2 structure.
+ */
+void read_drconf_cell_v2(struct of_drconf_cell_v2 *drmem, const __be32 **cellp)
+{
+   const __be32 *cp = (const __be32 *)*cellp;
+   drmem->num_seq_lmbs = be32_to_cpu(*cp++);
+   drmem->base_addr = read_n_cells(n_mem_addr_cells, &cp);
+   drmem->drc_index = be32_to_cpu(*cp++);
+   drmem->aa_index = be32_to_cpu(*cp++);
+   drmem->flags = be32_to_cpu(*cp++);
+
+   *cellp = cp;
+}
+EXPORT_SYMBOL(read_drconf_cell_v2);
 
 /*
  * Retrieve and validate the ibm,dynamic-memory property of the device tree.
diff --git a/arch/powerpc/kernel/prom.c b/arch/powerpc/kernel/prom.c
index b0245be..51330bc 100644
--- a/arch/powerpc/kernel/prom.c
+++ b/arch/powerpc/kernel/prom.c
@@ -443,23 +443,34 @@ static int __init early_init_dt_scan_chosen_ppc(unsigned 
long node,
 
 #ifdef CONFIG_PPC_PSERIES
 /*
- * Interpret the ibm,dynamic-memory property in the
- * /ibm,dynamic-reconfiguration-memory node.
+ * Retrieve and validate the ibm,lmb-size property for drconf memory
+ * from the flattened device tree.
+ */
+static u64 __init get_lmb_size(unsigned long node)
+{
+   const __be32 *ls;
+   int len;
+   ls = of_get_flat_dt_prop(node, "ibm,lmb-size", &len);
+   if (!ls || len < dt_root_size_cells * sizeof(__be32))
+   return 0;
+   return dt_mem_next_cell(dt_root_size_cells, &ls);
+}
+
+/*
+ * Interpret the ibm,dynamic-memory property/ibm,dynamic-memory-v2
+ * in the /ibm,dynamic-reconfiguration-memory node.
  * This contains a list of memory blocks along with NUMA affinity
  * information.
  */
-static int __init early_init_dt_scan_drconf_memory(unsigned long node)
+static int __init early_init_dt_scan_drconf_memory_v1(unsigned long node)
 {
-   const __be32 *dm, *ls, *usm;
+   const __be32 *dm, *usm;
int l;
unsigned long n, flags;
u64 base, size, membloc

[PATCH V4 3/8] powerpc/memory: Parse new memory property to initialize structures.

2016-08-18 Thread Michael Bringmann
powerpc/memory: Add parallel routines to parse the new
"ibm,dynamic-memory-v2" property when it is present, and then to
finish initialization of the relevant memory structures with the
operating system.  This code is shared between the boot-time
initialization functions and the runtime functions for memory
hotplug, so it needs to be able to handle both formats.

[V4: Added external function prototype definitions to header file
"prom.h" for use in other files.]
[V4: Replace a firmware feature test by an actual property scan.]
[V4: Delete an unused variable.]
[V4: Small cleanups to comments.]

Signed-off-by: Michael Bringmann 
---
diff --git a/arch/powerpc/include/asm/prom.h b/arch/powerpc/include/asm/prom.h
index 7f436ba..b9a1534 100644
--- a/arch/powerpc/include/asm/prom.h
+++ b/arch/powerpc/include/asm/prom.h
@@ -109,6 +109,18 @@ struct of_drconf_cell_v2 {
  
 extern void read_drconf_cell_v2(struct of_drconf_cell_v2 *drmem,
const __be32 **cellp);
+
+extern void read_one_drc_info(int **info, char **drc_type, char **drc_name,
+   unsigned long int *fdi_p, unsigned long int *nsl_p,
+   unsigned long int *si_p, unsigned long int *ldi_p);
+
+static inline int dyn_mem_v2_len(int entries)
+{
+   int drconf_v2_cells = (n_mem_addr_cells + 4);
+   int drconf_v2_cells_len = (drconf_v2_cells * sizeof(unsigned int));
+   return (((entries) * drconf_v2_cells_len) +
+(1 * sizeof(unsigned int)));
+}
 
 /*
  * There are two methods for telling firmware what our capabilities are.
diff --git a/arch/powerpc/mm/numa.c b/arch/powerpc/mm/numa.c
index 669a15e..18b4ee7 100644
--- a/arch/powerpc/mm/numa.c
+++ b/arch/powerpc/mm/numa.c
@@ -427,30 +426,55 @@
 EXPORT_SYMBOL(read_drconf_cell_v2);
 
 /*
- * Retrieve and validate the ibm,dynamic-memory property of the device tree.
+ * Retrieve and validate the ibm,dynamic-memory[-v2] property of the
+ * device tree.
+ *
+ * The layout of the ibm,dynamic-memory property is a number N of memory
+ * block description list entries followed by N memory block description
+ * list entries.  Each memory block description list entry contains
+ * information as laid out in the of_drconf_cell struct above.
  *
- * The layout of the ibm,dynamic-memory property is a number N of memblock
- * list entries followed by N memblock list entries.  Each memblock list entry
- * contains information as laid out in the of_drconf_cell struct above.
+ * The layout of the ibm,dynamic-memory-v2 property is a number N of memory
+ * block set description list entries, followed by N memory block set
+ * description set entries.
  */
 static int of_get_drconf_memory(struct device_node *memory, const __be32 **dm)
 {
const __be32 *prop;
u32 len, entries;
 
-   prop = of_get_property(memory, "ibm,dynamic-memory", &len);
-   if (!prop || len < sizeof(unsigned int))
-   return 0;
+   if (firmware_has_feature(FW_FEATURE_DYN_MEM_V2)) {
 
-   entries = of_read_number(prop++, 1);
+   prop = of_get_property(memory, "ibm,dynamic-memory-v2", &len);
+   if (!prop || len < sizeof(unsigned int))
+   return 0;
 
-   /* Now that we know the number of entries, revalidate the size
-* of the property read in to ensure we have everything
-*/
-   if (len < (entries * (n_mem_addr_cells + 4) + 1) * sizeof(unsigned int))
-   return 0;
+   entries = of_read_number(prop++, 1);
+
+   /* Now that we know the number of set entries, revalidate the
+* size of the property read in to ensure we have everything.
+*/
+   if (len < dyn_mem_v2_len(entries))
+   return 0;
+
+   *dm = prop;
+   } else {
+   prop = of_get_property(memory, "ibm,dynamic-memory", &len);
+   if (!prop || len < sizeof(unsigned int))
+   return 0;
+
+   entries = of_read_number(prop++, 1);
+
+   /* Now that we know the number of entries, revalidate the size
+* of the property read in to ensure we have everything
+*/
+   if (len < (entries * (n_mem_addr_cells + 4) + 1) *
+  sizeof(unsigned int))
+   return 0;
+
+   *dm = prop;
+   }
 
-   *dm = prop;
return entries;
 }
 
@@ -513,7 +537,7 @@
  * This is like of_node_to_nid_single() for memory represented in the
  * ibm,dynamic-reconfiguration-memory node.
  */
-static int of_drconf_to_nid_single(struct of_drconf_cell *drmem,
+static int of_drconf_to_nid_single(u32 drmem_flags, u32 drmem_aa_index,
   struct assoc_arrays *aa)
 {
int default_nid = 0;
@@ -521,16 +545,16 @@
int index;
 
if (min_common_depth > 0 && min_common_depth <= aa->array_sz &&
-   !(drmem->fl

[PATCH V4 4/8] pseries/hotplug init: Convert new DRC memory property for hotplug runtime

2016-08-18 Thread Michael Bringmann
hotplug_init: Simplify the code needed for runtime memory hotplug and
maintenance with a conversion routine that transforms the compressed
property "ibm,dynamic-memory-v2" to the form of "ibm,dynamic-memory"
within the "ibm,dynamic-reconfiguration-memory" node.  Thus only
a single set of routines should be required at runtime to parse, edit,
and manipulate the memory representation in the device tree.  Similarly,
any userspace applications that need this information will only need
to recognize the older format to be able to continue to operate.

[V4: Remove unneeded code braces.]
[V4: Simplify allocation of a couple of loop index variables.]

Signed-off-by: Michael Bringmann 
---
diff --git a/arch/powerpc/platforms/pseries/hotplug-memory.c 
b/arch/powerpc/platforms/pseries/hotplug-memory.c
index 2ce1385..0c46fbc 100644
--- a/arch/powerpc/platforms/pseries/hotplug-memory.c
+++ b/arch/powerpc/platforms/pseries/hotplug-memory.c
@@ -24,6 +24,8 @@
 #include 
 #include "pseries.h"
 
+#ifdef CONFIG_MEMORY_HOTPLUG
+
 static bool rtas_hp_event;
 
 unsigned long pseries_memory_block_size(void)
@@ -887,11 +889,102 @@ static int pseries_memory_notifier(struct notifier_block 
*nb,
 static struct notifier_block pseries_mem_nb = {
.notifier_call = pseries_memory_notifier,
 };
+#endif /* CONFIG_MEMORY_HOTPLUG */
+
+static int pseries_rewrite_dynamic_memory_v2(void)
+{
+   unsigned long memblock_size;
+   struct device_node *dn;
+   struct property *prop, *prop_v2;
+   __be32 *p;
+   struct of_drconf_cell *lmbs;
+   u32 num_lmb_desc_sets, num_lmbs;
+   int i, j, k;
+
+   dn = of_find_node_by_path("/ibm,dynamic-reconfiguration-memory");
+   if (!dn)
+   return -EINVAL;
+
+   prop_v2 = of_find_property(dn, "ibm,dynamic-memory-v2", NULL);
+   if (!prop_v2)
+   return -EINVAL;
+
+   memblock_size = pseries_memory_block_size();
+   if (!memblock_size)
+   return -EINVAL;
+
+   /* The first int of the property is the number of lmb sets
+* described by the property.
+*/
+   p = (__be32 *)prop_v2->value;
+   num_lmb_desc_sets = be32_to_cpu(*p++);
+
+   /* Count the number of LMBs for generating the alternate format
+*/
+   for (i = 0, num_lmbs = 0; i < num_lmb_desc_sets; i++) {
+   struct of_drconf_cell_v2 drmem;
+
+   read_drconf_cell_v2(&drmem, (const __be32 **)&p);
+   num_lmbs += drmem.num_seq_lmbs;
+   }
+
+   /* Create an empty copy of the new 'ibm,dynamic-memory' property
+*/
+   prop = kzalloc(sizeof(*prop), GFP_KERNEL);
+   if (!prop)
+   return -ENOMEM;
+   prop->name = kstrdup("ibm,dynamic-memory", GFP_KERNEL);
+   prop->length = dyn_mem_v2_len(num_lmbs);
+   prop->value = kzalloc(prop->length, GFP_KERNEL);
+
+   /* Copy/expand the ibm,dynamic-memory-v2 format to produce the
+* ibm,dynamic-memory format.
+*/
+   p = (__be32 *)prop->value;
+   *p = cpu_to_be32(num_lmbs);
+   p++;
+   lmbs = (struct of_drconf_cell *)p;
+
+   p = (__be32 *)prop_v2->value;
+   p++;
+
+   for (i = 0, k = 0; i < num_lmb_desc_sets; i++) {
+   struct of_drconf_cell_v2 drmem;
+
+   read_drconf_cell_v2(&drmem, (const __be32 **)&p);
+
+   for (j = 0; j < drmem.num_seq_lmbs; j++) {
+   lmbs[k+j].base_addr = be64_to_cpu(drmem.base_addr);
+   lmbs[k+j].drc_index = be32_to_cpu(drmem.drc_index);
+   lmbs[k+j].reserved  = 0;
+   lmbs[k+j].aa_index  = be32_to_cpu(drmem.aa_index);
+   lmbs[k+j].flags = be32_to_cpu(drmem.flags);
+
+   drmem.base_addr += memblock_size;
+   drmem.drc_index++;
+   }
+
+   k += drmem.num_seq_lmbs;
+   }
+
+   of_remove_property(dn, prop_v2);
+
+   of_add_property(dn, prop);
+
+   /* And disable feature flag since the property has gone away */
+   powerpc_firmware_features &= ~FW_FEATURE_DYN_MEM_V2;
+
+   return 0;
+}
 
 static int __init pseries_memory_hotplug_init(void)
 {
+   if (firmware_has_feature(FW_FEATURE_DYN_MEM_V2))
+   pseries_rewrite_dynamic_memory_v2();
+#ifdef CONFIG_MEMORY_HOTPLUG
if (firmware_has_feature(FW_FEATURE_LPAR))
of_reconfig_notifier_register(&pseries_mem_nb);
+#endif /* CONFIG_MEMORY_HOTPLUG */
 
return 0;
 }
diff --git a/arch/powerpc/platforms/pseries/Makefile 
b/arch/powerpc/platforms/pseries/Makefile
index fedc2ccf0..e74cf6c 100644
--- a/arch/powerpc/platforms/pseries/Makefile
+++ b/arch/powerpc/platforms/pseries/Makefile
@@ -5,14 +5,14 @@ obj-y := lpar.o hvCall.o nvram.o reconfig.o \
   of_helpers.o \
   setup.o iommu.o event_sources.o ras.o \
   firmware.o power.o dlpa

Re: [PATCH 2/6] cxlflash: Cache owning adapter within context

2016-08-18 Thread Manoj Kumar

Acked-by: Manoj N. Kumar 

On 8/9/2016 6:39 PM, Matthew R. Ochs wrote:

The context removal routine requires access to the owning adapter
structure to reset the context within the AFU as part of the tear
down sequence. In order to support kref adoption, the owning adapter
must be accessible from the release handler. As the kref framework
only provides the kref reference as the sole parameter, another means
is needed to derive the owning adapter.

As a remedy, the owning adapter reference is saved off within the
context during initialization.
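
As a rough sketch of how the cached reference can then be used from a
kref release handler (hypothetical handler name and kref field name; the
actual kref adoption happens in a later patch of this series):

	static void cxlflash_ctx_release(struct kref *kref)
	{
		/* recover the context from the embedded kref ... */
		struct ctx_info *ctxi = container_of(kref, struct ctx_info, kref);
		/* ... and derive the owning adapter from the cached reference */
		struct cxlflash_cfg *cfg = ctxi->cfg;

		/* reset the context within the AFU using cfg here */
	}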

Signed-off-by: Matthew R. Ochs 
---
 drivers/scsi/cxlflash/superpipe.c | 1 +
 drivers/scsi/cxlflash/superpipe.h | 1 +
 2 files changed, 2 insertions(+)

diff --git a/drivers/scsi/cxlflash/superpipe.c 
b/drivers/scsi/cxlflash/superpipe.c
index ab5c893..640c3a2 100644
--- a/drivers/scsi/cxlflash/superpipe.c
+++ b/drivers/scsi/cxlflash/superpipe.c
@@ -804,6 +804,7 @@ static void init_context(struct ctx_info *ctxi, struct 
cxlflash_cfg *cfg,
ctxi->lfd = adap_fd;
ctxi->pid = current->tgid; /* tgid = pid */
ctxi->ctx = ctx;
+   ctxi->cfg = cfg;
ctxi->file = file;
ctxi->initialized = true;
mutex_init(&ctxi->mutex);
diff --git a/drivers/scsi/cxlflash/superpipe.h 
b/drivers/scsi/cxlflash/superpipe.h
index 5f9a091..61404f2 100644
--- a/drivers/scsi/cxlflash/superpipe.h
+++ b/drivers/scsi/cxlflash/superpipe.h
@@ -107,6 +107,7 @@ struct ctx_info {
bool err_recovery_active;
struct mutex mutex; /* Context protection */
struct cxl_context *ctx;
+   struct cxlflash_cfg *cfg;
struct list_head luns;  /* LUNs attached to this context */
const struct vm_operations_struct *cxl_mmap_vmops;
struct file *file;





Re: [PowerPC] today's main line failed to build on PowerPC

2016-08-18 Thread Segher Boessenkool
On Thu, Aug 18, 2016 at 12:48:17PM +0530, Abdul Haleem wrote:
> >09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S: 
> >Assembler messages:
> >09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:353: 
> >Error: missing operand
> >09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:612: 
> >Error: missing operand
> >09:34:24 00:05:01 ERROR| [stderr] arch/powerpc/mm/hash_low_32.S:670: 
> >Error: missing operand

You're building 32-bit no-smp...

> CONFIG_SMP=y
> CONFIG_WORD_SIZE=64

... but your config says otherwise.  Some (re-)configuration mishap?


Segher


[PATCH] powerpc/8xx: use SPRN_EIE and SPRN_EID to enable/disable interrupts

2016-08-18 Thread Christophe Leroy
The 8xx has two special registers called EID (External Interrupt
Disable) and EIE (External Interrupt Enable) for clearing/setting
EE in MSR. It avoids the three-instruction sequences mfmsr/ori/mtmsr or
mfmsr/rlwinm/mtmsr.

We just have to write something into the register to change the MSR EE
bit. So we write r0 into the register, regardless of r0's value.
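
For illustration, with the definitions below, wrtspr(SPRN_EID) should
expand to a single instruction along these lines (a sketch; __stringify()
resolves the SPR number to "81"):

	/* one mtspr, no mfmsr/mtmsr round trip; the value in r0 is ignored */
	asm volatile("mtspr 81,0" : : : "memory");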

Signed-off-by: Christophe Leroy 
---
 arch/powerpc/include/asm/hw_irq.h  | 6 ++
 arch/powerpc/include/asm/reg.h | 2 ++
 arch/powerpc/include/asm/reg_8xx.h | 5 +
 3 files changed, 13 insertions(+)

diff --git a/arch/powerpc/include/asm/hw_irq.h 
b/arch/powerpc/include/asm/hw_irq.h
index c7d82ff..7ffb392 100644
--- a/arch/powerpc/include/asm/hw_irq.h
+++ b/arch/powerpc/include/asm/hw_irq.h
@@ -155,6 +155,8 @@ static inline unsigned long arch_local_irq_save(void)
unsigned long flags = arch_local_save_flags();
 #ifdef CONFIG_BOOKE
asm volatile("wrteei 0" : : : "memory");
+#elif defined(CONFIG_PPC_8xx)
+   wrtspr(SPRN_EID);
 #else
SET_MSR_EE(flags & ~MSR_EE);
 #endif
@@ -165,6 +167,8 @@ static inline void arch_local_irq_disable(void)
 {
 #ifdef CONFIG_BOOKE
asm volatile("wrteei 0" : : : "memory");
+#elif defined(CONFIG_PPC_8xx)
+   wrtspr(SPRN_EID);
 #else
arch_local_irq_save();
 #endif
@@ -174,6 +178,8 @@ static inline void arch_local_irq_enable(void)
 {
 #ifdef CONFIG_BOOKE
asm volatile("wrteei 1" : : : "memory");
+#elif defined(CONFIG_PPC_8xx)
+   wrtspr(SPRN_EIE);
 #else
unsigned long msr = mfmsr();
SET_MSR_EE(msr | MSR_EE);
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index f69f40f..4bbd9be 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -1246,6 +1246,8 @@ static inline void mtmsr_isync(unsigned long val)
 : "r" ((unsigned long)(v)) \
 : "memory")
 #endif
+#define wrtspr(rn) asm volatile("mtspr " __stringify(rn) ",0" : \
+: : "memory")
 
 extern void msr_check_and_set(unsigned long bits);
 extern bool strict_msr_control;
diff --git a/arch/powerpc/include/asm/reg_8xx.h 
b/arch/powerpc/include/asm/reg_8xx.h
index 6dae71f..d4bca3de 100644
--- a/arch/powerpc/include/asm/reg_8xx.h
+++ b/arch/powerpc/include/asm/reg_8xx.h
@@ -6,6 +6,11 @@
 
 #include 
 
+/* Special MSR manipulation registers */
+#define SPRN_EIE   80  /* External interrupt enable (EE=1, RI=1) */
+#define SPRN_EID   81  /* External interrupt disable (EE=0, RI=1) */
+#define SPRN_NRI   81  /* Non Recoverable interrupt (EE=0, RI=0) */
+
 /* Cache control on the MPC8xx is provided through some additional
  * special purpose registers.
  */
-- 
2.1.0



[PATCH] ibmvnic: Handle backing device failover and reinitialization

2016-08-18 Thread Thomas Falcon
An upcoming feature of the IBM VNIC protocol is the ability to configure
redundant backing devices for a VNIC client. In case of a failure
on the current backing device, the driver will receive a signal
from the hypervisor indicating that a failover will occur. The driver
will then wait for a message from the backing device before 
establishing a new connection.

Signed-off-by: Thomas Falcon 
---
 drivers/net/ethernet/ibm/ibmvnic.c | 34 --
 drivers/net/ethernet/ibm/ibmvnic.h |  2 ++
 2 files changed, 34 insertions(+), 2 deletions(-)

diff --git a/drivers/net/ethernet/ibm/ibmvnic.c 
b/drivers/net/ethernet/ibm/ibmvnic.c
index 88f3c85..b942108 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.c
+++ b/drivers/net/ethernet/ibm/ibmvnic.c
@@ -203,7 +203,8 @@ static void free_long_term_buff(struct ibmvnic_adapter 
*adapter,
struct device *dev = &adapter->vdev->dev;
 
dma_free_coherent(dev, ltb->size, ltb->buff, ltb->addr);
-   send_request_unmap(adapter, ltb->map_id);
+   if (!adapter->failover)
+   send_request_unmap(adapter, ltb->map_id);
 }
 
 static int alloc_rx_pool(struct ibmvnic_adapter *adapter,
@@ -522,7 +523,8 @@ static int ibmvnic_close(struct net_device *netdev)
for (i = 0; i < adapter->req_rx_queues; i++)
napi_disable(&adapter->napi[i]);
 
-   netif_tx_stop_all_queues(netdev);
+   if (!adapter->failover)
+   netif_tx_stop_all_queues(netdev);
 
if (adapter->bounce_buffer) {
if (!dma_mapping_error(dev, adapter->bounce_buffer_dma)) {
@@ -3280,6 +3282,10 @@ static void ibmvnic_handle_crq(union ibmvnic_crq *crq,
rc = ibmvnic_send_crq_init(adapter);
if (rc)
dev_err(dev, "Error sending init rc=%ld\n", rc);
+   } else if (gen_crq->cmd == IBMVNIC_DEVICE_FAILOVER) {
+   dev_info(dev, "Backing device failover detected\n");
+   netif_carrier_off(netdev);
+   adapter->failover = true;
} else {
/* The adapter lost the connection */
dev_err(dev, "Virtual Adapter failed (rc=%d)\n",
@@ -3615,8 +3621,18 @@ static void handle_crq_init_rsp(struct work_struct *work)
struct device *dev = &adapter->vdev->dev;
struct net_device *netdev = adapter->netdev;
unsigned long timeout = msecs_to_jiffies(30000);
+   bool restart = false;
int rc;
 
+   if (adapter->failover) {
+   release_sub_crqs(adapter);
+   if (netif_running(netdev)) {
+   netif_tx_disable(netdev);
+   ibmvnic_close(netdev);
+   restart = true;
+   }
+   }
+
send_version_xchg(adapter);
reinit_completion(&adapter->init_done);
if (!wait_for_completion_timeout(&adapter->init_done, timeout)) {
@@ -3645,6 +3661,17 @@ static void handle_crq_init_rsp(struct work_struct *work)
 
netdev->real_num_tx_queues = adapter->req_tx_queues;
 
+   if (adapter->failover) {
+   adapter->failover = false;
+   if (restart) {
+   rc = ibmvnic_open(netdev);
+   if (rc)
+   goto restart_failed;
+   }
+   netif_carrier_on(netdev);
+   return;
+   }
+
rc = register_netdev(netdev);
if (rc) {
dev_err(dev,
@@ -3655,6 +3682,8 @@ static void handle_crq_init_rsp(struct work_struct *work)
 
return;
 
+restart_failed:
+   dev_err(dev, "Failed to restart ibmvnic, rc=%d\n", rc);
 register_failed:
release_sub_crqs(adapter);
 task_failed:
@@ -3692,6 +3721,7 @@ static int ibmvnic_probe(struct vio_dev *dev, const 
struct vio_device_id *id)
dev_set_drvdata(&dev->dev, netdev);
adapter->vdev = dev;
adapter->netdev = netdev;
+   adapter->failover = false;
 
ether_addr_copy(adapter->mac_addr, mac_addr_p);
ether_addr_copy(netdev->dev_addr, adapter->mac_addr);
diff --git a/drivers/net/ethernet/ibm/ibmvnic.h 
b/drivers/net/ethernet/ibm/ibmvnic.h
index e82898f..bfc84c7 100644
--- a/drivers/net/ethernet/ibm/ibmvnic.h
+++ b/drivers/net/ethernet/ibm/ibmvnic.h
@@ -830,6 +830,7 @@ enum ibmvfc_crq_format {
IBMVNIC_CRQ_INIT = 0x01,
IBMVNIC_CRQ_INIT_COMPLETE= 0x02,
IBMVNIC_PARTITION_MIGRATED   = 0x06,
+   IBMVNIC_DEVICE_FAILOVER  = 0x08,
 };
 
 struct ibmvnic_crq_queue {
@@ -1047,4 +1048,5 @@ struct ibmvnic_adapter {
u8 map_id;
 
struct work_struct vnic_crq_init;
+   bool failover;
 };
-- 
1.8.3.1



Re: [PATCH] powerpc/8xx: use SPRN_EIE and SPRN_EID to enable/disable interrupts

2016-08-18 Thread Segher Boessenkool
On Thu, Aug 18, 2016 at 05:56:02PM +0200, Christophe Leroy wrote:
> The 8xx has two special registers called EID (External Interrupt
> Disable) and EIE (External Interrupt Enable) for clearing/setting
> EE in MSR. It avoids the three-instruction sequences mfmsr/ori/mtmsr or
> mfmsr/rlwinm/mtmsr.

All 8xx?  What other models?  (5xx for example).

> +/* Special MSR manipulation registers */
> +#define SPRN_EIE 80  /* External interrupt enable (EE=1, RI=1) */
> +#define SPRN_EID 81  /* External interrupt disable (EE=0, RI=1) */
> +#define SPRN_NRI 81  /* Non Recoverable interrupt (EE=0, RI=0) */

This is wrong (NRI is 82).  Don't write code you cannot test / don't submit
code you haven't tested?  :-)


Segher


Re: [PATCH] powerpc/8xx: use SPRN_EIE and SPRN_EID to enable/disable interrupts

2016-08-18 Thread Christophe Leroy



On 18/08/2016 18:34, Segher Boessenkool wrote:

On Thu, Aug 18, 2016 at 05:56:02PM +0200, Christophe Leroy wrote:

The 8xx has two special registers called EID (External Interrupt
Disable) and EIE (External Interrupt Enable) for clearing/setting
EE in MSR. It avoids the three-instruction sequences mfmsr/ori/mtmsr or
mfmsr/rlwinm/mtmsr.


All 8xx?  What other models?  (5xx for example).


At least 823, 860, 866 and 885 have it.

Looks like the 5xx have it too (at least the 565). Does Linux support
that one at all?


8272 and 8323 don't have it.




+/* Special MSR manipulation registers */
+#define SPRN_EIE   80  /* External interrupt enable (EE=1, RI=1) */
+#define SPRN_EID   81  /* External interrupt disable (EE=0, RI=1) */
+#define SPRN_NRI   81  /* Non Recoverable interrupt (EE=0, RI=0) */


This is wrong (NRI is 82).  Don't write code you cannot test / don't submit
code you haven't tested?  :-)


Oops. You're right, copy/paste failure.
Was tested on an 885. Unfortunately SPRN_NRI is not used (yet) :-(

Christophe


Re: [PATCH] cxl: use pcibios_free_controller_deferred() when removing vPHBs

2016-08-18 Thread Matthew R. Ochs
> On Aug 18, 2016, at 2:35 AM, Andrew Donnellan  
> wrote:
> 
> When cxl removes a vPHB, it's possible that the pci_controller may be freed
> before all references to the devices on the vPHB have been released. This
> in turn causes an invalid memory access when the devices are eventually
> released, as pcibios_release_device() attempts to call the phb's
> release_device hook.
> 
> In cxl_pci_vphb_remove(), remove the existing call to
> pcibios_free_controller(). Instead, use
> pcibios_free_controller_deferred() to free the pci_controller after all
> devices have been released. Export pci_set_host_bridge_release() so we can
> do this.
> 
> Cc: sta...@vger.kernel.org
> Signed-off-by: Andrew Donnellan 

Reviewed-by: Matthew R. Ochs 



Re: [PATCH] powerpc/8xx: use SPRN_EIE and SPRN_EID to enable/disable interrupts

2016-08-18 Thread Segher Boessenkool
On Thu, Aug 18, 2016 at 06:52:47PM +0200, Christophe Leroy wrote:
> On 18/08/2016 18:34, Segher Boessenkool wrote:
> >On Thu, Aug 18, 2016 at 05:56:02PM +0200, Christophe Leroy wrote:
> >>The 8xx has two special registers called EID (External Interrupt
> >>Disable) and EIE (External Interrupt Enable) for clearing/setting
> >>EE in MSR. It avoids the three-instruction sequences mfmsr/ori/mtmsr or
> >>mfmsr/rlwinm/mtmsr.
> >
> >All 8xx?  What other models?  (5xx for example).
> 
> At least 823, 860, 866 and 885 have it.

I haven't been able to find a manual for all 8xx.  But there is AN2055,
which suggests EIE etc. is for all 8xx indeed.

> Looks like the 5xx have it too (at least the 565). Does Linux support
> that one at all?

All 5xx have it, there is a manual for *that* ("RCPU") :-)

> >>+/* Special MSR manipulation registers */
> >>+#define SPRN_EIE   80  /* External interrupt enable (EE=1, RI=1) */
> >>+#define SPRN_EID   81  /* External interrupt disable (EE=0, RI=1) */
> >>+#define SPRN_NRI   81  /* Non Recoverable interrupt (EE=0, RI=0) */

Is it correct to set RI in all places you do now?

> >This is wrong (NRI is 82).  Don't write code you cannot test / don't submit
> >code you haven't tested?  :-)
> 
> Oops. You're right, copy/paste failure.
> Was tested on an 885. Unfortunately SPRN_NRI is not used (yet) :-(

Well, that was my point!


Segher


Re: [PATCH 3/6] cxlflash: Add kref to context

2016-08-18 Thread Manoj Kumar

Acked-by: Manoj N. Kumar 

On 8/9/2016 6:39 PM, Matthew R. Ochs wrote:

Currently, context user references are tracked via the list of LUNs
that have attached to the context. While convenient, this is not
intuitive without a deep study of the code and is inconsistent with
the existing reference tracking patterns within the kernel. This design
choice can lead to future bug injection.

To improve code comprehension and better protect against future bugs, add
explicit reference counting to contexts and migrate the context removal
code to the kref release handler.

Inspired-by: Al Viro 
Signed-off-by: Matthew R. Ochs 




Re: [PATCH v2 3/6] kexec_file: Allow skipping checksum calculation for some segments.

2016-08-18 Thread Thiago Jung Bauermann
Hello Dave,

Thanks for your review!

[ Trimming down Cc: list a little to try to clear the "too many recipients"   
  mailing list restriction. ]

On Thursday, 18 August 2016, 17:03:30, Dave Young wrote:
> On 08/13/16 at 12:18am, Thiago Jung Bauermann wrote:
> > Adds checksum argument to kexec_add_buffer specifying whether the given
> > segment should be part of the checksum calculation.
> 
> Since it is used with add buffer, could it be added to kbuf as a new
> field?

I was on the fence about adding it as a new argument to kexec_add_buffer or 
as a new field to struct kexec_buf. Both alternatives make sense to me. I 
implemented your suggestion in the patch below, what do you think?

> Like kbuf.no_checksum, default value is 0 that means checksum is needed
> if it is 1 then no need a checksum.

It's an interesting idea and I implemented it that way, though in practice 
all current users of struct kexec_buf put it on the stack so the field needs 
to be initialized explicitly.
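
For what it's worth, C designated initializers zero every field that is
not mentioned, so stack-allocated users get skip_checksum == false even
without naming it; the explicit '.skip_checksum = false' in the patch
below mainly serves as documentation. A sketch:

	struct kexec_buf kbuf = { .image = image, .buf_max = ppc64_rma_size };
	/* kbuf.skip_checksum is guaranteed to be false (zero) here */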

-- 
[]'s
Thiago Jung Bauermann
IBM Linux Technology Center


Subject: [PATCH v2 3/6] kexec_file: Allow skipping checksum calculation for
 some segments.

Add skip_checksum member to struct kexec_buf to specify whether the
corresponding segment should be part of the checksum calculation.

The next patch will add a way to update segments after a kimage is loaded.
Segments that will be updated in this way should not be checksummed,
otherwise they will cause the purgatory checksum verification to fail
when the machine is rebooted.

As a bonus, we don't need to special-case the purgatory segment anymore
to avoid checksumming it.

Adjust places using struct kexec_buf to set skip_checksum.

Signed-off-by: Thiago Jung Bauermann 
---
 arch/powerpc/kernel/kexec_elf_64.c |  5 +++--
 arch/x86/kernel/crash.c|  3 ++-
 arch/x86/kernel/kexec-bzimage64.c  |  2 +-
 include/linux/kexec.h  | 23 ++-
 kernel/kexec_file.c| 15 +++
 5 files changed, 27 insertions(+), 21 deletions(-)

diff --git a/arch/powerpc/kernel/kexec_elf_64.c 
b/arch/powerpc/kernel/kexec_elf_64.c
index 22afc7b5ee73..d009f5363968 100644
--- a/arch/powerpc/kernel/kexec_elf_64.c
+++ b/arch/powerpc/kernel/kexec_elf_64.c
@@ -107,7 +107,7 @@ static int elf_exec_load(struct kimage *image, struct 
elfhdr *ehdr,
int ret;
size_t i;
struct kexec_buf kbuf = { .image = image, .buf_max = ppc64_rma_size,
- .top_down = false };
+ .top_down = false, .skip_checksum = false };
 
/* Read in the PT_LOAD segments. */
for (i = 0; i < ehdr->e_phnum; i++) {
@@ -162,7 +162,8 @@ void *elf64_load(struct kimage *image, char *kernel_buf,
struct elf_info elf_info;
struct fdt_reserve_entry *rsvmap;
struct kexec_buf kbuf = { .image = image, .buf_min = 0,
- .buf_max = ppc64_rma_size };
+ .buf_max = ppc64_rma_size,
+ .skip_checksum = false };
 
ret = build_elf_exec_info(kernel_buf, kernel_len, &ehdr, &elf_info);
if (ret)
diff --git a/arch/x86/kernel/crash.c b/arch/x86/kernel/crash.c
index 38a1cdf6aa05..7b8f62c86651 100644
--- a/arch/x86/kernel/crash.c
+++ b/arch/x86/kernel/crash.c
@@ -617,7 +617,8 @@ int crash_load_segments(struct kimage *image)
 {
int ret;
struct kexec_buf kbuf = { .image = image, .buf_min = 0,
- .buf_max = ULONG_MAX, .top_down = false };
+ .buf_max = ULONG_MAX, .top_down = false,
+ .skip_checksum = false };
 
/*
 * Determine and load a segment for backup area. First 640K RAM
diff --git a/arch/x86/kernel/kexec-bzimage64.c 
b/arch/x86/kernel/kexec-bzimage64.c
index 4b3a75329fb6..449f433cd225 100644
--- a/arch/x86/kernel/kexec-bzimage64.c
+++ b/arch/x86/kernel/kexec-bzimage64.c
@@ -341,7 +341,7 @@ static void *bzImage64_load(struct kimage *image, char 
*kernel,
unsigned int setup_hdr_offset = offsetof(struct boot_params, hdr);
unsigned int efi_map_offset, efi_map_sz, efi_setup_data_offset;
struct kexec_buf kbuf = { .image = image, .buf_max = ULONG_MAX,
- .top_down = true };
+ .top_down = true, .skip_checksum = false };
 
header = (struct setup_header *)(kernel + setup_hdr_offset);
setup_sects = header->setup_sects;
diff --git a/include/linux/kexec.h b/include/linux/kexec.h
index 4559a1a01b0a..e5b3d99cbe50 100644
--- a/include/linux/kexec.h
+++ b/include/linux/kexec.h
@@ -100,6 +100,9 @@ struct kexec_segment {
size_t bufsz;
unsigned long mem;
size_t memsz;
+
+   /* Whether this segment is ignored in the checksum calculation. */
+   bool skip_checksum;
 };
 
 #ifdef CONFIG_COMPAT
@@ -151,15 +154,16 @@ struct kexec_file_ops {
 
 /**
  * s

Re: [PATCH v2] powerpc: move hmi.c to arch/powerpc/kvm/

2016-08-18 Thread Benjamin Herrenschmidt
On Thu, 2016-08-18 at 10:53 +0200, Paolo Bonzini wrote:
> 
> On 11/08/2016 15:07, Paolo Bonzini wrote:
> > 
> > hmi.c functions are unused unless sibling_subcore_state is nonzero,
> > and
> > that in turn happens only if KVM is in use.  So move the code to
> > arch/powerpc/kvm/, putting it under CONFIG_KVM_BOOK3S_HV_POSSIBLE
> > rather than CONFIG_PPC_BOOK3S_64.  The sibling_subcore_state is
> > also
> > included in struct paca_struct only if KVM is supported by the
> > kernel.
> 

Mahesh, can you review this?

> > Cc: Daniel Axtens 
> > Cc: Michael Ellerman 
> > Cc: Mahesh Salgaonkar 
> > Cc: Paul Mackerras 
> > Cc: linuxppc-dev@lists.ozlabs.org
> > Cc: kvm-...@vger.kernel.org
> > Cc: k...@vger.kernel.org
> > Signed-off-by: Paolo Bonzini 
> > ---
> > v1->v2: use CONFIG_KVM_BOOK3S_HV_POSSIBLE, not
> > CONFIG_KVM_BOOK3S_64_HANDLER.  The former implies
> > the latter, but the reverse is not true.
> > 
> >  arch/powerpc/include/asm/hmi.h |  2 +-
> >  arch/powerpc/include/asm/paca.h| 12 +++---
> > --
> >  arch/powerpc/kernel/Makefile   |  2 +-
> >  arch/powerpc/kvm/Makefile  |  1 +
> >  arch/powerpc/{kernel/hmi.c => kvm/book3s_hv_hmi.c} |  0
> >  5 files changed, 10 insertions(+), 7 deletions(-)
> >  rename arch/powerpc/{kernel/hmi.c => kvm/book3s_hv_hmi.c} (100%)
> > 
> > diff --git a/arch/powerpc/include/asm/hmi.h
> > b/arch/powerpc/include/asm/hmi.h
> > index 88b4901ac4ee..85b7a1a21e22 100644
> > --- a/arch/powerpc/include/asm/hmi.h
> > +++ b/arch/powerpc/include/asm/hmi.h
> > @@ -21,7 +21,7 @@
> >  #ifndef __ASM_PPC64_HMI_H__
> >  #define __ASM_PPC64_HMI_H__
> >  
> > -#ifdef CONFIG_PPC_BOOK3S_64
> > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> >  
> >  #defineCORE_TB_RESYNC_REQ_BIT  63
> >  #define MAX_SUBCORE_PER_CORE   4
> > diff --git a/arch/powerpc/include/asm/paca.h
> > b/arch/powerpc/include/asm/paca.h
> > index 148303e7771f..6a6792bb39fb 100644
> > --- a/arch/powerpc/include/asm/paca.h
> > +++ b/arch/powerpc/include/asm/paca.h
> > @@ -183,11 +183,6 @@ struct paca_struct {
> >      */
> >     u16 in_mce;
> >     u8 hmi_event_available;  /* HMI event is
> > available */
> > -   /*
> > -    * Bitmap for sibling subcore status. See
> > kvm/book3s_hv_ras.c for
> > -    * more details
> > -    */
> > -   struct sibling_subcore_state *sibling_subcore_state;
> >  #endif
> >  
> >     /* Stuff for accurate time accounting */
> > @@ -202,6 +197,13 @@ struct paca_struct {
> >     struct kvmppc_book3s_shadow_vcpu shadow_vcpu;
> >  #endif
> >     struct kvmppc_host_state kvm_hstate;
> > +#ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> > +   /*
> > +    * Bitmap for sibling subcore status. See
> > kvm/book3s_hv_ras.c for
> > +    * more details
> > +    */
> > +   struct sibling_subcore_state *sibling_subcore_state;
> > +#endif
> >  #endif
> >  };
> >  
> > diff --git a/arch/powerpc/kernel/Makefile
> > b/arch/powerpc/kernel/Makefile
> > index b2027a5cf508..fe4c075bcf50 100644
> > --- a/arch/powerpc/kernel/Makefile
> > +++ b/arch/powerpc/kernel/Makefile
> > @@ -41,7 +41,7 @@ obj-$(CONFIG_VDSO32)  += vdso32/
> >  obj-$(CONFIG_HAVE_HW_BREAKPOINT)   += hw_breakpoint.o
> >  obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_ppc970.o
> > cpu_setup_pa6t.o
> >  obj-$(CONFIG_PPC_BOOK3S_64)+= cpu_setup_power.o
> > -obj-$(CONFIG_PPC_BOOK3S_64)+= mce.o mce_power.o hmi.o
> > +obj-$(CONFIG_PPC_BOOK3S_64)+= mce.o mce_power.o
> >  obj-$(CONFIG_PPC_BOOK3E_64)+= exceptions-64e.o
> > idle_book3e.o
> >  obj-$(CONFIG_PPC64)+= vdso64/
> >  obj-$(CONFIG_ALTIVEC)  += vecemu.o
> > diff --git a/arch/powerpc/kvm/Makefile b/arch/powerpc/kvm/Makefile
> > index 1f9e5529e692..855d4b95d752 100644
> > --- a/arch/powerpc/kvm/Makefile
> > +++ b/arch/powerpc/kvm/Makefile
> > @@ -78,6 +78,7 @@ kvm-book3s_64-builtin-xics-objs-
> > $(CONFIG_KVM_XICS) := \
> >  
> >  ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
> >  kvm-book3s_64-builtin-objs-$(CONFIG_KVM_BOOK3S_64_HANDLER) += \
> > +   book3s_hv_hmi.o \
> >     book3s_hv_rmhandlers.o \
> >     book3s_hv_rm_mmu.o \
> >     book3s_hv_ras.o \
> > diff --git a/arch/powerpc/kernel/hmi.c
> > b/arch/powerpc/kvm/book3s_hv_hmi.c
> > similarity index 100%
> > rename from arch/powerpc/kernel/hmi.c
> > rename to arch/powerpc/kvm/book3s_hv_hmi.c
> > 
> 
> Ping?
> 
> Paolo



[RFC/PATCH 1/2] cpuidle: Allow idle-states to be disabled at start

2016-08-18 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Currently all the idle states registered by a cpu-idle driver are
enabled by default. This patch adds a mechanism which allows the
driver to hint if an idle-state should start in a disabled state. The
cpu-idle core will use this hint to appropriately initialize the
usage->disable knob of the CPU device idle state.

The state can be enabled at run time by echo'ing a zero to the sysfs
"disable" control file.

Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle.c | 7 +++
 include/linux/cpuidle.h   | 7 ++-
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git a/drivers/cpuidle/cpuidle.c b/drivers/cpuidle/cpuidle.c
index c73207a..b4debc7 100644
--- a/drivers/cpuidle/cpuidle.c
+++ b/drivers/cpuidle/cpuidle.c
@@ -439,7 +439,14 @@ static void __cpuidle_unregister_device(struct 
cpuidle_device *dev)
 
 static void __cpuidle_device_init(struct cpuidle_device *dev)
 {
+   struct cpuidle_driver *drv = cpuidle_get_cpu_driver(dev);
+   int i;
+
memset(dev->states_usage, 0, sizeof(dev->states_usage));
+   for (i = 0; i < drv->state_count; i++) {
+   if (drv->states[i].disable_use_at_start)
+   dev->states_usage[i].disable = 1;
+   }
dev->last_residency = 0;
 }
 
diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h
index bb31373..f3fe855 100644
--- a/include/linux/cpuidle.h
+++ b/include/linux/cpuidle.h
@@ -44,7 +44,12 @@ struct cpuidle_state {
int power_usage; /* in mW */
unsigned inttarget_residency; /* in US */
booldisabled; /* disabled on all CPUs */
-
+   /*
+* disable_use_at_start: If true, then this idle state will be
+* disabled by default. It can be enabled at runtime using the
+* per-cpu cpuidle sysfs control file named "disable".
+*/
+   booldisable_use_at_start;
int (*enter)(struct cpuidle_device *dev,
struct cpuidle_driver *drv,
int index);
-- 
1.9.4



[RFC/PATCH 0/2] powernv:cpuidle: Enable winkle idle state

2016-08-18 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

Hi,

The patches in this series enable support for the Winkle idle state in
CPU-Idle.

The first patch is a platform-independent CPU-Idle patch that allows
CPU-Idle states to be disabled at start (currently they are all
enabled by default).

The second patch adds the winkle enablement for powernv-cpuidle. By
default, the winkle idle-state is disabled. It can be enabled by
writing zero to the per-cpu cpuidle sysfs control file named
"disable".

This series has been lightly tested on a 2-socket POWER8 system and
the machine was pretty stable while running kernbench and ebizzy. I
didn't see any regressions with those.

I haven't yet evaluated the impact that these patches might have
on latency sensitive workloads. I hope to do that in a day or two.

On the power-savings front, I could observe 6-8% additional
power-savings when winkle state was enabled on an idle system with
SMT=on. With SMT=off, additional idle power-savings observed with
winkle enabled were greater than 15%.  The numbers indicate that it
might be worth the while to pursue this!


Gautham R. Shenoy (2):
  cpuidle: Allow idle-states to be disabled at start
  powernv:cpuidle: Enable winkle idle state in CPU-Idle.

 drivers/cpuidle/cpuidle-powernv.c | 44 ---
 drivers/cpuidle/cpuidle.c |  7 +++
 include/linux/cpuidle.h   |  7 ++-
 3 files changed, 49 insertions(+), 9 deletions(-)

-- 
1.9.4



[RFC/PATCH 2/2] powernv:cpuidle: Enable winkle idle state in CPU-Idle.

2016-08-18 Thread Gautham R. Shenoy
From: "Gautham R. Shenoy" 

cpu-idle on powernv currently has support for only snooze, nap and
fastsleep states. Winkle idle state was excluded due to its large
exit-latency.

This patch adds winkle as a cpu-idle state for experimental
purposes. This state is disabled at start by default. However, should an
adventurous user want to enable it on a particular CPU(s), they can do
so by echo'ing a zero into the per-cpu sysfs cpuidle control file named
"disable" corresponding to this state.

Signed-off-by: Gautham R. Shenoy 
---
 drivers/cpuidle/cpuidle-powernv.c | 44 ---
 1 file changed, 36 insertions(+), 8 deletions(-)

diff --git a/drivers/cpuidle/cpuidle-powernv.c 
b/drivers/cpuidle/cpuidle-powernv.c
index f7ca891..0437d8a 100644
--- a/drivers/cpuidle/cpuidle-powernv.c
+++ b/drivers/cpuidle/cpuidle-powernv.c
@@ -20,7 +20,6 @@
 #include 
 #include 
 
-#define POWERNV_THRESHOLD_LATENCY_NS 200000
 
 struct cpuidle_driver powernv_idle_driver = {
.name = "powernv_idle",
@@ -95,6 +94,30 @@ static int fastsleep_loop(struct cpuidle_device *dev,
 
return index;
 }
+
+static int winkle_loop(struct cpuidle_device *dev,
+   struct cpuidle_driver *drv,
+   int index)
+{
+   unsigned long old_lpcr = mfspr(SPRN_LPCR);
+   unsigned long new_lpcr;
+
+   if (unlikely(system_state < SYSTEM_RUNNING))
+   return index;
+
+   new_lpcr = old_lpcr;
+   /* Do not exit powersave upon decrementer as we've set up the timer
+* offload.
+*/
+   new_lpcr &= ~LPCR_PECE1;
+
+   mtspr(SPRN_LPCR, new_lpcr);
+   power7_winkle();
+
+   mtspr(SPRN_LPCR, old_lpcr);
+
+   return index;
+}
 #endif
 
 static int stop_loop(struct cpuidle_device *dev,
@@ -246,13 +269,6 @@ static int powernv_add_idle_states(void)
"ibm,cpu-idle-state-residency-ns", residency_ns, 
dt_idle_states);
 
for (i = 0; i < dt_idle_states; i++) {
-   /*
-* If an idle state has exit latency beyond
-* POWERNV_THRESHOLD_LATENCY_NS then don't use it
-* in cpu-idle.
-*/
-   if (latency_ns[i] > POWERNV_THRESHOLD_LATENCY_NS)
-   continue;
 
/*
 * Cpuidle accepts exit_latency and target_residency in us.
@@ -301,6 +317,18 @@ static int powernv_add_idle_states(void)
powernv_states[nr_idle_states].enter = stop_loop;
stop_psscr_table[nr_idle_states] = psscr_val[i];
}
+
+   if (flags[i] & OPAL_PM_WINKLE_ENABLED) {
+   int state_idx = nr_idle_states;
+
+   strcpy(powernv_states[state_idx].name, "Winkle");
+   strcpy(powernv_states[state_idx].desc, "Winkle");
+   powernv_states[state_idx].flags =
+   CPUIDLE_FLAG_TIMER_STOP;
+   powernv_states[state_idx].target_residency = 50;
+   powernv_states[state_idx].enter = winkle_loop;
+   powernv_states[state_idx].disable_use_at_start = true;
+   }
 #endif
powernv_states[nr_idle_states].exit_latency =
((unsigned int)latency_ns[i]) / 1000;
-- 
1.9.4



Re: [PATCH 0/6] cxlflash: Improvements and cleanup

2016-08-18 Thread Martin K. Petersen
> "Matthew" == Matthew R Ochs  writes:

Matthew> This patch set contains various code improvements and cleanups
Matthew> that were inspired by Al Viro upon reviewing the cxlflash
Matthew> driver. The core improvement is that the driver will no longer
Matthew> cache the adapter file descriptor associated with a
Matthew> context. This results in a user API change that is documented
Matthew> alongside the modifications.

Applied patches 1-3 to 4.9/scsi-queue. The remainder await reviews.

Matthew> The series is based upon 4.8-rc1, intended for 4.9, and is
Matthew> bisectable.

Thanks for making that explicit. Makes my life easier!

-- 
Martin K. Petersen  Oracle Linux Engineering


Re: [PATCH] cxl: use pcibios_free_controller_deferred() when removing vPHBs

2016-08-18 Thread Ian Munsie
Acked-by: Ian Munsie 



Re: linux-next: build warnings after merge of the kbuild tree

2016-08-18 Thread Stephen Rothwell
Hi Nick,

On Thu, 18 Aug 2016 11:09:48 +1000 Nicholas Piggin  wrote:
>
> On Wed, 17 Aug 2016 14:59:59 +0200
> Michal Marek  wrote:
> 
> > On 2016-08-17 03:44, Stephen Rothwell wrote:  
> > > 
> > > After merging the kbuild tree, today's linux-next build (powerpc
> > > ppc64_defconfig) produced these warnings:
> > > 
> > > WARNING: 25 bad relocations
c000000000cf2570 R_PPC64_ADDR64  __crc___arch_hweight16
> > [...]  
> > > Introduced by commit
> > > 
> > >   9445aa1a3062 ("ppc: move exports to definitions")
> > > 
> > > I have reverted that commit for today.
> > > 
> > > [cc-ing the ppc guys for clues - also involved is commit
> > > 
> > >   22823ab419d8 ("EXPORT_SYMBOL() for asm")
> > > ]
> > 
> > FWIW, I see these warnings as well. Any help from ppc developers is
> > appreciated - should the R_PPC64_ADDR64 be whitelisted for exported asm
> > symbols (their CRCs actually)?  
> 
> The dangling relocation is a side effect of the linker being unable to
> resolve the reference to the undefined weak symbols. So the real question
> is, why has genksyms not overridden these symbols with their CRC values?
> 
> This may not even be powerpc specific, but  I'll poke at it a bit more
> when I get a chance.

Not sure if this is relevant, but with the commit reverted, the
__crc___... symbols are absolute.

00000000f55b3b3d A __crc___arch_hweight16

-- 
Cheers,
Stephen Rothwell


Re: [PATCH 4/6] cxlflash: Transition to application close model

2016-08-18 Thread Manoj Kumar


Acked-by: Manoj N. Kumar 


On 8/9/2016 6:39 PM, Matthew R. Ochs wrote:

Caching the adapter file descriptor and performing a close on behalf
of an application is a poor design. This is due to the fact that once
a file descriptor is installed, it is free to be altered without the
knowledge of the cxlflash driver. This can lead to inconsistencies
between the application and kernel. Furthermore, the nature of the
former design is more exploitable and thus should be abandoned.

To support applications performing a close on the adapter file that
is associated with a context, a new flag is introduced to the user
API to indicate to applications that they are responsible for the
close following the cleanup (detach) of a context. The documentation
is also updated to reflect this change in behavior.

Inspired-by: Al Viro 
Signed-off-by: Matthew R. Ochs 




Re: [PATCH v3 19/21] powerpc: tm: Always use fp_state and vr_state to store live registers

2016-08-18 Thread Simon Guo
On Wed, Aug 17, 2016 at 01:43:21PM +1000, Cyril Bur wrote:
> There is currently an inconsistency as to how the entire CPU register
> state is saved and restored when a thread uses transactional memory
> (TM).
> 
> Using transactional memory results in the CPU having duplicated
> (almost all) of its register state. This duplication results in a set
> of registers which can be considered 'live', those being currently
> modified by the instructions being executed and another set that is
> frozen at a point in time.
> 
> On context switch, both sets of state have to be saved and (later)
> restored. These two states are often called a variety of different
> things. A common term for the state which only exists in hardware after
> the CPU has entered a transaction (performed a TBEGIN instruction) is the
> 'transactional' or 'speculative' state.
> 
> Between a TBEGIN and a TEND or TABORT (or an event that causes the
> hardware to abort), regardless of the use of TSUSPEND the
> transactional state can be referred to as the live state.
> 
> The second state is often referred to as the 'checkpointed' state
> and is a duplication of the live state when the TBEGIN instruction is
> executed. This state is kept in the hardware and will be rolled back
> to on transaction failure.
> 
> Currently all the registers stored in pt_regs are ALWAYS the live
> registers, that is, when a thread has transactional registers their
> values are stored in pt_regs and the checkpointed state is in
> ckpt_regs. A strange opposite is true for fp_state. When a thread is
> non-transactional, fp_state holds the live registers. When a thread
> has initiated a transaction, fp_state holds the checkpointed state and
> transact_fp becomes the structure which holds the live state (at this
> point it is a transactional state). The same is true for vr_state.
> 
> This method creates confusion as to where the live state is; in some
> circumstances it requires extra work to determine where to put the
> live state and prevents the use of common functions designed (probably
> before TM) to save the live state.
> 
> With this patch pt_regs, fp_state and vr_state all represent the
> same thing and the other structures [pending rename] are for
> checkpointed state.
> 
> Signed-off-by: Cyril Bur 
Acked-by: Simon Guo 

Thanks,
- Simon


Re: linux-next: build warnings after merge of the kbuild tree

2016-08-18 Thread Stephen Rothwell
Hi Nick,

On Fri, 19 Aug 2016 13:38:54 +1000 Stephen Rothwell  
wrote:
>
> On Thu, 18 Aug 2016 11:09:48 +1000 Nicholas Piggin  wrote:
> >
> > On Wed, 17 Aug 2016 14:59:59 +0200
> > Michal Marek  wrote:
> >   
> > > On 2016-08-17 03:44, Stephen Rothwell wrote:
> > > > 
> > > > After merging the kbuild tree, today's linux-next build (powerpc
> > > > ppc64_defconfig) produced these warnings:
> > > > 
> > > > WARNING: 25 bad relocations
> > > > c000000000cf2570 R_PPC64_ADDR64  __crc___arch_hweight16
> > > [...]
> > > > Introduced by commit
> > > > 
> > > >   9445aa1a3062 ("ppc: move exports to definitions")
> > > > 
> > > > I have reverted that commit for today.
> > > > 
> > > > [cc-ing the ppc guys for clues - also involved is commit
> > > > 
> > > >   22823ab419d8 ("EXPORT_SYMBOL() for asm")
> > > > ]  
> > > 
> > > FWIW, I see these warnings as well. Any help from ppc developers is
> > > appreciated - should the R_PPC64_ADDR64 be whitelisted for exported asm
> > > symbols (their CRCs actually)?
> > 
> > The dangling relocation is a side effect of the linker being unable to
> > resolve the reference to the undefined weak symbols. So the real question
> > is, why has genksyms not overridden these symbols with their CRC values?
> > 
> > This may not even be powerpc specific, but  I'll poke at it a bit more
> > when I get a chance.  
> 
> Not sure if this is relevant, but with the commit reverted, the
> __crc___... symbols are absolute.
> 
> 00000000f55b3b3d A __crc___arch_hweight16

Ignore that :-)

I just had a look at an x86_64 allmodconfig result and it looks like the
weak symbols are not resolved there either ...

I may be missing something, but genksyms generates the crc's off the
preprocessed C source code and we don't have any for the asm files ...
-- 
Cheers,
Stephen Rothwell


[PATCH v5 1/7] perf: Define macro for normalized arch names

2016-08-18 Thread Ravi Bangoria
Define macro for each normalized arch name and use them instead
of using arch name as string.

Signed-off-by: Ravi Bangoria 
---
Changes in v5:
  - No changes.

 tools/perf/arch/common.c   | 36 ++--
 tools/perf/arch/common.h   | 11 +++
 tools/perf/util/unwind-libunwind.c |  4 ++--
 3 files changed, 31 insertions(+), 20 deletions(-)

diff --git a/tools/perf/arch/common.c b/tools/perf/arch/common.c
index 886dd2a..f763666 100644
--- a/tools/perf/arch/common.c
+++ b/tools/perf/arch/common.c
@@ -123,25 +123,25 @@ static int lookup_triplets(const char *const *triplets, 
const char *name)
 const char *normalize_arch(char *arch)
 {
if (!strcmp(arch, "x86_64"))
-   return "x86";
+   return NORM_X86;
if (arch[0] == 'i' && arch[2] == '8' && arch[3] == '6')
-   return "x86";
+   return NORM_X86;
if (!strcmp(arch, "sun4u") || !strncmp(arch, "sparc", 5))
-   return "sparc";
+   return NORM_SPARC;
if (!strcmp(arch, "aarch64") || !strcmp(arch, "arm64"))
-   return "arm64";
+   return NORM_ARM64;
if (!strncmp(arch, "arm", 3) || !strcmp(arch, "sa110"))
-   return "arm";
+   return NORM_ARM;
if (!strncmp(arch, "s390", 4))
-   return "s390";
+   return NORM_S390;
if (!strncmp(arch, "parisc", 6))
-   return "parisc";
+   return NORM_PARISC;
if (!strncmp(arch, "powerpc", 7) || !strncmp(arch, "ppc", 3))
-   return "powerpc";
+   return NORM_POWERPC;
if (!strncmp(arch, "mips", 4))
-   return "mips";
+   return NORM_MIPS;
if (!strncmp(arch, "sh", 2) && isdigit(arch[2]))
-   return "sh";
+   return NORM_SH;
 
return arch;
 }
@@ -181,21 +181,21 @@ static int perf_env__lookup_binutils_path(struct perf_env 
*env,
zfree(&buf);
}
 
-   if (!strcmp(arch, "arm"))
+   if (!strcmp(arch, NORM_ARM))
path_list = arm_triplets;
-   else if (!strcmp(arch, "arm64"))
+   else if (!strcmp(arch, NORM_ARM64))
path_list = arm64_triplets;
-   else if (!strcmp(arch, "powerpc"))
+   else if (!strcmp(arch, NORM_POWERPC))
path_list = powerpc_triplets;
-   else if (!strcmp(arch, "sh"))
+   else if (!strcmp(arch, NORM_SH))
path_list = sh_triplets;
-   else if (!strcmp(arch, "s390"))
+   else if (!strcmp(arch, NORM_S390))
path_list = s390_triplets;
-   else if (!strcmp(arch, "sparc"))
+   else if (!strcmp(arch, NORM_SPARC))
path_list = sparc_triplets;
-   else if (!strcmp(arch, "x86"))
+   else if (!strcmp(arch, NORM_X86))
path_list = x86_triplets;
-   else if (!strcmp(arch, "mips"))
+   else if (!strcmp(arch, NORM_MIPS))
path_list = mips_triplets;
else {
ui__error("binutils for %s not supported.\n", arch);
diff --git a/tools/perf/arch/common.h b/tools/perf/arch/common.h
index 6b01c73..14ca8ca 100644
--- a/tools/perf/arch/common.h
+++ b/tools/perf/arch/common.h
@@ -5,6 +5,17 @@
 
 extern const char *objdump_path;
 
+/* Macro for normalized arch names */
+#define NORM_X86   "x86"
+#define NORM_SPARC "sparc"
+#define NORM_ARM64 "arm64"
+#define NORM_ARM   "arm"
+#define NORM_S390  "s390"
+#define NORM_PARISC    "parisc"
+#define NORM_POWERPC   "powerpc"
+#define NORM_MIPS  "mips"
+#define NORM_SH        "sh"
+
 int perf_env__lookup_objdump(struct perf_env *env);
 const char *normalize_arch(char *arch);
 
diff --git a/tools/perf/util/unwind-libunwind.c 
b/tools/perf/util/unwind-libunwind.c
index 6d542a4..6199102 100644
--- a/tools/perf/util/unwind-libunwind.c
+++ b/tools/perf/util/unwind-libunwind.c
@@ -40,10 +40,10 @@ int unwind__prepare_access(struct thread *thread, struct 
map *map,
 
arch = normalize_arch(thread->mg->machine->env->arch);
 
-   if (!strcmp(arch, "x86")) {
+   if (!strcmp(arch, NORM_X86)) {
if (dso_type != DSO__TYPE_64BIT)
ops = x86_32_unwind_libunwind_ops;
-   } else if (!strcmp(arch, "arm64") || !strcmp(arch, "arm")) {
+   } else if (!strcmp(arch, NORM_ARM64) || !strcmp(arch, NORM_ARM)) {
if (dso_type == DSO__TYPE_64BIT)
ops = arm64_unwind_libunwind_ops;
}
-- 
2.5.5



[PATCH v5 0/7] perf: Cross arch annotate + few miscellaneous fixes

2016-08-18 Thread Ravi Bangoria
Currently perf annotate supports code navigation (branches and calls)
only when run on the same architecture where perf.data was recorded.
But, for example, record on powerpc server and annotate on client's
x86 desktop is not supported.

This patchset enables cross arch annotate. Currently I've used x86
and arm instructions which are already available and added support
for powerpc.

Additionally this patch series also contains few other related fixes.

Patches are prepared on top of acme/perf/core and tested it with x86
and powerpc only.

Note for arm:
A few instructions were defined under #if __arm__, which I've used as a
table for arm. But I'm not sure whether the instructions defined outside
of that block also contain arm instructions. Apart from that,
'call__parse()' and 'move__parse()' contain an #ifdef __arm__ directive.
I've changed it to if (!strcmp(norm_arch, arm)). I don't have an arm
machine to test these changes.

Example:

  Record on powerpc:
  $ ./perf record -a

  Report -> Annotate on x86:
  $ ./perf report -i perf.data.powerpc --vmlinux vmlinux.powerpc

Changes in v5:
  - Replaced symbol__annotate with symbol__disassemble.
  - Removed hacks for jump and call instructions like bctr and bctrl
respectively from generic patch that enables support for powerpc
and made separate patch for that.
  - v4 was not annotating powerpc 'btar' instruction. Included that.
  - Added few generic fixes.

v4 link:
  https://lkml.org/lkml/2016/7/8/10

Naveen N. Rao (1):
  perf annotate: Add support for powerpc

Ravi Bangoria (6):
  perf: Define macro for normalized arch names
  perf annotate: Add cross arch annotate support
  perf annotate: Do not ignore call instruction with indirect target
  perf annotate: Show raw form for jump instruction with indirect
target
  perf annotate: Support jump instruction with target as second operand
  perf annotate: Fix jump target outside of function address range

 tools/perf/arch/common.c   |  36 ++---
 tools/perf/arch/common.h   |  11 ++
 tools/perf/builtin-top.c   |   2 +-
 tools/perf/ui/browsers/annotate.c  |   8 +-
 tools/perf/ui/gtk/annotate.c   |   2 +-
 tools/perf/util/annotate.c | 276 +
 tools/perf/util/annotate.h |  10 +-
 tools/perf/util/unwind-libunwind.c |   4 +-
 8 files changed, 262 insertions(+), 87 deletions(-)

-- 
2.5.5



[PATCH v5 2/7] perf annotate: Add cross arch annotate support

2016-08-18 Thread Ravi Bangoria
Change the current data structures and functions to enable cross arch
annotate.

The current perf implementation does not support cross arch annotate.
To make it truly cross arch, the instruction tables of all arches should
be present in the perf binary, and the appropriate table used based on
the arch where perf.data was recorded.

Signed-off-by: Ravi Bangoria 
---
Changes in v5:
  - Replaced symbol__annotate with symbol__disassemble.

 tools/perf/builtin-top.c  |   2 +-
 tools/perf/ui/browsers/annotate.c |   3 +-
 tools/perf/ui/gtk/annotate.c  |   2 +-
 tools/perf/util/annotate.c| 133 --
 tools/perf/util/annotate.h|   5 +-
 5 files changed, 92 insertions(+), 53 deletions(-)

diff --git a/tools/perf/builtin-top.c b/tools/perf/builtin-top.c
index a3223aa..fdd4203 100644
--- a/tools/perf/builtin-top.c
+++ b/tools/perf/builtin-top.c
@@ -129,7 +129,7 @@ static int perf_top__parse_source(struct perf_top *top, 
struct hist_entry *he)
return err;
}
 
-   err = symbol__disassemble(sym, map, 0);
+   err = symbol__disassemble(sym, map, 0, NULL);
if (err == 0) {
 out_assign:
top->sym_filter_entry = he;
diff --git a/tools/perf/ui/browsers/annotate.c 
b/tools/perf/ui/browsers/annotate.c
index 2e2d100..21c5e10 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -1050,7 +1050,8 @@ int symbol__tui_annotate(struct symbol *sym, struct map 
*map,
  (nr_pcnt - 1);
}
 
-   err = symbol__disassemble(sym, map, sizeof_bdl);
+   err = symbol__disassemble(sym, map, sizeof_bdl,
+ perf_evsel__env_arch(evsel));
if (err) {
char msg[BUFSIZ];
symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg));
diff --git a/tools/perf/ui/gtk/annotate.c b/tools/perf/ui/gtk/annotate.c
index 42d3199..c127aba 100644
--- a/tools/perf/ui/gtk/annotate.c
+++ b/tools/perf/ui/gtk/annotate.c
@@ -167,7 +167,7 @@ static int symbol__gtk_annotate(struct symbol *sym, struct 
map *map,
if (map->dso->annotate_warned)
return -1;
 
-   err = symbol__disassemble(sym, map, 0);
+   err = symbol__disassemble(sym, map, 0, perf_evsel__env_arch(evsel));
if (err) {
char msg[BUFSIZ];
symbol__strerror_disassemble(sym, map, err, msg, sizeof(msg));
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 25a9259..deb9af0 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -20,12 +20,14 @@
 #include 
 #include 
 #include 
+#include 
+#include "../arch/common.h"
 
 const char *disassembler_style;
 const char *objdump_path;
 static regex_t  file_lineno;
 
-static struct ins *ins__find(const char *name);
+static struct ins *ins__find(const char *name, const char *norm_arch);
 static int disasm_line__parse(char *line, char **namep, char **rawp);
 
 static void ins__delete(struct ins_operands *ops)
@@ -53,7 +55,7 @@ int ins__scnprintf(struct ins *ins, char *bf, size_t size,
return ins__raw_scnprintf(ins, bf, size, ops);
 }
 
-static int call__parse(struct ins_operands *ops)
+static int call__parse(struct ins_operands *ops, const char *norm_arch)
 {
char *endptr, *tok, *name;
 
@@ -65,10 +67,8 @@ static int call__parse(struct ins_operands *ops)
 
name++;
 
-#ifdef __arm__
-   if (strchr(name, '+'))
+   if (!strcmp(norm_arch, NORM_ARM) && strchr(name, '+'))
return -1;
-#endif
 
tok = strchr(name, '>');
if (tok == NULL)
@@ -117,7 +117,8 @@ bool ins__is_call(const struct ins *ins)
return ins->ops == &call_ops;
 }
 
-static int jump__parse(struct ins_operands *ops)
+static int jump__parse(struct ins_operands *ops,
+  const char *norm_arch __maybe_unused)
 {
const char *s = strchr(ops->raw, '+');
 
@@ -172,7 +173,7 @@ static int comment__symbol(char *raw, char *comment, u64 
*addrp, char **namep)
return 0;
 }
 
-static int lock__parse(struct ins_operands *ops)
+static int lock__parse(struct ins_operands *ops, const char *norm_arch)
 {
char *name;
 
@@ -183,7 +184,7 @@ static int lock__parse(struct ins_operands *ops)
if (disasm_line__parse(ops->raw, &name, &ops->locked.ops->raw) < 0)
goto out_free_ops;
 
-   ops->locked.ins = ins__find(name);
+   ops->locked.ins = ins__find(name, norm_arch);
free(name);
 
if (ops->locked.ins == NULL)
@@ -193,7 +194,7 @@ static int lock__parse(struct ins_operands *ops)
return 0;
 
if (ops->locked.ins->ops->parse &&
-   ops->locked.ins->ops->parse(ops->locked.ops) < 0)
+   ops->locked.ins->ops->parse(ops->locked.ops, norm_arch) < 0)
goto out_free_ops;
 
return 0;
@@ -236,7 +237,7 @@ static struct ins_ops lock_ops = {
.scnprintf = lock__scnprintf,
 };
 
-static int mov__parse(s

[PATCH v5 4/7] perf annotate: Do not ignore call instruction with indirect target

2016-08-18 Thread Ravi Bangoria
Do not ignore a call instruction with an indirect target when it is
already identified as a call. This is an extension of commit e8ea1561952b
("perf annotate: Use raw form for register indirect call instructions")
to generalize annotation for all instructions with indirect calls.

This is needed for certain powerpc call instructions that use an address
in a register (such as bctrl, btarl, ...).

Apart from that, when kcore is used to disassemble a function, all call
instructions were ignored. This patch fixes that as a side effect by
not ignoring them. For example:

Before (with kcore):
   mov    %r13,%rdi
   callq  0xffffffff811a7e70
 ^ jmpq   64
   mov    %gs:0x7ef41a6e(%rip),%al

After (with kcore):
   mov    %r13,%rdi
 > callq  0xffffffff811a7e70
 ^ jmpq   64
   mov    %gs:0x7ef41a6e(%rip),%al

Suggested-by: Michael Ellerman 
[Suggested about 'bctrl' instruction]
Signed-off-by: Ravi Bangoria 
---
Changes in v5:
  - New patch, introduced to annotate all indirect call instructions.

 tools/perf/util/annotate.c | 8 ++--
 1 file changed, 2 insertions(+), 6 deletions(-)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 0b64841..6368ba9 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -81,16 +81,12 @@ static int call__parse(struct ins_operands *ops, const char 
*norm_arch)
return ops->target.name == NULL ? -1 : 0;
 
 indirect_call:
-   tok = strchr(endptr, '(');
-   if (tok != NULL) {
+   tok = strchr(endptr, '*');
+   if (tok == NULL) {
ops->target.addr = 0;
return 0;
}
 
-   tok = strchr(endptr, '*');
-   if (tok == NULL)
-   return -1;
-
ops->target.addr = strtoull(tok + 1, NULL, 16);
return 0;
 }
-- 
2.5.5



[PATCH v5 7/7] perf annotate: Fix jump target outside of function address range

2016-08-18 Thread Ravi Bangoria
If the jump target is outside of the function range, perf is not handling
it correctly. Especially when the target address is less than the function
start address, the target offset will be negative. But the target offset
is declared unsigned, which converts the negative number into its 2's
complement representation. See the example below. Here the target of the
'jmpq' instruction at 34cf8 is 34ac0, which is less than the function
start address (34cf0).

34ac0 - 34cf0 = -0x230 = 0xfffffffffffffdd0
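
In C terms, a minimal sketch of the wraparound (the offset field was a
u64 at this point):

	u64 offset = 0x34ac0 - 0x34cf0;	/* -0x230 wraps to 0xfffffffffffffdd0 */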

Objdump output:

  0000000000034cf0 <__sigaction>:
  __GI___sigaction():
    34cf0: lea    -0x20(%rdi),%eax
    34cf3: cmp    $0x1,%eax
    34cf6: jbe    34d00 <__sigaction+0x10>
    34cf8: jmpq   34ac0 <__GI___libc_sigaction>
    34cfd: nopl   (%rax)
    34d00: mov    0x386161(%rip),%rax        # 3bae68 <_DYNAMIC+0x2e8>
    34d07: movl   $0x16,%fs:(%rax)
    34d0e: mov    $0xffffffff,%eax
    34d13: retq

perf annotate before applying patch:

  __GI___sigaction  /usr/lib64/libc-2.22.so
       lea    -0x20(%rdi),%eax
       cmp    $0x1,%eax
    V  jbe    10
    V  jmpq   fffffffffffffdd0
       nop
 10:   mov    _DYNAMIC+0x2e8,%rax
       movl   $0x16,%fs:(%rax)
       mov    $0xffffffff,%eax
       retq

perf annotate after applying patch:

  __GI___sigaction  /usr/lib64/libc-2.22.so
       lea    -0x20(%rdi),%eax
       cmp    $0x1,%eax
    V  jbe    10
    ^  jmpq   34ac0 <__GI___libc_sigaction>
       nop
 10:   mov    _DYNAMIC+0x2e8,%rax
       movl   $0x16,%fs:(%rax)
       mov    $0xffffffff,%eax
       retq

Signed-off-by: Ravi Bangoria 
---
Changes in v5:
  - New patch

 tools/perf/ui/browsers/annotate.c |  5 +++--
 tools/perf/util/annotate.c| 14 +-
 tools/perf/util/annotate.h|  5 +++--
 3 files changed, 15 insertions(+), 9 deletions(-)

diff --git a/tools/perf/ui/browsers/annotate.c 
b/tools/perf/ui/browsers/annotate.c
index 21c5e10..c13df5b 100644
--- a/tools/perf/ui/browsers/annotate.c
+++ b/tools/perf/ui/browsers/annotate.c
@@ -215,7 +215,7 @@ static void annotate_browser__write(struct ui_browser 
*browser, void *entry, int
ui_browser__set_color(browser, color);
if (dl->ins && dl->ins->ops->scnprintf) {
if (ins__is_jump(dl->ins)) {
-   bool fwd = dl->ops.target.offset > 
(u64)dl->offset;
+   bool fwd = dl->ops.target.offset > dl->offset;
 
ui_browser__write_graph(browser, fwd ? 
SLSMG_DARROW_CHAR :

SLSMG_UARROW_CHAR);
@@ -245,7 +245,8 @@ static bool disasm_line__is_valid_jump(struct disasm_line 
*dl, struct symbol *sy
 {
if (!dl || !dl->ins || !ins__is_jump(dl->ins)
|| !disasm_line__has_offset(dl)
-   || dl->ops.target.offset >= symbol__size(sym))
+   || dl->ops.target.offset < 0
+   || dl->ops.target.offset >= (s64)symbol__size(sym))
return false;
 
return true;
diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 678fb81..c8b017c 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -124,10 +124,12 @@ static int jump__parse(struct ins_operands *ops,
else
ops->target.addr = strtoull(ops->raw, NULL, 16);
 
-   if (s++ != NULL)
+   if (s++ != NULL) {
ops->target.offset = strtoull(s, NULL, 16);
-   else
-   ops->target.offset = UINT64_MAX;
+   ops->target.offset_avail = true;
+   } else {
+   ops->target.offset_avail = false;
+   }
 
return 0;
 }
@@ -135,7 +137,7 @@ static int jump__parse(struct ins_operands *ops,
 static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
   struct ins_operands *ops)
 {
-   if (!ops->target.addr)
+   if (!ops->target.addr || ops->target.offset < 0)
return ins__raw_scnprintf(ins, bf, size, ops);
 
return scnprintf(bf, size, "%-6.6s %" PRIx64, ins->name, 
ops->target.offset);
@@ -1228,9 +1230,11 @@ static int symbol__parse_objdump_line(struct symbol 
*sym, struct map *map,
if (dl == NULL)
return -1;
 
-   if (dl->ops.target.offset == UINT64_MAX)
+   if (!disasm_line__has_offset(dl)) {
dl->ops.target.offset = dl->ops.target.addr -
map__rip_2objdump(map, sym->start);
+   dl->ops.target.offset_avail = true;
+   }
 
/* kcore has no symbols, so add the call target name */
if (dl->ins && ins__is_call(dl->ins) && !dl->ops.target.name) {
diff --git a/tools/perf/util/annotate.h b/tools/perf/util/annotate.h
index 5cfad4e..5787ed8 100644
--- a/tools/perf/util/annotate.h
+++ b/tools/perf/util/annotate.h
@@ -19,7 +19,8 @@ struct ins_operands {
char*raw;
char*na

[PATCH v5 5/7] perf annotate: Show raw form for jump instruction with indirect target

2016-08-18 Thread Ravi Bangoria
For jump instructions that do not include the target address as a direct
operand, use the raw value instead. This is needed for certain powerpc
jump instructions that use a target address in a register (such as bctr,
btar, ...).

Suggested-by: Michael Ellerman 
Signed-off-by: Ravi Bangoria 
---
Changes in v5:
  - New patch introduced to annotate jump instruction with indirect target

 tools/perf/util/annotate.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 6368ba9..4a4a583 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -131,6 +131,9 @@ static int jump__parse(struct ins_operands *ops,
 static int jump__scnprintf(struct ins *ins, char *bf, size_t size,
   struct ins_operands *ops)
 {
+   if (!ops->target.addr)
+   return ins__raw_scnprintf(ins, bf, size, ops);
+
return scnprintf(bf, size, "%-6.6s %" PRIx64, ins->name, 
ops->target.offset);
 }
 
-- 
2.5.5



[PATCH v5 6/7] perf annotate: Support jump instruction with target as second operand

2016-08-18 Thread Ravi Bangoria
Current perf is not able to parse a jump instruction when the second
operand contains the target address. Arches like powerpc have such
instructions. For example, 'beq  cr7,10173e60'.

Signed-off-by: Ravi Bangoria 
---
Changes in v5:
  - New patch

 tools/perf/util/annotate.c | 6 +-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index 4a4a583..678fb81 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -117,8 +117,12 @@ static int jump__parse(struct ins_operands *ops,
   const char *norm_arch __maybe_unused)
 {
const char *s = strchr(ops->raw, '+');
+   const char *c = strchr(ops->raw, ',');
 
-   ops->target.addr = strtoull(ops->raw, NULL, 16);
+   if (c++ != NULL)
+   ops->target.addr = strtoull(c, NULL, 16);
+   else
+   ops->target.addr = strtoull(ops->raw, NULL, 16);
 
if (s++ != NULL)
ops->target.offset = strtoull(s, NULL, 16);
-- 
2.5.5



[PATCH v5 3/7] perf annotate: Add support for powerpc

2016-08-18 Thread Ravi Bangoria
From: "Naveen N. Rao" 

Current perf can disassemble an annotated function, but it does not have
parsing logic for powerpc instructions, so none of the navigation options
are available for powerpc.

Apart from that, powerpc has a long list of branch instructions, and
hardcoding them in a table appears error-prone. So, add a function to
find the instruction instead of creating a static table. This function
dynamically builds the table (a list of 'struct ins') and, instead of
creating a new object every time, first checks whether the list already
contains an object for that instruction.

Signed-off-by: Naveen N. Rao 
Signed-off-by: Ravi Bangoria 
---
Changes in v5:
  - Removed hacks for instructions like bctr and bctrl from this patch.

 tools/perf/util/annotate.c | 116 +
 1 file changed, 116 insertions(+)

diff --git a/tools/perf/util/annotate.c b/tools/perf/util/annotate.c
index deb9af0..0b64841 100644
--- a/tools/perf/util/annotate.c
+++ b/tools/perf/util/annotate.c
@@ -459,6 +459,11 @@ static struct ins instructions_arm[] = {
{ .name = "bne",   .ops  = &jump_ops, },
 };
 
+struct instructions_powerpc {
+   struct ins *ins;
+   struct list_head list;
+};
+
 static int ins__key_cmp(const void *name, const void *insp)
 {
const struct ins *ins = insp;
@@ -474,6 +479,115 @@ static int ins__cmp(const void *a, const void *b)
return strcmp(ia->name, ib->name);
 }
 
+static struct ins *list_add__ins_powerpc(struct instructions_powerpc *head,
+const char *name, struct ins_ops *ops)
+{
+   struct instructions_powerpc *ins_powerpc;
+   struct ins *ins;
+
+   ins = zalloc(sizeof(struct ins));
+   if (!ins)
+   return NULL;
+
+   ins_powerpc = zalloc(sizeof(struct instructions_powerpc));
+   if (!ins_powerpc)
+   goto out_free_ins;
+
+   ins->name = strdup(name);
+   if (!ins->name)
+   goto out_free_ins_power;
+
+   ins->ops = ops;
+   ins_powerpc->ins = ins;
+   list_add_tail(&(ins_powerpc->list), &(head->list));
+
+   return ins;
+
+out_free_ins_power:
+   zfree(&ins_powerpc);
+out_free_ins:
+   zfree(&ins);
+   return NULL;
+}
+
+static struct ins *list_search__ins_powerpc(struct instructions_powerpc *head,
+   const char *name)
+{
+   struct instructions_powerpc *pos;
+
+   list_for_each_entry(pos, &head->list, list) {
+   if (!strcmp(pos->ins->name, name))
+   return pos->ins;
+   }
+   return NULL;
+}
+
+static struct ins *ins__find_powerpc(const char *name)
+{
+   int i;
+   struct ins *ins;
+   struct ins_ops *ops;
+   static struct instructions_powerpc head;
+   static bool list_initialized;
+
+   /*
+* - Interested only if instruction starts with 'b'.
+* - Few start with 'b', but aren't branch instructions.
+*/
+   if (name[0] != 'b' ||
+   !strncmp(name, "bcd", 3)   ||
+   !strncmp(name, "brinc", 5) ||
+   !strncmp(name, "bper", 4))
+   return NULL;
+
+   if (!list_initialized) {
+   INIT_LIST_HEAD(&head.list);
+   list_initialized = true;
+   }
+
+   /*
+* Return if we already have object of 'struct ins' for this instruction
+*/
+   ins = list_search__ins_powerpc(&head, name);
+   if (ins)
+   return ins;
+
+   ops = &jump_ops;
+
+   i = strlen(name) - 1;
+   if (i < 0)
+   return NULL;
+
+   /* ignore optional hints at the end of the instructions */
+   if (name[i] == '+' || name[i] == '-')
+   i--;
+
+   if (name[i] == 'l' || (name[i] == 'a' && name[i-1] == 'l')) {
+   /*
+* if the instruction ends up with 'l' or 'la', then
+* those are considered 'calls' since they update LR.
+* ... except for 'bnl' which is branch if not less than
+* and the absolute form of the same.
+*/
+   if (strcmp(name, "bnl") && strcmp(name, "bnl+") &&
+   strcmp(name, "bnl-") && strcmp(name, "bnla") &&
+   strcmp(name, "bnla+") && strcmp(name, "bnla-"))
+   ops = &call_ops;
+   }
+   if (name[i] == 'r' && name[i-1] == 'l')
+   /*
+* instructions ending with 'lr' are considered to be
+* return instructions
+*/
+   ops = &ret_ops;
+
+   /*
+* Add instruction to list so next time no need to
+* allocate memory for it.
+*/
+   return list_add__ins_powerpc(&head, name, ops);
+}
+
 static void ins__sort(struct ins *instructions, int nmemb)
 {
qsort(instructions, nmemb, sizeof(struct ins), ins__cmp);
@@ -509,6 +623,8 @@ static struct ins *ins__find(const char *name, const char 
*norm_arch)
} e
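
The archive truncates the rest of this patch here. For illustration
(not part of the patch), this is how the rules above classify some
common powerpc mnemonics:

/*
 * Illustrative classification by ins__find_powerpc():
 *
 *   "b", "beq", "bne", "bdnz", "bctr" -> jump_ops (plain branches)
 *   "bl", "bla", "bcl", "bctrl"       -> call_ops (the 'l'/'la' suffix
 *                                        means the instruction updates LR)
 *   "bnl", "bnla", "bnl+", "bnl-"     -> jump_ops (explicitly excluded
 *                                        from calls: branch if not less than)
 *   "blr", "bdnzlr"                   -> ret_ops  (end in 'lr', i.e.
 *                                        branch to LR)
 *   "bcd*", "brinc"                   -> filtered out early, not branches
 */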

Re: linux-next: build warnings after merge of the kbuild tree

2016-08-18 Thread Nicholas Piggin
On Fri, 19 Aug 2016 15:09:14 +1000
Stephen Rothwell  wrote:

> Hi Nick,
> 
> On Fri, 19 Aug 2016 13:38:54 +1000 Stephen Rothwell  
> wrote:
> >
> > On Thu, 18 Aug 2016 11:09:48 +1000 Nicholas Piggin  
> > wrote:  
> > >
> > > On Wed, 17 Aug 2016 14:59:59 +0200
> > > Michal Marek  wrote:
> > > 
> > > > On 2016-08-17 03:44, Stephen Rothwell wrote:  
> > > > > 
> > > > > After merging the kbuild tree, today's linux-next build (powerpc
> > > > > ppc64_defconfig) produced these warnings:
> > > > > 
> > > > > WARNING: 25 bad relocations
> > > > > c0cf2570 R_PPC64_ADDR64  __crc___arch_hweight16
> > > > [...]  
> > > > > Introduced by commit
> > > > > 
> > > > >   9445aa1a3062 ("ppc: move exports to definitions")
> > > > > 
> > > > > I have reverted that commit for today.
> > > > > 
> > > > > [cc-ing the ppc guys for clues - also involved is commit
> > > > > 
> > > > >   22823ab419d8 ("EXPORT_SYMBOL() for asm")
> > > > > ]
> > > > 
> > > > FWIW, I see these warnings as well. Any help from ppc developers is
> > > > appreciated - should the R_PPC64_ADDR64 be whitelisted for exported asm
> > > > symbols (their CRCs actually)?  
> > > 
> > > The dangling relocation is a side effect of linker unable to resolve the
> > > reference to the undefined weak symbols. So the real question is, why has
> > > genksyms not overridden these symbols with their CRC values?
> > > 
> > > This may not even be powerpc specific, but  I'll poke at it a bit more
> > > when I get a chance.
> > 
> > Not sure if this is relevant, but with the commit reverted, the
> > __crc___... symbols are absolute.
> > 
> > f55b3b3d A __crc___arch_hweight16  
> 
> Ignore that :-)
> 
> I just had a look at an x86_64 allmodconfig result and it looks like the
> weak symbols are not resolved there either ...
> 
> I may be missing something, but genksyms generates the crc's off the
> preprocessed C source code and we don't have any for the asm files ...

Looks like you're right, good find!

Thanks,
Nick


[PATCH 01/13] powerpc: Add simple cache inhibited MMIO accessors

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

Add simple cache-inhibited accessors for memory-mapped I/O.
Unlike the accessors built from the DEF_MMIO_* macros, these
don't include any hardware memory barriers; callers need to
manage memory barriers on their own. These can only be called
in hypervisor real mode.

Signed-off-by: Suresh Warrier 
[pau...@ozlabs.org - added line to comment]
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/io.h | 29 +
 1 file changed, 29 insertions(+)

diff --git a/arch/powerpc/include/asm/io.h b/arch/powerpc/include/asm/io.h
index 2fd1690..f6fda84 100644
--- a/arch/powerpc/include/asm/io.h
+++ b/arch/powerpc/include/asm/io.h
@@ -241,6 +241,35 @@ static inline void out_be64(volatile u64 __iomem *addr, 
u64 val)
 #endif
 #endif /* __powerpc64__ */
 
+
+/*
+ * Simple Cache inhibited accessors
+ * Unlike the DEF_MMIO_* macros, these don't include any h/w memory
+ * barriers, callers need to manage memory barriers on their own.
+ * These can only be used in hypervisor real mode.
+ */
+
+static inline u32 _lwzcix(unsigned long addr)
+{
+   u32 ret;
+
+   __asm__ __volatile__("lwzcix %0,0, %1"
+: "=r" (ret) : "r" (addr) : "memory");
+   return ret;
+}
+
+static inline void _stbcix(u64 addr, u8 val)
+{
+   __asm__ __volatile__("stbcix %0,0,%1"
+   : : "r" (val), "r" (addr) : "memory");
+}
+
+static inline void _stwcix(u64 addr, u32 val)
+{
+   __asm__ __volatile__("stwcix %0,0,%1"
+   : : "r" (val), "r" (addr) : "memory");
+}
+
 /*
  * Low level IO stream instructions are defined out of line for now
  */
-- 
2.8.1
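
Since these accessors deliberately omit barriers, a real-mode caller
must provide its own ordering. A minimal usage sketch (illustrative,
not part of the patch; read_xirr_rm is a made-up name) -- the
kvmppc_read_intr() conversion in patch 02/13 below follows the same
pattern:

static u32 read_xirr_rm(unsigned long xics_phys)
{
	u32 xirr;

	/* Cache-inhibited load; _lwzcix() itself contains no barrier */
	xirr = _lwzcix(xics_phys + XICS_XIRR);

	/* Order the load against whatever the caller does next */
	smp_mb();

	return xirr;
}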



[PATCH 00/13] Real-mode acceleration of device interrupts in HV KVM

2016-08-18 Thread Paul Mackerras
This patch set reduces the latency for presenting interrupts from PCI
pass-through devices to a Book3S HV guest.  Currently, if an interrupt
arrives from a PCI pass-through device while a guest is running, it
causes an exit of all threads on the core to the host, where the
interrupt is handled by making an interrupt pending in the virtual
XICS interrupt controller for the guest that owns the device.
Furthermore, there is currently no attempt to direct PCI pass-through
device interrupts to the physical core where the VCPU that they are
directed to is running, so they often land on a different core and
require an IPI to interrupt the VCPU.

With this patch set, if the interrupt arrives on a core where the
correct guest is running, it can be handled in hypervisor real mode
without needing an exit to host context.  If the destination VCPU is
on the same core, then we can interrupt it using at most a msgsnd
(message send) instruction, which is considerably faster than an IPI.

Further, if an interrupt arrives on a different core, we then change
the destination for the interrupt in the physical interrupt controller
to point to the core where the VCPU is running.  For now, we always
direct the interrupt to thread 0 of the core because the other threads
are offline from the point of view of the host, and the offline loop
(which is where those other threads run when thread 0 is in host
context) doesn't handle device interrupts.

This patch set is based on a patch set from Suresh Warrier, with
considerable revision by me.  The data structure for mapping host
interrupt numbers to guest interrupt numbers is just a flat array that
is searched linearly, which works and is simple but could perform
poorly with large numbers of interrupt sources.  It would be simple to
replace this mapping array with a more sophisticated data structure in
future.

To test the performance of this patch set, I used a network one-byte
ping-pong test between a guest with a Mellanox CX-3 passed through to
it, connected over 10Gb ethernet to another POWER8 system running
bare-metal with a Chelsio 10Gb ethernet adapter.  (The guest was
running Ubuntu 16.04.1 under QEMU v2.7-rc2 on a POWER8.)  Without this
patchset, the round-trip latency was 43us, and with it the latency was
41us, a saving of 2us per round-trip.

Paul.
-
 arch/powerpc/include/asm/io.h  |  29 
 arch/powerpc/include/asm/kvm_asm.h |  10 ++
 arch/powerpc/include/asm/kvm_book3s.h  |   1 +
 arch/powerpc/include/asm/kvm_host.h|  20 +++
 arch/powerpc/include/asm/kvm_ppc.h |  28 
 arch/powerpc/include/asm/opal.h|   1 +
 arch/powerpc/include/asm/pnv-pci.h |   3 +
 arch/powerpc/kvm/Kconfig   |   2 +
 arch/powerpc/kvm/book3s.c  |   3 +
 arch/powerpc/kvm/book3s_hv.c   | 199 -
 arch/powerpc/kvm/book3s_hv_builtin.c   | 141 ++
 arch/powerpc/kvm/book3s_hv_rm_xics.c   | 120 +++
 arch/powerpc/kvm/book3s_hv_rmhandlers.S| 183 +--
 arch/powerpc/kvm/book3s_xics.c |  55 ++-
 arch/powerpc/kvm/book3s_xics.h |   2 +
 arch/powerpc/kvm/powerpc.c |  38 +
 arch/powerpc/platforms/powernv/opal-wrappers.S |   1 +
 arch/powerpc/platforms/powernv/pci-ioda.c  |  24 ++-
 18 files changed, 773 insertions(+), 87 deletions(-)


[PATCH 02/13] KVM: PPC: Book3S HV: Convert kvmppc_read_intr to a C function

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

Modify kvmppc_read_intr to make it a C function.  Because it is called
from kvmppc_check_wake_reason, any of the assembler code that calls
either kvmppc_read_intr or kvmppc_check_wake_reason now has to assume
that the volatile registers might have been modified.

This also adds in the optimization of clearing saved_xirr in the case
where we completely handle and EOI an IPI.  Without this, the next
device interrupt will require two trips through the host interrupt
handling code.

[pau...@ozlabs.org - made kvmppc_check_wake_reason create a stack frame
 when it is calling kvmppc_read_intr, which means we can set r12 to
 the trap number (0x500) after the call to kvmppc_read_intr, instead
 of using r31.  Also moved the deliver_guest_interrupt label so as to
 restore XER and CTR, plus other minor tweaks.]

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_hv_builtin.c|  84 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 158 +++-
 2 files changed, 158 insertions(+), 84 deletions(-)

diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c 
b/arch/powerpc/kvm/book3s_hv_builtin.c
index 5f0380d..b476a6a 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
#define KVM_CMA_CHUNK_ORDER	18
 
@@ -286,3 +287,86 @@ void kvmhv_commence_exit(int trap)
 
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
 EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
+
+/*
+ * Determine what sort of external interrupt is pending (if any).
+ * Returns:
+ * 0 if no interrupt is pending
+ * 1 if an interrupt is pending that needs to be handled by the host
+ * -1 if there was a guest wakeup IPI (which has now been cleared)
+ */
+
+long kvmppc_read_intr(void)
+{
+   unsigned long xics_phys;
+   u32 h_xirr;
+   __be32 xirr;
+   u32 xisr;
+   u8 host_ipi;
+
+   /* see if a host IPI is pending */
+   host_ipi = local_paca->kvm_hstate.host_ipi;
+   if (host_ipi)
+   return 1;
+
+   /* Now read the interrupt from the ICP */
+   xics_phys = local_paca->kvm_hstate.xics_phys;
+   if (unlikely(!xics_phys))
+   return 1;
+
+   /*
+* Save XIRR for later. Since we get control in reverse endian
+* on LE systems, save it byte reversed and fetch it back in
+* host endian. Note that xirr is the value read from the
+* XIRR register, while h_xirr is the host endian version.
+*/
+   xirr = _lwzcix(xics_phys + XICS_XIRR);
+   h_xirr = be32_to_cpu(xirr);
+   local_paca->kvm_hstate.saved_xirr = h_xirr;
+   xisr = h_xirr & 0xff;
+   /*
+* Ensure that the store/load complete to guarantee all side
+* effects of loading from XIRR has completed
+*/
+   smp_mb();
+
+   /* if nothing pending in the ICP */
+   if (!xisr)
+   return 0;
+
+   /* We found something in the ICP...
+*
+* If it is an IPI, clear the MFRR and EOI it.
+*/
+   if (xisr == XICS_IPI) {
+   _stbcix(xics_phys + XICS_MFRR, 0xff);
+   _stwcix(xics_phys + XICS_XIRR, xirr);
+   /*
+* Need to ensure side effects of above stores
+* complete before proceeding.
+*/
+   smp_mb();
+
+   /*
+* We need to re-check host IPI now in case it got set in the
+* meantime. If it's clear, we bounce the interrupt to the
+* guest
+*/
+   host_ipi = local_paca->kvm_hstate.host_ipi;
+   if (unlikely(host_ipi != 0)) {
+   /* We raced with the host,
+* we need to resend that IPI, bummer
+*/
+   _stbcix(xics_phys + XICS_MFRR, IPI_PRIORITY);
+   /* Let side effects complete */
+   smp_mb();
+   return 1;
+   }
+
+   /* OK, it's an IPI for us */
+   local_paca->kvm_hstate.saved_xirr = 0;
+   return -1;
+   }
+
+   return 1;
+}
diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index 9756555..dccfa85 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -221,6 +221,13 @@ kvmppc_primary_no_guest:
li  r3, 0   /* Don't wake on privileged (OS) doorbell */
b   kvm_do_nap
 
+/*
+ * kvm_novcpu_wakeup
+ * Entered from kvm_start_guest if kvm_hstate.napping is set
+ * to NAPPING_NOVCPU
+ * r2 = kernel TOC
+ * r13 = paca
+ */
 kvm_novcpu_wakeup:
ld  r1, HSTATE_HOST_R1(r13)
ld  r5, HSTATE_KVM_VCORE(r13)
@@ -230,6 +237,13 @@ kvm_novcpu_wakeup:
/* c

[PATCH 03/13] KVM: PPC: select IRQ_BYPASS_MANAGER

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

Select IRQ_BYPASS_MANAGER for PPC when CONFIG_KVM is set.
Add the PPC producer functions for add and del producer.

[pau...@ozlabs.org - Moved new functions from book3s.c to powerpc.c
 so booke compiles; added kvm_arch_has_irq_bypass implementation.]

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 
 arch/powerpc/kvm/Kconfig   |  2 ++
 arch/powerpc/kvm/powerpc.c | 38 ++
 3 files changed, 44 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 2544eda..94715e2 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -287,6 +287,10 @@ struct kvmppc_ops {
long (*arch_vm_ioctl)(struct file *filp, unsigned int ioctl,
  unsigned long arg);
int (*hcall_implemented)(unsigned long hcall);
+   int (*irq_bypass_add_producer)(struct irq_bypass_consumer *,
+  struct irq_bypass_producer *);
+   void (*irq_bypass_del_producer)(struct irq_bypass_consumer *,
+   struct irq_bypass_producer *);
 };
 
 extern struct kvmppc_ops *kvmppc_hv_ops;
diff --git a/arch/powerpc/kvm/Kconfig b/arch/powerpc/kvm/Kconfig
index c2024ac..7ac0569 100644
--- a/arch/powerpc/kvm/Kconfig
+++ b/arch/powerpc/kvm/Kconfig
@@ -22,6 +22,8 @@ config KVM
select ANON_INODES
select HAVE_KVM_EVENTFD
select SRCU
+   select IRQ_BYPASS_MANAGER
+   select HAVE_KVM_IRQ_BYPASS
 
 config KVM_BOOK3S_HANDLER
bool
diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
index 6ce40dd..6d51e0f 100644
--- a/arch/powerpc/kvm/powerpc.c
+++ b/arch/powerpc/kvm/powerpc.c
@@ -27,6 +27,8 @@
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 #include 
 #include 
@@ -739,6 +741,42 @@ void kvm_arch_vcpu_put(struct kvm_vcpu *vcpu)
 #endif
 }
 
+/*
+ * irq_bypass_add_producer and irq_bypass_del_producer are only
+ * useful if the architecture supports PCI passthrough.
+ * irq_bypass_stop and irq_bypass_start are not needed and so
+ * kvm_ops are not defined for them.
+ */
+bool kvm_arch_has_irq_bypass(void)
+{
+   return ((kvmppc_hv_ops && kvmppc_hv_ops->irq_bypass_add_producer) ||
+   (kvmppc_pr_ops && kvmppc_pr_ops->irq_bypass_add_producer));
+}
+
+int kvm_arch_irq_bypass_add_producer(struct irq_bypass_consumer *cons,
+struct irq_bypass_producer *prod)
+{
+   struct kvm_kernel_irqfd *irqfd =
+   container_of(cons, struct kvm_kernel_irqfd, consumer);
+   struct kvm *kvm = irqfd->kvm;
+
+   if (kvm->arch.kvm_ops->irq_bypass_add_producer)
+   return kvm->arch.kvm_ops->irq_bypass_add_producer(cons, prod);
+
+   return 0;
+}
+
+void kvm_arch_irq_bypass_del_producer(struct irq_bypass_consumer *cons,
+ struct irq_bypass_producer *prod)
+{
+   struct kvm_kernel_irqfd *irqfd =
+   container_of(cons, struct kvm_kernel_irqfd, consumer);
+   struct kvm *kvm = irqfd->kvm;
+
+   if (kvm->arch.kvm_ops->irq_bypass_del_producer)
+   kvm->arch.kvm_ops->irq_bypass_del_producer(cons, prod);
+}
+
 static void kvmppc_complete_mmio_load(struct kvm_vcpu *vcpu,
   struct kvm_run *run)
 {
-- 
2.8.1



[PATCH 04/13] KVM: PPC: Book3S HV: Introduce kvmppc_passthru_irqmap

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

This patch introduces an IRQ mapping structure, the
kvmppc_passthru_irqmap structure that is to be used
to map the real hardware IRQ in the host with the virtual
hardware IRQ (gsi) that is injected into a guest by KVM for
passthrough adapters.

Currently, we assume a separate IRQ mapping structure for
each guest. Each kvmppc_passthru_irqmap has a mapping array
containing all defined real<->virtual IRQ pairs.

[pau...@ozlabs.org - removed irq_chip field from struct
 kvmppc_passthru_irqmap; changed parameter for
 kvmppc_get_passthru_irqmap from struct kvm_vcpu * to struct
 kvm *, removed small cached array.]

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_host.h | 17 +
 arch/powerpc/include/asm/kvm_ppc.h  | 14 ++
 arch/powerpc/kvm/book3s_hv.c| 13 +
 3 files changed, 44 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index ec35af3..3eb5092 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -197,6 +197,8 @@ struct kvmppc_spapr_tce_table {
 struct kvmppc_xics;
 struct kvmppc_icp;
 
+struct kvmppc_passthru_irqmap;
+
 /*
  * The reverse mapping array has one entry for each HPTE,
  * which stores the guest's view of the second word of the HPTE
@@ -267,6 +269,7 @@ struct kvm_arch {
 #endif
 #ifdef CONFIG_KVM_XICS
struct kvmppc_xics *xics;
+   struct kvmppc_passthru_irqmap *pimap;
 #endif
struct kvmppc_ops *kvm_ops;
 #ifdef CONFIG_KVM_BOOK3S_HV_POSSIBLE
@@ -397,6 +400,20 @@ struct kvmhv_tb_accumulator {
u64 tb_max; /* max time */
 };
 
+#ifdef CONFIG_PPC_BOOK3S_64
+struct kvmppc_irq_map {
+   u32 r_hwirq;
+   u32 v_hwirq;
+   struct irq_desc *desc;
+};
+
+#define KVMPPC_PIRQ_MAPPED	1024
+struct kvmppc_passthru_irqmap {
+   int n_mapped;
+   struct kvmppc_irq_map mapped[KVMPPC_PIRQ_MAPPED];
+};
+#endif
+
 # ifdef CONFIG_PPC_FSL_BOOK3E
 #define KVMPPC_BOOKE_IAC_NUM   2
 #define KVMPPC_BOOKE_DAC_NUM   2
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 94715e2..4ca2ba3 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -457,8 +457,18 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu 
*vcpu)
 {
return vcpu->arch.irq_type == KVMPPC_IRQ_XICS;
 }
+
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+   struct kvm *kvm)
+{
+   if (kvm)
+   return kvm->arch.pimap;
+   return NULL;
+}
+
 extern void kvmppc_alloc_host_rm_ops(void);
 extern void kvmppc_free_host_rm_ops(void);
+extern void kvmppc_free_pimap(struct kvm *kvm);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -470,8 +480,12 @@ extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
 extern void kvmppc_xics_ipi_action(void);
 extern int h_ipi_redirect;
 #else
+static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
+   struct kvm *kvm)
+   { return NULL; }
 static inline void kvmppc_alloc_host_rm_ops(void) {};
 static inline void kvmppc_free_host_rm_ops(void) {};
+static inline void kvmppc_free_pimap(struct kvm *kvm) {};
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2fd5580..413b5c2f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3282,6 +3282,19 @@ static int kvmppc_core_check_processor_compat_hv(void)
return 0;
 }
 
+#ifdef CONFIG_KVM_XICS
+
+void kvmppc_free_pimap(struct kvm *kvm)
+{
+   kfree(kvm->arch.pimap);
+}
+
+struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+{
+   return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
+}
+#endif
+
 static long kvm_arch_vm_ioctl_hv(struct file *filp,
 unsigned int ioctl, unsigned long arg)
 {
-- 
2.8.1



[PATCH 06/13] KVM: PPC: Book3S HV: Enable IRQ bypass

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

Add the irq_bypass_add_producer and irq_bypass_del_producer
functions. These functions get called whenever a GSI is being
defined for a guest. They create/remove the mapping between
host real IRQ numbers and the guest GSI.

Add the following helper functions to manage the
passthrough IRQ map.

kvmppc_set_passthru_irq()
  Creates a mapping in the passthrough IRQ map that maps a host
  IRQ to a guest GSI. It allocates the structure (one per guest VM)
  the first time it is called.

kvmppc_clr_passthru_irq()
  Removes the passthrough IRQ map entry given a guest GSI.
  The passthrough IRQ map structure is not freed even when the
  number of mapped entries goes to zero. It is only freed when
  the VM is destroyed.

[pau...@ozlabs.org - modified to use is_pnv_opal_msi() rather than
 requiring all passed-through interrupts to use the same irq_chip;
 changed deletion so it zeroes out the r_hwirq field rather than
 copying the last entry down and decrementing the number of entries.]

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_hv.c | 160 ++-
 1 file changed, 159 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 413b5c2f..aa11647 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -53,10 +53,13 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
 #include 
+#include 
+#include 
 #include 
 
 #include "book3s.h"
@@ -3247,6 +3250,8 @@ static void kvmppc_core_destroy_vm_hv(struct kvm *kvm)
kvmppc_free_vcores(kvm);
 
kvmppc_free_hpt(kvm);
+
+   kvmppc_free_pimap(kvm);
 }
 
 /* We don't need to emulate any privileged instructions or dcbz */
@@ -3289,10 +3294,159 @@ void kvmppc_free_pimap(struct kvm *kvm)
kfree(kvm->arch.pimap);
 }
 
-struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
+static struct kvmppc_passthru_irqmap *kvmppc_alloc_pimap(void)
 {
return kzalloc(sizeof(struct kvmppc_passthru_irqmap), GFP_KERNEL);
 }
+
+static int kvmppc_set_passthru_irq(struct kvm *kvm, int host_irq, int 
guest_gsi)
+{
+   struct irq_desc *desc;
+   struct kvmppc_irq_map *irq_map;
+   struct kvmppc_passthru_irqmap *pimap;
+   struct irq_chip *chip;
+   int i;
+
+   desc = irq_to_desc(host_irq);
+   if (!desc)
+   return -EIO;
+
+   mutex_lock(&kvm->lock);
+
+   pimap = kvm->arch.pimap;
+   if (pimap == NULL) {
+   /* First call, allocate structure to hold IRQ map */
+   pimap = kvmppc_alloc_pimap();
+   if (pimap == NULL) {
+   mutex_unlock(&kvm->lock);
+   return -ENOMEM;
+   }
+   kvm->arch.pimap = pimap;
+   }
+
+   /*
+* For now, we only support interrupts for which the EOI operation
+* is an OPAL call followed by a write to XIRR, since that's
+* what our real-mode EOI code does.
+*/
+   chip = irq_data_get_irq_chip(&desc->irq_data);
+   if (!chip || !is_pnv_opal_msi(chip)) {
+   pr_warn("kvmppc_set_passthru_irq_hv: Could not assign IRQ map 
for (%d,%d)\n",
+   host_irq, guest_gsi);
+   mutex_unlock(&kvm->lock);
+   return -ENOENT;
+   }
+
+   /*
+* See if we already have an entry for this guest IRQ number.
+* If it's mapped to a hardware IRQ number, that's an error,
+* otherwise re-use this entry.
+*/
+   for (i = 0; i < pimap->n_mapped; i++) {
+   if (guest_gsi == pimap->mapped[i].v_hwirq) {
+   if (pimap->mapped[i].r_hwirq) {
+   mutex_unlock(&kvm->lock);
+   return -EINVAL;
+   }
+   break;
+   }
+   }
+
+   if (i == KVMPPC_PIRQ_MAPPED) {
+   mutex_unlock(&kvm->lock);
+   return -EAGAIN; /* table is full */
+   }
+
+   irq_map = &pimap->mapped[i];
+
+   irq_map->v_hwirq = guest_gsi;
+   irq_map->r_hwirq = desc->irq_data.hwirq;
+   irq_map->desc = desc;
+
+   if (i == pimap->n_mapped)
+   pimap->n_mapped++;
+
+   mutex_unlock(&kvm->lock);
+
+   return 0;
+}
+
+static int kvmppc_clr_passthru_irq(struct kvm *kvm, int host_irq, int 
guest_gsi)
+{
+   struct irq_desc *desc;
+   struct kvmppc_passthru_irqmap *pimap;
+   int i;
+
+   desc = irq_to_desc(host_irq);
+   if (!desc)
+   return -EIO;
+
+   mutex_lock(&kvm->lock);
+
+   if (kvm->arch.pimap == NULL) {
+   mutex_unlock(&kvm->lock);
+   return 0;
+   }
+   pimap = kvm->arch.pimap;
+
+   for (i = 0; i < pimap->n_mapped; i++) {
+   if (guest_gsi == pimap->mapped[i].v_hwirq)
+   break;
+   }
+
+ 
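
The archive truncates the diff here. Based on the description above
("changed deletion so it zeroes out the r_hwirq field") and the matching
context visible in patch 12/13 below, the remainder of
kvmppc_clr_passthru_irq() plausibly looks like this sketch:

+	if (i == pimap->n_mapped) {
+		mutex_unlock(&kvm->lock);
+		return -ENODEV;
+	}
+
+	/* invalidate the entry */
+	pimap->mapped[i].r_hwirq = 0;
+
+	/*
+	 * We don't free this structure even when the number of mapped
+	 * entries goes to zero. It is freed when the VM is destroyed.
+	 */
+
+	mutex_unlock(&kvm->lock);
+	return 0;
+}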

[PATCH 05/13] powerpc/powernv: Provide facilities for EOI, usable from real mode

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

This adds a new function pnv_opal_pci_msi_eoi() which does the part of
end-of-interrupt (EOI) handling of an MSI which involves doing an
OPAL call.  This function can be called in real mode.  This doesn't
just export pnv_ioda2_msi_eoi() because that does a call to
icp_native_eoi(), which does not work in real mode.

This also adds a function, is_pnv_opal_msi(), which KVM can call to
check whether an interrupt is one for which we should be calling
pnv_opal_pci_msi_eoi() when we need to do an EOI.

[pau...@ozlabs.org - split out the addition of pnv_opal_pci_msi_eoi()
 from Suresh's patch "KVM: PPC: Book3S HV: Handle passthrough
 interrupts in guest"; added is_pnv_opal_msi(); wrote description.]

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/pnv-pci.h|  3 +++
 arch/powerpc/platforms/powernv/pci-ioda.c | 24 
 2 files changed, 23 insertions(+), 4 deletions(-)

diff --git a/arch/powerpc/include/asm/pnv-pci.h 
b/arch/powerpc/include/asm/pnv-pci.h
index 0cbd813..1b46b52 100644
--- a/arch/powerpc/include/asm/pnv-pci.h
+++ b/arch/powerpc/include/asm/pnv-pci.h
@@ -12,6 +12,7 @@
 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -33,6 +34,8 @@ int pnv_cxl_alloc_hwirqs(struct pci_dev *dev, int num);
 void pnv_cxl_release_hwirqs(struct pci_dev *dev, int hwirq, int num);
 int pnv_cxl_get_irq_count(struct pci_dev *dev);
 struct device_node *pnv_pci_get_phb_node(struct pci_dev *dev);
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq);
+bool is_pnv_opal_msi(struct irq_chip *chip);
 
 #ifdef CONFIG_CXL_BASE
 int pnv_cxl_alloc_hwirq_ranges(struct cxl_irq_ranges *irqs,
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c 
b/arch/powerpc/platforms/powernv/pci-ioda.c
index fd9444f..9ce48ae 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -2710,15 +2710,21 @@ static void pnv_pci_ioda2_setup_dma_pe(struct pnv_phb 
*phb,
 }
 
 #ifdef CONFIG_PCI_MSI
-static void pnv_ioda2_msi_eoi(struct irq_data *d)
+int64_t pnv_opal_pci_msi_eoi(struct irq_chip *chip, unsigned int hw_irq)
 {
-   unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
-   struct irq_chip *chip = irq_data_get_irq_chip(d);
struct pnv_phb *phb = container_of(chip, struct pnv_phb,
   ioda.irq_chip);
+
+   return opal_pci_msi_eoi(phb->opal_id, hw_irq);
+}
+
+static void pnv_ioda2_msi_eoi(struct irq_data *d)
+{
int64_t rc;
+   unsigned int hw_irq = (unsigned int)irqd_to_hwirq(d);
+   struct irq_chip *chip = irq_data_get_irq_chip(d);
 
-   rc = opal_pci_msi_eoi(phb->opal_id, hw_irq);
+   rc = pnv_opal_pci_msi_eoi(chip, hw_irq);
WARN_ON_ONCE(rc);
 
icp_native_eoi(d);
@@ -2748,6 +2754,16 @@ void pnv_set_msi_irq_chip(struct pnv_phb *phb, unsigned 
int virq)
irq_set_chip(virq, &phb->ioda.irq_chip);
 }
 
+/*
+ * Returns true iff chip is something that we could call
+ * pnv_opal_pci_msi_eoi for.
+ */
+bool is_pnv_opal_msi(struct irq_chip *chip)
+{
+   return chip->irq_eoi == pnv_ioda2_msi_eoi;
+}
+EXPORT_SYMBOL_GPL(is_pnv_opal_msi);
+
 static int pnv_pci_ioda_msi_setup(struct pnv_phb *phb, struct pci_dev *dev,
  unsigned int hwirq, unsigned int virq,
  unsigned int is_64, struct msi_msg *msg)
-- 
2.8.1
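
To make the "OPAL call followed by a write to XIRR" sequence concrete,
here is a hedged sketch of a real-mode EOI helper built on these
facilities (illustrative, error handling omitted; the actual icp_eoi()
helper lands in the KVM real-mode XICS code later in this series):

static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 xirr)
{
	unsigned long xics_phys;

	/* Step 1: complete the MSI EOI at the PHB via OPAL */
	pnv_opal_pci_msi_eoi(c, hwirq);
	iosync();

	/* Step 2: EOI the interrupt at the presentation controller */
	xics_phys = local_paca->kvm_hstate.xics_phys;
	_stwcix(xics_phys + XICS_XIRR, xirr);
}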



[PATCH 07/13] KVM: PPC: Book3S HV: Handle passthrough interrupts in guest

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

Currently, KVM switches back to the host to handle any external
interrupt (when the interrupt is received while running in the
guest). This patch updates real-mode KVM to check if an interrupt
is generated by a passthrough adapter that is owned by this guest.
If so, the real mode KVM will directly inject the corresponding
virtual interrupt to the guest VCPU's ICS and also EOI the interrupt
in hardware. In short, the interrupt is handled entirely in real
mode in the guest context without switching back to the host.

In some rare cases, the interrupt cannot be completely handled in
real mode, for instance, a VCPU that is sleeping needs to be woken
up. In this case, KVM simply switches back to the host with trap
reason set to 0x500. This works, but it is clearly not very efficient.
A following patch will distinguish this case and handle it
correctly in the host. Note that we can use the existing
check_too_hard() routine even though we are not in a hypercall to
determine if there is unfinished business that needs to be
completed in host virtual mode.

The patch assumes that the mapping between the hardware interrupt IRQ
and the virtual IRQ to be injected into the guest already exists for the
PCI passthrough interrupts that need to be handled in real mode.
If the mapping does not exist, KVM falls back to the default
existing behavior.

The KVM real mode code reads mappings from the mapped array in the
passthrough IRQ map without taking any lock.  We carefully order the
loads and stores of the fields in the kvmppc_irq_map data structure
using memory barriers to avoid an inconsistent mapping being seen by
the reader. Thus, although it is possible to miss a map entry, it is
not possible to read a stale value.

[pau...@ozlabs.org - get irq_chip from irq_map rather than pimap,
 pulled out powernv eoi change into a separate patch, made
 kvmppc_read_intr get the vcpu from the paca rather than being
 passed in, rewrote the logic at the end of kvmppc_read_intr to
 avoid deep indentation, simplified logic in book3s_hv_rmhandlers.S
 since we were always restoring SRR0/1 anyway, get rid of the cached
 array (just use the mapped array), removed the kick_all_cpus_sync()
 call, clear saved_xirr PACA field when we handle the interrupt in
 real mode.]

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_ppc.h  |  3 ++
 arch/powerpc/kvm/book3s_hv.c|  8 -
 arch/powerpc/kvm/book3s_hv_builtin.c| 58 -
 arch/powerpc/kvm/book3s_hv_rm_xics.c| 44 +
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |  6 
 5 files changed, 117 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 4ca2ba3..4299a1f 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -478,6 +478,9 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 
icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xics_ipi_action(void);
+extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
+struct kvmppc_irq_map *irq_map,
+struct kvmppc_passthru_irqmap *pimap);
 extern int h_ipi_redirect;
 #else
 static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index aa11647..175bdab 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3360,9 +3360,15 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
irq_map = &pimap->mapped[i];
 
irq_map->v_hwirq = guest_gsi;
-   irq_map->r_hwirq = desc->irq_data.hwirq;
irq_map->desc = desc;
 
+   /*
+* Order the above two stores before the next to serialize with
+* the KVM real mode handler.
+*/
+   smp_wmb();
+   irq_map->r_hwirq = desc->irq_data.hwirq;
+
if (i == pimap->n_mapped)
pimap->n_mapped++;
 
diff --git a/arch/powerpc/kvm/book3s_hv_builtin.c 
b/arch/powerpc/kvm/book3s_hv_builtin.c
index b476a6a..fdb8aef 100644
--- a/arch/powerpc/kvm/book3s_hv_builtin.c
+++ b/arch/powerpc/kvm/book3s_hv_builtin.c
@@ -288,12 +288,41 @@ void kvmhv_commence_exit(int trap)
 struct kvmppc_host_rm_ops *kvmppc_host_rm_ops_hv;
 EXPORT_SYMBOL_GPL(kvmppc_host_rm_ops_hv);
 
+static struct kvmppc_irq_map *get_irqmap(struct kvmppc_passthru_irqmap *pimap,
+u32 xisr)
+{
+   int i;
+
+   /*
+* We access the mapped array here without a lock.  That
+* is safe because we never reduce the number of entries
+* in the array and we never change the v_hwirq field of
+* an entry once it is set.
+*
+* We have also carefully ordered the stores
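
The archive truncates the diff mid-comment. The reader this comment
describes pairs with the smp_wmb() added to kvmppc_set_passthru_irq()
above: r_hwirq is checked first, and a read barrier orders the caller's
subsequent loads of v_hwirq and desc. A sketch of how the function
plausibly completes:

+	 * and loads so that if we find a matching r_hwirq here, the
+	 * associated v_hwirq and desc fields are valid.
+	 */
+
+	for (i = 0; i < pimap->n_mapped; i++) {
+		if (xisr == pimap->mapped[i].r_hwirq) {
+			/*
+			 * Order the load of r_hwirq above before the
+			 * caller's loads of v_hwirq and desc; pairs with
+			 * the smp_wmb() on the update side.
+			 */
+			smp_rmb();
+			return &pimap->mapped[i];
+		}
+	}
+	return NULL;
+}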

[PATCH 09/13] KVM: PPC: Book3S HV: Dump irqmap in debugfs

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

Dump the passthrough irqmap structure associated with a
guest as part of /sys/kernel/debug/powerpc/kvm-xics-*.

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_xics.c | 17 +
 1 file changed, 17 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_xics.c b/arch/powerpc/kvm/book3s_xics.c
index d528d22..b41f1d3 100644
--- a/arch/powerpc/kvm/book3s_xics.c
+++ b/arch/powerpc/kvm/book3s_xics.c
@@ -893,6 +893,21 @@ EXPORT_SYMBOL_GPL(kvmppc_xics_hcall);
 
 /* -- Initialisation code etc. -- */
 
+static void xics_debugfs_irqmap(struct seq_file *m,
+   struct kvmppc_passthru_irqmap *pimap)
+{
+   int i;
+
+   if (!pimap)
+   return;
+   seq_printf(m, "\nPIRQ mappings: %d maps\n===\n",
+   pimap->n_mapped);
+   for (i = 0; i < pimap->n_mapped; i++)  {
+   seq_printf(m, "r_hwirq=%x, v_hwirq=%x\n",
+   pimap->mapped[i].r_hwirq, pimap->mapped[i].v_hwirq);
+   }
+}
+
 static int xics_debug_show(struct seq_file *m, void *private)
 {
struct kvmppc_xics *xics = m->private;
@@ -914,6 +929,8 @@ static int xics_debug_show(struct seq_file *m, void 
*private)
t_check_resend = 0;
t_reject = 0;
 
+   xics_debugfs_irqmap(m, kvm->arch.pimap);
+
seq_printf(m, "=\nICP state\n=\n");
 
kvm_for_each_vcpu(i, vcpu, kvm) {
-- 
2.8.1



[PATCH 08/13] KVM: PPC: Book3S HV: Complete passthrough interrupt in host

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

In existing real mode ICP code, when updating the virtual ICP
state, if there is a required action that cannot be completely
handled in real mode, as for instance, a VCPU needs to be woken
up, flags are set in the ICP to indicate the required action.
This is checked when returning from hypercalls to decide whether
the call needs to switch back to the host, where the action can be
performed in virtual mode. Note that if h_ipi_redirect is enabled,
real mode code will first try to message a free host CPU to
complete this job instead of returning to the host to do it ourselves.

Currently, the real mode PCI passthrough interrupt handling code
checks if any of these flags are set and simply returns to the host.
This is not good enough as the trap value (0x500) is treated as an
external interrupt by the host code. It is only when the trap value
is a hypercall that the host code searches for and acts on unfinished
work by calling kvmppc_xics_rm_complete.

This patch introduces a special trap BOOK3S_INTERRUPT_HV_RM_HARD
which is returned by KVM if there is unfinished business to be
completed in host virtual mode after handling a PCI passthrough
interrupt. The host checks for this special interrupt condition
and calls into the kvmppc_xics_rm_complete, which is made an
exported function for this reason.

[pau...@ozlabs.org - moved logic to set r12 to BOOK3S_INTERRUPT_HV_RM_HARD
 in book3s_hv_rmhandlers.S into the end of kvmppc_check_wake_reason.]

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_asm.h  | 10 ++
 arch/powerpc/include/asm/kvm_ppc.h  |  3 +++
 arch/powerpc/kvm/book3s_hv.c|  8 +++-
 arch/powerpc/kvm/book3s_hv_builtin.c|  1 +
 arch/powerpc/kvm/book3s_hv_rm_xics.c|  2 +-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 25 +
 arch/powerpc/kvm/book3s_xics.c  |  3 ++-
 7 files changed, 49 insertions(+), 3 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_asm.h 
b/arch/powerpc/include/asm/kvm_asm.h
index 5bca220..05cabed 100644
--- a/arch/powerpc/include/asm/kvm_asm.h
+++ b/arch/powerpc/include/asm/kvm_asm.h
@@ -105,6 +105,15 @@
 #define BOOK3S_INTERRUPT_FAC_UNAVAIL   0xf60
 #define BOOK3S_INTERRUPT_H_FAC_UNAVAIL 0xf80
 
+/* book3s_hv */
+
+/*
+ * Special trap used to indicate to host that this is a
+ * passthrough interrupt that could not be handled
+ * completely in the guest.
+ */
+#define BOOK3S_INTERRUPT_HV_RM_HARD	0x5555
+
#define BOOK3S_IRQPRIO_SYSTEM_RESET	0
#define BOOK3S_IRQPRIO_DATA_SEGMENT	1
#define BOOK3S_IRQPRIO_INST_SEGMENT	2
@@ -136,6 +145,7 @@
 #define RESUME_FLAG_NV  (1<<0)  /* Reload guest nonvolatile state? */
 #define RESUME_FLAG_HOST(1<<1)  /* Resume host? */
 #define RESUME_FLAG_ARCH1  (1<<2)
+#define RESUME_FLAG_ARCH2  (1<<3)
 
#define RESUME_GUEST		0
 #define RESUME_GUEST_NV RESUME_FLAG_NV
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 4299a1f..e0ada31 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -469,6 +469,7 @@ static inline struct kvmppc_passthru_irqmap 
*kvmppc_get_passthru_irqmap(
 extern void kvmppc_alloc_host_rm_ops(void);
 extern void kvmppc_free_host_rm_ops(void);
 extern void kvmppc_free_pimap(struct kvm *kvm);
+extern int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall);
 extern void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu);
 extern int kvmppc_xics_create_icp(struct kvm_vcpu *vcpu, unsigned long server);
 extern int kvm_vm_ioctl_xics_irq(struct kvm *kvm, struct kvm_irq_level *args);
@@ -489,6 +490,8 @@ static inline struct kvmppc_passthru_irqmap 
*kvmppc_get_passthru_irqmap(
 static inline void kvmppc_alloc_host_rm_ops(void) {};
 static inline void kvmppc_free_host_rm_ops(void) {};
 static inline void kvmppc_free_pimap(struct kvm *kvm) {};
+static inline int kvmppc_xics_rm_complete(struct kvm_vcpu *vcpu, u32 hcall)
+   { return 0; }
 static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
{ return 0; }
 static inline void kvmppc_xics_free_icp(struct kvm_vcpu *vcpu) { }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 175bdab..cfddafa 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -73,6 +73,8 @@
 
 /* Used to indicate that a guest page fault needs to be handled */
 #define RESUME_PAGE_FAULT  (RESUME_GUEST | RESUME_FLAG_ARCH1)
+/* Used to indicate that a guest passthrough interrupt needs to be handled */
+#define RESUME_PASSTHROUGH (RESUME_GUEST | RESUME_FLAG_ARCH2)
 
 /* Used as a "null" value for timebase values */
 #define TB_NIL (~(u64)0)
@@ -994,6 +996,9 @@ static int kvmppc_handle_exit_hv(struct kvm_run *run, 
struct kvm_vcpu *vcpu,
kvmppc_core_queue_program(vcpu, SRR1_PROGILL);
r = RESUME_GUEST;
break;
+   case BOOK3S
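
The hunk is truncated here by the archive. Per the description, the exit
handler turns the new special trap into RESUME_PASSTHROUGH; a hedged
sketch of the missing case (illustrative):

	case BOOK3S_INTERRUPT_HV_RM_HARD:
		r = RESUME_PASSTHROUGH;
		break;

and, on the way back out to virtual mode, the host acts on it, roughly:

	else if (r == RESUME_PASSTHROUGH)
		/* Complete unfinished XICS business in virtual mode */
		r = kvmppc_xics_rm_complete(vcpu, 0);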

[PATCH 10/13] KVM: PPC: Book3S HV: Tunable to disable KVM IRQ bypass

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

Add a module parameter, kvm_irq_bypass, for kvm_hv.ko to
disable IRQ bypass for passthrough interrupts. The default
value of this tunable is 1, i.e. the feature is enabled.

Since the tunable is used by built-in kernel code, we use
the module_param_cb macro to achieve this.

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_book3s.h |  1 +
 arch/powerpc/include/asm/kvm_ppc.h|  2 +-
 arch/powerpc/kvm/book3s_hv.c  | 10 ++
 arch/powerpc/kvm/book3s_hv_rm_xics.c  |  2 ++
 4 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s.h 
b/arch/powerpc/include/asm/kvm_book3s.h
index 8f39796..8e5fac6 100644
--- a/arch/powerpc/include/asm/kvm_book3s.h
+++ b/arch/powerpc/include/asm/kvm_book3s.h
@@ -191,6 +191,7 @@ extern void kvmppc_copy_to_svcpu(struct 
kvmppc_book3s_shadow_vcpu *svcpu,
 struct kvm_vcpu *vcpu);
 extern void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
   struct kvmppc_book3s_shadow_vcpu *svcpu);
+extern int kvm_irq_bypass;
 
 static inline struct kvmppc_vcpu_book3s *to_book3s(struct kvm_vcpu *vcpu)
 {
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index e0ada31..97b9bad 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -461,7 +461,7 @@ static inline int kvmppc_xics_enabled(struct kvm_vcpu *vcpu)
 static inline struct kvmppc_passthru_irqmap *kvmppc_get_passthru_irqmap(
struct kvm *kvm)
 {
-   if (kvm)
+   if (kvm && kvm_irq_bypass)
return kvm->arch.pimap;
return NULL;
 }
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index cfddafa..2e71518 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -94,6 +94,10 @@ static struct kernel_param_ops module_param_ops = {
.get = param_get_int,
 };
 
+module_param_cb(kvm_irq_bypass, &module_param_ops, &kvm_irq_bypass,
+   S_IRUGO | S_IWUSR);
+MODULE_PARM_DESC(kvm_irq_bypass, "Bypass passthrough interrupt optimization");
+
 module_param_cb(h_ipi_redirect, &module_param_ops, &h_ipi_redirect,
S_IRUGO | S_IWUSR);
 MODULE_PARM_DESC(h_ipi_redirect, "Redirect H_IPI wakeup to a free host core");
@@ -3313,6 +3317,9 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
struct irq_chip *chip;
int i;
 
+   if (!kvm_irq_bypass)
+   return 1;
+
desc = irq_to_desc(host_irq);
if (!desc)
return -EIO;
@@ -3389,6 +3396,9 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
struct kvmppc_passthru_irqmap *pimap;
int i;
 
+   if (!kvm_irq_bypass)
+   return 0;
+
desc = irq_to_desc(host_irq);
if (!desc)
return -EIO;
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 3b8d7ac..00b9dfde 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -27,6 +27,8 @@
 
 int h_ipi_redirect = 1;
 EXPORT_SYMBOL(h_ipi_redirect);
+int kvm_irq_bypass = 1;
+EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struct kvmppc_xics *xics, struct kvmppc_icp 
*icp,
u32 new_irq);
-- 
2.8.1



[PATCH 11/13] KVM: PPC: Book3S HV: Update irq stats for IRQs handled in real mode

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

When a passthrough IRQ is handled completely within KVM real
mode code, it must also update the IRQ stats, since this path
does not go through the generic IRQ handling code.

However, the per CPU kstat_irqs field is an allocated (not static)
field and so cannot be directly accessed in real mode safely.

The function this_cpu_inc_rm() is introduced to safely increment
per CPU fields (currently coded for unsigned integers only) that
are allocated and could thus be vmalloced also.

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/kvm/book3s_hv_rm_xics.c | 50 
 1 file changed, 50 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 00b9dfde..554cdfa 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -10,6 +10,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include 
 #include 
@@ -18,6 +19,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 
@@ -734,6 +736,53 @@ static void icp_eoi(struct irq_chip *c, u32 hwirq, u32 
xirr)
_stwcix(xics_phys + XICS_XIRR, xirr);
 }
 
+/*
+ * Increment a per-CPU 32-bit unsigned integer variable.
+ * Safe to call in real-mode. Handles vmalloc'ed addresses
+ *
+ * ToDo: Make this work for any integral type
+ */
+
+static inline void this_cpu_inc_rm(unsigned int __percpu *addr)
+{
+   unsigned long l;
+   unsigned int *raddr;
+   int cpu = smp_processor_id();
+
+   raddr = per_cpu_ptr(addr, cpu);
+   l = (unsigned long)raddr;
+
+   if (REGION_ID(l) == VMALLOC_REGION_ID) {
+   l = vmalloc_to_phys(raddr);
+   raddr = (unsigned int *)l;
+   }
+   ++*raddr;
+}
+
+/*
+ * We don't try to update the flags in the irq_desc 'istate' field in
+ * here as would happen in the normal IRQ handling path for several reasons:
+ *  - state flags represent internal IRQ state and are not expected to be
+ *updated outside the IRQ subsystem
+ *  - more importantly, these are useful for edge triggered interrupts,
+ *IRQ probing, etc., but we are only handling MSI/MSIx interrupts here
+ *and these states shouldn't apply to us.
+ *
+ * However, we do update irq_stats - we somewhat duplicate the code in
+ * kstat_incr_irqs_this_cpu() for this since this function is defined
+ * in irq/internal.h which we don't want to include here.
+ * The only difference is that desc->kstat_irqs is an allocated per CPU
+ * variable and could have been vmalloc'ed, so we can't directly
+ * call __this_cpu_inc() on it. The kstat structure is a static
+ * per CPU variable and it should be accessible by real-mode KVM.
+ *
+ */
+static void kvmppc_rm_handle_irq_desc(struct irq_desc *desc)
+{
+   this_cpu_inc_rm(desc->kstat_irqs);
+   __this_cpu_inc(kstat.irqs_sum);
+}
+
 long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
 u32 xirr,
 struct kvmppc_irq_map *irq_map,
@@ -747,6 +796,7 @@ long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu,
xics = vcpu->kvm->arch.xics;
icp = vcpu->arch.icp;
 
+   kvmppc_rm_handle_irq_desc(irq_map->desc);
icp_rm_deliver_irq(xics, icp, irq);
 
/* EOI the interrupt */
-- 
2.8.1



[PATCH 12/13] KVM: PPC: Book3S HV: Set server for passed-through interrupts

2016-08-18 Thread Paul Mackerras
When a guest has a PCI pass-through device with an interrupt, it
will direct the interrupt to a particular guest VCPU.  In fact the
physical interrupt might arrive on any CPU; it is then queued in
the emulated XICS (guest interrupt controller) and eventually
delivered to the target VCPU.

Now that we have code to handle device interrupts in real mode
without exiting to the host kernel, there is an advantage to having
the device interrupt arrive on the same (sub)core as the target
VCPU is running on.  In this situation, the interrupt can be
delivered to the target VCPU without any exit to the host kernel
(using a hypervisor doorbell interrupt between threads if
necessary).

This patch aims to get passed-through device interrupts arriving
on the correct core by setting the interrupt server in the real
hardware XICS for the interrupt to the first thread in the (sub)core
where its target VCPU is running.  We do this in the real-mode H_EOI
code because the H_EOI handler already needs to look at the
emulated ICS state for the interrupt (whereas the H_XIRR handler
doesn't), and we know we are running in the target VCPU context
at that point.

We set the server CPU in hardware using an OPAL call, regardless of
what the IRQ affinity mask for the interrupt says, and without
updating the affinity mask.  This amounts to saying that when an
interrupt is passed through to a guest, as a matter of policy we
allow the guest's affinity for the interrupt to override the host's.

This is inspired by an earlier patch from Suresh Warrier, although
none of this code came from that earlier patch.

Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_ppc.h |  4 +++
 arch/powerpc/include/asm/opal.h|  1 +
 arch/powerpc/kvm/book3s_hv.c   |  4 +++
 arch/powerpc/kvm/book3s_hv_rm_xics.c   | 16 
 arch/powerpc/kvm/book3s_xics.c | 35 ++
 arch/powerpc/kvm/book3s_xics.h |  2 ++
 arch/powerpc/platforms/powernv/opal-wrappers.S |  1 +
 7 files changed, 63 insertions(+)

diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 97b9bad..f6e4964 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -479,6 +479,10 @@ extern int kvmppc_xics_set_icp(struct kvm_vcpu *vcpu, u64 
icpval);
 extern int kvmppc_xics_connect_vcpu(struct kvm_device *dev,
struct kvm_vcpu *vcpu, u32 cpu);
 extern void kvmppc_xics_ipi_action(void);
+extern void kvmppc_xics_set_mapped(struct kvm *kvm, unsigned long guest_irq,
+  unsigned long host_irq);
+extern void kvmppc_xics_clr_mapped(struct kvm *kvm, unsigned long guest_irq,
+  unsigned long host_irq);
 extern long kvmppc_deliver_irq_passthru(struct kvm_vcpu *vcpu, u32 xirr,
 struct kvmppc_irq_map *irq_map,
 struct kvmppc_passthru_irqmap *pimap);
diff --git a/arch/powerpc/include/asm/opal.h b/arch/powerpc/include/asm/opal.h
index ee05bd2..e958b70 100644
--- a/arch/powerpc/include/asm/opal.h
+++ b/arch/powerpc/include/asm/opal.h
@@ -67,6 +67,7 @@ int64_t opal_pci_config_write_half_word(uint64_t phb_id, 
uint64_t bus_dev_func,
 int64_t opal_pci_config_write_word(uint64_t phb_id, uint64_t bus_dev_func,
   uint64_t offset, uint32_t data);
 int64_t opal_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
+int64_t opal_rm_set_xive(uint32_t isn, uint16_t server, uint8_t priority);
 int64_t opal_get_xive(uint32_t isn, __be16 *server, uint8_t *priority);
 int64_t opal_register_exception_handler(uint64_t opal_exception,
uint64_t handler_address,
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index 2e71518..b969abc 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -3385,6 +3385,8 @@ static int kvmppc_set_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
if (i == pimap->n_mapped)
pimap->n_mapped++;
 
+   kvmppc_xics_set_mapped(kvm, guest_gsi, desc->irq_data.hwirq);
+
mutex_unlock(&kvm->lock);
 
return 0;
@@ -3421,6 +3423,8 @@ static int kvmppc_clr_passthru_irq(struct kvm *kvm, int 
host_irq, int guest_gsi)
return -ENODEV;
}
 
+   kvmppc_xics_clr_mapped(kvm, guest_gsi, pimap->mapped[i].r_hwirq);
+
/* invalidate the entry */
pimap->mapped[i].r_hwirq = 0;
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 554cdfa..5f7527e 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -22,6 +22,7 @@
 #include 
 #include 
 #include 
+#include 
 
 #include "book3s_xics.h"
 
@@ -34,6 +35,7 @@ EXPORT_SYMBOL(kvm_irq_bypass);
 
 static void icp_rm_deliver_irq(struc
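
The archive truncates the diff here. Given the opal_rm_set_xive()
prototype added above and the xics_opal_rm_set_server() call visible in
the H_EOI hunk of patch 13/13 below, the server-setting helper plausibly
looks like this (a sketch; the << 2 mangling of the server number is an
assumption about the XICS server encoding):

static void xics_opal_rm_set_server(unsigned int hw_irq, int server_cpu)
{
	unsigned int mangle_cpu = get_hard_smp_processor_id(server_cpu) << 2;

	opal_rm_set_xive(hw_irq, mangle_cpu, DEFAULT_PRIORITY);
}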

[PATCH 13/13] KVM: PPC: Book3S HV: Counters for passthrough IRQ stats

2016-08-18 Thread Paul Mackerras
From: Suresh Warrier 

Add VCPU stat counters to track affinity for passthrough
interrupts.

pthru_all: Counts all passthrough interrupts whose IRQ mappings are
   in the kvmppc_passthru_irq_map structure.
pthru_host: Counts all cached passthrough interrupts that were injected
from the host through kvm_set_irq (i.e. not handled in
real mode).
pthru_bad_aff: Counts how many cached passthrough interrupts have
   bad affinity (receiving CPU is not running VCPU that is
   the target of the virtual interrupt in the guest).

Signed-off-by: Suresh Warrier 
Signed-off-by: Paul Mackerras 
---
 arch/powerpc/include/asm/kvm_host.h  |  3 +++
 arch/powerpc/kvm/book3s.c|  3 +++
 arch/powerpc/kvm/book3s_hv_rm_xics.c | 18 +-
 3 files changed, 19 insertions(+), 5 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 3eb5092..f371a23 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -131,6 +131,9 @@ struct kvm_vcpu_stat {
u32 ld_slow;
u32 st_slow;
 #endif
+   u32 pthru_all;
+   u32 pthru_host;
+   u32 pthru_bad_aff;
 };
 
 enum kvm_exit_types {
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 47018fc..6d0c45b 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -64,6 +64,9 @@ struct kvm_stats_debugfs_item debugfs_entries[] = {
{ "ld_slow", VCPU_STAT(ld_slow) },
{ "st",  VCPU_STAT(st) },
{ "st_slow", VCPU_STAT(st_slow) },
+   { "pthru_all",   VCPU_STAT(pthru_all) },
+   { "pthru_host",  VCPU_STAT(pthru_host) },
+   { "pthru_bad_aff",   VCPU_STAT(pthru_bad_aff) },
{ NULL }
 };
 
diff --git a/arch/powerpc/kvm/book3s_hv_rm_xics.c 
b/arch/powerpc/kvm/book3s_hv_rm_xics.c
index 5f7527e..82ff5de 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_xics.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_xics.c
@@ -716,11 +716,19 @@ int kvmppc_rm_h_eoi(struct kvm_vcpu *vcpu, unsigned long 
xirr)
icp->rm_eoied_irq = irq;
}
 
-   if (state->host_irq && state->intr_cpu != -1) {
-   int pcpu = cpu_first_thread_sibling(raw_smp_processor_id());
-   if (state->intr_cpu != pcpu)
-   xics_opal_rm_set_server(state->host_irq, pcpu);
-   state->intr_cpu = -1;
+   if (state->host_irq) {
+   ++vcpu->stat.pthru_all;
+   if (state->intr_cpu != -1) {
+   int pcpu = raw_smp_processor_id();
+
+   pcpu = cpu_first_thread_sibling(pcpu);
+   ++vcpu->stat.pthru_host;
+   if (state->intr_cpu != pcpu) {
+   ++vcpu->stat.pthru_bad_aff;
+   xics_opal_rm_set_server(state->host_irq, pcpu);
+   }
+   state->intr_cpu = -1;
+   }
}
  bail:
return check_too_hard(xics, icp);
-- 
2.8.1



Re: [PATCH v4 0/3] perf annotate: Enable cross arch annotate

2016-08-18 Thread Ravi Bangoria
I've sent the v5 series for this. Please review it.

Thanks,
Ravi

On Wednesday 13 July 2016 03:15 PM, Ravi Bangoria wrote:
> Arnaldo, Michael,
>
> I've tested this patchset on ppc64 BE and LE both. Please review this.
>
> -Ravi
>
> On Friday 08 July 2016 10:10 AM, Ravi Bangoria wrote:
>> Perf can currently only support code navigation (branches and calls) in
>> annotate when run on the same architecture where perf.data was recorded.
>> But cross arch annotate is not supported.
>>
>> This patchset enables cross arch annotate. Currently I've used x86
>> and arm instructions which are already available and adding support
>> for powerpc as well. Adding support for other arch will be easy.
>>
>> I've created this patch on top of acme/perf/core. And tested it with
>> x86 and powerpc only.
>>
>> Note for arm:
>> A few instructions were defined under #if __arm__, which I've used as a
>> table for arm. But I'm not sure whether instructions defined outside of
>> that also include arm instructions. Apart from that, 'call__parse()'
>> and 'move__parse()' contain an #ifdef __arm__ directive. I've changed it
>> to if (!strcmp(norm_arch, arm)). I don't have an arm machine to test
>> these changes.
>>
>> Example:
>>
>>Record on powerpc:
>>$ ./perf record -a
>>
>>Report -> Annotate on x86:
>>$ ./perf report -i perf.data.powerpc --vmlinux vmlinux.powerpc
>>
>> Changes in v4:
>>    - powerpc: Added support for branch instructions that include 'ctr'
>>    - __maybe_unused was misplaced at a few locations. Corrected it.
>>    - Moved the position of v3's last patch, which defines a macro for each arch name
>>
>> v3 link: https://lkml.org/lkml/2016/6/30/99
>>
>> Naveen N. Rao (1):
>>perf annotate: add powerpc support
>>
>> Ravi Bangoria (2):
>>perf: Define macro for normalized arch names
>>perf annotate: Enable cross arch annotate
>>
>>   tools/perf/arch/common.c           |  36 ++---
>>   tools/perf/arch/common.h           |  11 ++
>>   tools/perf/builtin-top.c           |   2 +-
>>   tools/perf/ui/browsers/annotate.c  |   3 +-
>>   tools/perf/ui/gtk/annotate.c       |   2 +-
>>   tools/perf/util/annotate.c         | 273 ++---
>>   tools/perf/util/annotate.h         |   6 +-
>>   tools/perf/util/unwind-libunwind.c |   4 +-
>>   8 files changed, 265 insertions(+), 72 deletions(-)
>>
>> -- 
>> 2.5.5
>>
>
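
The table-driven selection Ravi describes, keying the instruction table on
a normalized arch name instead of compile-time #ifdefs, might look roughly
like the sketch below. The ins_table() helper, the table contents and the
arch names here are illustrative assumptions, not the actual patch:

/* Sketch of arch-keyed instruction table selection (names assumed). */
#include <stddef.h>
#include <string.h>

struct ins {
        const char *name;       /* mnemonic, e.g. "bl" or "callq" */
        int is_call;            /* treat as a call for code navigation */
};

/* Hypothetical per-arch tables; the real ones live in perf's sources. */
static struct ins powerpc_ins[] = {
        { "bl",   1 },
        { "bctr", 0 },
};

static struct ins x86_ins[] = {
        { "callq", 1 },
        { "jmp",   0 },
};

static struct ins *ins_table(const char *norm_arch, size_t *nmemb)
{
        if (!strcmp(norm_arch, "powerpc")) {
                *nmemb = sizeof(powerpc_ins) / sizeof(powerpc_ins[0]);
                return powerpc_ins;
        }
        if (!strcmp(norm_arch, "x86")) {
                *nmemb = sizeof(x86_ins) / sizeof(x86_ins[0]);
                return x86_ins;
        }
        *nmemb = 0;
        return NULL;            /* unknown arch: no navigation hints */
}

int main(void)
{
        size_t n;

        /* Pick the table for the recorded perf.data's architecture. */
        return ins_table("powerpc", &n) ? 0 : 1;
}

The point of the runtime lookup is exactly what the cover letter states:
the table is chosen by the architecture of the recorded perf.data, not by
the architecture perf was compiled for.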



Re: [PATCH v3 02/21] powerpc: Always restore FPU/VEC/VSX if hardware transactional memory in use

2016-08-18 Thread Michael Neuling
On Wed, 2016-08-17 at 13:43 +1000, Cyril Bur wrote:
> Comment from arch/powerpc/kernel/process.c:967:
>  If userspace is inside a transaction (whether active or
>  suspended) and FP/VMX/VSX instructions have ever been enabled
>  inside that transaction, then we have to keep them enabled
>  and keep the FP/VMX/VSX state loaded while ever the transaction
>  continues.  The reason is that if we didn't, and subsequently
>  got a FP/VMX/VSX unavailable interrupt inside a transaction,
>  we don't know whether it's the same transaction, and thus we
> don't know which of the checkpointed state and the transactional
>  state to use.
> 
> restore_math(), restore_fp() and restore_altivec() currently may not
> restore the registers. It doesn't appear that this is more serious
> than a performance penalty. If the math registers aren't restored the
> userspace thread will still be run with the facility disabled.
> Userspace will not be able to read invalid values. On the first access
> it will take a facility unavailable exception and the kernel will
> detect an active transaction, at which point it will abort the
> transaction. There is the possibility of a pathological case
> preventing any progress by transactions; however, transactions
> are never guaranteed to make progress.
> 
> Fixes: 70fe3d9 ("powerpc: Restore FPU/VEC/VSX if previously used")
> Signed-off-by: Cyril Bur 
> ---
>  arch/powerpc/kernel/process.c | 21 ++---
>  1 file changed, 18 insertions(+), 3 deletions(-)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 58ccf86..cdf2d20 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -88,7 +88,13 @@ static void check_if_tm_restore_required(struct task_struct *tsk)
>   set_thread_flag(TIF_RESTORE_TM);
>   }
>  }
> +
> +static inline bool msr_tm_active(unsigned long msr)
> +{
> + return MSR_TM_ACTIVE(msr);
> +}

I'm not sure what value this function is adding.  MSR_TM_ACTIVE() is used
in a lot of other places and is well known, so I'd prefer to just keep
using it rather than adding some other abstraction that others have to
learn.

Other than that, the patch seems good.  

Mikey

>  #else
> +static inline bool msr_tm_active(unsigned long msr) { return false; }
>  static inline void check_if_tm_restore_required(struct task_struct *tsk) { }
>  #endif /* CONFIG_PPC_TRANSACTIONAL_MEM */
>  
> @@ -208,7 +214,7 @@ void enable_kernel_fp(void)
>  EXPORT_SYMBOL(enable_kernel_fp);
>  
>  static int restore_fp(struct task_struct *tsk)
>  {
> - if (tsk->thread.load_fp) {
> + if (tsk->thread.load_fp || msr_tm_active(tsk->thread.regs->msr)) {
>   load_fp_state(¤t->thread.fp_state);
>   current->thread.load_fp++;
>   return 1;
> @@ -278,7 +284,8 @@ EXPORT_SYMBOL_GPL(flush_altivec_to_thread);
>  
>  static int restore_altivec(struct task_struct *tsk)
>  {
> - if (cpu_has_feature(CPU_FTR_ALTIVEC) && tsk->thread.load_vec) {
> + if (cpu_has_feature(CPU_FTR_ALTIVEC) &&
> + (tsk->thread.load_vec || msr_tm_active(tsk->thread.regs->msr))) {
>   load_vr_state(&tsk->thread.vr_state);
>   tsk->thread.used_vr = 1;
>   tsk->thread.load_vec++;
> @@ -464,7 +471,8 @@ void restore_math(struct pt_regs *regs)
>  {
>   unsigned long msr;
>  
> - if (!current->thread.load_fp && !loadvec(current->thread))
> + if (!msr_tm_active(regs->msr) &&
> + !current->thread.load_fp && !loadvec(current->thread))
>   return;
>  
>   msr = regs->msr;
> @@ -983,6 +991,13 @@ void restore_tm_state(struct pt_regs *regs)
>   msr_diff = current->thread.ckpt_regs.msr & ~regs->msr;
>   msr_diff &= MSR_FP | MSR_VEC | MSR_VSX;
>  
> + /* Ensure that restore_math() will restore */
> + if (msr_diff & MSR_FP)
> + current->thread.load_fp = 1;
> +#ifdef CONFIG_ALTIVEC
> + if (cpu_has_feature(CPU_FTR_ALTIVEC) && msr_diff & MSR_VEC)
> + current->thread.load_vec = 1;
> +#endif
>   restore_math(regs);
>  
>   regs->msr |= msr_diff;
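
For readers unfamiliar with the macro Mikey prefers: MSR_TM_ACTIVE() tests
the MSR transaction-state (TS) field, which covers both the fully
transactional and the suspended states. A paraphrased sketch follows; the
authoritative definitions live in arch/powerpc/include/asm/reg.h and the
bit numbers below should be checked there:

/* Paraphrased sketch of the TM state bits in the 64-bit MSR;
 * verify against arch/powerpc/include/asm/reg.h. */
#define __MASK(X)       (1UL << (X))
#define MSR_TS_S_LG     33                      /* TS = suspended */
#define MSR_TS_T_LG     34                      /* TS = transactional */
#define MSR_TS_S        __MASK(MSR_TS_S_LG)
#define MSR_TS_T        __MASK(MSR_TS_T_LG)
#define MSR_TS_MASK     (MSR_TS_T | MSR_TS_S)
/* "Active" means either transactional or suspended. */
#define MSR_TM_ACTIVE(x) (((x) & MSR_TS_MASK) != 0)

This is why the patch checks msr_tm_active() in the restore paths: a
suspended transaction still needs the checkpointed facility state kept
live, exactly as the process.c comment quoted above explains.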


Re: [PATCH v3 03/21] powerpc: Add check_if_tm_restore_required() to giveup_all()

2016-08-18 Thread Michael Neuling
On Wed, 2016-08-17 at 13:43 +1000, Cyril Bur wrote:
> giveup_all() causes FPU/VMX/VSX facilitities to be disabled in a

facilities.

> thread's MSR. If this thread was transactional, this should be recorded
> as reclaiming/recheckpointing code will need to know.

Can you expand on this?  It's not clear to me how this relates to the code.

Mikey

> Fixes: c208505 ("powerpc: create giveup_all()")
> Signed-off-by: Cyril Bur 
> ---
>  arch/powerpc/kernel/process.c | 1 +
>  1 file changed, 1 insertion(+)
> 
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index cdf2d20..82308fd 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -445,6 +445,7 @@ void giveup_all(struct task_struct *tsk)
>   return;
>  
>   msr_check_and_set(msr_all_available);
> + check_if_tm_restore_required(tsk);
>  
>  #ifdef CONFIG_PPC_FPU
>   if (usermsr & MSR_FP)


Re: [PATCH v3 04/21] powerpc: Return the new MSR from msr_check_and_set()

2016-08-18 Thread Michael Neuling
On Wed, 2016-08-17 at 13:43 +1000, Cyril Bur wrote:
> mfmsr() is a fairly expensive call and callers of msr_check_and_set()
> may want to make decisions bits in the MSR that it did not change but
> may not know the value of.

I can't grok this.  Please reword.

Mikey

> This patch would avoid two calls to mfmsr().
> 
> Signed-off-by: Cyril Bur 
> ---
>  arch/powerpc/include/asm/reg.h | 2 +-
>  arch/powerpc/kernel/process.c  | 4 +++-
>  2 files changed, 4 insertions(+), 2 deletions(-)
> 
> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
> index f69f40f..0a3dde9 100644
> --- a/arch/powerpc/include/asm/reg.h
> +++ b/arch/powerpc/include/asm/reg.h
> @@ -1247,7 +1247,7 @@ static inline void mtmsr_isync(unsigned long val)
>    : "memory")
>  #endif
>  
> -extern void msr_check_and_set(unsigned long bits);
> +extern unsigned long msr_check_and_set(unsigned long bits);
>  extern bool strict_msr_control;
>  extern void __msr_check_and_clear(unsigned long bits);
>  static inline void msr_check_and_clear(unsigned long bits)
> diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
> index 82308fd..c42581b 100644
> --- a/arch/powerpc/kernel/process.c
> +++ b/arch/powerpc/kernel/process.c
> @@ -110,7 +110,7 @@ static int __init enable_strict_msr_control(char *str)
>  }
>  early_param("ppc_strict_facility_enable", enable_strict_msr_control);
>  
> -void msr_check_and_set(unsigned long bits)
> +unsigned long msr_check_and_set(unsigned long bits)
>  {
>   unsigned long oldmsr = mfmsr();
>   unsigned long newmsr;
> @@ -124,6 +124,8 @@ void msr_check_and_set(unsigned long bits)
>  
>   if (oldmsr != newmsr)
>   mtmsr_isync(newmsr);
> +
> + return newmsr;
>  }
>  
>  void __msr_check_and_clear(unsigned long bits)
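
Restating the intent of the change as a caller sketch: with the MSR value
returned by msr_check_and_set(), a caller can test bits it did not change
without issuing a second mfmsr(). The caller below is illustrative only
and not part of the patch:

/* Illustrative kernel-context caller (not from the patch): reuse the
 * returned MSR snapshot instead of re-reading it with mfmsr(). */
static void example_enable_fp(void)
{
        unsigned long msr = msr_check_and_set(MSR_FP);

        /* MSR_VEC was not changed above, but its current state is now
         * known without a second mfmsr(). */
        if (msr & MSR_VEC)
                ;       /* VMX already enabled; skip redundant setup */
}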