[PATCH 1/1 v2] powerpc: Correct DSCR during TM context switch
Correct the DSCR SPR becoming temporarily corrupted if a task is context switched during a transaction. The problem occurs while suspending the task and is caused by saving the DSCR to thread.dscr after it has already been set to the CPU's default value: __switch_to() calls __switch_to_tm() which calls tm_reclaim_task() which calls tm_reclaim_thread() which calls tm_reclaim() where the DSCR is set to the CPU's default __switch_to() calls _switch() where thread.dscr is set to the DSCR When the task is resumed, it's transaction will be doomed (as usual) and the DSCR SPR will be corrupted, although the checkpointed value will be correct. Therefore the DSCR will be immediately corrected by the transaction aborting, unless it has been suspended. In that case the incorrect value can be seen by the task until it resumes the transaction. The fix is to treat the DSCR similarly to the TAR and save it early in __switch_to(). A program exposing the problem is added to the kernel self tests as: tools/testing/selftests/powerpc/tm/tm-resched-dscr. Signed-off-by: Sam Bobroff sam.bobr...@au1.ibm.com --- Changes: v2: * Reworked commit message. * Adjusted test code and added it to kernel self tests. 
--- arch/powerpc/include/asm/switch_to.h |6 +- arch/powerpc/kernel/entry_64.S |6 -- arch/powerpc/kernel/process.c |8 +- tools/testing/selftests/powerpc/Makefile |2 +- tools/testing/selftests/powerpc/tm/Makefile| 15 .../testing/selftests/powerpc/tm/tm-resched-dscr.c | 90 6 files changed, 114 insertions(+), 13 deletions(-) create mode 100644 tools/testing/selftests/powerpc/tm/Makefile create mode 100644 tools/testing/selftests/powerpc/tm/tm-resched-dscr.c diff --git a/arch/powerpc/include/asm/switch_to.h b/arch/powerpc/include/asm/switch_to.h index 2737f46..3efd0e5 100644 --- a/arch/powerpc/include/asm/switch_to.h +++ b/arch/powerpc/include/asm/switch_to.h @@ -16,13 +16,15 @@ struct thread_struct; extern struct task_struct *_switch(struct thread_struct *prev, struct thread_struct *next); #ifdef CONFIG_PPC_BOOK3S_64 -static inline void save_tar(struct thread_struct *prev) +static inline void save_early_sprs(struct thread_struct *prev) { if (cpu_has_feature(CPU_FTR_ARCH_207S)) prev-tar = mfspr(SPRN_TAR); + if (cpu_has_feature(CPU_FTR_DSCR)) + prev-dscr = mfspr(SPRN_DSCR); } #else -static inline void save_tar(struct thread_struct *prev) {} +static inline void save_early_sprs(struct thread_struct *prev) {} #endif extern void enable_kernel_fp(void); diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S index 662c6dd..a107f4a 100644 --- a/arch/powerpc/kernel/entry_64.S +++ b/arch/powerpc/kernel/entry_64.S @@ -432,12 +432,6 @@ BEGIN_FTR_SECTION std r24,THREAD_VRSAVE(r3) END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC) #endif /* CONFIG_ALTIVEC */ -#ifdef CONFIG_PPC64 -BEGIN_FTR_SECTION - mfspr r25,SPRN_DSCR - std r25,THREAD_DSCR(r3) -END_FTR_SECTION_IFSET(CPU_FTR_DSCR) -#endif and.r0,r0,r22 beq+1f andcr22,r22,r0 diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c index e247898..8d2065e 100644 --- a/arch/powerpc/kernel/process.c +++ b/arch/powerpc/kernel/process.c @@ -771,15 +771,15 @@ struct task_struct *__switch_to(struct task_struct 
*prev, WARN_ON(!irqs_disabled()); - /* Back up the TAR across context switches. + /* Back up the TAR and DSCR across context switches. * Note that the TAR is not available for use in the kernel. (To * provide this, the TAR should be backed up/restored on exception * entry/exit instead, and be in pt_regs. FIXME, this should be in * pt_regs anyway (for debug).) -* Save the TAR here before we do treclaim/trecheckpoint as these -* will change the TAR. +* Save the TAR and DSCR here before we do treclaim/trecheckpoint as +* these will change them. */ - save_tar(prev-thread); + save_early_sprs(prev-thread); __switch_to_tm(prev); diff --git a/tools/testing/selftests/powerpc/Makefile b/tools/testing/selftests/powerpc/Makefile index 316194f..e1544e8 100644 --- a/tools/testing/selftests/powerpc/Makefile +++ b/tools/testing/selftests/powerpc/Makefile @@ -13,7 +13,7 @@ CFLAGS := -Wall -O2 -flto -Wall -Werror -DGIT_VERSION='$(GIT_VERSION)' -I$(CUR export CC CFLAGS -TARGETS = pmu copyloops +TARGETS = pmu copyloops tm endif diff --git a/tools/testing/selftests/powerpc/tm/Makefile b/tools/testing/selftests/powerpc/tm/Makefile new file mode 100644 index 000..51267f4 --- /dev/null +++ b/tools/testing/selftests/powerpc/tm/Makefile @@ -0,0 +1,15 @@ +PROGS := tm-resched-dscr + +all: $(PROGS) + +$(PROGS): +
[PATCH v8 1/3] powerpc/eeh: Avoid event on passed PE
We must not handle EEH error on devices which are passed to somebody else. Instead, we expect that the frozen device owner detects an EEH error and recovers from it. This avoids EEH error handling on passed through devices so the device owner gets a chance to handle them. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/eeh.h| 7 +++ arch/powerpc/kernel/eeh.c | 8 arch/powerpc/platforms/powernv/eeh-ioda.c | 3 ++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 7782056..653d981 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -25,6 +25,7 @@ #include linux/list.h #include linux/string.h #include linux/time.h +#include linux/atomic.h struct pci_dev; struct pci_bus; @@ -84,6 +85,7 @@ struct eeh_pe { int freeze_count; /* Times of froze up*/ struct timeval tstamp; /* Time on first-time freeze*/ int false_positives;/* Times of reported #ff's */ + atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE*/ struct list_head child_list;/* Link PE to the child list*/ struct list_head edevs; /* Link list of EEH devices */ @@ -93,6 +95,11 @@ struct eeh_pe { #define eeh_pe_for_each_dev(pe, edev, tmp) \ list_for_each_entry_safe(edev, tmp, pe-edevs, list) +static inline bool eeh_pe_passed(struct eeh_pe *pe) +{ + return pe ? !!atomic_read(pe-pass_dev_cnt) : false; +} + /* * The struct is used to trace EEH state for the associated * PCI device node or PCI device. In future, it might diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 9c6b899..3bc8b12 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -400,6 +400,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev) if (ret 0) return ret; + /* +* If the PE isn't owned by us, we shouldn't check the +* state. Instead, let the owner handle it if the PE has +* been frozen. 
+*/ + if (eeh_pe_passed(pe)) + return 0; + /* If we already have a pending isolation event for this * slot, we know it's bad already, we don't need to check. * Do this checking under a lock; as multiple PCI devices diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index cab3e62..79193eb 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -892,7 +892,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) opal_pci_eeh_freeze_clear(phb-opal_id, frozen_pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); ret = EEH_NEXT_ERR_NONE; - } else if ((*pe)-state EEH_PE_ISOLATED) { + } else if ((*pe)-state EEH_PE_ISOLATED || + eeh_pe_passed(*pe)) { ret = EEH_NEXT_ERR_NONE; } else { pr_err(EEH: Frozen PHB#%x-PE#%x (%s) detected\n, -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v8 2/3] powerpc/eeh: EEH support for VFIO PCI device
The patch exports functions to be used by new VFIO ioctl command, which will be introduced in subsequent patch, to support EEH functinality for VFIO PCI devices. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- arch/powerpc/include/asm/eeh.h | 14 +++ arch/powerpc/kernel/eeh.c | 268 + 2 files changed, 282 insertions(+) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 653d981..5b4cc4e 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -166,6 +166,8 @@ enum { #define EEH_OPT_ENABLE 1 /* EEH enable */ #define EEH_OPT_THAW_MMIO 2 /* MMIO enable */ #define EEH_OPT_THAW_DMA 3 /* DMA enable */ +#define EEH_OPT_GET_PE_ADDR0 /* Get PE addr */ +#define EEH_OPT_GET_PE_MODE1 /* Get PE mode */ #define EEH_STATE_UNAVAILABLE (1 0)/* State unavailable*/ #define EEH_STATE_NOT_SUPPORT (1 1)/* EEH not supported*/ #define EEH_STATE_RESET_ACTIVE (1 2)/* Active reset */ @@ -173,6 +175,11 @@ enum { #define EEH_STATE_DMA_ACTIVE (1 4)/* Active DMA */ #define EEH_STATE_MMIO_ENABLED (1 5)/* MMIO enabled */ #define EEH_STATE_DMA_ENABLED (1 6)/* DMA enabled */ +#define EEH_PE_STATE_NORMAL0 /* Normal state */ +#define EEH_PE_STATE_RESET 1 /* PE reset asserted*/ +#define EEH_PE_STATE_STOPPED_IO_DMA2 /* Frozen PE*/ +#define EEH_PE_STATE_STOPPED_DMA 4 /* Stopped DMA, Enabled IO */ +#define EEH_PE_STATE_UNAVAIL 5 /* Unavailable */ #define EEH_RESET_DEACTIVATE 0 /* Deactivate the PE reset */ #define EEH_RESET_HOT 1 /* Hot reset*/ #define EEH_RESET_FUNDAMENTAL 3 /* Fundamental reset*/ @@ -280,6 +287,13 @@ void eeh_add_device_late(struct pci_dev *); void eeh_add_device_tree_late(struct pci_bus *); void eeh_add_sysfs_files(struct pci_bus *); void eeh_remove_device(struct pci_dev *); +int eeh_dev_open(struct pci_dev *pdev); +void eeh_dev_release(struct pci_dev *pdev); +struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group); +int eeh_pe_set_option(struct eeh_pe *pe, int option); +int eeh_pe_get_state(struct eeh_pe *pe); 
+int eeh_pe_reset(struct eeh_pe *pe, int option); +int eeh_pe_configure(struct eeh_pe *pe); /** * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure. diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 3bc8b12..fc90df0 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -40,6 +40,7 @@ #include asm/eeh.h #include asm/eeh_event.h #include asm/io.h +#include asm/iommu.h #include asm/machdep.h #include asm/ppc-pci.h #include asm/rtas.h @@ -108,6 +109,9 @@ struct eeh_ops *eeh_ops = NULL; /* Lock to avoid races due to multiple reports of an error */ DEFINE_RAW_SPINLOCK(confirm_error_lock); +/* Lock to protect passed flags */ +static DEFINE_MUTEX(eeh_dev_mutex); + /* Buffer for reporting pci register dumps. Its here in BSS, and * not dynamically alloced, so that it ends up in RMO where RTAS * can access it. @@ -1106,6 +1110,270 @@ void eeh_remove_device(struct pci_dev *dev) edev-mode = ~EEH_DEV_SYSFS; } +/** + * eeh_dev_open - Increase count of pass through devices for PE + * @pdev: PCI device + * + * Increase count of passed through devices for the indicated + * PE. In the result, the EEH errors detected on the PE won't be + * reported. The PE owner will be responsible for detection + * and recovery. + */ +int eeh_dev_open(struct pci_dev *pdev) +{ + struct eeh_dev *edev; + + mutex_lock(eeh_dev_mutex); + + /* No PCI device ? */ + if (!pdev) + goto out; + + /* No EEH device or PE ? */ + edev = pci_dev_to_eeh_dev(pdev); + if (!edev || !edev-pe) + goto out; + + /* Increase PE's pass through count */ + atomic_inc(edev-pe-pass_dev_cnt); + mutex_unlock(eeh_dev_mutex); + + return 0; +out: + mutex_unlock(eeh_dev_mutex); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(eeh_dev_open); + +/** + * eeh_dev_release - Decrease count of pass through devices for PE + * @pdev: PCI device + * + * Decrease count of pass through devices for the indicated PE. 
If + * there is no passed through device in PE, the EEH errors detected + * on the PE will be reported and handled as usual. + */ +void eeh_dev_release(struct pci_dev *pdev) +{ + struct eeh_dev *edev; + + mutex_lock(eeh_dev_mutex); + + /* No PCI device ? */ + if (!pdev) + goto out; + + /* No EEH device ? */ + edev = pci_dev_to_eeh_dev(pdev); + if (!edev || !edev-pe || !eeh_pe_passed(edev-pe)) + goto out; + + /* Decrease PE's pass through count */ +
[PATCH v8 0/3] EEH Support for VFIO PCI Device
The series of patches adds support EEH for PCI devices, which are passed through to PowerKVM based guest via VFIO. The implementation is straightforward based on the issues or problems we have to resolve to support EEH for PowerKVM based guest. - Emulation for EEH RTAS requests. All EEH RTAS requests goes to QEMU firstly. If QEMU can't handle it, the request will be sent to host via newly introduced VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in host kernel. The series of patches requires corresponding QEMU changes. Change log == v1 - v2: * EEH RTAS requests are routed to QEMU, and then possiblly to host kerenl. The mechanism KVM in-kernel handling is dropped. * Error injection is reimplemented based syscall, instead of KVM in-kerenl handling. The logic for error injection token management is moved to QEMU. The error injection request is routed to QEMU and then possiblly to host kernel. v2 - v3: * Make the fields in struct eeh_vfio_pci_addr, struct vfio_eeh_info based on the comments from Alexey. * Define macros for EEH VFIO operations (Alexey). * Clear frozen state after successful PE reset. * Merge original [PATCH 1/2/3] to one. v3 - v4: * Remove the error injection from the patchset. Mike or I will work on that later. * Rename CONFIG_VFIO_EEH to VFIO_PCI_EEH. * Rename the IOCTL command to VFIO_EEH_OP and it's handled by VFIO-PCI device instead of VFIO container. * Rename the IOCTL argument structure to vfio_eeh_op accordingly. Also, more fields added to hold return values for RTAS requests. * The address mapping stuff is totally removed. When opening or releasing VFIO PCI device, notification sent to EEH to update the flags indicates the device is passed to guest or not. * Change pr_warn() to pr_debug() to avoid DOS as pointed by Alex.W * Argument size check issue pointed by Alex.W. v4 - v5: * Functions for VFIO PCI EEH support are moved to eeh.c and exported from there. 
VFIO PCI driver just uses those functions to tackle IOCTL command VFIO_EEH_OP. All of this is to make the code organized in a good way as suggested by Alex.G. Another potential benefit is PowerNV/pSeries are sharing eeh_ops and same infrastructure could possiblly work for KVM_PR and KVM_HV mode at the same time. * Don't clear error injection registers after finishing PE reset as the patchset is doing nothing related to error injection. * Amending Documentation/vfio.txt, which was missed in last revision. * No QEMU changes for this revision. v4 works well. Also, remove RFC from the subject as the design is basically recognized. v5 - v6: * CONFIG_VFIO_PCI_EEH removed. Instead to use CONFIG_EEH. * Split one ioctl command to 5. * In eeh.c, description has been added for those exported functions. Also, the functions have negative return values for error and information with other values. All digital numbers have been replaced by macros defined in eeh.h. The comments, including the function names have been amended not to mention guest or vfio. * Add one mutex to protect flag in eeh_dev_open()/release(). * More information on how to use those ioctl commands to Documentation/vfio.txt. v6 - v7: * Remove ioctl command VFIO_EEH_PE_GET_ADDR, the PE address will be figured out in userland (e.g. QEMU) as Alex.G suggested. * Let sPAPR VFIO container process the ioctl commands as VFIO container is naturally corresponds to IOMMU group (aka PE on sPAPR platform). * All VFIO PCI EEH ioctl commands have argsz+flags for its companion data struct. * For VFIO PCI EEH ioctl commands, ioctl() returns negative number to indicate error or zero for success. Additinal output information is transported by the companion data struct. * Explaining PE in Documentation/vfio.txt, typo fixes, more comments suggested by Alex.G. * Split/merge patches according to suggestions from Alex.G and Alex.W. * To have EEH stub in drivers/vfio/pci/, which was suggested by Alex.W. 
* Define various EEH options as macros in vfio.h for userland to use. v7 - v8: * Change ioctl commands back to combined one. * EEH related logic was put into drivers/vfio/vfio_eeh.c, which is only built with CONFIG_EEH. Otherwise, inline functions defined in include/linux/vfio.h * Change vfio.txt according to the source code changes. * Fix various comments from internal reviews by Alexey. Thanks to Alexey. Gavin Shan (3): powerpc/eeh: Avoid event on passed PE powerpc/eeh: EEH support for VFIO PCI device drivers/vfio: EEH support
[PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device
The patch adds new IOCTL commands for sPAPR VFIO container device to support EEH functionality for PCI devices, which have been passed through from host to somebody else via VFIO. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- Documentation/vfio.txt | 87 ++-- drivers/vfio/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 20 ++--- drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++- drivers/vfio/vfio_spapr_eeh.c | 89 + include/linux/vfio.h| 23 ++ include/uapi/linux/vfio.h | 35 +++ 7 files changed, 262 insertions(+), 10 deletions(-) create mode 100644 drivers/vfio/vfio_spapr_eeh.c diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index b9ca023..3fa4538 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in real mode which provides an excellent performance which has limitations such as inability to do locked pages accounting in real time. -So 3 additional ioctls have been added: +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O +subtree that can be treated as a unit for the purposes of partitioning and +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a +function of a multi-function IOA, or multiple IOAs (possibly including switch +and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors +and recover from them via EEH RTAS services, which works on the basis of +additional ioctl commands. + +So 4 additional ioctls have been added: VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start of the DMA window on the PCI bus. @@ -316,9 +324,12 @@ So 3 additional ioctls have been added: VFIO_IOMMU_DISABLE - disables the container. + VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and recovery. The code flow from the example above should be slightly changed: + struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) }; + . 
/* Add the group to the container */ ioctl(group, VFIO_GROUP_SET_CONTAINER, container); @@ -342,9 +353,79 @@ The code flow from the example above should be slightly changed: dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ - ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map); - . + + /* Get a file descriptor for the device */ + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0); + + + + /* Gratuitous device reset and go... */ + ioctl(device, VFIO_DEVICE_RESET); + + /* Make sure EEH is supported */ + ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH); + + /* Enable the EEH functionality on the device */ + pe_op.op = VFIO_EEH_PE_ENABLE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* You're suggested to create additional data struct to represent +* PE, and put child devices belonging to same IOMMU group to the +* PE instance for later reference. +*/ + + /* Check the PE's state and make sure it's in functional state */ + pe_op.op = VFIO_EEH_PE_GET_STATE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Save device state using pci_save_state(). +* EEH should be enabled on the specified device. +*/ + + + + /* When 0xFF's returned from reading PCI config space or IO BARs +* of the PCI device. Check the PE's state to see if that has been +* frozen. +*/ + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Waiting for pending PCI transactions to be completed and don't +* produce any more PCI traffic from/to the affected PE until +* recovery is finished. +*/ + + /* Enable IO for the affected PE and collect logs. Usually, the +* standard part of PCI config space, AER registers are dumped +* as logs for further analysis. +*/ + pe_op.op = VFIO_EEH_PE_UNFREEZE_IO; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* +* Issue PE reset: hot or fundamental reset. Usually, hot reset +* is enough. However, the firmware of some PCI adapters would +* require fundamental reset. 
+*/ + pe_op.op = VFIO_EEH_PE_RESET_HOT; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Configure the PCI bridges for the affected PE */ + pe_op.op = VFIO_EEH_PE_CONFIGURE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Restored state we saved at initialization time. pci_restore_state() +* is good enough as an example. +*/ + + /*
Re: [PATCH v8 2/3] powerpc/eeh: EEH support for VFIO PCI device
On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote: +#define EEH_OPT_GET_PE_ADDR 0 /* Get PE addr */ +#define EEH_OPT_GET_PE_MODE 1 /* Get PE mode */ I assume that's just some leftover from the previous patches :-) Don't respin just yet, let's see what other comments come in. Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/3] PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_64 capability number
This adds a capability number for 64-bit TCE tables support. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index 944cd21..e6972bf 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -744,6 +744,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_ENABLE_CAP_VM 98 #define KVM_CAP_S390_IRQCHIP 99 #define KVM_CAP_SPAPR_TCE_VFIO 100 +#define KVM_CAP_SPAPR_TCE_64 101 #ifdef KVM_CAP_IRQ_ROUTING -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration
This reserves 2 capability numbers. This implements an extended version of KVM_CREATE_SPAPR_TCE_64 ioctl. Please advise how to proceed with these patches as I suspect that first two should go via Paolo's tree while the last one via Alex Graf's tree (correct?). Thanks! Alexey Kardashevskiy (3): PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_64 capability number PPC: KVM: Add support for 64bit TCE windows Documentation/virtual/kvm/api.txt | 46 + arch/powerpc/include/asm/kvm_host.h | 4 +++- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 9 arch/powerpc/kvm/book3s_64_vio.c| 4 +++- arch/powerpc/kvm/powerpc.c | 24 ++- include/uapi/linux/kvm.h| 4 7 files changed, 89 insertions(+), 4 deletions(-) -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows
The existing KVM_CREATE_SPAPR_TCE only supports 32bit windows which is not enough for directly mapped windows as the guest can get more than 4GB. This adds KVM_CREATE_SPAPR_TCE_64 ioctl and advertises it via KVM_CAP_SPAPR_TCE_64 capability. Since 64bit windows are to support Dynamic DMA windows (DDW), let's add @bus_offset and @page_shift which are also required by DDW. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- Documentation/virtual/kvm/api.txt | 46 + arch/powerpc/include/asm/kvm_host.h | 4 +++- arch/powerpc/include/asm/kvm_ppc.h | 2 +- arch/powerpc/include/uapi/asm/kvm.h | 9 arch/powerpc/kvm/book3s_64_vio.c| 4 +++- arch/powerpc/kvm/powerpc.c | 24 ++- include/uapi/linux/kvm.h| 2 ++ 7 files changed, 87 insertions(+), 4 deletions(-) diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt index b4f5365..8a2a2da 100644 --- a/Documentation/virtual/kvm/api.txt +++ b/Documentation/virtual/kvm/api.txt @@ -2484,6 +2484,52 @@ calls by the guest for that service will be passed to userspace to be handled. +4.87 KVM_CREATE_SPAPR_TCE_64 + +Capability: KVM_CAP_SPAPR_TCE_64 +Architectures: powerpc +Type: vm ioctl +Parameters: struct kvm_create_spapr_tce_64 (in) +Returns: file descriptor for manipulating the created TCE table + +This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit +windows. + +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u64 window_size; + __u64 bus_offset; + __u32 page_shift; + __u32 flags; +}; + +The liobn field gives the logical IO bus number for which to create a +TCE table. 
The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every IOMMU page. The bus_offset field tells where +this window is mapped on the IO bus. The page_size field tells a size +of the pages in this window, can be 4K, 64K, 16MB, etc. The flags field +is not used at the moment but provides the room for extensions. + +When the guest issues an H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE hcall +on a liobn for which a TCE table has been created using this ioctl(), +the kernel will handle it in real or virtual mode, updating the TCE table. +If liobn has not been registered with this ioctl, H_PUT_TCE/etc calls +will cause a vm exit and must be handled by userspace. + +The return value is a file descriptor which can be passed to mmap(2) +to map the created TCE table into userspace. This lets userspace read +the entries written by kernel-handled H_PUT_TCE calls, and also lets +userspace update the TCE table directly which is useful in some +circumstances. + + 5. 
The kvm_run structure diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 1eaea2d..260a810 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -179,7 +179,9 @@ struct kvmppc_spapr_tce_table { struct list_head list; struct kvm *kvm; u64 liobn; - u32 window_size; + u64 window_size; + u64 bus_offset; + u32 page_shift; struct page *pages[0]; }; diff --git a/arch/powerpc/include/asm/kvm_ppc.h b/arch/powerpc/include/asm/kvm_ppc.h index 4096f16..b472fd3 100644 --- a/arch/powerpc/include/asm/kvm_ppc.h +++ b/arch/powerpc/include/asm/kvm_ppc.h @@ -126,7 +126,7 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu, extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu); extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm, - struct kvm_create_spapr_tce *args); + struct kvm_create_spapr_tce_64 *args); extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn, unsigned long ioba, unsigned long tce); extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn, diff --git a/arch/powerpc/include/uapi/asm/kvm.h b/arch/powerpc/include/uapi/asm/kvm.h index a6665be..0ada7b4 100644 --- a/arch/powerpc/include/uapi/asm/kvm.h +++ b/arch/powerpc/include/uapi/asm/kvm.h @@ -333,6 +333,15 @@ struct kvm_create_spapr_tce { __u32 window_size; }; +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u64 window_size; + __u64 bus_offset; + __u32 page_shift; + __u32 flags; +}; + /* for KVM_ALLOCATE_RMA */ struct kvm_allocate_rma { __u64 rma_size; diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c index 54cf9bc..230fa5f
[PATCH 1/3] PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number
This adds a capability number for in-kernel support for VFIO on SPAPR platform. The capability will tell the user space whether in-kernel handlers of H_PUT_TCE can handle VFIO-targeted requests or not. If not, the user space must not attempt allocating a TCE table in the host kernel via the KVM_CREATE_SPAPR_TCE KVM ioctl because in that case TCE requests will not be passed to the user space which is desired action in the situation like that. Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru --- include/uapi/linux/kvm.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h index a8f4ee5..944cd21 100644 --- a/include/uapi/linux/kvm.h +++ b/include/uapi/linux/kvm.h @@ -743,6 +743,7 @@ struct kvm_ppc_smmu_info { #define KVM_CAP_IOAPIC_POLARITY_IGNORED 97 #define KVM_CAP_ENABLE_CAP_VM 98 #define KVM_CAP_S390_IRQCHIP 99 +#define KVM_CAP_SPAPR_TCE_VFIO 100 #ifdef KVM_CAP_IRQ_ROUTING -- 2.0.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows
On Thu, 2014-06-05 at 17:25 +1000, Alexey Kardashevskiy wrote: +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u64 window_size; + __u64 bus_offset; + __u32 page_shift; + __u32 flags; +}; + +The liobn field gives the logical IO bus number for which to create a +TCE table. The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every IOMMU page. The bus_offset field tells where +this window is mapped on the IO bus. Hrm, the bus_offset cannot be set arbitrarily, it has some pretty strong HW limits depending on the type of bridge architecture version... Do you plan to have that knowledge in qemu ? Or do you have some other mechanism to query it ? (I might be missing a piece of the puzzle here). Also one thing I've been pondering ... We'll end up wasting a ton of memory with those TCE tables. If you have 3 PEs mapped into a guest, it will try to create 3 DDW's mapping the entire guest memory and so 3 TCE tables large enough for that ... and which will contain exactly the same entries ! We really want to look into extending PAPR to allow the creation of table aliases so that the guest can essentially create one table and associate it with multiple PEs. We might still decide to do multiple copies for NUMA reasons but no more than one per node for example... at least we can have the policy in qemu/kvm. Also, do you currently require allocating a single physically contiguous table or do you support TCE trees in your implementation ? Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows
On 06/05/2014 05:38 PM, Benjamin Herrenschmidt wrote: On Thu, 2014-06-05 at 17:25 +1000, Alexey Kardashevskiy wrote: +This creates a virtual TCE (translation control entry) table, which +is an IOMMU for PAPR-style virtual I/O. It is used to translate +logical addresses used in virtual I/O into guest physical addresses, +and provides a scatter/gather capability for PAPR virtual I/O. + +/* for KVM_CAP_SPAPR_TCE_64 */ +struct kvm_create_spapr_tce_64 { + __u64 liobn; + __u64 window_size; + __u64 bus_offset; + __u32 page_shift; + __u32 flags; +}; + +The liobn field gives the logical IO bus number for which to create a +TCE table. The window_size field specifies the size of the DMA window +which this TCE table will translate - the table will contain one 64 +bit TCE entry for every IOMMU page. The bus_offset field tells where +this window is mapped on the IO bus. Hrm, the bus_offset cannot be set arbitrarily, it has some pretty strong HW limits depending on the type of bridge architecture version... Do you plan to have that knowledge in qemu ? Or do you have some other mechanism to query it ? (I might be missing a piece of the puzzle here). Yes. QEMU will have this knowledge as it has to implement ibm,create-pe-dma-window and return this address to the guest. There will be a container API to receive it from powernv code via funky ppc_md callback. There are 2 steps: 1. query + create window 2. enable in-kernel KVM acceleration for it. Everything will work without step2 and, frankly speaking, we do not need it too much for DDW but it does not cost much. By having bus_offset in ioctl which is only used for step2, I reduce dependance from powernv. Also one thing I've been pondering ... We'll end up wasting a ton of memory with those TCE tables. If you have 3 PEs mapped into a guest, it will try to create 3 DDW's mapping the entire guest memory and so 3 TCE tables large enough for that ... and which will contain exactly the same entries ! 
This is in the plan too, do not rush :) We really want to look into extending PAPR to allow the creation of table aliases so that the guest can essentially create one table and associate it with multiple PEs. We might still decide to do multiple copies for NUMA reasons but no more than one per node for example... at least we can have the policy in qemu/kvm. Also, do you currently require allocating a single physically contiguous table or do you support TCE trees in your implementation ? No trees yet. For 64GB window we need (64<<30)/(16<<20)*8 = 32K TCE table. Do we really need trees? -- Alexey ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows
On Thu, 2014-06-05 at 19:26 +1000, Alexey Kardashevskiy wrote: No trees yet. For 64GB window we need (64<<30)/(16<<20)*8 = 32K TCE table. Do we really need trees? The above is assuming hugetlbfs backed guests. These are the least of my worry indeed. But we need to deal with 4k and 64k guests. Cheers, Ben ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows
On 05.06.14 12:27, Benjamin Herrenschmidt wrote: On Thu, 2014-06-05 at 19:26 +1000, Alexey Kardashevskiy wrote: No trees yet. For 64GB window we need (64<<30)/(16<<20)*8 = 32K TCE table. Do we really need trees? The above is assuming hugetlbfs backed guests. These are the least of my worry indeed. But we need to deal with 4k and 64k guests. What if we ask user space to give us a pointer to user space allocated memory along with the TCE registration? We would still ask user space to only use the returned fd for TCE modifications, but would have some nicely swappable memory we can store the TCE entries in. In fact, the code as is today can allocate an arbitrary amount of pinned kernel memory from within user space without any checks. Alex ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration
On 05.06.14 09:25, Alexey Kardashevskiy wrote: This reserves 2 capability numbers. This implements an extended version of KVM_CREATE_SPAPR_TCE_64 ioctl. Please advise how to proceed with these patches as I suspect that first two should go via Paolo's tree while the last one via Alex Graf's tree (correct?). They would just go via my tree, but only be actually allocated (read: mergable to qemu) when they hit Paolo's tree. In fact, I don't think it makes sense to split them off at all. Alex ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
KVM: PPC: BOOK3S: PR: P8 Support
This patchset adds support for emulating VTB, IC and Doorbell features in P8. Doorbell support is dummy since we don't support SMT cores with PR-KVM. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support
We don't have SMT support yet, hence we should not find a doorbell message generated Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 1bb16a59dcbc..d6c87d085182 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -28,7 +28,9 @@ #define OP_19_XOP_RFI 50 #define OP_31_XOP_MFMSR83 +#define OP_31_XOP_MSGSNDP 142 #define OP_31_XOP_MTMSR146 +#define OP_31_XOP_MSGCLRP 174 #define OP_31_XOP_MTMSRD 178 #define OP_31_XOP_MTSR 210 #define OP_31_XOP_MTSRIN 242 @@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } + case OP_31_XOP_MSGSNDP: + { + /* +* PR KVM still don't support SMT mode. So we should +* not see a MSGSNDP/MSGCLRP used with PR KVM +*/ + pr_info(KVM: MSGSNDP used in non SMT case\n); + emulated = EMULATE_FAIL; + break; + } + case OP_31_XOP_MSGCLRP: + { + pr_info(KVM: MSGCLRP used in non SMT case\n); + emulated = EMULATE_FAIL; + break; + } default: emulated = EMULATE_FAIL; } -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
virtual time base register is a per VM, per cpu register that needs to be saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/reg.h | 15 +++ arch/powerpc/include/asm/time.h | 9 + arch/powerpc/kvm/book3s.c | 6 ++ arch/powerpc/kvm/book3s_emulate.c | 3 +++ arch/powerpc/kvm/book3s_hv.c| 6 -- arch/powerpc/kvm/book3s_pr.c| 3 ++- 7 files changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 4a58731a0a72..bd3caeaeebe1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -505,6 +505,7 @@ struct kvm_vcpu_arch { #endif /* Time base value when we entered the guest */ u64 entry_tb; + u64 entry_vtb; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 4852bcf270f3..3e7085d8af90 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -25,6 +25,7 @@ #ifdef CONFIG_8xx #include asm/reg_8xx.h #endif /* CONFIG_8xx */ +#include asm/bug.h #define MSR_SF_LG 63 /* Enable 64 bit mode */ #define MSR_ISF_LG 61 /* Interrupt 64b mode valid on 630 */ @@ -1193,6 +1194,20 @@ : r ((unsigned long)(v)) \ : memory) +static inline unsigned long mfvtb (void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return mfspr(SPRN_VTB); +#endif + /* +* The above mfspr will be a no-op on anything before Power8 +* That can result in random values returned. We need to +* capture that. 
+*/ + BUG(); +} + #ifdef __powerpc64__ #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) #define mftb() ({unsigned long rval; \ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1d428e6007ca..03cbada59d3a 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -102,6 +102,15 @@ static inline u64 get_rtc(void) return (u64)hi * 10 + lo; } +static inline u64 get_vtb(void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return mfvtb(); +#endif + return 0; +} + #ifdef CONFIG_PPC64 static inline u64 get_tb(void) { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 52c654dbd41a..ae43e4178ecd 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_BESCR: val = get_reg_val(reg-id, vcpu-arch.bescr); break; + case KVM_REG_PPC_VTB: + val = get_reg_val(reg-id, vcpu-arch.vtb); + break; default: r = -EINVAL; break; @@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_BESCR: vcpu-arch.bescr = set_reg_val(reg-id, val); break; + case KVM_REG_PPC_VTB: + vcpu-arch.vtb = set_reg_val(reg-id, val); + break; default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 3565e775b61b..1bb16a59dcbc 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = vcpu-arch.spurr; break; + case SPRN_VTB: + *spr_val = vcpu-arch.vtb; + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index aba05bbb3e74..f6ac58336b3f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -897,9 +897,6 @@ 
static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_IC: *val = get_reg_val(id, vcpu-arch.ic); break; - case KVM_REG_PPC_VTB: - *val = get_reg_val(id, vcpu-arch.vtb); - break; case KVM_REG_PPC_CSIGR: *val = get_reg_val(id, vcpu-arch.csigr);
[PATCH 3/4] KVM: PPC: BOOK3S: PR: Emulate DPDES register
Since we don't support SMT yet, we should always find zero in Directed privileged doorbell exception state register. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index d6c87d085182..062b5da7786e 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -655,6 +655,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_MMCR1: case SPRN_MMCR2: case SPRN_TIR: + case SPRN_DPDES: #endif *spr_val = 0; break; -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 4/4] KVM: PPC: BOOK3S: PR: Emulate instruction counter
Writing to IC is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/kvm/book3s.c | 6 ++ arch/powerpc/kvm/book3s_emulate.c | 3 +++ arch/powerpc/kvm/book3s_hv.c| 6 -- arch/powerpc/kvm/book3s_pr.c| 4 5 files changed, 14 insertions(+), 6 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index bd3caeaeebe1..f9ae69682ce1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -506,6 +506,7 @@ struct kvm_vcpu_arch { /* Time base value when we entered the guest */ u64 entry_tb; u64 entry_vtb; + u64 entry_ic; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index ae43e4178ecd..52c4c43900cb 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -649,6 +649,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_VTB: val = get_reg_val(reg-id, vcpu-arch.vtb); break; + case KVM_REG_PPC_IC: + val = get_reg_val(reg-id, vcpu-arch.ic); + break; default: r = -EINVAL; break; @@ -756,6 +759,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_VTB: vcpu-arch.vtb = set_reg_val(reg-id, val); break; + case KVM_REG_PPC_IC: + vcpu-arch.ic = set_reg_val(reg-id, val); + break; default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 062b5da7786e..e6912c618160 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -598,6 +598,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val case SPRN_VTB: *spr_val = vcpu-arch.vtb; break; + case SPRN_IC: + *spr_val = vcpu-arch.ic; + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: diff --git 
a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index f6ac58336b3f..c38cf9f836c0 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -894,9 +894,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_CIABR: *val = get_reg_val(id, vcpu-arch.ciabr); break; - case KVM_REG_PPC_IC: - *val = get_reg_val(id, vcpu-arch.ic); - break; case KVM_REG_PPC_CSIGR: *val = get_reg_val(id, vcpu-arch.csigr); break; @@ -1091,9 +1088,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, if ((vcpu-arch.ciabr CIABR_PRIV) == CIABR_PRIV_HYPER) vcpu-arch.ciabr = ~CIABR_PRIV;/* disable */ break; - case KVM_REG_PPC_IC: - vcpu-arch.ic = set_reg_val(id, *val); - break; case KVM_REG_PPC_CSIGR: vcpu-arch.csigr = set_reg_val(id, *val); break; diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c index 96cdf89a8c86..03fc8847cd67 100644 --- a/arch/powerpc/kvm/book3s_pr.c +++ b/arch/powerpc/kvm/book3s_pr.c @@ -126,6 +126,8 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu *svcpu, */ vcpu-arch.entry_tb = get_tb(); vcpu-arch.entry_vtb = get_vtb(); + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + vcpu-arch.entry_ic = mfspr(SPRN_IC); svcpu-in_use = true; } @@ -178,6 +180,8 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu, vcpu-arch.purr += get_tb() - vcpu-arch.entry_tb; vcpu-arch.spurr += get_tb() - vcpu-arch.entry_tb; vcpu-arch.vtb += get_vtb() - vcpu-arch.entry_vtb; + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + vcpu-arch.ic += mfspr(SPRN_IC) - vcpu-arch.entry_ic; svcpu-in_use = false; out: -- 1.9.1 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
On 05.06.14 14:08, Aneesh Kumar K.V wrote: virtual time base register is a per VM, per cpu register that needs to be saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/reg.h | 15 +++ arch/powerpc/include/asm/time.h | 9 + arch/powerpc/kvm/book3s.c | 6 ++ arch/powerpc/kvm/book3s_emulate.c | 3 +++ arch/powerpc/kvm/book3s_hv.c| 6 -- arch/powerpc/kvm/book3s_pr.c| 3 ++- 7 files changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 4a58731a0a72..bd3caeaeebe1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -505,6 +505,7 @@ struct kvm_vcpu_arch { #endif /* Time base value when we entered the guest */ u64 entry_tb; + u64 entry_vtb; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 4852bcf270f3..3e7085d8af90 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -25,6 +25,7 @@ #ifdef CONFIG_8xx #include asm/reg_8xx.h #endif /* CONFIG_8xx */ +#include asm/bug.h #define MSR_SF_LG 63 /* Enable 64 bit mode */ #define MSR_ISF_LG61 /* Interrupt 64b mode valid on 630 */ @@ -1193,6 +1194,20 @@ : r ((unsigned long)(v)) \ : memory) +static inline unsigned long mfvtb (void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return mfspr(SPRN_VTB); +#endif + /* +* The above mfspr will be a no-op on anything before Power8 +* That can result in random values returned. We need to +* capture that. 
+*/ + BUG(); +} + #ifdef __powerpc64__ #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) #define mftb()({unsigned long rval; \ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1d428e6007ca..03cbada59d3a 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -102,6 +102,15 @@ static inline u64 get_rtc(void) return (u64)hi * 10 + lo; } +static inline u64 get_vtb(void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return mfvtb(); +#endif + return 0; +} + #ifdef CONFIG_PPC64 static inline u64 get_tb(void) { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 52c654dbd41a..ae43e4178ecd 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_BESCR: val = get_reg_val(reg-id, vcpu-arch.bescr); break; + case KVM_REG_PPC_VTB: + val = get_reg_val(reg-id, vcpu-arch.vtb); + break; default: r = -EINVAL; break; @@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_BESCR: vcpu-arch.bescr = set_reg_val(reg-id, val); break; + case KVM_REG_PPC_VTB: + vcpu-arch.vtb = set_reg_val(reg-id, val); + break; default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 3565e775b61b..1bb16a59dcbc 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = vcpu-arch.spurr; break; + case SPRN_VTB: + *spr_val = vcpu-arch.vtb; Doesn't this mean that vtb can be the same 2 when the guest reads it 2 times in a row without getting preempted? 
Alex + break; case SPRN_GQR0: case SPRN_GQR1: case SPRN_GQR2: diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c index aba05bbb3e74..f6ac58336b3f 100644 --- a/arch/powerpc/kvm/book3s_hv.c +++ b/arch/powerpc/kvm/book3s_hv.c @@ -897,9 +897,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 id, case KVM_REG_PPC_IC: *val = get_reg_val(id, vcpu-arch.ic); break; -
Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support
On 05.06.14 14:08, Aneesh Kumar K.V wrote: We don't have SMT support yet, hence we should not find a doorbell message generated Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 1bb16a59dcbc..d6c87d085182 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -28,7 +28,9 @@ #define OP_19_XOP_RFI 50 #define OP_31_XOP_MFMSR 83 +#define OP_31_XOP_MSGSNDP 142 #define OP_31_XOP_MTMSR 146 +#define OP_31_XOP_MSGCLRP 174 #define OP_31_XOP_MTMSRD 178 #define OP_31_XOP_MTSR210 #define OP_31_XOP_MTSRIN 242 @@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } + case OP_31_XOP_MSGSNDP: + { + /* +* PR KVM still don't support SMT mode. So we should still? +* not see a MSGSNDP/MSGCLRP used with PR KVM +*/ + pr_info(KVM: MSGSNDP used in non SMT case\n); + emulated = EMULATE_FAIL; What would happen on an HV guest with only 1 thread that MSGSNDs to thread 0? Would the guest get an illegal instruction trap, a self-interrupt or would this be a simple nop? Alex + break; + } + case OP_31_XOP_MSGCLRP: + { + pr_info(KVM: MSGCLRP used in non SMT case\n); + emulated = EMULATE_FAIL; + break; + } default: emulated = EMULATE_FAIL; } ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support
On 05.06.14 14:21, Alexander Graf wrote: On 05.06.14 14:08, Aneesh Kumar K.V wrote: We don't have SMT support yet, hence we should not find a doorbell message generated Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 1bb16a59dcbc..d6c87d085182 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -28,7 +28,9 @@ #define OP_19_XOP_RFI50 #define OP_31_XOP_MFMSR83 +#define OP_31_XOP_MSGSNDP142 #define OP_31_XOP_MTMSR146 +#define OP_31_XOP_MSGCLRP174 #define OP_31_XOP_MTMSRD178 #define OP_31_XOP_MTSR210 #define OP_31_XOP_MTSRIN242 @@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } +case OP_31_XOP_MSGSNDP: +{ +/* + * PR KVM still don't support SMT mode. So we should still? + * not see a MSGSNDP/MSGCLRP used with PR KVM + */ +pr_info(KVM: MSGSNDP used in non SMT case\n); +emulated = EMULATE_FAIL; What would happen on an HV guest with only 1 thread that MSGSNDs to thread 0? Would the guest get an illegal instruction trap, a self-interrupt or would this be a simple nop? What I'm trying to say here is that it's ok to treat it as illegal instructions, but then we don't need this patch :). Alex ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows
On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote: What if we ask user space to give us a pointer to user space allocated memory along with the TCE registration? We would still ask user space to only use the returned fd for TCE modifications, but would have some nicely swappable memory we can store the TCE entries in. That isn't going to work terribly well for VFIO :-) But yes, for emulated devices, we could improve things a bit, including for the 32-bit TCE tables. For emulated, the real mode path could walk the page tables and fallback to virtual mode get_user if the page isn't present, thus operating directly on qemu memory TCE tables instead of the current pinned stuff. However that has a cost in performance, but since that's really only used for emulated devices and PAPR VIOs, it might not be a huge issue. But for VFIO we don't have much choice, we need to create something the HW can access. In fact, the code as is today can allocate an arbitrary amount of pinned kernel memory from within user space without any checks. Right. We should at least account it in the locked limit. Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows
On 05.06.14 14:30, Benjamin Herrenschmidt wrote: On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote: What if we ask user space to give us a pointer to user space allocated memory along with the TCE registration? We would still ask user space to only use the returned fd for TCE modifications, but would have some nicely swappable memory we can store the TCE entries in. That isn't going to work terribly well for VFIO :-) But yes, for emulated devices, we could improve things a bit, including for the 32-bit TCE tables. For emulated, the real mode path could walk the page tables and fallback to virtual mode get_user if the page isn't present, thus operating directly on qemu memory TCE tables instead of the current pinned stuff. However that has a cost in performance, but since that's really only used for emulated devices and PAPR VIOs, it might not be a huge issue. But for VFIO we don't have much choice, we need to create something the HW can access. But we need to create separate tables for VFIO anyways, because these TCE tables contain virtual addresses, no? Alex In fact, the code as is today can allocate an arbitrary amount of pinned kernel memory from within user space without any checks. Right. We should at least account it in the locked limit. Cheers, Ben. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v8 0/3] EEH Support for VFIO PCI Device
On 05.06.14 08:36, Gavin Shan wrote: The series of patches adds support EEH for PCI devices, which are passed through to PowerKVM based guest via VFIO. The implementation is straightforward based on the issues or problems we have to resolve to support EEH for PowerKVM based guest. - Emulation for EEH RTAS requests. All EEH RTAS requests goes to QEMU firstly. If QEMU can't handle it, the request will be sent to host via newly introduced VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in host kernel. The series of patches requires corresponding QEMU changes. Acked-by: Alexander Graf ag...@suse.de Alex ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows
On 06/05/2014 10:30 PM, Benjamin Herrenschmidt wrote: On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote: What if we ask user space to give us a pointer to user space allocated memory along with the TCE registration? We would still ask user space to only use the returned fd for TCE modifications, but would have some nicely swappable memory we can store the TCE entries in. That isn't going to work terribly well for VFIO :-) But yes, for emulated devices, we could improve things a bit, including for the 32-bit TCE tables. For emulated, the real mode path could walk the page tables and fallback to virtual mode get_user if the page isn't present, thus operating directly on qemu memory TCE tables instead of the current pinned stuff. However that has a cost in performance, but since that's really only used for emulated devices and PAPR VIOs, it might not be a huge issue. But for VFIO we don't have much choice, we need to create something the HW can access. You are confusing things here. There are 2 tables: 1. guest-visible TCE table, this is what is allocated for VIO or emulated PCI; 2. real HW DMA window, one exists already for DMA32 and one I will allocated for a huge window. I have just #2 for VFIO now but we will need both in order to implement H_GET_TCE correctly, and this is the table I will allocate by this new ioctl. In fact, the code as is today can allocate an arbitrary amount of pinned kernel memory from within user space without any checks. Right. We should at least account it in the locked limit. Yup. And (probably) this thing will keep a counter of how many windows were created per KVM instance to avoid having multiple copies of the same table. -- Alexey ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 1/2] powerpc/powernv: include asm/smp.h to handle UP config
Build throws following errors when CONFIG_SMP=n arch/powerpc/platforms/powernv/setup.c: In function ‘pnv_kexec_wait_secondaries_down’: arch/powerpc/platforms/powernv/setup.c:179:4: error: implicit declaration of function ‘get_hard_smp_processor_id’ rc = opal_query_cpu_status(get_hard_smp_processor_id(i), The usage of get_hard_smp_processor_id() needs the declaration from asm/smp.h. The file setup.c includes linux/sched.h, which in-turn includes linux/smp.h. However, linux/smp.h includes asm/smp.h only on SMP configs and hence UP builds fail. Fix this by directly including asm/smp.h in setup.c unconditionally. Reported-by: Geert Uytterhoeven ge...@linux-m68k.org Reviewed-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com Signed-off-by: Shreyas B. Prabhu shre...@linux.vnet.ibm.com --- arch/powerpc/platforms/powernv/setup.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/powerpc/platforms/powernv/setup.c b/arch/powerpc/platforms/powernv/setup.c index 8c16a5f..678573c 100644 --- a/arch/powerpc/platforms/powernv/setup.c +++ b/arch/powerpc/platforms/powernv/setup.c @@ -35,6 +35,7 @@ #include asm/rtas.h #include asm/opal.h #include asm/kexec.h +#include asm/smp.h #include powernv.h -- 1.9.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH 2/2] powerpc/powernv : Disable subcore for UP configs
Build throws following errors when CONFIG_SMP=n arch/powerpc/platforms/powernv/subcore.c: In function ‘cpu_update_split_mode’: arch/powerpc/platforms/powernv/subcore.c:274:15: error: ‘setup_max_cpus’ undeclared (first use in this function) arch/powerpc/platforms/powernv/subcore.c:285:5: error: lvalue required as left operand of assignment 'setup_max_cpus' variable is relevant only on SMP, so there is no point working around it for UP. Furthermore, subcore.c itself is relevant only on SMP and hence the better solution is to exclude subcore.c for UP builds. Signed-off-by: Shreyas B. Prabhu shre...@linux.vnet.ibm.com --- This patch applies on top of ben/powerpc.git/next branch arch/powerpc/platforms/powernv/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 4ad0d34..636d206 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,9 +1,9 @@ obj-y += setup.o opal-takeover.o opal-wrappers.o opal.o opal-async.o obj-y += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y += rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o -obj-y += opal-msglog.o subcore.o subcore-asm.o +obj-y += opal-msglog.o subcore-asm.o -obj-$(CONFIG_SMP) += smp.o +obj-$(CONFIG_SMP) += smp.o subcore.o obj-$(CONFIG_PCI) += pci.o pci-p5ioc2.o pci-ioda.o obj-$(CONFIG_EEH) += eeh-ioda.o eeh-powernv.o obj-$(CONFIG_PPC_SCOM) += opal-xscom.o -- 1.9.0 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
Alexander Graf ag...@suse.de writes: On 05.06.14 14:08, Aneesh Kumar K.V wrote: virtual time base register is a per VM, per cpu register that needs to be saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/reg.h | 15 +++ arch/powerpc/include/asm/time.h | 9 + arch/powerpc/kvm/book3s.c | 6 ++ arch/powerpc/kvm/book3s_emulate.c | 3 +++ arch/powerpc/kvm/book3s_hv.c| 6 -- arch/powerpc/kvm/book3s_pr.c| 3 ++- 7 files changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 4a58731a0a72..bd3caeaeebe1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -505,6 +505,7 @@ struct kvm_vcpu_arch { #endif /* Time base value when we entered the guest */ u64 entry_tb; +u64 entry_vtb; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 4852bcf270f3..3e7085d8af90 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -25,6 +25,7 @@ #ifdef CONFIG_8xx #include asm/reg_8xx.h #endif /* CONFIG_8xx */ +#include asm/bug.h #define MSR_SF_LG 63 /* Enable 64 bit mode */ #define MSR_ISF_LG 61 /* Interrupt 64b mode valid on 630 */ @@ -1193,6 +1194,20 @@ : r ((unsigned long)(v)) \ : memory) +static inline unsigned long mfvtb (void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 +if (cpu_has_feature(CPU_FTR_ARCH_207S)) +return mfspr(SPRN_VTB); +#endif +/* + * The above mfspr will be a no-op on anything before Power8 + * That can result in random values returned. We need to + * capture that. 
+ */ +BUG(); +} + #ifdef __powerpc64__ #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) #define mftb() ({unsigned long rval; \ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1d428e6007ca..03cbada59d3a 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -102,6 +102,15 @@ static inline u64 get_rtc(void) return (u64)hi * 10 + lo; } +static inline u64 get_vtb(void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 +if (cpu_has_feature(CPU_FTR_ARCH_207S)) +return mfvtb(); +#endif +return 0; +} + #ifdef CONFIG_PPC64 static inline u64 get_tb(void) { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 52c654dbd41a..ae43e4178ecd 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_BESCR: val = get_reg_val(reg-id, vcpu-arch.bescr); break; +case KVM_REG_PPC_VTB: +val = get_reg_val(reg-id, vcpu-arch.vtb); +break; default: r = -EINVAL; break; @@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_BESCR: vcpu-arch.bescr = set_reg_val(reg-id, val); break; +case KVM_REG_PPC_VTB: +vcpu-arch.vtb = set_reg_val(reg-id, val); +break; default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 3565e775b61b..1bb16a59dcbc 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = vcpu-arch.spurr; break; +case SPRN_VTB: +*spr_val = vcpu-arch.vtb; Doesn't this mean that vtb can be the same 2 when the guest reads it 2 times in a row without getting preempted? But a mfspr will result in VM exit and that would make sure we update vcpu-arch.vtb with the correct value. 
-aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support
Alexander Graf ag...@suse.de writes: On 05.06.14 14:21, Alexander Graf wrote: On 05.06.14 14:08, Aneesh Kumar K.V wrote: We don't have SMT support yet, hence we should not find a doorbell message generated Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/kvm/book3s_emulate.c | 18 ++ 1 file changed, 18 insertions(+) diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 1bb16a59dcbc..d6c87d085182 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -28,7 +28,9 @@ #define OP_19_XOP_RFI50 #define OP_31_XOP_MFMSR83 +#define OP_31_XOP_MSGSNDP142 #define OP_31_XOP_MTMSR146 +#define OP_31_XOP_MSGCLRP174 #define OP_31_XOP_MTMSRD178 #define OP_31_XOP_MTSR210 #define OP_31_XOP_MTSRIN242 @@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct kvm_vcpu *vcpu, break; } +case OP_31_XOP_MSGSNDP: +{ +/* + * PR KVM still don't support SMT mode. So we should still? + * not see a MSGSNDP/MSGCLRP used with PR KVM + */ +pr_info(KVM: MSGSNDP used in non SMT case\n); +emulated = EMULATE_FAIL; What would happen on an HV guest with only 1 thread that MSGSNDs to thread 0? Would the guest get an illegal instruction trap, a self-interrupt or would this be a simple nop? What I'm trying to say here is that it's ok to treat it as illegal instructions, but then we don't need this patch :). Agreed. I will verify whether it is treated as a nop. If so will send an updated patch. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
On 05.06.14 17:50, Aneesh Kumar K.V wrote: Alexander Graf ag...@suse.de writes: On 05.06.14 14:08, Aneesh Kumar K.V wrote: virtual time base register is a per VM, per cpu register that needs to be saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com --- arch/powerpc/include/asm/kvm_host.h | 1 + arch/powerpc/include/asm/reg.h | 15 +++ arch/powerpc/include/asm/time.h | 9 + arch/powerpc/kvm/book3s.c | 6 ++ arch/powerpc/kvm/book3s_emulate.c | 3 +++ arch/powerpc/kvm/book3s_hv.c| 6 -- arch/powerpc/kvm/book3s_pr.c| 3 ++- 7 files changed, 36 insertions(+), 7 deletions(-) diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h index 4a58731a0a72..bd3caeaeebe1 100644 --- a/arch/powerpc/include/asm/kvm_host.h +++ b/arch/powerpc/include/asm/kvm_host.h @@ -505,6 +505,7 @@ struct kvm_vcpu_arch { #endif /* Time base value when we entered the guest */ u64 entry_tb; + u64 entry_vtb; u32 tcr; ulong tsr; /* we need to perform set/clr_bits() which requires ulong */ u32 ivor[64]; diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h index 4852bcf270f3..3e7085d8af90 100644 --- a/arch/powerpc/include/asm/reg.h +++ b/arch/powerpc/include/asm/reg.h @@ -25,6 +25,7 @@ #ifdef CONFIG_8xx #include asm/reg_8xx.h #endif /* CONFIG_8xx */ +#include asm/bug.h #define MSR_SF_LG 63 /* Enable 64 bit mode */ #define MSR_ISF_LG 61 /* Interrupt 64b mode valid on 630 */ @@ -1193,6 +1194,20 @@ : r ((unsigned long)(v)) \ : memory) +static inline unsigned long mfvtb (void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return mfspr(SPRN_VTB); +#endif + /* +* The above mfspr will be a no-op on anything before Power8 +* That can result in random values returned. We need to +* capture that. 
+*/ + BUG(); +} + #ifdef __powerpc64__ #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E) #define mftb() ({unsigned long rval; \ diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h index 1d428e6007ca..03cbada59d3a 100644 --- a/arch/powerpc/include/asm/time.h +++ b/arch/powerpc/include/asm/time.h @@ -102,6 +102,15 @@ static inline u64 get_rtc(void) return (u64)hi * 10 + lo; } +static inline u64 get_vtb(void) +{ +#ifdef CONFIG_PPC_BOOK3S_64 + if (cpu_has_feature(CPU_FTR_ARCH_207S)) + return mfvtb(); +#endif + return 0; +} + #ifdef CONFIG_PPC64 static inline u64 get_tb(void) { diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c index 52c654dbd41a..ae43e4178ecd 100644 --- a/arch/powerpc/kvm/book3s.c +++ b/arch/powerpc/kvm/book3s.c @@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_BESCR: val = get_reg_val(reg-id, vcpu-arch.bescr); break; + case KVM_REG_PPC_VTB: + val = get_reg_val(reg-id, vcpu-arch.vtb); + break; default: r = -EINVAL; break; @@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, struct kvm_one_reg *reg) case KVM_REG_PPC_BESCR: vcpu-arch.bescr = set_reg_val(reg-id, val); break; + case KVM_REG_PPC_VTB: + vcpu-arch.vtb = set_reg_val(reg-id, val); + break; default: r = -EINVAL; break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 3565e775b61b..1bb16a59dcbc 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = vcpu-arch.spurr; break; + case SPRN_VTB: + *spr_val = vcpu-arch.vtb; Doesn't this mean that vtb can be the same 2 when the guest reads it 2 times in a row without getting preempted? But a mfspr will result in VM exit and that would make sure we update vcpu-arch.vtb with the correct value. 
We only call kvmppc_core_vcpu_put_pr() when we context switch away from KVM, so it won't be updated, no? Alex ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH 3/3] PPC, KVM, CMA: use general CMA reserved area management framework
Paolo Bonzini pbonz...@redhat.com writes: Il 03/06/2014 09:02, Michal Nazarewicz ha scritto: On Tue, Jun 03 2014, Joonsoo Kim wrote: Now, we have general CMA reserved area management framework, so use it for future maintainability. There is no functional change. Signed-off-by: Joonsoo Kim iamjoonsoo@lge.com Acked-by: Michal Nazarewicz min...@mina86.com Acked-by: Paolo Bonzini pbonz...@redhat.com Aneesh, can you test this series? Sorry for the late reply. I will test this and update here. -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device
On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote: The patch adds new IOCTL commands for sPAPR VFIO container device to support EEH functionality for PCI devices, which have been passed through from host to somebody else via VFIO. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- Documentation/vfio.txt | 87 ++-- drivers/vfio/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 20 ++--- drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++- drivers/vfio/vfio_spapr_eeh.c | 89 + include/linux/vfio.h| 23 ++ include/uapi/linux/vfio.h | 35 +++ 7 files changed, 262 insertions(+), 10 deletions(-) create mode 100644 drivers/vfio/vfio_spapr_eeh.c diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index b9ca023..3fa4538 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in real mode which provides an excellent performance which has limitations such as inability to do locked pages accounting in real time. -So 3 additional ioctls have been added: +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O +subtree that can be treated as a unit for the purposes of partitioning and +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a +function of a multi-function IOA, or multiple IOAs (possibly including switch +and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors +and recover from them via EEH RTAS services, which works on the basis of +additional ioctl commands. + +So 4 additional ioctls have been added: VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start of the DMA window on the PCI bus. @@ -316,9 +324,12 @@ So 3 additional ioctls have been added: VFIO_IOMMU_DISABLE - disables the container. + VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and recovery. The code flow from the example above should be slightly changed: + struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) }; + . 
/* Add the group to the container */ ioctl(group, VFIO_GROUP_SET_CONTAINER, container); @@ -342,9 +353,79 @@ The code flow from the example above should be slightly changed: dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ - ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map); - . + + /* Get a file descriptor for the device */ + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0); + + + + /* Gratuitous device reset and go... */ + ioctl(device, VFIO_DEVICE_RESET); + + /* Make sure EEH is supported */ + ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH); + + /* Enable the EEH functionality on the device */ + pe_op.op = VFIO_EEH_PE_ENABLE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* You're suggested to create additional data struct to represent + * PE, and put child devices belonging to same IOMMU group to the + * PE instance for later reference. + */ + + /* Check the PE's state and make sure it's in functional state */ + pe_op.op = VFIO_EEH_PE_GET_STATE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Save device state using pci_save_state(). + * EEH should be enabled on the specified device. + */ + + + + /* When 0xFF's returned from reading PCI config space or IO BARs + * of the PCI device. Check the PE's state to see if that has been + * frozen. + */ + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Waiting for pending PCI transactions to be completed and don't + * produce any more PCI traffic from/to the affected PE until + * recovery is finished. + */ + + /* Enable IO for the affected PE and collect logs. Usually, the + * standard part of PCI config space, AER registers are dumped + * as logs for further analysis. + */ + pe_op.op = VFIO_EEH_PE_UNFREEZE_IO; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* + * Issue PE reset: hot or fundamental reset. Usually, hot reset + * is enough. However, the firmware of some PCI adapters would + * require fundamental reset. 
+ */ + pe_op.op = VFIO_EEH_PE_RESET_HOT; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Configure the PCI bridges for the affected PE */ + pe_op.op = VFIO_EEH_PE_CONFIGURE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Restored state we saved at initialization time. pci_restore_state() +
Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
Alexander Graf ag...@suse.de writes: On 05.06.14 17:50, Aneesh Kumar K.V wrote: Alexander Graf ag...@suse.de writes: On 05.06.14 14:08, Aneesh Kumar K.V wrote: virtual time base register is a per VM, per cpu register that needs to be saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com ... break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 3565e775b61b..1bb16a59dcbc 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = vcpu-arch.spurr; break; + case SPRN_VTB: + *spr_val = vcpu-arch.vtb; Doesn't this mean that vtb can be the same 2 when the guest reads it 2 times in a row without getting preempted? But a mfspr will result in VM exit and that would make sure we update vcpu-arch.vtb with the correct value. We only call kvmppc_core_vcpu_put_pr() when we context switch away from KVM, so it won't be updated, no? kvmppc_copy_from_svcpu is also called from VM exit path (book3s_interrupt.S) -aneesh ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [RFC PATCH 1/3] CMA: generalize CMA reserved area management functionality
Joonsoo Kim iamjoonsoo@lge.com writes: Currently, there are two users on CMA functionality, one is the DMA subsystem and the other is the kvm on powerpc. They have their own code to manage CMA reserved area even if they look really similar. From my guess, it is caused by some needs on bitmap management. Kvm side wants to maintain bitmap not for 1 page, but for more size. Eventually it uses a bitmap where one bit represents 64 pages. When I implement CMA related patches, I should change those two places to apply my change and it seems to be painful to me. I want to change this situation and reduce future code management overhead through this patch. This change could also help developers who want to use CMA in their new feature development, since they can use CMA easily without copy-pasting this reserved area management code. Signed-off-by: Joonsoo Kim iamjoonsoo@lge.com The way patches are split makes it difficult to review. Will it be possible to make changes against one implementation and then move that to generic code. That helps in finding out what exactly changed. diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig index 00e13ce..b3fe1cc 100644 --- a/drivers/base/Kconfig +++ b/drivers/base/Kconfig @@ -283,7 +283,7 @@ config CMA_ALIGNMENT If unsure, leave the default value 8. -config CMA_AREAS +config DMA_CMA_AREAS int Maximum count of the CMA device-private areas default 7 help for ex: The above can be a separate patch along with further DMA related cleanups . 
diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c index 83969f8..48cdac8 100644 --- a/drivers/base/dma-contiguous.c +++ b/drivers/base/dma-contiguous.c @@ -186,7 +186,7 @@ static int __init cma_activate_area(struct cma *cma) return 0; } -static struct cma cma_areas[MAX_CMA_AREAS]; +static struct cma cma_areas[MAX_DMA_CMA_AREAS]; static unsigned cma_area_count; static int __init cma_init_reserved_areas(void) diff --git a/include/linux/cma.h b/include/linux/cma.h new file mode 100644 index 000..60ba06f --- /dev/null +++ b/include/linux/cma.h @@ -0,0 +1,28 @@ +/* + * Contiguous Memory Allocator + * + * Copyright LG Electronics Inc., 2014 + * Written by: + * Joonsoo Kim iamjoonsoo@lge.com + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License as + * published by the Free Software Foundation; either version 2 of the + * License or (at your optional) any later version of the license. + * + */ + +#ifndef __CMA_H__ +#define __CMA_H__ + +struct cma; + +extern struct page *cma_alloc(struct cma *cma, unsigned long count, + unsigned long align); +extern bool cma_release(struct cma *cma, struct page *pages, + unsigned long count); +extern int __init cma_declare_contiguous(phys_addr_t size, phys_addr_t base, + phys_addr_t limit, phys_addr_t alignment, + unsigned long bitmap_shift, bool fixed, + struct cma **res_cma); +#endif diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h index 772eab5..dfb1dc9 100644 --- a/include/linux/dma-contiguous.h +++ b/include/linux/dma-contiguous.h @@ -63,7 +63,7 @@ struct device; * There is always at least global CMA area and a few optional device * private areas configured in kernel .config. 
*/ -#define MAX_CMA_AREAS(1 + CONFIG_CMA_AREAS) +#define MAX_DMA_CMA_AREAS (1 + CONFIG_DMA_CMA_AREAS) extern struct cma *dma_contiguous_default_area; @@ -123,7 +123,7 @@ bool dma_release_from_contiguous(struct device *dev, struct page *pages, #else -#define MAX_CMA_AREAS(0) +#define MAX_DMA_CMA_AREAS(0) static inline struct cma *dev_get_cma_area(struct device *dev) { diff --git a/mm/Kconfig b/mm/Kconfig index 7511b4a..0877ddc 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -515,6 +515,17 @@ config CMA_DEBUG processing calls such as dma_alloc_from_contiguous(). This option does not affect warning and error messages. +config CMA_AREAS + int Maximum count of the CMA areas + depends on CMA + default 7 + help + CMA allows to create CMA areas for particular purpose, mainly, + used as device private area. This parameter sets the maximum + number of CMA area in the system. + + If unsure, leave the default value 7. + If we have 7 declare continugous request, a kvm cma allocation request will fail ? config ZBUD tristate default n diff --git a/mm/Makefile b/mm/Makefile index 1eaa70b..bc0422b 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -62,3 +62,4 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o obj-$(CONFIG_ZBUD) += zbud.o obj-$(CONFIG_ZSMALLOC) += zsmalloc.o
Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
On 05.06.14 19:33, Aneesh Kumar K.V wrote: Alexander Graf ag...@suse.de writes: On 05.06.14 17:50, Aneesh Kumar K.V wrote: Alexander Graf ag...@suse.de writes: On 05.06.14 14:08, Aneesh Kumar K.V wrote: virtual time base register is a per VM, per cpu register that needs to be saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com ... break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 3565e775b61b..1bb16a59dcbc 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = vcpu-arch.spurr; break; + case SPRN_VTB: + *spr_val = vcpu-arch.vtb; Doesn't this mean that vtb can be the same 2 when the guest reads it 2 times in a row without getting preempted? But a mfspr will result in VM exit and that would make sure we update vcpu-arch.vtb with the correct value. We only call kvmppc_core_vcpu_put_pr() when we context switch away from KVM, so it won't be updated, no? kvmppc_copy_from_svcpu is also called from VM exit path (book3s_interrupt.S) ... where it will run into this code path: /* * Maybe we were already preempted and synced the svcpu from * our preempt notifiers. Don't bother touching this svcpu then. */ if (!svcpu-in_use) goto out; Alex ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register
On 06.06.14 00:32, Alexander Graf wrote: On 05.06.14 19:33, Aneesh Kumar K.V wrote: Alexander Graf ag...@suse.de writes: On 05.06.14 17:50, Aneesh Kumar K.V wrote: Alexander Graf ag...@suse.de writes: On 05.06.14 14:08, Aneesh Kumar K.V wrote: virtual time base register is a per VM, per cpu register that needs to be saved and restored on vm exit and entry. Writing to VTB is not allowed in the privileged mode. Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com ... break; diff --git a/arch/powerpc/kvm/book3s_emulate.c b/arch/powerpc/kvm/book3s_emulate.c index 3565e775b61b..1bb16a59dcbc 100644 --- a/arch/powerpc/kvm/book3s_emulate.c +++ b/arch/powerpc/kvm/book3s_emulate.c @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val */ *spr_val = vcpu-arch.spurr; break; +case SPRN_VTB: +*spr_val = vcpu-arch.vtb; Doesn't this mean that vtb can be the same 2 when the guest reads it 2 times in a row without getting preempted? But a mfspr will result in VM exit and that would make sure we update vcpu-arch.vtb with the correct value. We only call kvmppc_core_vcpu_put_pr() when we context switch away from KVM, so it won't be updated, no? kvmppc_copy_from_svcpu is also called from VM exit path (book3s_interrupt.S) ... where it will run into this code path: /* * Maybe we were already preempted and synced the svcpu from * our preempt notifiers. Don't bother touching this svcpu then. */ if (!svcpu-in_use) goto out; Scratch that. We're always calling this on entry/exit, so you're right. Alex ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration
On 06/05/2014 09:57 PM, Alexander Graf wrote: On 05.06.14 09:25, Alexey Kardashevskiy wrote: This reserves 2 capability numbers. This implements an extended version of KVM_CREATE_SPAPR_TCE_64 ioctl. Please advise how to proceed with these patches as I suspect that first two should go via Paolo's tree while the last one via Alex Graf's tree (correct?). They would just go via my tree, but only be actually allocated (read: mergable to qemu) when they hit Paolo's tree. In fact, I don't think it makes sense to split them off at all. So? Are these patches going anywhere? Thanks. -- Alexey ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device
On Thu, Jun 05, 2014 at 11:18:34AM -0600, Alex Williamson wrote: On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote: The patch adds new IOCTL commands for sPAPR VFIO container device to support EEH functionality for PCI devices, which have been passed through from host to somebody else via VFIO. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com --- Documentation/vfio.txt | 87 ++-- drivers/vfio/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 20 ++--- drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++- drivers/vfio/vfio_spapr_eeh.c | 89 + include/linux/vfio.h| 23 ++ include/uapi/linux/vfio.h | 35 +++ 7 files changed, 262 insertions(+), 10 deletions(-) create mode 100644 drivers/vfio/vfio_spapr_eeh.c diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index b9ca023..3fa4538 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in real mode which provides an excellent performance which has limitations such as inability to do locked pages accounting in real time. -So 3 additional ioctls have been added: +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O +subtree that can be treated as a unit for the purposes of partitioning and +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a +function of a multi-function IOA, or multiple IOAs (possibly including switch +and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors +and recover from them via EEH RTAS services, which works on the basis of +additional ioctl commands. + +So 4 additional ioctls have been added: VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start of the DMA window on the PCI bus. @@ -316,9 +324,12 @@ So 3 additional ioctls have been added: VFIO_IOMMU_DISABLE - disables the container. +VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and recovery. 
The code flow from the example above should be slightly changed: +struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) }; + . /* Add the group to the container */ ioctl(group, VFIO_GROUP_SET_CONTAINER, container); @@ -342,9 +353,79 @@ The code flow from the example above should be slightly changed: dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ - ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map); -. + +/* Get a file descriptor for the device */ +device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0); + + + +/* Gratuitous device reset and go... */ +ioctl(device, VFIO_DEVICE_RESET); + +/* Make sure EEH is supported */ +ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH); + +/* Enable the EEH functionality on the device */ +pe_op.op = VFIO_EEH_PE_ENABLE; +ioctl(container, VFIO_EEH_PE_OP, pe_op); + +/* You're suggested to create additional data struct to represent + * PE, and put child devices belonging to same IOMMU group to the + * PE instance for later reference. + */ + +/* Check the PE's state and make sure it's in functional state */ +pe_op.op = VFIO_EEH_PE_GET_STATE; +ioctl(container, VFIO_EEH_PE_OP, pe_op); + +/* Save device state using pci_save_state(). + * EEH should be enabled on the specified device. + */ + + + +/* When 0xFF's returned from reading PCI config space or IO BARs + * of the PCI device. Check the PE's state to see if that has been + * frozen. + */ +ioctl(container, VFIO_EEH_PE_OP, pe_op); + +/* Waiting for pending PCI transactions to be completed and don't + * produce any more PCI traffic from/to the affected PE until + * recovery is finished. + */ + +/* Enable IO for the affected PE and collect logs. Usually, the + * standard part of PCI config space, AER registers are dumped + * as logs for further analysis. + */ +pe_op.op = VFIO_EEH_PE_UNFREEZE_IO; +ioctl(container, VFIO_EEH_PE_OP, pe_op); + +/* + * Issue PE reset: hot or fundamental reset. 
Usually, hot reset + * is enough. However, the firmware of some PCI adapters would + * require fundamental reset. + */ +pe_op.op = VFIO_EEH_PE_RESET_HOT; +ioctl(container, VFIO_EEH_PE_OP, pe_op); +pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE; +ioctl(container, VFIO_EEH_PE_OP, pe_op); + +/* Configure the PCI bridges for the affected PE */ +pe_op.op = VFIO_EEH_PE_CONFIGURE; +ioctl(container, VFIO_EEH_PE_OP, pe_op); + +/* Restored state we saved at initialization time.
Re: [PATCH v8 0/3] EEH Support for VFIO PCI Device
On Thu, Jun 05, 2014 at 02:54:47PM +0200, Alexander Graf wrote: On 05.06.14 08:36, Gavin Shan wrote: The series of patches adds support EEH for PCI devices, which are passed through to PowerKVM based guest via VFIO. The implementation is straightforward based on the issues or problems we have to resolve to support EEH for PowerKVM based guest. - Emulation for EEH RTAS requests. All EEH RTAS requests goes to QEMU firstly. If QEMU can't handle it, the request will be sent to host via newly introduced VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in host kernel. The series of patches requires corresponding QEMU changes. Acked-by: Alexander Graf ag...@suse.de Thanks, Alex :) Alex ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v4 1/2]: Allow architectures to skip a callchain entry
The kernel code in Powerpc conservatively saves excess information in the callchain. While most entries are often needed, under some specific conditions, some of the entries are redundant and cause duplicate arcs in the call-graph. Eg: the value in the link register (LR) is needed only when it holds the return address of a function. At other times it must be ignored. In the next commit, we will use the application's DWARF debug information to identify and skip over the redundant entries. To minimize performance impact on other architectures, define and use two following static inline interfaces: arch_skip_callchain_idx() next_callchain_ip() Reported-by: Maynard Johnson mayn...@us.ibm.com Tested-by: Maynard Johnson mayn...@us.ibm.com Signed-off-by: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com --- Changelog[v4] Move Powerpc-specific code to separate patch [Jiri Olsa] Minimize performance impact to other architectures include/uapi/linux/perf_event.h |2 ++ tools/perf/arch/powerpc/Makefile |1 + tools/perf/arch/powerpc/util/skip-callchain-idx.c | 25 ++ tools/perf/config/Makefile|4 +++ tools/perf/util/callchain.h | 37 + tools/perf/util/machine.c | 11 +++--- 6 files changed, 76 insertions(+), 4 deletions(-) create mode 100644 tools/perf/arch/powerpc/util/skip-callchain-idx.c diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h index e3fc8f0..b671abf 100644 --- a/include/uapi/linux/perf_event.h +++ b/include/uapi/linux/perf_event.h @@ -719,6 +719,8 @@ enum perf_callchain_context { PERF_CONTEXT_GUEST_KERNEL = (__u64)-2176, PERF_CONTEXT_GUEST_USER = (__u64)-2560, + PERF_CONTEXT_IGNORE = (__u64)-3840, + PERF_CONTEXT_MAX= (__u64)-4095, }; diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile index 744e629..b92219b 100644 --- a/tools/perf/arch/powerpc/Makefile +++ b/tools/perf/arch/powerpc/Makefile @@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o endif LIB_OBJS += 
$(OUTPUT)arch/$(ARCH)/util/header.o +LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/skip-callchain-idx.o diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c new file mode 100644 index 000..7350c36 --- /dev/null +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -0,0 +1,25 @@ +/* + * Use DWARF Debug information to skip unnecessary callchain entries. + * + * Copyright (C) 2014 Sukadev Bhattiprolu, IBM Corporation. + * Copyright (C) 2014 Ulrich Weigand, IBM Corporation. + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. + */ +#include inttypes.h +#include dwarf.h +#include elfutils/libdwfl.h + +#include util/thread.h +#include util/callchain.h + +/* Stub for now */ +int arch_skip_callchain_idx(struct machine *machine __maybe_unused, + struct thread *thread __maybe_unused, + struct ip_callchain *chain __maybe_unused) +{ + return -1; +} diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile index 729bbdf..8d1417d 100644 --- a/tools/perf/config/Makefile +++ b/tools/perf/config/Makefile @@ -48,6 +48,10 @@ ifneq ($(ARCH),$(filter $(ARCH),x86 arm)) NO_LIBDW_DWARF_UNWIND := 1 endif +ifeq ($(ARCH),powerpc) + CFLAGS += -DHAVE_SKIP_CALLCHAIN_IDX +endif + ifeq ($(LIBUNWIND_LIBS),) NO_LIBUNWIND := 1 else diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h index 8f84423..57d3d33 100644 --- a/tools/perf/util/callchain.h +++ b/tools/perf/util/callchain.h @@ -176,4 +176,41 @@ static inline void callchain_cursor_snapshot(struct callchain_cursor *dest, dest-first = src-curr; dest-nr -= src-pos; } + +/* + * Some architectures (eg: Powerpc), check DWARF debug information + * and skip a specific callchain entry in the @chain-ips[] list. 
+ * + * Return index of the entry to skip or -1 to not skip any entry. + */ +#ifdef HAVE_SKIP_CALLCHAIN_IDX +extern int +arch_skip_callchain_idx(struct machine *machine __maybe_unused, + struct thread *thread __maybe_unused, + struct ip_callchain *chain __maybe_unused); +#else +static inline int +arch_skip_callchain_idx(struct machine *machine __maybe_unused, + struct thread *thread __maybe_unused, + struct ip_callchain *chain __maybe_unused) +{ + return -1; +} +#endif + +static inline u64 +next_callchain_ip(struct ip_callchain *chain, + enum chain_order order, +
[PATCH v4 2/2]: powerpc/perf: Adjust callchain based on DWARF debug info
Replace the arch_skip_callchain_idx() stub in Powerpc with code that checks the DWARF debug information and identifies the callchain entry to skip. Callgraph before the patch: 14.67% 2234 sprintft libc-2.18.so [.] __random | --- __random | |--61.12%-- __random | | | |--97.15%-- rand | | do_my_sprintf | | main | | generic_start_main.isra.0 | | __libc_start_main | | 0x0 | | | --2.85%-- do_my_sprintf | main | generic_start_main.isra.0 | __libc_start_main | 0x0 | --38.88%-- rand | |--94.01%-- rand | do_my_sprintf | main | generic_start_main.isra.0 | __libc_start_main | 0x0 | --5.99%-- do_my_sprintf main generic_start_main.isra.0 __libc_start_main 0x0 Callgraph after the patch: 14.67% 2234 sprintft libc-2.18.so [.] __random | --- __random | |--95.93%-- rand | do_my_sprintf | main | generic_start_main.isra.0 | __libc_start_main | 0x0 | --4.07%-- do_my_sprintf main generic_start_main.isra.0 __libc_start_main 0x0 TODO: For split-debug info objects like glibc, we can only determine the call-frame-address only when both .eh_frame and .debug_info sections are available. We should be able to determin the CFA even without the .eh_frame section. Fix suggested by Anton Blanchard. Thanks to valuable input on DWARF debug information from Ulrich Weigand. Reported-by: Maynard Johnson mayn...@us.ibm.com Tested-by: Maynard Johnson mayn...@us.ibm.com Signed-off-by: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com --- Changelog[v4] Move Powerpc-specific code into a separate patch Changelog[v3] [Jiri Olsa] Rename function to arch_skip_callchain_idx() to be consistent with behavior. [Jiri Olsa] Add '__maybe_unused' tags for unused parameters. 
Changelog[v2]: Add missing dwfl_end() Fix merge conflicts due to some unwind code tools/perf/arch/powerpc/util/skip-callchain-idx.c | 251 - 1 file changed, 246 insertions(+), 5 deletions(-) diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c b/tools/perf/arch/powerpc/util/skip-callchain-idx.c index 7350c36..a7c23a4 100644 --- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c +++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c @@ -16,10 +16,251 @@ #include util/thread.h #include util/callchain.h -/* Stub for now */ -int arch_skip_callchain_idx(struct machine *machine __maybe_unused, - struct thread *thread __maybe_unused, - struct ip_callchain *chain __maybe_unused) +/* + * When saving the callchain on Power, the kernel conservatively saves + * excess entries in the callchain. A few of these entries are needed + * in some cases but not others. If the unnecessary entries are not + * ignored, we end up with duplicate arcs in the call-graphs. Use + * DWARF debug information to skip over any unnecessary callchain + * entries. + * + * See function header for arch_adjust_callchain() below for more details. + * + * The libdwfl code in this file is based on code from elfutils + * (libdwfl/argp-std.c, libdwfl/tests/addrcfi.c, etc). + */ +static char *debuginfo_path; + +static const Dwfl_Callbacks offline_callbacks = { + .debuginfo_path = debuginfo_path, + .find_debuginfo = dwfl_standard_find_debuginfo, + .section_address = dwfl_offline_section_address, +}; + + +/* + * Use the DWARF expression for the Call-frame-address and determine + * if return address is in LR and if a new frame was allocated. + */ +static int check_return_reg(int ra_regno, Dwarf_Frame *frame) +{ + Dwarf_Op ops_mem[2]; + Dwarf_Op dummy; + Dwarf_Op *ops = dummy; + size_t nops; + int result; + + result = dwarf_frame_register(frame, ra_regno, ops_mem,
[PATCH] powerpc: Don't setup CPUs with bad status
Unfortunately Linux doesn't check this property and will put the bad CPU in the present map. This has caused hangs on booting when we try to unsplit the core. This patch checks the CPU is available via this status property before putting it in the present map.
[PATCH v9 1/3] powerpc/eeh: Avoid event on passed PE
We must not handle EEH error on devices which are passed to somebody else. Instead, we expect that the frozen device owner detects an EEH error and recovers from it. This avoids EEH error handling on passed through devices so the device owner gets a chance to handle them. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com Acked-by: Alexander Graf ag...@suse.de --- arch/powerpc/include/asm/eeh.h| 7 +++ arch/powerpc/kernel/eeh.c | 8 arch/powerpc/platforms/powernv/eeh-ioda.c | 3 ++- 3 files changed, 17 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h index 7782056..653d981 100644 --- a/arch/powerpc/include/asm/eeh.h +++ b/arch/powerpc/include/asm/eeh.h @@ -25,6 +25,7 @@ #include linux/list.h #include linux/string.h #include linux/time.h +#include linux/atomic.h struct pci_dev; struct pci_bus; @@ -84,6 +85,7 @@ struct eeh_pe { int freeze_count; /* Times of froze up*/ struct timeval tstamp; /* Time on first-time freeze*/ int false_positives;/* Times of reported #ff's */ + atomic_t pass_dev_cnt; /* Count of passed through devs */ struct eeh_pe *parent; /* Parent PE*/ struct list_head child_list;/* Link PE to the child list*/ struct list_head edevs; /* Link list of EEH devices */ @@ -93,6 +95,11 @@ struct eeh_pe { #define eeh_pe_for_each_dev(pe, edev, tmp) \ list_for_each_entry_safe(edev, tmp, pe-edevs, list) +static inline bool eeh_pe_passed(struct eeh_pe *pe) +{ + return pe ? !!atomic_read(pe-pass_dev_cnt) : false; +} + /* * The struct is used to trace EEH state for the associated * PCI device node or PCI device. In future, it might diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 9c6b899..3bc8b12 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -400,6 +400,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev) if (ret 0) return ret; + /* +* If the PE isn't owned by us, we shouldn't check the +* state. Instead, let the owner handle it if the PE has +* been frozen. 
+*/ + if (eeh_pe_passed(pe)) + return 0; + /* If we already have a pending isolation event for this * slot, we know it's bad already, we don't need to check. * Do this checking under a lock; as multiple PCI devices diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c b/arch/powerpc/platforms/powernv/eeh-ioda.c index cab3e62..79193eb 100644 --- a/arch/powerpc/platforms/powernv/eeh-ioda.c +++ b/arch/powerpc/platforms/powernv/eeh-ioda.c @@ -892,7 +892,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe) opal_pci_eeh_freeze_clear(phb-opal_id, frozen_pe_no, OPAL_EEH_ACTION_CLEAR_FREEZE_ALL); ret = EEH_NEXT_ERR_NONE; - } else if ((*pe)-state EEH_PE_ISOLATED) { + } else if ((*pe)-state EEH_PE_ISOLATED || + eeh_pe_passed(*pe)) { ret = EEH_NEXT_ERR_NONE; } else { pr_err(EEH: Frozen PHB#%x-PE#%x (%s) detected\n, -- 1.8.3.2 ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
[PATCH v9 2/3] powerpc/eeh: EEH support for VFIO PCI device
The patch exports functions to be used by new VFIO ioctl command, which will be introduced in subsequent patch, to support EEH functionality for VFIO PCI devices.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c index 3bc8b12..fc90df0 100644 --- a/arch/powerpc/kernel/eeh.c +++ b/arch/powerpc/kernel/eeh.c @@ -40,6 +40,7 @@ #include asm/eeh.h #include asm/eeh_event.h #include asm/io.h +#include asm/iommu.h #include asm/machdep.h #include asm/ppc-pci.h #include asm/rtas.h @@ -108,6 +109,9 @@ struct eeh_ops *eeh_ops = NULL; /* Lock to avoid races due to multiple reports of an error */ DEFINE_RAW_SPINLOCK(confirm_error_lock); +/* Lock to protect passed flags */ +static DEFINE_MUTEX(eeh_dev_mutex); + /* Buffer for reporting pci register dumps. Its here in BSS, and * not dynamically alloced, so that it ends up in RMO where RTAS * can access it. @@ -1106,6 +1110,270 @@ void eeh_remove_device(struct pci_dev *dev) edev-mode = ~EEH_DEV_SYSFS; } +/** + * eeh_dev_open - Increase count of pass through devices for PE + * @pdev: PCI device + * + * Increase count of passed through devices for the indicated + * PE. In the result, the EEH errors detected on the PE won't be + * reported. The PE owner will be responsible for detection + * and recovery. + */ +int eeh_dev_open(struct pci_dev *pdev) +{ + struct eeh_dev *edev; + + mutex_lock(eeh_dev_mutex); + + /* No PCI device ? */ + if (!pdev) + goto out; + + /* No EEH device or PE ? */ + edev = pci_dev_to_eeh_dev(pdev); + if (!edev || !edev-pe) + goto out; + + /* Increase PE's pass through count */ + atomic_inc(edev-pe-pass_dev_cnt); + mutex_unlock(eeh_dev_mutex); + + return 0; +out: + mutex_unlock(eeh_dev_mutex); + return -ENODEV; +} +EXPORT_SYMBOL_GPL(eeh_dev_open); + +/** + * eeh_dev_release - Decrease count of pass through devices for PE + * @pdev: PCI device + * + * Decrease count of pass through devices for the indicated PE. If + * there is no passed through device in PE, the EEH errors detected + * on the PE will be reported and handled as usual. 
+ */ +void eeh_dev_release(struct pci_dev *pdev) +{ + struct eeh_dev *edev; + + mutex_lock(eeh_dev_mutex); + + /* No PCI device ? */ + if (!pdev) + goto out; + + /* No EEH device ? */ + edev = pci_dev_to_eeh_dev(pdev); + if (!edev || !edev-pe || !eeh_pe_passed(edev-pe)) + goto out; + + /* Decrease PE's pass through count */ + atomic_dec(edev-pe-pass_dev_cnt); + WARN_ON(atomic_read(edev-pe-pass_dev_cnt) 0); +out: + mutex_unlock(eeh_dev_mutex); +} +EXPORT_SYMBOL(eeh_dev_release); + +/** + * eeh_iommu_group_to_pe - Convert IOMMU group to EEH PE + * @group: IOMMU group + * + * The routine is called to convert IOMMU group to EEH PE. + */ +struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group) +{ + struct iommu_table *tbl; + struct pci_dev *pdev = NULL; + struct eeh_dev *edev; + bool found = false; + +
[PATCH v9 0/3] EEH Support for VFIO PCI Device
* EEH RTAS requests are routed to QEMU, and then possibly to the host kernel. The mechanism of KVM in-kernel handling is dropped. * Error injection is reimplemented based on syscall, instead of KVM in-kernel handling. The logic for error injection token management is moved to QEMU. The error injection request is routed to QEMU and then possibly to the host kernel.
Another potential benefit is PowerNV/pSeries are sharing eeh_ops and the same infrastructure could possibly work for KVM_PR and KVM_HV mode at the same time.
* Define various EEH options as macros in vfio.h for userland to use. v7 - v8: * Change ioctl commands back to combined one. * EEH related logic was put into drivers/vfio/vfio_eeh.c, which is only built with CONFIG_EEH. Otherwise, inline functions defined in include/linux/vfio.h * Change vfio.txt according to the source code changes. * Fix various comments from internal reviews by Alexey. Thanks to Alexey. v8 - v9: * Remove unused macros in asm/include/eeh.h * Missed to disable VFIO device on error from
[PATCH v9 3/3] drivers/vfio: EEH support for VFIO PCI device
The patch adds new IOCTL commands for sPAPR VFIO container device to support EEH functionality for PCI devices, which have been passed through from host to somebody else via VFIO. Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com Acked-by: Alexander Graf ag...@suse.de --- Documentation/vfio.txt | 87 +++-- drivers/vfio/Makefile | 1 + drivers/vfio/pci/vfio_pci.c | 18 ++-- drivers/vfio/vfio_iommu_spapr_tce.c | 17 +++- drivers/vfio/vfio_spapr_eeh.c | 87 + include/linux/vfio.h| 23 ++ include/uapi/linux/vfio.h | 34 +++ 7 files changed, 259 insertions(+), 8 deletions(-) create mode 100644 drivers/vfio/vfio_spapr_eeh.c diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt index b9ca023..3fa4538 100644 --- a/Documentation/vfio.txt +++ b/Documentation/vfio.txt @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in real mode which provides an excellent performance which has limitations such as inability to do locked pages accounting in real time. -So 3 additional ioctls have been added: +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O +subtree that can be treated as a unit for the purposes of partitioning and +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a +function of a multi-function IOA, or multiple IOAs (possibly including switch +and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors +and recover from them via EEH RTAS services, which works on the basis of +additional ioctl commands. + +So 4 additional ioctls have been added: VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start of the DMA window on the PCI bus. @@ -316,9 +324,12 @@ So 3 additional ioctls have been added: VFIO_IOMMU_DISABLE - disables the container. + VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and recovery. The code flow from the example above should be slightly changed: + struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) }; + . 
/* Add the group to the container */ ioctl(group, VFIO_GROUP_SET_CONTAINER, container); @@ -342,9 +353,79 @@ The code flow from the example above should be slightly changed: dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE; /* Check here is .iova/.size are within DMA window from spapr_iommu_info */ - ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map); - . + + /* Get a file descriptor for the device */ + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0); + + + + /* Gratuitous device reset and go... */ + ioctl(device, VFIO_DEVICE_RESET); + + /* Make sure EEH is supported */ + ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH); + + /* Enable the EEH functionality on the device */ + pe_op.op = VFIO_EEH_PE_ENABLE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* You're suggested to create additional data struct to represent +* PE, and put child devices belonging to same IOMMU group to the +* PE instance for later reference. +*/ + + /* Check the PE's state and make sure it's in functional state */ + pe_op.op = VFIO_EEH_PE_GET_STATE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Save device state using pci_save_state(). +* EEH should be enabled on the specified device. +*/ + + + + /* When 0xFF's returned from reading PCI config space or IO BARs +* of the PCI device. Check the PE's state to see if that has been +* frozen. +*/ + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Waiting for pending PCI transactions to be completed and don't +* produce any more PCI traffic from/to the affected PE until +* recovery is finished. +*/ + + /* Enable IO for the affected PE and collect logs. Usually, the +* standard part of PCI config space, AER registers are dumped +* as logs for further analysis. +*/ + pe_op.op = VFIO_EEH_PE_UNFREEZE_IO; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* +* Issue PE reset: hot or fundamental reset. Usually, hot reset +* is enough. However, the firmware of some PCI adapters would +* require fundamental reset. 
+*/ + pe_op.op = VFIO_EEH_PE_RESET_HOT; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Configure the PCI bridges for the affected PE */ + pe_op.op = VFIO_EEH_PE_CONFIGURE; + ioctl(container, VFIO_EEH_PE_OP, pe_op); + + /* Restored state we saved at initialization time. pci_restore_state() +* is good enough
RE: [linuxppc-release] [PATCH][v10] powerpc/mpc85xx:Add initial device tree support of T104x
-Original Message- From: linuxppc-release-boun...@linux.freescale.net [mailto:linuxppc- release-boun...@linux.freescale.net] On Behalf Of Prabhakar Kushwaha Sent: Monday, April 21, 2014 7:34 PM To: linuxppc-dev@lists.ozlabs.org Cc: Wood Scott-B07421; Jain Priyanka-B32167; Aggrwal Poonam-B10812; Kushwaha Prabhakar-B32579 Subject: [linuxppc-release] [PATCH][v10] powerpc/mpc85xx:Add initial device tree support of T104x The QorIQ T1040/T1042 processor support four integrated 64-bit e5500 PA processor cores with high-performance data path acceleration architecture and network peripheral interfaces required for networking telecommunications. + + iommu@2 { + compatible = fsl,pamu-v1.0, fsl,pamu; + reg = 0x2 0x1000; + ranges = 0 0x2 0x1000; + #address-cells = 1; + #size-cells = 1; + interrupts = + 24 2 0 0 + 16 2 1 30; + pamu0: pamu@0 { + reg = 0 0x1000; + fsl,primary-cache-geometry = 128 1; + fsl,secondary-cache-geometry = 16 2; + }; [Shengzhou] T1040 RM says: Hardware coherent PAMU Look-aside caches to improve performance * A 32-entry, direct-mapped primary PAACT cache * A 128-entry, 2-way, set-associative secondary PAACT cache It appears it should be: fsl,primary-cache-geometry = 32 1; fsl,secondary-cache-geometry = 128 2; is there any reason that it was 128 1, 16 2 ? ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 1/2] powerpc/powernv: include asm/smp.h to handle UP config
On 06/05/2014 08:51 PM, Shreyas B. Prabhu wrote: Build throws following errors when CONFIG_SMP=n arch/powerpc/platforms/powernv/setup.c: In function ‘pnv_kexec_wait_secondaries_down’: arch/powerpc/platforms/powernv/setup.c:179:4: error: implicit declaration of function ‘get_hard_smp_processor_id’ rc = opal_query_cpu_status(get_hard_smp_processor_id(i), The usage of get_hard_smp_processor_id() needs the declaration from asm/smp.h. The file setup.c includes linux/sched.h, which in-turn includes linux/smp.h. However, linux/smp.h includes asm/smp.h only on SMP configs and hence UP builds fail. Fix this by directly including asm/smp.h in setup.c unconditionally. Can you please clean up the description in the commit message ? and also the first line in the commit message should mention that the patch is trying to fix a UP specific build failure. ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev
Re: [PATCH 2/2] powerpc/powernv : Disable subcore for UP configs
On 06/05/2014 08:54 PM, Shreyas B. Prabhu wrote: Build throws following errors when CONFIG_SMP=n arch/powerpc/platforms/powernv/subcore.c: In function ‘cpu_update_split_mode’: arch/powerpc/platforms/powernv/subcore.c:274:15: error: ‘setup_max_cpus’ undeclared (first use in this function) arch/powerpc/platforms/powernv/subcore.c:285:5: error: lvalue required as left operand of assignment 'setup_max_cpus' variable is relevant only on SMP, so there is no point working around it for UP. Furthermore, subcore.c itself is relevant only on SMP and hence the better solution is to exclude subcore.c for UP builds. Signed-off-by: Shreyas B. Prabhu shre...@linux.vnet.ibm.com --- This patch applies on top of ben/powerpc.git/next branch arch/powerpc/platforms/powernv/Makefile | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/arch/powerpc/platforms/powernv/Makefile b/arch/powerpc/platforms/powernv/Makefile index 4ad0d34..636d206 100644 --- a/arch/powerpc/platforms/powernv/Makefile +++ b/arch/powerpc/platforms/powernv/Makefile @@ -1,9 +1,9 @@ obj-y+= setup.o opal-takeover.o opal-wrappers.o opal.o opal-async.o obj-y+= opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o obj-y+= rng.o opal-elog.o opal-dump.o opal-sysparam.o opal-sensor.o -obj-y+= opal-msglog.o subcore.o subcore-asm.o +obj-y+= opal-msglog.o subcore-asm.o subcore-asm.o can also move down here as well ? -obj-$(CONFIG_SMP)+= smp.o +obj-$(CONFIG_SMP)+= smp.o subcore.o ___ Linuxppc-dev mailing list Linuxppc-dev@lists.ozlabs.org https://lists.ozlabs.org/listinfo/linuxppc-dev