[PATCH 1/1 v2] powerpc: Correct DSCR during TM context switch

2014-06-05 Thread Sam Bobroff
Correct the DSCR SPR becoming temporarily corrupted if a task is
context switched during a transaction.

The problem occurs while suspending the task and is caused by saving
the DSCR to thread.dscr after it has already been set to the CPU's
default value:

__switch_to() calls __switch_to_tm()
which calls tm_reclaim_task()
which calls tm_reclaim_thread()
which calls tm_reclaim()
where the DSCR is set to the CPU's default
__switch_to() calls _switch()
where thread.dscr is set to the DSCR

When the task is resumed, its transaction will be doomed (as usual)
and the DSCR SPR will be corrupted, although the checkpointed value
will be correct. Therefore the DSCR will be immediately corrected by
the transaction aborting, unless it has been suspended. In that case
the incorrect value can be seen by the task until it resumes the
transaction.

The fix is to treat the DSCR similarly to the TAR and save it early
in __switch_to().

A program exposing the problem is added to the kernel self tests as:
tools/testing/selftests/powerpc/tm/tm-resched-dscr.

Signed-off-by: Sam Bobroff sam.bobr...@au1.ibm.com
---
Changes:
v2:
* Reworked commit message.
* Adjusted test code and added it to kernel self tests.
---
 arch/powerpc/include/asm/switch_to.h   |6 +-
 arch/powerpc/kernel/entry_64.S |6 --
 arch/powerpc/kernel/process.c  |8 +-
 tools/testing/selftests/powerpc/Makefile   |2 +-
 tools/testing/selftests/powerpc/tm/Makefile|   15 
 .../testing/selftests/powerpc/tm/tm-resched-dscr.c |   90 
 6 files changed, 114 insertions(+), 13 deletions(-)
 create mode 100644 tools/testing/selftests/powerpc/tm/Makefile
 create mode 100644 tools/testing/selftests/powerpc/tm/tm-resched-dscr.c

diff --git a/arch/powerpc/include/asm/switch_to.h 
b/arch/powerpc/include/asm/switch_to.h
index 2737f46..3efd0e5 100644
--- a/arch/powerpc/include/asm/switch_to.h
+++ b/arch/powerpc/include/asm/switch_to.h
@@ -16,13 +16,15 @@ struct thread_struct;
 extern struct task_struct *_switch(struct thread_struct *prev,
   struct thread_struct *next);
 #ifdef CONFIG_PPC_BOOK3S_64
-static inline void save_tar(struct thread_struct *prev)
+static inline void save_early_sprs(struct thread_struct *prev)
 {
if (cpu_has_feature(CPU_FTR_ARCH_207S))
prev-tar = mfspr(SPRN_TAR);
+   if (cpu_has_feature(CPU_FTR_DSCR))
+   prev-dscr = mfspr(SPRN_DSCR);
 }
 #else
-static inline void save_tar(struct thread_struct *prev) {}
+static inline void save_early_sprs(struct thread_struct *prev) {}
 #endif
 
 extern void enable_kernel_fp(void);
diff --git a/arch/powerpc/kernel/entry_64.S b/arch/powerpc/kernel/entry_64.S
index 662c6dd..a107f4a 100644
--- a/arch/powerpc/kernel/entry_64.S
+++ b/arch/powerpc/kernel/entry_64.S
@@ -432,12 +432,6 @@ BEGIN_FTR_SECTION
std r24,THREAD_VRSAVE(r3)
 END_FTR_SECTION_IFSET(CPU_FTR_ALTIVEC)
 #endif /* CONFIG_ALTIVEC */
-#ifdef CONFIG_PPC64
-BEGIN_FTR_SECTION
-   mfspr   r25,SPRN_DSCR
-   std r25,THREAD_DSCR(r3)
-END_FTR_SECTION_IFSET(CPU_FTR_DSCR)
-#endif
and.r0,r0,r22
beq+1f
andcr22,r22,r0
diff --git a/arch/powerpc/kernel/process.c b/arch/powerpc/kernel/process.c
index e247898..8d2065e 100644
--- a/arch/powerpc/kernel/process.c
+++ b/arch/powerpc/kernel/process.c
@@ -771,15 +771,15 @@ struct task_struct *__switch_to(struct task_struct *prev,
 
WARN_ON(!irqs_disabled());
 
-   /* Back up the TAR across context switches.
+   /* Back up the TAR and DSCR across context switches.
 * Note that the TAR is not available for use in the kernel.  (To
 * provide this, the TAR should be backed up/restored on exception
 * entry/exit instead, and be in pt_regs.  FIXME, this should be in
 * pt_regs anyway (for debug).)
-* Save the TAR here before we do treclaim/trecheckpoint as these
-* will change the TAR.
+* Save the TAR and DSCR here before we do treclaim/trecheckpoint as
+* these will change them.
 */
-   save_tar(prev-thread);
+   save_early_sprs(prev-thread);
 
__switch_to_tm(prev);
 
diff --git a/tools/testing/selftests/powerpc/Makefile 
b/tools/testing/selftests/powerpc/Makefile
index 316194f..e1544e8 100644
--- a/tools/testing/selftests/powerpc/Makefile
+++ b/tools/testing/selftests/powerpc/Makefile
@@ -13,7 +13,7 @@ CFLAGS := -Wall -O2 -flto -Wall -Werror 
-DGIT_VERSION='$(GIT_VERSION)' -I$(CUR
 
 export CC CFLAGS
 
-TARGETS = pmu copyloops
+TARGETS = pmu copyloops tm
 
 endif
 
diff --git a/tools/testing/selftests/powerpc/tm/Makefile 
b/tools/testing/selftests/powerpc/tm/Makefile
new file mode 100644
index 000..51267f4
--- /dev/null
+++ b/tools/testing/selftests/powerpc/tm/Makefile
@@ -0,0 +1,15 @@
+PROGS := tm-resched-dscr
+
+all: $(PROGS)
+
+$(PROGS):
+

[PATCH v8 1/3] powerpc/eeh: Avoid event on passed PE

2014-06-05 Thread Gavin Shan
We must not handle EEH errors on devices which are passed to somebody
else. Instead, we expect that the frozen device owner detects an EEH
error and recovers from it.

This avoids EEH error handling on passed through devices so the device
owner gets a chance to handle them.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h| 7 +++
 arch/powerpc/kernel/eeh.c | 8 
 arch/powerpc/platforms/powernv/eeh-ioda.c | 3 ++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7782056..653d981 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -25,6 +25,7 @@
 #include linux/list.h
 #include linux/string.h
 #include linux/time.h
+#include linux/atomic.h
 
 struct pci_dev;
 struct pci_bus;
@@ -84,6 +85,7 @@ struct eeh_pe {
int freeze_count;   /* Times of froze up*/
struct timeval tstamp;  /* Time on first-time freeze*/
int false_positives;/* Times of reported #ff's  */
+   atomic_t pass_dev_cnt;  /* Count of passed through devs */
struct eeh_pe *parent;  /* Parent PE*/
struct list_head child_list;/* Link PE to the child list*/
struct list_head edevs; /* Link list of EEH devices */
@@ -93,6 +95,11 @@ struct eeh_pe {
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
list_for_each_entry_safe(edev, tmp, pe-edevs, list)
 
+static inline bool eeh_pe_passed(struct eeh_pe *pe)
+{
+   return pe ? !!atomic_read(pe-pass_dev_cnt) : false;
+}
+
 /*
  * The struct is used to trace EEH state for the associated
  * PCI device node or PCI device. In future, it might
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..3bc8b12 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -400,6 +400,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
if (ret  0)
return ret;
 
+   /*
+* If the PE isn't owned by us, we shouldn't check the
+* state. Instead, let the owner handle it if the PE has
+* been frozen.
+*/
+   if (eeh_pe_passed(pe))
+   return 0;
+
/* If we already have a pending isolation event for this
 * slot, we know it's bad already, we don't need to check.
 * Do this checking under a lock; as multiple PCI devices
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
b/arch/powerpc/platforms/powernv/eeh-ioda.c
index cab3e62..79193eb 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -892,7 +892,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
opal_pci_eeh_freeze_clear(phb-opal_id, 
frozen_pe_no,
OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
ret = EEH_NEXT_ERR_NONE;
-   } else if ((*pe)-state  EEH_PE_ISOLATED) {
+   } else if ((*pe)-state  EEH_PE_ISOLATED ||
+  eeh_pe_passed(*pe)) {
ret = EEH_NEXT_ERR_NONE;
} else {
pr_err(EEH: Frozen PHB#%x-PE#%x (%s) 
detected\n,
-- 
1.8.3.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v8 2/3] powerpc/eeh: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
The patch exports functions to be used by a new VFIO ioctl command,
which will be introduced in a subsequent patch, to support EEH
functionality for VFIO PCI devices.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/eeh.h |  14 +++
 arch/powerpc/kernel/eeh.c  | 268 +
 2 files changed, 282 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 653d981..5b4cc4e 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -166,6 +166,8 @@ enum {
 #define EEH_OPT_ENABLE 1   /* EEH enable   */
 #define EEH_OPT_THAW_MMIO  2   /* MMIO enable  */
 #define EEH_OPT_THAW_DMA   3   /* DMA enable   */
+#define EEH_OPT_GET_PE_ADDR0   /* Get PE addr  */
+#define EEH_OPT_GET_PE_MODE1   /* Get PE mode  */
 #define EEH_STATE_UNAVAILABLE  (1  0)/* State unavailable*/
 #define EEH_STATE_NOT_SUPPORT  (1  1)/* EEH not supported*/
 #define EEH_STATE_RESET_ACTIVE (1  2)/* Active reset */
@@ -173,6 +175,11 @@ enum {
 #define EEH_STATE_DMA_ACTIVE   (1  4)/* Active DMA   */
 #define EEH_STATE_MMIO_ENABLED (1  5)/* MMIO enabled */
 #define EEH_STATE_DMA_ENABLED  (1  6)/* DMA enabled  */
+#define EEH_PE_STATE_NORMAL0   /* Normal state */
+#define EEH_PE_STATE_RESET 1   /* PE reset asserted*/
+#define EEH_PE_STATE_STOPPED_IO_DMA2   /* Frozen PE*/
+#define EEH_PE_STATE_STOPPED_DMA   4   /* Stopped DMA, Enabled IO */
+#define EEH_PE_STATE_UNAVAIL   5   /* Unavailable  */
 #define EEH_RESET_DEACTIVATE   0   /* Deactivate the PE reset  */
 #define EEH_RESET_HOT  1   /* Hot reset*/
 #define EEH_RESET_FUNDAMENTAL  3   /* Fundamental reset*/
@@ -280,6 +287,13 @@ void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
+int eeh_dev_open(struct pci_dev *pdev);
+void eeh_dev_release(struct pci_dev *pdev);
+struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group);
+int eeh_pe_set_option(struct eeh_pe *pe, int option);
+int eeh_pe_get_state(struct eeh_pe *pe);
+int eeh_pe_reset(struct eeh_pe *pe, int option);
+int eeh_pe_configure(struct eeh_pe *pe);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3bc8b12..fc90df0 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -40,6 +40,7 @@
 #include asm/eeh.h
 #include asm/eeh_event.h
 #include asm/io.h
+#include asm/iommu.h
 #include asm/machdep.h
 #include asm/ppc-pci.h
 #include asm/rtas.h
@@ -108,6 +109,9 @@ struct eeh_ops *eeh_ops = NULL;
 /* Lock to avoid races due to multiple reports of an error */
 DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
+/* Lock to protect passed flags */
+static DEFINE_MUTEX(eeh_dev_mutex);
+
 /* Buffer for reporting pci register dumps. Its here in BSS, and
  * not dynamically alloced, so that it ends up in RMO where RTAS
  * can access it.
@@ -1106,6 +1110,270 @@ void eeh_remove_device(struct pci_dev *dev)
edev-mode = ~EEH_DEV_SYSFS;
 }
 
+/**
+ * eeh_dev_open - Increase count of pass through devices for PE
+ * @pdev: PCI device
+ *
+ * Increase count of passed through devices for the indicated
+ * PE. In the result, the EEH errors detected on the PE won't be
+ * reported. The PE owner will be responsible for detection
+ * and recovery.
+ */
+int eeh_dev_open(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev;
+
+   mutex_lock(eeh_dev_mutex);
+
+   /* No PCI device ? */
+   if (!pdev)
+   goto out;
+
+   /* No EEH device or PE ? */
+   edev = pci_dev_to_eeh_dev(pdev);
+   if (!edev || !edev-pe)
+   goto out;
+
+   /* Increase PE's pass through count */
+   atomic_inc(edev-pe-pass_dev_cnt);
+   mutex_unlock(eeh_dev_mutex);
+
+   return 0;
+out:
+   mutex_unlock(eeh_dev_mutex);
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(eeh_dev_open);
+
+/**
+ * eeh_dev_release - Decrease count of pass through devices for PE
+ * @pdev: PCI device
+ *
+ * Decrease count of pass through devices for the indicated PE. If
+ * there is no passed through device in PE, the EEH errors detected
+ * on the PE will be reported and handled as usual.
+ */
+void eeh_dev_release(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev;
+
+   mutex_lock(eeh_dev_mutex);
+
+   /* No PCI device ? */
+   if (!pdev)
+   goto out;
+
+   /* No EEH device ? */
+   edev = pci_dev_to_eeh_dev(pdev);
+   if (!edev || !edev-pe || !eeh_pe_passed(edev-pe))
+   goto out;
+
+   /* Decrease PE's pass through count */
+   

[PATCH v8 0/3] EEH Support for VFIO PCI Device

2014-06-05 Thread Gavin Shan
The series of patches adds EEH support for PCI devices, which are passed
through to PowerKVM based guest via VFIO. The implementation is straightforward
based on the issues or problems we have to resolve to support EEH for PowerKVM
based guest.

- Emulation for EEH RTAS requests. All EEH RTAS requests goes to QEMU firstly.
  If QEMU can't handle it, the request will be sent to host via newly introduced
  VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in host kernel.

The series of patches requires corresponding QEMU changes.

Change log
==
v1 - v2:
* EEH RTAS requests are routed to QEMU, and then possibly to host 
kernel.
  The mechanism of KVM in-kernel handling is dropped.
* Error injection is reimplemented based on syscall, instead of KVM 
in-kernel
  handling. The logic for error injection token management is moved to
  QEMU. The error injection request is routed to QEMU and then possibly
  to host kernel.
v2 - v3:
* Make the fields in struct eeh_vfio_pci_addr, struct vfio_eeh_info 
based
  on the comments from Alexey.
* Define macros for EEH VFIO operations (Alexey).
* Clear frozen state after successful PE reset.
* Merge original [PATCH 1/2/3] to one.
v3 - v4:
* Remove the error injection from the patchset. Mike or I will work on 
that
  later.
* Rename CONFIG_VFIO_EEH to VFIO_PCI_EEH.
* Rename the IOCTL command to VFIO_EEH_OP and it's handled by VFIO-PCI 
device
  instead of VFIO container.
* Rename the IOCTL argument structure to vfio_eeh_op accordingly. 
Also, more
  fields added to hold return values for RTAS requests.
* The address mapping stuff is totally removed. When opening or 
releasing VFIO
  PCI device, notification sent to EEH to update the flags indicates 
the device
  is passed to guest or not.
* Change pr_warn() to pr_debug() to avoid DOS as pointed by Alex.W
* Argument size check issue pointed by Alex.W.
v4 - v5:
* Functions for VFIO PCI EEH support are moved to eeh.c and exported 
from there.
  VFIO PCI driver just uses those functions to tackle IOCTL command 
VFIO_EEH_OP.
  All of this is to make the code organized in a good way as suggested 
by Alex.G.
  Another potential benefit is PowerNV/pSeries are sharing eeh_ops 
and same
  infrastructure could possibly work for KVM_PR and KVM_HV mode at the 
same time.
* Don't clear error injection registers after finishing PE reset as the 
patchset
  is doing nothing related to error injection.
* Amending Documentation/vfio.txt, which was missed in last revision.
* No QEMU changes for this revision. v4 works well. Also, remove 
RFC from the
  subject as the design is basically recognized.
v5 - v6:
* CONFIG_VFIO_PCI_EEH removed. Instead to use CONFIG_EEH.
* Split one ioctl command to 5.
* In eeh.c, description has been added for those exported functions. 
Also, the
  functions return negative values on error, while other return values 
carry information.
  All digital numbers have been replaced by macros defined in eeh.h. 
The comments,
  including the function names have been amended not to mention guest 
or vfio.
* Add one mutex to protect flag in eeh_dev_open()/release().
* More information on how to use those ioctl commands to 
Documentation/vfio.txt.
v6 - v7:
* Remove ioctl command VFIO_EEH_PE_GET_ADDR, the PE address will be 
figured out
  in userland (e.g. QEMU) as Alex.G suggested.
* Let sPAPR VFIO container process the ioctl commands as VFIO container 
is naturally
  corresponds to IOMMU group (aka PE on sPAPR platform).
* All VFIO PCI EEH ioctl commands have argsz+flags for its companion 
data struct.
* For VFIO PCI EEH ioctl commands, ioctl() returns negative number to 
indicate error
or zero for success. Additional output information is transported by 
the companion
  data struct.
* Explaining PE in Documentation/vfio.txt, typo fixes, more comments 
suggested by
  Alex.G.
* Split/merge patches according to suggestions from Alex.G and Alex.W.
* To have EEH stub in drivers/vfio/pci/, which was suggested by Alex.W.
* Define various EEH options as macros in vfio.h for userland to use.
v7 - v8:
* Change ioctl commands back to combined one.
* EEH related logic was put into drivers/vfio/vfio_eeh.c, which is only 
built with
  CONFIG_EEH. Otherwise, inline functions defined in 
include/linux/vfio.h
* Change vfio.txt according to the source code changes.
* Fix various comments from internal reviews by Alexey. Thanks to 
Alexey.
 
Gavin Shan (3):
  powerpc/eeh: Avoid event on passed PE
  powerpc/eeh: EEH support for VFIO PCI device
  drivers/vfio: EEH support 

[PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
The patch adds new IOCTL commands for sPAPR VFIO container device
to support EEH functionality for PCI devices, which have been passed
through from host to somebody else via VFIO.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
---
 Documentation/vfio.txt  | 87 ++--
 drivers/vfio/Makefile   |  1 +
 drivers/vfio/pci/vfio_pci.c | 20 ++---
 drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++-
 drivers/vfio/vfio_spapr_eeh.c   | 89 +
 include/linux/vfio.h| 23 ++
 include/uapi/linux/vfio.h   | 35 +++
 7 files changed, 262 insertions(+), 10 deletions(-)
 create mode 100644 drivers/vfio/vfio_spapr_eeh.c

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..3fa4538 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
real mode which provides
 an excellent performance which has limitations such as inability to do
 locked pages accounting in real time.
 
-So 3 additional ioctls have been added:
+4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
+subtree that can be treated as a unit for the purposes of partitioning and
+error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
+function of a multi-function IOA, or multiple IOAs (possibly including switch
+and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors
+and recover from them via EEH RTAS services, which works on the basis of
+additional ioctl commands.
+
+So 4 additional ioctls have been added:
 
VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
of the DMA window on the PCI bus.
@@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
 
VFIO_IOMMU_DISABLE - disables the container.
 
+   VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
recovery.
 
 The code flow from the example above should be slightly changed:
 
+   struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
+
.
/* Add the group to the container */
ioctl(group, VFIO_GROUP_SET_CONTAINER, container);
@@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
changed:
dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 
/* Check here is .iova/.size are within DMA window from 
spapr_iommu_info */
-
ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map);
-   .
+
+   /* Get a file descriptor for the device */
+   device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0);
+
+   
+
+   /* Gratuitous device reset and go... */
+   ioctl(device, VFIO_DEVICE_RESET);
+
+   /* Make sure EEH is supported */
+   ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
+
+   /* Enable the EEH functionality on the device */
+   pe_op.op = VFIO_EEH_PE_ENABLE;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* You're suggested to create additional data struct to represent
+* PE, and put child devices belonging to same IOMMU group to the
+* PE instance for later reference.
+*/
+
+   /* Check the PE's state and make sure it's in functional state */
+   pe_op.op = VFIO_EEH_PE_GET_STATE;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* Save device state using pci_save_state().
+* EEH should be enabled on the specified device.
+*/
+
+   
+
+   /* When 0xFF's returned from reading PCI config space or IO BARs
+* of the PCI device. Check the PE's state to see if that has been
+* frozen.
+*/
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* Waiting for pending PCI transactions to be completed and don't
+* produce any more PCI traffic from/to the affected PE until
+* recovery is finished.
+*/
+
+   /* Enable IO for the affected PE and collect logs. Usually, the
+* standard part of PCI config space, AER registers are dumped
+* as logs for further analysis.
+*/
+   pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /*
+* Issue PE reset: hot or fundamental reset. Usually, hot reset
+* is enough. However, the firmware of some PCI adapters would
+* require fundamental reset.
+*/
+   pe_op.op = VFIO_EEH_PE_RESET_HOT;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+   pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* Configure the PCI bridges for the affected PE */
+   pe_op.op = VFIO_EEH_PE_CONFIGURE;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* Restored state we saved at initialization time. pci_restore_state()
+* is good enough as an example.
+*/
+
+   /* 

Re: [PATCH v8 2/3] powerpc/eeh: EEH support for VFIO PCI device

2014-06-05 Thread Benjamin Herrenschmidt
On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote:
 +#define EEH_OPT_GET_PE_ADDR0   /* Get PE addr  */
 +#define EEH_OPT_GET_PE_MODE1   /* Get PE mode  */

I assume that's just some leftover from the previous patches :-)

Don't respin just yet, let's see what other comments come in.

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/3] PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_64 capability number

2014-06-05 Thread Alexey Kardashevskiy
This adds a capability number for 64-bit TCE tables support.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 944cd21..e6972bf 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -744,6 +744,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_ENABLE_CAP_VM 98
 #define KVM_CAP_S390_IRQCHIP 99
 #define KVM_CAP_SPAPR_TCE_VFIO 100
+#define KVM_CAP_SPAPR_TCE_64 101
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.0.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration

2014-06-05 Thread Alexey Kardashevskiy
This reserves 2 capability numbers.

This implements an extended version of KVM_CREATE_SPAPR_TCE_64 ioctl.

Please advise how to proceed with these patches as I suspect that
first two should go via Paolo's tree while the last one via Alex Graf's tree
(correct?).

Thanks!

Alexey Kardashevskiy (3):
  PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number
  PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_64 capability number
  PPC: KVM: Add support for 64bit TCE windows

 Documentation/virtual/kvm/api.txt   | 46 +
 arch/powerpc/include/asm/kvm_host.h |  4 +++-
 arch/powerpc/include/asm/kvm_ppc.h  |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h |  9 
 arch/powerpc/kvm/book3s_64_vio.c|  4 +++-
 arch/powerpc/kvm/powerpc.c  | 24 ++-
 include/uapi/linux/kvm.h|  4 
 7 files changed, 89 insertions(+), 4 deletions(-)

-- 
2.0.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexey Kardashevskiy
The existing KVM_CREATE_SPAPR_TCE only supports 32bit windows which is not
enough for directly mapped windows as the guest can get more than 4GB.

This adds KVM_CREATE_SPAPR_TCE_64 ioctl and advertises it
via KVM_CAP_SPAPR_TCE_64 capability.

Since 64bit windows are to support Dynamic DMA windows (DDW), let's add
@bus_offset and @page_shift which are also required by DDW.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 Documentation/virtual/kvm/api.txt   | 46 +
 arch/powerpc/include/asm/kvm_host.h |  4 +++-
 arch/powerpc/include/asm/kvm_ppc.h  |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h |  9 
 arch/powerpc/kvm/book3s_64_vio.c|  4 +++-
 arch/powerpc/kvm/powerpc.c  | 24 ++-
 include/uapi/linux/kvm.h|  2 ++
 7 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index b4f5365..8a2a2da 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2484,6 +2484,52 @@ calls by the guest for that service will be passed to 
userspace to be
 handled.
 
 
+4.87 KVM_CREATE_SPAPR_TCE_64
+
+Capability: KVM_CAP_SPAPR_TCE_64
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_64 (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit
+windows.
+
+This creates a virtual TCE (translation control entry) table, which
+is an IOMMU for PAPR-style virtual I/O.  It is used to translate
+logical addresses used in virtual I/O into guest physical addresses,
+and provides a scatter/gather capability for PAPR virtual I/O.
+
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+   __u64 liobn;
+   __u64 window_size;
+   __u64 bus_offset;
+   __u32 page_shift;
+   __u32 flags;
+};
+
+The liobn field gives the logical IO bus number for which to create a
+TCE table. The window_size field specifies the size of the DMA window
+which this TCE table will translate - the table will contain one 64
+bit TCE entry for every IOMMU page. The bus_offset field tells where
+this window is mapped on the IO bus. The page_size field tells a size
+of the pages in this window, can be 4K, 64K, 16MB, etc. The flags field
+is not used at the moment but provides the room for extensions.
+
+When the guest issues an H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE hcall
+on a liobn for which a TCE table has been created using this ioctl(),
+the kernel will handle it in real or virtual mode, updating the TCE table.
+If liobn has not been registered with this ioctl, H_PUT_TCE/etc calls
+will cause a vm exit and must be handled by userspace.
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the created TCE table into userspace.  This lets userspace read
+the entries written by kernel-handled H_PUT_TCE calls, and also lets
+userspace update the TCE table directly which is useful in some
+circumstances.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 1eaea2d..260a810 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -179,7 +179,9 @@ struct kvmppc_spapr_tce_table {
struct list_head list;
struct kvm *kvm;
u64 liobn;
-   u32 window_size;
+   u64 window_size;
+   u64 bus_offset;
+   u32 page_shift;
struct page *pages[0];
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 4096f16..b472fd3 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -126,7 +126,7 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-   struct kvm_create_spapr_tce *args);
+   struct kvm_create_spapr_tce_64 *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba, unsigned long tce);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index a6665be..0ada7b4 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -333,6 +333,15 @@ struct kvm_create_spapr_tce {
__u32 window_size;
 };
 
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+   __u64 liobn;
+   __u64 window_size;
+   __u64 bus_offset;
+   __u32 page_shift;
+   __u32 flags;
+};
+
 /* for KVM_ALLOCATE_RMA */
 struct kvm_allocate_rma {
__u64 rma_size;
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 54cf9bc..230fa5f 

[PATCH 1/3] PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number

2014-06-05 Thread Alexey Kardashevskiy
This adds a capability number for in-kernel support for VFIO on
SPAPR platform.

The capability will tell the user space whether in-kernel handlers of
H_PUT_TCE can handle VFIO-targeted requests or not. If not, the user space
must not attempt allocating a TCE table in the host kernel via
the KVM_CREATE_SPAPR_TCE KVM ioctl because in that case TCE requests
will not be passed to the user space, which is the desired action in
a situation like that.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a8f4ee5..944cd21 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -743,6 +743,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IOAPIC_POLARITY_IGNORED 97
 #define KVM_CAP_ENABLE_CAP_VM 98
 #define KVM_CAP_S390_IRQCHIP 99
+#define KVM_CAP_SPAPR_TCE_VFIO 100
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.0.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Benjamin Herrenschmidt
On Thu, 2014-06-05 at 17:25 +1000, Alexey Kardashevskiy wrote:
 +This creates a virtual TCE (translation control entry) table, which
 +is an IOMMU for PAPR-style virtual I/O.  It is used to translate
 +logical addresses used in virtual I/O into guest physical addresses,
 +and provides a scatter/gather capability for PAPR virtual I/O.
 +
 +/* for KVM_CAP_SPAPR_TCE_64 */
 +struct kvm_create_spapr_tce_64 {
 +   __u64 liobn;
 +   __u64 window_size;
 +   __u64 bus_offset;
 +   __u32 page_shift;
 +   __u32 flags;
 +};
 +
 +The liobn field gives the logical IO bus number for which to create a
 +TCE table. The window_size field specifies the size of the DMA window
 +which this TCE table will translate - the table will contain one 64
 +bit TCE entry for every IOMMU page. The bus_offset field tells where
 +this window is mapped on the IO bus. 

Hrm, the bus_offset cannot be set arbitrarily, it has some pretty strong
HW limits depending on the type of bridge & architecture version...

Do you plan to have that knowledge in qemu ? Or do you have some other
mechanism to query it ? (I might be missing a piece of the puzzle here).

Also one thing I've been pondering ...

We'll end up wasting a ton of memory with those TCE tables. If you have
3 PEs mapped into a guest, it will try to create 3 DDW's mapping the
entire guest memory and so 3 TCE tables large enough for that ... and
which will contain exactly the same entries !

We really want to look into extending PAPR to allow the creation of
table aliases so that the guest can essentially create one table and
associate it with multiple PEs. We might still decide to do multiple
copies for NUMA reasons but no more than one per node for example... at
least we can have the policy in qemu/kvm.

Also, do you currently require allocating a single physically contiguous
table or do you support TCE trees in your implementation ?

Cheers,
Ben.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexey Kardashevskiy
On 06/05/2014 05:38 PM, Benjamin Herrenschmidt wrote:
 On Thu, 2014-06-05 at 17:25 +1000, Alexey Kardashevskiy wrote:
 +This creates a virtual TCE (translation control entry) table, which
 +is an IOMMU for PAPR-style virtual I/O.  It is used to translate
 +logical addresses used in virtual I/O into guest physical addresses,
 +and provides a scatter/gather capability for PAPR virtual I/O.
 +
 +/* for KVM_CAP_SPAPR_TCE_64 */
 +struct kvm_create_spapr_tce_64 {
 +   __u64 liobn;
 +   __u64 window_size;
 +   __u64 bus_offset;
 +   __u32 page_shift;
 +   __u32 flags;
 +};
 +
 +The liobn field gives the logical IO bus number for which to create a
 +TCE table. The window_size field specifies the size of the DMA window
 +which this TCE table will translate - the table will contain one 64
 +bit TCE entry for every IOMMU page. The bus_offset field tells where
 +this window is mapped on the IO bus. 
 
 Hrm, the bus_offset cannot be set arbitrarily, it has some pretty strong
 HW limits depending on the type of bridge & architecture version...
 
 Do you plan to have that knowledge in qemu ? Or do you have some other
 mechanism to query it ? (I might be missing a piece of the puzzle here).


Yes. QEMU will have this knowledge as it has to implement
ibm,create-pe-dma-window and return this address to the guest. There will
be a container API to receive it from powernv code via funky ppc_md callback.

There are 2 steps:
1. query + create window
2. enable in-kernel KVM acceleration for it.

Everything will work without step2 and, frankly speaking, we do not need it
too much for DDW but it does not cost much.

By having bus_offset in ioctl which is only used for step2, I reduce
dependance from powernv.


 Also one thing I've been pondering ...
 
 We'll end up wasting a ton of memory with those TCE tables. If you have
 3 PEs mapped into a guest, it will try to create 3 DDW's mapping the
 entire guest memory and so 3 TCE tables large enough for that ... and
 which will contain exactly the same entries !

This is in the plan too, do not rush :)


 We really want to look into extending PAPR to allow the creation of
 table aliases so that the guest can essentially create one table and
 associate it with multiple PEs. We might still decide to do multiple
 copies for NUMA reasons but no more than one per node for example... at
 least we can have the policy in qemu/kvm.
 
 Also, do you currently require allocating a single physically contiguous
 table or do you support TCE trees in your implementation ?


No trees yet. For 64GB window we need (64 << 30)/(16 << 20)*8 = 32K TCE table.
Do we really need trees?


-- 
Alexey
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Benjamin Herrenschmidt
On Thu, 2014-06-05 at 19:26 +1000, Alexey Kardashevskiy wrote:
 
 No trees yet. For 64GB window we need (64 << 30)/(16 << 20)*8 = 32K TCE table.
 Do we really need trees?

The above is assuming hugetlbfs backed guests. These are the least of my worry
indeed. But we need to deal with 4k and 64k guests.

Cheers,
Ben


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexander Graf


On 05.06.14 12:27, Benjamin Herrenschmidt wrote:

On Thu, 2014-06-05 at 19:26 +1000, Alexey Kardashevskiy wrote:

No trees yet. For 64GB window we need (64 << 30)/(16 << 20)*8 = 32K TCE table.
Do we really need trees?

The above is assuming hugetlbfs backed guests. These are the least of my worry
indeed. But we need to deal with 4k and 64k guests.


What if we ask user space to give us a pointer to user space allocated 
memory along with the TCE registration? We would still ask user space to 
only use the returned fd for TCE modifications, but would have some 
nicely swappable memory we can store the TCE entries in.


In fact, the code as is today can allocate an arbitrary amount of pinned 
kernel memory from within user space without any checks.



Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration

2014-06-05 Thread Alexander Graf


On 05.06.14 09:25, Alexey Kardashevskiy wrote:

This reserves 2 capability numbers.

This implements an extended version of KVM_CREATE_SPAPR_TCE_64 ioctl.

Please advise how to proceed with these patches as I suspect that
first two should go via Paolo's tree while the last one via Alex Graf's tree
(correct?).


They would just go via my tree, but only be actually allocated (read: 
mergable to qemu) when they hit Paolo's tree.


In fact, I don't think it makes sense to split them off at all.


Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

KVM: PPC: BOOK3S: PR: P8 Support

2014-06-05 Thread Aneesh Kumar K.V
This patchset adds support for emulating VTB, IC and Doorbell features in P8.
Doorbell support is dummy since we don't support SMT cores with PR-KVM.


-aneesh


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support

2014-06-05 Thread Aneesh Kumar K.V
We don't have SMT support yet, hence we should not find a doorbell
message generated

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/kvm/book3s_emulate.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 1bb16a59dcbc..d6c87d085182 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -28,7 +28,9 @@
 #define OP_19_XOP_RFI  50
 
 #define OP_31_XOP_MFMSR83
+#define OP_31_XOP_MSGSNDP  142
 #define OP_31_XOP_MTMSR146
+#define OP_31_XOP_MSGCLRP  174
 #define OP_31_XOP_MTMSRD   178
 #define OP_31_XOP_MTSR 210
 #define OP_31_XOP_MTSRIN   242
@@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 
break;
}
+   case OP_31_XOP_MSGSNDP:
+   {
+   /*
+* PR KVM still don't support SMT mode. So we should
+* not see a MSGSNDP/MSGCLRP used with PR KVM
+*/
+   pr_info(KVM: MSGSNDP used in non SMT case\n);
+   emulated = EMULATE_FAIL;
+   break;
+   }
+   case OP_31_XOP_MSGCLRP:
+   {
+   pr_info(KVM: MSGCLRP used in non SMT case\n);
+   emulated = EMULATE_FAIL;
+   break;
+   }
default:
emulated = EMULATE_FAIL;
}
-- 
1.9.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Aneesh Kumar K.V
virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/include/asm/reg.h  | 15 +++
 arch/powerpc/include/asm/time.h |  9 +
 arch/powerpc/kvm/book3s.c   |  6 ++
 arch/powerpc/kvm/book3s_emulate.c   |  3 +++
 arch/powerpc/kvm/book3s_hv.c|  6 --
 arch/powerpc/kvm/book3s_pr.c|  3 ++-
 7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4a58731a0a72..bd3caeaeebe1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -505,6 +505,7 @@ struct kvm_vcpu_arch {
 #endif
/* Time base value when we entered the guest */
u64 entry_tb;
+   u64 entry_vtb;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4852bcf270f3..3e7085d8af90 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -25,6 +25,7 @@
 #ifdef CONFIG_8xx
 #include asm/reg_8xx.h
 #endif /* CONFIG_8xx */
+#include asm/bug.h
 
 #define MSR_SF_LG  63  /* Enable 64 bit mode */
 #define MSR_ISF_LG 61  /* Interrupt 64b mode valid on 630 */
@@ -1193,6 +1194,20 @@
 : r ((unsigned long)(v)) \
 : memory)
 
+static inline unsigned long mfvtb (void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfspr(SPRN_VTB);
+#endif
+   /*
+* The above mfspr will be a no-op on anything before Power8
+* That can result in random values returned. We need to
+* capture that.
+*/
+   BUG();
+}
+
 #ifdef __powerpc64__
 #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
 #define mftb() ({unsigned long rval;   \
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1d428e6007ca..03cbada59d3a 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -102,6 +102,15 @@ static inline u64 get_rtc(void)
return (u64)hi * 10 + lo;
 }
 
+static inline u64 get_vtb(void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfvtb();
+#endif
+   return 0;
+}
+
 #ifdef CONFIG_PPC64
 static inline u64 get_tb(void)
 {
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 52c654dbd41a..ae43e4178ecd 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
val = get_reg_val(reg-id, vcpu-arch.bescr);
break;
+   case KVM_REG_PPC_VTB:
+   val = get_reg_val(reg-id, vcpu-arch.vtb);
+   break;
default:
r = -EINVAL;
break;
@@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
vcpu-arch.bescr = set_reg_val(reg-id, val);
break;
+   case KVM_REG_PPC_VTB:
+   vcpu-arch.vtb = set_reg_val(reg-id, val);
+   break;
default:
r = -EINVAL;
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu-arch.spurr;
break;
+   case SPRN_VTB:
+   *spr_val = vcpu-arch.vtb;
+   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index aba05bbb3e74..f6ac58336b3f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -897,9 +897,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 
id,
case KVM_REG_PPC_IC:
*val = get_reg_val(id, vcpu-arch.ic);
break;
-   case KVM_REG_PPC_VTB:
-   *val = get_reg_val(id, vcpu-arch.vtb);
-   break;
case KVM_REG_PPC_CSIGR:
*val = get_reg_val(id, vcpu-arch.csigr);

[PATCH 3/4] KVM: PPC: BOOK3S: PR: Emulate DPDES register

2014-06-05 Thread Aneesh Kumar K.V
Since we don't support SMT yet, we should always find zero in
Directed privileged doorbell exception state register.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/kvm/book3s_emulate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index d6c87d085182..062b5da7786e 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -655,6 +655,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
case SPRN_MMCR1:
case SPRN_MMCR2:
case SPRN_TIR:
+   case SPRN_DPDES:
 #endif
*spr_val = 0;
break;
-- 
1.9.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 4/4] KVM: PPC: BOOK3S: PR: Emulate instruction counter

2014-06-05 Thread Aneesh Kumar K.V
Writing to IC is not allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
 arch/powerpc/include/asm/kvm_host.h | 1 +
 arch/powerpc/kvm/book3s.c   | 6 ++
 arch/powerpc/kvm/book3s_emulate.c   | 3 +++
 arch/powerpc/kvm/book3s_hv.c| 6 --
 arch/powerpc/kvm/book3s_pr.c| 4 
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index bd3caeaeebe1..f9ae69682ce1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -506,6 +506,7 @@ struct kvm_vcpu_arch {
/* Time base value when we entered the guest */
u64 entry_tb;
u64 entry_vtb;
+   u64 entry_ic;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index ae43e4178ecd..52c4c43900cb 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -649,6 +649,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_VTB:
val = get_reg_val(reg-id, vcpu-arch.vtb);
break;
+   case KVM_REG_PPC_IC:
+   val = get_reg_val(reg-id, vcpu-arch.ic);
+   break;
default:
r = -EINVAL;
break;
@@ -756,6 +759,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_VTB:
vcpu-arch.vtb = set_reg_val(reg-id, val);
break;
+   case KVM_REG_PPC_IC:
+   vcpu-arch.ic = set_reg_val(reg-id, val);
+   break;
default:
r = -EINVAL;
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 062b5da7786e..e6912c618160 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -598,6 +598,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
case SPRN_VTB:
*spr_val = vcpu-arch.vtb;
break;
+   case SPRN_IC:
+   *spr_val = vcpu-arch.ic;
+   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f6ac58336b3f..c38cf9f836c0 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -894,9 +894,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 
id,
case KVM_REG_PPC_CIABR:
*val = get_reg_val(id, vcpu-arch.ciabr);
break;
-   case KVM_REG_PPC_IC:
-   *val = get_reg_val(id, vcpu-arch.ic);
-   break;
case KVM_REG_PPC_CSIGR:
*val = get_reg_val(id, vcpu-arch.csigr);
break;
@@ -1091,9 +1088,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
if ((vcpu-arch.ciabr  CIABR_PRIV) == CIABR_PRIV_HYPER)
vcpu-arch.ciabr = ~CIABR_PRIV;/* disable */
break;
-   case KVM_REG_PPC_IC:
-   vcpu-arch.ic = set_reg_val(id, *val);
-   break;
case KVM_REG_PPC_CSIGR:
vcpu-arch.csigr = set_reg_val(id, *val);
break;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 96cdf89a8c86..03fc8847cd67 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -126,6 +126,8 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu 
*svcpu,
 */
vcpu-arch.entry_tb = get_tb();
vcpu-arch.entry_vtb = get_vtb();
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   vcpu-arch.entry_ic = mfspr(SPRN_IC);
svcpu-in_use = true;
 }
 
@@ -178,6 +180,8 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
vcpu-arch.purr += get_tb() - vcpu-arch.entry_tb;
vcpu-arch.spurr += get_tb() - vcpu-arch.entry_tb;
vcpu-arch.vtb += get_vtb() - vcpu-arch.entry_vtb;
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   vcpu-arch.ic += mfspr(SPRN_IC) - vcpu-arch.entry_ic;
svcpu-in_use = false;
 
 out:
-- 
1.9.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Alexander Graf


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
  arch/powerpc/include/asm/kvm_host.h |  1 +
  arch/powerpc/include/asm/reg.h  | 15 +++
  arch/powerpc/include/asm/time.h |  9 +
  arch/powerpc/kvm/book3s.c   |  6 ++
  arch/powerpc/kvm/book3s_emulate.c   |  3 +++
  arch/powerpc/kvm/book3s_hv.c|  6 --
  arch/powerpc/kvm/book3s_pr.c|  3 ++-
  7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4a58731a0a72..bd3caeaeebe1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -505,6 +505,7 @@ struct kvm_vcpu_arch {
  #endif
/* Time base value when we entered the guest */
u64 entry_tb;
+   u64 entry_vtb;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4852bcf270f3..3e7085d8af90 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -25,6 +25,7 @@
  #ifdef CONFIG_8xx
  #include asm/reg_8xx.h
  #endif /* CONFIG_8xx */
+#include asm/bug.h
  
  #define MSR_SF_LG	63  /* Enable 64 bit mode */

  #define MSR_ISF_LG61  /* Interrupt 64b mode valid on 630 */
@@ -1193,6 +1194,20 @@
 : r ((unsigned long)(v)) \
 : memory)
  
+static inline unsigned long mfvtb (void)

+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfspr(SPRN_VTB);
+#endif
+   /*
+* The above mfspr will be a no-op on anything before Power8
+* That can result in random values returned. We need to
+* capture that.
+*/
+   BUG();
+}
+
  #ifdef __powerpc64__
  #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
  #define mftb()({unsigned long rval;   
\
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1d428e6007ca..03cbada59d3a 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -102,6 +102,15 @@ static inline u64 get_rtc(void)
return (u64)hi * 10 + lo;
  }
  
+static inline u64 get_vtb(void)

+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfvtb();
+#endif
+   return 0;
+}
+
  #ifdef CONFIG_PPC64
  static inline u64 get_tb(void)
  {
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 52c654dbd41a..ae43e4178ecd 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
val = get_reg_val(reg-id, vcpu-arch.bescr);
break;
+   case KVM_REG_PPC_VTB:
+   val = get_reg_val(reg-id, vcpu-arch.vtb);
+   break;
default:
r = -EINVAL;
break;
@@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
vcpu-arch.bescr = set_reg_val(reg-id, val);
break;
+   case KVM_REG_PPC_VTB:
+   vcpu-arch.vtb = set_reg_val(reg-id, val);
+   break;
default:
r = -EINVAL;
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu-arch.spurr;
break;
+   case SPRN_VTB:
+   *spr_val = vcpu-arch.vtb;


Doesn't this mean that vtb can be the same 2 when the guest reads it 2 
times in a row without getting preempted?



Alex


+   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index aba05bbb3e74..f6ac58336b3f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -897,9 +897,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 
id,
case KVM_REG_PPC_IC:
*val = get_reg_val(id, vcpu-arch.ic);
break;
-   

Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support

2014-06-05 Thread Alexander Graf


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

We don't have SMT support yet, hence we should not find a doorbell
message generated

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
  arch/powerpc/kvm/book3s_emulate.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 1bb16a59dcbc..d6c87d085182 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -28,7 +28,9 @@
  #define OP_19_XOP_RFI 50
  
  #define OP_31_XOP_MFMSR		83

+#define OP_31_XOP_MSGSNDP  142
  #define OP_31_XOP_MTMSR   146
+#define OP_31_XOP_MSGCLRP  174
  #define OP_31_XOP_MTMSRD  178
  #define OP_31_XOP_MTSR210
  #define OP_31_XOP_MTSRIN  242
@@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
  
  			break;

}
+   case OP_31_XOP_MSGSNDP:
+   {
+   /*
+* PR KVM still don't support SMT mode. So we should


still?


+* not see a MSGSNDP/MSGCLRP used with PR KVM
+*/
+   pr_info(KVM: MSGSNDP used in non SMT case\n);
+   emulated = EMULATE_FAIL;


What would happen on an HV guest with only 1 thread that MSGSNDs to 
thread 0? Would the guest get an illegal instruction trap, a 
self-interrupt or would this be a simple nop?



Alex


+   break;
+   }
+   case OP_31_XOP_MSGCLRP:
+   {
+   pr_info(KVM: MSGCLRP used in non SMT case\n);
+   emulated = EMULATE_FAIL;
+   break;
+   }
default:
emulated = EMULATE_FAIL;
}


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support

2014-06-05 Thread Alexander Graf


On 05.06.14 14:21, Alexander Graf wrote:


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

We don't have SMT support yet, hence we should not find a doorbell
message generated

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
  arch/powerpc/kvm/book3s_emulate.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c

index 1bb16a59dcbc..d6c87d085182 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -28,7 +28,9 @@
  #define OP_19_XOP_RFI50
#define OP_31_XOP_MFMSR83
+#define OP_31_XOP_MSGSNDP142
  #define OP_31_XOP_MTMSR146
+#define OP_31_XOP_MSGCLRP174
  #define OP_31_XOP_MTMSRD178
  #define OP_31_XOP_MTSR210
  #define OP_31_XOP_MTSRIN242
@@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run 
*run, struct kvm_vcpu *vcpu,

break;
  }
+case OP_31_XOP_MSGSNDP:
+{
+/*
+ * PR KVM still don't support SMT mode. So we should


still?


+ * not see a MSGSNDP/MSGCLRP used with PR KVM
+ */
+pr_info(KVM: MSGSNDP used in non SMT case\n);
+emulated = EMULATE_FAIL;


What would happen on an HV guest with only 1 thread that MSGSNDs to 
thread 0? Would the guest get an illegal instruction trap, a 
self-interrupt or would this be a simple nop?


What I'm trying to say here is that it's ok to treat it as illegal 
instructions, but then we don't need this patch :).



Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Benjamin Herrenschmidt
On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote:
 What if we ask user space to give us a pointer to user space allocated 
 memory along with the TCE registration? We would still ask user space to 
 only use the returned fd for TCE modifications, but would have some 
 nicely swappable memory we can store the TCE entries in.

That isn't going to work terribly well for VFIO :-) But yes, for
emulated devices, we could improve things a bit, including for
the 32-bit TCE tables.

For emulated, the real mode path could walk the page tables and fallback
to virtual mode  get_user if the page isn't present, thus operating
directly on qemu memory TCE tables instead of the current pinned stuff.

However that has a cost in performance, but since that's really only
used for emulated devices and PAPR VIOs, it might not be a huge issue.

But for VFIO we don't have much choice, we need to create something the
HW can access.

 In fact, the code as is today can allocate an arbitrary amount of pinned 
 kernel memory from within user space without any checks.

Right. We should at least account it in the locked limit.

Cheers,
Ben.


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexander Graf


On 05.06.14 14:30, Benjamin Herrenschmidt wrote:

On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote:

What if we ask user space to give us a pointer to user space allocated
memory along with the TCE registration? We would still ask user space to
only use the returned fd for TCE modifications, but would have some
nicely swappable memory we can store the TCE entries in.

That isn't going to work terribly well for VFIO :-) But yes, for
emulated devices, we could improve things a bit, including for
the 32-bit TCE tables.

For emulated, the real mode path could walk the page tables and fallback
to virtual mode & get_user if the page isn't present, thus operating
directly on qemu memory TCE tables instead of the current pinned stuff.

However that has a cost in performance, but since that's really only
used for emulated devices and PAPR VIOs, it might not be a huge issue.

But for VFIO we don't have much choice, we need to create something the
HW can access.


But we need to create separate tables for VFIO anyways, because these 
TCE tables contain virtual addresses, no?



Alex




In fact, the code as is today can allocate an arbitrary amount of pinned
kernel memory from within user space without any checks.

Right. We should at least account it in the locked limit.

Cheers,
Ben.




___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v8 0/3] EEH Support for VFIO PCI Device

2014-06-05 Thread Alexander Graf


On 05.06.14 08:36, Gavin Shan wrote:

The series of patches adds support EEH for PCI devices, which are passed
through to PowerKVM based guest via VFIO. The implementation is straightforward
based on the issues or problems we have to resolve to support EEH for PowerKVM
based guest.

- Emulation for EEH RTAS requests. All EEH RTAS requests goes to QEMU firstly.
   If QEMU can't handle it, the request will be sent to host via newly 
introduced
   VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in host kernel.

The series of patches requires corresponding QEMU changes.


Acked-by: Alexander Graf ag...@suse.de


Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexey Kardashevskiy
On 06/05/2014 10:30 PM, Benjamin Herrenschmidt wrote:
 On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote:
 What if we ask user space to give us a pointer to user space allocated 
 memory along with the TCE registration? We would still ask user space to 
 only use the returned fd for TCE modifications, but would have some 
 nicely swappable memory we can store the TCE entries in.
 
 That isn't going to work terribly well for VFIO :-) But yes, for
 emulated devices, we could improve things a bit, including for
 the 32-bit TCE tables.
 
 For emulated, the real mode path could walk the page tables and fallback
 to virtual mode & get_user if the page isn't present, thus operating
 directly on qemu memory TCE tables instead of the current pinned stuff.
 
 However that has a cost in performance, but since that's really only
 used for emulated devices and PAPR VIOs, it might not be a huge issue.
 
 But for VFIO we don't have much choice, we need to create something the
 HW can access.

You are confusing things here.

There are 2 tables:
1. guest-visible TCE table, this is what is allocated for VIO or emulated PCI;
2. real HW DMA window, one exists already for DMA32 and one I will
allocated for a huge window.

I have just #2 for VFIO now but we will need both in order to implement
H_GET_TCE correctly, and this is the table I will allocate by this new ioctl.


 In fact, the code as is today can allocate an arbitrary amount of pinned 
 kernel memory from within user space without any checks.
 
 Right. We should at least account it in the locked limit.

Yup. And (probably) this thing will keep a counter of how many windows were
created per KVM instance to avoid having multiple copies of the same table.


-- 
Alexey
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 1/2] powerpc/powernv: include asm/smp.h to handle UP config

2014-06-05 Thread Shreyas B. Prabhu
Build throws following errors when CONFIG_SMP=n
arch/powerpc/platforms/powernv/setup.c: In function 
‘pnv_kexec_wait_secondaries_down’:
arch/powerpc/platforms/powernv/setup.c:179:4: error: implicit declaration of 
function ‘get_hard_smp_processor_id’
rc = opal_query_cpu_status(get_hard_smp_processor_id(i),

The usage of get_hard_smp_processor_id() needs the declaration from
asm/smp.h. The file setup.c includes linux/sched.h, which in-turn
includes linux/smp.h. However, linux/smp.h includes asm/smp.h
only on SMP configs and hence UP builds fail.

Fix this by directly including asm/smp.h in setup.c unconditionally.

Reported-by: Geert Uytterhoeven ge...@linux-m68k.org
Reviewed-by: Srivatsa S. Bhat srivatsa.b...@linux.vnet.ibm.com
Signed-off-by: Shreyas B. Prabhu shre...@linux.vnet.ibm.com
---
 arch/powerpc/platforms/powernv/setup.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/platforms/powernv/setup.c 
b/arch/powerpc/platforms/powernv/setup.c
index 8c16a5f..678573c 100644
--- a/arch/powerpc/platforms/powernv/setup.c
+++ b/arch/powerpc/platforms/powernv/setup.c
@@ -35,6 +35,7 @@
 #include asm/rtas.h
 #include asm/opal.h
 #include asm/kexec.h
+#include asm/smp.h
 
 #include powernv.h
 
-- 
1.9.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH 2/2] powerpc/powernv : Disable subcore for UP configs

2014-06-05 Thread Shreyas B. Prabhu
Build throws following errors when CONFIG_SMP=n
arch/powerpc/platforms/powernv/subcore.c: In function ‘cpu_update_split_mode’:
arch/powerpc/platforms/powernv/subcore.c:274:15: error: ‘setup_max_cpus’ 
undeclared (first use in this function)
arch/powerpc/platforms/powernv/subcore.c:285:5: error: lvalue required as left 
operand of assignment

'setup_max_cpus' variable is relevant only on SMP, so there is no point
working around it for UP. Furthermore, subcore.c itself is relevant only
on SMP and hence the better solution is to exclude subcore.c for UP builds.

Signed-off-by: Shreyas B. Prabhu shre...@linux.vnet.ibm.com
---
This patch applies on top of ben/powerpc.git/next branch

 arch/powerpc/platforms/powernv/Makefile | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/platforms/powernv/Makefile 
b/arch/powerpc/platforms/powernv/Makefile
index 4ad0d34..636d206 100644
--- a/arch/powerpc/platforms/powernv/Makefile
+++ b/arch/powerpc/platforms/powernv/Makefile
@@ -1,9 +1,9 @@
 obj-y  += setup.o opal-takeover.o opal-wrappers.o opal.o 
opal-async.o
 obj-y  += opal-rtc.o opal-nvram.o opal-lpc.o opal-flash.o
 obj-y  += rng.o opal-elog.o opal-dump.o opal-sysparam.o 
opal-sensor.o
-obj-y  += opal-msglog.o subcore.o subcore-asm.o
+obj-y  += opal-msglog.o subcore-asm.o
 
-obj-$(CONFIG_SMP)  += smp.o
+obj-$(CONFIG_SMP)  += smp.o subcore.o
 obj-$(CONFIG_PCI)  += pci.o pci-p5ioc2.o pci-ioda.o
 obj-$(CONFIG_EEH)  += eeh-ioda.o eeh-powernv.o
 obj-$(CONFIG_PPC_SCOM) += opal-xscom.o
-- 
1.9.0

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 05.06.14 14:08, Aneesh Kumar K.V wrote:
 virtual time base register is a per VM, per cpu register that needs
 to be saved and restored on vm exit and entry. Writing to VTB is not
 allowed in the privileged mode.

 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 ---
   arch/powerpc/include/asm/kvm_host.h |  1 +
   arch/powerpc/include/asm/reg.h  | 15 +++
   arch/powerpc/include/asm/time.h |  9 +
   arch/powerpc/kvm/book3s.c   |  6 ++
   arch/powerpc/kvm/book3s_emulate.c   |  3 +++
   arch/powerpc/kvm/book3s_hv.c|  6 --
   arch/powerpc/kvm/book3s_pr.c|  3 ++-
   7 files changed, 36 insertions(+), 7 deletions(-)

 diff --git a/arch/powerpc/include/asm/kvm_host.h 
 b/arch/powerpc/include/asm/kvm_host.h
 index 4a58731a0a72..bd3caeaeebe1 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -505,6 +505,7 @@ struct kvm_vcpu_arch {
   #endif
  /* Time base value when we entered the guest */
  u64 entry_tb;
 +u64 entry_vtb;
  u32 tcr;
  ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
  u32 ivor[64];
 diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
 index 4852bcf270f3..3e7085d8af90 100644
 --- a/arch/powerpc/include/asm/reg.h
 +++ b/arch/powerpc/include/asm/reg.h
 @@ -25,6 +25,7 @@
   #ifdef CONFIG_8xx
   #include asm/reg_8xx.h
   #endif /* CONFIG_8xx */
 +#include asm/bug.h
   
   #define MSR_SF_LG  63  /* Enable 64 bit mode */
   #define MSR_ISF_LG 61  /* Interrupt 64b mode valid on 630 */
 @@ -1193,6 +1194,20 @@
   : r ((unsigned long)(v)) \
   : memory)
   
 +static inline unsigned long mfvtb (void)
 +{
 +#ifdef CONFIG_PPC_BOOK3S_64
 +if (cpu_has_feature(CPU_FTR_ARCH_207S))
 +return mfspr(SPRN_VTB);
 +#endif
 +/*
 + * The above mfspr will be a no-op on anything before Power8
 + * That can result in random values returned. We need to
 + * capture that.
 + */
 +BUG();
 +}
 +
   #ifdef __powerpc64__
   #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
   #define mftb() ({unsigned long rval;   
 \
 diff --git a/arch/powerpc/include/asm/time.h 
 b/arch/powerpc/include/asm/time.h
 index 1d428e6007ca..03cbada59d3a 100644
 --- a/arch/powerpc/include/asm/time.h
 +++ b/arch/powerpc/include/asm/time.h
 @@ -102,6 +102,15 @@ static inline u64 get_rtc(void)
  return (u64)hi * 10 + lo;
   }
   
 +static inline u64 get_vtb(void)
 +{
 +#ifdef CONFIG_PPC_BOOK3S_64
 +if (cpu_has_feature(CPU_FTR_ARCH_207S))
 +return mfvtb();
 +#endif
 +return 0;
 +}
 +
   #ifdef CONFIG_PPC64
   static inline u64 get_tb(void)
   {
 diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
 index 52c654dbd41a..ae43e4178ecd 100644
 --- a/arch/powerpc/kvm/book3s.c
 +++ b/arch/powerpc/kvm/book3s.c
 @@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
 struct kvm_one_reg *reg)
  case KVM_REG_PPC_BESCR:
  val = get_reg_val(reg-id, vcpu-arch.bescr);
  break;
 +case KVM_REG_PPC_VTB:
 +val = get_reg_val(reg-id, vcpu-arch.vtb);
 +break;
  default:
  r = -EINVAL;
  break;
 @@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
 struct kvm_one_reg *reg)
  case KVM_REG_PPC_BESCR:
  vcpu-arch.bescr = set_reg_val(reg-id, val);
  break;
 +case KVM_REG_PPC_VTB:
 +vcpu-arch.vtb = set_reg_val(reg-id, val);
 +break;
  default:
  r = -EINVAL;
  break;
 diff --git a/arch/powerpc/kvm/book3s_emulate.c 
 b/arch/powerpc/kvm/book3s_emulate.c
 index 3565e775b61b..1bb16a59dcbc 100644
 --- a/arch/powerpc/kvm/book3s_emulate.c
 +++ b/arch/powerpc/kvm/book3s_emulate.c
 @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
 int sprn, ulong *spr_val
   */
  *spr_val = vcpu-arch.spurr;
  break;
 +case SPRN_VTB:
 +*spr_val = vcpu-arch.vtb;

 Doesn't this mean that vtb can be the same when the guest reads it 2 
 times in a row without getting preempted?


But a mfspr will result in VM exit and that would make sure we
update vcpu-arch.vtb with the correct value.


-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support

2014-06-05 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 05.06.14 14:21, Alexander Graf wrote:

 On 05.06.14 14:08, Aneesh Kumar K.V wrote:
 We don't have SMT support yet, hence we should not find a doorbell
 message generated

 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 ---
   arch/powerpc/kvm/book3s_emulate.c | 18 ++
   1 file changed, 18 insertions(+)

 diff --git a/arch/powerpc/kvm/book3s_emulate.c 
 b/arch/powerpc/kvm/book3s_emulate.c
 index 1bb16a59dcbc..d6c87d085182 100644
 --- a/arch/powerpc/kvm/book3s_emulate.c
 +++ b/arch/powerpc/kvm/book3s_emulate.c
 @@ -28,7 +28,9 @@
   #define OP_19_XOP_RFI50
 #define OP_31_XOP_MFMSR83
 +#define OP_31_XOP_MSGSNDP142
   #define OP_31_XOP_MTMSR146
 +#define OP_31_XOP_MSGCLRP174
   #define OP_31_XOP_MTMSRD178
   #define OP_31_XOP_MTSR210
   #define OP_31_XOP_MTSRIN242
 @@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run 
 *run, struct kvm_vcpu *vcpu,
 break;
   }
 +case OP_31_XOP_MSGSNDP:
 +{
 +/*
 + * PR KVM still don't support SMT mode. So we should

 still?

 + * not see a MSGSNDP/MSGCLRP used with PR KVM
 + */
 +pr_info(KVM: MSGSNDP used in non SMT case\n);
 +emulated = EMULATE_FAIL;

 What would happen on an HV guest with only 1 thread that MSGSNDs to 
 thread 0? Would the guest get an illegal instruction trap, a 
 self-interrupt or would this be a simple nop?

 What I'm trying to say here is that it's ok to treat it as illegal 
 instructions, but then we don't need this patch :).


Agreed. I will verify whether it is treated as a nop. If so will send an
updated patch.

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Alexander Graf


On 05.06.14 17:50, Aneesh Kumar K.V wrote:

Alexander Graf ag...@suse.de writes:


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
---
   arch/powerpc/include/asm/kvm_host.h |  1 +
   arch/powerpc/include/asm/reg.h  | 15 +++
   arch/powerpc/include/asm/time.h |  9 +
   arch/powerpc/kvm/book3s.c   |  6 ++
   arch/powerpc/kvm/book3s_emulate.c   |  3 +++
   arch/powerpc/kvm/book3s_hv.c|  6 --
   arch/powerpc/kvm/book3s_pr.c|  3 ++-
   7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4a58731a0a72..bd3caeaeebe1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -505,6 +505,7 @@ struct kvm_vcpu_arch {
   #endif
/* Time base value when we entered the guest */
u64 entry_tb;
+   u64 entry_vtb;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4852bcf270f3..3e7085d8af90 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -25,6 +25,7 @@
   #ifdef CONFIG_8xx
   #include asm/reg_8xx.h
   #endif /* CONFIG_8xx */
+#include asm/bug.h
   
   #define MSR_SF_LG	63  /* Enable 64 bit mode */

   #define MSR_ISF_LG   61  /* Interrupt 64b mode valid on 630 */
@@ -1193,6 +1194,20 @@
 : r ((unsigned long)(v)) \
 : memory)
   
+static inline unsigned long mfvtb (void)

+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfspr(SPRN_VTB);
+#endif
+   /*
+* The above mfspr will be a no-op on anything before Power8
+* That can result in random values returned. We need to
+* capture that.
+*/
+   BUG();
+}
+
   #ifdef __powerpc64__
   #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
   #define mftb()   ({unsigned long rval;   
\
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1d428e6007ca..03cbada59d3a 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -102,6 +102,15 @@ static inline u64 get_rtc(void)
return (u64)hi * 10 + lo;
   }
   
+static inline u64 get_vtb(void)

+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfvtb();
+#endif
+   return 0;
+}
+
   #ifdef CONFIG_PPC64
   static inline u64 get_tb(void)
   {
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 52c654dbd41a..ae43e4178ecd 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
val = get_reg_val(reg-id, vcpu-arch.bescr);
break;
+   case KVM_REG_PPC_VTB:
+   val = get_reg_val(reg-id, vcpu-arch.vtb);
+   break;
default:
r = -EINVAL;
break;
@@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
vcpu-arch.bescr = set_reg_val(reg-id, val);
break;
+   case KVM_REG_PPC_VTB:
+   vcpu-arch.vtb = set_reg_val(reg-id, val);
+   break;
default:
r = -EINVAL;
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu-arch.spurr;
break;
+   case SPRN_VTB:
+   *spr_val = vcpu-arch.vtb;

Doesn't this mean that vtb can be the same when the guest reads it 2
times in a row without getting preempted?


But a mfspr will result in VM exit and that would make sure we
update vcpu-arch.vtb with the correct value.


We only call kvmppc_core_vcpu_put_pr() when we context switch away from 
KVM, so it won't be updated, no?



Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH 3/3] PPC, KVM, CMA: use general CMA reserved area management framework

2014-06-05 Thread Aneesh Kumar K.V
Paolo Bonzini pbonz...@redhat.com writes:

 Il 03/06/2014 09:02, Michal Nazarewicz ha scritto:
 On Tue, Jun 03 2014, Joonsoo Kim wrote:
 Now, we have general CMA reserved area management framework,
 so use it for future maintainability. There is no functional change.

 Signed-off-by: Joonsoo Kim iamjoonsoo@lge.com

 Acked-by: Michal Nazarewicz min...@mina86.com


 Acked-by: Paolo Bonzini pbonz...@redhat.com

 Aneesh, can you test this series?

Sorry for the late reply. I will test this and update here.

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-06-05 Thread Alex Williamson
On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote:
 The patch adds new IOCTL commands for sPAPR VFIO container device
 to support EEH functionality for PCI devices, which have been passed
 through from host to somebody else via VFIO.
 
 Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
 ---
  Documentation/vfio.txt  | 87 ++--
  drivers/vfio/Makefile   |  1 +
  drivers/vfio/pci/vfio_pci.c | 20 ++---
  drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++-
  drivers/vfio/vfio_spapr_eeh.c   | 89 
 +
  include/linux/vfio.h| 23 ++
  include/uapi/linux/vfio.h   | 35 +++
  7 files changed, 262 insertions(+), 10 deletions(-)
  create mode 100644 drivers/vfio/vfio_spapr_eeh.c
 
 diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
 index b9ca023..3fa4538 100644
 --- a/Documentation/vfio.txt
 +++ b/Documentation/vfio.txt
 @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
 real mode which provides
  an excellent performance which has limitations such as inability to do
  locked pages accounting in real time.
  
 -So 3 additional ioctls have been added:
 +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
 +subtree that can be treated as a unit for the purposes of partitioning and
 +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
 +function of a multi-function IOA, or multiple IOAs (possibly including switch
 +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
 errors
 +and recover from them via EEH RTAS services, which works on the basis of
 +additional ioctl commands.
 +
 +So 4 additional ioctls have been added:
  
   VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
   of the DMA window on the PCI bus.
 @@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
  
   VFIO_IOMMU_DISABLE - disables the container.
  
 + VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
 recovery.
  
  The code flow from the example above should be slightly changed:
  
 + struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
 +
   .
   /* Add the group to the container */
   ioctl(group, VFIO_GROUP_SET_CONTAINER, container);
 @@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
 changed:
   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
  
   /* Check here is .iova/.size are within DMA window from 
 spapr_iommu_info */
 -
   ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map);
 - .
 +
 + /* Get a file descriptor for the device */
 + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0);
 +
 + 
 +
 + /* Gratuitous device reset and go... */
 + ioctl(device, VFIO_DEVICE_RESET);
 +
 + /* Make sure EEH is supported */
 + ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
 +
 + /* Enable the EEH functionality on the device */
 + pe_op.op = VFIO_EEH_PE_ENABLE;
 + ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 + /* You're suggested to create additional data struct to represent
 +  * PE, and put child devices belonging to same IOMMU group to the
 +  * PE instance for later reference.
 +  */
 +
 + /* Check the PE's state and make sure it's in functional state */
 + pe_op.op = VFIO_EEH_PE_GET_STATE;
 + ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 + /* Save device state using pci_save_state().
 +  * EEH should be enabled on the specified device.
 +  */
 +
 + 
 +
 + /* When 0xFF's returned from reading PCI config space or IO BARs
 +  * of the PCI device. Check the PE's state to see if that has been
 +  * frozen.
 +  */
 + ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 + /* Waiting for pending PCI transactions to be completed and don't
 +  * produce any more PCI traffic from/to the affected PE until
 +  * recovery is finished.
 +  */
 +
 + /* Enable IO for the affected PE and collect logs. Usually, the
 +  * standard part of PCI config space, AER registers are dumped
 +  * as logs for further analysis.
 +  */
 + pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
 + ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 + /*
 +  * Issue PE reset: hot or fundamental reset. Usually, hot reset
 +  * is enough. However, the firmware of some PCI adapters would
 +  * require fundamental reset.
 +  */
 + pe_op.op = VFIO_EEH_PE_RESET_HOT;
 + ioctl(container, VFIO_EEH_PE_OP, pe_op);
 + pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
 + ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 + /* Configure the PCI bridges for the affected PE */
 + pe_op.op = VFIO_EEH_PE_CONFIGURE;
 + ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 + /* Restored state we saved at initialization time. pci_restore_state()
 + 

Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Aneesh Kumar K.V
Alexander Graf ag...@suse.de writes:

 On 05.06.14 17:50, Aneesh Kumar K.V wrote:
 Alexander Graf ag...@suse.de writes:

 On 05.06.14 14:08, Aneesh Kumar K.V wrote:
 virtual time base register is a per VM, per cpu register that needs
 to be saved and restored on vm exit and entry. Writing to VTB is not
 allowed in the privileged mode.

 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

...

break;
 diff --git a/arch/powerpc/kvm/book3s_emulate.c 
 b/arch/powerpc/kvm/book3s_emulate.c
 index 3565e775b61b..1bb16a59dcbc 100644
 --- a/arch/powerpc/kvm/book3s_emulate.c
 +++ b/arch/powerpc/kvm/book3s_emulate.c
 @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu 
 *vcpu, int sprn, ulong *spr_val
 */
*spr_val = vcpu-arch.spurr;
break;
 +  case SPRN_VTB:
 +  *spr_val = vcpu-arch.vtb;
  Doesn't this mean that vtb can be the same when the guest reads it 2
  times in a row without getting preempted?

 But a mfspr will result in VM exit and that would make sure we
 update vcpu-arch.vtb with the correct value.

 We only call kvmppc_core_vcpu_put_pr() when we context switch away from 
 KVM, so it won't be updated, no?



kvmppc_copy_from_svcpu is also called from VM exit path (book3s_interrupt.S)

-aneesh

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [RFC PATCH 1/3] CMA: generalize CMA reserved area management functionality

2014-06-05 Thread Aneesh Kumar K.V
Joonsoo Kim iamjoonsoo@lge.com writes:

 Currently, there are two users on CMA functionality, one is the DMA
 subsystem and the other is the kvm on powerpc. They have their own code
 to manage CMA reserved area even if they looks really similar.
 From my guess, it is caused by some needs on bitmap management. Kvm side
 wants to maintain bitmap not for 1 page, but for more size. Eventually it
 use bitmap where one bit represents 64 pages.

 When I implement CMA related patches, I should change those two places
 to apply my change and it seem to be painful to me. I want to change
 this situation and reduce future code management overhead through
 this patch.

 This change could also help developer who want to use CMA in their
 new feature development, since they can use CMA easily without
 copying & pasting this reserved area management code.

 Signed-off-by: Joonsoo Kim iamjoonsoo@lge.com


The way patches are split makes it difficult to review. Will it be
possible to make changes against one implementation and them move that
to generic code. That helps in finding out what exactly changed.


 diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
 index 00e13ce..b3fe1cc 100644
 --- a/drivers/base/Kconfig
 +++ b/drivers/base/Kconfig
 @@ -283,7 +283,7 @@ config CMA_ALIGNMENT

 If unsure, leave the default value 8.

 -config CMA_AREAS
 +config DMA_CMA_AREAS
   int Maximum count of the CMA device-private areas
   default 7
   help

for ex: The above can be a seperate patch along with further DMA related
cleanups . 


 diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
 index 83969f8..48cdac8 100644
 --- a/drivers/base/dma-contiguous.c
 +++ b/drivers/base/dma-contiguous.c
 @@ -186,7 +186,7 @@ static int __init cma_activate_area(struct cma *cma)
   return 0;
  }

 -static struct cma cma_areas[MAX_CMA_AREAS];
 +static struct cma cma_areas[MAX_DMA_CMA_AREAS];
  static unsigned cma_area_count;

  static int __init cma_init_reserved_areas(void)
 diff --git a/include/linux/cma.h b/include/linux/cma.h
 new file mode 100644
 index 000..60ba06f
 --- /dev/null
 +++ b/include/linux/cma.h
 @@ -0,0 +1,28 @@
 +/*
 + * Contiguous Memory Allocator
 + *
 + * Copyright LG Electronics Inc., 2014
 + * Written by:
 + *   Joonsoo Kim iamjoonsoo@lge.com
 + *
 + * This program is free software; you can redistribute it and/or
 + * modify it under the terms of the GNU General Public License as
 + * published by the Free Software Foundation; either version 2 of the
 + * License or (at your optional) any later version of the license.
 + *
 + */
 +
 +#ifndef __CMA_H__
 +#define __CMA_H__
 +
 +struct cma;
 +
 +extern struct page *cma_alloc(struct cma *cma, unsigned long count,
 + unsigned long align);
 +extern bool cma_release(struct cma *cma, struct page *pages,
 + unsigned long count);
 +extern int __init cma_declare_contiguous(phys_addr_t size, phys_addr_t base,
 + phys_addr_t limit, phys_addr_t alignment,
 + unsigned long bitmap_shift, bool fixed,
 + struct cma **res_cma);
 +#endif
 diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
 index 772eab5..dfb1dc9 100644
 --- a/include/linux/dma-contiguous.h
 +++ b/include/linux/dma-contiguous.h
 @@ -63,7 +63,7 @@ struct device;
   * There is always at least global CMA area and a few optional device
   * private areas configured in kernel .config.
   */
 -#define MAX_CMA_AREAS(1 + CONFIG_CMA_AREAS)
 +#define MAX_DMA_CMA_AREAS  (1 + CONFIG_DMA_CMA_AREAS)

  extern struct cma *dma_contiguous_default_area;

 @@ -123,7 +123,7 @@ bool dma_release_from_contiguous(struct device *dev, 
 struct page *pages,

  #else

 -#define MAX_CMA_AREAS(0)
 +#define MAX_DMA_CMA_AREAS(0)

  static inline struct cma *dev_get_cma_area(struct device *dev)
  {
 diff --git a/mm/Kconfig b/mm/Kconfig
 index 7511b4a..0877ddc 100644
 --- a/mm/Kconfig
 +++ b/mm/Kconfig
 @@ -515,6 +515,17 @@ config CMA_DEBUG
 processing calls such as dma_alloc_from_contiguous().
 This option does not affect warning and error messages.

 +config CMA_AREAS
 + int Maximum count of the CMA areas
 + depends on CMA
 + default 7
 + help
 +   CMA allows to create CMA areas for particular purpose, mainly,
 +   used as device private area. This parameter sets the maximum
 +   number of CMA area in the system.
 +
 +   If unsure, leave the default value 7.
 +

If we have 7 declare contiguous requests, a kvm cma allocation request will 
fail ?

  config ZBUD
   tristate
   default n
 diff --git a/mm/Makefile b/mm/Makefile
 index 1eaa70b..bc0422b 100644
 --- a/mm/Makefile
 +++ b/mm/Makefile
 @@ -62,3 +62,4 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
  obj-$(CONFIG_ZBUD)   += zbud.o
  obj-$(CONFIG_ZSMALLOC)   += zsmalloc.o
  

Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Alexander Graf


On 05.06.14 19:33, Aneesh Kumar K.V wrote:

Alexander Graf ag...@suse.de writes:


On 05.06.14 17:50, Aneesh Kumar K.V wrote:

Alexander Graf ag...@suse.de writes:


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

...


break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu-arch.spurr;
break;
+   case SPRN_VTB:
+   *spr_val = vcpu-arch.vtb;

Doesn't this mean that vtb can be the same when the guest reads it 2
times in a row without getting preempted?

But a mfspr will result in VM exit and that would make sure we
update vcpu-arch.vtb with the correct value.

We only call kvmppc_core_vcpu_put_pr() when we context switch away from
KVM, so it won't be updated, no?



kvmppc_copy_from_svcpu is also called from VM exit path (book3s_interrupt.S)


... where it will run into this code path:

/*
 * Maybe we were already preempted and synced the svcpu from
 * our preempt notifiers. Don't bother touching this svcpu then.
 */
if (!svcpu-in_use)
goto out;


Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Alexander Graf


On 06.06.14 00:32, Alexander Graf wrote:


On 05.06.14 19:33, Aneesh Kumar K.V wrote:

Alexander Graf ag...@suse.de writes:


On 05.06.14 17:50, Aneesh Kumar K.V wrote:

Alexander Graf ag...@suse.de writes:


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

...


break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c

index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct 
kvm_vcpu *vcpu, int sprn, ulong *spr_val

 */
*spr_val = vcpu-arch.spurr;
break;
+case SPRN_VTB:
+*spr_val = vcpu-arch.vtb;
Doesn't this mean that vtb can be the same when the guest reads it 2
times in a row without getting preempted?

But a mfspr will result in VM exit and that would make sure we
update vcpu-arch.vtb with the correct value.

We only call kvmppc_core_vcpu_put_pr() when we context switch away from
KVM, so it won't be updated, no?


kvmppc_copy_from_svcpu is also called from VM exit path 
(book3s_interrupt.S)


... where it will run into this code path:

/*
 * Maybe we were already preempted and synced the svcpu from
 * our preempt notifiers. Don't bother touching this svcpu then.
 */
if (!svcpu-in_use)
goto out;


Scratch that. We're always calling this on entry/exit, so you're right.


Alex

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration

2014-06-05 Thread Alexey Kardashevskiy
On 06/05/2014 09:57 PM, Alexander Graf wrote:
 
 On 05.06.14 09:25, Alexey Kardashevskiy wrote:
 This reserves 2 capability numbers.

 This implements an extended version of KVM_CREATE_SPAPR_TCE_64 ioctl.

 Please advise how to proceed with these patches as I suspect that
 first two should go via Paolo's tree while the last one via Alex Graf's tree
 (correct?).
 
 They would just go via my tree, but only be actually allocated (read:
 mergable to qemu) when they hit Paolo's tree.
 
 In fact, I don't think it makes sense to split them off at all.


So? Are these patches going anywhere? Thanks.


-- 
Alexey
___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
On Thu, Jun 05, 2014 at 11:18:34AM -0600, Alex Williamson wrote:
On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote:
 The patch adds new IOCTL commands for sPAPR VFIO container device
 to support EEH functionality for PCI devices, which have been passed
 through from host to somebody else via VFIO.
 
 Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
 ---
  Documentation/vfio.txt  | 87 
 ++--
  drivers/vfio/Makefile   |  1 +
  drivers/vfio/pci/vfio_pci.c | 20 ++---
  drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++-
  drivers/vfio/vfio_spapr_eeh.c   | 89 
 +
  include/linux/vfio.h| 23 ++
  include/uapi/linux/vfio.h   | 35 +++
  7 files changed, 262 insertions(+), 10 deletions(-)
  create mode 100644 drivers/vfio/vfio_spapr_eeh.c
 
 diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
 index b9ca023..3fa4538 100644
 --- a/Documentation/vfio.txt
 +++ b/Documentation/vfio.txt
 @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
 real mode which provides
  an excellent performance which has limitations such as inability to do
  locked pages accounting in real time.
  
 -So 3 additional ioctls have been added:
 +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
 +subtree that can be treated as a unit for the purposes of partitioning and
 +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
 +function of a multi-function IOA, or multiple IOAs (possibly including 
 switch
 +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
 errors
 +and recover from them via EEH RTAS services, which works on the basis of
 +additional ioctl commands.
 +
 +So 4 additional ioctls have been added:
  
  VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
  of the DMA window on the PCI bus.
 @@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
  
  VFIO_IOMMU_DISABLE - disables the container.
  
 +VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
 recovery.
  
  The code flow from the example above should be slightly changed:
  
 +struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
 +
  .
  /* Add the group to the container */
  ioctl(group, VFIO_GROUP_SET_CONTAINER, container);
 @@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
 changed:
  dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
  
  /* Check here is .iova/.size are within DMA window from 
 spapr_iommu_info */
 -
  ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map);
 -.
 +
 +/* Get a file descriptor for the device */
 +device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, :06:0d.0);
 +
 +
 +
 +/* Gratuitous device reset and go... */
 +ioctl(device, VFIO_DEVICE_RESET);
 +
 +/* Make sure EEH is supported */
 +ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
 +
 +/* Enable the EEH functionality on the device */
 +pe_op.op = VFIO_EEH_PE_ENABLE;
 +ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 +/* You're suggested to create additional data struct to represent
 + * PE, and put child devices belonging to same IOMMU group to the
 + * PE instance for later reference.
 + */
 +
 +/* Check the PE's state and make sure it's in functional state */
 +pe_op.op = VFIO_EEH_PE_GET_STATE;
 +ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 +/* Save device state using pci_save_state().
 + * EEH should be enabled on the specified device.
 + */
 +
 +
 +
 +/* When 0xFF's returned from reading PCI config space or IO BARs
 + * of the PCI device. Check the PE's state to see if that has been
 + * frozen.
 + */
 +ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 +/* Waiting for pending PCI transactions to be completed and don't
 + * produce any more PCI traffic from/to the affected PE until
 + * recovery is finished.
 + */
 +
 +/* Enable IO for the affected PE and collect logs. Usually, the
 + * standard part of PCI config space, AER registers are dumped
 + * as logs for further analysis.
 + */
 +pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
 +ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 +/*
 + * Issue PE reset: hot or fundamental reset. Usually, hot reset
 + * is enough. However, the firmware of some PCI adapters would
 + * require fundamental reset.
 + */
 +pe_op.op = VFIO_EEH_PE_RESET_HOT;
 +ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
 +ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 +/* Configure the PCI bridges for the affected PE */
 +pe_op.op = VFIO_EEH_PE_CONFIGURE;
 +ioctl(container, VFIO_EEH_PE_OP, pe_op);
 +
 +/* Restored state we saved at initialization time. 

Re: [PATCH v8 0/3] EEH Support for VFIO PCI Device

2014-06-05 Thread Gavin Shan
On Thu, Jun 05, 2014 at 02:54:47PM +0200, Alexander Graf wrote:

On 05.06.14 08:36, Gavin Shan wrote:
The series of patches adds support EEH for PCI devices, which are passed
through to PowerKVM based guest via VFIO. The implementation is 
straightforward
based on the issues or problems we have to resolve to support EEH for PowerKVM
based guest.

- Emulation for EEH RTAS requests. All EEH RTAS requests goes to QEMU firstly.
   If QEMU can't handle it, the request will be sent to host via newly 
 introduced
   VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in host kernel.

The series of patches requires corresponding QEMU changes.

Acked-by: Alexander Graf ag...@suse.de


Thanks, Alex :)


Alex


___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v4 1/2]: Allow architectures to skip a callchain entry

2014-06-05 Thread Sukadev Bhattiprolu

The kernel code in Powerpc conservatively saves excess information in
the callchain. While most entries are often needed, under some specific
conditions, some of the entries are redundant and cause duplicate arcs
in the call-graph.

Eg: the value in the link register (LR) is needed only when it holds
the return address of a function. At other times it must be ignored.

In the next commit, we will use the application's DWARF debug information
to identify and skip over the redundant entries.

To minimize performance impact on other architectures, define and use two
following static inline interfaces:

arch_skip_callchain_idx()
next_callchain_ip()

Reported-by: Maynard Johnson mayn...@us.ibm.com
Tested-by: Maynard Johnson mayn...@us.ibm.com
Signed-off-by: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
---
Changelog[v4]
Move Powerpc-specific code to separate patch
[Jiri Olsa] Minimize performance impact to other architectures

 include/uapi/linux/perf_event.h   |2 ++
 tools/perf/arch/powerpc/Makefile  |1 +
 tools/perf/arch/powerpc/util/skip-callchain-idx.c |   25 ++
 tools/perf/config/Makefile|4 +++
 tools/perf/util/callchain.h   |   37 +
 tools/perf/util/machine.c |   11 +++---
 6 files changed, 76 insertions(+), 4 deletions(-)
 create mode 100644 tools/perf/arch/powerpc/util/skip-callchain-idx.c

diff --git a/include/uapi/linux/perf_event.h b/include/uapi/linux/perf_event.h
index e3fc8f0..b671abf 100644
--- a/include/uapi/linux/perf_event.h
+++ b/include/uapi/linux/perf_event.h
@@ -719,6 +719,8 @@ enum perf_callchain_context {
PERF_CONTEXT_GUEST_KERNEL   = (__u64)-2176,
PERF_CONTEXT_GUEST_USER = (__u64)-2560,
 
+   PERF_CONTEXT_IGNORE = (__u64)-3840,
+
PERF_CONTEXT_MAX= (__u64)-4095,
 };
 
diff --git a/tools/perf/arch/powerpc/Makefile b/tools/perf/arch/powerpc/Makefile
index 744e629..b92219b 100644
--- a/tools/perf/arch/powerpc/Makefile
+++ b/tools/perf/arch/powerpc/Makefile
@@ -3,3 +3,4 @@ PERF_HAVE_DWARF_REGS := 1
 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/dwarf-regs.o
 endif
 LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/header.o
+LIB_OBJS += $(OUTPUT)arch/$(ARCH)/util/skip-callchain-idx.o
diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c 
b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
new file mode 100644
index 000..7350c36
--- /dev/null
+++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
@@ -0,0 +1,25 @@
+/*
+ * Use DWARF Debug information to skip unnecessary callchain entries.
+ *
+ * Copyright (C) 2014 Sukadev Bhattiprolu, IBM Corporation.
+ * Copyright (C) 2014 Ulrich Weigand, IBM Corporation.
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version
+ * 2 of the License, or (at your option) any later version.
+ */
+#include inttypes.h
+#include dwarf.h
+#include elfutils/libdwfl.h
+
+#include util/thread.h
+#include util/callchain.h
+
+/* Stub for now */
+int arch_skip_callchain_idx(struct machine *machine __maybe_unused,
+   struct thread *thread __maybe_unused,
+   struct ip_callchain *chain __maybe_unused)
+{
+   return -1;
+}
diff --git a/tools/perf/config/Makefile b/tools/perf/config/Makefile
index 729bbdf..8d1417d 100644
--- a/tools/perf/config/Makefile
+++ b/tools/perf/config/Makefile
@@ -48,6 +48,10 @@ ifneq ($(ARCH),$(filter $(ARCH),x86 arm))
   NO_LIBDW_DWARF_UNWIND := 1
 endif
 
+ifeq ($(ARCH),powerpc)
+  CFLAGS += -DHAVE_SKIP_CALLCHAIN_IDX
+endif
+
 ifeq ($(LIBUNWIND_LIBS),)
   NO_LIBUNWIND := 1
 else
diff --git a/tools/perf/util/callchain.h b/tools/perf/util/callchain.h
index 8f84423..57d3d33 100644
--- a/tools/perf/util/callchain.h
+++ b/tools/perf/util/callchain.h
@@ -176,4 +176,41 @@ static inline void callchain_cursor_snapshot(struct 
callchain_cursor *dest,
	dest->first = src->curr;
	dest->nr -= src->pos;
 }
+
+/*
+ * Some architectures (eg: Powerpc), check DWARF debug information
+ * and skip a specific callchain entry in the @chain-ips[] list.
+ *
+ * Return index of the entry to skip or -1 to not skip any entry.
+ */
+#ifdef HAVE_SKIP_CALLCHAIN_IDX
+extern int
+arch_skip_callchain_idx(struct machine *machine __maybe_unused,
+   struct thread *thread __maybe_unused,
+   struct ip_callchain *chain __maybe_unused);
+#else
+static inline int
+arch_skip_callchain_idx(struct machine *machine __maybe_unused,
+   struct thread *thread __maybe_unused,
+   struct ip_callchain *chain __maybe_unused)
+{
+   return -1;
+}
+#endif
+
+static inline u64
+next_callchain_ip(struct ip_callchain *chain,
+   enum chain_order order,
+ 

[PATCH v4 2/2]: powerpc/perf: Adjust callchain based on DWARF debug info

2014-06-05 Thread Sukadev Bhattiprolu

Replace the arch_skip_callchain_idx() stub in Powerpc with code that
checks the DWARF debug information and identifies the callchain entry
to skip.

Callgraph before the patch:

14.67%  2234  sprintft  libc-2.18.so   [.] __random
|
--- __random
   |
   |--61.12%-- __random
   |  |
   |  |--97.15%-- rand
   |  |  do_my_sprintf
   |  |  main
   |  |  generic_start_main.isra.0
   |  |  __libc_start_main
   |  |  0x0
   |  |
   |   --2.85%-- do_my_sprintf
   | main
   | generic_start_main.isra.0
   | __libc_start_main
   | 0x0
   |
--38.88%-- rand
  |
  |--94.01%-- rand
  |  do_my_sprintf
  |  main
  |  generic_start_main.isra.0
  |  __libc_start_main
  |  0x0
  |
   --5.99%-- do_my_sprintf
 main
 generic_start_main.isra.0
 __libc_start_main
 0x0

Callgraph after the patch:

14.67%  2234  sprintft  libc-2.18.so   [.] __random
|
--- __random
   |
   |--95.93%-- rand
   |  do_my_sprintf
   |  main
   |  generic_start_main.isra.0
   |  __libc_start_main
   |  0x0
   |
--4.07%-- do_my_sprintf
  main
  generic_start_main.isra.0
  __libc_start_main
  0x0

TODO:   For split-debug info objects like glibc, we can determine
	the call-frame-address only when both .eh_frame and .debug_info
	sections are available. We should be able to determine the CFA
	even without the .eh_frame section.

Fix suggested by Anton Blanchard.

Thanks to valuable input on DWARF debug information from Ulrich Weigand.

Reported-by: Maynard Johnson mayn...@us.ibm.com
Tested-by: Maynard Johnson mayn...@us.ibm.com
Signed-off-by: Sukadev Bhattiprolu suka...@linux.vnet.ibm.com
---
Changelog[v4]
Move Powerpc-specific code into a separate patch

Changelog[v3]
[Jiri Olsa] Rename function to arch_skip_callchain_idx() to be
consistent with behavior.
[Jiri Olsa] Add '__maybe_unused' tags for unused parameters.

Changelog[v2]:
Add missing dwfl_end()
Fix merge conflicts due to some unwind code

 tools/perf/arch/powerpc/util/skip-callchain-idx.c |  251 -
 1 file changed, 246 insertions(+), 5 deletions(-)

diff --git a/tools/perf/arch/powerpc/util/skip-callchain-idx.c 
b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
index 7350c36..a7c23a4 100644
--- a/tools/perf/arch/powerpc/util/skip-callchain-idx.c
+++ b/tools/perf/arch/powerpc/util/skip-callchain-idx.c
@@ -16,10 +16,251 @@
 #include util/thread.h
 #include util/callchain.h
 
-/* Stub for now */
-int arch_skip_callchain_idx(struct machine *machine __maybe_unused,
-   struct thread *thread __maybe_unused,
-   struct ip_callchain *chain __maybe_unused)
+/*
+ * When saving the callchain on Power, the kernel conservatively saves
+ * excess entries in the callchain. A few of these entries are needed
+ * in some cases but not others. If the unnecessary entries are not
+ * ignored, we end up with duplicate arcs in the call-graphs. Use
+ * DWARF debug information to skip over any unnecessary callchain
+ * entries.
+ *
+ * See function header for arch_adjust_callchain() below for more details.
+ *
+ * The libdwfl code in this file is based on code from elfutils
+ * (libdwfl/argp-std.c, libdwfl/tests/addrcfi.c, etc).
+ */
+static char *debuginfo_path;
+
+static const Dwfl_Callbacks offline_callbacks = {
+   .debuginfo_path = debuginfo_path,
+   .find_debuginfo = dwfl_standard_find_debuginfo,
+   .section_address = dwfl_offline_section_address,
+};
+
+
+/*
+ * Use the DWARF expression for the Call-frame-address and determine
+ * if return address is in LR and if a new frame was allocated.
+ */
+static int check_return_reg(int ra_regno, Dwarf_Frame *frame)
+{
+   Dwarf_Op ops_mem[2];
+   Dwarf_Op dummy;
+   Dwarf_Op *ops = dummy;
+   size_t nops;
+   int result;
+
+   result = dwarf_frame_register(frame, ra_regno, ops_mem, 

[PATCH] powerpc: Don't setup CPUs with bad status

2014-06-05 Thread Michael Neuling
OPAL will mark a CPU that is guarded as bad in the status property of the CPU
node.

Unfortunately Linux doesn't check this property and will put the bad CPU in the
present map.  This has caused hangs on booting when we try to unsplit the core.

This patch checks the CPU is available via this status property before putting
it in the present map.

Signed-off-by: Michael Neuling mi...@neuling.org
Tested-by: Anton Blanchard an...@samba.org
cc: sta...@vger.kernel.org
---
 arch/powerpc/kernel/setup-common.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kernel/setup-common.c 
b/arch/powerpc/kernel/setup-common.c
index d4d4183..e239df3 100644
--- a/arch/powerpc/kernel/setup-common.c
+++ b/arch/powerpc/kernel/setup-common.c
@@ -471,7 +471,7 @@ void __init smp_setup_cpu_maps(void)
		for (j = 0; j < nthreads && cpu < nr_cpu_ids; j++) {
			DBG("    thread %d -> cpu %d (hard id %d)\n",
			    j, cpu, be32_to_cpu(intserv[j]));
-   set_cpu_present(cpu, true);
+   set_cpu_present(cpu, of_device_is_available(dn));
set_hard_smp_processor_id(cpu, be32_to_cpu(intserv[j]));
set_cpu_possible(cpu, true);
cpu++;
-- 
1.9.1

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v9 1/3] powerpc/eeh: Avoid event on passed PE

2014-06-05 Thread Gavin Shan
We must not handle EEH error on devices which are passed to somebody
else. Instead, we expect that the frozen device owner detects an EEH
error and recovers from it.

This avoids EEH error handling on passed through devices so the device
owner gets a chance to handle them.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
Acked-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/include/asm/eeh.h| 7 +++
 arch/powerpc/kernel/eeh.c | 8 
 arch/powerpc/platforms/powernv/eeh-ioda.c | 3 ++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7782056..653d981 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -25,6 +25,7 @@
 #include linux/list.h
 #include linux/string.h
 #include linux/time.h
+#include linux/atomic.h
 
 struct pci_dev;
 struct pci_bus;
@@ -84,6 +85,7 @@ struct eeh_pe {
int freeze_count;   /* Times of froze up*/
struct timeval tstamp;  /* Time on first-time freeze*/
int false_positives;/* Times of reported #ff's  */
+   atomic_t pass_dev_cnt;  /* Count of passed through devs */
struct eeh_pe *parent;  /* Parent PE*/
struct list_head child_list;/* Link PE to the child list*/
struct list_head edevs; /* Link list of EEH devices */
@@ -93,6 +95,11 @@ struct eeh_pe {
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
	list_for_each_entry_safe(edev, tmp, &pe->edevs, list)
 
+static inline bool eeh_pe_passed(struct eeh_pe *pe)
+{
+	return pe ? !!atomic_read(&pe->pass_dev_cnt) : false;
+}
+
 /*
  * The struct is used to trace EEH state for the associated
  * PCI device node or PCI device. In future, it might
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..3bc8b12 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -400,6 +400,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
	if (ret < 0)
return ret;
 
+   /*
+* If the PE isn't owned by us, we shouldn't check the
+* state. Instead, let the owner handle it if the PE has
+* been frozen.
+*/
+   if (eeh_pe_passed(pe))
+   return 0;
+
/* If we already have a pending isolation event for this
 * slot, we know it's bad already, we don't need to check.
 * Do this checking under a lock; as multiple PCI devices
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
b/arch/powerpc/platforms/powernv/eeh-ioda.c
index cab3e62..79193eb 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -892,7 +892,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
opal_pci_eeh_freeze_clear(phb-opal_id, 
frozen_pe_no,
OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
ret = EEH_NEXT_ERR_NONE;
-	} else if ((*pe)->state & EEH_PE_ISOLATED) {
+	} else if ((*pe)->state & EEH_PE_ISOLATED ||
+		   eeh_pe_passed(*pe)) {
ret = EEH_NEXT_ERR_NONE;
} else {
pr_err(EEH: Frozen PHB#%x-PE#%x (%s) 
detected\n,
-- 
1.8.3.2

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

[PATCH v9 2/3] powerpc/eeh: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
The patch exports functions to be used by the new VFIO ioctl command,
which will be introduced in a subsequent patch, to support EEH
functionality for VFIO PCI devices.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
Acked-by: Alexander Graf ag...@suse.de
---
 arch/powerpc/include/asm/eeh.h |  12 ++
 arch/powerpc/kernel/eeh.c  | 268 +
 2 files changed, 280 insertions(+)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 653d981..b733044 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -173,6 +173,11 @@ enum {
 #define EEH_STATE_DMA_ACTIVE	(1 << 4)	/* Active DMA	*/
 #define EEH_STATE_MMIO_ENABLED	(1 << 5)	/* MMIO enabled	*/
 #define EEH_STATE_DMA_ENABLED	(1 << 6)	/* DMA enabled	*/
+#define EEH_PE_STATE_NORMAL0   /* Normal state */
+#define EEH_PE_STATE_RESET 1   /* PE reset asserted*/
+#define EEH_PE_STATE_STOPPED_IO_DMA2   /* Frozen PE*/
+#define EEH_PE_STATE_STOPPED_DMA   4   /* Stopped DMA, Enabled IO */
+#define EEH_PE_STATE_UNAVAIL   5   /* Unavailable  */
 #define EEH_RESET_DEACTIVATE   0   /* Deactivate the PE reset  */
 #define EEH_RESET_HOT  1   /* Hot reset*/
 #define EEH_RESET_FUNDAMENTAL  3   /* Fundamental reset*/
@@ -280,6 +285,13 @@ void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
+int eeh_dev_open(struct pci_dev *pdev);
+void eeh_dev_release(struct pci_dev *pdev);
+struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group);
+int eeh_pe_set_option(struct eeh_pe *pe, int option);
+int eeh_pe_get_state(struct eeh_pe *pe);
+int eeh_pe_reset(struct eeh_pe *pe, int option);
+int eeh_pe_configure(struct eeh_pe *pe);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3bc8b12..fc90df0 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -40,6 +40,7 @@
 #include asm/eeh.h
 #include asm/eeh_event.h
 #include asm/io.h
+#include asm/iommu.h
 #include asm/machdep.h
 #include asm/ppc-pci.h
 #include asm/rtas.h
@@ -108,6 +109,9 @@ struct eeh_ops *eeh_ops = NULL;
 /* Lock to avoid races due to multiple reports of an error */
 DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
+/* Lock to protect passed flags */
+static DEFINE_MUTEX(eeh_dev_mutex);
+
 /* Buffer for reporting pci register dumps. Its here in BSS, and
  * not dynamically alloced, so that it ends up in RMO where RTAS
  * can access it.
@@ -1106,6 +1110,270 @@ void eeh_remove_device(struct pci_dev *dev)
edev-mode = ~EEH_DEV_SYSFS;
 }
 
+/**
+ * eeh_dev_open - Increase count of pass through devices for PE
+ * @pdev: PCI device
+ *
+ * Increase count of passed through devices for the indicated
+ * PE. In the result, the EEH errors detected on the PE won't be
+ * reported. The PE owner will be responsible for detection
+ * and recovery.
+ */
+int eeh_dev_open(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev;
+
+	mutex_lock(&eeh_dev_mutex);
+
+	/* No PCI device ? */
+	if (!pdev)
+		goto out;
+
+	/* No EEH device or PE ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !edev->pe)
+		goto out;
+
+	/* Increase PE's pass through count */
+	atomic_inc(&edev->pe->pass_dev_cnt);
+	mutex_unlock(&eeh_dev_mutex);
+
+	return 0;
+out:
+	mutex_unlock(&eeh_dev_mutex);
+	return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(eeh_dev_open);
+
+/**
+ * eeh_dev_release - Decrease count of pass through devices for PE
+ * @pdev: PCI device
+ *
+ * Decrease count of pass through devices for the indicated PE. If
+ * there is no passed through device in PE, the EEH errors detected
+ * on the PE will be reported and handled as usual.
+ */
+void eeh_dev_release(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev;
+
+	mutex_lock(&eeh_dev_mutex);
+
+	/* No PCI device ? */
+	if (!pdev)
+		goto out;
+
+	/* No EEH device ? */
+	edev = pci_dev_to_eeh_dev(pdev);
+	if (!edev || !edev->pe || !eeh_pe_passed(edev->pe))
+		goto out;
+
+	/* Decrease PE's pass through count */
+	atomic_dec(&edev->pe->pass_dev_cnt);
+	WARN_ON(atomic_read(&edev->pe->pass_dev_cnt) < 0);
+out:
+	mutex_unlock(&eeh_dev_mutex);
+}
+EXPORT_SYMBOL(eeh_dev_release);
+
+/**
+ * eeh_iommu_group_to_pe - Convert IOMMU group to EEH PE
+ * @group: IOMMU group
+ *
+ * The routine is called to convert IOMMU group to EEH PE.
+ */
+struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group)
+{
+   struct iommu_table *tbl;
+   struct pci_dev *pdev = NULL;
+   struct eeh_dev *edev;
+   bool found = false;
+
+  

[PATCH v9 0/3] EEH Support for VFIO PCI Device

2014-06-05 Thread Gavin Shan
The series of patches adds support EEH for PCI devices, which are passed
through to PowerKVM based guest via VFIO. The implementation is straightforward
based on the issues or problems we have to resolve to support EEH for PowerKVM
based guest.

- Emulation for EEH RTAS requests. All EEH RTAS requests goes to QEMU firstly.
  If QEMU can't handle it, the request will be sent to host via newly introduced
  VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in host kernel.

The series of patches requires corresponding QEMU changes.

Change log
==
v1 - v2:
* EEH RTAS requests are routed to QEMU, and then possibly to host 
kernel.
  The mechanism of KVM in-kernel handling is dropped.
* Error injection is reimplemented based on syscall, instead of KVM 
in-kernel
  handling. The logic for error injection token management is moved to
  QEMU. The error injection request is routed to QEMU and then possibly
  to host kernel.
v2 - v3:
* Make the fields in struct eeh_vfio_pci_addr, struct vfio_eeh_info 
based
  on the comments from Alexey.
* Define macros for EEH VFIO operations (Alexey).
* Clear frozen state after successful PE reset.
* Merge original [PATCH 1/2/3] to one.
v3 - v4:
* Remove the error injection from the patchset. Mike or I will work on 
that
  later.
* Rename CONFIG_VFIO_EEH to VFIO_PCI_EEH.
* Rename the IOCTL command to VFIO_EEH_OP and it's handled by VFIO-PCI 
device
  instead of VFIO container.
* Rename the IOCTL argument structure to vfio_eeh_op accordingly. 
Also, more
  fields added to hold return values for RTAS requests.
* The address mapping stuff is totally removed. When opening or 
releasing VFIO
  PCI device, notification sent to EEH to update the flags indicates 
the device
  is passed to guest or not.
* Change pr_warn() to pr_debug() to avoid DOS as pointed by Alex.W
* Argument size check issue pointed by Alex.W.
v4 - v5:
* Functions for VFIO PCI EEH support are moved to eeh.c and exported 
from there.
  VFIO PCI driver just uses those functions to tackle IOCTL command 
VFIO_EEH_OP.
  All of this is to make the code organized in a good way as suggested 
by Alex.G.
  Another potential benefit is PowerNV/pSeries are sharing eeh_ops 
and same
  infrastructure could possiblly work for KVM_PR and KVM_HV mode at the 
same time.
* Don't clear error injection registers after finishing PE reset as the 
patchset
  is doing nothing related to error injection.
* Amending Documentation/vfio.txt, which was missed in last revision.
* No QEMU changes for this revision. v4 works well. Also, remove 
RFC from the
  subject as the design is basically recognized.
v5 - v6:
* CONFIG_VFIO_PCI_EEH removed. Instead to use CONFIG_EEH.
* Split one ioctl command to 5.
* In eeh.c, description has been added for those exported functions. 
Also, the
  functions have negative return values for error and information with 
other values.
  All digital numbers have been replaced by macros defined in eeh.h. 
The comments,
  including the function names have been amended not to mention guest 
or vfio.
* Add one mutex to protect flag in eeh_dev_open()/release().
* More information on how to use those ioctl commands to 
Documentation/vfio.txt.
v6 - v7:
* Remove ioctl command VFIO_EEH_PE_GET_ADDR, the PE address will be 
figured out
  in userland (e.g. QEMU) as Alex.G suggested.
* Let sPAPR VFIO container process the ioctl commands as VFIO container 
is naturally
  corresponds to IOMMU group (aka PE on sPAPR platform).
* All VFIO PCI EEH ioctl commands have argsz+flags for its companion 
data struct.
* For VFIO PCI EEH ioctl commands, ioctl() returns negative number to 
indicate error
  or zero for success. Additinal output information is transported by 
the companion
  data struct.
* Explaining PE in Documentation/vfio.txt, typo fixes, more comments 
suggested by
  Alex.G.
* Split/merge patches according to suggestions from Alex.G and Alex.W.
* To have EEH stub in drivers/vfio/pci/, which was suggested by Alex.W.
* Define various EEH options as macros in vfio.h for userland to use.
v7 - v8:
* Change ioctl commands back to combined one.
* EEH related logic was put into drivers/vfio/vfio_eeh.c, which is only 
built with
  CONFIG_EEH. Otherwise, inline functions defined in 
include/linux/vfio.h
* Change vfio.txt according to the source code changes.
* Fix various comments from internal reviews by Alexey. Thanks to 
Alexey.
v8 - v9:
* Remove unused macros in asm/include/eeh.h
* Missed to disable VFIO device on error from 

[PATCH v9 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
The patch adds new IOCTL commands for sPAPR VFIO container device
to support EEH functionality for PCI devices, which have been passed
through from host to somebody else via VFIO.

Signed-off-by: Gavin Shan gws...@linux.vnet.ibm.com
Acked-by: Alexander Graf ag...@suse.de
---
 Documentation/vfio.txt  | 87 +++--
 drivers/vfio/Makefile   |  1 +
 drivers/vfio/pci/vfio_pci.c | 18 ++--
 drivers/vfio/vfio_iommu_spapr_tce.c | 17 +++-
 drivers/vfio/vfio_spapr_eeh.c   | 87 +
 include/linux/vfio.h| 23 ++
 include/uapi/linux/vfio.h   | 34 +++
 7 files changed, 259 insertions(+), 8 deletions(-)
 create mode 100644 drivers/vfio/vfio_spapr_eeh.c

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..3fa4538 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
real mode which provides
 an excellent performance which has limitations such as inability to do
 locked pages accounting in real time.
 
-So 3 additional ioctls have been added:
+4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
+subtree that can be treated as a unit for the purposes of partitioning and
+error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
+function of a multi-function IOA, or multiple IOAs (possibly including switch
+and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors
+and recover from them via EEH RTAS services, which works on the basis of
+additional ioctl commands.
+
+So 4 additional ioctls have been added:
 
VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
of the DMA window on the PCI bus.
@@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
 
VFIO_IOMMU_DISABLE - disables the container.
 
+   VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
recovery.
 
 The code flow from the example above should be slightly changed:
 
+   struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
+
.
/* Add the group to the container */
ioctl(group, VFIO_GROUP_SET_CONTAINER, container);
@@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
changed:
dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 
	/* Check here if .iova/.size are within DMA window from 
spapr_iommu_info */
-
ioctl(container, VFIO_IOMMU_MAP_DMA, dma_map);
-   .
+
+   /* Get a file descriptor for the device */
+   device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "0000:06:0d.0");
+
+   
+
+   /* Gratuitous device reset and go... */
+   ioctl(device, VFIO_DEVICE_RESET);
+
+   /* Make sure EEH is supported */
+   ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
+
+   /* Enable the EEH functionality on the device */
+   pe_op.op = VFIO_EEH_PE_ENABLE;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* You're suggested to create additional data struct to represent
+* PE, and put child devices belonging to same IOMMU group to the
+* PE instance for later reference.
+*/
+
+   /* Check the PE's state and make sure it's in functional state */
+   pe_op.op = VFIO_EEH_PE_GET_STATE;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* Save device state using pci_save_state().
+* EEH should be enabled on the specified device.
+*/
+
+   
+
+   /* When 0xFF's are returned from reading PCI config space or IO BARs
+    * of the PCI device, check the PE's state to see if it has been
+    * frozen.
+    */
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* Wait for pending PCI transactions to complete, and don't
+    * produce any more PCI traffic from/to the affected PE until
+    * recovery is finished.
+    */
+
+   /* Enable IO for the affected PE and collect logs. Usually, the
+* standard part of PCI config space, AER registers are dumped
+* as logs for further analysis.
+*/
+   pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /*
+* Issue PE reset: hot or fundamental reset. Usually, hot reset
+* is enough. However, the firmware of some PCI adapters would
+* require fundamental reset.
+*/
+   pe_op.op = VFIO_EEH_PE_RESET_HOT;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+   pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* Configure the PCI bridges for the affected PE */
+   pe_op.op = VFIO_EEH_PE_CONFIGURE;
+   ioctl(container, VFIO_EEH_PE_OP, pe_op);
+
+   /* Restore the state we saved at initialization time. pci_restore_state()
+* is good enough 

RE: [linuxppc-release] [PATCH][v10] powerpc/mpc85xx:Add initial device tree support of T104x

2014-06-05 Thread shengzhou....@freescale.com


 -Original Message-
 From: linuxppc-release-boun...@linux.freescale.net [mailto:linuxppc-
 release-boun...@linux.freescale.net] On Behalf Of Prabhakar Kushwaha
 Sent: Monday, April 21, 2014 7:34 PM
 To: linuxppc-dev@lists.ozlabs.org
 Cc: Wood Scott-B07421; Jain Priyanka-B32167; Aggrwal Poonam-B10812;
 Kushwaha Prabhakar-B32579
 Subject: [linuxppc-release] [PATCH][v10] powerpc/mpc85xx:Add initial
 device tree support of T104x
 
 The QorIQ T1040/T1042 processor support four integrated 64-bit e5500 PA
 processor cores with high-performance data path acceleration architecture
 and network peripheral interfaces required for networking 
 telecommunications.
 
 +
 + iommu@2 {
 + compatible = "fsl,pamu-v1.0", "fsl,pamu";
 + reg = <0x2 0x1000>;
 + ranges = <0 0x2 0x1000>;
 + #address-cells = <1>;
 + #size-cells = <1>;
 + interrupts = <
 + 24 2 0 0
 + 16 2 1 30>;
 + pamu0: pamu@0 {
 + reg = <0 0x1000>;
 + fsl,primary-cache-geometry = <128 1>;
 + fsl,secondary-cache-geometry = <16 2>;
 + };


[Shengzhou]  T1040 RM says:
Hardware coherent PAMU Look-aside caches to improve performance
* A 32-entry, direct-mapped primary PAACT cache
* A 128-entry, 2-way, set-associative secondary PAACT cache
It appears it should be: 
   fsl,primary-cache-geometry = <32 1>;
   fsl,secondary-cache-geometry = <128 2>;

Is there any reason that it was <128 1>, <16 2>?

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 1/2] powerpc/powernv: include asm/smp.h to handle UP config

2014-06-05 Thread Anshuman Khandual
On 06/05/2014 08:51 PM, Shreyas B. Prabhu wrote:
 Build throws following errors when CONFIG_SMP=n
 arch/powerpc/platforms/powernv/setup.c: In function 
 ‘pnv_kexec_wait_secondaries_down’:
 arch/powerpc/platforms/powernv/setup.c:179:4: error: implicit declaration of 
 function ‘get_hard_smp_processor_id’
 rc = opal_query_cpu_status(get_hard_smp_processor_id(i),
 
 The usage of get_hard_smp_processor_id() needs the declaration from
 asm/smp.h. The file setup.c includes linux/sched.h, which in-turn
 includes linux/smp.h. However, linux/smp.h includes asm/smp.h
 only on SMP configs and hence UP builds fail.
 
 Fix this by directly including asm/smp.h in setup.c unconditionally.

Can you please clean up the description in the commit message? Also, the
first line of the commit message should mention that the patch is
trying to fix a UP-specific build failure.

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev

Re: [PATCH 2/2] powerpc/powernv : Disable subcore for UP configs

2014-06-05 Thread Anshuman Khandual
On 06/05/2014 08:54 PM, Shreyas B. Prabhu wrote:
 Build throws following errors when CONFIG_SMP=n
 arch/powerpc/platforms/powernv/subcore.c: In function ‘cpu_update_split_mode’:
 arch/powerpc/platforms/powernv/subcore.c:274:15: error: ‘setup_max_cpus’ 
 undeclared (first use in this function)
 arch/powerpc/platforms/powernv/subcore.c:285:5: error: lvalue required as 
 left operand of assignment
 
 'setup_max_cpus' variable is relevant only on SMP, so there is no point
 working around it for UP. Furthermore, subcore.c itself is relevant only
 on SMP and hence the better solution is to exclude subcore.c for UP builds.
 
 Signed-off-by: Shreyas B. Prabhu shre...@linux.vnet.ibm.com
 ---
 This patch applies on top of ben/powerpc.git/next branch
 
  arch/powerpc/platforms/powernv/Makefile | 4 ++--
  1 file changed, 2 insertions(+), 2 deletions(-)
 
 diff --git a/arch/powerpc/platforms/powernv/Makefile 
 b/arch/powerpc/platforms/powernv/Makefile
 index 4ad0d34..636d206 100644
 --- a/arch/powerpc/platforms/powernv/Makefile
 +++ b/arch/powerpc/platforms/powernv/Makefile
 @@ -1,9 +1,9 @@
  obj-y+= setup.o opal-takeover.o opal-wrappers.o 
 opal.o opal-async.o
  obj-y+= opal-rtc.o opal-nvram.o opal-lpc.o 
 opal-flash.o
  obj-y+= rng.o opal-elog.o opal-dump.o 
 opal-sysparam.o opal-sensor.o
 -obj-y+= opal-msglog.o subcore.o subcore-asm.o
 +obj-y+= opal-msglog.o subcore-asm.o
 

subcore-asm.o can also move down here as well ?

 -obj-$(CONFIG_SMP)+= smp.o
 +obj-$(CONFIG_SMP)+= smp.o subcore.o

___
Linuxppc-dev mailing list
Linuxppc-dev@lists.ozlabs.org
https://lists.ozlabs.org/listinfo/linuxppc-dev