[PATCH 0/8 v5] KVM: PPC: IOMMU in-kernel handling

2013-07-06 Thread Alexey Kardashevskiy
The changes are:
1. rebased on v3.10
2. added arch_spin_locks to protect TCE table in real mode
3. reworked VFIO external API
4. added missing bits for real mode handling of TCE requests on p7ioc

More details in the individual patch comments.

Depends on "hashtable: add hash_for_each_possible_rcu_notrace()",
posted earlier today.

Alexey Kardashevskiy (8):
  KVM: PPC: reserve a capability number for multitce support
  KVM: PPC: reserve a capability and ioctl numbers for realmode VFIO
  vfio: add external user support
  powerpc: Prepare to support kernel handling of IOMMU map/unmap
  powerpc: add real mode support for dma operations on powernv
  KVM: PPC: Add support for multiple-TCE hcalls
  KVM: PPC: Add support for IOMMU in-kernel handling
  KVM: PPC: Add hugepage support for IOMMU in-kernel handling

 Documentation/virtual/kvm/api.txt |  51 +++
 arch/powerpc/include/asm/iommu.h  |   9 +-
 arch/powerpc/include/asm/kvm_host.h   |  37 ++
 arch/powerpc/include/asm/kvm_ppc.h|  18 +-
 arch/powerpc/include/asm/machdep.h|  12 +
 arch/powerpc/include/asm/pgtable-ppc64.h  |   4 +
 arch/powerpc/include/uapi/asm/kvm.h   |   7 +
 arch/powerpc/kernel/iommu.c   | 200 +++
 arch/powerpc/kvm/book3s_64_vio.c  | 541 +-
 arch/powerpc/kvm/book3s_64_vio_hv.c   | 404 --
 arch/powerpc/kvm/book3s_hv.c  |  41 ++-
 arch/powerpc/kvm/book3s_hv_rmhandlers.S   |   6 +
 arch/powerpc/kvm/book3s_pr_papr.c |  37 +-
 arch/powerpc/kvm/powerpc.c|  15 +
 arch/powerpc/mm/init_64.c |  78 -
 arch/powerpc/platforms/powernv/pci-ioda.c |  26 +-
 arch/powerpc/platforms/powernv/pci.c  |  38 ++-
 arch/powerpc/platforms/powernv/pci.h  |   2 +-
 drivers/vfio/vfio.c   |  35 ++
 include/linux/page-flags.h|   4 +-
 include/linux/vfio.h  |   7 +
 include/uapi/linux/kvm.h  |   3 +
 22 files changed, 1453 insertions(+), 122 deletions(-)

-- 
1.8.3.2



[PATCH 4/8] powerpc: Prepare to support kernel handling of IOMMU map/unmap

2013-07-06 Thread Alexey Kardashevskiy
The current VFIO-on-POWER implementation supports only user mode
driven mapping, i.e. QEMU sends requests to map/unmap pages.
However, this approach is quite slow, so we want to move it into KVM.
Since H_PUT_TCE can be extremely performance sensitive (especially with
network adapters where each packet needs to be mapped/unmapped) we chose
to implement it as a fast hypercall handled directly in real
mode (processor still in the guest context but with the MMU off).

To be able to do that, we need to provide some facilities to
access the struct page count within that real mode environment as things
like the sparsemem vmemmap mappings aren't accessible.

This adds an API to increment/decrement the page use counter, as
the get_user_pages() API used for user mode mapping does not work
in real mode.

CONFIG_SPARSEMEM_VMEMMAP and CONFIG_FLATMEM are supported.
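
To illustrate the intended use, here is a minimal sketch of a real
mode caller (hypothetical, not part of this patch; the H_TOO_HARD
convention for retrying an hcall in virtual mode comes from the
existing KVM PPC handlers):

/* Hypothetical caller, for illustration only. Returning H_TOO_HARD
 * makes KVM retry the hcall in virtual mode, where get_user_pages()
 * can be used instead of this API. */
static long kvmppc_rm_map_page(unsigned long pfn)
{
	struct page *page = realmode_pfn_to_page(pfn);

	/* NULL if the page struct is split between sparsemem blocks */
	if (!page)
		return H_TOO_HARD;

	/* Fails for compound (huge) pages and zero-refcount pages */
	if (realmode_get_page(page))
		return H_TOO_HARD;

	/* ... write the TCE entry; on error call realmode_put_page() ... */

	return H_SUCCESS;
}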

Reviewed-by: Paul Mackerras pau...@samba.org
Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/06/27:
* realmode_get_page() fixed to use get_page_unless_zero(). If that fails,
the call is passed from real to virtual mode and safely handled there.
* added comment to PageCompound() in include/linux/page-flags.h.

2013/05/20:
* PageTail() is replaced by PageCompound() in order to have the same checks
for whether the page is huge in realmode_get_page() and realmode_put_page()

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/pgtable-ppc64.h |  4 ++
 arch/powerpc/mm/init_64.c| 78 +++-
 include/linux/page-flags.h   |  4 +-
 3 files changed, 84 insertions(+), 2 deletions(-)

diff --git a/arch/powerpc/include/asm/pgtable-ppc64.h b/arch/powerpc/include/asm/pgtable-ppc64.h
index e3d55f6f..7b46e5f 100644
--- a/arch/powerpc/include/asm/pgtable-ppc64.h
+++ b/arch/powerpc/include/asm/pgtable-ppc64.h
@@ -376,6 +376,10 @@ static inline pte_t *find_linux_pte_or_hugepte(pgd_t *pgdir, unsigned long ea,
 }
 #endif /* !CONFIG_HUGETLB_PAGE */
 
+struct page *realmode_pfn_to_page(unsigned long pfn);
+int realmode_get_page(struct page *page);
+int realmode_put_page(struct page *page);
+
 #endif /* __ASSEMBLY__ */
 
 #endif /* _ASM_POWERPC_PGTABLE_PPC64_H_ */
diff --git a/arch/powerpc/mm/init_64.c b/arch/powerpc/mm/init_64.c
index a90b9c4..7031be3 100644
--- a/arch/powerpc/mm/init_64.c
+++ b/arch/powerpc/mm/init_64.c
@@ -297,5 +297,81 @@ void vmemmap_free(unsigned long start, unsigned long end)
 {
 }
 
-#endif /* CONFIG_SPARSEMEM_VMEMMAP */
+/*
+ * We do not have access to the sparsemem vmemmap, so we fall back to
+ * walking the list of sparsemem blocks which we already maintain for
+ * the sake of crashdump. In the long run, we might want to maintain
+ * a tree if performance of that linear walk becomes a problem.
+ *
+ * Any of the realmode_* functions can fail due to:
+ * 1) As real sparsemem blocks do not lie in RAM contiguously (they
+ * are in virtual address space which is not available in real mode),
+ * the requested page struct can be split between blocks so get_page/put_page
+ * may fail.
+ * 2) When huge pages are used, the get_page/put_page API will fail
+ * in real mode as the linked addresses in the page struct are virtual
+ * too.
+ * When 1) or 2) takes place, the API returns an error code to cause
+ * an exit to kernel virtual mode where the operation will be completed.
+ */
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+   struct vmemmap_backing *vmem_back;
+   struct page *page;
+   unsigned long page_size = 1 << mmu_psize_defs[mmu_vmemmap_psize].shift;
+   unsigned long pg_va = (unsigned long) pfn_to_page(pfn);
 
+   for (vmem_back = vmemmap_list; vmem_back; vmem_back = vmem_back->list) {
+   if (pg_va < vmem_back->virt_addr)
+   continue;
+
+   /* Check that page struct is not split between real pages */
+   if ((pg_va + sizeof(struct page)) >
+   (vmem_back->virt_addr + page_size))
+   return NULL;
+
+   page = (struct page *) (vmem_back->phys + pg_va -
+   vmem_back->virt_addr);
+   return page;
+   }
+
+   return NULL;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#elif defined(CONFIG_FLATMEM)
+
+struct page *realmode_pfn_to_page(unsigned long pfn)
+{
+   struct page *page = pfn_to_page(pfn);
+   return page;
+}
+EXPORT_SYMBOL_GPL(realmode_pfn_to_page);
+
+#endif /* CONFIG_SPARSEMEM_VMEMMAP/CONFIG_FLATMEM */
+
+#if defined(CONFIG_SPARSEMEM_VMEMMAP) || defined(CONFIG_FLATMEM)
+int realmode_get_page(struct page *page)
+{
+   if (PageCompound(page))
+   return -EAGAIN;
+
+   if (!get_page_unless_zero(page))
+   return -EAGAIN;
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(realmode_get_page);
+
+int realmode_put_page(struct page *page)
+{
+   if (PageCompound(page))
+   return -EAGAIN;
+
+  

[PATCH 6/8] KVM: PPC: Add support for multiple-TCE hcalls

2013-07-06 Thread Alexey Kardashevskiy
This adds real mode handlers for the H_PUT_TCE_INDIRECT and
H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
devices or emulated PCI.  These calls allow adding multiple entries
(up to 512) into the TCE table in one call which saves time on
transition to/from real mode.

This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
(copied from user and verified) before writing the whole list into
the TCE table. This cache will be utilized more in the upcoming
VFIO/IOMMU support to continue TCE list processing in virtual
mode if the real mode handler fails for some reason.

This adds a guest physical to host real address converter
and calls the existing H_PUT_TCE handler. The conversion function
is going to be fully utilized by the upcoming VFIO support patches.

This also implements the KVM_CAP_PPC_MULTITCE capability,
so in order to support the functionality of this patch, QEMU
needs to query for this capability and set the "hcall-multi-tce"
hypertas property only if the capability is present; otherwise
there will be serious performance degradation.
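
For example, the expected userspace negotiation could look like the
sketch below (illustrative only; spapr_add_hypertas() is a made-up
stand-in for however userspace builds the ibm,hypertas-functions
property):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* Hypothetical helper: appends a name to "ibm,hypertas-functions" */
extern void spapr_add_hypertas(const char *name);

static void maybe_advertise_multitce(int kvm_fd)
{
	/* kvm_fd is an open /dev/kvm descriptor */
	if (ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SPAPR_MULTITCE) > 0)
		spapr_add_hypertas("hcall-multi-tce");
}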

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---
Changelog:
2013/07/06:
* fixed a number of wrong get_page()/put_page() calls

2013/06/27:
* fixed clear of BUSY bit in kvmppc_lookup_pte()
* H_PUT_TCE_INDIRECT does realmode_get_page() now
* KVM_CAP_SPAPR_MULTITCE now depends on CONFIG_PPC_BOOK3S_64
* updated doc

2013/06/05:
* fixed typo about IBMVIO in the commit message
* updated doc and moved it to another section
* changed capability number

2013/05/21:
* added kvm_vcpu_arch::tce_tmp
* removed cleanup if put_indirect failed; instead we do not even start
writing to the TCE table if we cannot get TCEs from the user or they are
invalid
* kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
and kvmppc_emulated_validate_tce (for the previous item)
* fixed bug with fallthrough for H_IPI
* removed all get_user() from real mode handlers
* kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 Documentation/virtual/kvm/api.txt   |  25 +++
 arch/powerpc/include/asm/kvm_host.h |   9 ++
 arch/powerpc/include/asm/kvm_ppc.h  |  16 +-
 arch/powerpc/kvm/book3s_64_vio.c| 154 ++-
 arch/powerpc/kvm/book3s_64_vio_hv.c | 260 
 arch/powerpc/kvm/book3s_hv.c|  41 -
 arch/powerpc/kvm/book3s_hv_rmhandlers.S |   6 +
 arch/powerpc/kvm/book3s_pr_papr.c   |  37 -
 arch/powerpc/kvm/powerpc.c  |   3 +
 9 files changed, 517 insertions(+), 34 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 6365fef..762c703 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2362,6 +2362,31 @@ calls by the guest for that service will be passed to 
userspace to be
 handled.
 
 
+4.86 KVM_CAP_PPC_MULTITCE
+
+Capability: KVM_CAP_PPC_MULTITCE
+Architectures: ppc
+Type: vm
+
+This capability means the kernel is capable of handling hypercalls
+H_PUT_TCE_INDIRECT and H_STUFF_TCE without passing those into the user
+space. This significantly accelerates DMA operations for PPC KVM guests.
+The user space should expect that its handlers for these hypercalls
+are not going to be called.
+
+In order to enable H_PUT_TCE_INDIRECT and H_STUFF_TCE use in the guest,
+the user space might have to advertise it for the guest. For example,
+an IBM pSeries guest starts using them if "hcall-multi-tce" is present in
+the "ibm,hypertas-functions" device-tree property.
+
+Without this capability, only H_PUT_TCE is handled by the kernel and
+therefore the use of H_PUT_TCE_INDIRECT and H_STUFF_TCE is not recommended
+unless the capability is present, as passing hypercalls to the userspace
+slows operations a lot.
+
+Unlike other capabilities of this section, this one is always enabled.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index af326cd..20d04bd 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -180,6 +180,7 @@ struct kvmppc_spapr_tce_table {
struct kvm *kvm;
u64 liobn;
u32 window_size;
+   struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
struct page *pages[0];
 };
 
@@ -609,6 +610,14 @@ struct kvm_vcpu_arch {
spinlock_t tbacct_lock;
u64 busy_stolen;
u64 busy_preempt;
+
+   unsigned long *tce_tmp_hpas;    /* TCE cache for H_PUT_TCE_INDIRECT hcall */
+   enum {
+   TCERM_NONE,
+   TCERM_GETPAGE,
+   TCERM_PUTTCE,
+   TCERM_PUTLIST,
+   } tce_rm_fail;  /* failed stage of request processing */
 #endif
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 

[PATCH 3/8] vfio: add external user support

2013-07-06 Thread Alexey Kardashevskiy
VFIO is designed to be used via ioctls on file descriptors
returned by VFIO.

However, in some situations support for an external user is required.
The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
use the existing VFIO groups for exclusive access in real/virtual mode
on a host to avoid passing map/unmap requests to the user space, which
would make things pretty slow.

The proposed protocol includes:

1. do normal VFIO init stuff such as opening a new container, attaching
group(s) to it, setting an IOMMU driver for a container. When IOMMU is
set for a container, all groups in it are considered ready to use by
an external user.

2. pass the fd of the group we want to accelerate to KVM. KVM calls
vfio_group_get_external_user() to verify that the group is initialized
and an IOMMU is set for it, and to increment the container user counter
to prevent the VFIO group from being disposed of prior to KVM exit.
The current TCE IOMMU driver marks the whole IOMMU table as busy when
an IOMMU is set for a container, which prevents other DMA users from
allocating from it, so it is safe to grant user space access to it.

3. KVM calls vfio_external_user_iommu_id() to obtain an IOMMU ID which
KVM uses to get an iommu_group struct for later use.

4. When KVM is finished, it calls vfio_group_put_external_user() to
release the VFIO group by decrementing the container user counter.
Everything gets released.

The "vfio: Limit group opens" patch is also required for consistency.
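
For reference, the consumer (KVM) side of steps 2-4 might look like
this sketch (error paths trimmed; kvm_spapr_register_group() and its
ioctl plumbing are assumptions, not code from this series):

/* Sketch only: takes the group fd passed in from userspace. */
static long kvm_spapr_register_group(int group_fd)
{
	struct file *filp = fget(group_fd);
	struct vfio_group *vgrp;
	int iommu_id;

	if (!filp)
		return -EBADF;

	vgrp = vfio_group_get_external_user(filp);	/* step 2 */
	if (!vgrp) {
		fput(filp);
		return -EBUSY;
	}

	iommu_id = vfio_external_user_iommu_id(vgrp);	/* step 3 */
	/* ... iommu_group_get_by_id(iommu_id), wire up the TCE table ... */

	/* Later, at KVM exit, the reference is dropped again: */
	vfio_group_put_external_user(vgrp);		/* step 4 */
	fput(filp);
	return 0;
}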

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index c488da5..57aa191 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1370,6 +1370,62 @@ static const struct file_operations vfio_device_fops = {
 };
 
 /**
+ * External user API, exported by symbols to be linked dynamically.
+ *
+ * The protocol includes:
+ *  1. do normal VFIO init operation:
+ * - opening a new container;
+ * - attaching group(s) to it;
+ * - setting an IOMMU driver for a container.
+ * When IOMMU is set for a container, all groups in it are
+ * considered ready to use by an external user.
+ *
+ *  2. The user space passes to KVM a fd of the group it wants to
+ * accelerate. KVM uses vfio_group_get_external_user() to verify that:
+ * - the group is initialized;
+ * - IOMMU is set for it.
+ * Then vfio_group_get_external_user() increments the container user
+ * counter to prevent the VFIO group from disposal prior to KVM exit.
+ *
+ * 3. KVM calls vfio_external_user_iommu_id() to know an IOMMU ID which
+ * KVM uses to get an iommu_group struct for later use.
+ *
+ * 4. When KVM is finished, it calls vfio_group_put_external_user() to
+ * release the VFIO group by decrementing the container user counter.
+ */
+struct vfio_group *vfio_group_get_external_user(struct file *filep)
+{
+   struct vfio_group *group = filep->private_data;
+
+   if (filep->f_op != &vfio_group_fops)
+   return NULL;
+
+   if (!atomic_inc_not_zero(&group->container_users))
+   return NULL;
+
+   if (!group->container->iommu_driver ||
+   !vfio_group_viable(group)) {
+   atomic_dec(&group->container_users);
+   return NULL;
+   }
+
+   return group;
+}
+EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
+
+void vfio_group_put_external_user(struct vfio_group *group)
+{
+   vfio_group_try_dissolve_container(group);
+}
+EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
+
+int vfio_external_user_iommu_id(struct vfio_group *group)
+{
+   return iommu_group_id(group->iommu_group);
+}
+EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
+
+/**
  * Module/class support
  */
 static char *vfio_devnode(struct device *dev, umode_t *mode)
diff --git a/include/linux/vfio.h b/include/linux/vfio.h
index ac8d488..24579a0 100644
--- a/include/linux/vfio.h
+++ b/include/linux/vfio.h
@@ -90,4 +90,11 @@ extern void vfio_unregister_iommu_driver(
TYPE tmp;   \
offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \
 
+/*
+ * External user API
+ */
+extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
+extern void vfio_group_put_external_user(struct vfio_group *group);
+extern int vfio_external_user_iommu_id(struct vfio_group *group);
+
 #endif /* VFIO_H */
-- 
1.8.3.2



[PATCH 2/8] KVM: PPC: reserve a capability and ioctl numbers for realmode VFIO

2013-07-06 Thread Alexey Kardashevskiy
This is to reserve a capability number and an ioctl number for upcoming
support of VFIO-IOMMU DMA operations in real mode.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/uapi/linux/kvm.h | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 970b1f5..0865c01 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_SPAPR_MULTITCE 93
+#define KVM_CAP_SPAPR_TCE_IOMMU 94
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
@@ -923,6 +924,7 @@ struct kvm_s390_ucas_mapping {
 /* Available with KVM_CAP_PPC_ALLOC_HTAB */
 #define KVM_PPC_ALLOCATE_HTAB	  _IOWR(KVMIO, 0xa7, __u32)
 #define KVM_CREATE_SPAPR_TCE	  _IOW(KVMIO,  0xa8, struct kvm_create_spapr_tce)
+#define KVM_CREATE_SPAPR_TCE_IOMMU _IOW(KVMIO,  0xaf, struct kvm_create_spapr_tce_iommu)
 /* Available with KVM_CAP_RMA */
 #define KVM_ALLOCATE_RMA	  _IOR(KVMIO,  0xa9, struct kvm_allocate_rma)
 /* Available with KVM_CAP_PPC_HTAB_FD */
-- 
1.8.3.2



[PATCH 7/8] KVM: PPC: Add support for IOMMU in-kernel handling

2013-07-06 Thread Alexey Kardashevskiy
This allows the host kernel to handle H_PUT_TCE, H_PUT_TCE_INDIRECT
and H_STUFF_TCE requests without passing them to QEMU, which saves time
on switching to QEMU and back.

Both real and virtual modes are supported. The kernel first tries to
handle a TCE request in real mode; if that fails, it passes the request
to virtual mode to complete the operation. If the virtual mode handler
also fails, the request is passed to user mode.

This adds a new KVM_CREATE_SPAPR_TCE_IOMMU ioctl (guarded by the
KVM_CAP_SPAPR_TCE_IOMMU capability) to associate a virtual PCI bus ID
(LIOBN) with an IOMMU group, which enables in-kernel handling of IOMMU
map/unmap. The external user API support in VFIO is required.

Tests show that this patch increases transmission speed from 220MB/s
to 750..1020MB/s on a 10Gb network (Chelsio CXGB3 10Gb Ethernet card).
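
A sketch of the userspace side follows (illustrative; how liobn and
the IOMMU group number are obtained is outside the scope of this
patch):

#include <sys/ioctl.h>
#include <linux/kvm.h>

/* vm_fd: KVM VM descriptor; liobn: guest bus ID; group_id: host IOMMU
 * group number (e.g. taken from /sys/kernel/iommu_groups). */
static int link_liobn_to_group(int vm_fd, __u64 liobn, __u32 group_id)
{
	struct kvm_create_spapr_tce_iommu args = {
		.liobn = liobn,
		.iommu_id = group_id,
		.flags = 0,	/* no flags defined yet */
	};

	/* On failure, fall back to userspace TCE handling */
	return ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_IOMMU, &args);
}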

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/07/06:
* added realmode arch_spin_lock to protect TCE table from races
in real and virtual modes
* POWERPC IOMMU API is changed to support real mode
* iommu_take_ownership and iommu_release_ownership are protected by
iommu_table's locks
* reworked the use of the VFIO external user API
* multiple small fixes

2013/06/27:
* the tce_list page is now referenced in order to protect it from accidental
invalidation during H_PUT_TCE_INDIRECT execution
* added use of the external user VFIO API

2013/06/05:
* changed capability number
* changed ioctl number
* update the doc article number

2013/05/20:
* removed get_user() from real mode handlers
* kvm_vcpu_arch::tce_tmp usage extended. Now the real mode handler puts
translated TCEs there, tries realmode_get_page() on them and, if that
fails, passes control to the virtual mode handler which tries to finish
the request handling
* kvmppc_lookup_pte() now does realmode_get_page() protected by BUSY bit
on a page
* the only reason to pass the request to user mode now is when user mode
did not register a TCE table in the kernel; in all other cases the virtual
mode handler is expected to do the job

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 Documentation/virtual/kvm/api.txt   |  26 
 arch/powerpc/include/asm/iommu.h|   9 +-
 arch/powerpc/include/asm/kvm_host.h |   3 +
 arch/powerpc/include/asm/kvm_ppc.h  |   2 +
 arch/powerpc/include/uapi/asm/kvm.h |   7 +
 arch/powerpc/kernel/iommu.c | 196 +++
 arch/powerpc/kvm/book3s_64_vio.c| 299 +++-
 arch/powerpc/kvm/book3s_64_vio_hv.c | 129 
 arch/powerpc/kvm/powerpc.c  |  12 ++
 9 files changed, 609 insertions(+), 74 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt b/Documentation/virtual/kvm/api.txt
index 762c703..01b0dc2 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2387,6 +2387,32 @@ slows operations a lot.
 Unlike other capabilities of this section, this one is always enabled.
 
 
+4.87 KVM_CREATE_SPAPR_TCE_IOMMU
+
+Capability: KVM_CAP_SPAPR_TCE_IOMMU
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_iommu (in)
+Returns: 0 on success, -1 on error
+
+struct kvm_create_spapr_tce_iommu {
+   __u64 liobn;
+   __u32 iommu_id;
+   __u32 flags;
+};
+
+This creates a link between an IOMMU group and a hardware TCE (translation
+control entry) table. The link lets the host kernel know which IOMMU
+group (i.e. TCE table) to use for the LIOBN number passed with
+the H_PUT_TCE, H_PUT_TCE_INDIRECT and H_STUFF_TCE hypercalls.
+
+In response to a TCE hypercall, the kernel looks for a TCE table descriptor
+in the list and handles the hypercall in real or virtual mode if
+the descriptor is found. Otherwise the hypercall is passed to user mode.
+
+No flags are supported at the moment.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/iommu.h b/arch/powerpc/include/asm/iommu.h
index 98d1422..0845505 100644
--- a/arch/powerpc/include/asm/iommu.h
+++ b/arch/powerpc/include/asm/iommu.h
@@ -78,6 +78,7 @@ struct iommu_table {
unsigned long *it_map;   /* A simple allocation bitmap for now */
 #ifdef CONFIG_IOMMU_API
struct iommu_group *it_group;
+   arch_spinlock_t it_rm_lock;
 #endif
 };
 
@@ -159,9 +160,9 @@ extern int iommu_tce_clear_param_check(struct iommu_table *tbl,
 extern int iommu_tce_put_param_check(struct iommu_table *tbl,
unsigned long ioba, unsigned long tce);
 extern int iommu_tce_build(struct iommu_table *tbl, unsigned long entry,
-   unsigned long hwaddr, enum dma_data_direction direction);
-extern unsigned long iommu_clear_tce(struct iommu_table *tbl,
-   unsigned long entry);
+   unsigned long *hpas, unsigned long npages, bool rm);
+extern int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
+   unsigned long npages, bool rm);
 extern int iommu_clear_tces_and_put_pages(struct iommu_table *tbl,
unsigned long 

[PATCH 1/8] KVM: PPC: reserve a capability number for multitce support

2013-07-06 Thread Alexey Kardashevskiy
This is to reserve a capability number for upcoming support
of the H_PUT_TCE_INDIRECT and H_STUFF_TCE pseries hypercalls
which support multiple DMA map/unmap operations per call.

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index d88c8ee..970b1f5 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IRQ_MPIC 90
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
+#define KVM_CAP_SPAPR_MULTITCE 93
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
1.8.3.2



[PATCH 8/8] KVM: PPC: Add hugepage support for IOMMU in-kernel handling

2013-07-06 Thread Alexey Kardashevskiy
This adds special support for huge pages (16MB).  The reference
counting cannot be easily done for such pages in real mode (when
the MMU is off) so we add a list of huge pages.  It is populated in
virtual mode and get_page is called just once per huge page.
Real mode handlers check if the requested page is huge and in the list;
if so, no reference counting is done, otherwise an exit to virtual mode
happens.  The list is released at KVM exit.  At the moment the fastest
card available for tests uses up to 9 huge pages so walking through this
list is not very expensive.  However this can change and we may want
to optimize this.
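
A minimal sketch of the real mode lookup over that hashtable
(simplified, not the exact patch code; it assumes the
kvmppc_spapr_iommu_hugepage struct and KVMPPC_SPAPR_HUGEPAGE_HASH
macro introduced below, plus hash_for_each_possible_rcu_notrace()
from the patch this series depends on):

/* Returns the host physical address for gpa, or 0 to signal that the
 * page is not hashed yet and the handler must exit to virtual mode. */
static unsigned long kvmppc_rm_hugepage_gpa_to_hpa(
		struct kvmppc_spapr_tce_table *tt, unsigned long gpa)
{
	struct kvmppc_spapr_iommu_hugepage *hp;
	const unsigned key = KVMPPC_SPAPR_HUGEPAGE_HASH(gpa);

	hash_for_each_possible_rcu_notrace(tt->hash_tab, hp,
			hash_node, key) {
		if ((gpa < hp->gpa) || (gpa >= hp->gpa + hp->size))
			continue;
		/* The offset within the huge page is preserved */
		return hp->hpa + (gpa & (hp->size - 1));
	}

	return 0;	/* not found: go to virtual mode */
}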

Signed-off-by: Paul Mackerras pau...@samba.org
Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru

---

Changes:
2013/06/27:
* list of huge pages replaced with a hashtable for better performance
* spinlock removed from real mode; it only protects insertion of new
huge page descriptors into the hashtable

2013/06/05:
* fixed compile error when CONFIG_IOMMU_API=n

2013/05/20:
* the real mode handler now searches for a huge page by gpa (used to be pte)
* the virtual mode handler prints a warning if it is called twice for the
same huge page as the real mode handler is expected to fail just once - when
a huge page is not in the list yet.
* the huge page is refcounted twice - when added to the hugepage list and
when used in the virtual mode hcall handler (can be optimized but it would
make the patch less nice).

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/kvm_host.h |  25 +
 arch/powerpc/kernel/iommu.c |   6 ++-
 arch/powerpc/kvm/book3s_64_vio.c| 104 +---
 arch/powerpc/kvm/book3s_64_vio_hv.c |  21 ++--
 4 files changed, 146 insertions(+), 10 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
index 53e61b2..a7508cf 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -30,6 +30,7 @@
 #include <linux/kvm_para.h>
 #include <linux/list.h>
 #include <linux/atomic.h>
+#include <linux/hashtable.h>
 #include <asm/kvm_asm.h>
 #include <asm/processor.h>
 #include <asm/page.h>
@@ -182,10 +183,34 @@ struct kvmppc_spapr_tce_table {
u32 window_size;
struct iommu_group *grp;/* used for IOMMU groups */
struct vfio_group *vfio_grp;/* used for IOMMU groups */
+   DECLARE_HASHTABLE(hash_tab, ilog2(64)); /* used for IOMMU groups */
+   spinlock_t hugepages_write_lock;/* used for IOMMU groups */
struct { struct { unsigned long put, indir, stuff; } rm, vm; } stat;
struct page *pages[0];
 };
 
+/*
+ * The KVM guest can be backed with 16MB pages.
+ * In this case, we cannot do page counting from real mode
+ * as compound pages are used - they are linked in a list
+ * with pointers as virtual addresses which are inaccessible
+ * in real mode.
+ *
+ * The code below keeps a list of 16MB pages and uses the page struct
+ * in real mode if it is already locked in RAM and inserted into
+ * the list, or switches to virtual mode where it can be
+ * handled in the usual manner.
+ */
+#define KVMPPC_SPAPR_HUGEPAGE_HASH(gpa)	hash_32(gpa >> 24, 32)
+
+struct kvmppc_spapr_iommu_hugepage {
+   struct hlist_node hash_node;
+   unsigned long gpa;  /* Guest physical address */
+   unsigned long hpa;  /* Host physical address */
+   struct page *page;  /* page struct of the very first subpage */
+   unsigned long size; /* Huge page size (always 16MB at the moment) */
+};
+
 struct kvmppc_linear_info {
	void		*base_virt;
	unsigned long	 base_pfn;
diff --git a/arch/powerpc/kernel/iommu.c b/arch/powerpc/kernel/iommu.c
index 51678ec..e0b6eca 100644
--- a/arch/powerpc/kernel/iommu.c
+++ b/arch/powerpc/kernel/iommu.c
@@ -999,7 +999,8 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
if (!pg) {
ret = -EAGAIN;
} else if (PageCompound(pg)) {
-   ret = -EAGAIN;
+   /* Hugepages will be released at KVM exit */
+   ret = 0;
} else {
if (oldtce & TCE_PCI_WRITE)
SetPageDirty(pg);
@@ -1009,6 +1010,9 @@ int iommu_free_tces(struct iommu_table *tbl, unsigned long entry,
struct page *pg = pfn_to_page(oldtce >> PAGE_SHIFT);
if (!pg) {
ret = -EAGAIN;
+   } else if (PageCompound(pg)) {
+   /* Hugepages will be released at KVM exit */
+   ret = 0;
} else {
if (oldtce & TCE_PCI_WRITE)
SetPageDirty(pg);
diff --git 

[PATCH 5/8] powerpc: add real mode support for dma operations on powernv

2013-07-06 Thread Alexey Kardashevskiy
The existing TCE machine calls (tce_build and tce_free) only support
virtual mode as they call __raw_writeq for TCE invalidation, which
fails in real mode.

This introduces tce_build_rm and tce_free_rm real mode versions
which do mostly the same but use the Store Doubleword Caching Inhibited
Indexed (stdcix) instruction for TCE invalidation.

This new feature is going to be utilized by real mode support of VFIO.
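
A hypothetical caller sketch (not from this patch) showing how a real
mode path would pick the _rm callbacks; ppc_md is the global
machdep_calls instance:

static void tce_invalidate(struct iommu_table *tbl, bool rm)
{
	if (rm && ppc_md.tce_flush_rm)
		ppc_md.tce_flush_rm(tbl);	/* stdcix, safe with MMU off */
	else if (ppc_md.tce_flush)
		ppc_md.tce_flush(tbl);		/* may use __raw_writeq */
}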

Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
---
 arch/powerpc/include/asm/machdep.h| 12 ++
 arch/powerpc/platforms/powernv/pci-ioda.c | 26 +++--
 arch/powerpc/platforms/powernv/pci.c  | 38 ++-
 arch/powerpc/platforms/powernv/pci.h  |  2 +-
 4 files changed, 64 insertions(+), 14 deletions(-)

diff --git a/arch/powerpc/include/asm/machdep.h b/arch/powerpc/include/asm/machdep.h
index 92386fc..0c19eef 100644
--- a/arch/powerpc/include/asm/machdep.h
+++ b/arch/powerpc/include/asm/machdep.h
@@ -75,6 +75,18 @@ struct machdep_calls {
long index);
	void		(*tce_flush)(struct iommu_table *tbl);
 
+   /* _rm versions are for real mode use only */
+   int (*tce_build_rm)(struct iommu_table *tbl,
+long index,
+long npages,
+unsigned long uaddr,
+enum dma_data_direction direction,
+struct dma_attrs *attrs);
+	void		(*tce_free_rm)(struct iommu_table *tbl,
+   long index,
+   long npages);
+	void		(*tce_flush_rm)(struct iommu_table *tbl);
+
void __iomem *  (*ioremap)(phys_addr_t addr, unsigned long size,
   unsigned long flags, void *caller);
	void		(*iounmap)(volatile void __iomem *token);
diff --git a/arch/powerpc/platforms/powernv/pci-ioda.c b/arch/powerpc/platforms/powernv/pci-ioda.c
index 2931d97..2797dec 100644
--- a/arch/powerpc/platforms/powernv/pci-ioda.c
+++ b/arch/powerpc/platforms/powernv/pci-ioda.c
@@ -68,6 +68,12 @@ define_pe_printk_level(pe_err, KERN_ERR);
 define_pe_printk_level(pe_warn, KERN_WARNING);
 define_pe_printk_level(pe_info, KERN_INFO);
 
+static inline void rm_writed(unsigned long paddr, u64 val)
+{
+	__asm__ __volatile__("sync; stdcix %0,0,%1"
+			: : "r" (val), "r" (paddr) : "memory");
+}
+
 static int pnv_ioda_alloc_pe(struct pnv_phb *phb)
 {
unsigned long pe;
@@ -442,7 +448,7 @@ static void pnv_pci_ioda_dma_dev_setup(struct pnv_phb *phb, struct pci_dev *pdev
 }
 
 static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
-u64 *startp, u64 *endp)
+u64 *startp, u64 *endp, bool rm)
 {
	u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
unsigned long start, end, inc;
@@ -471,7 +477,10 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 
 mb(); /* Ensure above stores are visible */
	while (start <= end) {
-__raw_writeq(start, invalidate);
+   if (rm)
+   rm_writed((unsigned long) invalidate, start);
+   else
+   __raw_writeq(start, invalidate);
 start += inc;
 }
 
@@ -483,7 +492,7 @@ static void pnv_pci_ioda1_tce_invalidate(struct iommu_table *tbl,
 
 static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
 struct iommu_table *tbl,
-u64 *startp, u64 *endp)
+u64 *startp, u64 *endp, bool rm)
 {
unsigned long start, end, inc;
	u64 __iomem *invalidate = (u64 __iomem *)tbl->it_index;
@@ -502,22 +511,25 @@ static void pnv_pci_ioda2_tce_invalidate(struct pnv_ioda_pe *pe,
mb();
 
	while (start <= end) {
-   __raw_writeq(start, invalidate);
+   if (rm)
+   rm_writed((unsigned long) invalidate, start);
+   else
+   __raw_writeq(start, invalidate);
start += inc;
}
 }
 
 void pnv_pci_ioda_tce_invalidate(struct iommu_table *tbl,
-u64 *startp, u64 *endp)
+u64 *startp, u64 *endp, bool rm)
 {
struct pnv_ioda_pe *pe = container_of(tbl, struct pnv_ioda_pe,
  tce32_table);
	struct pnv_phb *phb = pe->phb;
 
	if (phb->type == PNV_PHB_IODA1)
-   pnv_pci_ioda1_tce_invalidate(tbl, startp, endp);
+   pnv_pci_ioda1_tce_invalidate(tbl, startp, endp, rm);
else
-   pnv_pci_ioda2_tce_invalidate(pe, tbl, startp, endp);
+   pnv_pci_ioda2_tce_invalidate(pe, tbl,