On Wed, 2013-06-05 at 16:11 +1000, Alexey Kardashevskiy wrote:
> This adds real mode handlers for the H_PUT_TCE_INDIRECT and
> H_STUFF_TCE hypercalls for QEMU emulated devices such as IBMVIO
> devices or emulated PCI.  These calls allow adding multiple entries
> (up to 512) into the TCE table in one call which saves time on
> transition to/from real mode.
> 
> This adds a tce_tmp cache to kvm_vcpu_arch to save valid TCEs
> (copied from user and verified) before writing the whole list into
> the TCE table. This cache will be utilized more in the upcoming
> VFIO/IOMMU support to continue TCE list processing in the virtual
> mode in the case if the real mode handler failed for some reason.
> 
> This adds a guest physical to host real address converter
> and calls the existing H_PUT_TCE handler. The converting function
> is going to be fully utilized by upcoming VFIO supporting patches.
> 
> This also implements the KVM_CAP_PPC_MULTITCE capability,
> so in order to support the functionality of this patch, QEMU
> needs to query for this capability and set the "hcall-multi-tce"
> hypertas property only if the capability is present, otherwise
> there will be serious performance degradation.
> 
> Cc: David Gibson <da...@gibson.dropbear.id.au>
> Signed-off-by: Alexey Kardashevskiy <a...@ozlabs.ru>
> Signed-off-by: Paul Mackerras <pau...@samba.org>
> 
> ---
> Changelog:
> 2013/06/05:
> * fixed mistype about IBMVIO in the commit message
> * updated doc and moved it to another section
> * changed capability number
> 
> 2013/05/21:
> * added kvm_vcpu_arch::tce_tmp
> * removed cleanup if put_indirect failed, instead we do not even start
> writing to TCE table if we cannot get TCEs from the user and they are
> invalid
> * kvmppc_emulated_h_put_tce is split to kvmppc_emulated_put_tce
> and kvmppc_emulated_validate_tce (for the previous item)
> * fixed bug with failthrough for H_IPI
> * removed all get_user() from real mode handlers
> * kvmppc_lookup_pte() added (instead of making lookup_linux_pte public)
> ---
>  Documentation/virtual/kvm/api.txt       |   17 ++
>  arch/powerpc/include/asm/kvm_host.h     |    2 +
>  arch/powerpc/include/asm/kvm_ppc.h      |   16 +-
>  arch/powerpc/kvm/book3s_64_vio.c        |  118 ++++++++++++++
>  arch/powerpc/kvm/book3s_64_vio_hv.c     |  266 
> +++++++++++++++++++++++++++----
>  arch/powerpc/kvm/book3s_hv.c            |   39 +++++
>  arch/powerpc/kvm/book3s_hv_rmhandlers.S |    6 +
>  arch/powerpc/kvm/book3s_pr_papr.c       |   37 ++++-
>  arch/powerpc/kvm/powerpc.c              |    3 +
>  include/uapi/linux/kvm.h                |    1 +
>  10 files changed, 473 insertions(+), 32 deletions(-)
> 
> diff --git a/Documentation/virtual/kvm/api.txt 
> b/Documentation/virtual/kvm/api.txt
> index 5f91eda..6c082ff 100644
> --- a/Documentation/virtual/kvm/api.txt
> +++ b/Documentation/virtual/kvm/api.txt
> @@ -2362,6 +2362,23 @@ calls by the guest for that service will be passed to 
> userspace to be
>  handled.
>  
> 
> +4.83 KVM_CAP_PPC_MULTITCE
> +
> +Capability: KVM_CAP_PPC_MULTITCE
> +Architectures: ppc
> +Type: vm
> +
> +This capability tells the guest that multiple TCE entry add/remove hypercalls
> +handling is supported by the kernel. This significantly accelerates DMA
> +operations for PPC KVM guests.
> +
> +Unlike other capabilities in this section, this one does not have an ioctl.
> +Instead, when the capability is present, the H_PUT_TCE_INDIRECT and
> +H_STUFF_TCE hypercalls are to be handled in the host kernel and not passed to
> +the guest. Otherwise it might be better for the guest to continue using 
> H_PUT_TCE
> +hypercall (if KVM_CAP_SPAPR_TCE or KVM_CAP_SPAPR_TCE_IOMMU are present).
> +
> +
>  5. The kvm_run structure
>  ------------------------
>  
> diff --git a/arch/powerpc/include/asm/kvm_host.h 
> b/arch/powerpc/include/asm/kvm_host.h
> index af326cd..85d8f26 100644
> --- a/arch/powerpc/include/asm/kvm_host.h
> +++ b/arch/powerpc/include/asm/kvm_host.h
> @@ -609,6 +609,8 @@ struct kvm_vcpu_arch {
>       spinlock_t tbacct_lock;
>       u64 busy_stolen;
>       u64 busy_preempt;
> +
> +     unsigned long *tce_tmp;    /* TCE cache for TCE_PUT_INDIRECT hcall */
>  #endif
>  };
>  
> diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
> b/arch/powerpc/include/asm/kvm_ppc.h
> index a5287fe..e852921b 100644
> --- a/arch/powerpc/include/asm/kvm_ppc.h
> +++ b/arch/powerpc/include/asm/kvm_ppc.h
> @@ -133,8 +133,20 @@ extern int kvmppc_pseries_do_hcall(struct kvm_vcpu 
> *vcpu);
>  
>  extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
>                               struct kvm_create_spapr_tce *args);
> -extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
> -                          unsigned long ioba, unsigned long tce);
> +extern struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(
> +             struct kvm_vcpu *vcpu, unsigned long liobn);
> +extern long kvmppc_emulated_validate_tce(unsigned long tce);
> +extern void kvmppc_emulated_put_tce(struct kvmppc_spapr_tce_table *tt,
> +             unsigned long ioba, unsigned long tce);
> +extern long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
> +             unsigned long liobn, unsigned long ioba,
> +             unsigned long tce);
> +extern long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> +             unsigned long liobn, unsigned long ioba,
> +             unsigned long tce_list, unsigned long npages);
> +extern long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
> +             unsigned long liobn, unsigned long ioba,
> +             unsigned long tce_value, unsigned long npages);
>  extern long kvm_vm_ioctl_allocate_rma(struct kvm *kvm,
>                               struct kvm_allocate_rma *rma);
>  extern struct kvmppc_linear_info *kvm_alloc_rma(void);
> diff --git a/arch/powerpc/kvm/book3s_64_vio.c 
> b/arch/powerpc/kvm/book3s_64_vio.c
> index b2d3f3b..06b7b20 100644
> --- a/arch/powerpc/kvm/book3s_64_vio.c
> +++ b/arch/powerpc/kvm/book3s_64_vio.c
> @@ -14,6 +14,7 @@
>   *
>   * Copyright 2010 Paul Mackerras, IBM Corp. <pau...@au1.ibm.com>
>   * Copyright 2011 David Gibson, IBM Corporation <d...@au1.ibm.com>
> + * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <a...@au1.ibm.com>
>   */
>  
>  #include <linux/types.h>
> @@ -36,8 +37,11 @@
>  #include <asm/ppc-opcode.h>
>  #include <asm/kvm_host.h>
>  #include <asm/udbg.h>
> +#include <asm/iommu.h>
> +#include <asm/tce.h>
>  
>  #define TCES_PER_PAGE        (PAGE_SIZE / sizeof(u64))
> +#define ERROR_ADDR      ((void *)~(unsigned long)0x0)
>  
>  static long kvmppc_stt_npages(unsigned long window_size)
>  {
> @@ -148,3 +152,117 @@ fail:
>       }
>       return ret;
>  }
> +
> +/* Converts guest physical address into host virtual */
> +static void __user *kvmppc_virtmode_gpa_to_hva(struct kvm_vcpu *vcpu,
> +             unsigned long gpa)
> +{
> +     unsigned long hva, gfn = gpa >> PAGE_SHIFT;
> +     struct kvm_memory_slot *memslot;
> +
> +     memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
> +     if (!memslot)
> +             return ERROR_ADDR;
> +
> +     hva = __gfn_to_hva_memslot(memslot, gfn) + (gpa & ~PAGE_MASK);
> +     return (void *) hva;
> +}
> +
> +long kvmppc_virtmode_h_put_tce(struct kvm_vcpu *vcpu,
> +             unsigned long liobn, unsigned long ioba,
> +             unsigned long tce)
> +{
> +     long ret;
> +     struct kvmppc_spapr_tce_table *tt;
> +
> +     tt = kvmppc_find_tce_table(vcpu, liobn);
> +     /* Didn't find the liobn, put it to userspace */
> +     if (!tt)
> +             return H_TOO_HARD;
> +
> +     /* Emulated IO */
> +     if (ioba >= tt->window_size)
> +             return H_PARAMETER;
> +
> +     ret = kvmppc_emulated_validate_tce(tce);
> +     if (ret)
> +             return ret;
> +
> +     kvmppc_emulated_put_tce(tt, ioba, tce);
> +
> +     return H_SUCCESS;
> +}
> +
> +long kvmppc_virtmode_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> +             unsigned long liobn, unsigned long ioba,
> +             unsigned long tce_list, unsigned long npages)
> +{
> +     struct kvmppc_spapr_tce_table *tt;
> +     long i, ret;
> +     unsigned long __user *tces;
> +
> +     tt = kvmppc_find_tce_table(vcpu, liobn);
> +     /* Didn't find the liobn, put it to userspace */
> +     if (!tt)
> +             return H_TOO_HARD;
> +
> +     /*
> +      * The spec says that the maximum size of the list is 512 TCEs so
> +      * so the whole table addressed resides in 4K page
> +      */
> +     if (npages > 512)
> +             return H_PARAMETER;
> +
> +     if (tce_list & ~IOMMU_PAGE_MASK)
> +             return H_PARAMETER;
> +
> +     tces = kvmppc_virtmode_gpa_to_hva(vcpu, tce_list);
> +     if (tces == ERROR_ADDR)
> +             return H_TOO_HARD;
> +
> +     /* Emulated IO */
> +     if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
> +             return H_PARAMETER;
> +
> +     for (i = 0; i < npages; ++i) {
> +             if (get_user(vcpu->arch.tce_tmp[i], tces + i))
> +                     return H_PARAMETER;
> +
> +             ret = kvmppc_emulated_validate_tce(vcpu->arch.tce_tmp[i]);
> +             if (ret)
> +                     return ret;
> +     }
> +
> +     for (i = 0; i < npages; ++i)
> +             kvmppc_emulated_put_tce(tt,
> +                             ioba + (i << IOMMU_PAGE_SHIFT),
> +                             vcpu->arch.tce_tmp[i]);
> +
> +     return H_SUCCESS;
> +}
> +
> +long kvmppc_virtmode_h_stuff_tce(struct kvm_vcpu *vcpu,
> +             unsigned long liobn, unsigned long ioba,
> +             unsigned long tce_value, unsigned long npages)
> +{
> +     struct kvmppc_spapr_tce_table *tt;
> +     long i, ret;
> +
> +     tt = kvmppc_find_tce_table(vcpu, liobn);
> +     /* Didn't find the liobn, put it to userspace */
> +     if (!tt)
> +             return H_TOO_HARD;
> +
> +     /* Emulated IO */
> +     if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
> +             return H_PARAMETER;
> +
> +     ret = kvmppc_emulated_validate_tce(tce_value);
> +     if (ret || (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +             return H_PARAMETER;
> +
> +     for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE)
> +             kvmppc_emulated_put_tce(tt, ioba, tce_value);
> +
> +     return H_SUCCESS;
> +}
> diff --git a/arch/powerpc/kvm/book3s_64_vio_hv.c 
> b/arch/powerpc/kvm/book3s_64_vio_hv.c
> index 30c2f3b..c68d538 100644
> --- a/arch/powerpc/kvm/book3s_64_vio_hv.c
> +++ b/arch/powerpc/kvm/book3s_64_vio_hv.c
> @@ -14,6 +14,7 @@
>   *
>   * Copyright 2010 Paul Mackerras, IBM Corp. <pau...@au1.ibm.com>
>   * Copyright 2011 David Gibson, IBM Corporation <d...@au1.ibm.com>
> + * Copyright 2013 Alexey Kardashevskiy, IBM Corporation <a...@au1.ibm.com>
>   */
>  
>  #include <linux/types.h>
> @@ -35,42 +36,249 @@
>  #include <asm/ppc-opcode.h>
>  #include <asm/kvm_host.h>
>  #include <asm/udbg.h>
> +#include <asm/iommu.h>
> +#include <asm/tce.h>
>  
>  #define TCES_PER_PAGE        (PAGE_SIZE / sizeof(u64))
> +#define ERROR_ADDR      (~(unsigned long)0x0)
>  
> -/* WARNING: This will be called in real-mode on HV KVM and virtual
> - *          mode on PR KVM
> +/* Finds a TCE table descriptor by LIOBN */
> +struct kvmppc_spapr_tce_table *kvmppc_find_tce_table(struct kvm_vcpu *vcpu,
> +             unsigned long liobn)
> +{
> +     struct kvmppc_spapr_tce_table *tt;
> +
> +     list_for_each_entry(tt, &vcpu->kvm->arch.spapr_tce_tables, list) {
> +             if (tt->liobn == liobn)
> +                     return tt;
> +     }
> +
> +     return NULL;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_find_tce_table);
> +
> +/*
> + * Validate TCE address.
> + * At the moment only flags are validated
> + * as other check will significantly slow down
> + * or can make it even impossible to handle TCE requests
> + * in real mode.
> + */
> +long kvmppc_emulated_validate_tce(unsigned long tce)
> +{
> +     if (tce & ~(IOMMU_PAGE_MASK | TCE_PCI_WRITE | TCE_PCI_READ))
> +             return H_PARAMETER;
> +
> +     return H_SUCCESS;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_emulated_validate_tce);
> +
> +/*
> + * kvmppc_emulated_put_tce() handles TCE requests for devices emulated
> + * by QEMU. It puts guest TCE values into the table and expects
> + * the QEMU to convert them later in the QEMU device implementation.
> + * Works in both real and virtual modes.
> + * It cannot fail so kvmppc_emulated_validate_tce must be called before it.
>   */
> +void kvmppc_emulated_put_tce(struct kvmppc_spapr_tce_table *tt,
> +             unsigned long ioba, unsigned long tce)
> +{
> +     unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
> +     struct page *page;
> +     u64 *tbl;
> +
> +     /*
> +      * Note on the use of page_address() in real mode,
> +      *
> +      * It is safe to use page_address() in real mode on ppc64 because
> +      * page_address() is always defined as lowmem_page_address()
> +      * which returns __va(PFN_PHYS(page_to_pfn(page))) which is an arithmetic
> +      * operation and does not access page struct.
> +      *
> +      * Theoretically page_address() could be defined different
> +      * but either WANT_PAGE_VIRTUAL or HASHED_PAGE_VIRTUAL
> +      * should be enabled.
> +      * WANT_PAGE_VIRTUAL is never enabled on ppc32/ppc64,
> +      * HASHED_PAGE_VIRTUAL could be enabled for ppc32 only and only
> +      * if CONFIG_HIGHMEM is defined. As CONFIG_SPARSEMEM_VMEMMAP
> +      * is not expected to be enabled on ppc32, page_address()
> +      * is safe for ppc32 as well.
> +      */
> +#if defined(HASHED_PAGE_VIRTUAL) || defined(WANT_PAGE_VIRTUAL)
> +#error TODO: fix to avoid page_address() here
> +#endif
> +     page = tt->pages[idx / TCES_PER_PAGE];
> +     tbl = (u64 *)page_address(page);
> +
> +     /* udbg_printf("tce @ %p\n", &tbl[idx % TCES_PER_PAGE]); */
> +     tbl[idx % TCES_PER_PAGE] = tce;
> +}
> +EXPORT_SYMBOL_GPL(kvmppc_emulated_put_tce);
> +
> +#ifdef CONFIG_KVM_BOOK3S_64_HV
> +
> +static pte_t kvmppc_lookup_pte(pgd_t *pgdir, unsigned long hva, bool writing,
> +                     unsigned long *pte_sizep)
> +{
> +     pte_t *ptep;
> +     unsigned int shift = 0;
> +     pte_t pte, tmp, ret;
> +
> +     ptep = find_linux_pte_or_hugepte(pgdir, hva, &shift);
> +     if (!ptep)
> +             return __pte(0);
> +     if (shift)
> +             *pte_sizep = 1ul << shift;
> +     else
> +             *pte_sizep = PAGE_SIZE;
> +
> +     if (!pte_present(*ptep))
> +             return __pte(0);
> +
> +     /* wait until _PAGE_BUSY is clear then set it atomically */
> +     __asm__ __volatile__ (
> +             "1:     ldarx   %0,0,%3\n"
> +             "       andi.   %1,%0,%4\n"
> +             "       bne-    1b\n"
> +             "       ori     %1,%0,%4\n"
> +             "       stdcx.  %1,0,%3\n"
> +             "       bne-    1b"
> +             : "=&r" (pte), "=&r" (tmp), "=m" (*ptep)
> +             : "r" (ptep), "i" (_PAGE_BUSY)
> +             : "cc");
> +
> +     ret = pte;
> +
> +     return ret;
> +}

The pte_present() test needs to be repeated after you lock the PTE, since
you could have raced with an invalidation between the check and the lock.

More worrisome: you set _PAGE_BUSY above (locking the PTE), but when do you
clear it?

It looks like you rely on _PAGE_BUSY being set to protect yourself
against any concurrent invalidation (or other change to the PTE) while
you access the underlying page. This is *somewhat* ok, though frowned
upon since you end up locking the PTE a lot longer (in real mode) than
we normally do, but in any case, you need to ensure that you release
that lock.

Also, you must *not* continue using the resulting physical address after
releasing the lock, since the page might be invalidated, freed, swapped
out, etc. at any point once you clear _PAGE_BUSY.

It might be better to use the MMU notifiers here to catch concurrent
invalidations rather than locking the PTE for a long time. If a
concurrent invalidation happens, just return H_TOO_HARD.

> +
> +/*
> + * Converts guest physical address into host physical address.
> + * Also returns pte and page size if the page is present in page table.
> + */
> +static unsigned long kvmppc_realmode_gpa_to_hpa(struct kvm_vcpu *vcpu,
> +             unsigned long gpa)
> +{
> +     struct kvm_memory_slot *memslot;
> +     pte_t pte;
> +     unsigned long hva, hpa, pg_size = 0, offset;
> +     unsigned long gfn = gpa >> PAGE_SHIFT;
> +     bool writing = gpa & TCE_PCI_WRITE;
> +
> +     /* Find a KVM memslot */
> +     memslot = search_memslots(kvm_memslots(vcpu->kvm), gfn);
> +     if (!memslot)
> +             return ERROR_ADDR;
> +
> +     /* Convert guest physical address to host virtual */
> +     hva = __gfn_to_hva_memslot(memslot, gfn);
> +
> +     /* Find a PTE and determine the size */
> +     pte = kvmppc_lookup_pte(vcpu->arch.pgdir, hva,
> +                     writing, &pg_size);
> +     if (!pte)
> +             return ERROR_ADDR;
> +
> +     /* Calculate host phys address keeping flags and offset in the page */
> +     offset = gpa & (pg_size - 1);
> +
> +     /* pte_pfn(pte) should return an address aligned to pg_size */
> +     hpa = (pte_pfn(pte) << PAGE_SHIFT) + offset;
> +
> +     return hpa;
> +}

Do you ever test whether the page protection on the PTE allows the
access? At the moment you only use this function to read TCEs, so the
chances of it going wrong are slim (you wouldn't have PROT_NONE on TCE
tables), but it's still a worry to have code like that.

You also do not set _PAGE_ACCESSED, which means the VM doesn't know the
page is being accessed. Not necessarily a huge deal in this specific
case, but still.

>  long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
>                     unsigned long ioba, unsigned long tce)
>  {
> -     struct kvm *kvm = vcpu->kvm;
> -     struct kvmppc_spapr_tce_table *stt;
> -
> -     /* udbg_printf("H_PUT_TCE(): liobn=0x%lx ioba=0x%lx, tce=0x%lx\n", */
> -     /*          liobn, ioba, tce); */
> -
> -     list_for_each_entry(stt, &kvm->arch.spapr_tce_tables, list) {
> -             if (stt->liobn == liobn) {
> -                     unsigned long idx = ioba >> SPAPR_TCE_SHIFT;
> -                     struct page *page;
> -                     u64 *tbl;
> -
> -                     /* udbg_printf("H_PUT_TCE: liobn 0x%lx => stt=%p  
> window_size=0x%x\n", */
> -                     /*          liobn, stt, stt->window_size); */
> -                     if (ioba >= stt->window_size)
> -                             return H_PARAMETER;
> -
> -                     page = stt->pages[idx / TCES_PER_PAGE];
> -                     tbl = (u64 *)page_address(page);
> -
> -                     /* FIXME: Need to validate the TCE itself */
> -                     /* udbg_printf("tce @ %p\n", &tbl[idx % 
> TCES_PER_PAGE]); */
> -                     tbl[idx % TCES_PER_PAGE] = tce;
> -                     return H_SUCCESS;
> -             }
> +     long ret;
> +     struct kvmppc_spapr_tce_table *tt;
> +
> +     tt = kvmppc_find_tce_table(vcpu, liobn);
> +     /* Didn't find the liobn, put it to virtual space */
> +     if (!tt)
> +             return H_TOO_HARD;
> +
> +     /* Emulated IO */
> +     if (ioba >= tt->window_size)
> +             return H_PARAMETER;
> +
> +     ret = kvmppc_emulated_validate_tce(tce);
> +     if (ret)
> +             return ret;
> +
> +     kvmppc_emulated_put_tce(tt, ioba, tce);
> +
> +     return H_SUCCESS;
> +}
> +
> +long kvmppc_h_put_tce_indirect(struct kvm_vcpu *vcpu,
> +             unsigned long liobn, unsigned long ioba,
> +             unsigned long tce_list, unsigned long npages)
> +{
> +     struct kvmppc_spapr_tce_table *tt;
> +     long i, ret;
> +     unsigned long *tces;
> +
> +     tt = kvmppc_find_tce_table(vcpu, liobn);
> +     /* Didn't find the liobn, put it to virtual space */
> +     if (!tt)
> +             return H_TOO_HARD;
> +
> +     /*
> +      * The spec says that the maximum size of the list is 512 TCEs so
> +      * so the whole table addressed resides in 4K page
> +      */
> +     if (npages > 512)
> +             return H_PARAMETER;
> +
> +     if (tce_list & ~IOMMU_PAGE_MASK)
> +             return H_PARAMETER;
> +
> +     tces = (unsigned long *) kvmppc_realmode_gpa_to_hpa(vcpu, tce_list);
> +     if ((unsigned long)tces == ERROR_ADDR)
> +             return H_TOO_HARD;
> +
> +     /* Emulated IO */
> +     if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
> +             return H_PARAMETER;
> +
> +     for (i = 0; i < npages; ++i) {
> +             ret = kvmppc_emulated_validate_tce(tces[i]);
> +             if (ret)
> +                     return ret;
>       }
>  
> -     /* Didn't find the liobn, punt it to userspace */
> -     return H_TOO_HARD;
> +     for (i = 0; i < npages; ++i)
> +             kvmppc_emulated_put_tce(tt, ioba + (i << IOMMU_PAGE_SHIFT),
> +                             tces[i]);
> +
> +     return H_SUCCESS;
> +}
> +
> +long kvmppc_h_stuff_tce(struct kvm_vcpu *vcpu,
> +             unsigned long liobn, unsigned long ioba,
> +             unsigned long tce_value, unsigned long npages)
> +{
> +     struct kvmppc_spapr_tce_table *tt;
> +     long i, ret;
> +
> +     tt = kvmppc_find_tce_table(vcpu, liobn);
> +     /* Didn't find the liobn, put it to virtual space */
> +     if (!tt)
> +             return H_TOO_HARD;
> +
> +     /* Emulated IO */
> +     if ((ioba + (npages << IOMMU_PAGE_SHIFT)) > tt->window_size)
> +             return H_PARAMETER;
> +
> +     ret = kvmppc_emulated_validate_tce(tce_value);
> +     if (ret || (tce_value & (TCE_PCI_WRITE | TCE_PCI_READ)))
> +             return H_PARAMETER;
> +
> +     for (i = 0; i < npages; ++i, ioba += IOMMU_PAGE_SIZE)
> +             kvmppc_emulated_put_tce(tt, ioba, tce_value);
> +
> +     return H_SUCCESS;
>  }
> +#endif /* CONFIG_KVM_BOOK3S_64_HV */
> diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
> index 550f592..a39039a 100644
> --- a/arch/powerpc/kvm/book3s_hv.c
> +++ b/arch/powerpc/kvm/book3s_hv.c
> @@ -568,6 +568,30 @@ int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu)
>                       ret = kvmppc_xics_hcall(vcpu, req);
>                       break;
>               } /* fallthrough */
> +             return RESUME_HOST;
> +     case H_PUT_TCE:
> +             ret = kvmppc_virtmode_h_put_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
> +                                             kvmppc_get_gpr(vcpu, 5),
> +                                             kvmppc_get_gpr(vcpu, 6));
> +             if (ret == H_TOO_HARD)
> +                     return RESUME_HOST;
> +             break;
> +     case H_PUT_TCE_INDIRECT:
> +             ret = kvmppc_virtmode_h_put_tce_indirect(vcpu, 
> kvmppc_get_gpr(vcpu, 4),
> +                                             kvmppc_get_gpr(vcpu, 5),
> +                                             kvmppc_get_gpr(vcpu, 6),
> +                                             kvmppc_get_gpr(vcpu, 7));
> +             if (ret == H_TOO_HARD)
> +                     return RESUME_HOST;
> +             break;
> +     case H_STUFF_TCE:
> +             ret = kvmppc_virtmode_h_stuff_tce(vcpu, kvmppc_get_gpr(vcpu, 4),
> +                                             kvmppc_get_gpr(vcpu, 5),
> +                                             kvmppc_get_gpr(vcpu, 6),
> +                                             kvmppc_get_gpr(vcpu, 7));
> +             if (ret == H_TOO_HARD)
> +                     return RESUME_HOST;
> +             break;
>       default:
>               return RESUME_HOST;
>       }
> @@ -958,6 +982,20 @@ struct kvm_vcpu *kvmppc_core_vcpu_create(struct kvm 
> *kvm, unsigned int id)
>       vcpu->arch.cpu_type = KVM_CPU_3S_64;
>       kvmppc_sanity_check(vcpu);
>  
> +     /*
> +      * As we want to minimize the chance of having H_PUT_TCE_INDIRECT
> +      * half executed, we first read TCEs from the user, check them and
> +      * return error if something went wrong and only then put TCEs into
> +      * the TCE table.
> +      *
> +      * tce_tmp is a cache for TCEs to avoid stack allocation or
> +      * kmalloc as the whole TCE list can take up to 512 items 8 bytes
> +      * each (4096 bytes).
> +      */
> +     vcpu->arch.tce_tmp = kmalloc(4096, GFP_KERNEL);
> +     if (!vcpu->arch.tce_tmp)
> +             goto free_vcpu;
> +
>       return vcpu;
>  
>  free_vcpu:
> @@ -980,6 +1018,7 @@ void kvmppc_core_vcpu_free(struct kvm_vcpu *vcpu)
>       unpin_vpa(vcpu->kvm, &vcpu->arch.slb_shadow);
>       unpin_vpa(vcpu->kvm, &vcpu->arch.vpa);
>       spin_unlock(&vcpu->arch.vpa_update_lock);
> +     kfree(vcpu->arch.tce_tmp);
>       kvm_vcpu_uninit(vcpu);
>       kmem_cache_free(kvm_vcpu_cache, vcpu);
>  }
> diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S 
> b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> index b02f91e..d35554e 100644
> --- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> +++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
> @@ -1490,6 +1490,12 @@ hcall_real_table:
>       .long   0               /* 0x11c */
>       .long   0               /* 0x120 */
>       .long   .kvmppc_h_bulk_remove - hcall_real_table
> +     .long   0               /* 0x128 */
> +     .long   0               /* 0x12c */
> +     .long   0               /* 0x130 */
> +     .long   0               /* 0x134 */
> +     .long   .kvmppc_h_stuff_tce - hcall_real_table
> +     .long   .kvmppc_h_put_tce_indirect - hcall_real_table
>  hcall_real_table_end:
>  
>  ignore_hdec:
> diff --git a/arch/powerpc/kvm/book3s_pr_papr.c 
> b/arch/powerpc/kvm/book3s_pr_papr.c
> index da0e0bc..91d4b45 100644
> --- a/arch/powerpc/kvm/book3s_pr_papr.c
> +++ b/arch/powerpc/kvm/book3s_pr_papr.c
> @@ -220,7 +220,38 @@ static int kvmppc_h_pr_put_tce(struct kvm_vcpu *vcpu)
>       unsigned long tce = kvmppc_get_gpr(vcpu, 6);
>       long rc;
>  
> -     rc = kvmppc_h_put_tce(vcpu, liobn, ioba, tce);
> +     rc = kvmppc_virtmode_h_put_tce(vcpu, liobn, ioba, tce);
> +     if (rc == H_TOO_HARD)
> +             return EMULATE_FAIL;
> +     kvmppc_set_gpr(vcpu, 3, rc);
> +     return EMULATE_DONE;
> +}
> +
> +static int kvmppc_h_pr_put_tce_indirect(struct kvm_vcpu *vcpu)
> +{
> +     unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
> +     unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
> +     unsigned long tce = kvmppc_get_gpr(vcpu, 6);
> +     unsigned long npages = kvmppc_get_gpr(vcpu, 7);
> +     long rc;
> +
> +     rc = kvmppc_virtmode_h_put_tce_indirect(vcpu, liobn, ioba,
> +                     tce, npages);
> +     if (rc == H_TOO_HARD)
> +             return EMULATE_FAIL;
> +     kvmppc_set_gpr(vcpu, 3, rc);
> +     return EMULATE_DONE;
> +}
> +
> +static int kvmppc_h_pr_stuff_tce(struct kvm_vcpu *vcpu)
> +{
> +     unsigned long liobn = kvmppc_get_gpr(vcpu, 4);
> +     unsigned long ioba = kvmppc_get_gpr(vcpu, 5);
> +     unsigned long tce_value = kvmppc_get_gpr(vcpu, 6);
> +     unsigned long npages = kvmppc_get_gpr(vcpu, 7);
> +     long rc;
> +
> +     rc = kvmppc_virtmode_h_stuff_tce(vcpu, liobn, ioba, tce_value, npages);
>       if (rc == H_TOO_HARD)
>               return EMULATE_FAIL;
>       kvmppc_set_gpr(vcpu, 3, rc);
> @@ -247,6 +278,10 @@ int kvmppc_h_pr(struct kvm_vcpu *vcpu, unsigned long cmd)
>               return kvmppc_h_pr_bulk_remove(vcpu);
>       case H_PUT_TCE:
>               return kvmppc_h_pr_put_tce(vcpu);
> +     case H_PUT_TCE_INDIRECT:
> +             return kvmppc_h_pr_put_tce_indirect(vcpu);
> +     case H_STUFF_TCE:
> +             return kvmppc_h_pr_stuff_tce(vcpu);
>       case H_CEDE:
>               vcpu->arch.shared->msr |= MSR_EE;
>               kvm_vcpu_block(vcpu);
> diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
> index 6316ee3..8465c2a 100644
> --- a/arch/powerpc/kvm/powerpc.c
> +++ b/arch/powerpc/kvm/powerpc.c
> @@ -395,6 +395,9 @@ int kvm_dev_ioctl_check_extension(long ext)
>               r = 1;
>               break;
>  #endif
> +     case KVM_CAP_SPAPR_MULTITCE:
> +             r = 1;
> +             break;
>       default:
>               r = 0;
>               break;
> diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
> index a5c86fc..fc0d6b9 100644
> --- a/include/uapi/linux/kvm.h
> +++ b/include/uapi/linux/kvm.h
> @@ -666,6 +666,7 @@ struct kvm_ppc_smmu_info {
>  #define KVM_CAP_IRQ_MPIC 90
>  #define KVM_CAP_PPC_RTAS 91
>  #define KVM_CAP_IRQ_XICS 92
> +#define KVM_CAP_SPAPR_MULTITCE 93
>  
>  #ifdef KVM_CAP_IRQ_ROUTING
>  


--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html
Please read the FAQ at  http://www.tux.org/lkml/

Reply via email to