[PATCH 1/2] KVM: PPC: Book3S HV: Correct tlbie usage

2013-07-08 Thread Paul Mackerras
This corrects the usage of the tlbie (TLB invalidate entry) instruction
in HV KVM.  The tlbie instruction changed between PPC970 and POWER7.
On the PPC970, the bit to select large vs. small page is in the instruction,
not in the RB register value.  This changes the code to use the correct
form on PPC970.

On POWER7 we were calculating the AVAL (Abbreviated Virtual Address, Lower)
field of the RB value incorrectly for 64k pages.  This fixes it.
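For illustration, the before/after arithmetic on a sample va_low value
(a standalone sketch, not part of the patch):

	#include <stdio.h>

	int main(void)
	{
		unsigned long va_low = 0x12345;	/* arbitrary sample value */

		/* old expression: keeps va_low bits 7:1 in place */
		printf("old: 0x%lx\n", va_low & 0xfe);		/* 0x44 */
		/* new expression: moves va_low bits 3:0 up to bits 7:4 */
		printf("new: 0x%lx\n", (va_low << 4) & 0xf0);	/* 0x50 */
		return 0;
	}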

Since we now have several cases to handle for the tlbie instruction, this
factors out the code to do a sequence of tlbies into a new function,
do_tlbies(), and calls that from the various places where the code was
doing tlbie instructions inline.  It also makes kvmppc_h_bulk_remove()
use the same global_invalidates() function for determining whether to do
local or global TLB invalidations as is used in other places, for
consistency, and also to make sure that kvm->arch.need_tlb_flush gets
updated properly.

Signed-off-by: Paul Mackerras pau...@samba.org
Cc: sta...@vger.kernel.org
---
 arch/powerpc/include/asm/kvm_book3s_64.h |   2 +-
 arch/powerpc/kvm/book3s_hv_rm_mmu.c  | 139 ++-
 2 files changed, 82 insertions(+), 59 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_book3s_64.h b/arch/powerpc/include/asm/kvm_book3s_64.h
index 9c1ff33..dc6b84a 100644
--- a/arch/powerpc/include/asm/kvm_book3s_64.h
+++ b/arch/powerpc/include/asm/kvm_book3s_64.h
@@ -100,7 +100,7 @@ static inline unsigned long compute_tlbie_rb(unsigned long v, unsigned long r,
 			/* (masks depend on page size) */
 			rb |= 0x1000;		/* page encoding in LP field */
 			rb |= (va_low & 0x7f) << 16;	/* 7b of VA in AVA/LP field */
-			rb |= (va_low & 0xfe);	/* AVAL field (P7 doesn't seem to care) */
+			rb |= ((va_low << 4) & 0xf0);	/* AVAL field (P7 doesn't seem to care) */
 		}
 	} else {
 		/* 4kB page */
diff --git a/arch/powerpc/kvm/book3s_hv_rm_mmu.c b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
index 6dcbb49..105b00f 100644
--- a/arch/powerpc/kvm/book3s_hv_rm_mmu.c
+++ b/arch/powerpc/kvm/book3s_hv_rm_mmu.c
@@ -385,6 +385,80 @@ static inline int try_lock_tlbie(unsigned int *lock)
return old == 0;
 }
 
+/*
+ * tlbie/tlbiel is a bit different on the PPC970 compared to later
+ * processors such as POWER7; the large page bit is in the instruction
+ * not RB, and the top 16 bits and the bottom 12 bits of the VA
+ * in RB must be 0.
+ */
+static void do_tlbies_970(struct kvm *kvm, unsigned long *rbvalues,
+			  long npages, int global, bool need_sync)
+{
+	long i;
+
+	if (global) {
+		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i) {
+			unsigned long rb = rbvalues[i];
+
+			if (rb & 1)		/* large page */
+				asm volatile("tlbie %0,1" : :
+					     "r" (rb & 0x0000fffffffff000ul));
+			else
+				asm volatile("tlbie %0,0" : :
+					     "r" (rb & 0x0000fffffffff000ul));
+		}
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
+		kvm->arch.tlbie_lock = 0;
+	} else {
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i) {
+			unsigned long rb = rbvalues[i];
+
+			if (rb & 1)		/* large page */
+				asm volatile("tlbiel %0,1" : :
+					     "r" (rb & 0x0000fffffffff000ul));
+			else
+				asm volatile("tlbiel %0,0" : :
+					     "r" (rb & 0x0000fffffffff000ul));
+		}
+		asm volatile("ptesync" : : : "memory");
+	}
+}
+
+static void do_tlbies(struct kvm *kvm, unsigned long *rbvalues,
+		      long npages, int global, bool need_sync)
+{
+	long i;
+
+	if (cpu_has_feature(CPU_FTR_ARCH_201)) {
+		/* PPC970 tlbie instruction is a bit different */
+		do_tlbies_970(kvm, rbvalues, npages, global, need_sync);
+		return;
+	}
+	if (global) {
+		while (!try_lock_tlbie(&kvm->arch.tlbie_lock))
+			cpu_relax();
+		if (need_sync)
+			asm volatile("ptesync" : : : "memory");
+		for (i = 0; i < npages; ++i)
+			asm volatile(PPC_TLBIE(%1,%0) : :
+				     "r" (rbvalues[i]), "r" (kvm->arch.lpid));
+		asm volatile("eieio; tlbsync; ptesync" : : : "memory");
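The patch is truncated here in the archive; the missing hunks replace
the inline tlbie sequences at the call sites with calls to do_tlbies().
A representative conversion, sketched from the commit message above
rather than from the missing hunks themselves:

	/* invalidate one HPTE; global_invalidates() picks local (tlbiel)
	 * vs. global (tlbie) invalidation and updates need_tlb_flush */
	rb = compute_tlbie_rb(v, r, pte_index);
	do_tlbies(kvm, &rb, 1, global_invalidates(kvm, flags), true);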

[PATCH 2/2] KVM: PPC: Book3S HV: Allow negative offsets to real-mode hcall handlers

2013-07-08 Thread Paul Mackerras
The table of offsets to real-mode hcall handlers in book3s_hv_rmhandlers.S
can contain negative values, if some of the handlers end up before the
table in the vmlinux binary.  Thus we need to use a sign-extending load
to read the values in the table rather than a zero-extending load.
Without this, the host crashes when the guest does one of the hcalls
with negative offsets, due to jumping to a bogus address.
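For illustration, a standalone sketch (plain C, not kernel code; the
offset value is an invented example) of what the two loads yield for a
negative 32-bit table entry:

	#include <stdint.h>
	#include <stdio.h>

	int main(void)
	{
		int32_t entry = -0x1a40;    /* handler placed before the table */

		/* lwzx-style zero extension gives 0x00000000ffffe5c0, so the
		 * computed branch target is table base + ~4GB: bogus */
		printf("zero-extended: 0x%llx\n",
		       (unsigned long long)(uint32_t)entry);
		/* lwax-style sign extension gives 0xffffffffffffe5c0, i.e.
		 * table base - 0x1a40 after the add: correct */
		printf("sign-extended: 0x%llx\n",
		       (unsigned long long)(int64_t)entry);
		return 0;
	}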

Signed-off-by: Paul Mackerras pau...@samba.org
Cc: sta...@vger.kernel.org
---
 arch/powerpc/kvm/book3s_hv_rmhandlers.S | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/arch/powerpc/kvm/book3s_hv_rmhandlers.S b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
index b02f91e..60dce5b 100644
--- a/arch/powerpc/kvm/book3s_hv_rmhandlers.S
+++ b/arch/powerpc/kvm/book3s_hv_rmhandlers.S
@@ -1381,7 +1381,7 @@ hcall_try_real_mode:
 	cmpldi	r3,hcall_real_table_end - hcall_real_table
 	bge	guest_exit_cont
 	LOAD_REG_ADDR(r4, hcall_real_table)
-	lwzx	r3,r3,r4
+	lwax	r3,r3,r4
 	cmpwi	r3,0
 	beq	guest_exit_cont
 	add	r3,r3,r4
-- 
1.8.3.1



Re: [PATCH 2/2] KVM: PPC: Book3E: Get vcpu's last instruction for emulation

2013-07-08 Thread Alexander Graf

On 28.06.2013, at 11:20, Mihai Caraman wrote:

 lwepx faults need to be handled by KVM, and this implies additional code
 in the DO_KVM macro to identify whether the exception originated from
 host context. This requires checking the Exception Syndrome Register
 (ESR[EPID]) and the External PID Load Context Register (EPLC[EGS]) for
 DTB_MISS, DSI and LRAT exceptions, which is too intrusive for the host.
 
 Get rid of lwepx and acquire the last instruction in kvmppc_handle_exit()
 by searching for the physical address and kmapping it. This fixes an infinite loop

What's the difference in speed for this?

Also, could we call lwepx later in host code, when kvmppc_get_last_inst() gets 
invoked?

 caused by lwepx's data TLB miss handled in the host and the TODO for TLB
 eviction and execute-but-not-read entries.
 
 Signed-off-by: Mihai Caraman mihai.cara...@freescale.com
 ---
 Resend this patch for Alex G.; he was unsubscribed from the kvm-ppc
 mailing list for a while.
 
 arch/powerpc/include/asm/mmu-book3e.h |6 ++-
 arch/powerpc/kvm/booke.c  |6 +++
 arch/powerpc/kvm/booke.h  |2 +
 arch/powerpc/kvm/bookehv_interrupts.S |   32 ++-
 arch/powerpc/kvm/e500.c   |4 ++
 arch/powerpc/kvm/e500mc.c |   69 +
 6 files changed, 91 insertions(+), 28 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/mmu-book3e.h b/arch/powerpc/include/asm/mmu-book3e.h
 index 99d43e0..32e470e 100644
 --- a/arch/powerpc/include/asm/mmu-book3e.h
 +++ b/arch/powerpc/include/asm/mmu-book3e.h
 @@ -40,7 +40,10 @@
 
 /* MAS registers bit definitions */
 
 -#define MAS0_TLBSEL(x)		(((x) << 28) & 0x30000000)
 +#define MAS0_TLBSEL_MASK	0x30000000
 +#define MAS0_TLBSEL_SHIFT	28
 +#define MAS0_TLBSEL(x)		(((x) << MAS0_TLBSEL_SHIFT) & MAS0_TLBSEL_MASK)
 +#define MAS0_GET_TLBSEL(mas0)	(((mas0) & MAS0_TLBSEL_MASK) >> MAS0_TLBSEL_SHIFT)
 #define MAS0_ESEL_MASK		0x0FFF0000
 #define MAS0_ESEL_SHIFT	16
 #define MAS0_ESEL(x)		(((x) << MAS0_ESEL_SHIFT) & MAS0_ESEL_MASK)
 @@ -58,6 +61,7 @@
 #define MAS1_TSIZE_MASK	0x00000f80
 #define MAS1_TSIZE_SHIFT	7
 #define MAS1_TSIZE(x)		(((x) << MAS1_TSIZE_SHIFT) & MAS1_TSIZE_MASK)
 +#define MAS1_GET_TSIZE(mas1)	(((mas1) & MAS1_TSIZE_MASK) >> MAS1_TSIZE_SHIFT)
 
 #define MAS2_EPN		(~0xFFFUL)
 #define MAS2_X0		0x00000040
 diff --git a/arch/powerpc/kvm/booke.c b/arch/powerpc/kvm/booke.c
 index 1020119..6764a8e 100644
 --- a/arch/powerpc/kvm/booke.c
 +++ b/arch/powerpc/kvm/booke.c
 @@ -836,6 +836,12 @@ int kvmppc_handle_exit(struct kvm_run *run, struct kvm_vcpu *vcpu,
   /* update before a new last_exit_type is rewritten */
   kvmppc_update_timing_stats(vcpu);
 
 + /*
 +  * The exception type can change at this point, such as if the TLB entry
 +  * for the emulated instruction has been evicted.
 +  */
 +	kvmppc_prepare_for_emulation(vcpu, &exit_nr);

Please model this the same way as book3s. Check out kvmppc_get_last_inst() as a 
starting point.
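For reference, the book3s scheme is a lazy fetch at
kvmppc_get_last_inst() time; the shape of it is roughly this (a sketch
of the pattern, not the exact book3s source):

	static inline u32 kvmppc_get_last_inst(struct kvm_vcpu *vcpu)
	{
		ulong pc = kvmppc_get_pc(vcpu);

		/* load the instruction only if the exit path did not */
		if (vcpu->arch.last_inst == KVM_INST_FETCH_FAILED)
			kvmppc_ld(vcpu, &pc, sizeof(u32),
				  &vcpu->arch.last_inst, false);

		return vcpu->arch.last_inst;
	}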

 +
   /* restart interrupts if they were meant for the host */
   kvmppc_restart_interrupt(vcpu, exit_nr);
 
 diff --git a/arch/powerpc/kvm/booke.h b/arch/powerpc/kvm/booke.h
 index 5fd1ba6..a0d0fea 100644
 --- a/arch/powerpc/kvm/booke.h
 +++ b/arch/powerpc/kvm/booke.h
 @@ -90,6 +90,8 @@ void kvmppc_vcpu_disable_spe(struct kvm_vcpu *vcpu);
 void kvmppc_booke_vcpu_load(struct kvm_vcpu *vcpu, int cpu);
 void kvmppc_booke_vcpu_put(struct kvm_vcpu *vcpu);
 
 void kvmppc_prepare_for_emulation(struct kvm_vcpu *vcpu, unsigned int *exit_nr);
 +
 enum int_class {
   INT_CLASS_NONCRIT,
   INT_CLASS_CRIT,
 diff --git a/arch/powerpc/kvm/bookehv_interrupts.S b/arch/powerpc/kvm/bookehv_interrupts.S
 index 20c7a54..0538ab9 100644
 --- a/arch/powerpc/kvm/bookehv_interrupts.S
 +++ b/arch/powerpc/kvm/bookehv_interrupts.S
 @@ -120,37 +120,20 @@
 
 	.if \flags & NEED_EMU
   /*
 -  * This assumes you have external PID support.
 -  * To support a bookehv CPU without external PID, you'll
 -  * need to look up the TLB entry and create a temporary mapping.
 -  *
 -  * FIXME: we don't currently handle if the lwepx faults.  PR-mode
 -  * booke doesn't handle it either.  Since Linux doesn't use
 -  * broadcast tlbivax anymore, the only way this should happen is
 -  * if the guest maps its memory execute-but-not-read, or if we
 -  * somehow take a TLB miss in the middle of this entry code and
 -  * evict the relevant entry.  On e500mc, all kernel lowmem is
 -  * bolted into TLB1 large page mappings, and we don't use
 -  * broadcast invalidates, so we should not take a TLB miss here.
 -  *
 -  * Later we'll need to deal with faults here.  Disallowing guest
 -  * mappings that are execute-but-not-read could be an option on
 -  * e500mc, but not on chips with an LRAT if it is used.
 

Re: [PATCH -V3 1/4] mm/cma: Move dma contiguous changes into a separate config

2013-07-08 Thread Alexander Graf

On 02.07.2013, at 07:45, Aneesh Kumar K.V wrote:

 From: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com
 
 We want to use CMA for allocating hash page table and real mode area for
 PPC64. Hence move DMA contiguous related changes into a separate config
 so that ppc64 can enable CMA without requiring DMA contiguous.
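 The shape of the split, sketched (a sketch only; assuming the new
 user-visible option is named DMA_CMA, with CMA as the core allocator):
 
 	config CMA
 		bool "Contiguous Memory Allocator"
 		# core allocator, no longer tied to DMA mapping support
 
 	config DMA_CMA
 		bool "DMA Contiguous Memory Allocator"
 		depends on HAVE_DMA_CONTIGUOUS && CMA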
 
 Acked-by: Michal Nazarewicz min...@mina86.com
 Acked-by: Paul Mackerras pau...@samba.org
 Signed-off-by: Aneesh Kumar K.V aneesh.ku...@linux.vnet.ibm.com

Thanks, applied all to kvm-ppc-queue. Please provide a cover letter next time 
:).


Alex



Re: [PATCH 1/2] KVM: PPC: Fix kvm_exit_names array

2013-07-08 Thread Alexander Graf

On 03.07.2013, at 15:30, Mihai Caraman wrote:

 Some exit ids were left out from the kvm_exit_names array.
 
 Signed-off-by: Mihai Caraman mihai.cara...@freescale.com
 ---
 arch/powerpc/kvm/timing.c |4 +++-
 1 files changed, 3 insertions(+), 1 deletions(-)
 
 diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
 index 07b6110..c392d26 100644
 --- a/arch/powerpc/kvm/timing.c
 +++ b/arch/powerpc/kvm/timing.c
 @@ -135,7 +135,9 @@ static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = {
 	[USR_PR_INST] =	"USR_PR_INST",
 	[FP_UNAVAIL] =	"FP_UNAVAIL",
 	[DEBUG_EXITS] =	"DEBUG",
 -	[TIMEINGUEST] =	"TIMEINGUEST"
 +	[TIMEINGUEST] =	"TIMEINGUEST",
 +	[DBELL_EXITS] =	"DBELL",
 +	[GDBELL_EXITS] =	"GDBELL"

Please add a comma at the end here, so that we don't have to touch this
line again when the next entry is added.


Alex



Re: [PATCH 2/2] KVM: PPC: Book3E: Emulate MCSRR0/1 SPR and rfmci instruction

2013-07-08 Thread Alexander Graf

On 03.07.2013, at 15:30, Mihai Caraman wrote:

 Some guests are making use of the return from machine check instruction
 to do crazy things even though the 64-bit kernel doesn't yet handle
 this interrupt. Emulate the MCSRR0/1 SPRs and the rfmci instruction accordingly.
 
 Signed-off-by: Mihai Caraman mihai.cara...@freescale.com
 ---
 arch/powerpc/include/asm/kvm_host.h |1 +
 arch/powerpc/kvm/booke_emulate.c|   25 +
 arch/powerpc/kvm/timing.c   |1 +
 3 files changed, 27 insertions(+), 0 deletions(-)
 
 diff --git a/arch/powerpc/include/asm/kvm_host.h b/arch/powerpc/include/asm/kvm_host.h
 index af326cd..0466789 100644
 --- a/arch/powerpc/include/asm/kvm_host.h
 +++ b/arch/powerpc/include/asm/kvm_host.h
 @@ -148,6 +148,7 @@ enum kvm_exit_types {
   EMULATED_TLBWE_EXITS,
   EMULATED_RFI_EXITS,
   EMULATED_RFCI_EXITS,
 + EMULATED_RFMCI_EXITS,

I would quite frankly prefer to see us abandon the whole exit timing framework 
in the kernel and instead use trace points. Then we don't have to maintain all 
of this randomly exercised code.

FWIW I think in this case however, treating RFMCI the same as RFI or random 
instruction emulation shouldn't hurt. This whole table is only about timing 
measurements. If you want to know for real what's going on, use trace points.

Otherwise looks good.


Alex

   DEC_EXITS,
   EXT_INTR_EXITS,
   HALT_WAKEUP,
 diff --git a/arch/powerpc/kvm/booke_emulate.c b/arch/powerpc/kvm/booke_emulate.c
 index 27a4b28..aaff1b7 100644
 --- a/arch/powerpc/kvm/booke_emulate.c
 +++ b/arch/powerpc/kvm/booke_emulate.c
 @@ -23,6 +23,7 @@
 
 #include booke.h
 
 +#define OP_19_XOP_RFMCI   38
 #define OP_19_XOP_RFI 50
 #define OP_19_XOP_RFCI51
 
 @@ -43,6 +44,12 @@ static void kvmppc_emul_rfci(struct kvm_vcpu *vcpu)
 	kvmppc_set_msr(vcpu, vcpu->arch.csrr1);
 }
 
 +static void kvmppc_emul_rfmci(struct kvm_vcpu *vcpu)
 +{
 +	vcpu->arch.pc = vcpu->arch.mcsrr0;
 +	kvmppc_set_msr(vcpu, vcpu->arch.mcsrr1);
 +}
 +
 int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
 unsigned int inst, int *advance)
 {
 @@ -65,6 +72,12 @@ int kvmppc_booke_emulate_op(struct kvm_run *run, struct kvm_vcpu *vcpu,
   *advance = 0;
   break;
 
 + case OP_19_XOP_RFMCI:
 + kvmppc_emul_rfmci(vcpu);
 + kvmppc_set_exit_type(vcpu, EMULATED_RFMCI_EXITS);
 + *advance = 0;
 + break;
 +
   default:
   emulated = EMULATE_FAIL;
   break;
 @@ -138,6 +151,12 @@ int kvmppc_booke_emulate_mtspr(struct kvm_vcpu *vcpu, int sprn, ulong spr_val)
   case SPRN_DBCR1:
 	vcpu->arch.dbg_reg.dbcr1 = spr_val;
   break;
 +	case SPRN_MCSRR0:
 +		vcpu->arch.mcsrr0 = spr_val;
 +		break;
 +	case SPRN_MCSRR1:
 +		vcpu->arch.mcsrr1 = spr_val;
 +		break;
   case SPRN_DBSR:
 	vcpu->arch.dbsr &= ~spr_val;
   break;
 @@ -284,6 +303,12 @@ int kvmppc_booke_emulate_mfspr(struct kvm_vcpu *vcpu, int sprn, ulong *spr_val)
   case SPRN_DBCR1:
 	*spr_val = vcpu->arch.dbg_reg.dbcr1;
   break;
 +	case SPRN_MCSRR0:
 +		*spr_val = vcpu->arch.mcsrr0;
 +		break;
 +	case SPRN_MCSRR1:
 +		*spr_val = vcpu->arch.mcsrr1;
 +		break;
   case SPRN_DBSR:
 	*spr_val = vcpu->arch.dbsr;
   break;
 diff --git a/arch/powerpc/kvm/timing.c b/arch/powerpc/kvm/timing.c
 index c392d26..670f63d 100644
 --- a/arch/powerpc/kvm/timing.c
 +++ b/arch/powerpc/kvm/timing.c
 @@ -129,6 +129,7 @@ static const char *kvm_exit_names[__NUMBER_OF_KVM_EXIT_TYPES] = {
 	[EMULATED_TLBSX_EXITS] =	"EMUL_TLBSX",
 	[EMULATED_TLBWE_EXITS] =	"EMUL_TLBWE",
 	[EMULATED_RFI_EXITS] =		"EMUL_RFI",
 +	[EMULATED_RFMCI_EXITS] =	"EMUL_RFMCI",
 	[DEC_EXITS] =			"DEC",
 	[EXT_INTR_EXITS] =		"EXTINT",
 	[HALT_WAKEUP] =			"HALT",
 -- 
 1.7.3.4
 
 


Re: [PATCH 3/8] vfio: add external user support

2013-07-08 Thread Alex Williamson
On Sun, 2013-07-07 at 01:07 +1000, Alexey Kardashevskiy wrote:
 VFIO is designed to be used via ioctls on file descriptors
 returned by VFIO.
 
 However in some situations support for an external user is required.
 The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
 use the existing VFIO groups for exclusive access in real/virtual mode
 on a host to avoid passing map/unmap requests to user space, which
 would make things pretty slow.
 
 The proposed protocol includes:
 
 1. do normal VFIO init stuff such as opening a new container, attaching
 group(s) to it, setting an IOMMU driver for a container. When IOMMU is
 set for a container, all groups in it are considered ready to use by
 an external user.
 
 2. pass a fd of the group we want to accelerate to KVM. KVM calls
 vfio_group_get_external_user() to verify that the group is initialized
 and IOMMU is set for it, and to increment the container user counter to
 prevent the VFIO group from disposal prior to KVM exit.
 The current TCE IOMMU driver marks the whole IOMMU table as busy when
 IOMMU is set for a container, which prevents other DMA users from
 allocating from it, so it is safe to grant user space access to it.
 
 3. KVM calls vfio_external_user_iommu_id() to obtain an IOMMU ID which
 KVM uses to get an iommu_group struct for later use.
 
 4. When KVM is finished, it calls vfio_group_put_external_user() to
 release the VFIO group by decrementing the container user counter.
 Everything gets released.
 
 The "vfio: Limit group opens" patch is also required for consistency.
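 For illustration, a hypothetical KVM-side caller would drive the four
 steps roughly as follows (a sketch only, error handling elided;
 group_fd is the fd received from user space):
 
 	struct file *filep = fget(group_fd);
 	struct vfio_group *group = vfio_group_get_external_user(filep); /* 2 */
 
 	if (group) {
 		int iommu_id = vfio_external_user_iommu_id(group);	/* 3 */
 		/* ... look up the iommu_group and use it ... */
 		vfio_group_put_external_user(group);			/* 4 */
 	}
 	fput(filep);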
 
 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
 diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
 index c488da5..57aa191 100644
 --- a/drivers/vfio/vfio.c
 +++ b/drivers/vfio/vfio.c
 @@ -1370,6 +1370,62 @@ static const struct file_operations vfio_device_fops = {
  };
  
  /**
 + * External user API, exported by symbols to be linked dynamically.
 + *
 + * The protocol includes:
 + *  1. do normal VFIO init operation:
 + *   - opening a new container;
 + *   - attaching group(s) to it;
 + *   - setting an IOMMU driver for a container.
 + * When IOMMU is set for a container, all groups in it are
 + * considered ready to use by an external user.
 + *
 + * 2. User space passes a group fd which we want to accelerate in
 + * KVM. KVM uses vfio_group_get_external_user() to verify that:
 + *   - the group is initialized;
 + *   - IOMMU is set for it.
 + * Then vfio_group_get_external_user() increments the container user
 + * counter to prevent the VFIO group from disposal prior to KVM exit.
 + *
 + * 3. KVM calls vfio_external_user_iommu_id() to know an IOMMU ID which
 + * KVM uses to get an iommu_group struct for later use.
 + *
 + * 4. When KVM is finished, it calls vfio_group_put_external_user() to
 + * release the VFIO group by decrementing the container user counter.

nit, the interface is for any external user, not just kvm.

 + */
 +struct vfio_group *vfio_group_get_external_user(struct file *filep)
 +{
 +	struct vfio_group *group = filep->private_data;
 +
 +	if (filep->f_op != &vfio_group_fops)
 +		return NULL;

ERR_PTR(-EINVAL)

There also needs to be a vfio_group_get(group) here and put in error
cases.

 +
 +	if (!atomic_inc_not_zero(&group->container_users))
 +		return NULL;

ERR_PTR(-EINVAL)

 +
 +	if (!group->container->iommu_driver ||
 +	    !vfio_group_viable(group)) {
 +		atomic_dec(&group->container_users);
 +		return NULL;

ERR_PTR(-EINVAL)

 + }
 +
 + return group;
 +}
 +EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
 +
 +void vfio_group_put_external_user(struct vfio_group *group)
 +{
 + vfio_group_try_dissolve_container(group);

And a vfio_group_put(group) here

 +}
 +EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
 +
 +int vfio_external_user_iommu_id(struct vfio_group *group)
 +{
 +	return iommu_group_id(group->iommu_group);
 +}
 +EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
 +
 +/**
   * Module/class support
   */
  static char *vfio_devnode(struct device *dev, umode_t *mode)
 diff --git a/include/linux/vfio.h b/include/linux/vfio.h
 index ac8d488..24579a0 100644
 --- a/include/linux/vfio.h
 +++ b/include/linux/vfio.h
 @@ -90,4 +90,11 @@ extern void vfio_unregister_iommu_driver(
   TYPE tmp;   \
   offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \
  
 +/*
 + * External user API
 + */
 +extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
 +extern void vfio_group_put_external_user(struct vfio_group *group);
 +extern int vfio_external_user_iommu_id(struct vfio_group *group);
 +
  #endif /* VFIO_H */
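Folding the three comments above together (ERR_PTR returns plus taking
and dropping a group reference), the external user entry points would
look roughly like this (a sketch, not the final patch):

	struct vfio_group *vfio_group_get_external_user(struct file *filep)
	{
		struct vfio_group *group = filep->private_data;

		if (filep->f_op != &vfio_group_fops)
			return ERR_PTR(-EINVAL);

		if (!atomic_inc_not_zero(&group->container_users))
			return ERR_PTR(-EINVAL);

		if (!group->container->iommu_driver ||
		    !vfio_group_viable(group)) {
			atomic_dec(&group->container_users);
			return ERR_PTR(-EINVAL);
		}

		/* hold a group reference on behalf of the external user */
		vfio_group_get(group);

		return group;
	}

	void vfio_group_put_external_user(struct vfio_group *group)
	{
		vfio_group_try_dissolve_container(group);
		vfio_group_put(group);
	}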





Re: [PATCH 3/8] vfio: add external user support

2013-07-08 Thread Alexey Kardashevskiy
On 07/09/2013 07:52 AM, Alex Williamson wrote:
 On Sun, 2013-07-07 at 01:07 +1000, Alexey Kardashevskiy wrote:
 VFIO is designed to be used via ioctls on file descriptors
 returned by VFIO.

 However in some situations support for an external user is required.
 The first user is KVM on PPC64 (SPAPR TCE protocol) which is going to
 use the existing VFIO groups for exclusive access in real/virtual mode
 on a host to avoid passing map/unmap requests to user space, which
 would make things pretty slow.

 The proposed protocol includes:

 1. do normal VFIO init stuff such as opening a new container, attaching
 group(s) to it, setting an IOMMU driver for a container. When IOMMU is
 set for a container, all groups in it are considered ready to use by
 an external user.

 2. pass a fd of the group we want to accelerate to KVM. KVM calls
 vfio_group_get_external_user() to verify that the group is initialized
 and IOMMU is set for it, and to increment the container user counter to
 prevent the VFIO group from disposal prior to KVM exit.
 The current TCE IOMMU driver marks the whole IOMMU table as busy when
 IOMMU is set for a container, which prevents other DMA users from
 allocating from it, so it is safe to grant user space access to it.

 3. KVM calls vfio_external_user_iommu_id() to obtain an IOMMU ID which
 KVM uses to get an iommu_group struct for later use.

 4. When KVM is finished, it calls vfio_group_put_external_user() to
 release the VFIO group by decrementing the container user counter.
 Everything gets released.

 The "vfio: Limit group opens" patch is also required for consistency.

 Signed-off-by: Alexey Kardashevskiy a...@ozlabs.ru
 ---
 diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
 index c488da5..57aa191 100644
 --- a/drivers/vfio/vfio.c
 +++ b/drivers/vfio/vfio.c
 @@ -1370,6 +1370,62 @@ static const struct file_operations vfio_device_fops = {
  };
  
  /**
 + * External user API, exported by symbols to be linked dynamically.
 + *
 + * The protocol includes:
 + *  1. do normal VFIO init operation:
 + *  - opening a new container;
 + *  - attaching group(s) to it;
 + *  - setting an IOMMU driver for a container.
 + * When IOMMU is set for a container, all groups in it are
 + * considered ready to use by an external user.
 + *
 + * 2. User space passes a group fd which we want to accelerate in
 + * KVM. KVM uses vfio_group_get_external_user() to verify that:
 + *  - the group is initialized;
 + *  - IOMMU is set for it.
 + * Then vfio_group_get_external_user() increments the container user
 + * counter to prevent the VFIO group from disposal prior to KVM exit.
 + *
 + * 3. KVM calls vfio_external_user_iommu_id() to know an IOMMU ID which
 + * KVM uses to get an iommu_group struct for later use.
 + *
 + * 4. When KVM is finished, it calls vfio_group_put_external_user() to
 + * release the VFIO group by decrementing the container user counter.
 
 nit, the interface is for any external user, not just kvm.

s/KVM/An external user/ ?
Or add a note that the description below uses KVM just as an example of an
external user?


 + */
 +struct vfio_group *vfio_group_get_external_user(struct file *filep)
 +{
 +	struct vfio_group *group = filep->private_data;
 +
 +	if (filep->f_op != &vfio_group_fops)
 +		return NULL;
 
 ERR_PTR(-EINVAL)
 
 There also needs to be a vfio_group_get(group) here and put in error
 cases.


Is that because I do not hold a reference to the file anymore?


 +
 +	if (!atomic_inc_not_zero(&group->container_users))
 +		return NULL;
 
 ERR_PTR(-EINVAL)
 
 +
 +	if (!group->container->iommu_driver ||
 +	    !vfio_group_viable(group)) {
 +		atomic_dec(&group->container_users);
 +		return NULL;
 
 ERR_PTR(-EINVAL)
 
 +}
 +
 +return group;
 +}
 +EXPORT_SYMBOL_GPL(vfio_group_get_external_user);
 +
 +void vfio_group_put_external_user(struct vfio_group *group)
 +{
 +vfio_group_try_dissolve_container(group);
 
 And a vfio_group_put(group) here
 
 +}
 +EXPORT_SYMBOL_GPL(vfio_group_put_external_user);
 +
 +int vfio_external_user_iommu_id(struct vfio_group *group)
 +{
 +	return iommu_group_id(group->iommu_group);
 +}
 +EXPORT_SYMBOL_GPL(vfio_external_user_iommu_id);
 +
 +/**
   * Module/class support
   */
  static char *vfio_devnode(struct device *dev, umode_t *mode)
 diff --git a/include/linux/vfio.h b/include/linux/vfio.h
 index ac8d488..24579a0 100644
 --- a/include/linux/vfio.h
 +++ b/include/linux/vfio.h
 @@ -90,4 +90,11 @@ extern void vfio_unregister_iommu_driver(
  TYPE tmp;   \
  offsetof(TYPE, MEMBER) + sizeof(tmp.MEMBER); }) \
  
 +/*
 + * External user API
 + */
 +extern struct vfio_group *vfio_group_get_external_user(struct file *filep);
 +extern void vfio_group_put_external_user(struct vfio_group *group);
 +extern int vfio_external_user_iommu_id(struct vfio_group *group);
 +
  #endif /* VFIO_H */
 
 
 


-- 
Alexey