[PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration

2014-06-05 Thread Alexey Kardashevskiy
This reserves 2 capability numbers.

This implements KVM_CREATE_SPAPR_TCE_64, an extended version of the KVM_CREATE_SPAPR_TCE ioctl.

Please advise how to proceed with these patches as I suspect that
first two should go via Paolo's tree while the last one via Alex Graf's tree
(correct?).

Thanks!

Alexey Kardashevskiy (3):
  PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number
  PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_64 capability number
  PPC: KVM: Add support for 64bit TCE windows

 Documentation/virtual/kvm/api.txt   | 46 +
 arch/powerpc/include/asm/kvm_host.h |  4 +++-
 arch/powerpc/include/asm/kvm_ppc.h  |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h |  9 
 arch/powerpc/kvm/book3s_64_vio.c|  4 +++-
 arch/powerpc/kvm/powerpc.c  | 24 ++-
 include/uapi/linux/kvm.h|  4 
 7 files changed, 89 insertions(+), 4 deletions(-)

-- 
2.0.0



[PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexey Kardashevskiy
The existing KVM_CREATE_SPAPR_TCE ioctl only supports 32-bit windows, which is not
enough for directly mapped windows, as the guest can have more than 4GB of memory.

This adds KVM_CREATE_SPAPR_TCE_64 ioctl and advertises it
via KVM_CAP_SPAPR_TCE_64 capability.

Since 64-bit windows are meant to support Dynamic DMA Windows (DDW), let's add
@bus_offset and @page_shift, which are also required by DDW.

Signed-off-by: Alexey Kardashevskiy 
---
 Documentation/virtual/kvm/api.txt   | 46 +
 arch/powerpc/include/asm/kvm_host.h |  4 +++-
 arch/powerpc/include/asm/kvm_ppc.h  |  2 +-
 arch/powerpc/include/uapi/asm/kvm.h |  9 
 arch/powerpc/kvm/book3s_64_vio.c|  4 +++-
 arch/powerpc/kvm/powerpc.c  | 24 ++-
 include/uapi/linux/kvm.h|  2 ++
 7 files changed, 87 insertions(+), 4 deletions(-)

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index b4f5365..8a2a2da 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -2484,6 +2484,52 @@ calls by the guest for that service will be passed to 
userspace to be
 handled.
 
 
+4.87 KVM_CREATE_SPAPR_TCE_64
+
+Capability: KVM_CAP_SPAPR_TCE_64
+Architectures: powerpc
+Type: vm ioctl
+Parameters: struct kvm_create_spapr_tce_64 (in)
+Returns: file descriptor for manipulating the created TCE table
+
+This is an extension for KVM_CAP_SPAPR_TCE which only supports 32bit
+windows.
+
+This creates a virtual TCE (translation control entry) table, which
+is an IOMMU for PAPR-style virtual I/O.  It is used to translate
+logical addresses used in virtual I/O into guest physical addresses,
+and provides a scatter/gather capability for PAPR virtual I/O.
+
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+   __u64 liobn;
+   __u64 window_size;
+   __u64 bus_offset;
+   __u32 page_shift;
+   __u32 flags;
+};
+
+The liobn field gives the logical IO bus number for which to create a
+TCE table. The window_size field specifies the size of the DMA window
+which this TCE table will translate - the table will contain one 64
+bit TCE entry for every IOMMU page. The bus_offset field tells where
+this window is mapped on the IO bus. The page_shift field gives the size
+of the pages in this window (e.g. 4K, 64K or 16MB). The flags field
+is not used at the moment but provides room for future extensions.
+
+When the guest issues an H_PUT_TCE/H_PUT_TCE_INDIRECT/H_STUFF_TCE hcall
+on a liobn for which a TCE table has been created using this ioctl(),
+the kernel will handle it in real or virtual mode, updating the TCE table.
+If liobn has not been registered with this ioctl, H_PUT_TCE/etc calls
+will cause a vm exit and must be handled by userspace.
+
+The return value is a file descriptor which can be passed to mmap(2)
+to map the created TCE table into userspace.  This lets userspace read
+the entries written by kernel-handled H_PUT_TCE calls, and also lets
+userspace update the TCE table directly which is useful in some
+circumstances.
+
+
 5. The kvm_run structure
 
 
diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 1eaea2d..260a810 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -179,7 +179,9 @@ struct kvmppc_spapr_tce_table {
struct list_head list;
struct kvm *kvm;
u64 liobn;
-   u32 window_size;
+   u64 window_size;
+   u64 bus_offset;
+   u32 page_shift;
struct page *pages[0];
 };
 
diff --git a/arch/powerpc/include/asm/kvm_ppc.h 
b/arch/powerpc/include/asm/kvm_ppc.h
index 4096f16..b472fd3 100644
--- a/arch/powerpc/include/asm/kvm_ppc.h
+++ b/arch/powerpc/include/asm/kvm_ppc.h
@@ -126,7 +126,7 @@ extern void kvmppc_map_vrma(struct kvm_vcpu *vcpu,
 extern int kvmppc_pseries_do_hcall(struct kvm_vcpu *vcpu);
 
 extern long kvm_vm_ioctl_create_spapr_tce(struct kvm *kvm,
-   struct kvm_create_spapr_tce *args);
+   struct kvm_create_spapr_tce_64 *args);
 extern long kvmppc_h_put_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
 unsigned long ioba, unsigned long tce);
 extern long kvmppc_h_get_tce(struct kvm_vcpu *vcpu, unsigned long liobn,
diff --git a/arch/powerpc/include/uapi/asm/kvm.h 
b/arch/powerpc/include/uapi/asm/kvm.h
index a6665be..0ada7b4 100644
--- a/arch/powerpc/include/uapi/asm/kvm.h
+++ b/arch/powerpc/include/uapi/asm/kvm.h
@@ -333,6 +333,15 @@ struct kvm_create_spapr_tce {
__u32 window_size;
 };
 
+/* for KVM_CAP_SPAPR_TCE_64 */
+struct kvm_create_spapr_tce_64 {
+   __u64 liobn;
+   __u64 window_size;
+   __u64 bus_offset;
+   __u32 page_shift;
+   __u32 flags;
+};
+
 /* for KVM_ALLOCATE_RMA */
 struct kvm_allocate_rma {
__u64 rma_size;
diff --git a/arch/powerpc/kvm/book3s_64_vio.c b/arch/powerpc/kvm/book3s_64_vio.c
index 54cf9bc..230fa5f 100644
--- a/arch/
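
As a hedged illustration of the API documented above (assuming a headers tree
that already carries this series, i.e. that linux/kvm.h defines
KVM_CREATE_SPAPR_TCE_64 and struct kvm_create_spapr_tce_64), a minimal
userspace sketch might look like this; the window geometry is illustrative only:

    /* Sketch only: create a 64-bit TCE window and mmap() the table. */
    #include <stdint.h>
    #include <stdio.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <sys/mman.h>
    #include <linux/kvm.h>

    static int create_tce64_window(int vm_fd, uint64_t liobn)
    {
            struct kvm_create_spapr_tce_64 args = {
                    .liobn       = liobn,
                    .window_size = 64ULL << 30,  /* 64GB DMA window - illustrative */
                    .bus_offset  = 1ULL << 59,   /* window start on the IO bus - illustrative */
                    .page_shift  = 24,           /* 16MB IOMMU pages - illustrative */
                    .flags       = 0,            /* no flags defined yet */
            };
            int tce_fd = ioctl(vm_fd, KVM_CREATE_SPAPR_TCE_64, &args);

            if (tce_fd < 0) {
                    perror("KVM_CREATE_SPAPR_TCE_64");
                    return -1;
            }

            /* One 64-bit TCE per IOMMU page; mmap() lets userspace read the
             * entries written by kernel-handled H_PUT_TCE calls. */
            size_t entries = args.window_size >> args.page_shift;
            uint64_t *tces = mmap(NULL, entries * sizeof(*tces),
                                  PROT_READ | PROT_WRITE, MAP_SHARED, tce_fd, 0);
            if (tces == MAP_FAILED)
                    perror("mmap TCE table");

            return tce_fd;
    }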

[PATCH 2/3] PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_64 capability number

2014-06-05 Thread Alexey Kardashevskiy
This adds a capability number for 64-bit TCE table support.

Signed-off-by: Alexey Kardashevskiy 
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index 944cd21..e6972bf 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -744,6 +744,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_ENABLE_CAP_VM 98
 #define KVM_CAP_S390_IRQCHIP 99
 #define KVM_CAP_SPAPR_TCE_VFIO 100
+#define KVM_CAP_SPAPR_TCE_64 101
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.0.0



[PATCH 1/3] PPC: KVM: Reserve KVM_CAP_SPAPR_TCE_VFIO capability number

2014-06-05 Thread Alexey Kardashevskiy
This adds a capability number for in-kernel support for VFIO on
the SPAPR platform.

The capability tells user space whether the in-kernel H_PUT_TCE handlers
can handle VFIO-targeted requests. If they cannot, user space must not
allocate a TCE table in the host kernel via the KVM_CREATE_SPAPR_TCE
ioctl, because TCE requests would then no longer be passed to user space,
which is the desired behaviour in that situation.

Signed-off-by: Alexey Kardashevskiy 
---
 include/uapi/linux/kvm.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
index a8f4ee5..944cd21 100644
--- a/include/uapi/linux/kvm.h
+++ b/include/uapi/linux/kvm.h
@@ -743,6 +743,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_IOAPIC_POLARITY_IGNORED 97
 #define KVM_CAP_ENABLE_CAP_VM 98
 #define KVM_CAP_S390_IRQCHIP 99
+#define KVM_CAP_SPAPR_TCE_VFIO 100
 
 #ifdef KVM_CAP_IRQ_ROUTING
 
-- 
2.0.0
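
To illustrate how a reserved capability like this is consumed (not part of the
patch): user space typically probes it with KVM_CHECK_EXTENSION before relying
on the in-kernel acceleration; a zero return means the capability is absent and
H_PUT_TCE for VFIO-backed liobns must stay in user space. A minimal sketch,
assuming linux/kvm.h already defines KVM_CAP_SPAPR_TCE_VFIO:

    #include <fcntl.h>
    #include <unistd.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: non-zero if in-kernel VFIO TCE handling is available. */
    static int have_spapr_tce_vfio(void)
    {
            int kvm_fd = open("/dev/kvm", O_RDWR);
            int ret = 0;

            if (kvm_fd >= 0) {
                    ret = ioctl(kvm_fd, KVM_CHECK_EXTENSION, KVM_CAP_SPAPR_TCE_VFIO);
                    close(kvm_fd);
            }
            return ret > 0;  /* 0 (or error): keep H_PUT_TCE handling in userspace */
    }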



Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Benjamin Herrenschmidt
On Thu, 2014-06-05 at 17:25 +1000, Alexey Kardashevskiy wrote:
> +This creates a virtual TCE (translation control entry) table, which
> +is an IOMMU for PAPR-style virtual I/O.  It is used to translate
> +logical addresses used in virtual I/O into guest physical addresses,
> +and provides a scatter/gather capability for PAPR virtual I/O.
> +
> +/* for KVM_CAP_SPAPR_TCE_64 */
> +struct kvm_create_spapr_tce_64 {
> +   __u64 liobn;
> +   __u64 window_size;
> +   __u64 bus_offset;
> +   __u32 page_shift;
> +   __u32 flags;
> +};
> +
> +The liobn field gives the logical IO bus number for which to create a
> +TCE table. The window_size field specifies the size of the DMA window
> +which this TCE table will translate - the table will contain one 64
> +bit TCE entry for every IOMMU page. The bus_offset field tells where
> +this window is mapped on the IO bus. 

Hrm, the bus_offset cannot be set arbitrarily, it has some pretty strong
HW limits depending on the type of bridge & architecture version...

Do you plan to have that knowledge in qemu ? Or do you have some other
mechanism to query it ? (I might be missing a piece of the puzzle here).

Also one thing I've been pondering ...

We'll end up wasting a ton of memory with those TCE tables. If you have
3 PEs mapped into a guest, it will try to create 3 DDW's mapping the
entire guest memory and so 3 TCE tables large enough for that ... and
which will contain exactly the same entries !

We really want to look into extending PAPR to allow the creation of
table "aliases" so that the guest can essentially create one table and
associate it with multiple PEs. We might still decide to do multiple
copies for NUMA reasons but no more than one per node for example... at
least we can have the policy in qemu/kvm.

Also, do you currently require allocating a single physically contiguous
table or do you support TCE trees in your implementation ?

Cheers,
Ben.



Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexey Kardashevskiy
On 06/05/2014 05:38 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2014-06-05 at 17:25 +1000, Alexey Kardashevskiy wrote:
>> +This creates a virtual TCE (translation control entry) table, which
>> +is an IOMMU for PAPR-style virtual I/O.  It is used to translate
>> +logical addresses used in virtual I/O into guest physical addresses,
>> +and provides a scatter/gather capability for PAPR virtual I/O.
>> +
>> +/* for KVM_CAP_SPAPR_TCE_64 */
>> +struct kvm_create_spapr_tce_64 {
>> +   __u64 liobn;
>> +   __u64 window_size;
>> +   __u64 bus_offset;
>> +   __u32 page_shift;
>> +   __u32 flags;
>> +};
>> +
>> +The liobn field gives the logical IO bus number for which to create a
>> +TCE table. The window_size field specifies the size of the DMA window
>> +which this TCE table will translate - the table will contain one 64
>> +bit TCE entry for every IOMMU page. The bus_offset field tells where
>> +this window is mapped on the IO bus. 
> 
> Hrm, the bus_offset cannot be set arbitrarily, it has some pretty strong
> HW limits depending on the type of bridge & architecture version...
> 
> Do you plan to have that knowledge in qemu ? Or do you have some other
> mechanism to query it ? (I might be missing a piece of the puzzle here).


Yes. QEMU will have this knowledge as it has to implement
ibm,create-pe-dma-window and return this address to the guest. There will
be a container API to receive it from the powernv code via a ppc_md callback.

There are 2 steps:
1. query + create window
2. enable in-kernel KVM acceleration for it.

Everything will work without step 2 and, frankly speaking, we do not need it
that much for DDW, but it does not cost much either.

By having bus_offset in the ioctl, which is only used for step 2, I reduce
the dependency on powernv.


> Also one thing I've been pondering ...
> 
> We'll end up wasting a ton of memory with those TCE tables. If you have
> 3 PEs mapped into a guest, it will try to create 3 DDW's mapping the
> entire guest memory and so 3 TCE tables large enough for that ... and
> which will contain exactly the same entries !

This is in the plan too, do not rush :)


> We really want to look into extending PAPR to allow the creation of
> table "aliases" so that the guest can essentially create one table and
> associate it with multiple PEs. We might still decide to do multiple
> copies for NUMA reasons but no more than one per node for example... at
> least we can have the policy in qemu/kvm.
> 
> Also, do you currently require allocating a single physically contiguous
> table or do you support TCE trees in your implementation ?


No trees yet. For 64GB window we need (64<<30)/(16<<20)*8 = 32K TCE table.
Do we really need trees?


-- 
Alexey


Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Benjamin Herrenschmidt
On Thu, 2014-06-05 at 19:26 +1000, Alexey Kardashevskiy wrote:
> 
> No trees yet. For 64GB window we need (64<<30)/(16<<20)*8 = 32K TCE table.
> Do we really need trees?

The above is assuming hugetlbfs backed guests. These are the least of my worry
indeed. But we need to deal with 4k and 64k guests.

Cheers,
Ben
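
To make the size concern concrete, here is the arithmetic as a hedged sketch:
the table needs one 8-byte TCE per IOMMU page, so for the 64GB window discussed
above a 16MB page size gives the 32KB quoted, while 64KB and 4KB pages blow it
up considerably - per window, and per PE if each PE gets its own copy.

    /* Sketch: bytes of TCE table needed for a window at a given page size. */
    static unsigned long long tce_table_bytes(unsigned long long window_size,
                                              unsigned int page_shift)
    {
            return (window_size >> page_shift) * 8; /* one 64-bit TCE per IOMMU page */
    }

    /*
     * tce_table_bytes(64ULL << 30, 24) ==  32 << 10   (16MB pages ->  32KB)
     * tce_table_bytes(64ULL << 30, 16) ==   8 << 20   (64KB pages ->   8MB)
     * tce_table_bytes(64ULL << 30, 12) == 128 << 20   ( 4KB pages -> 128MB)
     */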




Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexander Graf


On 05.06.14 12:27, Benjamin Herrenschmidt wrote:

On Thu, 2014-06-05 at 19:26 +1000, Alexey Kardashevskiy wrote:

No trees yet. For 64GB window we need (64<<30)/(16<<20)*8 = 32K TCE table.
Do we really need trees?

The above is assuming hugetlbfs backed guests. These are the least of my worry
indeed. But we need to deal with 4k and 64k guests.


What if we ask user space to give us a pointer to user space allocated 
memory along with the TCE registration? We would still ask user space to 
only use the returned fd for TCE modifications, but would have some 
nicely swappable memory we can store the TCE entries in.


In fact, the code as is today can allocate an arbitrary amount of pinned 
kernel memory from within user space without any checks.



Alex
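
For concreteness, a purely hypothetical sketch of the uapi change this
suggestion implies - none of this exists in the posted series, and the field
name is made up - would be to pass a userspace buffer at registration time and
have the kernel access it with get_user()/put_user() instead of pinning its own
pages:

    /* Hypothetical extension (not in this series): let userspace supply the
     * backing store for the TCE table so it stays swappable. */
    struct kvm_create_spapr_tce_64_userbacked {
            __u64 liobn;
            __u64 window_size;
            __u64 bus_offset;
            __u32 page_shift;
            __u32 flags;
            __u64 tce_table_userspace_addr; /* made-up field: a buffer of
                                             * (window_size >> page_shift) * 8 bytes,
                                             * accessed via get_user()/put_user() */
    };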



Re: [PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration

2014-06-05 Thread Alexander Graf


On 05.06.14 09:25, Alexey Kardashevskiy wrote:

This reserves 2 capability numbers.

This implements an extended version of KVM_CREATE_SPAPR_TCE_64 ioctl.

Please advise how to proceed with these patches as I suspect that
first two should go via Paolo's tree while the last one via Alex Graf's tree
(correct?).


They would just go via my tree, but only be actually allocated (read: 
mergeable to QEMU) once they hit Paolo's tree.


In fact, I don't think it makes sense to split them off at all.


Alex



KVM: PPC: BOOK3S: PR: P8 Support

2014-06-05 Thread Aneesh Kumar K.V
This patchset adds support for emulating the VTB, IC and Doorbell features on P8.
Doorbell support is a dummy implementation since we don't support SMT cores with PR KVM.


-aneesh




[PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support

2014-06-05 Thread Aneesh Kumar K.V
We don't have SMT support yet, hence we should not see a doorbell
message being generated.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_emulate.c | 18 ++
 1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 1bb16a59dcbc..d6c87d085182 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -28,7 +28,9 @@
 #define OP_19_XOP_RFI  50
 
 #define OP_31_XOP_MFMSR83
+#define OP_31_XOP_MSGSNDP  142
 #define OP_31_XOP_MTMSR146
+#define OP_31_XOP_MSGCLRP  174
 #define OP_31_XOP_MTMSRD   178
 #define OP_31_XOP_MTSR 210
 #define OP_31_XOP_MTSRIN   242
@@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
 
break;
}
+   case OP_31_XOP_MSGSNDP:
+   {
+   /*
+* PR KVM still don't support SMT mode. So we should
+* not see a MSGSNDP/MSGCLRP used with PR KVM
+*/
+   pr_info("KVM: MSGSNDP used in non SMT case\n");
+   emulated = EMULATE_FAIL;
+   break;
+   }
+   case OP_31_XOP_MSGCLRP:
+   {
+   pr_info("KVM: MSGCLRP used in non SMT case\n");
+   emulated = EMULATE_FAIL;
+   break;
+   }
default:
emulated = EMULATE_FAIL;
}
-- 
1.9.1



[PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Aneesh Kumar K.V
The virtual time base (VTB) register is a per-VM, per-CPU register that needs
to be saved and restored on VM exit and entry. Writing to VTB is not
allowed in privileged mode.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/kvm_host.h |  1 +
 arch/powerpc/include/asm/reg.h  | 15 +++
 arch/powerpc/include/asm/time.h |  9 +
 arch/powerpc/kvm/book3s.c   |  6 ++
 arch/powerpc/kvm/book3s_emulate.c   |  3 +++
 arch/powerpc/kvm/book3s_hv.c|  6 --
 arch/powerpc/kvm/book3s_pr.c|  3 ++-
 7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4a58731a0a72..bd3caeaeebe1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -505,6 +505,7 @@ struct kvm_vcpu_arch {
 #endif
/* Time base value when we entered the guest */
u64 entry_tb;
+   u64 entry_vtb;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4852bcf270f3..3e7085d8af90 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -25,6 +25,7 @@
 #ifdef CONFIG_8xx
 #include 
 #endif /* CONFIG_8xx */
+#include 
 
 #define MSR_SF_LG  63  /* Enable 64 bit mode */
 #define MSR_ISF_LG 61  /* Interrupt 64b mode valid on 630 */
@@ -1193,6 +1194,20 @@
 : "r" ((unsigned long)(v)) \
 : "memory")
 
+static inline unsigned long mfvtb (void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfspr(SPRN_VTB);
+#endif
+   /*
+* The above mfspr will be a no-op on anything before Power8
+* That can result in random values returned. We need to
+* capture that.
+*/
+   BUG();
+}
+
 #ifdef __powerpc64__
 #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
 #define mftb() ({unsigned long rval;   \
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1d428e6007ca..03cbada59d3a 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -102,6 +102,15 @@ static inline u64 get_rtc(void)
return (u64)hi * 10 + lo;
 }
 
+static inline u64 get_vtb(void)
+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfvtb();
+#endif
+   return 0;
+}
+
 #ifdef CONFIG_PPC64
 static inline u64 get_tb(void)
 {
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 52c654dbd41a..ae43e4178ecd 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
val = get_reg_val(reg->id, vcpu->arch.bescr);
break;
+   case KVM_REG_PPC_VTB:
+   val = get_reg_val(reg->id, vcpu->arch.vtb);
+   break;
default:
r = -EINVAL;
break;
@@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
vcpu->arch.bescr = set_reg_val(reg->id, val);
break;
+   case KVM_REG_PPC_VTB:
+   vcpu->arch.vtb = set_reg_val(reg->id, val);
+   break;
default:
r = -EINVAL;
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu->arch.spurr;
break;
+   case SPRN_VTB:
+   *spr_val = vcpu->arch.vtb;
+   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index aba05bbb3e74..f6ac58336b3f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -897,9 +897,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 
id,
case KVM_REG_PPC_IC:
*val = get_reg_val(id, vcpu->arch.ic);
break;
-   case KVM_REG_PPC_VTB:
-   *val = get_reg_val(id, vcpu->arch.vtb);
-   break;
case KVM_REG_PPC_CSIGR:
*val = get_reg_val(id, vcpu->arch.csigr);
break;
@@ -1097,9 +1094,6 @@ static 
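
For context on how the one_reg IDs touched above are consumed (this is the
existing ONE_REG interface, not something added by this patch): user space,
e.g. for migration, reads VTB with KVM_GET_ONE_REG on the vCPU fd. A minimal
sketch, assuming KVM_REG_PPC_VTB (and, for patch 4/4, KVM_REG_PPC_IC) come
from the uapi headers:

    #include <stdint.h>
    #include <sys/ioctl.h>
    #include <linux/kvm.h>

    /* Sketch: fetch a 64-bit PPC register (e.g. KVM_REG_PPC_VTB) via ONE_REG. */
    static int get_ppc_one_reg(int vcpu_fd, uint64_t id, uint64_t *out)
    {
            struct kvm_one_reg reg = {
                    .id   = id,
                    .addr = (uintptr_t)out, /* kernel writes the value here */
            };

            return ioctl(vcpu_fd, KVM_GET_ONE_REG, &reg);
    }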

[PATCH 4/4] KVM: PPC: BOOK3S: PR: Emulate instruction counter

2014-06-05 Thread Aneesh Kumar K.V
Writing to IC is not allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/include/asm/kvm_host.h | 1 +
 arch/powerpc/kvm/book3s.c   | 6 ++
 arch/powerpc/kvm/book3s_emulate.c   | 3 +++
 arch/powerpc/kvm/book3s_hv.c| 6 --
 arch/powerpc/kvm/book3s_pr.c| 4 
 5 files changed, 14 insertions(+), 6 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index bd3caeaeebe1..f9ae69682ce1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -506,6 +506,7 @@ struct kvm_vcpu_arch {
/* Time base value when we entered the guest */
u64 entry_tb;
u64 entry_vtb;
+   u64 entry_ic;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index ae43e4178ecd..52c4c43900cb 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -649,6 +649,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_VTB:
val = get_reg_val(reg->id, vcpu->arch.vtb);
break;
+   case KVM_REG_PPC_IC:
+   val = get_reg_val(reg->id, vcpu->arch.ic);
+   break;
default:
r = -EINVAL;
break;
@@ -756,6 +759,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_VTB:
vcpu->arch.vtb = set_reg_val(reg->id, val);
break;
+   case KVM_REG_PPC_IC:
+   vcpu->arch.ic = set_reg_val(reg->id, val);
+   break;
default:
r = -EINVAL;
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 062b5da7786e..e6912c618160 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -598,6 +598,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
case SPRN_VTB:
*spr_val = vcpu->arch.vtb;
break;
+   case SPRN_IC:
+   *spr_val = vcpu->arch.ic;
+   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index f6ac58336b3f..c38cf9f836c0 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -894,9 +894,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 
id,
case KVM_REG_PPC_CIABR:
*val = get_reg_val(id, vcpu->arch.ciabr);
break;
-   case KVM_REG_PPC_IC:
-   *val = get_reg_val(id, vcpu->arch.ic);
-   break;
case KVM_REG_PPC_CSIGR:
*val = get_reg_val(id, vcpu->arch.csigr);
break;
@@ -1091,9 +1088,6 @@ static int kvmppc_set_one_reg_hv(struct kvm_vcpu *vcpu, 
u64 id,
if ((vcpu->arch.ciabr & CIABR_PRIV) == CIABR_PRIV_HYPER)
vcpu->arch.ciabr &= ~CIABR_PRIV;/* disable */
break;
-   case KVM_REG_PPC_IC:
-   vcpu->arch.ic = set_reg_val(id, *val);
-   break;
case KVM_REG_PPC_CSIGR:
vcpu->arch.csigr = set_reg_val(id, *val);
break;
diff --git a/arch/powerpc/kvm/book3s_pr.c b/arch/powerpc/kvm/book3s_pr.c
index 96cdf89a8c86..03fc8847cd67 100644
--- a/arch/powerpc/kvm/book3s_pr.c
+++ b/arch/powerpc/kvm/book3s_pr.c
@@ -126,6 +126,8 @@ void kvmppc_copy_to_svcpu(struct kvmppc_book3s_shadow_vcpu 
*svcpu,
 */
vcpu->arch.entry_tb = get_tb();
vcpu->arch.entry_vtb = get_vtb();
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   vcpu->arch.entry_ic = mfspr(SPRN_IC);
svcpu->in_use = true;
 }
 
@@ -178,6 +180,8 @@ void kvmppc_copy_from_svcpu(struct kvm_vcpu *vcpu,
vcpu->arch.purr += get_tb() - vcpu->arch.entry_tb;
vcpu->arch.spurr += get_tb() - vcpu->arch.entry_tb;
vcpu->arch.vtb += get_vtb() - vcpu->arch.entry_vtb;
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   vcpu->arch.ic += mfspr(SPRN_IC) - vcpu->arch.entry_ic;
svcpu->in_use = false;
 
 out:
-- 
1.9.1



[PATCH 3/4] KVM: PPC: BOOK3S: PR: Emulate DPDES register

2014-06-05 Thread Aneesh Kumar K.V
Since we don't support SMT yet, we should always find zero in the
Directed Privileged Doorbell Exception State (DPDES) register.

Signed-off-by: Aneesh Kumar K.V 
---
 arch/powerpc/kvm/book3s_emulate.c | 1 +
 1 file changed, 1 insertion(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index d6c87d085182..062b5da7786e 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -655,6 +655,7 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
case SPRN_MMCR1:
case SPRN_MMCR2:
case SPRN_TIR:
+   case SPRN_DPDES:
 #endif
*spr_val = 0;
break;
-- 
1.9.1



Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Alexander Graf


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V 
---
  arch/powerpc/include/asm/kvm_host.h |  1 +
  arch/powerpc/include/asm/reg.h  | 15 +++
  arch/powerpc/include/asm/time.h |  9 +
  arch/powerpc/kvm/book3s.c   |  6 ++
  arch/powerpc/kvm/book3s_emulate.c   |  3 +++
  arch/powerpc/kvm/book3s_hv.c|  6 --
  arch/powerpc/kvm/book3s_pr.c|  3 ++-
  7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4a58731a0a72..bd3caeaeebe1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -505,6 +505,7 @@ struct kvm_vcpu_arch {
  #endif
/* Time base value when we entered the guest */
u64 entry_tb;
+   u64 entry_vtb;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4852bcf270f3..3e7085d8af90 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -25,6 +25,7 @@
  #ifdef CONFIG_8xx
  #include 
  #endif /* CONFIG_8xx */
+#include 
  
  #define MSR_SF_LG	63  /* Enable 64 bit mode */

  #define MSR_ISF_LG	61  /* Interrupt 64b mode valid on 630 */
@@ -1193,6 +1194,20 @@
 : "r" ((unsigned long)(v)) \
 : "memory")
  
+static inline unsigned long mfvtb (void)

+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfspr(SPRN_VTB);
+#endif
+   /*
+* The above mfspr will be a no-op on anything before Power8
+* That can result in random values returned. We need to
+* capture that.
+*/
+   BUG();
+}
+
  #ifdef __powerpc64__
  #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
  #define mftb()({unsigned long rval;   
\
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1d428e6007ca..03cbada59d3a 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -102,6 +102,15 @@ static inline u64 get_rtc(void)
return (u64)hi * 10 + lo;
  }
  
+static inline u64 get_vtb(void)

+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfvtb();
+#endif
+   return 0;
+}
+
  #ifdef CONFIG_PPC64
  static inline u64 get_tb(void)
  {
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 52c654dbd41a..ae43e4178ecd 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
val = get_reg_val(reg->id, vcpu->arch.bescr);
break;
+   case KVM_REG_PPC_VTB:
+   val = get_reg_val(reg->id, vcpu->arch.vtb);
+   break;
default:
r = -EINVAL;
break;
@@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
vcpu->arch.bescr = set_reg_val(reg->id, val);
break;
+   case KVM_REG_PPC_VTB:
+   vcpu->arch.vtb = set_reg_val(reg->id, val);
+   break;
default:
r = -EINVAL;
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu->arch.spurr;
break;
+   case SPRN_VTB:
+   *spr_val = vcpu->arch.vtb;


Doesn't this mean that vtb can be the same when the guest reads it 2 
times in a row without getting preempted?



Alex


+   break;
case SPRN_GQR0:
case SPRN_GQR1:
case SPRN_GQR2:
diff --git a/arch/powerpc/kvm/book3s_hv.c b/arch/powerpc/kvm/book3s_hv.c
index aba05bbb3e74..f6ac58336b3f 100644
--- a/arch/powerpc/kvm/book3s_hv.c
+++ b/arch/powerpc/kvm/book3s_hv.c
@@ -897,9 +897,6 @@ static int kvmppc_get_one_reg_hv(struct kvm_vcpu *vcpu, u64 
id,
case KVM_REG_PPC_IC:
*val = get_reg_val(id, vcpu->arch.ic);
break;
-   case KVM_REG_PPC_VTB:
-   *val

Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support

2014-06-05 Thread Alexander Graf


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

We don't have SMT support yet, hence we should not find a doorbell
message generated

Signed-off-by: Aneesh Kumar K.V 
---
  arch/powerpc/kvm/book3s_emulate.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 1bb16a59dcbc..d6c87d085182 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -28,7 +28,9 @@
  #define OP_19_XOP_RFI 50
  
  #define OP_31_XOP_MFMSR		83

+#define OP_31_XOP_MSGSNDP  142
  #define OP_31_XOP_MTMSR   146
+#define OP_31_XOP_MSGCLRP  174
  #define OP_31_XOP_MTMSRD  178
  #define OP_31_XOP_MTSR       210
  #define OP_31_XOP_MTSRIN  242
@@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run *run, struct 
kvm_vcpu *vcpu,
  
  			break;

}
+   case OP_31_XOP_MSGSNDP:
+   {
+   /*
+* PR KVM still don't support SMT mode. So we should


still?


+* not see a MSGSNDP/MSGCLRP used with PR KVM
+*/
+   pr_info("KVM: MSGSNDP used in non SMT case\n");
+   emulated = EMULATE_FAIL;


What would happen on an HV guest with only 1 thread that MSGSNDs to 
thread 0? Would the guest get an illegal instruction trap, a 
self-interrupt or would this be a simple nop?



Alex


+   break;
+   }
+   case OP_31_XOP_MSGCLRP:
+   {
+   pr_info("KVM: MSGCLRP used in non SMT case\n");
+   emulated = EMULATE_FAIL;
+   break;
+   }
default:
emulated = EMULATE_FAIL;
}




Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support

2014-06-05 Thread Alexander Graf


On 05.06.14 14:21, Alexander Graf wrote:


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

We don't have SMT support yet, hence we should not find a doorbell
message generated

Signed-off-by: Aneesh Kumar K.V 
---
  arch/powerpc/kvm/book3s_emulate.c | 18 ++
  1 file changed, 18 insertions(+)

diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c

index 1bb16a59dcbc..d6c87d085182 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -28,7 +28,9 @@
  #define OP_19_XOP_RFI        50
  #define OP_31_XOP_MFMSR      83
 +#define OP_31_XOP_MSGSNDP    142
  #define OP_31_XOP_MTMSR      146
 +#define OP_31_XOP_MSGCLRP    174
  #define OP_31_XOP_MTMSRD     178
  #define OP_31_XOP_MTSR       210
  #define OP_31_XOP_MTSRIN     242
@@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run 
*run, struct kvm_vcpu *vcpu,

break;
  }
+case OP_31_XOP_MSGSNDP:
+{
+/*
+ * PR KVM still don't support SMT mode. So we should


still?


+ * not see a MSGSNDP/MSGCLRP used with PR KVM
+ */
+pr_info("KVM: MSGSNDP used in non SMT case\n");
+emulated = EMULATE_FAIL;


What would happen on an HV guest with only 1 thread that MSGSNDs to 
thread 0? Would the guest get an illegal instruction trap, a 
self-interrupt or would this be a simple nop?


What I'm trying to say here is that it's ok to treat it as illegal 
instructions, but then we don't need this patch :).



Alex



Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Benjamin Herrenschmidt
On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote:
> What if we ask user space to give us a pointer to user space allocated 
> memory along with the TCE registration? We would still ask user space to 
> only use the returned fd for TCE modifications, but would have some 
> nicely swappable memory we can store the TCE entries in.

That isn't going to work terribly well for VFIO :-) But yes, for
emulated devices, we could improve things a bit, including for
the 32-bit TCE tables.

For emulated, the real mode path could walk the page tables and fallback
to virtual mode & get_user if the page isn't present, thus operating
directly on qemu memory TCE tables instead of the current pinned stuff.

However that has a cost in performance, but since that's really only
used for emulated devices and PAPR VIOs, it might not be a huge issue.

But for VFIO we don't have much choice, we need to create something the
HW can access.

> In fact, the code as is today can allocate an arbitrary amount of pinned 
> kernel memory from within user space without any checks.

Right. We should at least account it in the locked limit.

Cheers,
Ben.
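
As a hedged sketch of what "account it in the locked limit" could look like
(illustrative only - the helper below is not from any posted series, it merely
mirrors the usual RLIMIT_MEMLOCK accounting pattern):

    #include <linux/mm.h>
    #include <linux/sched.h>
    #include <linux/capability.h>

    /* Sketch: charge npages of pinned TCE table against RLIMIT_MEMLOCK before
     * allocating; a matching decrement would run when the table is freed. */
    static long account_tce_pages(long npages)
    {
            unsigned long locked, lock_limit;
            long ret = 0;

            down_write(&current->mm->mmap_sem);
            locked = current->mm->locked_vm + npages;
            lock_limit = rlimit(RLIMIT_MEMLOCK) >> PAGE_SHIFT;
            if (locked > lock_limit && !capable(CAP_IPC_LOCK))
                    ret = -ENOMEM;
            else
                    current->mm->locked_vm = locked;
            up_write(&current->mm->mmap_sem);

            return ret;
    }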




Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexander Graf


On 05.06.14 14:30, Benjamin Herrenschmidt wrote:

On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote:

What if we ask user space to give us a pointer to user space allocated
memory along with the TCE registration? We would still ask user space to
only use the returned fd for TCE modifications, but would have some
nicely swappable memory we can store the TCE entries in.

That isn't going to work terribly well for VFIO :-) But yes, for
emulated devices, we could improve things a bit, including for
the 32-bit TCE tables.

For emulated, the real mode path could walk the page tables and fallback
to virtual mode & get_user if the page isn't present, thus operating
directly on qemu memory TCE tables instead of the current pinned stuff.

However that has a cost in performance, but since that's really only
used for emulated devices and PAPR VIOs, it might not be a huge issue.

But for VFIO we don't have much choice, we need to create something the
HW can access.


But we need to create separate tables for VFIO anyways, because these 
TCE tables contain virtual addresses, no?



Alex




In fact, the code as is today can allocate an arbitrary amount of pinned
kernel memory from within user space without any checks.

Right. We should at least account it in the locked limit.

Cheers,
Ben.






Re: [PATCH v8 0/3] EEH Support for VFIO PCI Device

2014-06-05 Thread Alexander Graf


On 05.06.14 08:36, Gavin Shan wrote:

This series of patches adds EEH support for PCI devices that are passed
through to a PowerKVM-based guest via VFIO. The implementation is straightforward,
based on the issues we have to resolve to support EEH for a PowerKVM-based
guest.

- Emulation for EEH RTAS requests. All EEH RTAS requests go to QEMU first.
   If QEMU can't handle a request, it is sent to the host via the newly
   introduced VFIO container ioctl command (VFIO_EEH_OP) and gets handled
   in the host kernel.

The series of patches requires corresponding QEMU changes.


Acked-by: Alexander Graf 


Alex



Re: [PATCH 3/3] PPC: KVM: Add support for 64bit TCE windows

2014-06-05 Thread Alexey Kardashevskiy
On 06/05/2014 10:30 PM, Benjamin Herrenschmidt wrote:
> On Thu, 2014-06-05 at 13:56 +0200, Alexander Graf wrote:
>> What if we ask user space to give us a pointer to user space allocated 
>> memory along with the TCE registration? We would still ask user space to 
>> only use the returned fd for TCE modifications, but would have some 
>> nicely swappable memory we can store the TCE entries in.
> 
> That isn't going to work terribly well for VFIO :-) But yes, for
> emulated devices, we could improve things a bit, including for
> the 32-bit TCE tables.
> 
> For emulated, the real mode path could walk the page tables and fallback
> to virtual mode & get_user if the page isn't present, thus operating
> directly on qemu memory TCE tables instead of the current pinned stuff.
> 
> However that has a cost in performance, but since that's really only
> used for emulated devices and PAPR VIOs, it might not be a huge issue.
> 
> But for VFIO we don't have much choice, we need to create something the
> HW can access.

You are confusing things here.

There are 2 tables:
1. guest-visible TCE table, this is what is allocated for VIO or emulated PCI;
2. real HW DMA window, one exists already for DMA32 and one I will
allocated for a huge window.

I have just #2 for VFIO now but we will need both in order to implement
H_GET_TCE correctly, and this is the table I will allocate by this new ioctl.


>> In fact, the code as is today can allocate an arbitrary amount of pinned 
>> kernel memory from within user space without any checks.
> 
> Right. We should at least account it in the locked limit.

Yup. And (probably) this thing will keep a counter of how many windows were
created per KVM instance to avoid having multiple copies of the same table.


-- 
Alexey


Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Aneesh Kumar K.V
Alexander Graf  writes:

> On 05.06.14 14:08, Aneesh Kumar K.V wrote:
>> virtual time base register is a per VM, per cpu register that needs
>> to be saved and restored on vm exit and entry. Writing to VTB is not
>> allowed in the privileged mode.
>>
>> Signed-off-by: Aneesh Kumar K.V 
>> ---
>>   arch/powerpc/include/asm/kvm_host.h |  1 +
>>   arch/powerpc/include/asm/reg.h  | 15 +++
>>   arch/powerpc/include/asm/time.h |  9 +
>>   arch/powerpc/kvm/book3s.c   |  6 ++
>>   arch/powerpc/kvm/book3s_emulate.c   |  3 +++
>>   arch/powerpc/kvm/book3s_hv.c|  6 --
>>   arch/powerpc/kvm/book3s_pr.c|  3 ++-
>>   7 files changed, 36 insertions(+), 7 deletions(-)
>>
>> diff --git a/arch/powerpc/include/asm/kvm_host.h 
>> b/arch/powerpc/include/asm/kvm_host.h
>> index 4a58731a0a72..bd3caeaeebe1 100644
>> --- a/arch/powerpc/include/asm/kvm_host.h
>> +++ b/arch/powerpc/include/asm/kvm_host.h
>> @@ -505,6 +505,7 @@ struct kvm_vcpu_arch {
>>   #endif
>>  /* Time base value when we entered the guest */
>>  u64 entry_tb;
>> +u64 entry_vtb;
>>  u32 tcr;
>>  ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
>>  u32 ivor[64];
>> diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
>> index 4852bcf270f3..3e7085d8af90 100644
>> --- a/arch/powerpc/include/asm/reg.h
>> +++ b/arch/powerpc/include/asm/reg.h
>> @@ -25,6 +25,7 @@
>>   #ifdef CONFIG_8xx
>>   #include 
>>   #endif /* CONFIG_8xx */
>> +#include 
>>   
>>   #define MSR_SF_LG  63  /* Enable 64 bit mode */
>>   #define MSR_ISF_LG 61  /* Interrupt 64b mode valid on 630 */
>> @@ -1193,6 +1194,20 @@
>>   : "r" ((unsigned long)(v)) \
>>   : "memory")
>>   
>> +static inline unsigned long mfvtb (void)
>> +{
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +if (cpu_has_feature(CPU_FTR_ARCH_207S))
>> +return mfspr(SPRN_VTB);
>> +#endif
>> +/*
>> + * The above mfspr will be a no-op on anything before Power8
>> + * That can result in random values returned. We need to
>> + * capture that.
>> + */
>> +BUG();
>> +}
>> +
>>   #ifdef __powerpc64__
>>   #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
>>   #define mftb() ({unsigned long rval;   
>> \
>> diff --git a/arch/powerpc/include/asm/time.h 
>> b/arch/powerpc/include/asm/time.h
>> index 1d428e6007ca..03cbada59d3a 100644
>> --- a/arch/powerpc/include/asm/time.h
>> +++ b/arch/powerpc/include/asm/time.h
>> @@ -102,6 +102,15 @@ static inline u64 get_rtc(void)
>>  return (u64)hi * 10 + lo;
>>   }
>>   
>> +static inline u64 get_vtb(void)
>> +{
>> +#ifdef CONFIG_PPC_BOOK3S_64
>> +if (cpu_has_feature(CPU_FTR_ARCH_207S))
>> +return mfvtb();
>> +#endif
>> +return 0;
>> +}
>> +
>>   #ifdef CONFIG_PPC64
>>   static inline u64 get_tb(void)
>>   {
>> diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
>> index 52c654dbd41a..ae43e4178ecd 100644
>> --- a/arch/powerpc/kvm/book3s.c
>> +++ b/arch/powerpc/kvm/book3s.c
>> @@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
>> struct kvm_one_reg *reg)
>>  case KVM_REG_PPC_BESCR:
>>  val = get_reg_val(reg->id, vcpu->arch.bescr);
>>  break;
>> +case KVM_REG_PPC_VTB:
>> +val = get_reg_val(reg->id, vcpu->arch.vtb);
>> +break;
>>  default:
>>  r = -EINVAL;
>>  break;
>> @@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
>> struct kvm_one_reg *reg)
>>  case KVM_REG_PPC_BESCR:
>>  vcpu->arch.bescr = set_reg_val(reg->id, val);
>>  break;
>> +case KVM_REG_PPC_VTB:
>> +vcpu->arch.vtb = set_reg_val(reg->id, val);
>> +break;
>>  default:
>>  r = -EINVAL;
>>  break;
>> diff --git a/arch/powerpc/kvm/book3s_emulate.c 
>> b/arch/powerpc/kvm/book3s_emulate.c
>> index 3565e775b61b..1bb16a59dcbc 100644
>> --- a/arch/powerpc/kvm/book3s_emulate.c
>> +++ b/arch/powerpc/kvm/book3s_emulate.c
>> @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, 
>> int sprn, ulong *spr_val
>>   */
>>  *spr_val = vcpu->arch.spurr;
>>  break;
>> +case SPRN_VTB:
>> +*spr_val = vcpu->arch.vtb;
>
> Doesn't this mean that vtb can be the same 2 when the guest reads it 2 
> times in a row without getting preempted?


But a mfspr will result in VM exit and that would make sure we
update vcpu->arch.vtb with the correct value.


-aneesh


Re: [PATCH 2/4] KVM: PPC: BOOK3S: PR: Doorbell support

2014-06-05 Thread Aneesh Kumar K.V
Alexander Graf  writes:

> On 05.06.14 14:21, Alexander Graf wrote:
>>
>> On 05.06.14 14:08, Aneesh Kumar K.V wrote:
>>> We don't have SMT support yet, hence we should not find a doorbell
>>> message generated
>>>
>>> Signed-off-by: Aneesh Kumar K.V 
>>> ---
>>>   arch/powerpc/kvm/book3s_emulate.c | 18 ++
>>>   1 file changed, 18 insertions(+)
>>>
>>> diff --git a/arch/powerpc/kvm/book3s_emulate.c 
>>> b/arch/powerpc/kvm/book3s_emulate.c
>>> index 1bb16a59dcbc..d6c87d085182 100644
>>> --- a/arch/powerpc/kvm/book3s_emulate.c
>>> +++ b/arch/powerpc/kvm/book3s_emulate.c
>>> @@ -28,7 +28,9 @@
>>>   #define OP_19_XOP_RFI        50
>>>   #define OP_31_XOP_MFMSR      83
>>> +#define OP_31_XOP_MSGSNDP     142
>>>   #define OP_31_XOP_MTMSR      146
>>> +#define OP_31_XOP_MSGCLRP     174
>>>   #define OP_31_XOP_MTMSRD     178
>>>   #define OP_31_XOP_MTSR       210
>>>   #define OP_31_XOP_MTSRIN     242
>>> @@ -303,6 +305,22 @@ int kvmppc_core_emulate_op_pr(struct kvm_run 
>>> *run, struct kvm_vcpu *vcpu,
>>> break;
>>>   }
>>> +case OP_31_XOP_MSGSNDP:
>>> +{
>>> +/*
>>> + * PR KVM still don't support SMT mode. So we should
>>
>> still?
>>
>>> + * not see a MSGSNDP/MSGCLRP used with PR KVM
>>> + */
>>> +pr_info("KVM: MSGSNDP used in non SMT case\n");
>>> +emulated = EMULATE_FAIL;
>>
>> What would happen on an HV guest with only 1 thread that MSGSNDs to 
>> thread 0? Would the guest get an illegal instruction trap, a 
>> self-interrupt or would this be a simple nop?
>
> What I'm trying to say here is that it's ok to treat it as illegal 
> instructions, but then we don't need this patch :).
>

Agreed. I will verify whether it is treated as a nop. If so, I will send an
updated patch.

-aneesh



Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Alexander Graf


On 05.06.14 17:50, Aneesh Kumar K.V wrote:

Alexander Graf  writes:


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V 
---
   arch/powerpc/include/asm/kvm_host.h |  1 +
   arch/powerpc/include/asm/reg.h  | 15 +++
   arch/powerpc/include/asm/time.h |  9 +
   arch/powerpc/kvm/book3s.c   |  6 ++
   arch/powerpc/kvm/book3s_emulate.c   |  3 +++
   arch/powerpc/kvm/book3s_hv.c|  6 --
   arch/powerpc/kvm/book3s_pr.c|  3 ++-
   7 files changed, 36 insertions(+), 7 deletions(-)

diff --git a/arch/powerpc/include/asm/kvm_host.h 
b/arch/powerpc/include/asm/kvm_host.h
index 4a58731a0a72..bd3caeaeebe1 100644
--- a/arch/powerpc/include/asm/kvm_host.h
+++ b/arch/powerpc/include/asm/kvm_host.h
@@ -505,6 +505,7 @@ struct kvm_vcpu_arch {
   #endif
/* Time base value when we entered the guest */
u64 entry_tb;
+   u64 entry_vtb;
u32 tcr;
ulong tsr; /* we need to perform set/clr_bits() which requires ulong */
u32 ivor[64];
diff --git a/arch/powerpc/include/asm/reg.h b/arch/powerpc/include/asm/reg.h
index 4852bcf270f3..3e7085d8af90 100644
--- a/arch/powerpc/include/asm/reg.h
+++ b/arch/powerpc/include/asm/reg.h
@@ -25,6 +25,7 @@
   #ifdef CONFIG_8xx
   #include 
   #endif /* CONFIG_8xx */
+#include 
   
   #define MSR_SF_LG	63  /* Enable 64 bit mode */

   #define MSR_ISF_LG   61  /* Interrupt 64b mode valid on 630 */
@@ -1193,6 +1194,20 @@
 : "r" ((unsigned long)(v)) \
 : "memory")
   
+static inline unsigned long mfvtb (void)

+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfspr(SPRN_VTB);
+#endif
+   /*
+* The above mfspr will be a no-op on anything before Power8
+* That can result in random values returned. We need to
+* capture that.
+*/
+   BUG();
+}
+
   #ifdef __powerpc64__
   #if defined(CONFIG_PPC_CELL) || defined(CONFIG_PPC_FSL_BOOK3E)
   #define mftb()   ({unsigned long rval;   
\
diff --git a/arch/powerpc/include/asm/time.h b/arch/powerpc/include/asm/time.h
index 1d428e6007ca..03cbada59d3a 100644
--- a/arch/powerpc/include/asm/time.h
+++ b/arch/powerpc/include/asm/time.h
@@ -102,6 +102,15 @@ static inline u64 get_rtc(void)
return (u64)hi * 10 + lo;
   }
   
+static inline u64 get_vtb(void)

+{
+#ifdef CONFIG_PPC_BOOK3S_64
+   if (cpu_has_feature(CPU_FTR_ARCH_207S))
+   return mfvtb();
+#endif
+   return 0;
+}
+
   #ifdef CONFIG_PPC64
   static inline u64 get_tb(void)
   {
diff --git a/arch/powerpc/kvm/book3s.c b/arch/powerpc/kvm/book3s.c
index 52c654dbd41a..ae43e4178ecd 100644
--- a/arch/powerpc/kvm/book3s.c
+++ b/arch/powerpc/kvm/book3s.c
@@ -646,6 +646,9 @@ int kvm_vcpu_ioctl_get_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
val = get_reg_val(reg->id, vcpu->arch.bescr);
break;
+   case KVM_REG_PPC_VTB:
+   val = get_reg_val(reg->id, vcpu->arch.vtb);
+   break;
default:
r = -EINVAL;
break;
@@ -750,6 +753,9 @@ int kvm_vcpu_ioctl_set_one_reg(struct kvm_vcpu *vcpu, 
struct kvm_one_reg *reg)
case KVM_REG_PPC_BESCR:
vcpu->arch.bescr = set_reg_val(reg->id, val);
break;
+   case KVM_REG_PPC_VTB:
+   vcpu->arch.vtb = set_reg_val(reg->id, val);
+   break;
default:
r = -EINVAL;
break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu->arch.spurr;
break;
+   case SPRN_VTB:
+   *spr_val = vcpu->arch.vtb;

Doesn't this mean that vtb can be the same 2 when the guest reads it 2
times in a row without getting preempted?


But a mfspr will result in VM exit and that would make sure we
update vcpu->arch.vtb with the correct value.


We only call kvmppc_core_vcpu_put_pr() when we context switch away from 
KVM, so it won't be updated, no?



Alex



Re: [RFC PATCH 3/3] PPC, KVM, CMA: use general CMA reserved area management framework

2014-06-05 Thread Aneesh Kumar K.V
Paolo Bonzini  writes:

> Il 03/06/2014 09:02, Michal Nazarewicz ha scritto:
>> On Tue, Jun 03 2014, Joonsoo Kim wrote:
>>> Now, we have general CMA reserved area management framework,
>>> so use it for future maintainabilty. There is no functional change.
>>>
>>> Signed-off-by: Joonsoo Kim 
>>
>> Acked-by: Michal Nazarewicz 
>>
>
> Acked-by: Paolo Bonzini 
>
> Aneesh, can you test this series?

Sorry for the late reply. I will test this and update here.

-aneesh



Re: [PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-06-05 Thread Alex Williamson
On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote:
> The patch adds new IOCTL commands for sPAPR VFIO container device
> to support EEH functionality for PCI devices, which have been passed
> through from host to somebody else via VFIO.
> 
> Signed-off-by: Gavin Shan 
> ---
>  Documentation/vfio.txt  | 87 ++--
>  drivers/vfio/Makefile   |  1 +
>  drivers/vfio/pci/vfio_pci.c | 20 ++---
>  drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++-
>  drivers/vfio/vfio_spapr_eeh.c   | 89 
> +
>  include/linux/vfio.h| 23 ++
>  include/uapi/linux/vfio.h   | 35 +++
>  7 files changed, 262 insertions(+), 10 deletions(-)
>  create mode 100644 drivers/vfio/vfio_spapr_eeh.c
> 
> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
> index b9ca023..3fa4538 100644
> --- a/Documentation/vfio.txt
> +++ b/Documentation/vfio.txt
> @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
> real mode which provides
>  an excellent performance which has limitations such as inability to do
>  locked pages accounting in real time.
>  
> -So 3 additional ioctls have been added:
> +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
> +subtree that can be treated as a unit for the purposes of partitioning and
> +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
> +function of a multi-function IOA, or multiple IOAs (possibly including switch
> +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
> errors
> +and recover from them via EEH RTAS services, which works on the basis of
> +additional ioctl commands.
> +
> +So 4 additional ioctls have been added:
>  
>   VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>   of the DMA window on the PCI bus.
> @@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
>  
>   VFIO_IOMMU_DISABLE - disables the container.
>  
> + VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
> recovery.
>  
>  The code flow from the example above should be slightly changed:
>  
> + struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
> +
>   .
>   /* Add the group to the container */
>   ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
> @@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
> changed:
>   dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
>  
>   /* Check here is .iova/.size are within DMA window from 
> spapr_iommu_info */
> -
>   ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
> - .
> +
> + /* Get a file descriptor for the device */
> + device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, ":06:0d.0");
> +
> + 
> +
> + /* Gratuitous device reset and go... */
> + ioctl(device, VFIO_DEVICE_RESET);
> +
> + /* Make sure EEH is supported */
> + ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
> +
> + /* Enable the EEH functionality on the device */
> + pe_op.op = VFIO_EEH_PE_ENABLE;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /* You're suggested to create additional data struct to represent
> +  * PE, and put child devices belonging to same IOMMU group to the
> +  * PE instance for later reference.
> +  */
> +
> + /* Check the PE's state and make sure it's in functional state */
> + pe_op.op = VFIO_EEH_PE_GET_STATE;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /* Save device state using pci_save_state().
> +  * EEH should be enabled on the specified device.
> +  */
> +
> + 
> +
> + /* When 0xFF's returned from reading PCI config space or IO BARs
> +  * of the PCI device. Check the PE's state to see if that has been
> +  * frozen.
> +  */
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /* Waiting for pending PCI transactions to be completed and don't
> +  * produce any more PCI traffic from/to the affected PE until
> +  * recovery is finished.
> +  */
> +
> + /* Enable IO for the affected PE and collect logs. Usually, the
> +  * standard part of PCI config space, AER registers are dumped
> +  * as logs for further analysis.
> +  */
> + pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /*
> +  * Issue PE reset: hot or fundamental reset. Usually, hot reset
> +  * is enough. However, the firmware of some PCI adapters would
> +  * require fundamental reset.
> +  */
> + pe_op.op = VFIO_EEH_PE_RESET_HOT;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> + pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
> + ioctl(container, VFIO_EEH_PE_OP, &pe_op);
> +
> + /* Configure the PCI bridges for the affected PE */
> + pe_op.op = VFIO_EEH_PE_CONFIGURE;
> + ioctl(container, VF

Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Aneesh Kumar K.V
Alexander Graf  writes:

> On 05.06.14 17:50, Aneesh Kumar K.V wrote:
>> Alexander Graf  writes:
>>
>>> On 05.06.14 14:08, Aneesh Kumar K.V wrote:
 virtual time base register is a per VM, per cpu register that needs
 to be saved and restored on vm exit and entry. Writing to VTB is not
 allowed in the privileged mode.

 Signed-off-by: Aneesh Kumar K.V 

...

break;
 diff --git a/arch/powerpc/kvm/book3s_emulate.c 
 b/arch/powerpc/kvm/book3s_emulate.c
 index 3565e775b61b..1bb16a59dcbc 100644
 --- a/arch/powerpc/kvm/book3s_emulate.c
 +++ b/arch/powerpc/kvm/book3s_emulate.c
 @@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu 
 *vcpu, int sprn, ulong *spr_val
 */
*spr_val = vcpu->arch.spurr;
break;
 +  case SPRN_VTB:
 +  *spr_val = vcpu->arch.vtb;
>>> Doesn't this mean that vtb can be the same value when the guest reads it 2
>>> times in a row without getting preempted?
>>
>> But a mfspr will result in VM exit and that would make sure we
>> update vcpu->arch.vtb with the correct value.
>
> We only call kvmppc_core_vcpu_put_pr() when we context switch away from 
> KVM, so it won't be updated, no?
>
>

kvmppc_copy_from_svcpu is also called from VM exit path (book3s_interrupt.S)

-aneesh



Re: [RFC PATCH 1/3] CMA: generalize CMA reserved area management functionality

2014-06-05 Thread Aneesh Kumar K.V
Joonsoo Kim  writes:

> Currently, there are two users of the CMA functionality: one is the DMA
> subsystem and the other is KVM on powerpc. They each have their own code
> to manage the CMA reserved area even though the two look really similar.
> My guess is that this is caused by differing needs in bitmap management:
> the KVM side wants to maintain the bitmap not per page but at a coarser
> granularity, and it ends up using a bitmap where one bit represents 64 pages.
>
> When I implement CMA related patches, I have to change both of those places
> to apply my change, which is painful. I want to change this situation and
> reduce future code management overhead through this patch.
>
> This change could also help developers who want to use CMA in their
> new feature development, since they can use CMA easily without
> copying & pasting this reserved area management code.
>
> Signed-off-by: Joonsoo Kim 


The way the patches are split makes them difficult to review. Would it be
possible to make the changes against one implementation and then move that
to generic code? That would help in finding out what exactly changed.

>
> diff --git a/drivers/base/Kconfig b/drivers/base/Kconfig
> index 00e13ce..b3fe1cc 100644
> --- a/drivers/base/Kconfig
> +++ b/drivers/base/Kconfig
> @@ -283,7 +283,7 @@ config CMA_ALIGNMENT
>
> If unsure, leave the default value "8".
>
> -config CMA_AREAS
> +config DMA_CMA_AREAS
>   int "Maximum count of the CMA device-private areas"
>   default 7
>   help

For example, the above can be a separate patch along with further DMA related
cleanups.


> diff --git a/drivers/base/dma-contiguous.c b/drivers/base/dma-contiguous.c
> index 83969f8..48cdac8 100644
> --- a/drivers/base/dma-contiguous.c
> +++ b/drivers/base/dma-contiguous.c
> @@ -186,7 +186,7 @@ static int __init cma_activate_area(struct cma *cma)
>   return 0;
>  }
>
> -static struct cma cma_areas[MAX_CMA_AREAS];
> +static struct cma cma_areas[MAX_DMA_CMA_AREAS];
>  static unsigned cma_area_count;
>
>  static int __init cma_init_reserved_areas(void)
> diff --git a/include/linux/cma.h b/include/linux/cma.h
> new file mode 100644
> index 000..60ba06f
> --- /dev/null
> +++ b/include/linux/cma.h
> @@ -0,0 +1,28 @@
> +/*
> + * Contiguous Memory Allocator
> + *
> + * Copyright LG Electronics Inc., 2014
> + * Written by:
> + *   Joonsoo Kim 
> + *
> + * This program is free software; you can redistribute it and/or
> + * modify it under the terms of the GNU General Public License as
> + * published by the Free Software Foundation; either version 2 of the
> + * License or (at your optional) any later version of the license.
> + *
> + */
> +
> +#ifndef __CMA_H__
> +#define __CMA_H__
> +
> +struct cma;
> +
> +extern struct page *cma_alloc(struct cma *cma, unsigned long count,
> + unsigned long align);
> +extern bool cma_release(struct cma *cma, struct page *pages,
> + unsigned long count);
> +extern int __init cma_declare_contiguous(phys_addr_t size, phys_addr_t base,
> + phys_addr_t limit, phys_addr_t alignment,
> + unsigned long bitmap_shift, bool fixed,
> + struct cma **res_cma);
> +#endif
> diff --git a/include/linux/dma-contiguous.h b/include/linux/dma-contiguous.h
> index 772eab5..dfb1dc9 100644
> --- a/include/linux/dma-contiguous.h
> +++ b/include/linux/dma-contiguous.h
> @@ -63,7 +63,7 @@ struct device;
>   * There is always at least global CMA area and a few optional device
>   * private areas configured in kernel .config.
>   */
> -#define MAX_CMA_AREAS(1 + CONFIG_CMA_AREAS)
> +#define MAX_DMA_CMA_AREAS  (1 + CONFIG_DMA_CMA_AREAS)
>
>  extern struct cma *dma_contiguous_default_area;
>
> @@ -123,7 +123,7 @@ bool dma_release_from_contiguous(struct device *dev, 
> struct page *pages,
>
>  #else
>
> -#define MAX_CMA_AREAS(0)
> +#define MAX_DMA_CMA_AREAS(0)
>
>  static inline struct cma *dev_get_cma_area(struct device *dev)
>  {
> diff --git a/mm/Kconfig b/mm/Kconfig
> index 7511b4a..0877ddc 100644
> --- a/mm/Kconfig
> +++ b/mm/Kconfig
> @@ -515,6 +515,17 @@ config CMA_DEBUG
> processing calls such as dma_alloc_from_contiguous().
> This option does not affect warning and error messages.
>
> +config CMA_AREAS
> + int "Maximum count of the CMA areas"
> + depends on CMA
> + default 7
> + help
> +   CMA allows to create CMA areas for particular purpose, mainly,
> +   used as device private area. This parameter sets the maximum
> +   number of CMA area in the system.
> +
> +   If unsure, leave the default value "7".
> +

If we already have 7 declare-contiguous requests, will a KVM CMA allocation
request fail?
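
(For reference, here is a minimal sketch of how a caller might use the
generalized API declared in the new include/linux/cma.h hunk quoted above.
The area name, sizes and bitmap_shift value are made up for illustration;
only the function prototypes come from the patch.)

#include <linux/cma.h>
#include <linux/init.h>
#include <linux/mm.h>
#include <linux/printk.h>

static struct cma *example_cma;		/* hypothetical reserved area */

/* Reserve 256MB anywhere below the default limit; one bitmap bit covers
 * 1 << 6 = 64 pages, matching the KVM-style granularity described above. */
static int __init example_cma_reserve(void)
{
	return cma_declare_contiguous(256UL << 20, 0, 0, 0, 6, false,
				      &example_cma);
}

/* Allocate a naturally aligned run of nr_pages pages from the area. */
static struct page *example_cma_get(unsigned long nr_pages)
{
	return cma_alloc(example_cma, nr_pages, 0);
}

/* Return the pages; cma_release() reports whether they belonged to the area. */
static void example_cma_put(struct page *pages, unsigned long nr_pages)
{
	if (!cma_release(example_cma, pages, nr_pages))
		pr_warn("example: pages were not part of the CMA area\n");
}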

>  config ZBUD
>   tristate
>   default n
> diff --git a/mm/Makefile b/mm/Makefile
> index 1eaa70b..bc0422b 100644
> --- a/mm/Makefile
> +++ b/mm/Makefile
> @@ -62,3 +62,4 @@ obj-$(CONFIG_MEMORY_ISOLATION) += page_isolation.o
>  obj-$(CONFIG_ZBUD)   += zb

Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Alexander Graf


On 05.06.14 19:33, Aneesh Kumar K.V wrote:

Alexander Graf  writes:


On 05.06.14 17:50, Aneesh Kumar K.V wrote:

Alexander Graf  writes:


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V 

...


break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c
index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct kvm_vcpu *vcpu, int 
sprn, ulong *spr_val
 */
*spr_val = vcpu->arch.spurr;
break;
+   case SPRN_VTB:
+   *spr_val = vcpu->arch.vtb;

Doesn't this mean that vtb can be the same value when the guest reads it 2
times in a row without getting preempted?

But a mfspr will result in VM exit and that would make sure we
update vcpu->arch.vtb with the correct value.

We only call kvmppc_core_vcpu_put_pr() when we context switch away from
KVM, so it won't be updated, no?



kvmppc_copy_from_svcpu is also called from VM exit path (book3s_interrupt.S)


... where it will run into this code path:

/*
 * Maybe we were already preempted and synced the svcpu from
 * our preempt notifiers. Don't bother touching this svcpu then.
 */
if (!svcpu->in_use)
goto out;


Alex



Re: [PATCH 1/4] KVM: PPC: BOOK3S: PR: Emulate virtual timebase register

2014-06-05 Thread Alexander Graf


On 06.06.14 00:32, Alexander Graf wrote:


On 05.06.14 19:33, Aneesh Kumar K.V wrote:

Alexander Graf  writes:


On 05.06.14 17:50, Aneesh Kumar K.V wrote:

Alexander Graf  writes:


On 05.06.14 14:08, Aneesh Kumar K.V wrote:

virtual time base register is a per VM, per cpu register that needs
to be saved and restored on vm exit and entry. Writing to VTB is not
allowed in the privileged mode.

Signed-off-by: Aneesh Kumar K.V 

...


break;
diff --git a/arch/powerpc/kvm/book3s_emulate.c 
b/arch/powerpc/kvm/book3s_emulate.c

index 3565e775b61b..1bb16a59dcbc 100644
--- a/arch/powerpc/kvm/book3s_emulate.c
+++ b/arch/powerpc/kvm/book3s_emulate.c
@@ -577,6 +577,9 @@ int kvmppc_core_emulate_mfspr_pr(struct 
kvm_vcpu *vcpu, int sprn, ulong *spr_val

 */
*spr_val = vcpu->arch.spurr;
break;
+case SPRN_VTB:
+*spr_val = vcpu->arch.vtb;
Doesn't this mean that vtb can be the same 2 when the guest reads 
it 2

times in a row without getting preempted?

But a mfspr will result in VM exit and that would make sure we
update vcpu->arch.vtb with the correct value.

We only call kvmppc_core_vcpu_put_pr() when we context switch away from
KVM, so it won't be updated, no?


kvmppc_copy_from_svcpu is also called from VM exit path 
(book3s_interrupt.S)


... where it will run into this code path:

/*
 * Maybe we were already preempted and synced the svcpu from
 * our preempt notifiers. Don't bother touching this svcpu then.
 */
if (!svcpu->in_use)
goto out;


Scratch that. We're always calling this on entry/exit, so you're right.


Alex
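
(Editorial note for readers following the thread: the point being agreed on
is that the exit path, kvmppc_copy_from_svcpu() called from
book3s_interrupt.S, can refresh the cached VTB before any mfspr emulation
runs, so two back-to-back reads will see VTB advance. Purely as an
illustration, and not code from the patch under review, such a refresh could
look roughly like the sketch below; entry_vtb is a hypothetical snapshot
taken at guest entry.)

#include <linux/kvm_host.h>
#include <asm/reg.h>

/* Hypothetical helper: accumulate the VTB that elapsed since guest entry
 * into the cached value returned by the SPRN_VTB mfspr emulation. */
static inline void kvmppc_refresh_vtb_sketch(struct kvm_vcpu *vcpu,
					     u64 entry_vtb)
{
	vcpu->arch.vtb += mfspr(SPRN_VTB) - entry_vtb;
}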



Re: [PATCH 0/3] Prepare for in-kernel VFIO DMA operations acceleration

2014-06-05 Thread Alexey Kardashevskiy
On 06/05/2014 09:57 PM, Alexander Graf wrote:
> 
> On 05.06.14 09:25, Alexey Kardashevskiy wrote:
>> This reserves 2 capability numbers.
>>
>> This implements an extended version of KVM_CREATE_SPAPR_TCE_64 ioctl.
>>
>> Please advise how to proceed with these patches as I suspect that
>> first two should go via Paolo's tree while the last one via Alex Graf's tree
>> (correct?).
> 
> They would just go via my tree, but only be actually allocated (read:
> mergable to qemu) when they hit Paolo's tree.
> 
> In fact, I don't think it makes sense to split them off at all.


So? Are these patches going anywhere? Thanks.


-- 
Alexey


Re: [PATCH v8 2/3] powerpc/eeh: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
On Thu, Jun 05, 2014 at 04:50:04PM +1000, Benjamin Herrenschmidt wrote:
>On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote:
>> +#define EEH_OPT_GET_PE_ADDR0   /* Get PE addr  */
>> +#define EEH_OPT_GET_PE_MODE1   /* Get PE mode  */
>
>I assume that's just some leftover from the previous patches :-)
>
>Don't respin just yet, let's see what other comments come in.
>

Yep, I'll remove them in next revision. Thanks, Ben.

Thanks,
Gavin



Re: [PATCH v8 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
On Thu, Jun 05, 2014 at 11:18:34AM -0600, Alex Williamson wrote:
>On Thu, 2014-06-05 at 16:36 +1000, Gavin Shan wrote:
>> The patch adds new IOCTL commands for sPAPR VFIO container device
>> to support EEH functionality for PCI devices, which have been passed
>> through from host to somebody else via VFIO.
>> 
>> Signed-off-by: Gavin Shan 
>> ---
>>  Documentation/vfio.txt  | 87 
>> ++--
>>  drivers/vfio/Makefile   |  1 +
>>  drivers/vfio/pci/vfio_pci.c | 20 ++---
>>  drivers/vfio/vfio_iommu_spapr_tce.c | 17 ++-
>>  drivers/vfio/vfio_spapr_eeh.c   | 89 
>> +
>>  include/linux/vfio.h| 23 ++
>>  include/uapi/linux/vfio.h   | 35 +++
>>  7 files changed, 262 insertions(+), 10 deletions(-)
>>  create mode 100644 drivers/vfio/vfio_spapr_eeh.c
>> 
>> diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
>> index b9ca023..3fa4538 100644
>> --- a/Documentation/vfio.txt
>> +++ b/Documentation/vfio.txt
>> @@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
>> real mode which provides
>>  an excellent performance which has limitations such as inability to do
>>  locked pages accounting in real time.
>>  
>> -So 3 additional ioctls have been added:
>> +4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
>> +subtree that can be treated as a unit for the purposes of partitioning and
>> +error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
>> +function of a multi-function IOA, or multiple IOAs (possibly including 
>> switch
>> +and bridge structures above the multiple IOAs). PPC64 guests detect PCI 
>> errors
>> +and recover from them via EEH RTAS services, which works on the basis of
>> +additional ioctl commands.
>> +
>> +So 4 additional ioctls have been added:
>>  
>>  VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
>>  of the DMA window on the PCI bus.
>> @@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
>>  
>>  VFIO_IOMMU_DISABLE - disables the container.
>>  
>> +VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
>> recovery.
>>  
>>  The code flow from the example above should be slightly changed:
>>  
>> +struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
>> +
>>  .
>>  /* Add the group to the container */
>>  ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
>> @@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
>> changed:
>>  dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
>>  
>>  /* Check here is .iova/.size are within DMA window from 
>> spapr_iommu_info */
>> -
>>  ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
>> -.
>> +
>> +/* Get a file descriptor for the device */
>> +device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, ":06:0d.0");
>> +
>> +
>> +
>> +/* Gratuitous device reset and go... */
>> +ioctl(device, VFIO_DEVICE_RESET);
>> +
>> +/* Make sure EEH is supported */
>> +ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
>> +
>> +/* Enable the EEH functionality on the device */
>> +pe_op.op = VFIO_EEH_PE_ENABLE;
>> +ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +/* You're suggested to create additional data struct to represent
>> + * PE, and put child devices belonging to same IOMMU group to the
>> + * PE instance for later reference.
>> + */
>> +
>> +/* Check the PE's state and make sure it's in functional state */
>> +pe_op.op = VFIO_EEH_PE_GET_STATE;
>> +ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +/* Save device state using pci_save_state().
>> + * EEH should be enabled on the specified device.
>> + */
>> +
>> +
>> +
>> +/* When 0xFF's returned from reading PCI config space or IO BARs
>> + * of the PCI device. Check the PE's state to see if that has been
>> + * frozen.
>> + */
>> +ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +/* Waiting for pending PCI transactions to be completed and don't
>> + * produce any more PCI traffic from/to the affected PE until
>> + * recovery is finished.
>> + */
>> +
>> +/* Enable IO for the affected PE and collect logs. Usually, the
>> + * standard part of PCI config space, AER registers are dumped
>> + * as logs for further analysis.
>> + */
>> +pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
>> +ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +
>> +/*
>> + * Issue PE reset: hot or fundamental reset. Usually, hot reset
>> + * is enough. However, the firmware of some PCI adapters would
>> + * require fundamental reset.
>> + */
>> +pe_op.op = VFIO_EEH_PE_RESET_HOT;
>> +ioctl(container, VFIO_EEH_PE_OP, &pe_op);
>> +pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
>> +ioctl(container, VFIO_EEH_PE_OP, &pe_

Re: [PATCH v8 0/3] EEH Support for VFIO PCI Device

2014-06-05 Thread Gavin Shan
On Thu, Jun 05, 2014 at 02:54:47PM +0200, Alexander Graf wrote:
>
>On 05.06.14 08:36, Gavin Shan wrote:
>>The series of patches adds support EEH for PCI devices, which are passed
>>through to PowerKVM based guest via VFIO. The implementation is 
>>straightforward
>>based on the issues or problems we have to resolve to support EEH for PowerKVM
>>based guest.
>>
>>- Emulation for EEH RTAS requests. All EEH RTAS requests goes to QEMU firstly.
>>   If QEMU can't handle it, the request will be sent to host via newly 
>> introduced
>>   VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in host kernel.
>>
>>The series of patches requires corresponding QEMU changes.
>
>Acked-by: Alexander Graf 
>

Thanks, Alex :)

>
>Alex
>



[PATCH v9 3/3] drivers/vfio: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
The patch adds new IOCTL commands for the sPAPR VFIO container device
to support EEH functionality for PCI devices which have been passed
through from the host to somebody else via VFIO.

Signed-off-by: Gavin Shan 
Acked-by: Alexander Graf 
---
 Documentation/vfio.txt  | 87 +++--
 drivers/vfio/Makefile   |  1 +
 drivers/vfio/pci/vfio_pci.c | 18 ++--
 drivers/vfio/vfio_iommu_spapr_tce.c | 17 +++-
 drivers/vfio/vfio_spapr_eeh.c   | 87 +
 include/linux/vfio.h| 23 ++
 include/uapi/linux/vfio.h   | 34 +++
 7 files changed, 259 insertions(+), 8 deletions(-)
 create mode 100644 drivers/vfio/vfio_spapr_eeh.c

diff --git a/Documentation/vfio.txt b/Documentation/vfio.txt
index b9ca023..3fa4538 100644
--- a/Documentation/vfio.txt
+++ b/Documentation/vfio.txt
@@ -305,7 +305,15 @@ faster, the map/unmap handling has been implemented in 
real mode which provides
 an excellent performance which has limitations such as inability to do
 locked pages accounting in real time.
 
-So 3 additional ioctls have been added:
+4) According to sPAPR specification, A Partitionable Endpoint (PE) is an I/O
+subtree that can be treated as a unit for the purposes of partitioning and
+error recovery. A PE may be a single or multi-function IOA (IO Adapter), a
+function of a multi-function IOA, or multiple IOAs (possibly including switch
+and bridge structures above the multiple IOAs). PPC64 guests detect PCI errors
+and recover from them via EEH RTAS services, which works on the basis of
+additional ioctl commands.
+
+So 4 additional ioctls have been added:
 
VFIO_IOMMU_SPAPR_TCE_GET_INFO - returns the size and the start
of the DMA window on the PCI bus.
@@ -316,9 +324,12 @@ So 3 additional ioctls have been added:
 
VFIO_IOMMU_DISABLE - disables the container.
 
+   VFIO_EEH_PE_OP - provides an API for EEH setup, error detection and 
recovery.
 
 The code flow from the example above should be slightly changed:
 
+   struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };
+
.
/* Add the group to the container */
ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
@@ -342,9 +353,79 @@ The code flow from the example above should be slightly 
changed:
dma_map.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE;
 
/* Check here is .iova/.size are within DMA window from 
spapr_iommu_info */
-
ioctl(container, VFIO_IOMMU_MAP_DMA, &dma_map);
-   .
+
+   /* Get a file descriptor for the device */
+   device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, ":06:0d.0");
+
+   
+
+   /* Gratuitous device reset and go... */
+   ioctl(device, VFIO_DEVICE_RESET);
+
+   /* Make sure EEH is supported */
+   ioctl(container, VFIO_CHECK_EXTENSION, VFIO_EEH);
+
+   /* Enable the EEH functionality on the device */
+   pe_op.op = VFIO_EEH_PE_ENABLE;
+   ioctl(container, VFIO_EEH_PE_OP, &pe_op);
+
+   /* You're suggested to create additional data struct to represent
+* PE, and put child devices belonging to same IOMMU group to the
+* PE instance for later reference.
+*/
+
+   /* Check the PE's state and make sure it's in functional state */
+   pe_op.op = VFIO_EEH_PE_GET_STATE;
+   ioctl(container, VFIO_EEH_PE_OP, &pe_op);
+
+   /* Save device state using pci_save_state().
+* EEH should be enabled on the specified device.
+*/
+
+   
+
+   /* When 0xFF's returned from reading PCI config space or IO BARs
+* of the PCI device. Check the PE's state to see if that has been
+* frozen.
+*/
+   ioctl(container, VFIO_EEH_PE_OP, &pe_op);
+
+   /* Waiting for pending PCI transactions to be completed and don't
+* produce any more PCI traffic from/to the affected PE until
+* recovery is finished.
+*/
+
+   /* Enable IO for the affected PE and collect logs. Usually, the
+* standard part of PCI config space, AER registers are dumped
+* as logs for further analysis.
+*/
+   pe_op.op = VFIO_EEH_PE_UNFREEZE_IO;
+   ioctl(container, VFIO_EEH_PE_OP, &pe_op);
+
+   /*
+* Issue PE reset: hot or fundamental reset. Usually, hot reset
+* is enough. However, the firmware of some PCI adapters would
+* require fundamental reset.
+*/
+   pe_op.op = VFIO_EEH_PE_RESET_HOT;
+   ioctl(container, VFIO_EEH_PE_OP, &pe_op);
+   pe_op.op = VFIO_EEH_PE_RESET_DEACTIVATE;
+   ioctl(container, VFIO_EEH_PE_OP, &pe_op);
+
+   /* Configure the PCI bridges for the affected PE */
+   pe_op.op = VFIO_EEH_PE_CONFIGURE;
+   ioctl(container, VFIO_EEH_PE_OP, &pe_op);
+
+   /* Restored state we saved at initialization time. pci_restore_state()
+* is good enough as an example.
+*/
+

[PATCH v9 1/3] powerpc/eeh: Avoid event on passed PE

2014-06-05 Thread Gavin Shan
We must not handle EEH errors on devices which are passed to somebody
else. Instead, we expect the owner of the frozen device to detect an EEH
error and recover from it.

This avoids EEH error handling on passed through devices so the device
owner gets a chance to handle them.

Signed-off-by: Gavin Shan 
Acked-by: Alexander Graf 
---
 arch/powerpc/include/asm/eeh.h| 7 +++
 arch/powerpc/kernel/eeh.c | 8 
 arch/powerpc/platforms/powernv/eeh-ioda.c | 3 ++-
 3 files changed, 17 insertions(+), 1 deletion(-)

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 7782056..653d981 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -25,6 +25,7 @@
 #include 
 #include 
 #include 
+#include 
 
 struct pci_dev;
 struct pci_bus;
@@ -84,6 +85,7 @@ struct eeh_pe {
int freeze_count;   /* Times of froze up*/
struct timeval tstamp;  /* Time on first-time freeze*/
int false_positives;/* Times of reported #ff's  */
+   atomic_t pass_dev_cnt;  /* Count of passed through devs */
struct eeh_pe *parent;  /* Parent PE*/
struct list_head child_list;/* Link PE to the child list*/
struct list_head edevs; /* Link list of EEH devices */
@@ -93,6 +95,11 @@ struct eeh_pe {
 #define eeh_pe_for_each_dev(pe, edev, tmp) \
list_for_each_entry_safe(edev, tmp, &pe->edevs, list)
 
+static inline bool eeh_pe_passed(struct eeh_pe *pe)
+{
+   return pe ? !!atomic_read(&pe->pass_dev_cnt) : false;
+}
+
 /*
  * The struct is used to trace EEH state for the associated
  * PCI device node or PCI device. In future, it might
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 9c6b899..3bc8b12 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -400,6 +400,14 @@ int eeh_dev_check_failure(struct eeh_dev *edev)
if (ret > 0)
return ret;
 
+   /*
+* If the PE isn't owned by us, we shouldn't check the
+* state. Instead, let the owner handle it if the PE has
+* been frozen.
+*/
+   if (eeh_pe_passed(pe))
+   return 0;
+
/* If we already have a pending isolation event for this
 * slot, we know it's bad already, we don't need to check.
 * Do this checking under a lock; as multiple PCI devices
diff --git a/arch/powerpc/platforms/powernv/eeh-ioda.c 
b/arch/powerpc/platforms/powernv/eeh-ioda.c
index cab3e62..79193eb 100644
--- a/arch/powerpc/platforms/powernv/eeh-ioda.c
+++ b/arch/powerpc/platforms/powernv/eeh-ioda.c
@@ -892,7 +892,8 @@ static int ioda_eeh_next_error(struct eeh_pe **pe)
opal_pci_eeh_freeze_clear(phb->opal_id, 
frozen_pe_no,
OPAL_EEH_ACTION_CLEAR_FREEZE_ALL);
ret = EEH_NEXT_ERR_NONE;
-   } else if ((*pe)->state & EEH_PE_ISOLATED) {
+   } else if ((*pe)->state & EEH_PE_ISOLATED ||
+  eeh_pe_passed(*pe)) {
ret = EEH_NEXT_ERR_NONE;
} else {
pr_err("EEH: Frozen PHB#%x-PE#%x (%s) 
detected\n",
-- 
1.8.3.2



[PATCH v9 0/3] EEH Support for VFIO PCI Device

2014-06-05 Thread Gavin Shan
The series of patches adds EEH support for PCI devices which are passed
through to a PowerKVM based guest via VFIO. The implementation is straightforward,
based on the issues or problems we have to resolve to support EEH for a PowerKVM
based guest.

- Emulation for EEH RTAS requests. All EEH RTAS requests go to QEMU first.
  If QEMU can't handle a request, it is sent to the host via the newly introduced
  VFIO container IOCTL command (VFIO_EEH_OP) and gets handled in the host kernel.

The series of patches requires corresponding QEMU changes.
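
As a rough userspace-side illustration of the flow above (this is not actual
QEMU code; the chosen op and the error handling are illustrative, while the
structure and ioctl names follow the VFIO_EEH_PE_OP interface added by
patch 3/3):

#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Forward one EEH request that userspace could not satisfy locally to the
 * host kernel through the sPAPR VFIO container file descriptor. */
static int example_forward_eeh_get_state(int container_fd)
{
	struct vfio_eeh_pe_op pe_op = { .argsz = sizeof(pe_op) };

	pe_op.op = VFIO_EEH_PE_GET_STATE;
	/* A negative return value indicates an error; the meaning of a
	 * non-negative result follows the convention defined by the patch. */
	return ioctl(container_fd, VFIO_EEH_PE_OP, &pe_op);
}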

Change log
==
v1 -> v2:
* EEH RTAS requests are routed to QEMU, and then possibly to the host kernel.
  The KVM in-kernel handling mechanism is dropped.
* Error injection is reimplemented based on a syscall, instead of KVM in-kernel
  handling. The logic for error injection token management is moved to
  QEMU. The error injection request is routed to QEMU and then possibly
  to the host kernel.
v2 -> v3:
* Rework the fields in struct eeh_vfio_pci_addr and struct vfio_eeh_info
  based on the comments from Alexey.
* Define macros for EEH VFIO operations (Alexey).
* Clear the frozen state after a successful PE reset.
* Merge the original [PATCH 1/2/3] into one.
v3 -> v4:
* Remove the error injection from the patchset. Mike or I will work on 
that
  later.
* Rename CONFIG_VFIO_EEH to VFIO_PCI_EEH.
* Rename the IOCTL command to VFIO_EEH_OP; it's handled by the VFIO-PCI device
  instead of the VFIO container.
* Rename the IOCTL argument structure to "vfio_eeh_op" accordingly. Also, more
  fields are added to hold return values for RTAS requests.
* The address mapping stuff is totally removed. When opening or releasing a
  VFIO PCI device, a notification is sent to EEH to update the flag indicating
  whether the device is passed to a guest or not.
* Change pr_warn() to pr_debug() to avoid DOS, as pointed out by Alex.W.
* Fix the argument size check issue pointed out by Alex.W.
v4 -> v5:
* Functions for VFIO PCI EEH support are moved to eeh.c and exported from there.
  The VFIO PCI driver just uses those functions to handle the IOCTL command
  VFIO_EEH_OP. All of this is to keep the code organized in a good way, as
  suggested by Alex.G. Another potential benefit is that PowerNV/pSeries share
  "eeh_ops" and the same infrastructure could possibly work for KVM_PR and
  KVM_HV mode at the same time.
* Don't clear the error injection registers after finishing a PE reset, as the
  patchset does nothing related to error injection.
* Amend Documentation/vfio.txt, which was missed in the last revision.
* No QEMU changes for this revision; "v4" works well. Also, remove "RFC" from
  the subject as the design is basically recognized.
v5 -> v6:
* CONFIG_VFIO_PCI_EEH removed; CONFIG_EEH is used instead.
* Split the one ioctl command into 5.
* In eeh.c, descriptions have been added for the exported functions. Also, the
  functions return negative values for errors and carry information in other
  return values. All hard-coded numbers have been replaced by macros defined in
  eeh.h. The comments, including the function names, have been amended not to
  mention "guest" or "vfio".
* Add one mutex to protect the flag in eeh_dev_open()/release().
* Add more information on how to use those ioctl commands to
  Documentation/vfio.txt.
v6 -> v7:
* Remove the ioctl command VFIO_EEH_PE_GET_ADDR; the PE address will be figured
  out in userland (e.g. QEMU), as Alex.G suggested.
* Let the sPAPR VFIO container process the ioctl commands, as a VFIO container
  naturally corresponds to an IOMMU group (aka a PE on the sPAPR platform).
* All VFIO PCI EEH ioctl commands have "argsz+flags" in their companion data
  struct.
* For VFIO PCI EEH ioctl commands, ioctl() returns a negative number to indicate
  an error or zero for success. Additional output information is transported by
  the companion data struct.
* Explain PE in Documentation/vfio.txt; typo fixes and more comments as
  suggested by Alex.G.
* Split/merge patches according to suggestions from Alex.G and Alex.W.
* Add an EEH stub in drivers/vfio/pci/, as suggested by Alex.W.
* Define various EEH options as macros in vfio.h for userland to use.
v7 -> v8:
* Change the ioctl commands back to one combined command.
* EEH related logic was put into drivers/vfio/vfio_eeh.c, which is only built
  with CONFIG_EEH. Otherwise, inline functions defined in include/linux/vfio.h
  are used instead.
* Change vfio.txt according to the source code changes.
* Address various comments from internal reviews by Alexey. Thanks to Alexey.
v8 -> v9:
* Remove unused macros in asm/include/eeh.h
* Missed to disable VFIO device on error fro

[PATCH v9 2/3] powerpc/eeh: EEH support for VFIO PCI device

2014-06-05 Thread Gavin Shan
The patch exports functions to be used by a new VFIO ioctl command,
which will be introduced in a subsequent patch, to support EEH
functionality for VFIO PCI devices.

Signed-off-by: Gavin Shan 
Acked-by: Alexander Graf 
---
 arch/powerpc/include/asm/eeh.h |  12 ++
 arch/powerpc/kernel/eeh.c  | 268 +
 2 files changed, 280 insertions(+)
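
As a rough illustration of the intended consumer (not code from this series;
the dispatcher name and the op decoding are hypothetical, while the exported
helpers below and the VFIO_EEH_PE_OP sub-ops come from patches 2/3 and 3/3):

#include <linux/errno.h>
#include <linux/iommu.h>
#include <linux/types.h>
#include <linux/vfio.h>
#include <asm/eeh.h>

/* Hypothetical dispatcher: translate two of the VFIO_EEH_PE_OP sub-ops
 * into the EEH helpers exported by this patch. */
static long example_eeh_pe_op(struct iommu_group *group, u32 op)
{
	struct eeh_pe *pe = eeh_iommu_group_to_pe(group);

	if (!pe)
		return -ENODEV;

	switch (op) {
	case VFIO_EEH_PE_GET_STATE:
		return eeh_pe_get_state(pe);
	case VFIO_EEH_PE_CONFIGURE:
		return eeh_pe_configure(pe);
	default:
		return -EINVAL;
	}
}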

diff --git a/arch/powerpc/include/asm/eeh.h b/arch/powerpc/include/asm/eeh.h
index 653d981..b733044 100644
--- a/arch/powerpc/include/asm/eeh.h
+++ b/arch/powerpc/include/asm/eeh.h
@@ -173,6 +173,11 @@ enum {
 #define EEH_STATE_DMA_ACTIVE   (1 << 4)/* Active DMA   */
 #define EEH_STATE_MMIO_ENABLED (1 << 5)/* MMIO enabled */
 #define EEH_STATE_DMA_ENABLED  (1 << 6)/* DMA enabled  */
+#define EEH_PE_STATE_NORMAL0   /* Normal state */
+#define EEH_PE_STATE_RESET 1   /* PE reset asserted*/
+#define EEH_PE_STATE_STOPPED_IO_DMA2   /* Frozen PE*/
+#define EEH_PE_STATE_STOPPED_DMA   4   /* Stopped DMA, Enabled IO */
+#define EEH_PE_STATE_UNAVAIL   5   /* Unavailable  */
 #define EEH_RESET_DEACTIVATE   0   /* Deactivate the PE reset  */
 #define EEH_RESET_HOT  1   /* Hot reset*/
 #define EEH_RESET_FUNDAMENTAL  3   /* Fundamental reset*/
@@ -280,6 +285,13 @@ void eeh_add_device_late(struct pci_dev *);
 void eeh_add_device_tree_late(struct pci_bus *);
 void eeh_add_sysfs_files(struct pci_bus *);
 void eeh_remove_device(struct pci_dev *);
+int eeh_dev_open(struct pci_dev *pdev);
+void eeh_dev_release(struct pci_dev *pdev);
+struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group);
+int eeh_pe_set_option(struct eeh_pe *pe, int option);
+int eeh_pe_get_state(struct eeh_pe *pe);
+int eeh_pe_reset(struct eeh_pe *pe, int option);
+int eeh_pe_configure(struct eeh_pe *pe);
 
 /**
  * EEH_POSSIBLE_ERROR() -- test for possible MMIO failure.
diff --git a/arch/powerpc/kernel/eeh.c b/arch/powerpc/kernel/eeh.c
index 3bc8b12..fc90df0 100644
--- a/arch/powerpc/kernel/eeh.c
+++ b/arch/powerpc/kernel/eeh.c
@@ -40,6 +40,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -108,6 +109,9 @@ struct eeh_ops *eeh_ops = NULL;
 /* Lock to avoid races due to multiple reports of an error */
 DEFINE_RAW_SPINLOCK(confirm_error_lock);
 
+/* Lock to protect passed flags */
+static DEFINE_MUTEX(eeh_dev_mutex);
+
 /* Buffer for reporting pci register dumps. Its here in BSS, and
  * not dynamically alloced, so that it ends up in RMO where RTAS
  * can access it.
@@ -1106,6 +1110,270 @@ void eeh_remove_device(struct pci_dev *dev)
edev->mode &= ~EEH_DEV_SYSFS;
 }
 
+/**
+ * eeh_dev_open - Increase count of pass through devices for PE
+ * @pdev: PCI device
+ *
+ * Increase count of passed through devices for the indicated
+ * PE. In the result, the EEH errors detected on the PE won't be
+ * reported. The PE owner will be responsible for detection
+ * and recovery.
+ */
+int eeh_dev_open(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev;
+
+   mutex_lock(&eeh_dev_mutex);
+
+   /* No PCI device ? */
+   if (!pdev)
+   goto out;
+
+   /* No EEH device or PE ? */
+   edev = pci_dev_to_eeh_dev(pdev);
+   if (!edev || !edev->pe)
+   goto out;
+
+   /* Increase PE's pass through count */
+   atomic_inc(&edev->pe->pass_dev_cnt);
+   mutex_unlock(&eeh_dev_mutex);
+
+   return 0;
+out:
+   mutex_unlock(&eeh_dev_mutex);
+   return -ENODEV;
+}
+EXPORT_SYMBOL_GPL(eeh_dev_open);
+
+/**
+ * eeh_dev_release - Decrease count of pass through devices for PE
+ * @pdev: PCI device
+ *
+ * Decrease count of pass through devices for the indicated PE. If
+ * there is no passed through device in PE, the EEH errors detected
+ * on the PE will be reported and handled as usual.
+ */
+void eeh_dev_release(struct pci_dev *pdev)
+{
+   struct eeh_dev *edev;
+
+   mutex_lock(&eeh_dev_mutex);
+
+   /* No PCI device ? */
+   if (!pdev)
+   goto out;
+
+   /* No EEH device ? */
+   edev = pci_dev_to_eeh_dev(pdev);
+   if (!edev || !edev->pe || !eeh_pe_passed(edev->pe))
+   goto out;
+
+   /* Decrease PE's pass through count */
+   atomic_dec(&edev->pe->pass_dev_cnt);
+   WARN_ON(atomic_read(&edev->pe->pass_dev_cnt) < 0);
+out:
+   mutex_unlock(&eeh_dev_mutex);
+}
+EXPORT_SYMBOL(eeh_dev_release);
+
+/**
+ * eeh_iommu_group_to_pe - Convert IOMMU group to EEH PE
+ * @group: IOMMU group
+ *
+ * The routine is called to convert IOMMU group to EEH PE.
+ */
+struct eeh_pe *eeh_iommu_group_to_pe(struct iommu_group *group)
+{
+   struct iommu_table *tbl;
+   struct pci_dev *pdev = NULL;
+   struct eeh_dev *edev;
+   bool found = false;
+
+   /* No IOMMU group ? */
+   if (!group)
+   return NULL;
+
+   /* N