Re: [PATCH v7 03/11] KVM: MMU: fast invalidate all pages

2013-05-26 Thread Gleb Natapov
On Fri, May 24, 2013 at 05:23:07PM -0300, Marcelo Tosatti wrote:
 Hi Xiao,
 
 On Thu, May 23, 2013 at 03:55:52AM +0800, Xiao Guangrong wrote:
  The current kvm_mmu_zap_all is really slow - it is holding mmu-lock to
 walk and zap all shadow pages one by one, also it needs to zap all guest
  page's rmap and all shadow page's parent spte list. Particularly, things
  become worse if guest uses more memory or vcpus. It is not good for
  scalability
  
  In this patch, we introduce a faster way to invalidate all shadow pages.
  KVM maintains a global mmu invalid generation-number which is stored in
  kvm-arch.mmu_valid_gen and every shadow page stores the current global
  generation-number into sp-mmu_valid_gen when it is created
  
  When KVM need zap all shadow pages sptes, it just simply increase the
  global generation-number then reload root shadow pages on all vcpus.
  Vcpu will create a new shadow page table according to current kvm's
  generation-number. It ensures the old pages are not used any more.
  Then the obsolete pages (sp-mmu_valid_gen != kvm-arch.mmu_valid_gen)
  are zapped by using lock-break technique
  
  Signed-off-by: Xiao Guangrong xiaoguangr...@linux.vnet.ibm.com
  ---
   arch/x86/include/asm/kvm_host.h |2 +
   arch/x86/kvm/mmu.c  |   84 
  +++
   arch/x86/kvm/mmu.h  |1 +
   3 files changed, 87 insertions(+), 0 deletions(-)
  
  diff --git a/arch/x86/include/asm/kvm_host.h 
  b/arch/x86/include/asm/kvm_host.h
  index 3741c65..bff7d46 100644
  --- a/arch/x86/include/asm/kvm_host.h
  +++ b/arch/x86/include/asm/kvm_host.h
  @@ -222,6 +222,7 @@ struct kvm_mmu_page {
  int root_count;  /* Currently serving as active root */
  unsigned int unsync_children;
  unsigned long parent_ptes;  /* Reverse mapping for parent_pte */
  +   unsigned long mmu_valid_gen;
  DECLARE_BITMAP(unsync_child_bitmap, 512);
   
   #ifdef CONFIG_X86_32
  @@ -529,6 +530,7 @@ struct kvm_arch {
  unsigned int n_requested_mmu_pages;
  unsigned int n_max_mmu_pages;
  unsigned int indirect_shadow_pages;
  +   unsigned long mmu_valid_gen;
  struct hlist_head mmu_page_hash[KVM_NUM_MMU_PAGES];
  /*
   * Hash table of struct kvm_mmu_page.
  diff --git a/arch/x86/kvm/mmu.c b/arch/x86/kvm/mmu.c
  index f8ca2f3..f302540 100644
  --- a/arch/x86/kvm/mmu.c
  +++ b/arch/x86/kvm/mmu.c
  @@ -1838,6 +1838,11 @@ static void clear_sp_write_flooding_count(u64 *spte)
  __clear_sp_write_flooding_count(sp);
   }
   
  +static bool is_obsolete_sp(struct kvm *kvm, struct kvm_mmu_page *sp)
  +{
  +   return unlikely(sp-mmu_valid_gen != kvm-arch.mmu_valid_gen);
  +}
  +
   static struct kvm_mmu_page *kvm_mmu_get_page(struct kvm_vcpu *vcpu,
   gfn_t gfn,
   gva_t gaddr,
  @@ -1900,6 +1905,7 @@ static struct kvm_mmu_page *kvm_mmu_get_page(struct 
  kvm_vcpu *vcpu,
   
  account_shadowed(vcpu-kvm, gfn);
  }
  +   sp-mmu_valid_gen = vcpu-kvm-arch.mmu_valid_gen;
  init_shadow_page_table(sp);
  trace_kvm_mmu_get_page(sp, true);
  return sp;
  @@ -2070,8 +2076,10 @@ static int kvm_mmu_prepare_zap_page(struct kvm *kvm, 
  struct kvm_mmu_page *sp,
  ret = mmu_zap_unsync_children(kvm, sp, invalid_list);
  kvm_mmu_page_unlink_children(kvm, sp);
  kvm_mmu_unlink_parents(kvm, sp);
  +
  if (!sp-role.invalid  !sp-role.direct)
  unaccount_shadowed(kvm, sp-gfn);
  +
  if (sp-unsync)
  kvm_unlink_unsync_page(kvm, sp);
  if (!sp-root_count) {
  @@ -4195,6 +4203,82 @@ restart:
  spin_unlock(kvm-mmu_lock);
   }
   
  +static void kvm_zap_obsolete_pages(struct kvm *kvm)
  +{
  +   struct kvm_mmu_page *sp, *node;
  +   LIST_HEAD(invalid_list);
  +
  +restart:
  +   list_for_each_entry_safe_reverse(sp, node,
  + kvm-arch.active_mmu_pages, link) {
  +   /*
  +* No obsolete page exists before new created page since
  +* active_mmu_pages is the FIFO list.
  +*/
  +   if (!is_obsolete_sp(kvm, sp))
  +   break;
 
 Can you add a comment to list_add(x, active_mmu_pages) callsites
 mentioning this case?
 
 Because it'll break silently if people do list_add_tail().
 
  +   /*
  +* Do not repeatedly zap a root page to avoid unnecessary
  +* KVM_REQ_MMU_RELOAD, otherwise we may not be able to
  +* progress:
  +*vcpu 0vcpu 1
  +* call vcpu_enter_guest():
  +*1): handle KVM_REQ_MMU_RELOAD
  +*and require mmu-lock to
  +*load mmu
  +* repeat:
  +*1): zap root page and
  +*send KVM_REQ_MMU_RELOAD
  +*
  +*2): if 

[PATCH RFC] KVM: Fix race in apic-pending_events processing

2013-05-26 Thread Gleb Natapov
apic-pending_events processing has a race that may cause INIT and SIPI
processing to be reordered:

vpu0:vcpu1:
set INIT
   test_and_clear_bit(KVM_APIC_INIT)
  process INIT
set INIT
set SIPI
   test_and_clear_bit(KVM_APIC_SIPI)
  process SIPI

At the end INIT is left pending in pending_events. The following patch
tries to fix this using the fact that if INIT comes after SIPI it drops
SIPI from the pending_events, so if pending_events is different after
SIPI is processed it means that INIT was issued after SIPI otherwise
all pending event are processed and pending_events can be reset to zero.
 
Signed-off-by: Gleb Natapov g...@redhat.com
diff --git a/arch/x86/kvm/lapic.c b/arch/x86/kvm/lapic.c
index 9d75193..67686b8 100644
--- a/arch/x86/kvm/lapic.c
+++ b/arch/x86/kvm/lapic.c
@@ -1850,6 +1850,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 {
struct kvm_lapic *apic = vcpu-arch.apic;
unsigned int sipi_vector;
+   unsigned long pe;
 
if (!kvm_vcpu_has_lapic(vcpu))
return;
@@ -1862,7 +1863,8 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
else
vcpu-arch.mp_state = KVM_MP_STATE_INIT_RECEIVED;
}
-   if (test_and_clear_bit(KVM_APIC_SIPI, apic-pending_events) 
+   pe = apic-pending_events;
+   if (test_bit(KVM_APIC_SIPI, pe) 
vcpu-arch.mp_state == KVM_MP_STATE_INIT_RECEIVED) {
/* evaluate pending_events before reading the vector */
smp_rmb();
@@ -1871,6 +1873,7 @@ void kvm_apic_accept_events(struct kvm_vcpu *vcpu)
 vcpu-vcpu_id, sipi_vector);
kvm_vcpu_deliver_sipi_vector(vcpu, sipi_vector);
vcpu-arch.mp_state = KVM_MP_STATE_RUNNABLE;
+   cmpxchg(apic-pending_events, pe, 0);
}
 }
 
--
Gleb.
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


[PATCH v3-resend 00/11] uaccess: better might_sleep/might_fault behavior

2013-05-26 Thread Michael S. Tsirkin
I seem to have mis-sent v3.  Trying again with same patches after
fixing the message id for the cover letter. I hope the duplicates
that are thus created don't inconvenience people too much.
If they do, I apologize.
I have pared down the Cc list to reduce the noise.
sched maintainers are Cc'd on all patches since that's
the tree I aim for with these patches.

This improves the might_fault annotations used
by uaccess routines:

1. The only reason uaccess routines might sleep
   is if they fault. Make this explicit for
   all architectures.
2. a voluntary preempt point in uaccess functions
   means compiler can't inline them efficiently,
   this breaks assumptions that they are very
   fast and small that e.g. net code seems to make.
   remove this preempt point so behaviour
   matches what callers assume.
3. Accesses (e.g through socket ops) to kernel memory
   with KERNEL_DS like net/sunrpc does will never sleep.
   Remove an unconditional might_sleep in the inline
   might_fault in kernel.h
   (used when PROVE_LOCKING is not set).
4. Accesses with pagefault_disable return EFAULT
   but won't cause caller to sleep.
   Check for that and avoid might_sleep when
   PROVE_LOCKING is set.

I'd like these changes to go in for 3.11:
besides a general benefit of improved
consistency and performance, I would also like them
for the vhost driver where we want to call socket ops
under a spinlock, and fall back on slower thread handler
on error.

If the changes look good, would sched maintainers
please consider merging them through sched/core because of the
interaction with the scheduler?

Please review, and consider for 3.11.

Note on arch code updates:
I tested x86_64 code.
Other architectures were build-tested.
I don't have cross-build environment for arm64, tile, microblaze and
mn10300 architectures. arm64 and tile got acks.
The arch changes look generally safe enough
but would appreciate review/acks from arch maintainers.
core changes naturally need acks from sched maintainers.

Version 1 of this change was titled
x86: uaccess s/might_sleep/might_fault/

Changes from v2:
add a patch removing a voluntary preempt point
in uaccess functions when PREEMPT_VOLUNTARY is set.
Addresses comments by Arnd Bergmann,
and Peter Zijlstra.
comment on future possible simplifications in the git log
for the powerpc patch. Addresses a comment
by Arnd Bergmann.

Changes from v1:
add more architectures
fix might_fault() scheduling differently depending
on CONFIG_PROVE_LOCKING, as suggested by Ingo

Michael S. Tsirkin (11):
  asm-generic: uaccess s/might_sleep/might_fault/
  arm64: uaccess s/might_sleep/might_fault/
  frv: uaccess s/might_sleep/might_fault/
  m32r: uaccess s/might_sleep/might_fault/
  microblaze: uaccess s/might_sleep/might_fault/
  mn10300: uaccess s/might_sleep/might_fault/
  powerpc: uaccess s/might_sleep/might_fault/
  tile: uaccess s/might_sleep/might_fault/
  x86: uaccess s/might_sleep/might_fault/
  kernel: drop voluntary schedule from might_fault
  kernel: uaccess in atomic with pagefault_disable

 arch/arm64/include/asm/uaccess.h  |  4 ++--
 arch/frv/include/asm/uaccess.h|  4 ++--
 arch/m32r/include/asm/uaccess.h   | 12 ++--
 arch/microblaze/include/asm/uaccess.h |  6 +++---
 arch/mn10300/include/asm/uaccess.h|  4 ++--
 arch/powerpc/include/asm/uaccess.h| 16 
 arch/tile/include/asm/uaccess.h   |  2 +-
 arch/x86/include/asm/uaccess_64.h |  2 +-
 include/asm-generic/uaccess.h | 10 +-
 include/linux/kernel.h|  7 ++-
 mm/memory.c   | 10 +++---
 11 files changed, 39 insertions(+), 38 deletions(-)

-- 
MST
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm tools: add status notification hook to virtio-mmio

2013-05-26 Thread Sasha Levin
On 05/21/2013 08:29 AM, Marc Zyngier wrote:
 Patch e03b449cbddf (kvm tools: add status notification hook for
 virtio) converted virtio-net to use a new notification mechanism,
 but unfortunately left virtio-mmio behind.
 
 The direct consequence is that both arm and arm64 guests are left
 without any form of networking. Not good.
 
 Bring virtio-mmio into this brave new notified world, and feel the
 power of being able to ping again.
 
 Cc: Sasha Levin sasha.le...@oracle.com
 Cc: Will Deacon will.dea...@arm.com
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com
 ---
  tools/kvm/virtio/mmio.c | 7 ++-
  1 file changed, 6 insertions(+), 1 deletion(-)
 
 diff --git a/tools/kvm/virtio/mmio.c b/tools/kvm/virtio/mmio.c
 index a4e4855..afa2692 100644
 --- a/tools/kvm/virtio/mmio.c
 +++ b/tools/kvm/virtio/mmio.c
 @@ -144,16 +144,21 @@ static void virtio_mmio_config_out(u64 addr, void 
 *data, u32 len,
  struct virtio_device *vdev)
  {
   struct virtio_mmio *vmmio = vdev-virtio;
 + struct kvm *kvm = vmmio-kvm;
   u32 val = 0;
  
   switch (addr) {
   case VIRTIO_MMIO_HOST_FEATURES_SEL:
   case VIRTIO_MMIO_GUEST_FEATURES_SEL:
   case VIRTIO_MMIO_QUEUE_SEL:
 - case VIRTIO_MMIO_STATUS:
   val = ioport__read32(data);
   *(u32 *)(((void *)vmmio-hdr) + addr) = val;
   break;
 + case VIRTIO_MMIO_STATUS:
 + vmmio-hdr.status = ioport__read32(data);
 + if (vdev-ops-notify_status)
 + vdev-ops-notify_status(kvm, vmmio-dev, 
 vmmio-hdr.status);
 + break;
   case VIRTIO_MMIO_GUEST_FEATURES:
   if (vmmio-hdr.guest_features_sel == 0) {
   val = ioport__read32(data);
 

Pekka, could you apply please?

Acked-by: Sasha Levin sasha.le...@oracle.com


Thanks,
Sasha
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH] kvm tools: add status notification hook to virtio-mmio

2013-05-26 Thread Asias He
On Sun, May 26, 2013 at 11:33 PM, Sasha Levin sasha.le...@oracle.com wrote:
 On 05/21/2013 08:29 AM, Marc Zyngier wrote:
 Patch e03b449cbddf (kvm tools: add status notification hook for
 virtio) converted virtio-net to use a new notification mechanism,
 but unfortunately left virtio-mmio behind.

 The direct consequence is that both arm and arm64 guests are left
 without any form of networking. Not good.

 Bring virtio-mmio into this brave new notified world, and feel the
 power of being able to ping again.

 Cc: Sasha Levin sasha.le...@oracle.com
 Cc: Will Deacon will.dea...@arm.com
 Signed-off-by: Marc Zyngier marc.zyng...@arm.com

Acked-by: Asias He asias.he...@gmail.com

 ---
  tools/kvm/virtio/mmio.c | 7 ++-
  1 file changed, 6 insertions(+), 1 deletion(-)

 diff --git a/tools/kvm/virtio/mmio.c b/tools/kvm/virtio/mmio.c
 index a4e4855..afa2692 100644
 --- a/tools/kvm/virtio/mmio.c
 +++ b/tools/kvm/virtio/mmio.c
 @@ -144,16 +144,21 @@ static void virtio_mmio_config_out(u64 addr, void 
 *data, u32 len,
  struct virtio_device *vdev)
  {
   struct virtio_mmio *vmmio = vdev-virtio;
 + struct kvm *kvm = vmmio-kvm;
   u32 val = 0;

   switch (addr) {
   case VIRTIO_MMIO_HOST_FEATURES_SEL:
   case VIRTIO_MMIO_GUEST_FEATURES_SEL:
   case VIRTIO_MMIO_QUEUE_SEL:
 - case VIRTIO_MMIO_STATUS:
   val = ioport__read32(data);
   *(u32 *)(((void *)vmmio-hdr) + addr) = val;
   break;
 + case VIRTIO_MMIO_STATUS:
 + vmmio-hdr.status = ioport__read32(data);
 + if (vdev-ops-notify_status)
 + vdev-ops-notify_status(kvm, vmmio-dev, 
 vmmio-hdr.status);
 + break;
   case VIRTIO_MMIO_GUEST_FEATURES:
   if (vmmio-hdr.guest_features_sel == 0) {
   val = ioport__read32(data);


 Pekka, could you apply please?

 Acked-by: Sasha Levin sasha.le...@oracle.com


 Thanks,
 Sasha
 --
 To unsubscribe from this list: send the line unsubscribe kvm in
 the body of a message to majord...@vger.kernel.org
 More majordomo info at  http://vger.kernel.org/majordomo-info.html



--
Asias
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 03/11] KVM: MMU: fast invalidate all pages

2013-05-26 Thread Xiao Guangrong
Hi Marcelo,

On 05/25/2013 04:23 AM, Marcelo Tosatti wrote:

 +static void kvm_zap_obsolete_pages(struct kvm *kvm)
 +{
 +struct kvm_mmu_page *sp, *node;
 +LIST_HEAD(invalid_list);
 +
 +restart:
 +list_for_each_entry_safe_reverse(sp, node,
 +  kvm-arch.active_mmu_pages, link) {
 +/*
 + * No obsolete page exists before new created page since
 + * active_mmu_pages is the FIFO list.
 + */
 +if (!is_obsolete_sp(kvm, sp))
 +break;
 
 Can you add a comment to list_add(x, active_mmu_pages) callsites
 mentioning this case?
 
 Because it'll break silently if people do list_add_tail().

Sure, I will do it in the next version.

And i totally agree with Gleb's points that reply your other questions
in this patch.

Thank you all!

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH v7 04/11] KVM: MMU: zap pages in batch

2013-05-26 Thread Xiao Guangrong
On 05/25/2013 04:34 AM, Marcelo Tosatti wrote:
 On Thu, May 23, 2013 at 03:55:53AM +0800, Xiao Guangrong wrote:
 Zap at least 10 pages before releasing mmu-lock to reduce the overload
 caused by requiring lock

 After the patch, kvm_zap_obsolete_pages can forward progress anyway,
 so update the comments

 [ It improves kernel building 0.6% ~ 1% ]
 
 Can you please describe the overload in more detail? Under what scenario
 is kernel building improved?

Yes.

The scenario is we do kernel building, meanwhile, repeatedly read PCI rom
every one second.

[
   echo 1  /sys/bus/pci/devices/\:00\:03.0/rom
   cat /sys/bus/pci/devices/\:00\:03.0/rom  /dev/null
]

--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-05-26 Thread Alexey Kardashevskiy
On 05/25/2013 12:45 PM, David Gibson wrote:
 On Wed, May 22, 2013 at 04:06:57PM -0500, Scott Wood wrote:
 On 05/20/2013 10:06:46 PM, Alexey Kardashevskiy wrote:
 diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
 index 8465c2a..da6bf61 100644
 --- a/arch/powerpc/kvm/powerpc.c
 @@ -396,6 +396,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 +++ b/arch/powerpc/kvm/powerpc.c
 break;
 #endif
 case KVM_CAP_SPAPR_MULTITCE:
 +   case KVM_CAP_SPAPR_TCE_IOMMU:
 r = 1;
 break;
 default:

 Don't advertise SPAPR capabilities if it's not book3s -- and
 probably there's some additional limitation that would be
 appropriate.
 
 So, in the case of MULTITCE, that's not quite right.  PR KVM can
 emulate a PAPR system on a BookE machine, and there's no reason not to
 allow TCE acceleration as well.  We can't make it dependent on PAPR
 mode being selected, because that's enabled per-vcpu, whereas these
 capabilities are queried on the VM before the vcpus are created.
 
 CAP_SPAPR_TCE_IOMMU should be dependent on the presence of suitable
 host side hardware (i.e. a PAPR style IOMMU), though.


The capability says that the ioctl is supported. If there is no IOMMU group
registered, then it will fail with a reasonable error and nobody gets hurt.
What is the problem?



 @@ -1025,6 +1026,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
 r = kvm_vm_ioctl_create_spapr_tce(kvm, create_tce);
 goto out;
 }
 +   case KVM_CREATE_SPAPR_TCE_IOMMU: {
 +   struct kvm_create_spapr_tce_iommu create_tce_iommu;
 +   struct kvm *kvm = filp-private_data;
 +
 +   r = -EFAULT;
 +   if (copy_from_user(create_tce_iommu, argp,
 +   sizeof(create_tce_iommu)))
 +   goto out;
 +   r = kvm_vm_ioctl_create_spapr_tce_iommu(kvm,
 create_tce_iommu);
 +   goto out;
 +   }
 #endif /* CONFIG_PPC_BOOK3S_64 */

 #ifdef CONFIG_KVM_BOOK3S_64_HV
 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
 index 5a2afda..450c82a 100644
 --- a/include/uapi/linux/kvm.h
 +++ b/include/uapi/linux/kvm.h
 @@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_SPAPR_MULTITCE (0x11 + 89)
 +#define KVM_CAP_SPAPR_TCE_IOMMU (0x11 + 90)

 Hmm...
 
 Ah, yeah, that needs to be fixed.  Those were interim numbers so that
 we didn't have to keep changing our internal trees as new upstream
 ioctls got added to the list.  We need to get a proper number for the
 merge, though.

 @@ -939,6 +940,9 @@ struct kvm_s390_ucas_mapping {
 #define KVM_GET_DEVICE_ATTR   _IOW(KVMIO,  0xe2, struct
 kvm_device_attr)
 #define KVM_HAS_DEVICE_ATTR   _IOW(KVMIO,  0xe3, struct
 kvm_device_attr)

 +/* ioctl for SPAPR TCE IOMMU */
 +#define KVM_CREATE_SPAPR_TCE_IOMMU _IOW(KVMIO,  0xe4, struct
 kvm_create_spapr_tce_iommu)

 Shouldn't this go under the vm ioctl section?


The KVM_CREATE_SPAPR_TCE_IOMMU ioctl (the version for emulated devices) is
in this section so I decided to keep them together. Wrong?


-- 
Alexey
--
To unsubscribe from this list: send the line unsubscribe kvm in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html


Re: [PATCH 3/4] KVM: PPC: Add support for IOMMU in-kernel handling

2013-05-26 Thread Alexey Kardashevskiy
On 05/25/2013 12:45 PM, David Gibson wrote:
 On Wed, May 22, 2013 at 04:06:57PM -0500, Scott Wood wrote:
 On 05/20/2013 10:06:46 PM, Alexey Kardashevskiy wrote:
 diff --git a/arch/powerpc/kvm/powerpc.c b/arch/powerpc/kvm/powerpc.c
 index 8465c2a..da6bf61 100644
 --- a/arch/powerpc/kvm/powerpc.c
 @@ -396,6 +396,7 @@ int kvm_dev_ioctl_check_extension(long ext)
 +++ b/arch/powerpc/kvm/powerpc.c
 break;
 #endif
 case KVM_CAP_SPAPR_MULTITCE:
 +   case KVM_CAP_SPAPR_TCE_IOMMU:
 r = 1;
 break;
 default:

 Don't advertise SPAPR capabilities if it's not book3s -- and
 probably there's some additional limitation that would be
 appropriate.
 
 So, in the case of MULTITCE, that's not quite right.  PR KVM can
 emulate a PAPR system on a BookE machine, and there's no reason not to
 allow TCE acceleration as well.  We can't make it dependent on PAPR
 mode being selected, because that's enabled per-vcpu, whereas these
 capabilities are queried on the VM before the vcpus are created.
 
 CAP_SPAPR_TCE_IOMMU should be dependent on the presence of suitable
 host side hardware (i.e. a PAPR style IOMMU), though.


The capability says that the ioctl is supported. If there is no IOMMU group
registered, then it will fail with a reasonable error and nobody gets hurt.
What is the problem?



 @@ -1025,6 +1026,17 @@ long kvm_arch_vm_ioctl(struct file *filp,
 r = kvm_vm_ioctl_create_spapr_tce(kvm, create_tce);
 goto out;
 }
 +   case KVM_CREATE_SPAPR_TCE_IOMMU: {
 +   struct kvm_create_spapr_tce_iommu create_tce_iommu;
 +   struct kvm *kvm = filp-private_data;
 +
 +   r = -EFAULT;
 +   if (copy_from_user(create_tce_iommu, argp,
 +   sizeof(create_tce_iommu)))
 +   goto out;
 +   r = kvm_vm_ioctl_create_spapr_tce_iommu(kvm,
 create_tce_iommu);
 +   goto out;
 +   }
 #endif /* CONFIG_PPC_BOOK3S_64 */

 #ifdef CONFIG_KVM_BOOK3S_64_HV
 diff --git a/include/uapi/linux/kvm.h b/include/uapi/linux/kvm.h
 index 5a2afda..450c82a 100644
 --- a/include/uapi/linux/kvm.h
 +++ b/include/uapi/linux/kvm.h
 @@ -667,6 +667,7 @@ struct kvm_ppc_smmu_info {
 #define KVM_CAP_PPC_RTAS 91
 #define KVM_CAP_IRQ_XICS 92
 #define KVM_CAP_SPAPR_MULTITCE (0x11 + 89)
 +#define KVM_CAP_SPAPR_TCE_IOMMU (0x11 + 90)

 Hmm...
 
 Ah, yeah, that needs to be fixed.  Those were interim numbers so that
 we didn't have to keep changing our internal trees as new upstream
 ioctls got added to the list.  We need to get a proper number for the
 merge, though.

 @@ -939,6 +940,9 @@ struct kvm_s390_ucas_mapping {
 #define KVM_GET_DEVICE_ATTR   _IOW(KVMIO,  0xe2, struct
 kvm_device_attr)
 #define KVM_HAS_DEVICE_ATTR   _IOW(KVMIO,  0xe3, struct
 kvm_device_attr)

 +/* ioctl for SPAPR TCE IOMMU */
 +#define KVM_CREATE_SPAPR_TCE_IOMMU _IOW(KVMIO,  0xe4, struct
 kvm_create_spapr_tce_iommu)

 Shouldn't this go under the vm ioctl section?


The KVM_CREATE_SPAPR_TCE_IOMMU ioctl (the version for emulated devices) is
in this section so I decided to keep them together. Wrong?


-- 
Alexey
--
To unsubscribe from this list: send the line unsubscribe kvm-ppc in
the body of a message to majord...@vger.kernel.org
More majordomo info at  http://vger.kernel.org/majordomo-info.html