Re: [PATCH 3/4] arm/arm64: KVM: Fix migration race in the arch timer

2015-03-02 Thread Alex Bennée

Marc Zyngier  writes:

> On Wed, 25 Feb 2015 15:36:21 +
> Alex Bennée  wrote:
>
> Alex, Christoffer,
>

>
> So the first half of the patch looks perfectly OK to me...
>
>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>> index af6a521..3b4ded2 100644
>> --- a/virt/kvm/arm/vgic.c
>> +++ b/virt/kvm/arm/vgic.c
>> @@ -263,6 +263,13 @@ static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
>>          return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
>>  }
>>  
>> +static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
>> +{
>> +        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> +
>> +        return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
>> +}
>> +
>>  static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
>>  {
>>          struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> @@ -285,6 +292,13 @@ static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
>>          vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
>>  }
>>  
>> +static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
>> +{
>> +        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> +
>> +        vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
>> +}
>> +
>>  static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
>>  {
>>          struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>> @@ -634,16 +648,12 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
>>  }
>>  
>>  /**
>> - * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor
>> + * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
>>   * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
>>   *
>> - * Move any pending IRQs that have already been assigned to LRs back to the
>> + * Move any IRQs that have already been assigned to LRs back to the
>>   * emulated distributor state so that the complete emulated state can be read
>>   * from the main emulation structures without investigating the LRs.
>> - *
>> - * Note that IRQs in the active state in the LRs get their pending state moved
>> - * to the distributor but the active state stays in the LRs, because we don't
>> - * track the active state on the distributor side.
>>   */
>>  void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
>>  {
>> @@ -919,7 +929,7 @@ static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
>>  
>>  /*
>>   * Update the interrupt state and determine which CPUs have pending
>> - * interrupts. Must be called with distributor lock held.
>> + * or active interrupts. Must be called with distributor lock held.
>>   */
>>  void vgic_update_state(struct kvm *kvm)
>>  {
>> @@ -1036,6 +1046,25 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
>>          }
>>  }
>>  
>> +static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
>> +                                 int lr_nr, struct vgic_lr vlr)
>> +{
>> +        if (vgic_irq_is_active(vcpu, irq)) {
>> +                vlr.state |= LR_STATE_ACTIVE;
>> +                kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
>> +                vgic_irq_clear_active(vcpu, irq);
>> +                vgic_update_state(vcpu->kvm);
>> +        } else if (vgic_dist_irq_is_pending(vcpu, irq)) {
>> +                vlr.state |= LR_STATE_PENDING;
>> +                kvm_debug("Set pending: 0x%x\n", vlr.state);
>> +        }
>> +
>> +        if (!vgic_irq_is_edge(vcpu, irq))
>> +                vlr.state |= LR_EOI_INT;
>> +
>> +        vgic_set_lr(vcpu, lr_nr, vlr);
>> +}
>> +
>>  /*
>>   * Queue an interrupt to a CPU virtual interface. Return true on success,
>>   * or false if it wasn't possible to queue it.
>> @@ -1063,8 +1092,7 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
>>                  if (vlr.source == sgi_source_id) {
>>                          kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
>>                          BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
>> -                        vlr.state |= LR_STATE_PENDING;
>> -                        vgic_set_lr(vcpu, lr, vlr);
>> +                        vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
>>                          return true;
>>                  }
>>          }
>> @@ -1081,11 +1109,8 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
>>  
>>          vlr.irq = irq;
>>          vlr.source = sgi_source_id;
>> -        vlr.state = LR_STATE_PENDING;
>> -        if (!vgic_irq_is_edge(vcpu, irq))
>> -                vlr.state |= LR_EOI_INT;
>> -
>> -        vgic_set_lr(vcpu, lr, vlr);
>> +        vlr.state = 0;
>> +        vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
>>  
>>          return true;
>>  }
>
>
> ... but this whole vgic rework seems rather out of place, and I can't
> really see its connection with the timer. Isn't it logically part of the
> previous patch?

Probably - I was going to re-factor that code with the original patch,
but it was on the todo list once we had it working. Christoffer then
cleaned it up when he fixed the race, hence it being here.

Would you like it as a separate patch (between 2 and 3) or just rolled
i

Re: [PATCH 3/4] arm/arm64: KVM: Fix migration race in the arch timer

2015-03-02 Thread Marc Zyngier
On 02/03/15 08:50, Alex Bennée wrote:
> 
> Marc Zyngier  writes:
> 
>> On Wed, 25 Feb 2015 15:36:21 +
>> Alex Bennée  wrote:
>>
>> Alex, Christoffer,
>>
> 
>>
>> So the first half of the patch looks perfectly OK to me...
>>
>>> diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
>>> index af6a521..3b4ded2 100644
>>> --- a/virt/kvm/arm/vgic.c
>>> +++ b/virt/kvm/arm/vgic.c
>>> @@ -263,6 +263,13 @@ static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int irq)
>>>          return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
>>>  }
>>>  
>>> +static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
>>> +{
>>> +        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>>> +
>>> +        return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
>>> +}
>>> +
>>>  static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
>>>  {
>>>          struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>>> @@ -285,6 +292,13 @@ static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
>>>          vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
>>>  }
>>>  
>>> +static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
>>> +{
>>> +        struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>>> +
>>> +        vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
>>> +}
>>> +
>>>  static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
>>>  {
>>>          struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
>>> @@ -634,16 +648,12 @@ bool vgic_handle_cfg_reg(u32 *reg, struct kvm_exit_mmio *mmio,
>>>  }
>>>  
>>>  /**
>>> - * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor
>>> + * vgic_unqueue_irqs - move pending/active IRQs from LRs to the distributor
>>>   * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
>>>   *
>>> - * Move any pending IRQs that have already been assigned to LRs back to the
>>> + * Move any IRQs that have already been assigned to LRs back to the
>>>   * emulated distributor state so that the complete emulated state can be read
>>>   * from the main emulation structures without investigating the LRs.
>>> - *
>>> - * Note that IRQs in the active state in the LRs get their pending state moved
>>> - * to the distributor but the active state stays in the LRs, because we don't
>>> - * track the active state on the distributor side.
>>>   */
>>>  void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
>>>  {
>>> @@ -919,7 +929,7 @@ static int compute_pending_for_cpu(struct kvm_vcpu *vcpu)
>>>  
>>>  /*
>>>   * Update the interrupt state and determine which CPUs have pending
>>> - * interrupts. Must be called with distributor lock held.
>>> + * or active interrupts. Must be called with distributor lock held.
>>>   */
>>>  void vgic_update_state(struct kvm *kvm)
>>>  {
>>> @@ -1036,6 +1046,25 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu *vcpu)
>>>          }
>>>  }
>>>  
>>> +static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
>>> +                                 int lr_nr, struct vgic_lr vlr)
>>> +{
>>> +        if (vgic_irq_is_active(vcpu, irq)) {
>>> +                vlr.state |= LR_STATE_ACTIVE;
>>> +                kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
>>> +                vgic_irq_clear_active(vcpu, irq);
>>> +                vgic_update_state(vcpu->kvm);
>>> +        } else if (vgic_dist_irq_is_pending(vcpu, irq)) {
>>> +                vlr.state |= LR_STATE_PENDING;
>>> +                kvm_debug("Set pending: 0x%x\n", vlr.state);
>>> +        }
>>> +
>>> +        if (!vgic_irq_is_edge(vcpu, irq))
>>> +                vlr.state |= LR_EOI_INT;
>>> +
>>> +        vgic_set_lr(vcpu, lr_nr, vlr);
>>> +}
>>> +
>>>  /*
>>>   * Queue an interrupt to a CPU virtual interface. Return true on success,
>>>   * or false if it wasn't possible to queue it.
>>> @@ -1063,8 +1092,7 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
>>>                  if (vlr.source == sgi_source_id) {
>>>                          kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
>>>                          BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
>>> -                        vlr.state |= LR_STATE_PENDING;
>>> -                        vgic_set_lr(vcpu, lr, vlr);
>>> +                        vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
>>>                          return true;
>>>                  }
>>>          }
>>> @@ -1081,11 +1109,8 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 sgi_source_id, int irq)
>>>  
>>>          vlr.irq = irq;
>>>          vlr.source = sgi_source_id;
>>> -        vlr.state = LR_STATE_PENDING;
>>> -        if (!vgic_irq_is_edge(vcpu, irq))
>>> -                vlr.state |= LR_EOI_INT;
>>> -
>>> -        vgic_set_lr(vcpu, lr, vlr);
>>> +        vlr.state = 0;
>>> +        vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
>>>  
>>>          return true;
>>>  }
>>
>>
>> ... but this whole vgic rework seems rather out of place, and I can't
>> really see its connection with the timer. Isn't it logically part of the
>> previous patch?
> 
> Probably - I was going to re-factor that code with the original patch
> but it was on the todo list once we had it working. C

RE: [v3 23/26] KVM: Update Posted-Interrupts Descriptor when vCPU is preempted

2015-03-02 Thread Wu, Feng


> -Original Message-
> From: Marcelo Tosatti [mailto:mtosa...@redhat.com]
> Sent: Tuesday, February 24, 2015 6:22 AM
> To: Wu, Feng
> Cc: t...@linutronix.de; mi...@redhat.com; h...@zytor.com; x...@kernel.org;
> g...@kernel.org; pbonz...@redhat.com; dw...@infradead.org;
> j...@8bytes.org; alex.william...@redhat.com; jiang@linux.intel.com;
> eric.au...@linaro.org; linux-ker...@vger.kernel.org;
> io...@lists.linux-foundation.org; kvm@vger.kernel.org
> Subject: Re: [v3 23/26] KVM: Update Posted-Interrupts Descriptor when vCPU
> is preempted
> 
> On Fri, Dec 12, 2014 at 11:14:57PM +0800, Feng Wu wrote:
> > This patch updates the Posted-Interrupts Descriptor when vCPU
> > is preempted.
> >
> > sched out:
> > - Set 'SN' to suppress future non-urgent interrupts posted for
> > the vCPU.
> 
> What wakes the vcpu in the case of a non-urgent interrupt, then?

Here we set 'SN' when the vCPU's state transitions from running to
runnable (waiting in the runqueue); once the vCPU is chosen to run
again, 'SN' is cleared. So there is no need to wake it up explicitly.

> 
> I wonder how is software suppose to configure the urgent/non-urgent
> flag. Can you give examples of (hypothetical) urgent and non-urgent
> interrupts.

Well, the urgent/non-urgent flag is supported in hardware; I think the
original purpose of urgent interrupts is real-time usage. When such an
urgent interrupt happens, we can change the behavior of the scheduler
and make the related vCPU run immediately. However, from software's
point of view, we don't have a clear picture of which interrupts should
be urgent and how to configure them, so we don't support this currently.

> 
> > sched in:
> > - Clear 'SN'
> > - Change NDST if vCPU is scheduled to a different CPU
> > - Set 'NV' to POSTED_INTR_VECTOR
> 
> What about:
> 
> POSTED_INTR_VECTOR interrupt handler:
> - Wakeup vcpu.
- If the vCPU is still running (not preempted), we don't need
to wake it up.
- In the POSTED_INTR_VECTOR interrupt handler, it is a little hard
to get the vCPU-related information; even if we can get it, it is not
accurate, and the lookup may hurt performance (it needs a search).

> - Set 'SN' to suppress future interrupts.
We only need to set 'SN' while the vCPU is waiting on the runqueue,
so setting 'SN' in this handler does not seem like a good idea.

> 
> HLT emulation entry:
> - Clear 'SN' to receive VT-d interrupt notification.
> 
> > Signed-off-by: Feng Wu 
> > ---
> >  arch/x86/kvm/vmx.c | 44 
> >  1 file changed, 44 insertions(+)
> >
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index ee3b735..bf2e6cd 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -1916,10 +1916,54 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu, int cpu)
> > vmcs_writel(HOST_IA32_SYSENTER_ESP, sysenter_esp); /* 22.2.3 */
> > vmx->loaded_vmcs->cpu = cpu;
> > }
> > +
> > +   if (irq_remapping_cap(IRQ_POSTING_CAP)) {
> > +   struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > +   struct pi_desc old, new;
> > +   unsigned int dest;
> > +
> > +   memset(&old, 0, sizeof(old));
> > +   memset(&new, 0, sizeof(new));
> > +
> > +   do {
> > +   old.control = new.control = pi_desc->control;
> > +   if (vcpu->cpu != cpu) {
> > +   dest = cpu_physical_id(cpu);
> > +
> > +   if (x2apic_enabled())
> > +   new.ndst = dest;
> > +   else
> > +   new.ndst = (dest << 8) & 0xFF00;
> > +   }
> > +
> > +   pi_clear_sn(&new);
> > +
> > +   /* set 'NV' to 'notification vector' */
> > +   new.nv = POSTED_INTR_VECTOR;
> > +   } while (cmpxchg(&pi_desc->control, old.control,
> > +   new.control) != old.control);
> > +   }
> >  }
> >
> >  static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
> >  {
> > +   if (irq_remapping_cap(IRQ_POSTING_CAP)) {
> > +   struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > +   struct pi_desc old, new;
> > +
> > +   memset(&old, 0, sizeof(old));
> > +   memset(&new, 0, sizeof(new));
> > +
> > +   /* Set SN when the vCPU is preempted */
> > +   if (vcpu->preempted) {
> > +   do {
> > +   old.control = new.control = pi_desc->control;
> > +   pi_set_sn(&new);
> > +   } while (cmpxchg(&pi_desc->control, old.control,
> > +   new.control) != old.control);
> > +   }
> > +   }
> > +
> > __vmx_load_host_state(to_vmx(vcpu));
> > if (!vmm_exclusive) {
> > __loaded_vmcs_clear(to_vmx(vcpu)->loaded_vmcs);
> > --
> > 1.9.1
> >

Re: [PATCH resend v5 6/6] KVM: nVMX: Enable nested posted interrupt processing

2015-03-02 Thread Yong Wang
On Fri, Feb 27, 2015 at 04:56:06PM +0800, Wincy Van wrote:
> On Sun, Feb 15, 2015 at 2:27 PM, Yong Wang  
> wrote:
> >
> > Wincy, our QA found a regression with this patch: a 64bit L2 linux guest
> > fails to boot up when running nested kvm on kvm.
> >
> > Environment:
> > 
> > Host OS (ia32/ia32e/IA64):ia32e
> > Guest OS (ia32/ia32e/IA64):ia32e
> > Guest OS Type (Linux/Windows):Linux
> > kvm.git Commit:6557bada461afeaa920a189fae2cff7c8fdce39f
> > qemu.kvm Commit:5c697ae74170d43928cb185f5ac1a9058adcae0b
> > Host Kernel Version:3.19.0-rc3
> > Hardware:Ivytown_EP, Haswell_EP
> >
> >
> > Bug detailed description:
> > --
> > create a 64bit linux guest as L2 guest; the guest fails to boot up
> >
> > note:
> > 1. create a 32bit linux guest as L2 guest, the guest boots up fine.
> > 2. create a 64bit windows guest as L2 guest, the guest boots up fine.
> > 3. this should be a kernel bug:
> > kvm   + qemu = result
> > 6557bada  + 5c697ae7 = bad
> > 8fff5e37  + 5c697ae7 = good
> >
> > Reproduce steps:
> > 
> > 1 create L1 guest:
> > qemu-system-x86_64 -enable-kvm -m 8G -smp 4 -net 
> > nic,macaddr=00:12:31:34:51:31 -net tap,script=/etc/kvm/qemu-ifup 
> > nested-kvm.qcow -cpu host
> >
> > 2. create L2 guest
> > qemu-system-x86_64 -enable-kvm -m 2G -smp 2 -net none rhel6u5.qcow
> >
> > Current result:
> > 
> > create a 64bit linux guest as L2 guest; the guest fails to boot up
> >
> > Expected result:
> > 
> > create 64bit linux guest as L2 guest, the guest boots up fine
> >
> > Please take a look.
> >
> 
> Yong, according to the logs, I found that L1 may have disabled x2apic,
> and the MSR_BITMAP field will then be modified by the subsequent
> vmx_set_efer call in prepare_vmcs02.
> So I think we can fix this issue by:
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index f7b20b4..f6e3457 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -2168,7 +2168,10 @@ static void vmx_set_msr_bitmap(struct kvm_vcpu *vcpu)
>  {
> unsigned long *msr_bitmap;
> 
> -   if (irqchip_in_kernel(vcpu->kvm) && 
> apic_x2apic_mode(vcpu->arch.apic)) {
> +   if (is_guest_mode(vcpu))
> +   msr_bitmap = vmx_msr_bitmap_nested;
> +   else if (irqchip_in_kernel(vcpu->kvm) &&
> +   apic_x2apic_mode(vcpu->arch.apic)) {
> if (is_long_mode(vcpu))
> msr_bitmap = vmx_msr_bitmap_longmode_x2apic;
> else
> 
> 

Our QA verified that your patch fixed the issue. Please prepare a formal patch
that Paolo can consider applying. Thanks a lot Wincy!



Re: virtio balloon: do not call blocking ops when !TASK_RUNNING

2015-03-02 Thread Rusty Russell
Thomas Huth  writes:
> On Thu, 26 Feb 2015 11:50:42 +1030
> Rusty Russell  wrote:
>
>> Thomas Huth  writes:
>> >  Hi all,
>> >
>> > with the recent kernel 3.19, I get a kernel warning when I start my
>> > KVM guest on s390 with virtio balloon enabled:
>> 
>> The deeper problem is that virtio_ccw_get_config just silently fails on
>> OOM.
>> 
>> Neither get_config nor set_config are expected to fail.
>
> AFAIK this is currently not a problem. According to
> http://lwn.net/Articles/627419/ these kmalloc calls never
> fail because they allocate less than a page.

I strongly suggest you unlearn that fact.

The fix for this is in two parts:

1) Annotate using sched_annotate_sleep() and add a comment: we may spin
   a few times in low memory situations, but this isn't a high
   performance path.

2) Handle get_config (and other) failure in some more elegant way.
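
To make (1) concrete, here is a minimal sketch of where such an annotation
could sit, assuming the config read happens from the balloon thread's
wait_event condition (the placement and surrounding code are my assumption,
not the actual patch; names follow drivers/virtio/virtio_balloon.c):

static inline s64 towards_target(struct virtio_balloon *vb)
{
        u32 num_pages;

        /*
         * The config read below may block (on s390 it allocates memory and
         * waits for channel I/O), and this helper can be called from a
         * wait_event_interruptible() condition, i.e. with task state
         * != TASK_RUNNING.  Annotate the nested sleep: we may spin a few
         * times under memory pressure, but this is not a hot path.
         */
        sched_annotate_sleep();

        virtio_cread(vb->vdev, struct virtio_balloon_config, num_pages,
                     &num_pages);

        return (s64)num_pages - vb->num_pages;
}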

Cheers,
Rusty.


[PATCH stable 3.12, 3.14] KVM: MIPS: Don't leak FPU/DSP to guest

2015-03-02 Thread James Hogan
[ Upstream commit f798217dfd038af981a18bbe4bc57027a08bb182 ]

The FPU and DSP are enabled via the CP0 Status CU1 and MX bits by
kvm_mips_set_c0_status() on a guest exit, presumably in case there is
active state that needs saving if pre-emption occurs. However neither of
these bits are cleared again when returning to the guest.

This effectively gives the guest access to the FPU/DSP hardware after
the first guest exit even though it is not aware of its presence,
allowing FP instructions in guest user code to intermittently actually
execute instead of trapping into the guest OS for emulation. It will
then read & manipulate the hardware FP registers which technically
belong to the user process (e.g. QEMU), or are stale from another user
process. It can also crash the guest OS by causing an FP exception, for
which a guest exception handler won't have been registered.

First let's save and disable the FPU (and MSA) state with lose_fpu(1)
before entering the guest. This simplifies the problem, especially for
when guest FPU/MSA support is added in the future, and prevents FR=1 FPU
state being live when the FR bit gets cleared for the guest, which
according to the architecture causes the contents of the FPU and vector
registers to become UNPREDICTABLE.

We can then safely remove the enabling of the FPU in
kvm_mips_set_c0_status(), since there should never be any active FPU or
MSA state to save at pre-emption, which should plug the FPU leak.

DSP state is always live rather than being lazily restored, so for that
it is simpler to just clear the MX bit again when re-entering the guest.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Ralf Baechle 
Cc: Sanjay Lal 
Cc: Gleb Natapov 
Cc: kvm@vger.kernel.org
Cc: linux-m...@linux-mips.org
Cc:  # v3.10+: 044f0f03eca0: MIPS: KVM: Deliver guest 
interrupts
Cc:  # v3.10+: 3ce465e04bfd: MIPS: Export FP functions 
used by lose_fpu(1) for KVM
Cc:  # v3.10+
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
---
This should apply to stable trees 3.12 and 3.14, but not 3.10. The files
had been renamed since v3.14 so it cherry-picked cleanly but the patch
didn't apply cleanly. I've also added a reference to the "MIPS: Export
FP functions used by lose_fpu(1) for KVM" commit which is itself marked
for stable, but is needed to avoid a build failure when KVM=m.
---
 arch/mips/kvm/kvm_locore.S | 2 +-
 arch/mips/kvm/kvm_mips.c   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index bbace092ad0a..03a2db58b22d 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -428,7 +428,7 @@ __kvm_mips_return_to_guest:
/* Setup status register for running guest in UM */
.set at
or  v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-   and v1, v1, ~ST0_CU0
+   and v1, v1, ~(ST0_CU0 | ST0_MX)
.set noat
mtc0 v1, CP0_STATUS
ehb
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index 28838f1a6c1a..897c605263f2 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -418,6 +419,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
vcpu->mmio_needed = 0;
}
 
+   lose_fpu(1);
+
local_irq_disable();
/* Check if we have any exceptions/interrupts pending */
kvm_mips_deliver_interrupts(vcpu,
@@ -1021,9 +1024,6 @@ void kvm_mips_set_c0_status(void)
 {
uint32_t status = read_c0_status();
 
-   if (cpu_has_fpu)
-   status |= (ST0_CU1);
-
if (cpu_has_dsp)
status |= (ST0_MX);
 
-- 
2.0.5



[PATCH stable 3.10] KVM: MIPS: Don't leak FPU/DSP to guest

2015-03-02 Thread James Hogan
[ Upstream commit f798217dfd038af981a18bbe4bc57027a08bb182 ]

The FPU and DSP are enabled via the CP0 Status CU1 and MX bits by
kvm_mips_set_c0_status() on a guest exit, presumably in case there is
active state that needs saving if pre-emption occurs. However neither of
these bits are cleared again when returning to the guest.

This effectively gives the guest access to the FPU/DSP hardware after
the first guest exit even though it is not aware of its presence,
allowing FP instructions in guest user code to intermittently actually
execute instead of trapping into the guest OS for emulation. It will
then read & manipulate the hardware FP registers which technically
belong to the user process (e.g. QEMU), or are stale from another user
process. It can also crash the guest OS by causing an FP exception, for
which a guest exception handler won't have been registered.

First let's save and disable the FPU (and MSA) state with lose_fpu(1)
before entering the guest. This simplifies the problem, especially for
when guest FPU/MSA support is added in the future, and prevents FR=1 FPU
state being live when the FR bit gets cleared for the guest, which
according to the architecture causes the contents of the FPU and vector
registers to become UNPREDICTABLE.

We can then safely remove the enabling of the FPU in
kvm_mips_set_c0_status(), since there should never be any active FPU or
MSA state to save at pre-emption, which should plug the FPU leak.

DSP state is always live rather than being lazily restored, so for that
it is simpler to just clear the MX bit again when re-entering the guest.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Ralf Baechle 
Cc: Sanjay Lal 
Cc: Gleb Natapov 
Cc: kvm@vger.kernel.org
Cc: linux-m...@linux-mips.org
Cc:  # v3.10+: 044f0f03eca0: MIPS: KVM: Deliver guest 
interrupts
Cc:  # v3.10+: 3ce465e04bfd: MIPS: Export FP functions 
used by lose_fpu(1) for KVM
Cc:  # v3.10+
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
---
This should apply to stable tree 3.10. In addition to the files being
renamed since v3.14, kvm_locore.S was reformatted slightly between v3.10
and v3.12. I've also added a reference to the "MIPS: Export FP functions
used by lose_fpu(1) for KVM" commit which is itself marked for stable,
but is needed to avoid a build failure when KVM=m.
---
 arch/mips/kvm/kvm_locore.S | 2 +-
 arch/mips/kvm/kvm_mips.c   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index dca2aa665993..920b63210806 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -431,7 +431,7 @@ __kvm_mips_return_to_guest:
 /* Setup status register for running guest in UM */
 .set at
 or v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-and v1, v1, ~ST0_CU0
+and v1, v1, ~(ST0_CU0 | ST0_MX)
 .set noat
 mtc0 v1, CP0_STATUS
 ehb
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index f957a8ac979b..843ec38fec7b 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -413,6 +414,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
vcpu->mmio_needed = 0;
}
 
+   lose_fpu(1);
+
local_irq_disable();
/* Check if we have any exceptions/interrupts pending */
kvm_mips_deliver_interrupts(vcpu,
@@ -1017,9 +1020,6 @@ void kvm_mips_set_c0_status(void)
 {
uint32_t status = read_c0_status();
 
-   if (cpu_has_fpu)
-   status |= (ST0_CU1);
-
if (cpu_has_dsp)
status |= (ST0_MX);
 
-- 
2.0.5



Re: virtio balloon: do not call blocking ops when !TASK_RUNNING

2015-03-02 Thread Michael S. Tsirkin
On Mon, Mar 02, 2015 at 10:37:26AM +1030, Rusty Russell wrote:
> Thomas Huth  writes:
> > On Thu, 26 Feb 2015 11:50:42 +1030
> > Rusty Russell  wrote:
> >
> >> Thomas Huth  writes:
> >> >  Hi all,
> >> >
> >> > with the recent kernel 3.19, I get a kernel warning when I start my
> >> > KVM guest on s390 with virtio balloon enabled:
> >> 
> >> The deeper problem is that virtio_ccw_get_config just silently fails on
> >> OOM.
> >> 
> >> Neither get_config nor set_config are expected to fail.
> >
> > AFAIK this is currently not a problem. According to
> > http://lwn.net/Articles/627419/ these kmalloc calls never
> > fail because they allocate less than a page.
> 
> I strongly suggest you unlearn that fact.
> The fix for this is in two parts:
> 
> 1) Annotate using sched_annotate_sleep() and add a comment: we may spin
>a few times in low memory situations, but this isn't a high
>performance path.
> 
> 2) Handle get_config (and other) failure in some more elegant way.
> 
> Cheers,
> Rusty.

I agree, but I'd like to point out that even without kmalloc,
on s390 get_config is blocking - it's waiting
for a hardware interrupt.

And it makes sense: config is not data path, I don't think
we should spin there.

So I think besides these two parts, we still need my two patches:
virtio-balloon: do not call blocking ops when !TASK_RUNNING
virtio_console: avoid config access from irq
in 4.0.

agree?




[patch added to the 3.12 stable tree] KVM: MIPS: Don't leak FPU/DSP to guest

2015-03-02 Thread Jiri Slaby
From: James Hogan 

This patch has been added to the 3.12 stable tree. If you have any
objections, please let us know.

===

[ Upstream commit f798217dfd038af981a18bbe4bc57027a08bb182 ]

The FPU and DSP are enabled via the CP0 Status CU1 and MX bits by
kvm_mips_set_c0_status() on a guest exit, presumably in case there is
active state that needs saving if pre-emption occurs. However neither of
these bits are cleared again when returning to the guest.

This effectively gives the guest access to the FPU/DSP hardware after
the first guest exit even though it is not aware of its presence,
allowing FP instructions in guest user code to intermittently actually
execute instead of trapping into the guest OS for emulation. It will
then read & manipulate the hardware FP registers which technically
belong to the user process (e.g. QEMU), or are stale from another user
process. It can also crash the guest OS by causing an FP exception, for
which a guest exception handler won't have been registered.

First let's save and disable the FPU (and MSA) state with lose_fpu(1)
before entering the guest. This simplifies the problem, especially for
when guest FPU/MSA support is added in the future, and prevents FR=1 FPU
state being live when the FR bit gets cleared for the guest, which
according to the architecture causes the contents of the FPU and vector
registers to become UNPREDICTABLE.

We can then safely remove the enabling of the FPU in
kvm_mips_set_c0_status(), since there should never be any active FPU or
MSA state to save at pre-emption, which should plug the FPU leak.

DSP state is always live rather than being lazily restored, so for that
it is simpler to just clear the MX bit again when re-entering the guest.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Ralf Baechle 
Cc: Sanjay Lal 
Cc: Gleb Natapov 
Cc: kvm@vger.kernel.org
Cc: linux-m...@linux-mips.org
Cc:  # v3.10+: 044f0f03eca0: MIPS: KVM: Deliver guest 
interrupts
Cc:  # v3.10+: 3ce465e04bfd: MIPS: Export FP functions 
used by lose_fpu(1) for KVM
Cc:  # v3.10+
Signed-off-by: Paolo Bonzini 
Signed-off-by: James Hogan 
Signed-off-by: Jiri Slaby 
---
 arch/mips/kvm/kvm_locore.S | 2 +-
 arch/mips/kvm/kvm_mips.c   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index bbace092ad0a..03a2db58b22d 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -428,7 +428,7 @@ __kvm_mips_return_to_guest:
/* Setup status register for running guest in UM */
.set at
or  v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-   and v1, v1, ~ST0_CU0
+   and v1, v1, ~(ST0_CU0 | ST0_MX)
.set noat
mtc0 v1, CP0_STATUS
ehb
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index 016f163b42da..2cb24788a8a6 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -417,6 +418,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct kvm_run *run)
vcpu->mmio_needed = 0;
}
 
+   lose_fpu(1);
+
local_irq_disable();
/* Check if we have any exceptions/interrupts pending */
kvm_mips_deliver_interrupts(vcpu,
@@ -1021,9 +1024,6 @@ void kvm_mips_set_c0_status(void)
 {
uint32_t status = read_c0_status();
 
-   if (cpu_has_fpu)
-   status |= (ST0_CU1);
-
if (cpu_has_dsp)
status |= (ST0_MX);
 
-- 
2.3.0



Re: virtio balloon: do not call blocking ops when !TASK_RUNNING

2015-03-02 Thread Cornelia Huck
On Mon, 2 Mar 2015 12:13:58 +0100
"Michael S. Tsirkin"  wrote:

> On Mon, Mar 02, 2015 at 10:37:26AM +1030, Rusty Russell wrote:
> > Thomas Huth  writes:
> > > On Thu, 26 Feb 2015 11:50:42 +1030
> > > Rusty Russell  wrote:
> > >
> > >> Thomas Huth  writes:
> > >> >  Hi all,
> > >> >
> > >> > with the recent kernel 3.19, I get a kernel warning when I start my
> > >> > KVM guest on s390 with virtio balloon enabled:
> > >> 
> > >> The deeper problem is that virtio_ccw_get_config just silently fails on
> > >> OOM.
> > >> 
> > >> Neither get_config nor set_config are expected to fail.
> > >
> > > AFAIK this is currently not a problem. According to
> > > http://lwn.net/Articles/627419/ these kmalloc calls never
> > > fail because they allocate less than a page.
> > 
> > I strongly suggest you unlearn that fact.
> > The fix for this is in two parts:
> > 
> > 1) Annotate using sched_annotate_sleep() and add a comment: we may spin
> >a few times in low memory situations, but this isn't a high
> >performance path.
> > 
> > 2) Handle get_config (and other) failure in some more elegant way.

Do you mean we need to enable the caller to deal with get_config
failures (and the transport to relay those failures)? I agree with that.

> > 
> > Cheers,
> > Rusty.
> 
> I agree, but I'd like to point out that even without kmalloc,
> on s390 get_config is blocking - it's waiting
> for a hardware interrupt.
> 
> And it makes sense: config is not data path, I don't think
> we should spin there.
> 
> So I think besides these two parts, we still need my two patches:
> virtio-balloon: do not call blocking ops when !TASK_RUNNING
> virtio_console: avoid config access from irq
> in 4.0.
> 
> agree?

I agree that we need those fixes as well.



Re: Qemu and virtio 1.0

2015-03-02 Thread Michael S. Tsirkin
On Wed, Feb 25, 2015 at 02:50:22PM +1030, Rusty Russell wrote:
> OK, I am trying to experiment with virtio 1.0 support using the
> latest kernel and MST's qemu tree:
> 
> https://git.kernel.org/cgit/virt/kvm/mst/qemu.git/?h=virtio-1.0
> 
> The first issue is that the device config endian was wrong (see
> attached patch).
> 
> I'm now setting up a BE guest on my x86 laptop, and a BE and LE guest
> on a BE powerpc machine, to check that all combinations work correctly.
> If others test too, that would be appreciated!
> 
> Cheers,
> Rusty.

Thanks a lot for finding this!
The issue is certainly there, though I think looking
at guest features is not the right thing to do:
drivers can access config before acking features.

At least for PCI, it's very simple: we have a
separate memory region for modern devices, we
should just use a different accessor, not virtio_config_readw
and friends.
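
As an illustration of that direction (a sketch only, not the patch being
referred to below; the function name is an assumption), a modern-region
accessor would drop the feature check entirely and always read little-endian:

uint32_t virtio_config_modern_readw(VirtIODevice *vdev, uint32_t addr)
{
    VirtioDeviceClass *k = VIRTIO_DEVICE_GET_CLASS(vdev);
    uint16_t val;

    if (addr + sizeof(val) > vdev->config_len) {
        return (uint32_t)-1;
    }

    k->get_config(vdev, vdev->config);

    /* The virtio 1.0 config region is always little-endian. */
    val = lduw_le_p(vdev->config + addr);
    return val;
}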

Untested patch sent (sorry about the untested part, a bit busy right now).


> From 95ac91554ed602f856a2a5fcc25eaffcad1b1c8d Mon Sep 17 00:00:00 2001
> From: Rusty Russell 
> Date: Tue, 24 Feb 2015 14:47:44 +1030
> Subject: [PATCH] virtio_config_write*/virtio_config_read*: Don't endian swap
>  for virtio 1.0.
> 
> Signed-off-by: Rusty Russell 
> 
> diff --git a/hw/virtio/virtio.c b/hw/virtio/virtio.c
> index 079944c..882a31b 100644
> --- a/hw/virtio/virtio.c
> +++ b/hw/virtio/virtio.c
> @@ -662,7 +662,12 @@ uint32_t virtio_config_readw(VirtIODevice *vdev, uint32_t addr)
>  
>  k->get_config(vdev, vdev->config);
>  
> -val = lduw_p(vdev->config + addr);
> +/* Virtio 1.0 is always LE */
> +if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
> +val = lduw_le_p(vdev->config + addr);
> +} else {
> +val = lduw_p(vdev->config + addr);
> +}
>  return val;
>  }
>  
> @@ -677,7 +682,12 @@ uint32_t virtio_config_readl(VirtIODevice *vdev, uint32_t addr)
>  
>  k->get_config(vdev, vdev->config);
>  
> -val = ldl_p(vdev->config + addr);
> +/* Virtio 1.0 is always LE */
> +if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
> +val = ldl_le_p(vdev->config + addr);
> +} else {
> +val = ldl_p(vdev->config + addr);
> +}
>  return val;
>  }
>  
> @@ -706,7 +716,12 @@ void virtio_config_writew(VirtIODevice *vdev, uint32_t addr, uint32_t data)
>  return;
>  }
>  
> -stw_p(vdev->config + addr, val);
> +/* Virtio 1.0 is always LE */
> +if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
> +stw_le_p(vdev->config + addr, val);
> +} else {
> +stw_p(vdev->config + addr, val);
> +}
>  
>  if (k->set_config) {
>  k->set_config(vdev, vdev->config);
> @@ -722,7 +737,12 @@ void virtio_config_writel(VirtIODevice *vdev, uint32_t addr, uint32_t data)
>  return;
>  }
>  
> -stl_p(vdev->config + addr, val);
> +/* Virtio 1.0 is always LE */
> +if (virtio_has_feature(vdev, VIRTIO_F_VERSION_1)) {
> +stl_le_p(vdev->config + addr, val);
> +} else {
> +stl_p(vdev->config + addr, val);
> +}
>  
>  if (k->set_config) {
>  k->set_config(vdev, vdev->config);


Re: virtio balloon: do not call blocking ops when !TASK_RUNNING

2015-03-02 Thread Michael S. Tsirkin
On Mon, Mar 02, 2015 at 12:31:06PM +0100, Cornelia Huck wrote:
> On Mon, 2 Mar 2015 12:13:58 +0100
> "Michael S. Tsirkin"  wrote:
> 
> > On Mon, Mar 02, 2015 at 10:37:26AM +1030, Rusty Russell wrote:
> > > Thomas Huth  writes:
> > > > On Thu, 26 Feb 2015 11:50:42 +1030
> > > > Rusty Russell  wrote:
> > > >
> > > >> Thomas Huth  writes:
> > > >> >  Hi all,
> > > >> >
> > > >> > with the recent kernel 3.19, I get a kernel warning when I start my
> > > >> > KVM guest on s390 with virtio balloon enabled:
> > > >> 
> > > >> The deeper problem is that virtio_ccw_get_config just silently fails on
> > > >> OOM.
> > > >> 
> > > >> Neither get_config nor set_config are expected to fail.
> > > >
> > > > AFAIK this is currently not a problem. According to
> > > > http://lwn.net/Articles/627419/ these kmalloc calls never
> > > > fail because they allocate less than a page.
> > > 
> > > I strongly suggest you unlearn that fact.
> > > The fix for this is in two parts:
> > > 
> > > 1) Annotate using sched_annotate_sleep() and add a comment: we may spin
> > >a few times in low memory situations, but this isn't a high
> > >performance path.
> > > 
> > > 2) Handle get_config (and other) failure in some more elegant way.
> 
> Do you mean we need to enable the caller to deal with get_config
> failures (and the transport to relay those failures)? I agree with that.

We can certainly tweak the code to bypass the need to kmalloc
in get_config.

Why is it doing these allocs? What's wrong with using
vcdev->config directly?


> > > 
> > > Cheers,
> > > Rusty.
> > 
> > I agree, but I'd like to point out that even without kmalloc,
> > on s390 get_config is blocking - it's waiting
> > for a hardware interrupt.
> > 
> > And it makes sense: config is not data path, I don't think
> > we should spin there.
> > 
> > So I think besides these two parts, we still need my two patches:
> > virtio-balloon: do not call blocking ops when !TASK_RUNNING
> > virtio_console: avoid config access from irq
> > in 4.0.
> > 
> > agree?
> 
> I agree that we need those fixes as well.


Re: Qemu and virtio 1.0

2015-03-02 Thread Cornelia Huck
On Mon, 2 Mar 2015 12:43:43 +0100
"Michael S. Tsirkin"  wrote:

> On Wed, Feb 25, 2015 at 02:50:22PM +1030, Rusty Russell wrote:
> > OK, I am trying to experiment with virtio 1.0 support using the
> > latest kernel and MST's qemu tree:
> > 
> > https://git.kernel.org/cgit/virt/kvm/mst/qemu.git/?h=virtio-1.0
> > 
> > The first issue is that the device config endian was wrong (see
> > attached patch).
> > 
> > I'm now setting up a BE guest on my x86 laptop, and a BE and LE guest
> > on a BE powerpc machine, to check that all combinations work correctly.
> > If others test too, that would be appreciated!
> > 
> > Cheers,
> > Rusty.
> 
> Thanks a lot for finding this!
> The issue is certainly there, though I think looking
> at guest features is not the right thing to do:
> drivers can access config before acking features.

Ah right. I'm just wondering what the device-specific accessors (in net
and so on) will do?



Re: virtio balloon: do not call blocking ops when !TASK_RUNNING

2015-03-02 Thread Cornelia Huck
On Mon, 2 Mar 2015 12:46:57 +0100
"Michael S. Tsirkin"  wrote:

> On Mon, Mar 02, 2015 at 12:31:06PM +0100, Cornelia Huck wrote:
> > On Mon, 2 Mar 2015 12:13:58 +0100
> > "Michael S. Tsirkin"  wrote:
> > 
> > > On Mon, Mar 02, 2015 at 10:37:26AM +1030, Rusty Russell wrote:
> > > > Thomas Huth  writes:
> > > > > On Thu, 26 Feb 2015 11:50:42 +1030
> > > > > Rusty Russell  wrote:
> > > > >
> > > > >> Thomas Huth  writes:
> > > > >> >  Hi all,
> > > > >> >
> > > > >> > with the recent kernel 3.19, I get a kernel warning when I start my
> > > > >> > KVM guest on s390 with virtio balloon enabled:
> > > > >> 
> > > > >> The deeper problem is that virtio_ccw_get_config just silently fails 
> > > > >> on
> > > > >> OOM.
> > > > >> 
> > > > >> Neither get_config nor set_config are expected to fail.
> > > > >
> > > > > AFAIK this is currently not a problem. According to
> > > > > http://lwn.net/Articles/627419/ these kmalloc calls never
> > > > > fail because they allocate less than a page.
> > > > 
> > > > I strongly suggest you unlearn that fact.
> > > > The fix for this is in two parts:
> > > > 
> > > > 1) Annotate using sched_annotate_sleep() and add a comment: we may spin
> > > >a few times in low memory situations, but this isn't a high
> > > >performance path.
> > > > 
> > > > 2) Handle get_config (and other) failure in some more elegant way.
> > 
> > Do you mean we need to enable the caller to deal with get_config
> > failures (and the transport to relay those failures)? I agree with that.
> 
> We can certainly tweak code to bypass need to kmalloc
> on get_config.
> 
> Why is it doing these allocs? What's wrong with using
> vcdev->config directly?

We'd need to make sure that vcdev->config is allocated with GFP_DMA, as
we need it to be under 2G. And we need to be more careful wrt
serialization, especially if we want to reuse the ccw structure as
well, for example. Nothing complicated, I'd just need some free time to
do it :)
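
For illustration, the persistent buffer could be set up roughly like this (a
sketch under the assumptions above; the config_buf field and the helper name
are placeholders, not the actual driver code):

static int virtio_ccw_alloc_config_buf(struct virtio_ccw_device *vcdev)
{
        /* GFP_DMA keeps the buffer below 2G so channel I/O can address it. */
        vcdev->config_buf = kzalloc(VIRTIO_CCW_CONFIG_SIZE,
                                    GFP_DMA | GFP_KERNEL);
        return vcdev->config_buf ? 0 : -ENOMEM;
}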

The more likely reason for get_config to fail is a device hotunplug,
however. We'll get a separate notification about that (via machine
check + channel report), but it would be nice if we could stop poking
the device immediately, as there's no use trying to do something with
it anymore.



Re: virtio balloon: do not call blocking ops when !TASK_RUNNING

2015-03-02 Thread Michael S. Tsirkin
On Mon, Mar 02, 2015 at 01:11:02PM +0100, Cornelia Huck wrote:
> On Mon, 2 Mar 2015 12:46:57 +0100
> "Michael S. Tsirkin"  wrote:
> 
> > On Mon, Mar 02, 2015 at 12:31:06PM +0100, Cornelia Huck wrote:
> > > On Mon, 2 Mar 2015 12:13:58 +0100
> > > "Michael S. Tsirkin"  wrote:
> > > 
> > > > On Mon, Mar 02, 2015 at 10:37:26AM +1030, Rusty Russell wrote:
> > > > > Thomas Huth  writes:
> > > > > > On Thu, 26 Feb 2015 11:50:42 +1030
> > > > > > Rusty Russell  wrote:
> > > > > >
> > > > > >> Thomas Huth  writes:
> > > > > >> >  Hi all,
> > > > > >> >
> > > > > >> > with the recent kernel 3.19, I get a kernel warning when I start 
> > > > > >> > my
> > > > > >> > KVM guest on s390 with virtio balloon enabled:
> > > > > >> 
> > > > > >> The deeper problem is that virtio_ccw_get_config just silently 
> > > > > >> fails on
> > > > > >> OOM.
> > > > > >> 
> > > > > >> Neither get_config nor set_config are expected to fail.
> > > > > >
> > > > > > AFAIK this is currently not a problem. According to
> > > > > > http://lwn.net/Articles/627419/ these kmalloc calls never
> > > > > > fail because they allocate less than a page.
> > > > > 
> > > > > I strongly suggest you unlearn that fact.
> > > > > The fix for this is in two parts:
> > > > > 
> > > > > 1) Annotate using sched_annotate_sleep() and add a comment: we may 
> > > > > spin
> > > > >a few times in low memory situations, but this isn't a high
> > > > >performance path.
> > > > > 
> > > > > 2) Handle get_config (and other) failure in some more elegant way.
> > > 
> > > Do you mean we need to enable the caller to deal with get_config
> > > failures (and the transport to relay those failures)? I agree with that.
> > 
> > We can certainly tweak code to bypass need to kmalloc
> > on get_config.
> > 
> > Why is it doing these allocs? What's wrong with using
> > vcdev->config directly?
> 
> We'd need to make sure that vcdev->config is allocated with GFP_DMA, as
> we need it to be under 2G. And we need to be more careful wrt
> serialization, especially if we want to reuse the ccw structure as
> well, for example. Nothing complicated, I'd just need some free time to
> do it :)
> 
> The more likely reason for get_config to fail is a device hotunplug,
> however. We'll get a seperate notification about that (via machine
> check + channel report), but it would be nice if we could stop poking
> the device immediately, as there's no use trying to do something with
> it anymore.

Normally, hotunplug requires guest cooperation.
IOW unplug request should send guest interrupt,
then block until guest confirms it's not using the
device anymore.
virtio pci already handles that fine, can't ccw
do something similar?

-- 
MST


Re: virtio balloon: do not call blocking ops when !TASK_RUNNING

2015-03-02 Thread Cornelia Huck
On Mon, 2 Mar 2015 13:19:43 +0100
"Michael S. Tsirkin"  wrote:

> On Mon, Mar 02, 2015 at 01:11:02PM +0100, Cornelia Huck wrote:
> > On Mon, 2 Mar 2015 12:46:57 +0100
> > "Michael S. Tsirkin"  wrote:
> > 
> > > On Mon, Mar 02, 2015 at 12:31:06PM +0100, Cornelia Huck wrote:
> > > > On Mon, 2 Mar 2015 12:13:58 +0100
> > > > "Michael S. Tsirkin"  wrote:
> > > > 
> > > > > On Mon, Mar 02, 2015 at 10:37:26AM +1030, Rusty Russell wrote:
> > > > > > Thomas Huth  writes:
> > > > > > > On Thu, 26 Feb 2015 11:50:42 +1030
> > > > > > > Rusty Russell  wrote:
> > > > > > >
> > > > > > >> Thomas Huth  writes:
> > > > > > >> >  Hi all,
> > > > > > >> >
> > > > > > >> > with the recent kernel 3.19, I get a kernel warning when I 
> > > > > > >> > start my
> > > > > > >> > KVM guest on s390 with virtio balloon enabled:
> > > > > > >> 
> > > > > > >> The deeper problem is that virtio_ccw_get_config just silently 
> > > > > > >> fails on
> > > > > > >> OOM.
> > > > > > >> 
> > > > > > >> Neither get_config nor set_config are expected to fail.
> > > > > > >
> > > > > > > AFAIK this is currently not a problem. According to
> > > > > > > http://lwn.net/Articles/627419/ these kmalloc calls never
> > > > > > > fail because they allocate less than a page.
> > > > > > 
> > > > > > I strongly suggest you unlearn that fact.
> > > > > > The fix for this is in two parts:
> > > > > > 
> > > > > > 1) Annotate using sched_annotate_sleep() and add a comment: we may 
> > > > > > spin
> > > > > >a few times in low memory situations, but this isn't a high
> > > > > >performance path.
> > > > > > 
> > > > > > 2) Handle get_config (and other) failure in some more elegant way.
> > > > 
> > > > Do you mean we need to enable the caller to deal with get_config
> > > > failures (and the transport to relay those failures)? I agree with that.
> > > 
> > > We can certainly tweak code to bypass need to kmalloc
> > > on get_config.
> > > 
> > > Why is it doing these allocs? What's wrong with using
> > > vcdev->config directly?
> > 
> > We'd need to make sure that vcdev->config is allocated with GFP_DMA, as
> > we need it to be under 2G. And we need to be more careful wrt
> > serialization, especially if we want to reuse the ccw structure as
> > well, for example. Nothing complicated, I'd just need some free time to
> > do it :)
> > 
> > The more likely reason for get_config to fail is a device hotunplug,
> > however. We'll get a seperate notification about that (via machine
> > check + channel report), but it would be nice if we could stop poking
> > the device immediately, as there's no use trying to do something with
> > it anymore.
> 
> Normally, hotunplug requires guest cooperation.
> IOW unplug request should send guest interrupt,
> then block until guest confirms it's not using the
> device anymore.
> virtio pci already handles that fine, can't ccw
> do something similar?

Hotunplug for channel devices does not require guest feedback. (In
fact, I was surprised to hear that there is something like guest
cooperation on other platforms.) Basically, the guest is simply
presented with the fact that the device is gone and has to deal with
it. It does not matter whether the device was removed by operator
request or due to a hardware failure.

(We do have support in the s390 channel device core to be able to deal
with devices going away and coming back gracefully. ccw devices can be
put into a special state where they retain their configuration so that
they can be reactivated if they become available again. For example,
dasd (disk) devices survive being detached and reattached just fine,
even under I/O load. See the ->notify() callback of the ccw driver for
details.)
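
For readers unfamiliar with that interface, a stripped-down sketch of such a
->notify() handler (loosely modelled on what dasd does; the return-value
semantics described in the comments are my understanding, so treat this as
illustrative only):

static int example_ccw_notify(struct ccw_device *cdev, int event)
{
        switch (event) {
        case CIO_GONE:          /* device detached (hotunplug or failure) */
        case CIO_NO_PATH:       /* no usable path to the device left */
                /* non-zero: keep the configuration, wait for the device */
                return 1;
        case CIO_OPER:          /* device is operational again */
                /* resume I/O here */
                return 1;
        default:
                return 0;       /* let the common code unregister the device */
        }
}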



[PATCH v3 00/16] s390x cpu model implementation

2015-03-02 Thread Michael Mueller
This patch set, in combination with its kernel kvm patch set, proposes an
implementation of S390 cpu models. The origin of this item is to provide
a means for management interfaces like libvirt to decide whether live
guest migration to a target hypervisor is reasonable.

A migration constraint is that the target hypervisor is capable of running a
guest with the same S390 cpu model as the source hypervisor does. To
verify this condition, the administration interface employs the existing
QMP command "query-cpu-definitions", which returns a list of all currently
supported S390 cpu models of a given host system. Together with the newly
defined QMP command "query-cpu-model", which returns the currently active
S390 cpu model of a guest, a conclusion can be drawn as to whether a
migration is possible.

An S390 cpu model is defined as a triple of machine type, cpu facility set
and IBC value. Each historic, current and future triple receives a name
composed of the machine type and its general availability counter. This name
forms the cpu model name (e.g. "2817-ga2").

By means of the Instruction Blocking Control (IBC) feature, the instruction
set available to a given guest can be limited.

Details:
- The QMP command query-cpu-model returns the active cpu model and the
  accelerator it is using:

  {"name":"2066-ga1","accelerator":"kvm"}

Or just the empty model in case an accelerator does not implement cpu
models yet:

  {}

- A management instance like libvirt may probe, by means of the QMP command
  query-cpu-definitions, which models are defined and usable for all
  supporting accelerators. To implement this, the cpu definition info type gets
  an optional field named 'accelerators' which holds a list defining
  which cpu models are 'runnable' and, in addition, which one is the 'default'
  cpu model (i.e. the model to be used in the 'host' case).

  [{"name":"2964-ga1",
"accelerators":[{"name":"kvm","runnable":false,"default":false}]}

Or just 'host' in case an accelerator does not implement cpu models yet:

  [{"name":"host"}]

- For accel=kvm the cpu model initialization takes place in kvm_arch_init()

What's currently a little bit unclear to me is how to best initialize the
various accelerators for machine 'none'. I played around with different
options and finally came up with the following suggestion:

Introduce a QEMU "probe mode" that gets entered in case the current machine
is "none" and no specific accelerator is requested on the cmd line. When
in that mode, loop through a list of accelerators in configure_accelerator
and invoke all their init methods once; the last accelerator to init shall
be tcg (a rough sketch follows below).

In cpu model context that allows the S390 CPU classes to be initialized for
each single accelerator which supports it. Hence the callback for
query-cpu-definitions can populate its answer string according to the
above sketched extended CpuDefinitionInfo type for multiple accelerators.
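
A rough sketch of that probe loop, purely to illustrate the idea (probe_order
and accel_init_for_probe() are hypothetical names, not existing QEMU
interfaces):

static const char *const probe_order[] = { "kvm", "xen", "tcg" }; /* tcg last */

static void probe_all_accelerators(MachineState *machine)
{
    size_t i;

    for (i = 0; i < ARRAY_SIZE(probe_order); i++) {
        /*
         * Initialize each accelerator once so that, for example, the S390
         * CPU classes can be populated per accelerator before
         * query-cpu-definitions is answered; failures are ignored while
         * probing.
         */
        accel_init_for_probe(probe_order[i], machine);
    }
}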

v2-v3:
- using GTK-Doc style format now for function descriptions
- typo fixed (2/16)
- gen-facilities now used to generate cpu model specific facility lists
  and the qemu side facility mask during build time (5/16)
- gen-facilities added to make magic (5/16)
- element of struct S390CPUMachineProps now statically in cpu class (6/16)
- element of struct S390CPUProcessorProps now statically in cpu class (6/16)
- facility list also static now (6/16)
- typo fixed (7/16)
- zBC12-ga1 model now active on zEC12-ga2 host (11/16)
- operations on facility lists use QEMU bitmap API now (11/16)
- routine s390_cpu_model_init() introduced, called during cpu object
  realization to prepare the current accelerator (12/16) if a cpu
  model was selected
- missing comment added in description of CpuModelInfo type (13/16)
- accelerator field now mandatory for "query-cpu-model" (13/16)
- sorted list related comment to "query-cpu-definitions" dropped in
  commit message (13/16)
- comment for AccelCpuInfo type updated (13/16)
- routine s390_facility_test() factored out (15/16)

v1-v2:
- QEMU-side facility list mask introduced: this allows enabling guest
  facilities that are handled by instruction interception handlers
  implemented on the qemu side, similar to the facilities enabled by means
  of the KVM-side facility list mask, which are handled by kvm/kernel.
- Concept of soft facilities has been dropped
- Result type of QMP command query-cpu-definitions extended to hold
  additional information besides the cpu model name, including which
  cpu model is runnable in the current accelerator and machine context.

Michael Mueller (16):
  Introduce probe mode for machine type none
  Introduce option --probe to switch into probe mode
  Introduce stub routine cpu_desc_avail
  target-s390x: Introduce cpu facilities
  target-s390x: Generate facility defines per cpu model
  target-s390x: Introduce cpu models
  target-s390x: Define cpu model specific facility lists
  target-s390x: Add cpu model alias definition routines
  target-s390x: Update linux-headers/asm-s390/kvm.h
  target-s390x

[PATCH v3 15/16] target-s390x: Introduce facility test routine

2015-03-02 Thread Michael Mueller
The patch introduces routine s390_facility_test() which allows to
verify a specific facility bit is set.

Signed-off-by: Michael Mueller 
---
 target-s390x/cpu-models.c | 30 ++
 target-s390x/cpu-models.h |  1 +
 2 files changed, 31 insertions(+)

diff --git a/target-s390x/cpu-models.c b/target-s390x/cpu-models.c
index 7ad61df..5c4eac5 100644
--- a/target-s390x/cpu-models.c
+++ b/target-s390x/cpu-models.c
@@ -698,3 +698,33 @@ bool s390_cpu_models_used(void)
 {
 return cpu_models_used;
 }
+
+static inline int test_facility(uint64_t *fac_list, uint16_t nr)
+{
+uint16_t word = nr / BITS_PER_LONG;
+uint16_t be_bit = (BITS_PER_LONG - 1) - (nr % BITS_PER_LONG);
+
+return (nr < FAC_LIST_CPU_S390_SIZE_UINT1) ?
+(fac_list[word] >> be_bit) & __UINT64_C(1) : 0;
+}
+
+/**
+ * s390_test_facility:
+ * @nr: facility bit number to test
+ * @cc: cpu class to test
+ *
+ * The function tests if the cpu facility identified by bit @nr is available
+ * to the cpu class @cc.
+ *
+ * Returns: a boolean value.
+ *
+ * Since: 2.3
+ */
+bool s390_test_facility(S390CPUClass *cc, uint16_t nr)
+{
+if (!cc) {
+return false;
+}
+return test_facility(cc->fac_list, nr) ? true : false;
+}
+
diff --git a/target-s390x/cpu-models.h b/target-s390x/cpu-models.h
index 3605aa4..557f5e5 100644
--- a/target-s390x/cpu-models.h
+++ b/target-s390x/cpu-models.h
@@ -116,6 +116,7 @@ bool s390_cpu_classes_initialized(void);
 bool s390_probe_mode(void);
 void s390_cpu_model_init(S390CPUClass *cc);
 bool s390_cpu_models_used(void);
+bool s390_test_facility(S390CPUClass *cc, uint16_t nr);
 
 /*
  * bits 0-7   : CMOS generation
-- 
1.8.3.1
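
As a side note, the facility bit numbering used by test_facility() above is
big-endian within each 64-bit word; a tiny standalone illustration (not part
of the patch):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint64_t fac_list[2] = { 0, 0 };

    /* Facility bit 0 is the most significant bit of word 0 ... */
    fac_list[0] |= UINT64_C(1) << 63;
    /* ... and facility bit 65 maps to word 1, bit position 63 - 1 = 62. */
    fac_list[1] |= UINT64_C(1) << 62;

    printf("facility 0:  %d\n", (int)((fac_list[0] >> 63) & 1));
    printf("facility 65: %d\n", (int)((fac_list[1] >> 62) & 1));
    return 0;
}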



[PATCH v3 16/16] target-s390x: Enable cpu model usage

2015-03-02 Thread Michael Mueller
This patch enables QEMU to instantiate S390 CPUs with cpu model types.

Signed-off-by: Michael Mueller 
---
 hw/s390x/s390-virtio.c | 12 +++-
 target-s390x/helper.c  |  9 ++---
 2 files changed, 13 insertions(+), 8 deletions(-)

diff --git a/hw/s390x/s390-virtio.c b/hw/s390x/s390-virtio.c
index 412e49b..0bf7632 100644
--- a/hw/s390x/s390-virtio.c
+++ b/hw/s390x/s390-virtio.c
@@ -32,6 +32,7 @@
 #include "hw/virtio/virtio.h"
 #include "hw/sysbus.h"
 #include "sysemu/kvm.h"
+#include "sysemu/cpus.h"
 #include "exec/address-spaces.h"
 
 #include "hw/s390x/s390-virtio-bus.h"
@@ -153,7 +154,12 @@ void s390_init_cpus(const char *cpu_model, uint8_t 
*storage_keys)
 int i;
 
 if (cpu_model == NULL) {
-cpu_model = "host";
+cpu_model = "none";
+}
+
+if (is_help_option(cpu_model)) {
+list_cpus(stdout, &fprintf, cpu_model);
+exit(0);
 }
 
 ipi_states = g_malloc(sizeof(S390CPU *) * smp_cpus);
@@ -163,6 +169,10 @@ void s390_init_cpus(const char *cpu_model, uint8_t 
*storage_keys)
 CPUState *cs;
 
 cpu = cpu_s390x_init(cpu_model);
+if (cpu == NULL) {
+fprintf(stderr, "Unable to find CPU definition\n");
+exit(1);
+}
 cs = CPU(cpu);
 
 ipi_states[i] = cpu;
diff --git a/target-s390x/helper.c b/target-s390x/helper.c
index e0fd8fc..5b6bad2 100644
--- a/target-s390x/helper.c
+++ b/target-s390x/helper.c
@@ -22,6 +22,7 @@
 #include "exec/gdbstub.h"
 #include "qemu/timer.h"
 #include "exec/cpu_ldst.h"
+#include "cpu-models.h"
 #ifndef CONFIG_USER_ONLY
 #include "sysemu/sysemu.h"
 #endif
@@ -66,13 +67,7 @@ void s390x_cpu_timer(void *opaque)
 
 S390CPU *cpu_s390x_init(const char *cpu_model)
 {
-S390CPU *cpu;
-
-cpu = S390_CPU(object_new(TYPE_S390_CPU));
-
-object_property_set_bool(OBJECT(cpu), true, "realized", NULL);
-
-return cpu;
+return S390_CPU(cpu_generic_init(TYPE_S390_CPU, cpu_model));
 }
 
 #if defined(CONFIG_USER_ONLY)
-- 
1.8.3.1



[PATCH v3 12/16] target-s390x: Prepare accelerator during cpu object realization

2015-03-02 Thread Michael Mueller
This patch implements routine s390_cpu_model_init(). It is called by the
realize function during instantiation of a cpu object. Its task is to
initialize the current accelerator with the properties of the selected
processor model.

Signed-off-by: Michael Mueller 
---
 target-s390x/cpu-models.c | 36 
 target-s390x/cpu-models.h |  4 
 target-s390x/cpu.c|  1 +
 3 files changed, 41 insertions(+)

diff --git a/target-s390x/cpu-models.c b/target-s390x/cpu-models.c
index 83e590a..c6c1771 100644
--- a/target-s390x/cpu-models.c
+++ b/target-s390x/cpu-models.c
@@ -107,6 +107,7 @@ typedef struct ParmAddrAddrAccel {
 } ParmAddrAddrAccel;
 
 static GSList *s390_cpu_aliases;
+static bool cpu_models_used;
 
 /* compare order of two cpu classes for ascending sort */
 gint s390_cpu_class_asc_order_compare(gconstpointer a, gconstpointer b)
@@ -634,3 +635,38 @@ bool s390_probe_mode(void)
 return false;
 }
 
+/**
+ * s390_cpu_model_init:
+ * @cc: S390 CPU class
+ *
+ * This function initializes the current accelerator with processor
+ * related properties.
+ *
+ * Since: 2.3
+ */
+void s390_cpu_model_init(S390CPUClass *cc)
+{
+S390ProcessorProps proc;
+
+/* none cpu model case */
+if (!strcmp(object_class_get_name(OBJECT_CLASS(cc)), TYPE_S390_CPU)) {
+return;
+}
+
+/* accelerator already prepared */
+if (cpu_models_used) {
+return;
+}
+
+proc.cpuid = cpuid(cc->proc);
+proc.ibc = cc->proc.ibc;
+bitmap_zero(proc.fac_list, FAC_LIST_ARCH_S390_SIZE_UINT1);
+bitmap_copy(proc.fac_list, cc->fac_list, FAC_LIST_CPU_S390_SIZE_UINT1);
+
+if (kvm_enabled()) {
+if (!kvm_s390_set_processor_props(&proc)) {
+cpu_models_used = true;
+}
+}
+}
+
diff --git a/target-s390x/cpu-models.h b/target-s390x/cpu-models.h
index aa81c9b..f3f914a 100644
--- a/target-s390x/cpu-models.h
+++ b/target-s390x/cpu-models.h
@@ -45,6 +45,9 @@
 #define type_cpuid(x) ((uint64_t)((x) & 0x) << 16)
 #define id_cpuid(x)   ((uint64_t)((x) & 0xff) << 32)
 #define ver_cpuid(x)  ((uint64_t)((x) & 0xff) << 56)
+#define cpuid(x)  (ver_cpuid(x.ver) |  \
+   id_cpuid(x.id) |\
+   type_cpuid(x.type))
 
 #define oldest_ibc(x) (((uint32_t)(x) >> 16) & 0xfff)
 #define newest_ibc(x) ((uint32_t)(x) & 0xfff)
@@ -110,6 +113,7 @@ gint s390_cpu_class_asc_order_compare(gconstpointer a, 
gconstpointer b);
 void s390_cpu_list_entry(gpointer data, gpointer user_data);
 bool s390_cpu_classes_initialized(void);
 bool s390_probe_mode(void);
+void s390_cpu_model_init(S390CPUClass *cc);
 
 /*
  * bits 0-7   : CMOS generation
diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c
index c9200ee..2f4192e 100644
--- a/target-s390x/cpu.c
+++ b/target-s390x/cpu.c
@@ -178,6 +178,7 @@ static void s390_cpu_realizefn(DeviceState *dev, Error 
**errp)
 CPUState *cs = CPU(dev);
 S390CPUClass *scc = S390_CPU_GET_CLASS(dev);
 
+s390_cpu_model_init(scc);
 s390_cpu_gdb_init(cs);
 qemu_init_vcpu(cs);
 #if !defined(CONFIG_USER_ONLY)
-- 
1.8.3.1



[PATCH v3 13/16] target-s390x: New QMP command query-cpu-model

2015-03-02 Thread Michael Mueller
This patch implements a new QMP request named 'query-cpu-model'.
It returns the cpu model of cpu 0 and its backing accelerator.

request:
  {"execute" : "query-cpu-model" }

answer:
  {"return" : {"name": "2827-ga2", "accelerator": "kvm" }}

Alias names are resolved to their respective machine type and GA names
already during cpu instantiation. Thus a cpu model like 'host', which is
implemented as an alias, will also return its normalized cpu model name.

Furthermore the patch implements the following functions:

- s390_cpu_typename(), returns the currently selected cpu type name or NULL
- s390_cpu_models_used(), returns true if S390 cpu models are in use (see
  the sketch below)
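
For illustration, a hypothetical consumer of s390_cpu_models_used() (not
part of this patch) could guard model-specific behaviour like this:

  if (s390_cpu_models_used()) {
      /* cpu model properties have been pushed to the current accelerator */
  }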

Signed-off-by: Michael Mueller 
---
 include/sysemu/arch_init.h |  1 +
 qapi-schema.json   | 25 +
 qmp-commands.hx|  6 ++
 qmp.c  |  5 +
 stubs/Makefile.objs|  1 +
 stubs/arch-query-cpu-mod.c |  9 +
 target-s390x/cpu-models.c  | 13 +
 target-s390x/cpu-models.h  |  1 +
 target-s390x/cpu.c | 29 +
 9 files changed, 90 insertions(+)
 create mode 100644 stubs/arch-query-cpu-mod.c

diff --git a/include/sysemu/arch_init.h b/include/sysemu/arch_init.h
index 54b36c1..86344a2 100644
--- a/include/sysemu/arch_init.h
+++ b/include/sysemu/arch_init.h
@@ -37,5 +37,6 @@ int kvm_available(void);
 int xen_available(void);
 
 CpuDefinitionInfoList *arch_query_cpu_definitions(Error **errp);
+CpuModelInfo *arch_query_cpu_model(Error **errp);
 
 #endif
diff --git a/qapi-schema.json b/qapi-schema.json
index ea436ec..e9b213f 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2507,6 +2507,31 @@
 ##
 { 'command': 'query-cpu-definitions', 'returns': ['CpuDefinitionInfo'] }
 
+##
+# @CpuModelInfo:
+#
+# Virtual CPU model definition.
+#
+# @name: the name of the CPU model definition
+#
+# @accelerator: AccelId (name) of this cpu model's accelerator
+#
+# Since: 2.3
+##
+{ 'type': 'CpuModelInfo',
+  'data': { 'name': 'str', 'accelerator': 'AccelId' } }
+
+##
+# @query-cpu-model:
+#
+# Return the current virtual CPU model
+#
+# Returns: CpuModelInfo
+#
+# Since: 2.3
+##
+{ 'command': 'query-cpu-model', 'returns': 'CpuModelInfo' }
+
 # @AddfdInfo:
 #
 # Information about a file descriptor that was added to an fd set.
diff --git a/qmp-commands.hx b/qmp-commands.hx
index a85d847..98bfedd 100644
--- a/qmp-commands.hx
+++ b/qmp-commands.hx
@@ -3392,6 +3392,12 @@ EQMP
 },
 
 {
+.name   = "query-cpu-model",
+.args_type  = "",
+.mhandler.cmd_new = qmp_marshal_input_query_cpu_model,
+},
+
+{
 .name   = "query-target",
 .args_type  = "",
 .mhandler.cmd_new = qmp_marshal_input_query_target,
diff --git a/qmp.c b/qmp.c
index d701cff..11b6172 100644
--- a/qmp.c
+++ b/qmp.c
@@ -573,6 +573,11 @@ CpuDefinitionInfoList *qmp_query_cpu_definitions(Error 
**errp)
 return arch_query_cpu_definitions(errp);
 }
 
+CpuModelInfo *qmp_query_cpu_model(Error **errp)
+{
+return arch_query_cpu_model(errp);
+}
+
 void qmp_add_client(const char *protocol, const char *fdname,
 bool has_skipauth, bool skipauth, bool has_tls, bool tls,
 Error **errp)
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index fd7a489..45daa92 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -1,4 +1,5 @@
 stub-obj-y += arch-query-cpu-def.o
+stub-obj-y += arch-query-cpu-mod.o
 stub-obj-y += bdrv-commit-all.o
 stub-obj-y += chr-baum-init.o
 stub-obj-y += chr-msmouse.o
diff --git a/stubs/arch-query-cpu-mod.c b/stubs/arch-query-cpu-mod.c
new file mode 100644
index 000..90ebd08
--- /dev/null
+++ b/stubs/arch-query-cpu-mod.c
@@ -0,0 +1,9 @@
+#include "qemu-common.h"
+#include "sysemu/arch_init.h"
+#include "qapi/qmp/qerror.h"
+
+CpuModelInfo *arch_query_cpu_model(Error **errp)
+{
+error_set(errp, QERR_UNSUPPORTED);
+return NULL;
+}
diff --git a/target-s390x/cpu-models.c b/target-s390x/cpu-models.c
index c6c1771..116dbcc 100644
--- a/target-s390x/cpu-models.c
+++ b/target-s390x/cpu-models.c
@@ -670,3 +670,16 @@ void s390_cpu_model_init(S390CPUClass *cc)
 }
 }
 
+/**
+ * s390_cpu_models_used:
+ *
+ * This function indicates if cpus with model properties are in use.
+ *
+ * Returns: a boolean value.
+ *
+ * Since: 2.3
+ */
+bool s390_cpu_models_used(void)
+{
+return cpu_models_used;
+}
diff --git a/target-s390x/cpu-models.h b/target-s390x/cpu-models.h
index f3f914a..51db298 100644
--- a/target-s390x/cpu-models.h
+++ b/target-s390x/cpu-models.h
@@ -114,6 +114,7 @@ void s390_cpu_list_entry(gpointer data, gpointer user_data);
 bool s390_cpu_classes_initialized(void);
 bool s390_probe_mode(void);
 void s390_cpu_model_init(S390CPUClass *cc);
+bool s390_cpu_models_used(void);
 
 /*
  * bits 0-7   : CMOS generation
diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c
index 2f4192e..cefaff1 100644
--- a/target-s390x/cpu.c
+++ b/target-s390x/cpu.c
@@ -37,6 +37,11 @@
 #d

[PATCH v3 14/16] target-s390x: Extend QMP command query-cpu-definitions

2015-03-02 Thread Michael Mueller
This patch implements the QMP command 'query-cpu-definitions' in the S390
context. The command returns a list of cpu model names in the current host
context. A consumer may successfully request each listed cpu model as long
as the model is runnable with the given accelerator.

The QMP type AccelCpuModelInfo is introduced and the type CpuDefinitionInfo
is extended by the optional field 'accelerators'. It contains a list of named
accelerators and indicates, per accelerator, whether the associated cpu model
is runnable and whether it is the default cpu model. The default cpu model is
used if either no specific cpu model is requested during QEMU startup or if
the cpu model named 'host' is requested.

request:
  {"execute": "query-cpu-definitions"}

answer:
  {"return":
    [{"name":"2964-ga1","accelerators":[{"name":"kvm","runnable":false,"default":false}]},
     {"name":"2828-ga1","accelerators":[{"name":"kvm","runnable":false,"default":false}]},
     {"name":"2827-ga2","accelerators":[{"name":"kvm","runnable":true,"default":true}]},
     {"name":"2827-ga1","accelerators":[{"name":"kvm","runnable":true,"default":false}]},
     {"name":"2818-ga1","accelerators":[{"name":"kvm","runnable":true,"default":false}]},
     ...
     {"name":"2064-ga1","accelerators":[{"runnable":true,"name":"kvm","default":false}]}
    ]
  }

Signed-off-by: Michael Mueller 
---
 qapi-schema.json  |  21 +-
 target-s390x/cpu-models.c |  15 +++
 target-s390x/cpu-models.h |   1 +
 target-s390x/cpu.c| 100 +++---
 4 files changed, 130 insertions(+), 7 deletions(-)

diff --git a/qapi-schema.json b/qapi-schema.json
index e9b213f..44863e5 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2485,16 +2485,35 @@
   'data': ['qtest', 'tcg', 'kvm', 'xen'  ] }
 
 ##
+# @AccelCpuModelInfo:
+#
+# Accelerator specific CPU model data
+#
+# @name: the accelerator name
+#
+# @default: cpu model for 'host'
+#
+# @runnable: cpu model can be activated on hosting machine
+#
+# Since: 2.3
+#
+##
+{ 'type': 'AccelCpuModelInfo',
+  'data': { 'name': 'AccelId', 'default': 'bool', 'runnable': 'bool' } }
+
+##
 # @CpuDefinitionInfo:
 #
 # Virtual CPU definition.
 #
 # @name: the name of the CPU definition
 #
+# @accelerators: #optional cpu model offered per accelerator (since 2.3)
+#
 # Since: 1.2.0
 ##
 { 'type': 'CpuDefinitionInfo',
-  'data': { 'name': 'str' } }
+  'data': { 'name': 'str', '*accelerators': ['AccelCpuModelInfo'] } }
 
 ##
 # @query-cpu-definitions:
diff --git a/target-s390x/cpu-models.c b/target-s390x/cpu-models.c
index 116dbcc..7ad61df 100644
--- a/target-s390x/cpu-models.c
+++ b/target-s390x/cpu-models.c
@@ -211,6 +211,21 @@ int set_s390_cpu_alias(const char *name, const char *model)
 return 0;
 }
 
+/* compare order of two cpu classes for descending sort */
+gint s390_cpu_class_desc_order_compare(gconstpointer a, gconstpointer b)
+{
+S390CPUClass *cc_a = S390_CPU_CLASS((ObjectClass *) a);
+S390CPUClass *cc_b = S390_CPU_CLASS((ObjectClass *) b);
+
+if (cc_a->mach.order < cc_b->mach.order) {
+return 1;
+}
+if (cc_a->mach.order > cc_b->mach.order) {
+return -1;
+}
+return 0;
+}
+
 /* return machine class for specific machine type */
 static void s390_machine_class_test_cpu_class(gpointer data, gpointer 
user_data)
 {
diff --git a/target-s390x/cpu-models.h b/target-s390x/cpu-models.h
index 51db298..3605aa4 100644
--- a/target-s390x/cpu-models.h
+++ b/target-s390x/cpu-models.h
@@ -110,6 +110,7 @@ static inline bool kvm_s390_probe_mode(void)
 
 int s390_setup_cpu_classes(AccelId accel, S390MachineProps *prop);
 gint s390_cpu_class_asc_order_compare(gconstpointer a, gconstpointer b);
+gint s390_cpu_class_desc_order_compare(gconstpointer a, gconstpointer b);
 void s390_cpu_list_entry(gpointer data, gpointer user_data);
 bool s390_cpu_classes_initialized(void);
 bool s390_probe_mode(void);
diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c
index cefaff1..6bf6554 100644
--- a/target-s390x/cpu.c
+++ b/target-s390x/cpu.c
@@ -66,18 +66,106 @@ void s390_cpu_list(FILE *f, fprintf_function cpu_fprintf)
 }
 
 #ifndef CONFIG_USER_ONLY
-CpuDefinitionInfoList *arch_query_cpu_definitions(Error **errp)
+static AccelCpuModelInfoList *qmp_query_accel_entry(AccelId accel,
+S390CPUClass *cc,
+AccelCpuModelInfoList 
*prev)
+{
+AccelCpuModelInfoList *list;
+AccelCpuModelInfo *info;
+
+info = g_try_new0(AccelCpuModelInfo, 1);
+if (!info) {
+goto out;
+}
+info->name = accel;
+info->runnable = cc->is_active[accel];
+info->q_default = cc->is_host[accel];
+list = g_try_new0(AccelCpuModelInfoList, 1);
+if (!list) {
+goto out;
+}
+list->value = info;
+list->next = prev;
+
+return list;
+out:
+g_free(info);
+return prev;
+}
+
+static void qmp_query_cpu_definition_entry(gpointer data, gpointer user_

[PATCH v3 10/16] target-s390x: Add KVM VM attribute interface for cpu models

2015-03-02 Thread Michael Mueller
The patch implements routines to set and retrieve processor configuration
data and to retrieve machine configuration data. The machine-related data
is used together with the cpu model facility lists to determine the list of
cpu models supported by this host. The above-mentioned routines are
instrumented with QEMU trace points.
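
A hedged caller sketch for the exported processor query (names taken from
this patch; error handling is elided and a zero return is assumed to mean
success):

  S390ProcessorProps prop;

  if (!kvm_s390_get_processor_props(&prop)) {
      /* prop.cpuid, prop.ibc and prop.fac_list now describe the
       * processor configuration reported by the kernel */
  }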

Signed-off-by: Michael Mueller 
---
 target-s390x/cpu-models.h |  39 ++
 target-s390x/kvm.c| 102 ++
 trace-events  |   3 ++
 3 files changed, 144 insertions(+)

diff --git a/target-s390x/cpu-models.h b/target-s390x/cpu-models.h
index a32f559..b6b57d4 100644
--- a/target-s390x/cpu-models.h
+++ b/target-s390x/cpu-models.h
@@ -46,6 +46,45 @@ typedef struct S390CPUAlias {
 char *model;
 } S390CPUAlias;
 
+typedef struct S390ProcessorProps {
+uint64_t cpuid;
+uint16_t ibc;
+uint8_t  pad[6];
+uint64_t fac_list[FAC_LIST_ARCH_S390_SIZE_UINT64];
+} S390ProcessorProps;
+
+typedef struct S390MachineProps {
+uint64_t cpuid;
+uint32_t ibc_range;
+uint8_t  pad[4];
+uint64_t fac_list_mask[FAC_LIST_ARCH_S390_SIZE_UINT64];
+uint64_t fac_list[FAC_LIST_ARCH_S390_SIZE_UINT64];
+} S390MachineProps;
+
+#ifdef CONFIG_KVM
+int kvm_s390_get_processor_props(S390ProcessorProps *prop);
+int kvm_s390_set_processor_props(S390ProcessorProps *prop);
+bool kvm_s390_cpu_classes_initialized(void);
+bool kvm_s390_probe_mode(void);
+#else
+static inline int kvm_s390_get_processor_props(S390ProcessorProps *prob)
+{
+return -ENOSYS;
+}
+static inline int kvm_s390_set_processor_props(S390ProcessorProps *prob)
+{
+return -ENOSYS;
+}
+static inline bool kvm_s390_cpu_classes_initialized(void)
+{
+return false;
+}
+static inline bool kvm_s390_probe_mode(void)
+{
+return false;
+}
+#endif
+
 /*
  * bits 0-7   : CMOS generation
  * bits 8-9   : reserved
diff --git a/target-s390x/kvm.c b/target-s390x/kvm.c
index d7c57d9..5404137 100644
--- a/target-s390x/kvm.c
+++ b/target-s390x/kvm.c
@@ -43,6 +43,7 @@
 #include "hw/s390x/s390-pci-inst.h"
 #include "hw/s390x/s390-pci-bus.h"
 #include "hw/s390x/ipl.h"
+#include "cpu-models.h"
 
 /* #define DEBUG_KVM */
 
@@ -118,6 +119,7 @@ const KVMCapabilityInfo kvm_arch_required_capabilities[] = {
 
 static int cap_sync_regs;
 static int cap_async_pf;
+static bool cpu_classes_initialized;
 
 static void *legacy_s390_alloc(size_t size, uint64_t *align);
 
@@ -173,6 +175,68 @@ static void kvm_s390_enable_cmma(KVMState *s)
 trace_kvm_enable_cmma(rc);
 }
 
+static int cpu_model_get(KVMState *s, uint64_t attr, uint64_t addr)
+{
+struct kvm_device_attr dev_attr = {
+.group = KVM_S390_VM_CPU_MODEL,
+.attr = attr,
+.addr = addr,
+};
+
+return kvm_vm_ioctl(s, KVM_GET_DEVICE_ATTR, &dev_attr);
+}
+
+static int cpu_model_set(KVMState *s, uint64_t attr, uint64_t addr)
+{
+struct kvm_device_attr dev_attr = {
+.group = KVM_S390_VM_CPU_MODEL,
+.attr = attr,
+.addr = addr,
+};
+
+return kvm_vm_ioctl(s, KVM_SET_DEVICE_ATTR, &dev_attr);
+}
+
+static int has_cpu_model_call(KVMState *s, uint64_t attr)
+{
+int rc;
+struct kvm_device_attr dev_attr = {
+.group = KVM_S390_VM_CPU_MODEL,
+.attr = attr,
+};
+
+if (kvm_check_extension(s, KVM_CAP_VM_ATTRIBUTES) == 0) {
+return -ENOSYS;
+}
+
+rc = kvm_vm_ioctl(s, KVM_HAS_DEVICE_ATTR, &dev_attr);
+if (rc == 0) {
+return 0;
+}
+return -EFAULT;
+}
+
+static int kvm_s390_get_machine_props(KVMState *s, S390MachineProps *prop)
+{
+int rc;
+
+rc = has_cpu_model_call(s, KVM_S390_VM_CPU_MACHINE);
+if (!rc) {
+rc = cpu_model_get(s, KVM_S390_VM_CPU_MACHINE, (uint64_t) prop);
+}
+trace_kvm_get_machine_props(rc, prop->cpuid, prop->ibc_range);
+return rc;
+}
+
+static void kvm_s390_setup_cpu_classes(KVMState *s)
+{
+S390MachineProps mach;
+
+if (!kvm_s390_get_machine_props(s, &mach)) {
+cpu_classes_initialized = false;
+}
+}
+
 int kvm_arch_init(KVMState *s)
 {
 cap_sync_regs = kvm_check_extension(s, KVM_CAP_SYNC_REGS);
@@ -186,6 +250,8 @@ int kvm_arch_init(KVMState *s)
 || !kvm_check_extension(s, KVM_CAP_S390_COW)) {
 phys_mem_set_alloc(legacy_s390_alloc);
 }
+
+kvm_s390_setup_cpu_classes(s);
 return 0;
 }
 
@@ -1578,3 +1644,39 @@ int kvm_arch_fixup_msi_route(struct 
kvm_irq_routing_entry *route,
 route->u.adapter.adapter_id = pbdev->routes.adapter.adapter_id;
 return 0;
 }
+
+int kvm_s390_get_processor_props(S390ProcessorProps *prop)
+{
+int rc;
+
+rc = has_cpu_model_call(kvm_state, KVM_S390_VM_CPU_PROCESSOR);
+if (!rc) {
+rc = cpu_model_get(kvm_state,
+   KVM_S390_VM_CPU_PROCESSOR, (uint64_t) prop);
+}
+trace_kvm_get_processor_props(rc, prop->cpuid, prop->ibc);
+return rc;
+}
+
+int kvm_s390_set_processor_props(S390ProcessorProps *prop)
+{
+int rc;
+
+rc = ha

[PATCH v3 11/16] target-s390x: Add cpu class initialization routines

2015-03-02 Thread Michael Mueller
This patch provides routines to dynamically update the previously defined
S390 cpu classes in the current host context. The main function performing
this process is s390_setup_cpu_classes(). It takes the current host context
as a parameter to set up the classes accordingly (a usage sketch follows
below). It basically performs the following sub-tasks:
following sub-tasks:

- Update of cpu classes with accelerator specific host and QEMU properties
- Mark adequate cpu class as default cpu class to be used for cpu model 'host'
- Invalidate cpu classes not supported by this hosting machine
- Define machine type aliases to latest GA number of a processor model
- Define aliases for common cpu model names
- Set cpu model alias 'host' to default cpu class

Furthermore the patch provides the following routines:

- cpu_desc_avail(), s390 specific stub indicating that list_cpus() can run
- s390_cpu_classes_initialized(), test if cpu classes have been initialized
- s390_probe_mode(), indicates if probe mode is active
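
A minimal, hypothetical usage sketch of the setup entry point (assuming a
zero return indicates success and that an S390MachineProps structure has
already been filled in, e.g. via the KVM VM attribute interface from patch
10/16):

  S390MachineProps mach;   /* assumption: filled in beforehand */

  if (s390_setup_cpu_classes(ACCEL_ID_KVM, &mach) == 0) {
      /* the statically defined cpu classes now reflect what this
       * host/accelerator combination can run */
  }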

Signed-off-by: Michael Mueller 
---
 target-s390x/cpu-models.c | 458 ++
 target-s390x/cpu-models.h |  26 +++
 target-s390x/cpu.c|  17 +-
 target-s390x/kvm.c|   4 +-
 4 files changed, 503 insertions(+), 2 deletions(-)

diff --git a/target-s390x/cpu-models.c b/target-s390x/cpu-models.c
index 608189d..83e590a 100644
--- a/target-s390x/cpu-models.c
+++ b/target-s390x/cpu-models.c
@@ -13,6 +13,7 @@
 #include "qemu-common.h"
 #include "cpu-models.h"
 #include "gen-facilities.h"
+#include "qemu/error-report.h"
 
 #define S390_PROC_DEF(_name, _cpu_id, _desc)\
 static void \
@@ -87,8 +88,41 @@ S390_PROC_DEF("2827-ga1", CPU_S390_2827_GA1, "IBM 
zEnterprise EC12 GA1")
 S390_PROC_DEF("2827-ga2", CPU_S390_2827_GA2, "IBM zEnterprise EC12 GA2")
 S390_PROC_DEF("2828-ga1", CPU_S390_2828_GA1, "IBM zEnterprise BC12 GA1")
 
+/* some types for calls to g_list_foreach() with parameters */
+typedef struct ParmBoolShortShortAccel {
+bool valid;
+unsigned short type;
+union {
+unsigned short class;
+unsigned short gen;
+unsigned short ga;
+};
+AccelId accel;
+} ParmBoolShortShortAccel;
+
+typedef struct ParmAddrAddrAccel {
+S390MachineProps *prop;
+S390CPUClass *host_cc;
+AccelId accel;
+} ParmAddrAddrAccel;
+
 static GSList *s390_cpu_aliases;
 
+/* compare order of two cpu classes for ascending sort */
+gint s390_cpu_class_asc_order_compare(gconstpointer a, gconstpointer b)
+{
+S390CPUClass *cc_a = S390_CPU_CLASS((ObjectClass *) a);
+S390CPUClass *cc_b = S390_CPU_CLASS((ObjectClass *) b);
+
+if (cc_a->mach.order < cc_b->mach.order) {
+return -1;
+}
+if (cc_a->mach.order > cc_b->mach.order) {
+return 1;
+}
+return 0;
+}
+
 static gint s390_cpu_compare_class_name(gconstpointer a, gconstpointer b)
 {
 const char *aname = object_class_get_name((ObjectClass *) a);
@@ -176,3 +210,427 @@ int set_s390_cpu_alias(const char *name, const char 
*model)
 return 0;
 }
 
+/* return machine class for specific machine type */
+static void s390_machine_class_test_cpu_class(gpointer data, gpointer 
user_data)
+{
+S390CPUClass *cc = S390_CPU_CLASS((ObjectClass *) data);
+ParmBoolShortShortAccel *parm = user_data;
+
+if (parm->valid || !cc->proc.type || parm->type != cc->proc.type) {
+return;
+}
+
+parm->class = cc->mach.class;
+parm->valid = true;
+}
+
+/* return machine class by machine type */
+static unsigned short machine_class(unsigned short type, void *user_data)
+{
+GSList *list = object_class_get_list(TYPE_S390_CPU, false);
+ParmBoolShortShortAccel parm_class, *parm = user_data;
+
+if (parm->type != type) {
+parm->class = 0;
+}
+if (!parm->class) {
+parm_class.type = type;
+parm_class.class = 0;
+parm_class.valid = false;
+g_slist_foreach(list, (GFunc) s390_machine_class_test_cpu_class,
+&parm_class);
+g_slist_free(list);
+if (parm_class.valid) {
+parm->class = parm_class.class;
+}
+}
+parm->type = type;
+
+return parm->class;
+}
+
+/* return CMOS generation for specific machine type */
+static void s390_machine_class_test_cpu_gen(gpointer data, gpointer user_data)
+{
+S390CPUClass *cc = S390_CPU_CLASS((ObjectClass *) data);
+ParmBoolShortShortAccel *parm = user_data;
+
+if (parm->valid) {
+return;
+}
+
+if (parm->type == cc->proc.type) {
+parm->gen = cc->proc.gen;
+parm->valid = true;
+}
+}
+
+/* return CMOS generation by machine type */
+static uint16_t machine_gen(unsigned short type)
+{
+GSList *list = object_class_get_list(TYPE_S390_CPU, false);
+ParmBoolShortShortAccel parm;
+
+parm.type = type;
+parm.gen = 0;
+parm.valid = false;
+g_slist_foreach(list, (GFunc) s390_machine_class_test_cpu_gen, &parm);
+   

[PATCH v3 07/16] target-s390x: Define cpu model specific facility lists

2015-03-02 Thread Michael Mueller
This patch defines S390 cpu facilities and their presence at the
different cpu model levels. Besides defining a base of facilities that
have to be requested per cpu model, these sets are associated with the
defined cpu classes and used to calculate the list of supported
cpu models in the context of the current hosting machine model.

The QEMU-side facility mask that is also defined here allows facilities
to be implemented and enabled in QEMU land.
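
As an illustration (an assumption about intended usage, not code from this
patch, and assuming the facility lists are stored as 64-bit words as
elsewhere in this series), the facilities a model can offer under pure
emulation would be the model's list masked by what QEMU implements:

  uint64_t eff[FAC_LIST_CPU_S390_SIZE_UINT64];
  int i;

  for (i = 0; i < FAC_LIST_CPU_S390_SIZE_UINT64; i++) {
      /* qemu_s390_fac_list_mask is the QEMU-side mask defined here */
      eff[i] = cc->proc.fac_list[i] & qemu_s390_fac_list_mask[i];
  }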

Signed-off-by: Michael Mueller 
---
 target-s390x/cpu-models.c | 12 
 target-s390x/cpu-models.h |  8 
 target-s390x/cpu.c|  1 +
 3 files changed, 21 insertions(+)

diff --git a/target-s390x/cpu-models.c b/target-s390x/cpu-models.c
index 3520691..bd9f0bc 100644
--- a/target-s390x/cpu-models.c
+++ b/target-s390x/cpu-models.c
@@ -12,6 +12,7 @@
 
 #include "qemu-common.h"
 #include "cpu-models.h"
+#include "gen-facilities.h"
 
 #define S390_PROC_DEF(_name, _cpu_id, _desc)\
 static void \
@@ -20,6 +21,10 @@
 {   \
 DeviceClass *dc = DEVICE_CLASS(oc); \
 S390CPUClass *cc = S390_CPU_CLASS(oc);  \
+uint64_t nbits = FAC_LIST_CPU_S390_SIZE_UINT1;  \
+uint64_t fac_list[FAC_LIST_CPU_S390_SIZE_UINT64] = {\
+glue(FAC_LIST_, _cpu_id)\
+};  \
 \
 cc->is_active[ACCEL_ID_KVM] = true; \
 cc->mach.ga= cpu_ga(_cpu_id);   \
@@ -30,6 +35,7 @@
 cc->proc.id= S390_DEF_ID;   \
 cc->proc.type  = cpu_type(_cpu_id); \
 cc->proc.ibc   = S390_DEF_IBC;  \
+bitmap_copy(cc->proc.fac_list, fac_list, nbits);\
 dc->desc   = _desc; \
 }   \
 static const TypeInfo   \
@@ -46,6 +52,11 @@
 }   \
 type_init(glue(_cpu_id, _cpu_register_types))
 
+/* facilities implemented by qemu */
+uint64_t qemu_s390_fac_list_mask[FAC_LIST_CPU_S390_SIZE_UINT64] = {
+FAC_LIST_CPU_S390_MASK_QEMU
+};
+
 /* define S390 CPU model classes */
 S390_PROC_DEF("2064-ga1", CPU_S390_2064_GA1, "IBM zSeries 900 GA1")
 S390_PROC_DEF("2064-ga2", CPU_S390_2064_GA2, "IBM zSeries 900 GA2")
@@ -75,3 +86,4 @@ S390_PROC_DEF("2818-ga1", CPU_S390_2818_GA1, "IBM zEnterprise 
114 GA1")
 S390_PROC_DEF("2827-ga1", CPU_S390_2827_GA1, "IBM zEnterprise EC12 GA1")
 S390_PROC_DEF("2827-ga2", CPU_S390_2827_GA2, "IBM zEnterprise EC12 GA2")
 S390_PROC_DEF("2828-ga1", CPU_S390_2828_GA1, "IBM zEnterprise BC12 GA1")
+
diff --git a/target-s390x/cpu-models.h b/target-s390x/cpu-models.h
index db681bf..d5f0b59 100644
--- a/target-s390x/cpu-models.h
+++ b/target-s390x/cpu-models.h
@@ -13,6 +13,14 @@
 #ifndef TARGET_S390X_CPU_MODELS_H
 #define TARGET_S390X_CPU_MODELS_H
 
+#include "cpu-facilities.h"
+#include "gen-facilities.h"
+
+#define FAC_LIST_ARCH_S390_SIZE_UINT1 \
+(FAC_LIST_ARCH_S390_SIZE_UINT8 * BITS_PER_BYTE)
+#define FAC_LIST_ARCH_S390_SIZE_UINT64 \
+(FAC_LIST_ARCH_S390_SIZE_UINT8 / sizeof(uint64_t))
+
 #define S390_EC 0x1
 #define S390_BC 0x2
 
diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c
index d2f6312..7a26b91 100644
--- a/target-s390x/cpu.c
+++ b/target-s390x/cpu.c
@@ -29,6 +29,7 @@
 #include "qemu/error-report.h"
 #include "hw/hw.h"
 #include "trace.h"
+#include "cpu-models.h"
 #ifndef CONFIG_USER_ONLY
 #include "sysemu/arch_init.h"
 #endif
-- 
1.8.3.1



[PATCH v3 09/16] target-s390x: Update linux-headers/asm-s390/kvm.h

2015-03-02 Thread Michael Mueller
Signed-off-by: Michael Mueller 
---
 linux-headers/asm-s390/kvm.h | 20 
 1 file changed, 20 insertions(+)

diff --git a/linux-headers/asm-s390/kvm.h b/linux-headers/asm-s390/kvm.h
index d36b2fa..e38c942 100644
--- a/linux-headers/asm-s390/kvm.h
+++ b/linux-headers/asm-s390/kvm.h
@@ -57,11 +57,31 @@ struct kvm_s390_io_adapter_req {
 
 /* kvm attr_group  on vm fd */
 #define KVM_S390_VM_MEM_CTRL   0
+#define KVM_S390_VM_CPU_MODEL  3
 
 /* kvm attributes for mem_ctrl */
 #define KVM_S390_VM_MEM_ENABLE_CMMA0
 #define KVM_S390_VM_MEM_CLR_CMMA   1
 
+/* kvm S390 processor related attributes are r/w */
+#define KVM_S390_VM_CPU_PROCESSOR  0
+struct kvm_s390_vm_cpu_processor {
+   __u64 cpuid;
+   __u16 ibc;
+   __u8  pad[6];
+   __u64 fac_list[256];
+};
+
+/* kvm S390 machine related attributes are r/o */
+#define KVM_S390_VM_CPU_MACHINE1
+struct kvm_s390_vm_cpu_machine {
+   __u64 cpuid;
+   __u32 ibc_range;
+   __u8  pad[4];
+   __u64 fac_mask[256];
+   __u64 fac_list[256];
+};
+
 /* for KVM_GET_REGS and KVM_SET_REGS */
 struct kvm_regs {
/* general purpose regs for s390 */
-- 
1.8.3.1



[PATCH v3 06/16] target-s390x: Introduce cpu models

2015-03-02 Thread Michael Mueller
This patch implements the static part of the s390 cpu class definitions.
It defines s390 cpu models by means of virtual cpu ids (enum) which contain
information on the cpu generation, the machine class, the GA number and
the machine type. The cpu id is used to instantiate a cpu class per cpu
model.

In addition the patch introduces the QMP enumeration AccelId. It is used
to index certain cpu model properties per accelerator.

Furthermore it extends the existing S390CPUClass with model-related properties.
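
For illustration, a virtual cpu id can be decoded with the helper macros
from cpu-models.h (a sketch; CPU_S390_2827_GA2 is one of the ids defined
by this patch):

  uint32_t id = CPU_S390_2827_GA2;

  uint16_t type = cpu_type(id);        /* machine type */
  uint16_t class = cpu_class(id);      /* EC/BC machine class */
  uint16_t gen = cpu_generation(id);   /* CMOS generation */
  uint16_t ga = cpu_ga(id);            /* GA number */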

Signed-off-by: Michael Mueller 
Reviewed-by: Thomas Huth 
---
 qapi-schema.json   | 11 +++
 target-s390x/Makefile.objs |  1 +
 target-s390x/cpu-models.c  | 77 ++
 target-s390x/cpu-models.h  | 71 ++
 target-s390x/cpu-qom.h | 25 +++
 5 files changed, 185 insertions(+)
 create mode 100644 target-s390x/cpu-models.c
 create mode 100644 target-s390x/cpu-models.h

diff --git a/qapi-schema.json b/qapi-schema.json
index e16f8eb..ea436ec 100644
--- a/qapi-schema.json
+++ b/qapi-schema.json
@@ -2473,6 +2473,17 @@
 ##
 { 'command': 'query-machines', 'returns': ['MachineInfo'] }
 
+
+##
+# @AccelId
+#
+# Defines accelerator ids
+#
+# Since: 2.3
+##
+{ 'enum': 'AccelId',
+  'data': ['qtest', 'tcg', 'kvm', 'xen'  ] }
+
 ##
 # @CpuDefinitionInfo:
 #
diff --git a/target-s390x/Makefile.objs b/target-s390x/Makefile.objs
index 1b65776..4ee0f1d 100644
--- a/target-s390x/Makefile.objs
+++ b/target-s390x/Makefile.objs
@@ -1,6 +1,7 @@
 obj-y += translate.o helper.o cpu.o interrupt.o
 obj-y += int_helper.o fpu_helper.o cc_helper.o mem_helper.o misc_helper.o
 obj-y += gdbstub.o
+obj-y += cpu-models.o
 obj-$(CONFIG_SOFTMMU) += machine.o ioinst.o arch_dump.o mmu_helper.o
 obj-$(CONFIG_KVM) += kvm.o
 
diff --git a/target-s390x/cpu-models.c b/target-s390x/cpu-models.c
new file mode 100644
index 000..3520691
--- /dev/null
+++ b/target-s390x/cpu-models.c
@@ -0,0 +1,77 @@
+/*
+ * CPU models for s390
+ *
+ * Copyright 2014,2015 IBM Corp.
+ *
+ * Author(s): Michael Mueller 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#include "qemu-common.h"
+#include "cpu-models.h"
+
+#define S390_PROC_DEF(_name, _cpu_id, _desc)\
+static void \
+glue(_cpu_id, _cpu_class_init)  \
+(ObjectClass *oc, void *data)   \
+{   \
+DeviceClass *dc = DEVICE_CLASS(oc); \
+S390CPUClass *cc = S390_CPU_CLASS(oc);  \
+\
+cc->is_active[ACCEL_ID_KVM] = true; \
+cc->mach.ga= cpu_ga(_cpu_id);   \
+cc->mach.class = cpu_class(_cpu_id);\
+cc->mach.order = cpu_order(_cpu_id);\
+cc->proc.gen   = cpu_generation(_cpu_id);   \
+cc->proc.ver   = S390_DEF_VERSION;  \
+cc->proc.id= S390_DEF_ID;   \
+cc->proc.type  = cpu_type(_cpu_id); \
+cc->proc.ibc   = S390_DEF_IBC;  \
+dc->desc   = _desc; \
+}   \
+static const TypeInfo   \
+glue(_cpu_id, _cpu_type_info) = {   \
+.name   = _name "-" TYPE_S390_CPU,  \
+.parent = TYPE_S390_CPU,\
+.class_init = glue(_cpu_id, _cpu_class_init),   \
+};  \
+static void \
+glue(_cpu_id, _cpu_register_types)(void)\
+{   \
+type_register_static(   \
+&glue(_cpu_id, _cpu_type_info));\
+}   \
+type_init(glue(_cpu_id, _cpu_register_types))
+
+/* define S390 CPU model classes */
+S390_PROC_DEF("2064-ga1", CPU_S390_2064_GA1, "IBM zSeries 900 GA1")
+S390_PROC_DEF("2064-ga2", CPU_S390_2064_GA2, "IBM zSeries 900 GA2")
+S390_PROC_DEF("2064-ga3", CPU_S390_2064_GA3, "IBM zSeries 900 GA3")
+S390_PROC_DEF("2066-ga1", CPU_S390_2066_GA1, "IBM zSeries 80

[PATCH v3 04/16] target-s390x: Introduce cpu facilities

2015-03-02 Thread Michael Mueller
The patch introduces S390 CPU facility bit numbers and names
as well as the architectural facility size limit in bytes.

Signed-off-by: Michael Mueller 
---
 target-s390x/cpu-facilities.h | 76 +++
 1 file changed, 76 insertions(+)
 create mode 100644 target-s390x/cpu-facilities.h

diff --git a/target-s390x/cpu-facilities.h b/target-s390x/cpu-facilities.h
new file mode 100644
index 000..1f1716a
--- /dev/null
+++ b/target-s390x/cpu-facilities.h
@@ -0,0 +1,76 @@
+/*
+ * CPU facilities for s390
+ *
+ * Copyright 2015 IBM Corp.
+ *
+ * Author(s): Michael Mueller 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ */
+
+#ifndef TARGET_S390X_CPU_FACILITIES_H
+#define TARGET_S390X_CPU_FACILITIES_H
+
+/* architectural size of facilities is 2KB */
+#define FAC_LIST_ARCH_S390_SIZE_UINT8 (1<<11)
+
+/* CPU facility bits */
+typedef enum {
+FAC_N3  = 0,
+FAC_ZARCH   = 1,
+FAC_ZARCH_ACTIVE= 2,
+FAC_DAT_ENH = 3,
+FAC_ASN_LX_REUSE= 6,
+FAC_STFLE   = 7,
+FAC_ENHANCED_DAT_1  = 8,
+FAC_SENSE_RUNNING_STATUS= 9,
+FAC_CONDITIONAL_SSKE= 10,
+FAC_CONFIGURATION_TOPOLOGY  = 11,
+FAC_IPTE_RANGE  = 13,
+FAC_NONQ_KEY_SETTING= 14,
+FAC_EXTENDED_TRANSLATION_2  = 16,
+FAC_MESSAGE_SECURITY_ASSIST = 17,
+FAC_LONG_DISPLACEMENT   = 18,
+FAC_LONG_DISPLACEMENT_FAST  = 19,
+FAC_HFP_MADDSUB = 20,
+FAC_EXTENDED_IMMEDIATE  = 21,
+FAC_EXTENDED_TRANSLATION_3  = 22,
+FAC_HFP_UNNORMALIZED_EXT= 23,
+FAC_ETF2_ENH= 24,
+FAC_STORE_CLOCK_FAST= 25,
+FAC_PARSING_ENH = 26,
+FAC_MOVE_WITH_OPTIONAL_SPEC = 27,
+FAC_TOD_CLOCK_STEERING  = 28,
+FAC_ETF3_ENH= 30,
+FAC_EXTRACT_CPU_TIME= 31,
+FAC_COMPARE_AND_SWAP_AND_STORE  = 32,
+FAC_COMPARE_AND_SWAP_AND_STORE_2= 33,
+FAC_GENERAL_INSTRUCTIONS_EXT= 34,
+FAC_EXECUTE_EXT = 35,
+FAC_ENHANCED_MONITOR= 36,
+FAC_FLOATING_POINT_EXT  = 37,
+FAC_LOAD_PROGRAM_PARAMETERS = 40,
+FAC_FLOATING_POINT_SUPPPORT_ENH = 41,
+FAC_DFP = 42,
+FAC_DFP_FAST= 43,
+FAC_PFPO= 44,
+FAC_MULTI_45= 45,
+FAC_CMPSC_ENH   = 47,
+FAC_DFP_ZONED_CONVERSION= 48,
+FAC_MULTI_49= 49,
+FAC_CONSTRAINT_TRANSACTIONAL_EXE= 50,
+FAC_LOCAL_TLB_CLEARING  = 51,
+FAC_INTERLOCKED_ACCESS_2= 52,
+FAC_RESET_REFERENCE_BITS_MULTIPLE   = 66,
+FAC_CPU_MEASUREMENT_COUNTER = 67,
+FAC_CPU_MEASUREMENT_SAMPLING= 68,
+FAC_TRANSACTIONAL_EXE   = 73,
+FAC_ACCESS_EXCEPTION_FS_INDICATION  = 75,
+FAC_MESSAGE_SECURITY_ASSIST_3   = 76,
+FAC_MESSAGE_SECURITY_ASSIST_4   = 77,
+FAC_ENHANCED_DAT_2  = 78,
+} S390Facility;
+
+#endif
-- 
1.8.3.1



[PATCH v3 05/16] target-s390x: Generate facility defines per cpu model

2015-03-02 Thread Michael Mueller
This patch introduces the helper "gen-facilities" which allows generating
facility list definitions and masks at compile time. This is more flexible
and less error-prone than statements added statically at programming time.

The helper includes "target-s390x/cpu-facilities.h" to be able to use named
facility bits instead of numbers. Its output will be fed back into the
cpu-model-related header file "target-s390x/cpu-models.h" by including
"target-s390x/gen-facilities.h" to implement model-related data structures.

The following defines/symbols are expected to be provided by the cpu-facilities
header file:

FAC_LIST_ARCH_S390_SIZE_UINT8
FAC_N3
FAC_ZARCH
FAC_ZARCH_ACTIVE
...

The defines provided by gen-facilities follow this schema:

FAC_LIST_CPU_S390_SIZE_UINT1 %PRIu32
FAC_LIST_CPU_S390_SIZE_UINT8 %PRIu32
FAC_LIST_CPU_S390_SIZE_UINT64 %PRIu32
FAC_LIST_CPU_S390_MASK_QEMU 0x%016PRIx64,0x%016PRIx64,...
FAC_LIST_CPU_S390__GA 0x%016PRIx64,0x%016PRIx64,...
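
The generated facility words presumably use the big-endian bit numbering
that the facility test in patch 15/16 expects; a minimal sketch of such a
conversion (an assumption about the generator's internals, not its actual
code):

  static void set_fac_bit(uint64_t *list, uint16_t nr)
  {
      /* facility bit 0 is the most significant bit of the first doubleword */
      list[nr / 64] |= UINT64_C(1) << (63 - (nr % 64));
  }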

Signed-off-by: Michael Mueller 
---
 Makefile.target   |   2 +-
 rules.mak |   3 +
 target-s390x/Makefile.objs|  18 ++
 target-s390x/gen-facilities.c | 401 ++
 4 files changed, 423 insertions(+), 1 deletion(-)
 create mode 100644 target-s390x/gen-facilities.c

diff --git a/Makefile.target b/Makefile.target
index 58c6ae1..d174d5e 100644
--- a/Makefile.target
+++ b/Makefile.target
@@ -188,7 +188,7 @@ hmp-commands.h: $(SRC_PATH)/hmp-commands.hx
 qmp-commands-old.h: $(SRC_PATH)/qmp-commands.hx
$(call quiet-command,sh $(SRC_PATH)/scripts/hxtool -h < $< > $@,"  GEN  
 $(TARGET_DIR)$@")
 
-clean:
+clean: clean-target
rm -f *.a *~ $(PROGS)
rm -f $(shell find . -name '*.[od]')
rm -f hmp-commands.h qmp-commands-old.h gdbstub-xml.c
diff --git a/rules.mak b/rules.mak
index 3a05627..d367e23 100644
--- a/rules.mak
+++ b/rules.mak
@@ -13,6 +13,9 @@ MAKEFLAGS += -rR
 %.m:
 %.mak:
 
+# default target cleanup
+clean-target:
+
 # Flags for C++ compilation
 QEMU_CXXFLAGS = -D__STDC_LIMIT_MACROS $(filter-out -Wstrict-prototypes 
-Wmissing-prototypes -Wnested-externs -Wold-style-declaration 
-Wold-style-definition -Wredundant-decls, $(QEMU_CFLAGS))
 
diff --git a/target-s390x/Makefile.objs b/target-s390x/Makefile.objs
index dd62cbd..1b65776 100644
--- a/target-s390x/Makefile.objs
+++ b/target-s390x/Makefile.objs
@@ -3,3 +3,21 @@ obj-y += int_helper.o fpu_helper.o cc_helper.o mem_helper.o 
misc_helper.o
 obj-y += gdbstub.o
 obj-$(CONFIG_SOFTMMU) += machine.o ioinst.o arch_dump.o mmu_helper.o
 obj-$(CONFIG_KVM) += kvm.o
+
+# build and run facility generator
+#
+fac = gen-facilities
+fac-src = $(SRC_PATH)/target-$(TARGET_BASE_ARCH)
+fac-dst = $(SRC_PATH)/$(TARGET_DIR)
+fac-bin = $(TARGET_DIR)$(fac)
+fac-h = $(fac-bin).h
+GENERATED_HEADERS += $(fac-dst)$(fac).h
+
+$(fac-dst)$(fac).h: $(fac-dst)$(fac)
+   $(call quiet-command,$< >$@,"  GEN   $(fac-h)")
+
+$(fac-dst)$(fac): $(fac-src)/$(fac).c $(fac-src)/cpu-facilities.h
+   $(call quiet-command,$(CC) $(QEMU_INCLUDES) $(QEMU_CFLAGS) $(CFLAGS) -o 
$@ $<,"  CC$(fac-bin)")
+
+clean-target:
+   rm -f $(fac-bin) $(fac-h)
diff --git a/target-s390x/gen-facilities.c b/target-s390x/gen-facilities.c
new file mode 100644
index 000..9d48bcc
--- /dev/null
+++ b/target-s390x/gen-facilities.c
@@ -0,0 +1,401 @@
+/*
+ * S390 facility list/mask generator
+ *
+ * Copyright 2015 IBM Corp.
+ *
+ * Author(s): Michael Mueller 
+ *
+ * This work is licensed under the terms of the GNU GPL, version 2 or (at
+ * your option) any later version. See the COPYING file in the top-level
+ * directory.
+ *
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include "cpu-facilities.h"
+
+/* BEGIN FACILITY DEFS */
+
+/***
+ * CMOS G7 processors
+ ***/
+
+/* 2064-GA1 */
+static uint16_t set_2064_GA1[] = {
+FAC_N3,
+FAC_ZARCH,
+FAC_ZARCH_ACTIVE,
+};
+#define clear_2064_GA1 EmptyFacs
+
+/* 2064-GA2 */
+static uint16_t set_2064_GA2[] = {
+FAC_EXTENDED_TRANSLATION_2,
+};
+#define clear_2064_GA2 EmptyFacs
+
+/* 2064-GA3 */
+#define set_2064_GA3 EmptyFacs
+#define clear_2064_GA3 EmptyFacs
+
+/* 2066-GA1 */
+#define set_2066_GA1 EmptyFacs
+#define clear_2066_GA1 EmptyFacs
+
+/***
+ * CMOS G8 processors
+ ***/
+
+/* 2084-GA1 */
+static uint16_t set_2084_GA1[] = {
+FAC_DAT_ENH,
+FAC_MESSAGE_SECURITY_ASSIST,
+FAC_LONG_DISPLACEMENT,
+FAC_LONG_DISPLACEMENT_FAST,
+FAC_HFP_MADDSUB,
+};
+#define clear_2084_GA1 EmptyFacs
+
+/* 2084-GA2 */
+static uint16_t set_2084_GA2[] = {
+4,
+};
+#define clear_2084_GA2 EmptyFacs
+
+/* 2084-GA3 */
+static uint16_t set_2084_GA3[] = {
+FAC_ASN_LX_REUSE,
+FAC_EXTENDED_TRANSLATION_3,
+};
+#define clear_2084_GA3 EmptyFacs
+
+/* 2084-GA4 */
+#define set_2084_GA4 EmptyFacs
+#define clear_2084_GA4 EmptyFacs
+
+/* 208

[PATCH v3 01/16] Introduce probe mode for machine type none

2015-03-02 Thread Michael Mueller
QEMU now switches into "probe mode" when the selected machine is "none" and no
specific accelerator has been requested (i.e. "-machine none").

In probe mode, a predefined list of accelerators (those enabled by their
CONFIG_* defines) run their init() methods.
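
A hypothetical consumer sketch for the new kvm_probe_mode_enabled() macro
(not part of this patch):

  if (kvm_probe_mode_enabled()) {
      /* only collect capabilities; do not configure the VM for running */
  }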

Signed-off-by: Michael Mueller 
---
 accel.c  | 31 +--
 include/hw/boards.h  |  1 +
 include/sysemu/kvm.h | 10 ++
 kvm-all.c|  3 +++
 4 files changed, 39 insertions(+), 6 deletions(-)

diff --git a/accel.c b/accel.c
index 74e41da..260b009 100644
--- a/accel.c
+++ b/accel.c
@@ -36,6 +36,9 @@
 
 int tcg_tb_size;
 static bool tcg_allowed = true;
+static const char *probe_mode_accels =
+"kvm:"
+"tcg";
 
 static int tcg_init(MachineState *ms)
 {
@@ -59,13 +62,15 @@ static AccelClass *accel_find(const char *opt_name)
 return ac;
 }
 
-static int accel_init_machine(AccelClass *acc, MachineState *ms)
+static int accel_init_machine(AccelClass *acc, MachineState *ms,
+  bool probe_mode)
 {
 ObjectClass *oc = OBJECT_CLASS(acc);
 const char *cname = object_class_get_name(oc);
 AccelState *accel = ACCEL(object_new(cname));
 int ret;
 ms->accelerator = accel;
+ms->probe_mode = probe_mode;
 *(acc->allowed) = true;
 ret = acc->init_machine(ms);
 if (ret < 0) {
@@ -78,20 +83,30 @@ static int accel_init_machine(AccelClass *acc, MachineState 
*ms)
 
 int configure_accelerator(MachineState *ms)
 {
-const char *p;
+const char *p, *name;
 char buf[10];
 int ret;
 bool accel_initialised = false;
 bool init_failed = false;
 AccelClass *acc = NULL;
+ObjectClass *oc;
+bool probe_mode = false;
 
 p = qemu_opt_get(qemu_get_machine_opts(), "accel");
 if (p == NULL) {
-/* Use the default "accelerator", tcg */
-p = "tcg";
+oc = (ObjectClass *) MACHINE_GET_CLASS(current_machine);
+name = object_class_get_name(oc);
+probe_mode = !strcmp(name, "none" TYPE_MACHINE_SUFFIX);
+if (probe_mode) {
+/* Use these accelerators in probe mode, tcg should be last */
+p = probe_mode_accels;
+} else {
+/* Use the default "accelerator", tcg */
+p = "tcg";
+}
 }
 
-while (!accel_initialised && *p != '\0') {
+while ((probe_mode || !accel_initialised) && *p != '\0') {
 if (*p == ':') {
 p++;
 }
@@ -106,7 +121,7 @@ int configure_accelerator(MachineState *ms)
acc->name);
 continue;
 }
-ret = accel_init_machine(acc, ms);
+ret = accel_init_machine(acc, ms, probe_mode);
 if (ret < 0) {
 init_failed = true;
 fprintf(stderr, "failed to initialize %s: %s\n",
@@ -128,6 +143,10 @@ int configure_accelerator(MachineState *ms)
 fprintf(stderr, "Back to %s accelerator.\n", acc->name);
 }
 
+if (probe_mode) {
+accel_initialised = false;
+}
+
 return !accel_initialised;
 }
 
diff --git a/include/hw/boards.h b/include/hw/boards.h
index 3ddc449..3253fa5 100644
--- a/include/hw/boards.h
+++ b/include/hw/boards.h
@@ -135,6 +135,7 @@ struct MachineState {
 bool usb;
 char *firmware;
 bool iommu;
+bool probe_mode;
 
 ram_addr_t ram_size;
 ram_addr_t maxram_size;
diff --git a/include/sysemu/kvm.h b/include/sysemu/kvm.h
index 30cb84d..fbc18c8 100644
--- a/include/sysemu/kvm.h
+++ b/include/sysemu/kvm.h
@@ -50,6 +50,7 @@ extern bool kvm_msi_via_irqfd_allowed;
 extern bool kvm_gsi_routing_allowed;
 extern bool kvm_gsi_direct_mapping;
 extern bool kvm_readonly_mem_allowed;
+extern bool kvm_probe_mode;
 
 #if defined CONFIG_KVM || !defined NEED_CPU_H
 #define kvm_enabled()   (kvm_allowed)
@@ -143,6 +144,15 @@ extern bool kvm_readonly_mem_allowed;
  */
 #define kvm_readonly_mem_enabled() (kvm_readonly_mem_allowed)
 
+/**
+ * kvm_probe_mode_enabled:
+ *
+ * Returns: true if KVM is initialized for a machine type that
+ * has its probe_mode attribute set (ie QEMU was started in probe
+ * mode)
+ */
+#define kvm_probe_mode_enabled() (kvm_probe_mode)
+
 #else
 #define kvm_enabled()   (0)
 #define kvm_irqchip_in_kernel() (false)
diff --git a/kvm-all.c b/kvm-all.c
index 05a79c2..f9e4434 100644
--- a/kvm-all.c
+++ b/kvm-all.c
@@ -126,6 +126,7 @@ bool kvm_gsi_routing_allowed;
 bool kvm_gsi_direct_mapping;
 bool kvm_allowed;
 bool kvm_readonly_mem_allowed;
+bool kvm_probe_mode;
 
 static const KVMCapabilityInfo kvm_required_capabilites[] = {
 KVM_CAP_INFO(USER_MEMORY),
@@ -1471,6 +1472,8 @@ static int kvm_init(MachineState *ms)
 goto err;
 }
 
+kvm_probe_mode = ms->probe_mode;
+
 s->nr_slots = kvm_check_extension(s, KVM_CAP_NR_MEMSLOTS);
 
 /* If unspecified, use the default value */
-- 
1.8.3.1


[PATCH v3 02/16] Introduce option --probe to switch into probe mode

2015-03-02 Thread Michael Mueller
The option --probe allows switching into probe mode also for machines
other than "none". If one or more accelerators are specified, these
accelerators are used to provide the probed properties. If no accelerator
is given, a list of accelerators that support probing is used.

Signed-off-by: Michael Mueller 
---
 accel.c| 13 -
 include/sysemu/accel.h |  2 +-
 qemu-options.hx|  8 
 vl.c   |  7 ++-
 4 files changed, 23 insertions(+), 7 deletions(-)

diff --git a/accel.c b/accel.c
index 260b009..4ed6df8 100644
--- a/accel.c
+++ b/accel.c
@@ -81,7 +81,7 @@ static int accel_init_machine(AccelClass *acc, MachineState 
*ms,
 return ret;
 }
 
-int configure_accelerator(MachineState *ms)
+int configure_accelerator(MachineState *ms, int probe)
 {
 const char *p, *name;
 char buf[10];
@@ -90,13 +90,16 @@ int configure_accelerator(MachineState *ms)
 bool init_failed = false;
 AccelClass *acc = NULL;
 ObjectClass *oc;
-bool probe_mode = false;
+bool probe_mode;
 
+probe_mode = probe != 0;
 p = qemu_opt_get(qemu_get_machine_opts(), "accel");
 if (p == NULL) {
-oc = (ObjectClass *) MACHINE_GET_CLASS(current_machine);
-name = object_class_get_name(oc);
-probe_mode = !strcmp(name, "none" TYPE_MACHINE_SUFFIX);
+if (!probe_mode) {
+oc = (ObjectClass *) MACHINE_GET_CLASS(current_machine);
+name = object_class_get_name(oc);
+probe_mode = !strcmp(name, "none" TYPE_MACHINE_SUFFIX);
+}
 if (probe_mode) {
 /* Use these accelerators in probe mode, tcg should be last */
 p = probe_mode_accels;
diff --git a/include/sysemu/accel.h b/include/sysemu/accel.h
index 997720f..3adb6ba 100644
--- a/include/sysemu/accel.h
+++ b/include/sysemu/accel.h
@@ -57,6 +57,6 @@ typedef struct AccelClass {
 
 extern int tcg_tb_size;
 
-int configure_accelerator(MachineState *ms);
+int configure_accelerator(MachineState *ms, int probe);
 
 #endif
diff --git a/qemu-options.hx b/qemu-options.hx
index 85ca3ad..22e7544 100644
--- a/qemu-options.hx
+++ b/qemu-options.hx
@@ -2847,6 +2847,14 @@ STEXI
 Do not start CPU at startup (you must type 'c' in the monitor).
 ETEXI
 
+DEF("probe", 0, QEMU_OPTION_probe, \
+"-probe  startup in probe mode, option -S is selected as well\n", 
QEMU_ARCH_ALL)
+STEXI
+@item -probe
+@findex -probe
+Startup in probe mode.
+ETEXI
+
 DEF("realtime", HAS_ARG, QEMU_OPTION_realtime,
 "-realtime [mlock=on|off]\n"
 "run qemu with realtime features\n"
diff --git a/vl.c b/vl.c
index e1ffd0a..ba1730c 100644
--- a/vl.c
+++ b/vl.c
@@ -138,6 +138,7 @@ bool enable_mlock = false;
 int nb_nics;
 NICInfo nd_table[MAX_NICS];
 int autostart;
+int probe;
 static int rtc_utc = 1;
 static int rtc_date_offset = -1; /* -1 means no change */
 QEMUClockType rtc_clock;
@@ -3144,6 +3145,10 @@ int main(int argc, char **argv, char **envp)
 case QEMU_OPTION_S:
 autostart = 0;
 break;
+case QEMU_OPTION_probe:
+probe = 1;
+autostart = 0;
+break;
 case QEMU_OPTION_k:
 keyboard_layout = optarg;
 break;
@@ -4023,7 +4028,7 @@ int main(int argc, char **argv, char **envp)
 exit(1);
 }
 
-configure_accelerator(current_machine);
+configure_accelerator(current_machine, probe);
 
 if (qtest_chrdev) {
 Error *local_err = NULL;
-- 
1.8.3.1



[PATCH v3 08/16] target-s390x: Add cpu model alias definition routines

2015-03-02 Thread Michael Mueller
This patch implements the infrastructure to dynamically add cpu
model aliases.
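
A short usage sketch (names from this patch; "2827-ga2" is one of the cpu
model names defined earlier in the series, and a zero return indicates
success):

  ObjectClass *oc;

  if (set_s390_cpu_alias("host", "2827-ga2") == 0) {
      /* "host" now resolves to the 2827-ga2 cpu class */
      oc = s390_cpu_class_by_name("host");
  }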

Signed-off-by: Michael Mueller 
Reviewed-by: Cornelia Huck 
---
 target-s390x/cpu-models.c | 89 +++
 target-s390x/cpu-models.h | 11 ++
 target-s390x/cpu.c|  1 +
 3 files changed, 101 insertions(+)

diff --git a/target-s390x/cpu-models.c b/target-s390x/cpu-models.c
index bd9f0bc..608189d 100644
--- a/target-s390x/cpu-models.c
+++ b/target-s390x/cpu-models.c
@@ -87,3 +87,92 @@ S390_PROC_DEF("2827-ga1", CPU_S390_2827_GA1, "IBM 
zEnterprise EC12 GA1")
 S390_PROC_DEF("2827-ga2", CPU_S390_2827_GA2, "IBM zEnterprise EC12 GA2")
 S390_PROC_DEF("2828-ga1", CPU_S390_2828_GA1, "IBM zEnterprise BC12 GA1")
 
+static GSList *s390_cpu_aliases;
+
+static gint s390_cpu_compare_class_name(gconstpointer a, gconstpointer b)
+{
+const char *aname = object_class_get_name((ObjectClass *) a);
+const char *bname = b;
+int blen;
+
+if (!strcmp(bname, "none") &&
+!strcmp(aname, TYPE_S390_CPU)) {
+return 0;
+}
+blen = strlen(bname);
+if (!strncasecmp(bname, aname, blen) &&
+!strcmp(aname + blen, "-" TYPE_S390_CPU)) {
+return 0;
+}
+return -1;
+}
+
+/**
+ * s390_cpu_class_by_name:
+ * @name: a cpu model or alias name
+ *
+ * The function searches for the requested cpu model name or an alias
+ * cpu model name and returns the associated object class.
+ *
+ * Returns: reference to object class on success or %NULL elsewise.
+ *
+ * Since: 2.3
+ */
+ObjectClass *s390_cpu_class_by_name(const char *name)
+{
+GSList *list, *item;
+ObjectClass *ret = NULL;
+S390CPUAlias *alias;
+
+for (item = s390_cpu_aliases; item != NULL; item = item->next) {
+alias = (S390CPUAlias *) item->data;
+if (strcmp(alias->name, name) == 0) {
+return s390_cpu_class_by_name(alias->model);
+}
+}
+list = object_class_get_list(TYPE_S390_CPU, false);
+item = g_slist_find_custom(list, name, s390_cpu_compare_class_name);
+if (item) {
+ret = OBJECT_CLASS(item->data);
+}
+g_slist_free(list);
+return ret;
+}
+
+/**
+ * set_s390_cpu_alias:
+ * @name: the cpu alias name
+ * @model: the cpu model name
+ *
+ * The function registers the alias @name for an existing cpu @model.
+ *
+ * Returns: %0 in case of success
+ *  -%EINVAL if name or model is %NULL or both are identical
+ *   or model is not a valid cpu model
+ *  -%ENOMEM if internal memory allocation fails
+ *
+ * Since: 2.3
+ */
+int set_s390_cpu_alias(const char *name, const char *model)
+{
+S390CPUAlias *alias;
+
+if (!name || !model) {
+return -EINVAL;
+}
+if (!strcmp(name, model)) {
+return -EINVAL;
+}
+if (!s390_cpu_class_by_name(model)) {
+return -EINVAL;
+}
+alias = g_try_malloc0(sizeof(S390CPUAlias));
+if (!alias) {
+return -ENOMEM;
+}
+alias->name = g_strdup(name);
+alias->model = g_strdup(model);
+s390_cpu_aliases = g_slist_append(s390_cpu_aliases, alias);
+return 0;
+}
+
diff --git a/target-s390x/cpu-models.h b/target-s390x/cpu-models.h
index d5f0b59..a32f559 100644
--- a/target-s390x/cpu-models.h
+++ b/target-s390x/cpu-models.h
@@ -35,6 +35,17 @@
 #define cpu_class(x)  (((x) >> 20) & 0x3)
 #define cpu_generation(x) (((x) >> 24) & 0xff)
 
+ObjectClass *s390_cpu_class_by_name(const char *name);
+int set_s390_cpu_alias(const char *name, const char *model);
+
+/*
+ * S390 cpu aliases will be added dynamically
+ */
+typedef struct S390CPUAlias {
+char *name;
+char *model;
+} S390CPUAlias;
+
 /*
  * bits 0-7   : CMOS generation
  * bits 8-9   : reserved
diff --git a/target-s390x/cpu.c b/target-s390x/cpu.c
index 7a26b91..1992910 100644
--- a/target-s390x/cpu.c
+++ b/target-s390x/cpu.c
@@ -308,6 +308,7 @@ static void s390_cpu_class_init(ObjectClass *oc, void *data)
 #endif
 scc->cpu_reset = s390_cpu_reset;
 scc->initial_cpu_reset = s390_cpu_initial_reset;
+cc->class_by_name = s390_cpu_class_by_name;
 cc->reset = s390_cpu_full_reset;
 cc->has_work = s390_cpu_has_work;
 cc->do_interrupt = s390_cpu_do_interrupt;
-- 
1.8.3.1



[PATCH v3 03/16] Introduce stub routine cpu_desc_avail

2015-03-02 Thread Michael Mueller
This patch introduces the function cpu_desc_avail(), which returns true by
default unless an architecture-specific implementation is provided. Its
intention is to indicate whether the cpu model descriptions are available
for display by list_cpus(). This change allows cpu model descriptions to be
created dynamically by evaluating the runtime context instead of displaying
static cpu model information.
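
A hypothetical architecture-specific override (sketched here, not part of
this patch) could gate the help output on the dynamic setup having
completed:

  bool cpu_desc_avail(void)
  {
      /* s390x sketch: descriptions are only meaningful once the cpu
       * classes have been set up from the host context */
      return s390_cpu_classes_initialized();
  }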

Signed-off-by: Michael Mueller 
Reviewed-by: Thomas Huth 
---
 include/qemu-common.h  | 2 ++
 stubs/Makefile.objs| 1 +
 stubs/cpu-desc-avail.c | 6 ++
 vl.c   | 2 +-
 4 files changed, 10 insertions(+), 1 deletion(-)
 create mode 100644 stubs/cpu-desc-avail.c

diff --git a/include/qemu-common.h b/include/qemu-common.h
index 644b46d..45040f9 100644
--- a/include/qemu-common.h
+++ b/include/qemu-common.h
@@ -481,4 +481,6 @@ int parse_debug_env(const char *name, int max, int initial);
 
 const char *qemu_ether_ntoa(const MACAddr *mac);
 
+bool cpu_desc_avail(void);
+
 #endif
diff --git a/stubs/Makefile.objs b/stubs/Makefile.objs
index 5e347d0..fd7a489 100644
--- a/stubs/Makefile.objs
+++ b/stubs/Makefile.objs
@@ -40,3 +40,4 @@ stub-obj-$(CONFIG_WIN32) += fd-register.o
 stub-obj-y += cpus.o
 stub-obj-y += kvm.o
 stub-obj-y += qmp_pc_dimm_device_list.o
+stub-obj-y += cpu-desc-avail.o
diff --git a/stubs/cpu-desc-avail.c b/stubs/cpu-desc-avail.c
new file mode 100644
index 000..0cd594e
--- /dev/null
+++ b/stubs/cpu-desc-avail.c
@@ -0,0 +1,6 @@
+#include "qemu-common.h"
+
+bool cpu_desc_avail(void)
+{
+return true;
+}
diff --git a/vl.c b/vl.c
index ba1730c..d337a74 100644
--- a/vl.c
+++ b/vl.c
@@ -3815,7 +3815,7 @@ int main(int argc, char **argv, char **envp)
  */
 cpudef_init();
 
-if (cpu_model && is_help_option(cpu_model)) {
+if (cpu_model && cpu_desc_avail() && is_help_option(cpu_model)) {
 list_cpus(stdout, &fprintf, cpu_model);
 exit(0);
 }
-- 
1.8.3.1



[PATCH v2 4/5] arm/arm64: KVM: Fix migration race in the arch timer

2015-03-02 Thread Alex Bennée
From: Christoffer Dall 

When a VCPU is no longer running, we currently check to see if it has a
timer scheduled in the future, and if it does, we schedule a host
hrtimer to notify us in case the timer expires while the VCPU is still
not running.  When the hrtimer fires, we mask the guest's timer and
inject the timer IRQ (still relying on the guest unmasking the timer when
it receives the IRQ).

This is all good and fine, but when migrating a VM (checkpoint/restore)
this introduces a race.  It is unlikely, but possible, for the following
sequence of events to happen:

 1. Userspace stops the VM
 2. Hrtimer for VCPU is scheduled
 3. Userspace checkpoints the VGIC state (no pending timer interrupts)
 4. The hrtimer fires, schedules work in a workqueue
 5. Workqueue function runs, masks the timer and injects timer interrupt
 6. Userspace checkpoints the timer state (timer masked)

At restore time, you end up with a masked timer without any timer
interrupts, and your guest halts, never receiving timer interrupts.

Fix this by only kicking the VCPU in the workqueue function, and sample
the expired state of the timer when entering the guest again and inject
the interrupt and mask the timer only then.

Signed-off-by: Christoffer Dall 
Signed-off-by: Alex Bennée 

diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 8531536..f7fd76e 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -269,7 +269,7 @@ void kvm_arch_vcpu_destroy(struct kvm_vcpu *vcpu)
 
 int kvm_cpu_has_pending_timer(struct kvm_vcpu *vcpu)
 {
-   return 0;
+   return kvm_timer_should_fire(vcpu);
 }
 
 int kvm_arch_vcpu_init(struct kvm_vcpu *vcpu)
diff --git a/include/kvm/arm_arch_timer.h b/include/kvm/arm_arch_timer.h
index b3f45a5..98cc9f4 100644
--- a/include/kvm/arm_arch_timer.h
+++ b/include/kvm/arm_arch_timer.h
@@ -72,6 +72,8 @@ void kvm_timer_vcpu_terminate(struct kvm_vcpu *vcpu);
 u64 kvm_arm_timer_get_reg(struct kvm_vcpu *, u64 regid);
 int kvm_arm_timer_set_reg(struct kvm_vcpu *, u64 regid, u64 value);
 
+bool kvm_timer_should_fire(struct kvm_vcpu *vcpu);
+
 #else
 static inline int kvm_timer_hyp_init(void)
 {
@@ -96,6 +98,11 @@ static inline u64 kvm_arm_timer_get_reg(struct kvm_vcpu 
*vcpu, u64 regid)
 {
return 0;
 }
+
+static inline bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
+{
+   return false;
+}
 #endif
 
 #endif
diff --git a/virt/kvm/arm/arch_timer.c b/virt/kvm/arm/arch_timer.c
index 6e54f35..98c95f2 100644
--- a/virt/kvm/arm/arch_timer.c
+++ b/virt/kvm/arm/arch_timer.c
@@ -85,13 +85,22 @@ static irqreturn_t kvm_arch_timer_handler(int irq, void 
*dev_id)
return IRQ_HANDLED;
 }
 
+/*
+ * Work function for handling the backup timer that we schedule when a vcpu is
+ * no longer running, but had a timer programmed to fire in the future.
+ */
 static void kvm_timer_inject_irq_work(struct work_struct *work)
 {
struct kvm_vcpu *vcpu;
 
vcpu = container_of(work, struct kvm_vcpu, arch.timer_cpu.expired);
vcpu->arch.timer_cpu.armed = false;
-   kvm_timer_inject_irq(vcpu);
+
+   /*
+* If the vcpu is blocked we want to wake it up so that it will see
+* the timer has expired when entering the guest.
+*/
+   kvm_vcpu_kick(vcpu);
 }
 
 static enum hrtimer_restart kvm_timer_expire(struct hrtimer *hrt)
@@ -102,6 +111,21 @@ static enum hrtimer_restart kvm_timer_expire(struct 
hrtimer *hrt)
return HRTIMER_NORESTART;
 }
 
+bool kvm_timer_should_fire(struct kvm_vcpu *vcpu)
+{
+   struct arch_timer_cpu *timer = &vcpu->arch.timer_cpu;
+   cycle_t cval, now;
+
+   if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
+   !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
+   return false;
+
+   cval = timer->cntv_cval;
+   now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
+
+   return cval <= now;
+}
+
 /**
  * kvm_timer_flush_hwstate - prepare to move the virt timer to the cpu
  * @vcpu: The vcpu pointer
@@ -119,6 +143,13 @@ void kvm_timer_flush_hwstate(struct kvm_vcpu *vcpu)
 * populate the CPU timer again.
 */
timer_disarm(timer);
+
+   /*
+* If the timer expired while we were not scheduled, now is the time
+* to inject it.
+*/
+   if (kvm_timer_should_fire(vcpu))
+   kvm_timer_inject_irq(vcpu);
 }
 
 /**
@@ -134,16 +165,9 @@ void kvm_timer_sync_hwstate(struct kvm_vcpu *vcpu)
cycle_t cval, now;
u64 ns;
 
-   if ((timer->cntv_ctl & ARCH_TIMER_CTRL_IT_MASK) ||
-   !(timer->cntv_ctl & ARCH_TIMER_CTRL_ENABLE))
-   return;
-
-   cval = timer->cntv_cval;
-   now = kvm_phys_timer_read() - vcpu->kvm->arch.timer.cntvoff;
-
BUG_ON(timer_is_armed(timer));
 
-   if (cval <= now) {
+   if (kvm_timer_should_fire(vcpu)) {
/*
 * Timer has already expired while we were not
 * looking. Inject the interrupt and carry on.
@@ -152,6 +

[PATCH v2 0/5] KVM ARM64 Migration Fixes

2015-03-02 Thread Alex Bennée
An update to the patches posted earlier this week. The result is textually
identical, but:

  - split out the vgic_queue_irq_to_lr re-factoring
  - merged doc changes into earlier patch
  - added Acked-by tags

Christoffer,

Are you ok with the split?

The branch I've been working with can be found at:

  http://git.linaro.org/people/alex.bennee/linux.git
  branch: migration/kvmarm-fixes-for-4.0-v2

It includes the two patches from the current kvmarm/master branch.

Alex Bennée (2):
  arm: KVM: export vcpu->pause state via MP_STATE ioctls
  arm: KVM: add a common vgic_queue_irq_to_lr fn

Christoffer Dall (3):
  arm/arm64: KVM: Implement support for unqueueing active IRQs
  arm/arm64: KVM: Fix migration race in the arch timer
  arm/arm64: KVM: Keep elrsr/aisr in sync with software model

 Documentation/virtual/kvm/api.txt |  24 +++-
 arch/arm/kvm/arm.c|  23 +++-
 include/kvm/arm_arch_timer.h  |   7 ++
 include/kvm/arm_vgic.h|  16 ++-
 virt/kvm/arm/arch_timer.c |  45 +--
 virt/kvm/arm/vgic-v2-emul.c   |  20 +++-
 virt/kvm/arm/vgic-v2.c|   8 ++
 virt/kvm/arm/vgic-v3.c|   8 ++
 virt/kvm/arm/vgic.c   | 245 +++---
 virt/kvm/arm/vgic.h   |   8 ++
 10 files changed, 342 insertions(+), 62 deletions(-)

-- 
2.3.0

--


[PATCH v2 2/5] arm/arm64: KVM: Implement support for unqueueing active IRQs

2015-03-02 Thread Alex Bennée
From: Christoffer Dall 

Migrating active interrupts currently causes the active state to be lost
completely. This patch adds bitmaps to track the active state on the
distributor and exports that state to user space.
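
The idea, roughly, is that when LRs are unqueued back into the distributor,
the active bit is folded into the new bitmap instead of being dropped (a
sketch of the core step only; the SGI-source and pending handling live in
the diff):

        if (vlr.state & LR_STATE_ACTIVE) {
                /* remember the active state on the distributor side */
                vgic_irq_set_active(vcpu, vlr.irq);
                vlr.state &= ~LR_STATE_ACTIVE;
        }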

Signed-off-by: Christoffer Dall 
Signed-off-by: Alex Bennée 

---
AJB:
   - fixed merge conflicts
   - moved additional shared bitmaps to be dynamically allocated
   - make irq_active_on_cpu dynamically allocated as well
   - in vgic_queue_irq don't queue pending if already active
   - in __kvm_vgic_flush_hwstate use pr_shared when checking SPIs
   - vgic: clear active on CPU bit
   - checkpatch, remove extraneous braces
   - checkpatch, remove debug, fix overflows
   - move register access fns to re-factored vgic-v2-emul.c
v2
   - doc: unqueue and update_state

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 7c55dd5..7042251 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -196,6 +196,9 @@ struct vgic_dist {
/* Level-triggered interrupt queued on VCPU interface */
struct vgic_bitmap  irq_queued;
 
+   /* Interrupt was active when unqueue from VCPU interface */
+   struct vgic_bitmap  irq_active;
+
/* Interrupt priority. Not used yet. */
struct vgic_bytemap irq_priority;
 
@@ -236,6 +239,9 @@ struct vgic_dist {
/* Bitmap indicating which CPU has something pending */
unsigned long   *irq_pending_on_cpu;
 
+   /* Bitmap indicating which CPU has active IRQs */
+   unsigned long   *irq_active_on_cpu;
+
struct vgic_vm_ops  vm_ops;
 #endif
 };
@@ -269,9 +275,15 @@ struct vgic_cpu {
/* per IRQ to LR mapping */
u8  *vgic_irq_lr_map;
 
-   /* Pending interrupts on this VCPU */
+   /* Pending/active/both interrupts on this VCPU */
DECLARE_BITMAP( pending_percpu, VGIC_NR_PRIVATE_IRQS);
+   DECLARE_BITMAP( active_percpu, VGIC_NR_PRIVATE_IRQS);
+   DECLARE_BITMAP( pend_act_percpu, VGIC_NR_PRIVATE_IRQS);
+
+   /* Pending/active/both shared interrupts, dynamically sized */
unsigned long   *pending_shared;
+   unsigned long   *active_shared;
+   unsigned long   *pend_act_shared;
 
/* Bitmap of used/free list registers */
DECLARE_BITMAP( lr_used, VGIC_V2_MAX_LRS);
@@ -311,6 +323,7 @@ int kvm_vgic_inject_irq(struct kvm *kvm, int cpuid, 
unsigned int irq_num,
bool level);
 void vgic_v3_dispatch_sgi(struct kvm_vcpu *vcpu, u64 reg);
 int kvm_vgic_vcpu_pending_irq(struct kvm_vcpu *vcpu);
+int kvm_vgic_vcpu_active_irq(struct kvm_vcpu *vcpu);
 bool vgic_handle_mmio(struct kvm_vcpu *vcpu, struct kvm_run *run,
  struct kvm_exit_mmio *mmio);
 
diff --git a/virt/kvm/arm/vgic-v2-emul.c b/virt/kvm/arm/vgic-v2-emul.c
index 19c6210..c818662 100644
--- a/virt/kvm/arm/vgic-v2-emul.c
+++ b/virt/kvm/arm/vgic-v2-emul.c
@@ -107,6 +107,22 @@ static bool handle_mmio_clear_pending_reg(struct kvm_vcpu 
*vcpu,
 vcpu->vcpu_id);
 }
 
+static bool handle_mmio_set_active_reg(struct kvm_vcpu *vcpu,
+  struct kvm_exit_mmio *mmio,
+  phys_addr_t offset)
+{
+   return vgic_handle_set_active_reg(vcpu->kvm, mmio, offset,
+ vcpu->vcpu_id);
+}
+
+static bool handle_mmio_clear_active_reg(struct kvm_vcpu *vcpu,
+struct kvm_exit_mmio *mmio,
+phys_addr_t offset)
+{
+   return vgic_handle_clear_active_reg(vcpu->kvm, mmio, offset,
+   vcpu->vcpu_id);
+}
+
 static bool handle_mmio_priority_reg(struct kvm_vcpu *vcpu,
 struct kvm_exit_mmio *mmio,
 phys_addr_t offset)
@@ -344,13 +360,13 @@ static const struct kvm_mmio_range vgic_dist_ranges[] = {
.base   = GIC_DIST_ACTIVE_SET,
.len= VGIC_MAX_IRQS / 8,
.bits_per_irq   = 1,
-   .handle_mmio= handle_mmio_raz_wi,
+   .handle_mmio= handle_mmio_set_active_reg,
},
{
.base   = GIC_DIST_ACTIVE_CLEAR,
.len= VGIC_MAX_IRQS / 8,
.bits_per_irq   = 1,
-   .handle_mmio= handle_mmio_raz_wi,
+   .handle_mmio= handle_mmio_clear_active_reg,
},
{
.base   = GIC_DIST_PRI,
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 0cc6ab6..bfb6fbb 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -277,6 +277,14 @@ static void vgic_irq_clear_queued(struct kvm_vcpu *vcpu, 
int irq)
vgic_bitmap_set_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq, 0);
 }
 
+
+static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int irq)
+{
+   struct vgic_d

[PATCH v2 1/5] arm: KVM: export vcpu->pause state via MP_STATE ioctls

2015-03-02 Thread Alex Bennée
To cleanly restore an SMP VM we need to ensure that the current pause
state of each vcpu is correctly recorded. Things could get confused if
a vcpu that was paused when its state was captured starts running after
the migration restore completes.

We use the existing KVM_GET/SET_MP_STATE ioctl to do this. The arm/arm64
interface is a lot simpler as the only valid states are
KVM_MP_STATE_RUNNABLE and KVM_MP_STATE_HALTED.
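
For reference, a minimal sketch of how a checkpoint/restore tool could use
this from userspace (vcpu_fd, saved_state, the error handling and the usual
<sys/ioctl.h>, <linux/kvm.h>, <err.h> includes are assumptions for the
example, not part of this patch):

        struct kvm_mp_state mp;

        /* save: record whether the vcpu was paused */
        if (ioctl(vcpu_fd, KVM_GET_MP_STATE, &mp) < 0)
                err(1, "KVM_GET_MP_STATE");
        /* ... store mp.mp_state (RUNNABLE or HALTED) alongside the image ... */

        /* restore: put the vcpu back into the recorded state */
        mp.mp_state = saved_state;      /* KVM_MP_STATE_RUNNABLE or _HALTED */
        if (ioctl(vcpu_fd, KVM_SET_MP_STATE, &mp) < 0)
                err(1, "KVM_SET_MP_STATE");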

Signed-off-by: Alex Bennée 

diff --git a/Documentation/virtual/kvm/api.txt 
b/Documentation/virtual/kvm/api.txt
index b112efc..602156f 100644
--- a/Documentation/virtual/kvm/api.txt
+++ b/Documentation/virtual/kvm/api.txt
@@ -997,7 +997,7 @@ for vm-wide capabilities.
 4.38 KVM_GET_MP_STATE
 
 Capability: KVM_CAP_MP_STATE
-Architectures: x86, s390
+Architectures: x86, s390, arm, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_mp_state (out)
 Returns: 0 on success; -1 on error
@@ -1027,15 +1027,21 @@ Possible values are:
  - KVM_MP_STATE_LOAD:the vcpu is in a special load/startup state
  [s390]
 
-On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
-in-kernel irqchip, the multiprocessing state must be maintained by userspace on
+For x86:
+
+This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
+irqchip, the multiprocessing state must be maintained by userspace on
 these architectures.
 
+For arm/arm64:
+
+The only states that are valid are KVM_MP_STATE_HALTED and
+KVM_MP_STATE_RUNNABLE which reflect if the vcpu is paused or not.
 
 4.39 KVM_SET_MP_STATE
 
 Capability: KVM_CAP_MP_STATE
-Architectures: x86, s390
+Architectures: x86, s390, arm, arm64
 Type: vcpu ioctl
 Parameters: struct kvm_mp_state (in)
 Returns: 0 on success; -1 on error
@@ -1043,10 +1049,16 @@ Returns: 0 on success; -1 on error
 Sets the vcpu's current "multiprocessing state"; see KVM_GET_MP_STATE for
 arguments.
 
-On x86, this ioctl is only useful after KVM_CREATE_IRQCHIP. Without an
-in-kernel irqchip, the multiprocessing state must be maintained by userspace on
+For x86:
+
+This ioctl is only useful after KVM_CREATE_IRQCHIP.  Without an in-kernel
+irqchip, the multiprocessing state must be maintained by userspace on
 these architectures.
 
+For arm/arm64:
+
+The only states that are valid are KVM_MP_STATE_HALTED and
+KVM_MP_STATE_RUNNABLE which reflect if the vcpu should be paused or not.
 
 4.40 KVM_SET_IDENTITY_MAP_ADDR
 
diff --git a/arch/arm/kvm/arm.c b/arch/arm/kvm/arm.c
index 5560f74..8531536 100644
--- a/arch/arm/kvm/arm.c
+++ b/arch/arm/kvm/arm.c
@@ -183,6 +183,7 @@ int kvm_vm_ioctl_check_extension(struct kvm *kvm, long ext)
case KVM_CAP_ARM_PSCI:
case KVM_CAP_ARM_PSCI_0_2:
case KVM_CAP_READONLY_MEM:
+   case KVM_CAP_MP_STATE:
r = 1;
break;
case KVM_CAP_COALESCED_MMIO:
@@ -313,13 +314,29 @@ int kvm_arch_vcpu_ioctl_set_guest_debug(struct kvm_vcpu 
*vcpu,
 int kvm_arch_vcpu_ioctl_get_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
 {
-   return -EINVAL;
+   if (vcpu->arch.pause)
+   mp_state->mp_state = KVM_MP_STATE_HALTED;
+   else
+   mp_state->mp_state = KVM_MP_STATE_RUNNABLE;
+
+   return 0;
 }
 
 int kvm_arch_vcpu_ioctl_set_mpstate(struct kvm_vcpu *vcpu,
struct kvm_mp_state *mp_state)
 {
-   return -EINVAL;
+   switch (mp_state->mp_state) {
+   case KVM_MP_STATE_RUNNABLE:
+   vcpu->arch.pause = false;
+   break;
+   case KVM_MP_STATE_HALTED:
+   vcpu->arch.pause = true;
+   break;
+   default:
+   return -EINVAL;
+   }
+
+   return 0;
 }
 
 /**
-- 
2.3.0

--


[PATCH v2 3/5] arm: KVM: add a common vgic_queue_irq_to_lr fn

2015-03-02 Thread Alex Bennée
This refactors away some of the repetitive code and makes the code flow
read more naturally.

Signed-off-by: Alex Bennée 

diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index bfb6fbb..3b4ded2 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -263,6 +263,13 @@ static int vgic_irq_is_queued(struct kvm_vcpu *vcpu, int 
irq)
return vgic_bitmap_get_irq_val(&dist->irq_queued, vcpu->vcpu_id, irq);
 }
 
+static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
+{
+   struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+   return vgic_bitmap_get_irq_val(&dist->irq_active, vcpu->vcpu_id, irq);
+}
+
 static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
 {
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
@@ -285,6 +292,13 @@ static void vgic_irq_set_active(struct kvm_vcpu *vcpu, int 
irq)
vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 1);
 }
 
+static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
+{
+   struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
+
+   vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id, irq, 0);
+}
+
 static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
 {
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
@@ -1032,6 +1046,25 @@ static void vgic_retire_disabled_irqs(struct kvm_vcpu 
*vcpu)
}
 }
 
+static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
+int lr_nr, struct vgic_lr vlr)
+{
+   if (vgic_irq_is_active(vcpu, irq)) {
+   vlr.state |= LR_STATE_ACTIVE;
+   kvm_debug("Set active, clear distributor: 0x%x\n", vlr.state);
+   vgic_irq_clear_active(vcpu, irq);
+   vgic_update_state(vcpu->kvm);
+   } else if (vgic_dist_irq_is_pending(vcpu, irq)) {
+   vlr.state |= LR_STATE_PENDING;
+   kvm_debug("Set pending: 0x%x\n", vlr.state);
+   }
+
+   if (!vgic_irq_is_edge(vcpu, irq))
+   vlr.state |= LR_EOI_INT;
+
+   vgic_set_lr(vcpu, lr_nr, vlr);
+}
+
 /*
  * Queue an interrupt to a CPU virtual interface. Return true on success,
  * or false if it wasn't possible to queue it.
@@ -1059,8 +1092,7 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 
sgi_source_id, int irq)
if (vlr.source == sgi_source_id) {
kvm_debug("LR%d piggyback for IRQ%d\n", lr, vlr.irq);
BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
-   vlr.state |= LR_STATE_PENDING;
-   vgic_set_lr(vcpu, lr, vlr);
+   vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
return true;
}
}
@@ -1077,11 +1109,8 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8 
sgi_source_id, int irq)
 
vlr.irq = irq;
vlr.source = sgi_source_id;
-   vlr.state = LR_STATE_PENDING;
-   if (!vgic_irq_is_edge(vcpu, irq))
-   vlr.state |= LR_EOI_INT;
-
-   vgic_set_lr(vcpu, lr, vlr);
+   vlr.state = 0;
+   vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
 
return true;
 }
-- 
2.3.0

--


[PATCH v2 5/5] arm/arm64: KVM: Keep elrsr/aisr in sync with software model

2015-03-02 Thread Alex Bennée
From: Christoffer Dall 

There is an interesting bug in the vgic code, which manifests itself
when the KVM run loop has a signal pending or needs a vmid generation
rollover after having disabled interrupts but before actually switching
to the guest.

In this case, we flush the vgic as usual, but we sync back the vgic
state and exit to userspace before entering the guest.  The consequence
is that we will be syncing the list registers back to the software model
using the GICH_ELRSR and GICH_EISR from the last execution of the guest,
potentially overwriting a list register containing an interrupt.

This showed up during migration testing where we would capture a state
where the VM has masked the arch timer but there were no interrupts,
resulting in a hung test.
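
The rule the fix enforces is small (a sketch; the per-GIC-version plumbing is
in the diff below): whenever an LR is written back, keep the "LR empty"
bookkeeping honest, even if we never re-entered the guest in between:

        if (vlr.state & LR_STATE_MASK)
                elrsr &= ~(1ULL << lr);         /* LR still holds an interrupt */
        else
                elrsr |= (1ULL << lr);          /* LR really is free */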

Cc: Marc Zyngier 
Reported-by: Alex Bennee 
Signed-off-by: Christoffer Dall 
Signed-off-by: Alex Bennée 
Acked-by: Marc Zyngier 

diff --git a/include/kvm/arm_vgic.h b/include/kvm/arm_vgic.h
index 7042251..e2a676e 100644
--- a/include/kvm/arm_vgic.h
+++ b/include/kvm/arm_vgic.h
@@ -114,6 +114,7 @@ struct vgic_ops {
void(*sync_lr_elrsr)(struct kvm_vcpu *, int, struct vgic_lr);
u64 (*get_elrsr)(const struct kvm_vcpu *vcpu);
u64 (*get_eisr)(const struct kvm_vcpu *vcpu);
+   void(*clear_eisr)(struct kvm_vcpu *vcpu);
u32 (*get_interrupt_status)(const struct kvm_vcpu *vcpu);
void(*enable_underflow)(struct kvm_vcpu *vcpu);
void(*disable_underflow)(struct kvm_vcpu *vcpu);
diff --git a/virt/kvm/arm/vgic-v2.c b/virt/kvm/arm/vgic-v2.c
index a0a7b5d..f9b9c7c 100644
--- a/virt/kvm/arm/vgic-v2.c
+++ b/virt/kvm/arm/vgic-v2.c
@@ -72,6 +72,8 @@ static void vgic_v2_sync_lr_elrsr(struct kvm_vcpu *vcpu, int 
lr,
 {
if (!(lr_desc.state & LR_STATE_MASK))
vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr |= (1ULL << lr);
+   else
+   vcpu->arch.vgic_cpu.vgic_v2.vgic_elrsr &= ~(1ULL << lr);
 }
 
 static u64 vgic_v2_get_elrsr(const struct kvm_vcpu *vcpu)
@@ -84,6 +86,11 @@ static u64 vgic_v2_get_eisr(const struct kvm_vcpu *vcpu)
return vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr;
 }
 
+static void vgic_v2_clear_eisr(struct kvm_vcpu *vcpu)
+{
+   vcpu->arch.vgic_cpu.vgic_v2.vgic_eisr = 0;
+}
+
 static u32 vgic_v2_get_interrupt_status(const struct kvm_vcpu *vcpu)
 {
u32 misr = vcpu->arch.vgic_cpu.vgic_v2.vgic_misr;
@@ -148,6 +155,7 @@ static const struct vgic_ops vgic_v2_ops = {
.sync_lr_elrsr  = vgic_v2_sync_lr_elrsr,
.get_elrsr  = vgic_v2_get_elrsr,
.get_eisr   = vgic_v2_get_eisr,
+   .clear_eisr = vgic_v2_clear_eisr,
.get_interrupt_status   = vgic_v2_get_interrupt_status,
.enable_underflow   = vgic_v2_enable_underflow,
.disable_underflow  = vgic_v2_disable_underflow,
diff --git a/virt/kvm/arm/vgic-v3.c b/virt/kvm/arm/vgic-v3.c
index 3a62d8a..dff0602 100644
--- a/virt/kvm/arm/vgic-v3.c
+++ b/virt/kvm/arm/vgic-v3.c
@@ -104,6 +104,8 @@ static void vgic_v3_sync_lr_elrsr(struct kvm_vcpu *vcpu, 
int lr,
 {
if (!(lr_desc.state & LR_STATE_MASK))
vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr |= (1U << lr);
+   else
+   vcpu->arch.vgic_cpu.vgic_v3.vgic_elrsr &= ~(1U << lr);
 }
 
 static u64 vgic_v3_get_elrsr(const struct kvm_vcpu *vcpu)
@@ -116,6 +118,11 @@ static u64 vgic_v3_get_eisr(const struct kvm_vcpu *vcpu)
return vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr;
 }
 
+static void vgic_v3_clear_eisr(struct kvm_vcpu *vcpu)
+{
+   vcpu->arch.vgic_cpu.vgic_v3.vgic_eisr = 0;
+}
+
 static u32 vgic_v3_get_interrupt_status(const struct kvm_vcpu *vcpu)
 {
u32 misr = vcpu->arch.vgic_cpu.vgic_v3.vgic_misr;
@@ -192,6 +199,7 @@ static const struct vgic_ops vgic_v3_ops = {
.sync_lr_elrsr  = vgic_v3_sync_lr_elrsr,
.get_elrsr  = vgic_v3_get_elrsr,
.get_eisr   = vgic_v3_get_eisr,
+   .clear_eisr = vgic_v3_clear_eisr,
.get_interrupt_status   = vgic_v3_get_interrupt_status,
.enable_underflow   = vgic_v3_enable_underflow,
.disable_underflow  = vgic_v3_disable_underflow,
diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
index 3b4ded2..3690c1e 100644
--- a/virt/kvm/arm/vgic.c
+++ b/virt/kvm/arm/vgic.c
@@ -980,6 +980,11 @@ static inline u64 vgic_get_eisr(struct kvm_vcpu *vcpu)
return vgic_ops->get_eisr(vcpu);
 }
 
+static inline void vgic_clear_eisr(struct kvm_vcpu *vcpu)
+{
+   vgic_ops->clear_eisr(vcpu);
+}
+
 static inline u32 vgic_get_interrupt_status(struct kvm_vcpu *vcpu)
 {
return vgic_ops->get_interrupt_status(vcpu);
@@ -1019,6 +1024,7 @@ static void vgic_retire_lr(int lr_nr, int irq, struct 
kvm_vcpu *vcpu)
vgic_set_lr(vcpu, lr_nr, vlr);
clear_bit(lr_nr, vgic_cpu->lr_used);
vgic_cpu->vgic_irq_lr_map[irq] = LR_EMPTY;
+   vgic_sync_lr_elrsr(vcpu, lr_nr, vlr);
 }
 
 /*
@@ -106

[3.16.y-ckt stable] Patch "KVM: MIPS: Don't leak FPU/DSP to guest" has been added to staging queue

2015-03-02 Thread Luis Henriques
This is a note to let you know that I have just added a patch titled

KVM: MIPS: Don't leak FPU/DSP to guest

to the linux-3.16.y-queue branch of the 3.16.y-ckt extended stable tree 
which can be found at:

 
http://kernel.ubuntu.com/git?p=ubuntu/linux.git;a=shortlog;h=refs/heads/linux-3.16.y-queue

This patch is scheduled to be released in version 3.16.7-ckt8.

If you, or anyone else, feels it should not be added to this tree, please 
reply to this email.

For more information about the 3.16.y-ckt tree, see
https://wiki.ubuntu.com/Kernel/Dev/ExtendedStable

Thanks.
-Luis

--

From b4db76b6e5898d4f4389cc944e5262788fa90d8c Mon Sep 17 00:00:00 2001
From: James Hogan 
Date: Wed, 4 Feb 2015 17:06:37 +
Subject: KVM: MIPS: Don't leak FPU/DSP to guest

commit f798217dfd038af981a18bbe4bc57027a08bb182 upstream.

The FPU and DSP are enabled via the CP0 Status CU1 and MX bits by
kvm_mips_set_c0_status() on a guest exit, presumably in case there is
active state that needs saving if pre-emption occurs. However neither of
these bits is cleared again when returning to the guest.

This effectively gives the guest access to the FPU/DSP hardware after
the first guest exit even though it is not aware of its presence,
allowing FP instructions in guest user code to intermittently actually
execute instead of trapping into the guest OS for emulation. It will
then read & manipulate the hardware FP registers which technically
belong to the user process (e.g. QEMU), or are stale from another user
process. It can also crash the guest OS by causing an FP exception, for
which a guest exception handler won't have been registered.

First let's save and disable the FPU (and MSA) state with lose_fpu(1)
before entering the guest. This simplifies the problem, especially for
when guest FPU/MSA support is added in the future, and prevents FR=1 FPU
state being live when the FR bit gets cleared for the guest, which
according to the architecture causes the contents of the FPU and vector
registers to become UNPREDICTABLE.

We can then safely remove the enabling of the FPU in
kvm_mips_set_c0_status(), since there should never be any active FPU or
MSA state to save at pre-emption, which should plug the FPU leak.

DSP state is always live rather than being lazily restored, so for that
it is simpler to just clear the MX bit again when re-entering the guest.

Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Ralf Baechle 
Cc: Sanjay Lal 
Cc: Gleb Natapov 
Cc: kvm@vger.kernel.org
Cc: linux-m...@linux-mips.org
Signed-off-by: Paolo Bonzini 
[ luis: backported to 3.16: files rename:
  - locore.S -> kvm_locore.S
  - mips.c -> kvm_mips.c ]
Signed-off-by: Luis Henriques 
---
 arch/mips/kvm/kvm_locore.S | 2 +-
 arch/mips/kvm/kvm_mips.c   | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/arch/mips/kvm/kvm_locore.S b/arch/mips/kvm/kvm_locore.S
index 033ac343e72c..17376cd838e6 100644
--- a/arch/mips/kvm/kvm_locore.S
+++ b/arch/mips/kvm/kvm_locore.S
@@ -428,7 +428,7 @@ __kvm_mips_return_to_guest:
/* Setup status register for running guest in UM */
	.set	at
or  v1, v1, (ST0_EXL | KSU_USER | ST0_IE)
-   and v1, v1, ~ST0_CU0
+   and v1, v1, ~(ST0_CU0 | ST0_MX)
	.set	noat
	mtc0	v1, CP0_STATUS
ehb
diff --git a/arch/mips/kvm/kvm_mips.c b/arch/mips/kvm/kvm_mips.c
index f3c56a182fd8..d84f96e51349 100644
--- a/arch/mips/kvm/kvm_mips.c
+++ b/arch/mips/kvm/kvm_mips.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 #include 
 #include 
@@ -413,6 +414,8 @@ int kvm_arch_vcpu_ioctl_run(struct kvm_vcpu *vcpu, struct 
kvm_run *run)
vcpu->mmio_needed = 0;
}

+   lose_fpu(1);
+
local_irq_disable();
/* Check if we have any exceptions/interrupts pending */
kvm_mips_deliver_interrupts(vcpu,
@@ -1028,9 +1031,6 @@ void kvm_mips_set_c0_status(void)
 {
uint32_t status = read_c0_status();

-   if (cpu_has_fpu)
-   status |= (ST0_CU1);
-
if (cpu_has_dsp)
status |= (ST0_MX);

--


RE: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when vCPU is blocked

2015-03-02 Thread Wu, Feng


> -Original Message-
> From: Marcelo Tosatti [mailto:mtosa...@redhat.com]
> Sent: Friday, February 27, 2015 7:41 AM
> To: Wu, Feng
> Cc: t...@linutronix.de; mi...@redhat.com; h...@zytor.com; x...@kernel.org;
> g...@kernel.org; pbonz...@redhat.com; dw...@infradead.org;
> j...@8bytes.org; alex.william...@redhat.com; jiang@linux.intel.com;
> eric.au...@linaro.org; linux-ker...@vger.kernel.org;
> io...@lists.linux-foundation.org; kvm@vger.kernel.org
> Subject: Re: [v3 24/26] KVM: Update Posted-Interrupts Descriptor when vCPU
> is blocked
> 
> On Fri, Dec 12, 2014 at 11:14:58PM +0800, Feng Wu wrote:
> > This patch updates the Posted-Interrupts Descriptor when vCPU
> > is blocked.
> >
> > pre-block:
> > - Add the vCPU to the blocked per-CPU list
> > - Clear 'SN'
> > - Set 'NV' to POSTED_INTR_WAKEUP_VECTOR
> >
> > post-block:
> > - Remove the vCPU from the per-CPU list
> >
> > Signed-off-by: Feng Wu 
> > ---
> >  arch/x86/include/asm/kvm_host.h |  2 +
> >  arch/x86/kvm/vmx.c  | 96
> +
> >  arch/x86/kvm/x86.c  | 22 +++---
> >  include/linux/kvm_host.h|  4 ++
> >  virt/kvm/kvm_main.c |  6 +++
> >  5 files changed, 123 insertions(+), 7 deletions(-)
> >
> > diff --git a/arch/x86/include/asm/kvm_host.h
> b/arch/x86/include/asm/kvm_host.h
> > index 13e3e40..32c110a 100644
> > --- a/arch/x86/include/asm/kvm_host.h
> > +++ b/arch/x86/include/asm/kvm_host.h
> > @@ -101,6 +101,8 @@ static inline gfn_t gfn_to_index(gfn_t gfn, gfn_t
> base_gfn, int level)
> >
> >  #define ASYNC_PF_PER_VCPU 64
> >
> > +extern void (*wakeup_handler_callback)(void);
> > +
> >  enum kvm_reg {
> > VCPU_REGS_RAX = 0,
> > VCPU_REGS_RCX = 1,
> > diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> > index bf2e6cd..a1c83a2 100644
> > --- a/arch/x86/kvm/vmx.c
> > +++ b/arch/x86/kvm/vmx.c
> > @@ -832,6 +832,13 @@ static DEFINE_PER_CPU(struct vmcs *,
> current_vmcs);
> >  static DEFINE_PER_CPU(struct list_head, loaded_vmcss_on_cpu);
> >  static DEFINE_PER_CPU(struct desc_ptr, host_gdt);
> >
> > +/*
> > + * We maintian a per-CPU linked-list of vCPU, so in wakeup_handler() we
> > + * can find which vCPU should be waken up.
> > + */
> > +static DEFINE_PER_CPU(struct list_head, blocked_vcpu_on_cpu);
> > +static DEFINE_PER_CPU(spinlock_t, blocked_vcpu_on_cpu_lock);
> > +
> >  static unsigned long *vmx_io_bitmap_a;
> >  static unsigned long *vmx_io_bitmap_b;
> >  static unsigned long *vmx_msr_bitmap_legacy;
> > @@ -1921,6 +1928,7 @@ static void vmx_vcpu_load(struct kvm_vcpu *vcpu,
> int cpu)
> > struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > struct pi_desc old, new;
> > unsigned int dest;
> > +   unsigned long flags;
> >
> > memset(&old, 0, sizeof(old));
> > memset(&new, 0, sizeof(new));
> > @@ -1942,6 +1950,20 @@ static void vmx_vcpu_load(struct kvm_vcpu
> *vcpu, int cpu)
> > new.nv = POSTED_INTR_VECTOR;
> > } while (cmpxchg(&pi_desc->control, old.control,
> > new.control) != old.control);
> > +
> > +   /*
> > +* Delete the vCPU from the related wakeup queue
> > +* if we are resuming from blocked state
> > +*/
> > +   if (vcpu->blocked) {
> > +   vcpu->blocked = false;
> > +   spin_lock_irqsave(&per_cpu(blocked_vcpu_on_cpu_lock,
> > +   vcpu->wakeup_cpu), flags);
> > +   list_del(&vcpu->blocked_vcpu_list);
> > +   
> > spin_unlock_irqrestore(&per_cpu(blocked_vcpu_on_cpu_lock,
> > +   vcpu->wakeup_cpu), flags);
> > +   vcpu->wakeup_cpu = -1;
> > +   }
> > }
> >  }
> >
> > @@ -1950,6 +1972,9 @@ static void vmx_vcpu_put(struct kvm_vcpu *vcpu)
> > if (irq_remapping_cap(IRQ_POSTING_CAP)) {
> > struct pi_desc *pi_desc = vcpu_to_pi_desc(vcpu);
> > struct pi_desc old, new;
> > +   unsigned long flags;
> > +   int cpu;
> > +   struct cpumask cpu_others_mask;
> >
> > memset(&old, 0, sizeof(old));
> > memset(&new, 0, sizeof(new));
> > @@ -1961,6 +1986,54 @@ static void vmx_vcpu_put(struct kvm_vcpu
> *vcpu)
> > pi_set_sn(&new);
> > } while (cmpxchg(&pi_desc->control, old.control,
> > new.control) != old.control);
> > +   } else if (vcpu->blocked) {
> > +   /*
> > +* The vcpu is blocked on the wait queue.
> > +* Store the blocked vCPU on the list of the
> > +* vcpu->wakeup_cpu, which is the destination
> > +* of the wake-up notification event.
> > +*/
> > +   vcpu->wakeup_cpu = vcpu->cpu;
> > +   spin_lock_irqsave(&per_cpu(blocked_

[3.16.y-ckt stable] Patch "MIPS: Export MSA functions used by lose_fpu(1) for KVM" has been added to staging queue

2015-03-02 Thread Luis Henriques
This is a note to let you know that I have just added a patch titled

MIPS: Export MSA functions used by lose_fpu(1) for KVM

to the linux-3.16.y-queue branch of the 3.16.y-ckt extended stable tree 
which can be found at:

 
http://kernel.ubuntu.com/git?p=ubuntu/linux.git;a=shortlog;h=refs/heads/linux-3.16.y-queue

This patch is scheduled to be released in version 3.16.7-ckt8.

If you, or anyone else, feels it should not be added to this tree, please 
reply to this email.

For more information about the 3.16.y-ckt tree, see
https://wiki.ubuntu.com/Kernel/Dev/ExtendedStable

Thanks.
-Luis

--

From edb72c5797a0361ac1b22d8d6482dc85a3e29592 Mon Sep 17 00:00:00 2001
From: James Hogan 
Date: Tue, 10 Feb 2015 10:03:00 +
Subject: MIPS: Export MSA functions used by lose_fpu(1) for KVM

commit ca5d25642e212f73492d332d95dc90ef46a0e8dc upstream.

Export the _save_msa asm function used by the lose_fpu(1) macro to GPL
modules so that KVM can make use of it when it is built as a module.

This fixes the following build error when CONFIG_KVM=m and
CONFIG_CPU_HAS_MSA=y due to commit f798217dfd03 ("KVM: MIPS: Don't leak
FPU/DSP to guest"):

ERROR: "_save_msa" [arch/mips/kvm/kvm.ko] undefined!

Fixes: f798217dfd03 (KVM: MIPS: Don't leak FPU/DSP to guest)
Signed-off-by: James Hogan 
Cc: Paolo Bonzini 
Cc: Ralf Baechle 
Cc: Paul Burton 
Cc: Gleb Natapov 
Cc: kvm@vger.kernel.org
Cc: linux-m...@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/9261/
Signed-off-by: Ralf Baechle 
Signed-off-by: Luis Henriques 
---
 arch/mips/kernel/mips_ksyms.c | 4 
 1 file changed, 4 insertions(+)

diff --git a/arch/mips/kernel/mips_ksyms.c b/arch/mips/kernel/mips_ksyms.c
index e69bdd3b4b74..1b2452e2be67 100644
--- a/arch/mips/kernel/mips_ksyms.c
+++ b/arch/mips/kernel/mips_ksyms.c
@@ -15,6 +15,7 @@
 #include 
 #include 
 #include 
+#include 

 extern void *__bzero(void *__s, size_t __count);
 extern long __strncpy_from_kernel_nocheck_asm(char *__to,
@@ -38,6 +39,9 @@ extern long __strnlen_user_asm(const char *s);
  * Core architecture code
  */
 EXPORT_SYMBOL_GPL(_save_fp);
+#ifdef CONFIG_CPU_HAS_MSA
+EXPORT_SYMBOL_GPL(_save_msa);
+#endif

 /*
  * String functions
--


[3.16.y-ckt stable] Patch "MIPS: Export FP functions used by lose_fpu(1) for KVM" has been added to staging queue

2015-03-02 Thread Luis Henriques
This is a note to let you know that I have just added a patch titled

MIPS: Export FP functions used by lose_fpu(1) for KVM

to the linux-3.16.y-queue branch of the 3.16.y-ckt extended stable tree 
which can be found at:

 
http://kernel.ubuntu.com/git?p=ubuntu/linux.git;a=shortlog;h=refs/heads/linux-3.16.y-queue

This patch is scheduled to be released in version 3.16.7-ckt8.

If you, or anyone else, feels it should not be added to this tree, please 
reply to this email.

For more information about the 3.16.y-ckt tree, see
https://wiki.ubuntu.com/Kernel/Dev/ExtendedStable

Thanks.
-Luis

--

From ebe07fbdc6dcfc5e5e6d23a9cec0e75ef6cc78a4 Mon Sep 17 00:00:00 2001
From: James Hogan 
Date: Tue, 10 Feb 2015 10:02:59 +
Subject: MIPS: Export FP functions used by lose_fpu(1) for KVM

commit 3ce465e04bfd8de9956d515d6e9587faac3375dc upstream.

Export the _save_fp asm function used by the lose_fpu(1) macro to GPL
modules so that KVM can make use of it when it is built as a module.

This fixes the following build error when CONFIG_KVM=m due to commit
f798217dfd03 ("KVM: MIPS: Don't leak FPU/DSP to guest"):

ERROR: "_save_fp" [arch/mips/kvm/kvm.ko] undefined!

Signed-off-by: James Hogan 
Fixes: f798217dfd03 (KVM: MIPS: Don't leak FPU/DSP to guest)
Cc: Paolo Bonzini 
Cc: Ralf Baechle 
Cc: Paul Burton 
Cc: Gleb Natapov 
Cc: kvm@vger.kernel.org
Cc: linux-m...@linux-mips.org
Patchwork: https://patchwork.linux-mips.org/patch/9260/
Signed-off-by: Ralf Baechle 
Signed-off-by: Luis Henriques 
---
 arch/mips/kernel/mips_ksyms.c | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/arch/mips/kernel/mips_ksyms.c b/arch/mips/kernel/mips_ksyms.c
index 2607c3a4ff7e..e69bdd3b4b74 100644
--- a/arch/mips/kernel/mips_ksyms.c
+++ b/arch/mips/kernel/mips_ksyms.c
@@ -14,6 +14,7 @@
 #include 
 #include 
 #include 
+#include 

 extern void *__bzero(void *__s, size_t __count);
 extern long __strncpy_from_kernel_nocheck_asm(char *__to,
@@ -34,6 +35,11 @@ extern long __strnlen_user_nocheck_asm(const char *s);
 extern long __strnlen_user_asm(const char *s);

 /*
+ * Core architecture code
+ */
+EXPORT_SYMBOL_GPL(_save_fp);
+
+/*
  * String functions
  */
 EXPORT_SYMBOL(memset);
--


Re: [PATCH] x86: svm: use kvm_fast_pio_in()

2015-03-02 Thread Radim Krčmář
2015-02-27 18:04-0600, Joel Schopp:
> From: David Kaplan 
> 
> We can make the in instruction go faster the same way the out instruction is
> already.
> 
> Signed-off-by: David Kaplan 
> [extracted from larger unlrelated patch, forward ported, tested]
> Signed-off-by: Joel Schopp 
> ---
>  arch/x86/kvm/svm.c |4 +++-
>  1 file changed, 3 insertions(+), 1 deletion(-)
> 
> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> @@ -1907,6 +1907,8 @@ static int io_interception(struct vcpu_svm *svm)
>   svm->next_rip = svm->vmcb->control.exit_info_2;
>   skip_emulated_instruction(&svm->vcpu);
>  
> + if (in)
> + return kvm_fast_pio_in(vcpu, size, port);

Have I missed a patch that defined kvm_fast_pio_in()?

Thanks.
--


Re: [PATCH] Revert "target-ppc: Create versionless CPU class per family if KVM"

2015-03-02 Thread Andreas Färber
Am 02.03.2015 um 14:37 schrieb Alexander Graf:
> On 01.03.15 01:31, Andreas Färber wrote:
>> This reverts commit 5b79b1cadd3e565b6d1a5ba59764bd47af58b271 to avoid
>> double-registration of types:
>>
>>   Registering `POWER5+-powerpc64-cpu' which already exists
>>
>> Taking the textual description of a CPU type as part of a new type name
>> is plain wrong, and so is unconditionally registering a new type here.
>>
>> Cc: Alexey Kardashevskiy 
>> Cc: qemu-sta...@nongnu.org
>> Signed-off-by: Andreas Färber 
> 
> Doesn't this break p8 support?

Maybe, but p5 support was in longer and this is definitely a regression
and really really wrong. If you know a way to fix it without handing it
back to the IBM guys for more thought, feel free to give it a shot.

Andreas

-- 
SUSE Linux GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Felix Imendörffer, Jane Smithard, Jennifer Guild, Dilip Upmanyu,
Graham Norton; HRB 21284 (AG Nürnberg)
--


Re: [PATCH] Revert "target-ppc: Create versionless CPU class per family if KVM"

2015-03-02 Thread Alexander Graf


On 02.03.15 14:42, Andreas Färber wrote:
> Am 02.03.2015 um 14:37 schrieb Alexander Graf:
>> On 01.03.15 01:31, Andreas Färber wrote:
>>> This reverts commit 5b79b1cadd3e565b6d1a5ba59764bd47af58b271 to avoid
>>> double-registration of types:
>>>
>>>   Registering `POWER5+-powerpc64-cpu' which already exists
>>>
>>> Taking the textual description of a CPU type as part of a new type name
>>> is plain wrong, and so is unconditionally registering a new type here.
>>>
>>> Cc: Alexey Kardashevskiy 
>>> Cc: qemu-sta...@nongnu.org
>>> Signed-off-by: Andreas Färber 
>>
>> Doesn't this break p8 support?
> 
> Maybe, but p5 support was in longer and this is definitely a regression
> and really really wrong. If you know a way to fix it without handing it
> back to the IBM guys for more thought, feel free to give it a shot.

I honestly don't fully remember what this was about. Wasn't this our
special KVM class that we use to create a compatible cpu type on the fly?

Alexey, please take a look at it.


Alex

> 
> Andreas
> 
--


Re: [PATCH v3 01/16] Introduce probe mode for machine type none

2015-03-02 Thread Andreas Färber
Am 02.03.2015 um 13:43 schrieb Michael Mueller:
> QEMU now switches into "probe mode" when the selected machine is "none" and no
> specific accelerator(s) has been requested (i.e.: "-machine none").
> 
> In probe mode, a predefined list of accelerators (selected by their
> "_CONFIG" defines) run their init() methods.
> 
> Signed-off-by: Michael Mueller 
> ---
>  accel.c  | 31 +--
>  include/hw/boards.h  |  1 +
>  include/sysemu/kvm.h | 10 ++
>  kvm-all.c|  3 +++
>  4 files changed, 39 insertions(+), 6 deletions(-)

Edgar/Peter, isn't Xilinx using -machine none in TCG mode?

> @@ -78,20 +83,30 @@ static int accel_init_machine(AccelClass *acc, 
> MachineState *ms)
>  
>  int configure_accelerator(MachineState *ms)
>  {
> -const char *p;
> +const char *p, *name;
>  char buf[10];
>  int ret;
>  bool accel_initialised = false;
>  bool init_failed = false;
>  AccelClass *acc = NULL;
> +ObjectClass *oc;
> +bool probe_mode = false;
>  
>  p = qemu_opt_get(qemu_get_machine_opts(), "accel");
>  if (p == NULL) {
> -/* Use the default "accelerator", tcg */
> -p = "tcg";
> +oc = (ObjectClass *) MACHINE_GET_CLASS(current_machine);
> +name = object_class_get_name(oc);
> +probe_mode = !strcmp(name, "none" TYPE_MACHINE_SUFFIX);
> +if (probe_mode) {
> +/* Use these accelerators in probe mode, tcg should be last */
> +p = probe_mode_accels;
> +} else {
> +/* Use the default "accelerator", tcg */
> +p = "tcg";
> +}
>  }

Can't we instead use an explicit ,accel=probe or ,accel=auto?
That would then obsolete the next patch.

Regards,
Andreas

>  
> -while (!accel_initialised && *p != '\0') {
> +while ((probe_mode || !accel_initialised) && *p != '\0') {
>  if (*p == ':') {
>  p++;
>  }

-- 
SUSE Linux GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Felix Imendörffer, Jane Smithard, Jennifer Guild, Dilip Upmanyu,
Graham Norton; HRB 21284 (AG Nürnberg)
--


Re: [PATCH] x86: svm: make wbinvd faster

2015-03-02 Thread Radim Krčmář
2015-03-01 21:29-0500, Bandan Das:
> Joel Schopp  writes:
> 
> > From: David Kaplan 
> > No need to re-decode WBINVD since we know what it is from the intercept.
> >
> > Signed-off-by: David Kaplan 
> > [extracted from larger unlrelated patch, forward ported, tested]
> > Signed-off-by: Joel Schopp 
> > ---
> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> > +static int wbinvd_interception(struct vcpu_svm *svm)
> > +{
> > +   kvm_emulate_wbinvd(&svm->vcpu);
> > +   skip_emulated_instruction(&svm->vcpu);
> > +   return 1;
> > +}
> > +
> > +
> Can't we merge this to kvm_emulate_wbinvd, and just call that function
> directly for both vmx and svm ?

kvm_emulate_wbinvd() lives in x86.c and skip_emulated_instruction() is
from svm.c/vmx.c:  so we'd have to create a new x86 op and change the
emulator code as well ... it's probably better like this.

> >  static int xsetbv_interception(struct vcpu_svm *svm)
> >  {
> > u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
> > @@ -3376,7 +3384,7 @@ static int (*const svm_exit_handlers[])(struct 
> > vcpu_svm *svm) = {
> > [SVM_EXIT_STGI] = stgi_interception,
> > [SVM_EXIT_CLGI] = clgi_interception,
> > [SVM_EXIT_SKINIT]   = skinit_interception,
> > -   [SVM_EXIT_WBINVD]   = emulate_on_interception,
> So, this means x86_emulate_insn() in emulate.c has no callers left for the
> wbinvd case ? vmx calls kvm_emulate_wbinvd directly too..

I think that invalid state emulation might still hit wbinvd.
--


Re: [PATCH] Revert "target-ppc: Create versionless CPU class per family if KVM"

2015-03-02 Thread Andreas Färber
Am 02.03.2015 um 14:51 schrieb Alexander Graf:
> On 02.03.15 14:42, Andreas Färber wrote:
>> Am 02.03.2015 um 14:37 schrieb Alexander Graf:
>>> On 01.03.15 01:31, Andreas Färber wrote:
 This reverts commit 5b79b1cadd3e565b6d1a5ba59764bd47af58b271 to avoid
 double-registration of types:

   Registering `POWER5+-powerpc64-cpu' which already exists

 Taking the textual description of a CPU type as part of a new type name
 is plain wrong, and so is unconditionally registering a new type here.

 Cc: Alexey Kardashevskiy 
 Cc: qemu-sta...@nongnu.org
 Signed-off-by: Andreas Färber 
>>>
>>> Doesn't this break p8 support?
>>
>> Maybe, but p5 support was in longer and this is definitely a regression
>> and really really wrong. If you know a way to fix it without handing it
>> back to the IBM guys for more thought, feel free to give it a shot.
> 
> I honestly don't fully remember what this was about. Wasn't this our
> special KVM class that we use to create a compatible cpu type on the fly?

No, the class I create on the fly is a few lines above:

pvr_pcc = ppc_cpu_class_by_pvr(host_pvr);
if (pvr_pcc == NULL) {
pvr_pcc = ppc_cpu_class_by_pvr_mask(host_pvr);
}
if (pvr_pcc == NULL) {
return -1;
}
type_info.parent = object_class_get_name(OBJECT_CLASS(pvr_pcc));
type_register(&type_info);

So, if no matching class is returned, we never reach the offending code.

Here, a second type with the same parent was being created in the
kvm_ppc_register_host_cpu_type() function that is supposed to create
that host CPU type. Why? The host CPU type by definition should already
have the right PVR taken from the host. kvmppc_host_cpu_class_init():

/* Now fix up the class with information we can query from the host */
pcc->pvr = mfpvr();

> Alexey, please take a look at it.

Thanks,
Andreas

-- 
SUSE Linux GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Felix Imendörffer, Jane Smithard, Jennifer Guild, Dilip Upmanyu,
Graham Norton; HRB 21284 (AG Nürnberg)
--


Re: [PATCH] x86: svm: make wbinvd faster

2015-03-02 Thread Radim Krčmář
2015-02-27 18:19-0600, Joel Schopp:
> From: David Kaplan 
> No need to re-decode WBINVD since we know what it is from the intercept.
> 
> Signed-off-by: David Kaplan 
> [extracted from larger unlrelated patch, forward ported, tested]
> Signed-off-by: Joel Schopp 
> ---

Reviewed-by: Radim Krčmář 

> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> +static int wbinvd_interception(struct vcpu_svm *svm)
> +{
> + kvm_emulate_wbinvd(&svm->vcpu);
> + skip_emulated_instruction(&svm->vcpu);
> + return 1;
> +}
> +
> +

(One line is optimal.)
--


Re: [PATCH 3/4] arm/arm64: KVM: Fix migration race in the arch timer

2015-03-02 Thread Alex Bennée

Marc Zyngier  writes:

> On 02/03/15 08:50, Alex Bennée wrote:
>> 
>> Marc Zyngier  writes:
>> 
>>> On Wed, 25 Feb 2015 15:36:21 +
>>> Alex Bennée  wrote:
>>>
>>> Alex, Christoffer,
>>>
>> 
>>>
>>> So the first half of the patch looks perfectly OK to me...
>>>
 diff --git a/virt/kvm/arm/vgic.c b/virt/kvm/arm/vgic.c
 index af6a521..3b4ded2 100644
 --- a/virt/kvm/arm/vgic.c
 +++ b/virt/kvm/arm/vgic.c
 @@ -263,6 +263,13 @@ static int vgic_irq_is_queued(struct kvm_vcpu
 *vcpu, int irq) return vgic_bitmap_get_irq_val(&dist->irq_queued,
 vcpu->vcpu_id, irq); }
  
 +static int vgic_irq_is_active(struct kvm_vcpu *vcpu, int irq)
 +{
 +  struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 +
 +  return vgic_bitmap_get_irq_val(&dist->irq_active,
 vcpu->vcpu_id, irq); +}
 +
  static void vgic_irq_set_queued(struct kvm_vcpu *vcpu, int irq)
  {
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 @@ -285,6 +292,13 @@ static void vgic_irq_set_active(struct kvm_vcpu
 *vcpu, int irq) vgic_bitmap_set_irq_val(&dist->irq_active,
 vcpu->vcpu_id, irq, 1); }
  
 +static void vgic_irq_clear_active(struct kvm_vcpu *vcpu, int irq)
 +{
 +  struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 +
 +  vgic_bitmap_set_irq_val(&dist->irq_active, vcpu->vcpu_id,
 irq, 0); +}
 +
  static int vgic_dist_irq_get_level(struct kvm_vcpu *vcpu, int irq)
  {
struct vgic_dist *dist = &vcpu->kvm->arch.vgic;
 @@ -634,16 +648,12 @@ bool vgic_handle_cfg_reg(u32 *reg, struct
 kvm_exit_mmio *mmio, }
  
  /**
 - * vgic_unqueue_irqs - move pending IRQs from LRs to the distributor
 + * vgic_unqueue_irqs - move pending/active IRQs from LRs to the
 distributor
   * @vgic_cpu: Pointer to the vgic_cpu struct holding the LRs
   *
 - * Move any pending IRQs that have already been assigned to LRs back
 to the
 + * Move any IRQs that have already been assigned to LRs back to the
   * emulated distributor state so that the complete emulated state
 can be read
   * from the main emulation structures without investigating the LRs.
 - *
 - * Note that IRQs in the active state in the LRs get their pending
 state moved
 - * to the distributor but the active state stays in the LRs, because
 we don't
 - * track the active state on the distributor side.
   */
  void vgic_unqueue_irqs(struct kvm_vcpu *vcpu)
  {
 @@ -919,7 +929,7 @@ static int compute_pending_for_cpu(struct
 kvm_vcpu *vcpu) 
  /*
   * Update the interrupt state and determine which CPUs have pending
 - * interrupts. Must be called with distributor lock held.
 + * or active interrupts. Must be called with distributor lock held.
   */
  void vgic_update_state(struct kvm *kvm)
  {
 @@ -1036,6 +1046,25 @@ static void vgic_retire_disabled_irqs(struct
 kvm_vcpu *vcpu) }
  }
  
 +static void vgic_queue_irq_to_lr(struct kvm_vcpu *vcpu, int irq,
 +   int lr_nr, struct vgic_lr vlr)
 +{
 +  if (vgic_irq_is_active(vcpu, irq)) {
 +  vlr.state |= LR_STATE_ACTIVE;
 +  kvm_debug("Set active, clear distributor: 0x%x\n",
 vlr.state);
 +  vgic_irq_clear_active(vcpu, irq);
 +  vgic_update_state(vcpu->kvm);
 +  } else if (vgic_dist_irq_is_pending(vcpu, irq)) {
 +  vlr.state |= LR_STATE_PENDING;
 +  kvm_debug("Set pending: 0x%x\n", vlr.state);
 +  }
 +
 +  if (!vgic_irq_is_edge(vcpu, irq))
 +  vlr.state |= LR_EOI_INT;
 +
 +  vgic_set_lr(vcpu, lr_nr, vlr);
 +}
 +
  /*
   * Queue an interrupt to a CPU virtual interface. Return true on
 success,
   * or false if it wasn't possible to queue it.
 @@ -1063,8 +1092,7 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8
 sgi_source_id, int irq) if (vlr.source == sgi_source_id) {
kvm_debug("LR%d piggyback for IRQ%d\n", lr,
 vlr.irq); BUG_ON(!test_bit(lr, vgic_cpu->lr_used));
 -  vlr.state |= LR_STATE_PENDING;
 -  vgic_set_lr(vcpu, lr, vlr);
 +  vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
return true;
}
}
 @@ -1081,11 +1109,8 @@ bool vgic_queue_irq(struct kvm_vcpu *vcpu, u8
 sgi_source_id, int irq) 
vlr.irq = irq;
vlr.source = sgi_source_id;
 -  vlr.state = LR_STATE_PENDING;
 -  if (!vgic_irq_is_edge(vcpu, irq))
 -  vlr.state |= LR_EOI_INT;
 -
 -  vgic_set_lr(vcpu, lr, vlr);
 +  vlr.state = 0;
 +  vgic_queue_irq_to_lr(vcpu, irq, lr, vlr);
  
return true;
  }
>>>
>>>
>>> ... but this whole vgic rework seems rather out of place, and I can't
>>> really see its connection with the timer. Isn't it logically part of the
>>> previous patch?
>> 
>

Re: [PATCH] x86: svm: make wbinvd faster

2015-03-02 Thread Bandan Das
Radim Krčmář  writes:

> 2015-03-01 21:29-0500, Bandan Das:
>> Joel Schopp  writes:
>> 
>> > From: David Kaplan 
>> > No need to re-decode WBINVD since we know what it is from the intercept.
>> >
>> > Signed-off-by: David Kaplan 
>> > [extracted from larger unlrelated patch, forward ported, tested]
>> > Signed-off-by: Joel Schopp 
>> > ---
>> > diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
>> > +static int wbinvd_interception(struct vcpu_svm *svm)
>> > +{
>> > +  kvm_emulate_wbinvd(&svm->vcpu);
>> > +  skip_emulated_instruction(&svm->vcpu);
>> > +  return 1;
>> > +}
>> > +
>> > +
>> Can't we merge this to kvm_emulate_wbinvd, and just call that function
>> directly for both vmx and svm ?
>
> kvm_emulate_wbinvd() lives in x86.c and skip_emulated_instruction() is
> from svm.c/vmx.c:  so we'd have to create a new x86 op and change the
> emulator code as well ... it's probably better like this.

There's already one - kvm_x86_ops->skip_emulated_instruction

>> >  static int xsetbv_interception(struct vcpu_svm *svm)
>> >  {
>> >u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
>> > @@ -3376,7 +3384,7 @@ static int (*const svm_exit_handlers[])(struct 
>> > vcpu_svm *svm) = {
>> >[SVM_EXIT_STGI] = stgi_interception,
>> >[SVM_EXIT_CLGI] = clgi_interception,
>> >[SVM_EXIT_SKINIT]   = skinit_interception,
>> > -  [SVM_EXIT_WBINVD]   = emulate_on_interception,
>> So, this means x86_emulate_insn() in emulate.c has no callers left for the
>> wbinvd case ? vmx calls kvm_emulate_wbinvd directly too..
>
> I think that invalid state emulation might still hit wbinvd.
--


Re: [PATCH] x86: svm: make wbinvd faster

2015-03-02 Thread Radim Krčmář
2015-03-02 10:25-0500, Bandan Das:
> Radim Krčmář  writes:
> > 2015-03-01 21:29-0500, Bandan Das:
> >> Joel Schopp  writes:
> >> > +static int wbinvd_interception(struct vcpu_svm *svm)
> >> > +{
> >> > +kvm_emulate_wbinvd(&svm->vcpu);
> >> > +skip_emulated_instruction(&svm->vcpu);
> >> > +return 1;
> >> > +}
> >> Can't we merge this to kvm_emulate_wbinvd, and just call that function
> >> directly for both vmx and svm ?
> >
> > kvm_emulate_wbinvd() lives in x86.c and skip_emulated_instruction() is
> > from svm.c/vmx.c:  so we'd have to create a new x86 op and change the
> > emulator code as well ... it's probably better like this.
> 
> There's already one - kvm_x86_ops->skip_emulated_instruction

My bad, its usage is inconsistent and I only looked at two close
interceptions where it was used ... kvm_emulate_cpuid() calls
kvm_x86_ops->skip_emulated_instruction(), while kvm_emulate_halt() and
kvm_emulate_hypercall() need an external skip.

We do "skip" the instruction with kvm_emulate(), so automatically
skipping the instruction on kvm_emulate_*() makes sense:
 1. rename kvm_emulate_halt() and kvm_emulate_wbinvd() to accommodate
callers that don't want to skip
 2. introduce kvm_emulate_{halt,wbinvd}() and move the skip to
kvm_emulate_{halt,wbinvd,hypercall}()

The alternative is to remove kvm_x86_ops->skip_emulated_instruction():
 1. remove skip from kvm_emulate_cpuid() and modify callers
 2. move kvm_complete_insn_gp to a header file and use
skip_emulated_instruction directly
 3. remove unused kvm_x86_ops->skip_emulated_instruction()

Which one do you prefer?
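
For concreteness, a rough sketch of what the first option could look like
(names and placement are made up for illustration; this is not an actual
patch):

        /* x86.c */
        int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
        {
                /* current body of kvm_emulate_wbinvd() */
                ...
        }

        int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
        {
                int ret = kvm_emulate_wbinvd_noskip(vcpu);

                kvm_x86_ops->skip_emulated_instruction(vcpu);
                return ret;
        }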

Thanks.
--


Re: [PATCH] x86: svm: use kvm_fast_pio_in()

2015-03-02 Thread Joel Schopp



+   if (in)
+   return kvm_fast_pio_in(vcpu, size, port);

Have I missed a patch that defined kvm_fast_pio_in()?
Not sure how I managed to leave out the bulk of the patch. Resending v2 
momentarily.

--


Re: [PATCH v8 5/5] KVM: arm/arm64: add irqfd support

2015-03-02 Thread Eric Auger
On 02/23/2015 10:45 PM, Christoffer Dall wrote:
> On Mon, Jan 19, 2015 at 05:43:13PM +0100, Eric Auger wrote:
>> This patch enables irqfd on arm/arm64.
>>
>> Both irqfd and resamplefd are supported. Injection is implemented
>> in vgic.c without routing.
>>
>> This patch enables CONFIG_HAVE_KVM_EVENTFD and CONFIG_HAVE_KVM_IRQFD.
>>
>> KVM_CAP_IRQFD is now advertised. KVM_CAP_IRQFD_RESAMPLE capability
>> automatically is advertised as soon as CONFIG_HAVE_KVM_IRQFD is set.
>>
>> Irqfd injection is restricted to SPI. The rationale behind not
>> supporting PPI irqfd injection is that any device using a PPI would
>> be a private-to-the-CPU device (timer for instance), so its state
>> would have to be context-switched along with the VCPU and would
>> require in-kernel wiring anyhow. It is not a relevant use case for
>> irqfds.
>>
>> Signed-off-by: Eric Auger 
>>
>> ---
>> v7 -> v8:
>> - remove kvm_irq_has_notifier call
>> - part of dist locking changes now are part of previous patch file
>> - remove gic_initialized() check in kvm_set_irq
>> - remove Christoffer's Reviewed-by after this change
>>
>> v5 -> v6:
>> - KVM_CAP_IRQFD support depends on vgic_present
>> - add Christoffer's Reviewed-by
>>
>> v4 -> v5:
>> - squash [PATCH v4 3/3] KVM: arm64: add irqfd support into this patch
>> - some rewording in Documentation/virtual/kvm/api.txt and in vgic
>>   vgic_process_maintenance unlock comment.
>> - move explanation of why not supporting PPI into commit message
>> - in case of injection before gic readiness, -ENODEV is returned. It is
>>   up to the user space to avoid this situation.
>>
>> v3 -> v4:
>> - reword commit message
>> - explain why we unlock the distributor before calling kvm_notify_acked_irq
>> - rename is_assigned_irq into has_notifier
>> - change EOI and injection kvm_debug format string
>> - remove error local variable in kvm_set_irq
>> - Move HAVE_KVM_IRQCHIP unset in a separate patch
>> - handle case were the irqfd injection is attempted before the vgic is ready.
>>   in such a case the notifier, if any, is called immediatly
>> - use nr_irqs to test spi is within correct range
>>
>> v2 -> v3:
>> - removal of irq.h from eventfd.c put in a separate patch to increase
>>   visibility
>> - properly expose KVM_CAP_IRQFD capability in arm.c
>> - remove CONFIG_HAVE_KVM_IRQCHIP meaningfull only if irq_comm.c is used
>>
>> v1 -> v2:
>> - rebase on 3.17rc1
>> - move of the dist unlock in process_maintenance
>> - remove of dist lock in __kvm_vgic_sync_hwstate
>> - rewording of the commit message (add resamplefd reference)
>> - remove irq.h
>> ---
>>  Documentation/virtual/kvm/api.txt |  6 +-
>>  arch/arm/include/uapi/asm/kvm.h   |  3 +++
>>  arch/arm/kvm/Kconfig  |  2 ++
>>  arch/arm/kvm/Makefile |  2 +-
>>  arch/arm/kvm/arm.c|  5 +
>>  arch/arm64/include/uapi/asm/kvm.h |  3 +++
>>  arch/arm64/kvm/Kconfig|  2 ++
>>  arch/arm64/kvm/Makefile   |  2 +-
>>  virt/kvm/arm/vgic.c   | 45 
>> +++
>>  9 files changed, 67 insertions(+), 3 deletions(-)
>>
>> diff --git a/Documentation/virtual/kvm/api.txt 
>> b/Documentation/virtual/kvm/api.txt
>> index 0007fef..5ed8088 100644
>> --- a/Documentation/virtual/kvm/api.txt
>> +++ b/Documentation/virtual/kvm/api.txt
>> @@ -2231,7 +2231,7 @@ into the hash PTE second double word).
>>  4.75 KVM_IRQFD
>>  
>>  Capability: KVM_CAP_IRQFD
>> -Architectures: x86 s390
>> +Architectures: x86 s390 arm arm64
>>  Type: vm ioctl
>>  Parameters: struct kvm_irqfd (in)
>>  Returns: 0 on success, -1 on error
>> @@ -2257,6 +2257,10 @@ Note that closing the resamplefd is not sufficient to 
>> disable the
>>  irqfd.  The KVM_IRQFD_FLAG_RESAMPLE is only necessary on assignment
>>  and need not be specified with KVM_IRQFD_FLAG_DEASSIGN.
>>  
>> +On ARM/ARM64, the gsi field in the kvm_irqfd struct specifies the Shared
>> +Peripheral Interrupt (SPI) index, such that the GIC interrupt ID is
>> +given by gsi + 32.
>> +
>>  4.76 KVM_PPC_ALLOCATE_HTAB
>>  
>>  Capability: KVM_CAP_PPC_ALLOC_HTAB
>> diff --git a/arch/arm/include/uapi/asm/kvm.h 
>> b/arch/arm/include/uapi/asm/kvm.h
>> index 0db25bc..2499867 100644
>> --- a/arch/arm/include/uapi/asm/kvm.h
>> +++ b/arch/arm/include/uapi/asm/kvm.h
>> @@ -198,6 +198,9 @@ struct kvm_arch_memory_slot {
>>  /* Highest supported SPI, from VGIC_NR_IRQS */
>>  #define KVM_ARM_IRQ_GIC_MAX 127
>>  
>> +/* One single KVM irqchip, ie. the VGIC */
>> +#define KVM_NR_IRQCHIPS  1
>> +
>>  /* PSCI interface */
>>  #define KVM_PSCI_FN_BASE0x95c1ba5e
>>  #define KVM_PSCI_FN(n)  (KVM_PSCI_FN_BASE + (n))
>> diff --git a/arch/arm/kvm/Kconfig b/arch/arm/kvm/Kconfig
>> index 9f581b1..e519a40 100644
>> --- a/arch/arm/kvm/Kconfig
>> +++ b/arch/arm/kvm/Kconfig
>> @@ -24,6 +24,7 @@ config KVM
>>  select KVM_MMIO
>>  select KVM_ARM_HOST
>>  depends on ARM_VIRT_EXT && ARM_LPAE
>> +select HAVE_KVM_EVENTFD
>> 

Re: [kvm-unit-tests PATCH 00/18] arm/arm64: add smp support

2015-03-02 Thread Christoffer Dall
On Thu, Feb 26, 2015 at 02:50:38PM +0100, Andrew Jones wrote:
> On Thu, Feb 26, 2015 at 12:34:02PM +0100, Christoffer Dall wrote:
> > On Sun, Feb 01, 2015 at 07:34:28PM +0100, Andrew Jones wrote:
> > > This series extends the kvm-unit-tests/arm[64] framework to support smp.
> > > A break down of the patches is as follows
> > > 
> > > 01-02: prepare general framework for smp use
> > > 03-06: arm/arm64 fixups not 100% related to this series,
> > >but need to post some time...
> > > 07-09: add thread_info (for per-thread data) and suck some global
> > >data into it
> > > 10-11: add cpumask support (for per-cpu data) and suck some more
> > >global data in
> > >12: add arm64 simple spinlock implementation
> > > 13-14: add some PSCI support
> > > 15-16: further prep for smp_boot_secondary
> > >17: finally add smp_boot_secondary
> > >18: as usual, add a selftest to make sure it all works
> > > 
> > > These patches are also available here:
> > > https://github.com/rhdrjones/kvm-unit-tests/tree/arm/smp
> > > 
> > I've tested these patches on Juno and they seem to run fine, however,
> > since we don't support big.LITTLE yet, you have to run them under
> > 'taskset ', but the config script uses $(getconf
> > _NPROCESSORS_CONF), which returns 6, and QEMU fails.  The interesting
> 
> Should I try to read the number of host cpus from some other source?
> If you know something I can read that also works on big.LITTLE, then
> I can change it now.
> 

I have no idea what the right scripting fix would be. But we should
really fix big.LITTLE support in KVM.  Hmmm.

> > bit is that the unit-tests still report 'PASS' - not sure why.
> 
> Ah, this is due to the weird way qemu's debugexit device sets its exit
> code
> 
> hw/misc/debugexit.c:debug_exit_write()
> {
>   exit((val << 1) | 1);
> }
> 
> To be consistent with that we made chr-testdev do the same thing (see
> backends/testdev.c:testdev_eat_packet():case 'q'). Now, the
> kvm-unit-tests run_tests.sh script knows about that, so it has
> 
>   eval $cmdline >> test.log
>   if [ $? -le 1 ]; then
>  echo -e "\e[32mPASS\e[0m $1"
>   else
>  echo -e "\e[31mFAIL\e[0m $1"
>   fi
> 
> Yes, this sucks, as we can't tell the difference between qemu failing
> to run the test, and exiting with 1 vs. the test running, passing -
> exiting with (0 << 1) | 1. It's too bad debugexit didn't set a higher
> bit (like 5 or 6) to flag a "debug exit". Maybe it's not too late to
> change it? Paolo?
> 

This would be really good to address somehow, because we don't want
to report that everything is happy when the test harness broke, that
really goes against the whole idea of this work.
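Something along the lines of the higher-bit idea above would be enough to tell the
two cases apart. A purely illustrative sketch (not what debugexit does today; the
marker bit value and the simplified signature are made up):

#include <stdlib.h>

#define DEBUG_EXIT_MARKER	(1 << 6)	/* "guest exited via the debug device" */

static void debug_exit_write(unsigned int val)	/* signature simplified */
{
	exit(DEBUG_EXIT_MARKER | (val << 1) | 1);
}

run_tests.sh could then check the marker bit instead of "$? -le 1", so a bare
exit(1) from QEMU would no longer be reported as a pass.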

Thanks,
-Christoffer


Re: [RFC/RFT PATCH 0/3] arm64: KVM: work around incoherency with uncached guest mappings

2015-03-02 Thread Christoffer Dall
On Tue, Feb 24, 2015 at 05:47:19PM +, Ard Biesheuvel wrote:
> On 24 February 2015 at 14:55, Andrew Jones  wrote:
> > On Fri, Feb 20, 2015 at 04:36:26PM +0100, Andrew Jones wrote:
> >> On Fri, Feb 20, 2015 at 02:37:25PM +, Ard Biesheuvel wrote:
> >> > On 20 February 2015 at 14:29, Andrew Jones  wrote:
> >> > > So looks like the 3 orders of magnitude greater number of traps
> >> > > (only to el2) don't impact kernel compiles.
> >> > >
> >> >
> >> > OK, good! That was what I was hoping for, obviously.
> >> >
> >> > > Then I thought I'd be able to quick measure the number of cycles
> >> > > a trap to el2 takes with this kvm-unit-tests test
> >> > >
> >> > > int main(void)
> >> > > {
> >> > > unsigned long start, end;
> >> > > unsigned int sctlr;
> >> > >
> >> > > asm volatile(
> >> > > "   mrs %0, sctlr_el1\n"
> >> > > "   msr pmcr_el0, %1\n"
> >> > > : "=&r" (sctlr) : "r" (5));
> >> > >
> >> > > asm volatile(
> >> > > "   mrs %0, pmccntr_el0\n"
> >> > > "   msr sctlr_el1, %2\n"
> >> > > "   mrs %1, pmccntr_el0\n"
> >> > > : "=&r" (start), "=&r" (end) : "r" (sctlr));
> >> > >
> >> > > printf("%llx\n", end - start);
> >> > > return 0;
> >> > > }
> >> > >
> >> > > after applying this patch to kvm
> >> > >
> >> > > diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> >> > > index bb91b6fc63861..5de39d740aa58 100644
> >> > > --- a/arch/arm64/kvm/hyp.S
> >> > > +++ b/arch/arm64/kvm/hyp.S
> >> > > @@ -770,7 +770,7 @@
> >> > >
> >> > > mrs x2, mdcr_el2
> >> > > and x2, x2, #MDCR_EL2_HPMN_MASK
> >> > > -   orr x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR)
> >> > > +// orr x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR)
> >> > > orr x2, x2, #(MDCR_EL2_TDRA | MDCR_EL2_TDOSA)
> >> > >
> >> > > // Check for KVM_ARM64_DEBUG_DIRTY, and set debug to trap
> >> > >
> >> > > But I get zero for the cycle count. Not sure what I'm missing.
> >> > >
> >> >
> >> > No clue tbh. Does the counter work as expected in the host?
> >> >
> >>
> >> Guess not. I dropped the test into a module_init and inserted
> >> it on the host. Always get zero for pmccntr_el0 reads. Or, if
> >> I set it to something non-zero with a write, then I always get
> >> that back - no increments. pmcr_el0 looks OK... I had forgotten
> >> to set bit 31 of pmcntenset_el0, but doing that still doesn't
> >> help. Anyway, I assume the problem is me. I'll keep looking to
> >> see what I'm missing.
> >>
> >
> > I returned to this and see that the problem was indeed me. I needed yet
> > another enable bit set (the filter register needed to be instructed to
> > count cycles while in el2). I've attached the code for the curious.
> > The numbers are mean=6999, std_dev=242. Run on the host, or in a guest
> > running on a host without this patch series (after TVM traps have been
> > disabled), I get a pretty consistent 40.
> >
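The attached code isn't reproduced here, but the enable sequence being described
is roughly the following untested sketch (ARMv8 PMU register names assumed; the
EL2 cycle-count filter bit is the piece that was missing):

asm volatile(
"	msr	pmccfiltr_el0, %0\n"	/* NSH (bit 27): count cycles in EL2 too */
"	msr	pmcntenset_el0, %1\n"	/* bit 31: enable PMCCNTR_EL0 */
"	msr	pmcr_el0, %2\n"		/* E|C: enable counters, reset cycle counter */
: : "r" (1ul << 27), "r" (1ul << 31), "r" (5ul));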
> > I checked how many vm-sysreg traps we do during the kernel compile
> > benchmark. It's 124924. So it's a bit strange that we don't see the
> > benchmark taking 10 to 20 seconds longer on average. I should probably
> > double check my runs. In any case, while I like the approach of this
> > series, the overhead is looking non-negligible.
> >
> 
> Thanks a lot for producing these numbers. 125k x 7k == <1 billion
> cycles == <1 second on a >1 GHz machine, I think?
> Or am I missing something? How long does the actual compile take?
> 
I ran a sequence of benchmarks that I occasionally run (pbzip,
kernbench, and hackbench) and I also saw < 1% performance degradation,
so I think we can trust that somewhat.  (I can post the raw numbers when
I have ssh access to my Linux desktop - sending this from Somewhere Over
The Atlantic).

However, my concern with these patches are on two points:

1. It's not a fix-all.  We still have the case where the guest expects
the behavior of device memory (for strong ordering for example) on a RAM
region, which we now break.  Similarly this doesn't support the
non-coherent DMA to RAM region case.

2. While the code is probably as nice as this kind of stuff gets, it
is non-trivial and extremely difficult to debug.  The counter-point here
is that we may end up handling other stuff at EL2 for performance reasons
in the future.

Mainly because of point 1 above, I am leaning to thinking userspace
should do the invalidation when it knows it needs to, either through KVM
via a memslot flag or through some other syscall mechanism.

Thanks,
-Christoffer


[PATCH v2] x86: svm: use kvm_fast_pio_in()

2015-03-02 Thread Joel Schopp
From: David Kaplan 

We can make the 'in' instruction go faster in the same way the 'out'
instruction already is.

Changes from v1
* Added kvm_fast_pio_in() implementation that was left out of v1

Signed-off-by: David Kaplan 
[extracted from larger unrelated patch, forward ported, tested]
Signed-off-by: Joel Schopp 
---
 arch/x86/include/asm/kvm_host.h |1 +
 arch/x86/kvm/svm.c  |4 +++-
 arch/x86/kvm/x86.c  |   33 +
 3 files changed, 37 insertions(+), 1 deletion(-)

diff --git a/arch/x86/include/asm/kvm_host.h b/arch/x86/include/asm/kvm_host.h
index a236e39..b976824 100644
--- a/arch/x86/include/asm/kvm_host.h
+++ b/arch/x86/include/asm/kvm_host.h
@@ -931,6 +931,7 @@ int kvm_set_msr(struct kvm_vcpu *vcpu, struct msr_data 
*msr);
 struct x86_emulate_ctxt;
 
 int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, unsigned short port);
+int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port);
 void kvm_emulate_cpuid(struct kvm_vcpu *vcpu);
 int kvm_emulate_halt(struct kvm_vcpu *vcpu);
 int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu);
diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d319e0c..f8c906b 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1899,7 +1899,7 @@ static int io_interception(struct vcpu_svm *svm)
++svm->vcpu.stat.io_exits;
string = (io_info & SVM_IOIO_STR_MASK) != 0;
in = (io_info & SVM_IOIO_TYPE_MASK) != 0;
-   if (string || in)
+   if (string)
return emulate_instruction(vcpu, 0) == EMULATE_DONE;
 
port = io_info >> 16;
@@ -1907,6 +1907,8 @@ static int io_interception(struct vcpu_svm *svm)
svm->next_rip = svm->vmcb->control.exit_info_2;
skip_emulated_instruction(&svm->vcpu);
 
+   if (in)
+   return kvm_fast_pio_in(vcpu, size, port);
return kvm_fast_pio_out(vcpu, size, port);
 }
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd7a70b..089247c 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -5463,6 +5463,39 @@ int kvm_fast_pio_out(struct kvm_vcpu *vcpu, int size, 
unsigned short port)
 }
 EXPORT_SYMBOL_GPL(kvm_fast_pio_out);
 
+static int complete_fast_pio(struct kvm_vcpu *vcpu)
+{
+   u32 new_rax = kvm_register_read(vcpu, VCPU_REGS_RAX);
+
+   if (!vcpu->arch.pio.count)
+   return 0;
+   if (vcpu->arch.pio.count * vcpu->arch.pio.size > 8)
+   return 0;
+
+   memcpy(&new_rax, vcpu->arch.pio_data,
+  vcpu->arch.pio.count * vcpu->arch.pio.size);
+   kvm_register_write(vcpu, VCPU_REGS_RAX, new_rax);
+
+   vcpu->arch.pio.count = 0;
+   return 1;
+}
+
+int kvm_fast_pio_in(struct kvm_vcpu *vcpu, int size, unsigned short port)
+{
+   unsigned long val;
+   int ret = emulator_pio_in_emulated(&vcpu->arch.emulate_ctxt, size,
+  port, &val, 1);
+
+   if (ret) {
+   kvm_register_write(vcpu, VCPU_REGS_RAX, val);
+   vcpu->arch.pio.count = 0;
+   } else
+   vcpu->arch.complete_userspace_io = complete_fast_pio;
+
+   return ret;
+}
+EXPORT_SYMBOL_GPL(kvm_fast_pio_in);
+
 static void tsc_bad(void *info)
 {
__this_cpu_write(cpu_tsc_khz, 0);



[Bug 88671] Radeon driver fails to reset hardware properly after kvm guest reboot

2015-03-02 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=88671

--- Comment #6 from Tom Stellard  ---
I've been playing with this a little more and it seems to be working correctly,
but radeon dynamic power management (dpm) always fails to initialize on the
second guest boot.  My questions are:

1. What methods are being used by kvm/qemu/libvirt to reset the GPU on guest
shutdown?

2. Is the problem only caused by the fact that GPU reset is not implemented
correctly in the radeon driver, or are there improvements that are needed in
kvm/qemu/libvirt in order to get this working?

-- 
You are receiving this mail because:
You are watching the assignee of the bug.


Re: [RFC/RFT PATCH 0/3] arm64: KVM: work around incoherency with uncached guest mappings

2015-03-02 Thread Paolo Bonzini


On 02/03/2015 17:31, Christoffer Dall wrote:
> 2. While the code is probably as nice as this kind of stuff gets, it
> is non-trivial and extremely difficult to debug.  The counter-point here
> is that we may end up handling other stuff at EL2 for performance reasons
> in the future.
> 
> Mainly because of point 1 above, I am leaning to thinking userspace
> should do the invalidation when it knows it needs to, either through KVM
> via a memslot flag or through some other syscall mechanism.

I'm okay with adding a KVM capability and ioctl that flushes the dcache
for a given gpa range.  However:

1) I'd like to have an implementation for QEMU and/or kvmtool before
accepting that ioctl.

2) I think the ioctl should work whatever the stage1 mapping is (e.g.
with and without Ard's patches, with and without Laszlo's OVMF patch, etc.).
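Purely to make the shape of such an interface concrete: nothing like this exists
today, and the name and layout below are invented for the sake of discussion only.

/* hypothetical uapi sketch: clean/invalidate the data cache for a gpa range */
struct kvm_dcache_flush {
	__u64 gpa;	/* guest-physical start of the range */
	__u64 len;	/* length of the range in bytes */
};

The capability number and ioctl definition are deliberately left out.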

Also, we may want to invalidate the cache for dirty pages before
returning the dirty bitmap, and probably should do that directly in
KVM_GET_DIRTY_LOG.

Paolo


Re: [Qemu-devel] [PATCH v3 01/16] Introduce probe mode for machine type none

2015-03-02 Thread Michael Mueller
On Mon, 02 Mar 2015 14:57:21 +0100
Andreas Färber  wrote:

> >  int configure_accelerator(MachineState *ms)
> >  {
> > -const char *p;
> > +const char *p, *name;
> >  char buf[10];
> >  int ret;
> >  bool accel_initialised = false;
> >  bool init_failed = false;
> >  AccelClass *acc = NULL;
> > +ObjectClass *oc;
> > +bool probe_mode = false;
> >  
> >  p = qemu_opt_get(qemu_get_machine_opts(), "accel");
> >  if (p == NULL) {
> > -/* Use the default "accelerator", tcg */
> > -p = "tcg";
> > +oc = (ObjectClass *) MACHINE_GET_CLASS(current_machine);
> > +name = object_class_get_name(oc);
> > +probe_mode = !strcmp(name, "none" TYPE_MACHINE_SUFFIX);
> > +if (probe_mode) {
> > +/* Use these accelerators in probe mode, tcg should be last */
> > +p = probe_mode_accels;
> > +} else {
> > +/* Use the default "accelerator", tcg */
> > +p = "tcg";
> > +}
> >  }  
> 
> Can't we instead use an explicit ,accel=probe or ,accel=auto?
> That would then obsolete the next patch.

How would you express the following with the accel= approach?

-probe -machine s390-ccw,accel=kvm 

Using machine "none" as default with tcg as last accelerator initialized should 
not break
anything.

-M none

The return code of configure_accelerator() is ignored anyway.

Thanks,
Michael



Re: [RFC/RFT PATCH 0/3] arm64: KVM: work around incoherency with uncached guest mappings

2015-03-02 Thread Andrew Jones
On Mon, Mar 02, 2015 at 08:31:46AM -0800, Christoffer Dall wrote:
> On Tue, Feb 24, 2015 at 05:47:19PM +, Ard Biesheuvel wrote:
> > On 24 February 2015 at 14:55, Andrew Jones  wrote:
> > > On Fri, Feb 20, 2015 at 04:36:26PM +0100, Andrew Jones wrote:
> > >> On Fri, Feb 20, 2015 at 02:37:25PM +, Ard Biesheuvel wrote:
> > >> > On 20 February 2015 at 14:29, Andrew Jones  wrote:
> > >> > > So looks like the 3 orders of magnitude greater number of traps
> > >> > > (only to el2) don't impact kernel compiles.
> > >> > >
> > >> >
> > >> > OK, good! That was what I was hoping for, obviously.
> > >> >
> > >> > > Then I thought I'd be able to quick measure the number of cycles
> > >> > > a trap to el2 takes with this kvm-unit-tests test
> > >> > >
> > >> > > int main(void)
> > >> > > {
> > >> > > unsigned long start, end;
> > >> > > unsigned int sctlr;
> > >> > >
> > >> > > asm volatile(
> > >> > > "   mrs %0, sctlr_el1\n"
> > >> > > "   msr pmcr_el0, %1\n"
> > >> > > : "=&r" (sctlr) : "r" (5));
> > >> > >
> > >> > > asm volatile(
> > >> > > "   mrs %0, pmccntr_el0\n"
> > >> > > "   msr sctlr_el1, %2\n"
> > >> > > "   mrs %1, pmccntr_el0\n"
> > >> > > : "=&r" (start), "=&r" (end) : "r" (sctlr));
> > >> > >
> > >> > > printf("%llx\n", end - start);
> > >> > > return 0;
> > >> > > }
> > >> > >
> > >> > > after applying this patch to kvm
> > >> > >
> > >> > > diff --git a/arch/arm64/kvm/hyp.S b/arch/arm64/kvm/hyp.S
> > >> > > index bb91b6fc63861..5de39d740aa58 100644
> > >> > > --- a/arch/arm64/kvm/hyp.S
> > >> > > +++ b/arch/arm64/kvm/hyp.S
> > >> > > @@ -770,7 +770,7 @@
> > >> > >
> > >> > > mrs x2, mdcr_el2
> > >> > > and x2, x2, #MDCR_EL2_HPMN_MASK
> > >> > > -   orr x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR)
> > >> > > +// orr x2, x2, #(MDCR_EL2_TPM | MDCR_EL2_TPMCR)
> > >> > > orr x2, x2, #(MDCR_EL2_TDRA | MDCR_EL2_TDOSA)
> > >> > >
> > >> > > // Check for KVM_ARM64_DEBUG_DIRTY, and set debug to trap
> > >> > >
> > >> > > But I get zero for the cycle count. Not sure what I'm missing.
> > >> > >
> > >> >
> > >> > No clue tbh. Does the counter work as expected in the host?
> > >> >
> > >>
> > >> Guess not. I dropped the test into a module_init and inserted
> > >> it on the host. Always get zero for pmccntr_el0 reads. Or, if
> > >> I set it to something non-zero with a write, then I always get
> > >> that back - no increments. pmcr_el0 looks OK... I had forgotten
> > >> to set bit 31 of pmcntenset_el0, but doing that still doesn't
> > >> help. Anyway, I assume the problem is me. I'll keep looking to
> > >> see what I'm missing.
> > >>
> > >
> > > I returned to this and see that the problem was indeed me. I needed yet
> > > another enable bit set (the filter register needed to be instructed to
> > > count cycles while in el2). I've attached the code for the curious.
> > > The numbers are mean=6999, std_dev=242. Run on the host, or in a guest
> > > running on a host without this patch series (after TVM traps have been
> > > disabled), I get a pretty consistent 40.
> > >
> > > I checked how many vm-sysreg traps we do during the kernel compile
> > > benchmark. It's 124924. So it's a bit strange that we don't see the
> > > benchmark taking 10 to 20 seconds longer on average. I should probably
> > > double check my runs. In any case, while I like the approach of this
> > > series, the overhead is looking non-negligible.
> > >
> > 
> > Thanks a lot for producing these numbers. 125k x 7k == <1 billion
> > cycles == <1 second on a >1 GHz machine, I think?
> > Or am I missing something? How long does the actual compile take?
> > 
> I ran a sequence of benchmarks that I occasionally run (pbzip,
> kernbench, and hackbench) and I also saw < 1% performance degradation,
> so I think we can trust that somewhat.  (I can post the raw numbers when
> I have ssh access to my Linux desktop - sending this from Somewhere Over
> The Atlantic).
> 
> However, my concern with these patches are on two points:
> 
> 1. It's not a fix-all.  We still have the case where the guest expects
> the behavior of device memory (for strong ordering for example) on a RAM
> region, which we now break.  Similarly this doesn't support the
> non-coherent DMA to RAM region case.
> 
> 2. While the code is probably as nice as this kind of stuff gets, it
> is non-trivial and extremely difficult to debug.  The counter-point here
> is that we may end up handling other stuff at EL2 for performance reasons
> in the future.
> 
> Mainly because of point 1 above, I am leaning to thinking userspace
> should do the invalidation when it knows it needs to, either through KVM
> via a memslot flag or through some other syscall mechanism.

I've started down the memslot flag road by promoting KVM_MEMSLOT_INCOHERENT
to uapi/KVM_MEM_INCOHERENT, replacing the readonly memslot heuristic.
With a coup

Re: [RFC/RFT PATCH 0/3] arm64: KVM: work around incoherency with uncached guest mappings

2015-03-02 Thread Laszlo Ersek
On 03/02/15 17:47, Paolo Bonzini wrote:
> 
> Also, we may want to invalidate the cache for dirty pages before
> returning the dirty bitmap, and probably should do that directly in
> KVM_GET_DIRTY_LOG.

"I agree."

If KVM_GET_DIRTY_LOG is supposed to be atomic fetch and clear (from
userspace's aspect), then the cache invalidation should be an atomic
part of it too (from the same aspect).

(Sorry if I just said something incredibly stupid.)

Laszlo


Re: [PATCH] x86: svm: make wbinvd faster

2015-03-02 Thread Joel Schopp


On 03/02/2015 10:03 AM, Radim Krčmář wrote:

2015-03-02 10:25-0500, Bandan Das:

Radim Krčmář  writes:

2015-03-01 21:29-0500, Bandan Das:

Joel Schopp  writes:

+static int wbinvd_interception(struct vcpu_svm *svm)
+{
+   kvm_emulate_wbinvd(&svm->vcpu);
+   skip_emulated_instruction(&svm->vcpu);
+   return 1;
+}

Can't we merge this to kvm_emulate_wbinvd, and just call that function
directly for both vmx and svm ?

kvm_emulate_wbinvd() lives in x86.c and skip_emulated_instruction() is
from svm.c/vmx.c:  so we'd have to create a new x86 op and change the
emulator code as well ... it's probably better like this.

There's already one - kvm_x86_ops->skip_emulated_instruction

My bad, its usage is inconsistent and I only looked at two close
interceptions where it was used ... kvm_emulate_cpuid() calls
kvm_x86_ops->skip_emulated_instruction(), while kvm_emulate_halt() and
kvm_emulate_hypercall() need an external skip.

We do "skip" the instruction with kvm_emulate(), so automatically
skipping the instruction on kvm_emulate_*() makes sense:
  1. rename kvm_emulate_halt() and kvm_emulate_wbinvd() to accommodate
 callers that don't want to skip
  2. introduce kvm_emulate_{halt,wbinvd}() and move the skip to
 kvm_emulate_{halt,wbinvd,hypercall}()

The alternative is to remove kvm_x86_ops->skip_emulated_instruction():
  1. remove skip from kvm_emulate_cpuid() and modify callers
  2. move kvm_complete_insn_gp to a header file and use
 skip_emulated_instruction directly
  3. remove unused kvm_x86_ops->skip_emulated_instruction()

Which one do you prefer?
I prefer renaming them, i.e. kvm_emulate_wbinvd_noskip(), and making the
existing ones, i.e. kvm_emulate_wbinvd(), call the noskip version and add a
skip similar to how wbinvd_interception above does.  I can send out a
patch later today with that rework.
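Roughly this shape, as an untested sketch that reuses the existing kvm_x86_ops
hook mentioned earlier in the thread:

int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu);	/* today's body, minus any skip */

int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
{
	int ret = kvm_emulate_wbinvd_noskip(vcpu);

	kvm_x86_ops->skip_emulated_instruction(vcpu);
	return ret;
}

wbinvd_interception() and the vmx handler would then call kvm_emulate_wbinvd()
directly.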



Re: [Qemu-devel] [PATCH v3 01/16] Introduce probe mode for machine type none

2015-03-02 Thread Andreas Färber
On 02.03.2015 17:43, Michael Mueller wrote:
> On Mon, 02 Mar 2015 14:57:21 +0100
> Andreas Färber  wrote:
> 
>>>  int configure_accelerator(MachineState *ms)
>>>  {
>>> -const char *p;
>>> +const char *p, *name;
>>>  char buf[10];
>>>  int ret;
>>>  bool accel_initialised = false;
>>>  bool init_failed = false;
>>>  AccelClass *acc = NULL;
>>> +ObjectClass *oc;
>>> +bool probe_mode = false;
>>>  
>>>  p = qemu_opt_get(qemu_get_machine_opts(), "accel");
>>>  if (p == NULL) {
>>> -/* Use the default "accelerator", tcg */
>>> -p = "tcg";
>>> +oc = (ObjectClass *) MACHINE_GET_CLASS(current_machine);
>>> +name = object_class_get_name(oc);
>>> +probe_mode = !strcmp(name, "none" TYPE_MACHINE_SUFFIX);
>>> +if (probe_mode) {
>>> +/* Use these accelerators in probe mode, tcg should be last */
>>> +p = probe_mode_accels;
>>> +} else {
>>> +/* Use the default "accelerator", tcg */
>>> +p = "tcg";
>>> +}
>>>  }  
>>
>> Can't we instead use an explicit ,accel=probe or ,accel=auto?
>> That would then obsolete the next patch.
> 
> How would you express the following with the accel= approach?
> 
> -probe -machine s390-ccw,accel=kvm 
> 
> Using machine "none" as default with tcg as last accelerator initialized 
> should not break
> anything.
> 
> -M none

Let me ask differently: What does -machine none or -M none have to do
with probing? It reads as if you are introducing two probe modes. Why do
you need both? If we have -probe, isn't that independent of which
machine we specify? Who is going to call either, with which respective goal?

I think that changing the semantics of an absent ,accel=foo parameter to
mean something else than its longtime default of tcg is a bad idea.

Have you tested qtest with it? Doesn't -qtest imply accel=qtest, or is
that always passed explicitly?

Regards,
Andreas

-- 
SUSE Linux GmbH, Maxfeldstr. 5, 90409 Nürnberg, Germany
GF: Felix Imendörffer, Jane Smithard, Jennifer Guild, Dilip Upmanyu,
Graham Norton; HRB 21284 (AG Nürnberg)


[PATCH v14 02/20] vfio: platform: probe to devices on the platform bus

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Driver to bind to Linux platform devices, and callbacks to discover their
resources to be used by the main VFIO PLATFORM code.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform.c | 103 ++
 include/uapi/linux/vfio.h |   1 +
 2 files changed, 104 insertions(+)
 create mode 100644 drivers/vfio/platform/vfio_platform.c

diff --git a/drivers/vfio/platform/vfio_platform.c 
b/drivers/vfio/platform/vfio_platform.c
new file mode 100644
index 000..cef645c
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform.c
@@ -0,0 +1,103 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.10"
+#define DRIVER_AUTHOR   "Antonios Motakis "
+#define DRIVER_DESC "VFIO for platform devices - User Level meta-driver"
+
+/* probing devices from the linux platform bus */
+
+static struct resource *get_platform_resource(struct vfio_platform_device 
*vdev,
+ int num)
+{
+   struct platform_device *dev = (struct platform_device *) vdev->opaque;
+   int i;
+
+   for (i = 0; i < dev->num_resources; i++) {
+   struct resource *r = &dev->resource[i];
+
+   if (resource_type(r) & (IORESOURCE_MEM|IORESOURCE_IO)) {
+   if (!num)
+   return r;
+
+   num--;
+   }
+   }
+   return NULL;
+}
+
+static int get_platform_irq(struct vfio_platform_device *vdev, int i)
+{
+   struct platform_device *pdev = (struct platform_device *) vdev->opaque;
+
+   return platform_get_irq(pdev, i);
+}
+
+static int vfio_platform_probe(struct platform_device *pdev)
+{
+   struct vfio_platform_device *vdev;
+   int ret;
+
+   vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+   if (!vdev)
+   return -ENOMEM;
+
+   vdev->opaque = (void *) pdev;
+   vdev->name = pdev->name;
+   vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM;
+   vdev->get_resource = get_platform_resource;
+   vdev->get_irq = get_platform_irq;
+
+   ret = vfio_platform_probe_common(vdev, &pdev->dev);
+   if (ret)
+   kfree(vdev);
+
+   return ret;
+}
+
+static int vfio_platform_remove(struct platform_device *pdev)
+{
+   struct vfio_platform_device *vdev;
+
+   vdev = vfio_platform_remove_common(&pdev->dev);
+   if (vdev) {
+   kfree(vdev);
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
+static struct platform_driver vfio_platform_driver = {
+   .probe  = vfio_platform_probe,
+   .remove = vfio_platform_remove,
+   .driver = {
+   .name   = "vfio-platform",
+   .owner  = THIS_MODULE,
+   },
+};
+
+module_platform_driver(vfio_platform_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 30801a7..e33b04b 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -159,6 +159,7 @@ struct vfio_device_info {
__u32   flags;
 #define VFIO_DEVICE_FLAGS_RESET(1 << 0)/* Device supports 
reset */
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)/* vfio-pci device */
+#define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };
-- 
2.3.1



[PATCH v14 09/20] vfio/platform: support MMAP of MMIO regions

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Allow the MMIO regions of the device to be memory mapped, so userspace can
access them directly. PIO regions are not handled at this point.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_common.c | 65 
 1 file changed, 65 insertions(+)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index 4df66f5..d7fe2c7 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -55,6 +55,16 @@ static int vfio_platform_regions_init(struct 
vfio_platform_device *vdev)
if (!(res->flags & IORESOURCE_READONLY))
vdev->regions[i].flags |=
VFIO_REGION_INFO_FLAG_WRITE;
+
+   /*
+* Only regions addressed with PAGE granularity may be
+* MMAPed securely.
+*/
+   if (!(vdev->regions[i].addr & ~PAGE_MASK) &&
+   !(vdev->regions[i].size & ~PAGE_MASK))
+   vdev->regions[i].flags |=
+   VFIO_REGION_INFO_FLAG_MMAP;
+
break;
case IORESOURCE_IO:
vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO;
@@ -334,8 +344,63 @@ static ssize_t vfio_platform_write(void *device_data, 
const char __user *buf,
return -EINVAL;
 }
 
+static int vfio_platform_mmap_mmio(struct vfio_platform_region region,
+  struct vm_area_struct *vma)
+{
+   u64 req_len, pgoff, req_start;
+
+   req_len = vma->vm_end - vma->vm_start;
+   pgoff = vma->vm_pgoff &
+   ((1U << (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT)) - 1);
+   req_start = pgoff << PAGE_SHIFT;
+
+   if (region.size < PAGE_SIZE || req_start + req_len > region.size)
+   return -EINVAL;
+
+   vma->vm_page_prot = pgprot_noncached(vma->vm_page_prot);
+   vma->vm_pgoff = (region.addr >> PAGE_SHIFT) + pgoff;
+
+   return remap_pfn_range(vma, vma->vm_start, vma->vm_pgoff,
+  req_len, vma->vm_page_prot);
+}
+
 static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma)
 {
+   struct vfio_platform_device *vdev = device_data;
+   unsigned int index;
+
+   index = vma->vm_pgoff >> (VFIO_PLATFORM_OFFSET_SHIFT - PAGE_SHIFT);
+
+   if (vma->vm_end < vma->vm_start)
+   return -EINVAL;
+   if (!(vma->vm_flags & VM_SHARED))
+   return -EINVAL;
+   if (index >= vdev->num_regions)
+   return -EINVAL;
+   if (vma->vm_start & ~PAGE_MASK)
+   return -EINVAL;
+   if (vma->vm_end & ~PAGE_MASK)
+   return -EINVAL;
+
+   if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_MMAP))
+   return -EINVAL;
+
+   if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ)
+   && (vma->vm_flags & VM_READ))
+   return -EINVAL;
+
+   if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_WRITE)
+   && (vma->vm_flags & VM_WRITE))
+   return -EINVAL;
+
+   vma->vm_private_data = vdev;
+
+   if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
+   return vfio_platform_mmap_mmio(vdev->regions[index], vma);
+
+   else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
+   return -EINVAL; /* not implemented */
+
return -EINVAL;
 }
 
-- 
2.3.1
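For reference, the userspace side this enables looks roughly like the sketch
below (illustrative only; it assumes the device fd was already obtained via
VFIO_GROUP_GET_DEVICE_FD and keeps error handling minimal):

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <linux/vfio.h>

static void *map_region(int device_fd, unsigned int index)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = index,
	};

	if (ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info) ||
	    !(info.flags & VFIO_REGION_INFO_FLAG_MMAP))
		return MAP_FAILED;

	/* the region's mmap offset and size come straight from the driver */
	return mmap(NULL, info.size, PROT_READ | PROT_WRITE,
		    MAP_SHARED, device_fd, info.offset);
}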



[PATCH v14 15/20] vfio: virqfd: rename vfio_pci_virqfd_init and vfio_pci_virqfd_exit

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

The functions vfio_pci_virqfd_init and vfio_pci_virqfd_exit are not really
PCI specific, since we plan to reuse the virqfd code with more VFIO drivers
in addition to VFIO_PCI.

Signed-off-by: Antonios Motakis 
[Baptiste Reynal: Move rename vfio_pci_virqfd_init and vfio_pci_virqfd_exit
from "vfio: add a vfio_ prefix to virqfd_enable and virqfd_disable and export"]
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/pci/vfio_pci.c | 6 +++---
 drivers/vfio/pci/vfio_pci_intrs.c   | 4 ++--
 drivers/vfio/pci/vfio_pci_private.h | 4 ++--
 3 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index f8a1863..668d37c 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1030,7 +1030,7 @@ put_devs:
 static void __exit vfio_pci_cleanup(void)
 {
pci_unregister_driver(&vfio_pci_driver);
-   vfio_pci_virqfd_exit();
+   vfio_virqfd_exit();
vfio_pci_uninit_perm_bits();
 }
 
@@ -1044,7 +1044,7 @@ static int __init vfio_pci_init(void)
return ret;
 
/* Start the virqfd cleanup handler */
-   ret = vfio_pci_virqfd_init();
+   ret = vfio_virqfd_init();
if (ret)
goto out_virqfd;
 
@@ -1056,7 +1056,7 @@ static int __init vfio_pci_init(void)
return 0;
 
 out_driver:
-   vfio_pci_virqfd_exit();
+   vfio_virqfd_exit();
 out_virqfd:
vfio_pci_uninit_perm_bits();
return ret;
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 4d38c93..d96ffe0 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -45,7 +45,7 @@ struct virqfd {
 
 static struct workqueue_struct *vfio_irqfd_cleanup_wq;
 
-int __init vfio_pci_virqfd_init(void)
+int __init vfio_virqfd_init(void)
 {
vfio_irqfd_cleanup_wq =
create_singlethread_workqueue("vfio-irqfd-cleanup");
@@ -55,7 +55,7 @@ int __init vfio_pci_virqfd_init(void)
return 0;
 }
 
-void vfio_pci_virqfd_exit(void)
+void vfio_virqfd_exit(void)
 {
destroy_workqueue(vfio_irqfd_cleanup_wq);
 }
diff --git a/drivers/vfio/pci/vfio_pci_private.h 
b/drivers/vfio/pci/vfio_pci_private.h
index c9f9b32..0253965 100644
--- a/drivers/vfio/pci/vfio_pci_private.h
+++ b/drivers/vfio/pci/vfio_pci_private.h
@@ -87,8 +87,8 @@ extern ssize_t vfio_pci_vga_rw(struct vfio_pci_device *vdev, 
char __user *buf,
 extern int vfio_pci_init_perm_bits(void);
 extern void vfio_pci_uninit_perm_bits(void);
 
-extern int vfio_pci_virqfd_init(void);
-extern void vfio_pci_virqfd_exit(void);
+extern int vfio_virqfd_init(void);
+extern void vfio_virqfd_exit(void);
 
 extern int vfio_config_init(struct vfio_pci_device *vdev);
 extern void vfio_config_free(struct vfio_pci_device *vdev);
-- 
2.3.1



[PATCH v14 20/20] vfio/platform: implement IRQ masking/unmasking via an eventfd

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

With this patch the VFIO user will be able to set an eventfd that can be
used in order to mask and unmask IRQs of platform devices.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_irq.c | 47 ---
 drivers/vfio/platform/vfio_platform_private.h |  2 ++
 2 files changed, 45 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
index e0e6388..88bba57 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -37,6 +37,15 @@ static void vfio_platform_mask(struct vfio_platform_irq 
*irq_ctx)
spin_unlock_irqrestore(&irq_ctx->lock, flags);
 }
 
+static int vfio_platform_mask_handler(void *opaque, void *unused)
+{
+   struct vfio_platform_irq *irq_ctx = opaque;
+
+   vfio_platform_mask(irq_ctx);
+
+   return 0;
+}
+
 static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev,
  unsigned index, unsigned start,
  unsigned count, uint32_t flags,
@@ -48,8 +57,18 @@ static int vfio_platform_set_irq_mask(struct 
vfio_platform_device *vdev,
if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
return -EINVAL;
 
-   if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
-   return -EINVAL; /* not implemented yet */
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+   int32_t fd = *(int32_t *)data;
+
+   if (fd >= 0)
+   return vfio_virqfd_enable((void *) &vdev->irqs[index],
+ vfio_platform_mask_handler,
+ NULL, NULL,
+ &vdev->irqs[index].mask, fd);
+
+   vfio_virqfd_disable(&vdev->irqs[index].mask);
+   return 0;
+   }
 
if (flags & VFIO_IRQ_SET_DATA_NONE) {
vfio_platform_mask(&vdev->irqs[index]);
@@ -78,6 +97,15 @@ static void vfio_platform_unmask(struct vfio_platform_irq 
*irq_ctx)
spin_unlock_irqrestore(&irq_ctx->lock, flags);
 }
 
+static int vfio_platform_unmask_handler(void *opaque, void *unused)
+{
+   struct vfio_platform_irq *irq_ctx = opaque;
+
+   vfio_platform_unmask(irq_ctx);
+
+   return 0;
+}
+
 static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
unsigned index, unsigned start,
unsigned count, uint32_t flags,
@@ -89,8 +117,19 @@ static int vfio_platform_set_irq_unmask(struct 
vfio_platform_device *vdev,
if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
return -EINVAL;
 
-   if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
-   return -EINVAL; /* not implemented yet */
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+   int32_t fd = *(int32_t *)data;
+
+   if (fd >= 0)
+   return vfio_virqfd_enable((void *) &vdev->irqs[index],
+ vfio_platform_unmask_handler,
+ NULL, NULL,
+ &vdev->irqs[index].unmask,
+ fd);
+
+   vfio_virqfd_disable(&vdev->irqs[index].unmask);
+   return 0;
+   }
 
if (flags & VFIO_IRQ_SET_DATA_NONE) {
vfio_platform_unmask(&vdev->irqs[index]);
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
index ff2db1d..5d31e04 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -35,6 +35,8 @@ struct vfio_platform_irq {
struct eventfd_ctx  *trigger;
boolmasked;
spinlock_t  lock;
+   struct virqfd   *unmask;
+   struct virqfd   *mask;
 };
 
 struct vfio_platform_region {
-- 
2.3.1
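Illustrative userspace usage (not part of the patch; the helper name and error
handling are made up): register an eventfd whose signalling unmasks a given IRQ
index.

#include <stdint.h>
#include <string.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int set_unmask_eventfd(int device_fd, unsigned int index)
{
	char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
	struct vfio_irq_set *set = (struct vfio_irq_set *)buf;
	int32_t efd = eventfd(0, 0);

	if (efd < 0)
		return -1;

	set->argsz = sizeof(buf);
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_UNMASK;
	set->index = index;
	set->start = 0;
	set->count = 1;
	memcpy(set->data, &efd, sizeof(efd));

	if (ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set))
		return -1;

	/* writing an 8-byte nonzero value to efd now unmasks the IRQ */
	return efd;
}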



[PATCH v14 13/20] vfio/platform: support for level sensitive interrupts

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Level sensitive interrupts are exposed as maskable and automasked
interrupts and are masked and disabled automatically when they fire.

Signed-off-by: Antonios Motakis 
[Baptiste Reynal: Move masked interrupt initialization from "vfio/platform:
trigger an interrupt via eventfd"]
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_irq.c | 103 +-
 drivers/vfio/platform/vfio_platform_private.h |   2 +
 2 files changed, 102 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
index 611ec80..e0e6388 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -23,12 +23,59 @@
 
 #include "vfio_platform_private.h"
 
+static void vfio_platform_mask(struct vfio_platform_irq *irq_ctx)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(&irq_ctx->lock, flags);
+
+   if (!irq_ctx->masked) {
+   disable_irq_nosync(irq_ctx->hwirq);
+   irq_ctx->masked = true;
+   }
+
+   spin_unlock_irqrestore(&irq_ctx->lock, flags);
+}
+
 static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev,
  unsigned index, unsigned start,
  unsigned count, uint32_t flags,
  void *data)
 {
-   return -EINVAL;
+   if (start != 0 || count != 1)
+   return -EINVAL;
+
+   if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
+   return -EINVAL;
+
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
+   return -EINVAL; /* not implemented yet */
+
+   if (flags & VFIO_IRQ_SET_DATA_NONE) {
+   vfio_platform_mask(&vdev->irqs[index]);
+
+   } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+   uint8_t mask = *(uint8_t *)data;
+
+   if (mask)
+   vfio_platform_mask(&vdev->irqs[index]);
+   }
+
+   return 0;
+}
+
+static void vfio_platform_unmask(struct vfio_platform_irq *irq_ctx)
+{
+   unsigned long flags;
+
+   spin_lock_irqsave(&irq_ctx->lock, flags);
+
+   if (irq_ctx->masked) {
+   enable_irq(irq_ctx->hwirq);
+   irq_ctx->masked = false;
+   }
+
+   spin_unlock_irqrestore(&irq_ctx->lock, flags);
 }
 
 static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
@@ -36,7 +83,50 @@ static int vfio_platform_set_irq_unmask(struct 
vfio_platform_device *vdev,
unsigned count, uint32_t flags,
void *data)
 {
-   return -EINVAL;
+   if (start != 0 || count != 1)
+   return -EINVAL;
+
+   if (!(vdev->irqs[index].flags & VFIO_IRQ_INFO_MASKABLE))
+   return -EINVAL;
+
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD)
+   return -EINVAL; /* not implemented yet */
+
+   if (flags & VFIO_IRQ_SET_DATA_NONE) {
+   vfio_platform_unmask(&vdev->irqs[index]);
+
+   } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+   uint8_t unmask = *(uint8_t *)data;
+
+   if (unmask)
+   vfio_platform_unmask(&vdev->irqs[index]);
+   }
+
+   return 0;
+}
+
+static irqreturn_t vfio_automasked_irq_handler(int irq, void *dev_id)
+{
+   struct vfio_platform_irq *irq_ctx = dev_id;
+   unsigned long flags;
+   int ret = IRQ_NONE;
+
+   spin_lock_irqsave(&irq_ctx->lock, flags);
+
+   if (!irq_ctx->masked) {
+   ret = IRQ_HANDLED;
+
+   /* automask maskable interrupts */
+   disable_irq_nosync(irq_ctx->hwirq);
+   irq_ctx->masked = true;
+   }
+
+   spin_unlock_irqrestore(&irq_ctx->lock, flags);
+
+   if (ret == IRQ_HANDLED)
+   eventfd_signal(irq_ctx->trigger, 1);
+
+   return ret;
 }
 
 static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
@@ -78,6 +168,7 @@ static int vfio_set_trigger(struct vfio_platform_device 
*vdev, int index,
 
irq->trigger = trigger;
 
+   irq_set_status_flags(irq->hwirq, IRQ_NOAUTOEN);
ret = request_irq(irq->hwirq, handler, 0, irq->name, irq);
if (ret) {
kfree(irq->name);
@@ -86,6 +177,9 @@ static int vfio_set_trigger(struct vfio_platform_device 
*vdev, int index,
return ret;
}
 
+   if (!irq->masked)
+   enable_irq(irq->hwirq);
+
return 0;
 }
 
@@ -98,7 +192,7 @@ static int vfio_platform_set_irq_trigger(struct 
vfio_platform_device *vdev,
irq_handler_t handler;
 
if (vdev->irqs[index].flags & VFIO_IRQ_INFO_AUTOMASKED)
-   return -EINVAL; /* not implemented */
+   handler = vfio_automasked_irq_handler;
else
handler = vfio_irq_handler;
 
@@ -170,6 +264,8 @@ int vfio_platform_irq_init(struct vfio_platform

[PATCH v14 16/20] vfio: add local lock for virqfd instead of depending on VFIO PCI

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

The Virqfd code needs to keep accesses to any struct *virqfd safe, but
this comes into play only when creating or destroying eventfds, so sharing
the same spinlock with the VFIO bus driver is not necessary.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/pci/vfio_pci_intrs.c | 31 ---
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index d96ffe0..7d3c135 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -44,6 +44,7 @@ struct virqfd {
 };
 
 static struct workqueue_struct *vfio_irqfd_cleanup_wq;
+DEFINE_SPINLOCK(virqfd_lock);
 
 int __init vfio_virqfd_init(void)
 {
@@ -80,21 +81,21 @@ static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, 
int sync, void *key)
 
if (flags & POLLHUP) {
unsigned long flags;
-   spin_lock_irqsave(&virqfd->vdev->irqlock, flags);
+   spin_lock_irqsave(&virqfd_lock, flags);
 
/*
 * The eventfd is closing, if the virqfd has not yet been
 * queued for release, as determined by testing whether the
-* vdev pointer to it is still valid, queue it now.  As
+* virqfd pointer to it is still valid, queue it now.  As
 * with kvm irqfds, we know we won't race against the virqfd
-* going away because we hold wqh->lock to get here.
+* going away because we hold the lock to get here.
 */
if (*(virqfd->pvirqfd) == virqfd) {
*(virqfd->pvirqfd) = NULL;
virqfd_deactivate(virqfd);
}
 
-   spin_unlock_irqrestore(&virqfd->vdev->irqlock, flags);
+   spin_unlock_irqrestore(&virqfd_lock, flags);
}
 
return 0;
@@ -170,16 +171,16 @@ int vfio_virqfd_enable(struct vfio_pci_device *vdev,
 * we update the pointer to the virqfd under lock to avoid
 * pushing multiple jobs to release the same virqfd.
 */
-   spin_lock_irq(&vdev->irqlock);
+   spin_lock_irq(&virqfd_lock);
 
if (*pvirqfd) {
-   spin_unlock_irq(&vdev->irqlock);
+   spin_unlock_irq(&virqfd_lock);
ret = -EBUSY;
goto err_busy;
}
*pvirqfd = virqfd;
 
-   spin_unlock_irq(&vdev->irqlock);
+   spin_unlock_irq(&virqfd_lock);
 
/*
 * Install our own custom wake-up handling so we are notified via
@@ -217,18 +218,18 @@ err_fd:
 }
 EXPORT_SYMBOL_GPL(vfio_virqfd_enable);
 
-void vfio_virqfd_disable(struct vfio_pci_device *vdev, struct virqfd **pvirqfd)
+void vfio_virqfd_disable(struct virqfd **pvirqfd)
 {
unsigned long flags;
 
-   spin_lock_irqsave(&vdev->irqlock, flags);
+   spin_lock_irqsave(&virqfd_lock, flags);
 
if (*pvirqfd) {
virqfd_deactivate(*pvirqfd);
*pvirqfd = NULL;
}
 
-   spin_unlock_irqrestore(&vdev->irqlock, flags);
+   spin_unlock_irqrestore(&virqfd_lock, flags);
 
/*
 * Block until we know all outstanding shutdown jobs have completed.
@@ -441,8 +442,8 @@ static int vfio_intx_set_signal(struct vfio_pci_device 
*vdev, int fd)
 static void vfio_intx_disable(struct vfio_pci_device *vdev)
 {
vfio_intx_set_signal(vdev, -1);
-   vfio_virqfd_disable(vdev, &vdev->ctx[0].unmask);
-   vfio_virqfd_disable(vdev, &vdev->ctx[0].mask);
+   vfio_virqfd_disable(&vdev->ctx[0].unmask);
+   vfio_virqfd_disable(&vdev->ctx[0].mask);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
vdev->num_ctx = 0;
kfree(vdev->ctx);
@@ -606,8 +607,8 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, 
bool msix)
vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
 
for (i = 0; i < vdev->num_ctx; i++) {
-   vfio_virqfd_disable(vdev, &vdev->ctx[i].unmask);
-   vfio_virqfd_disable(vdev, &vdev->ctx[i].mask);
+   vfio_virqfd_disable(&vdev->ctx[i].unmask);
+   vfio_virqfd_disable(&vdev->ctx[i].mask);
}
 
if (msix) {
@@ -645,7 +646,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_device 
*vdev,
  vfio_send_intx_eventfd, NULL,
  &vdev->ctx[0].unmask, fd);
 
-   vfio_virqfd_disable(vdev, &vdev->ctx[0].unmask);
+   vfio_virqfd_disable(&vdev->ctx[0].unmask);
}
 
return 0;
-- 
2.3.1



[PATCH v14 18/20] vfio: move eventfd support code for VFIO_PCI to a separate file

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

The virqfd functionality that is used by VFIO_PCI to implement interrupt
masking and unmasking via an eventfd, is generic enough and can be reused
by another driver. Move it to a separate file in order to allow the code
to be shared.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/pci/Makefile   |   3 +-
 drivers/vfio/pci/vfio_pci_intrs.c   | 215 
 drivers/vfio/pci/vfio_pci_private.h |   3 -
 drivers/vfio/virqfd.c   | 213 +++
 include/linux/vfio.h|  27 +
 5 files changed, 242 insertions(+), 219 deletions(-)
 create mode 100644 drivers/vfio/virqfd.c

diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index 1310792..c7c8644 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,4 +1,5 @@
 
-vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
+vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o \
+ ../virqfd.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 1a16da3..da6616e 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -19,228 +19,13 @@
 #include 
 #include 
 #include 
-#include 
 #include 
 #include 
-#include 
 #include 
 
 #include "vfio_pci_private.h"
 
 /*
- * IRQfd - generic
- */
-struct virqfd {
-   void*opaque;
-   struct eventfd_ctx  *eventfd;
-   int (*handler)(void *, void *);
-   void(*thread)(void *, void *);
-   void*data;
-   struct work_struct  inject;
-   wait_queue_twait;
-   poll_table  pt;
-   struct work_struct  shutdown;
-   struct virqfd   **pvirqfd;
-};
-
-static struct workqueue_struct *vfio_irqfd_cleanup_wq;
-DEFINE_SPINLOCK(virqfd_lock);
-
-int __init vfio_virqfd_init(void)
-{
-   vfio_irqfd_cleanup_wq =
-   create_singlethread_workqueue("vfio-irqfd-cleanup");
-   if (!vfio_irqfd_cleanup_wq)
-   return -ENOMEM;
-
-   return 0;
-}
-
-void vfio_virqfd_exit(void)
-{
-   destroy_workqueue(vfio_irqfd_cleanup_wq);
-}
-
-static void virqfd_deactivate(struct virqfd *virqfd)
-{
-   queue_work(vfio_irqfd_cleanup_wq, &virqfd->shutdown);
-}
-
-static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, int sync, void 
*key)
-{
-   struct virqfd *virqfd = container_of(wait, struct virqfd, wait);
-   unsigned long flags = (unsigned long)key;
-
-   if (flags & POLLIN) {
-   /* An event has been signaled, call function */
-   if ((!virqfd->handler ||
-virqfd->handler(virqfd->opaque, virqfd->data)) &&
-   virqfd->thread)
-   schedule_work(&virqfd->inject);
-   }
-
-   if (flags & POLLHUP) {
-   unsigned long flags;
-   spin_lock_irqsave(&virqfd_lock, flags);
-
-   /*
-* The eventfd is closing, if the virqfd has not yet been
-* queued for release, as determined by testing whether the
-* virqfd pointer to it is still valid, queue it now.  As
-* with kvm irqfds, we know we won't race against the virqfd
-* going away because we hold the lock to get here.
-*/
-   if (*(virqfd->pvirqfd) == virqfd) {
-   *(virqfd->pvirqfd) = NULL;
-   virqfd_deactivate(virqfd);
-   }
-
-   spin_unlock_irqrestore(&virqfd_lock, flags);
-   }
-
-   return 0;
-}
-
-static void virqfd_ptable_queue_proc(struct file *file,
-wait_queue_head_t *wqh, poll_table *pt)
-{
-   struct virqfd *virqfd = container_of(pt, struct virqfd, pt);
-   add_wait_queue(wqh, &virqfd->wait);
-}
-
-static void virqfd_shutdown(struct work_struct *work)
-{
-   struct virqfd *virqfd = container_of(work, struct virqfd, shutdown);
-   u64 cnt;
-
-   eventfd_ctx_remove_wait_queue(virqfd->eventfd, &virqfd->wait, &cnt);
-   flush_work(&virqfd->inject);
-   eventfd_ctx_put(virqfd->eventfd);
-
-   kfree(virqfd);
-}
-
-static void virqfd_inject(struct work_struct *work)
-{
-   struct virqfd *virqfd = container_of(work, struct virqfd, inject);
-   if (virqfd->thread)
-   virqfd->thread(virqfd->opaque, virqfd->data);
-}
-
-int vfio_virqfd_enable(void *opaque,
-  int (*handler)(void *, void *),
-  void (*thread)(void *, void *),
-  void *data, struct virqfd **pvirqfd, int fd)
-{
-   struct fd irqfd;
-   struct eventfd_ctx *ctx;
-   struct virqfd *virqfd;
-   int ret = 0;
-   unsigned int events;
-
-

Re: [RFC/RFT PATCH 0/3] arm64: KVM: work around incoherency with uncached guest mappings

2015-03-02 Thread Andrew Jones
On Mon, Mar 02, 2015 at 05:55:44PM +0100, Laszlo Ersek wrote:
> On 03/02/15 17:47, Paolo Bonzini wrote:
> > 
> > Also, we may want to invalidate the cache for dirty pages before
> > returning the dirty bitmap, and probably should do that directly in
> > KVM_GET_DIRTY_LOG.
> 
> "I agree."
> 
> If KVM_GET_DIRTY_LOG is supposed to be atomic fetch and clear (from
> userspace's aspect), then the cache invalidation should be an atomic
> part of it too (from the same aspect).
> 
> (Sorry if I just said something incredibly stupid.)
>

With the path I'm headed down, all cache maintenance operations will
be done before exiting to userspace (and after returning). I was
actually already letting a feature creep into this PoC by setting
KVM_MEM_LOG_DIRTY_PAGES when we see KVM_MEM_INCOHERENT has been set,
and the region isn't readonly. The dirty log would then be used by
KVM internally to know exactly which pages need to be invalidated
before the exit.

drew


[PATCH v14 11/20] vfio/platform: initial interrupts support code

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

This patch is a skeleton for the VFIO_DEVICE_SET_IRQS IOCTL, around which
most IRQ functionality is implemented in VFIO.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_common.c  | 52 +--
 drivers/vfio/platform/vfio_platform_irq.c | 59 +++
 drivers/vfio/platform/vfio_platform_private.h |  7 
 3 files changed, 115 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index 908d510..abcff7a 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -205,10 +205,54 @@ static long vfio_platform_ioctl(void *device_data,
 
return copy_to_user((void __user *)arg, &info, minsz);
 
-   } else if (cmd == VFIO_DEVICE_SET_IRQS)
-   return -EINVAL;
+   } else if (cmd == VFIO_DEVICE_SET_IRQS) {
+   struct vfio_irq_set hdr;
+   u8 *data = NULL;
+   int ret = 0;
+
+   minsz = offsetofend(struct vfio_irq_set, count);
+
+   if (copy_from_user(&hdr, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (hdr.argsz < minsz)
+   return -EINVAL;
+
+   if (hdr.index >= vdev->num_irqs)
+   return -EINVAL;
+
+   if (hdr.flags & ~(VFIO_IRQ_SET_DATA_TYPE_MASK |
+ VFIO_IRQ_SET_ACTION_TYPE_MASK))
+   return -EINVAL;
 
-   else if (cmd == VFIO_DEVICE_RESET)
+   if (!(hdr.flags & VFIO_IRQ_SET_DATA_NONE)) {
+   size_t size;
+
+   if (hdr.flags & VFIO_IRQ_SET_DATA_BOOL)
+   size = sizeof(uint8_t);
+   else if (hdr.flags & VFIO_IRQ_SET_DATA_EVENTFD)
+   size = sizeof(int32_t);
+   else
+   return -EINVAL;
+
+   if (hdr.argsz - minsz < size)
+   return -EINVAL;
+
+   data = memdup_user((void __user *)(arg + minsz), size);
+   if (IS_ERR(data))
+   return PTR_ERR(data);
+   }
+
+   mutex_lock(&vdev->igate);
+
+   ret = vfio_platform_set_irqs_ioctl(vdev, hdr.flags, hdr.index,
+  hdr.start, hdr.count, data);
+   mutex_unlock(&vdev->igate);
+   kfree(data);
+
+   return ret;
+
+   } else if (cmd == VFIO_DEVICE_RESET)
return -EINVAL;
 
return -ENOTTY;
@@ -458,6 +502,8 @@ int vfio_platform_probe_common(struct vfio_platform_device 
*vdev,
return ret;
}
 
+   mutex_init(&vdev->igate);
+
return 0;
 }
 EXPORT_SYMBOL_GPL(vfio_platform_probe_common);
diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
index c6c3ec1..df5c919 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -23,6 +23,56 @@
 
 #include "vfio_platform_private.h"
 
+static int vfio_platform_set_irq_mask(struct vfio_platform_device *vdev,
+ unsigned index, unsigned start,
+ unsigned count, uint32_t flags,
+ void *data)
+{
+   return -EINVAL;
+}
+
+static int vfio_platform_set_irq_unmask(struct vfio_platform_device *vdev,
+   unsigned index, unsigned start,
+   unsigned count, uint32_t flags,
+   void *data)
+{
+   return -EINVAL;
+}
+
+static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev,
+unsigned index, unsigned start,
+unsigned count, uint32_t flags,
+void *data)
+{
+   return -EINVAL;
+}
+
+int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
+uint32_t flags, unsigned index, unsigned start,
+unsigned count, void *data)
+{
+   int (*func)(struct vfio_platform_device *vdev, unsigned index,
+   unsigned start, unsigned count, uint32_t flags,
+   void *data) = NULL;
+
+   switch (flags & VFIO_IRQ_SET_ACTION_TYPE_MASK) {
+   case VFIO_IRQ_SET_ACTION_MASK:
+   func = vfio_platform_set_irq_mask;
+   break;
+   case VFIO_IRQ_SET_ACTION_UNMASK:
+   func = vfio_platform_set_irq_unmask;
+   break;
+   case VFIO_IRQ_SET_ACTION_TRIGGER:
+   func = vfio_platform_set_irq

[PATCH v14 10/20] vfio/platform: return IRQ info

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Return information for the interrupts exposed by the device.
This patch extends VFIO_DEVICE_GET_INFO with the number of IRQs
and enables VFIO_DEVICE_GET_IRQ_INFO.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/Makefile|  2 +-
 drivers/vfio/platform/vfio_platform_common.c  | 31 +---
 drivers/vfio/platform/vfio_platform_irq.c | 51 +++
 drivers/vfio/platform/vfio_platform_private.h | 10 ++
 4 files changed, 89 insertions(+), 5 deletions(-)
 create mode 100644 drivers/vfio/platform/vfio_platform_irq.c

diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
index 1957170..81de144 100644
--- a/drivers/vfio/platform/Makefile
+++ b/drivers/vfio/platform/Makefile
@@ -1,5 +1,5 @@
 
-vfio-platform-y := vfio_platform.o vfio_platform_common.o
+vfio-platform-y := vfio_platform.o vfio_platform_common.o vfio_platform_irq.o
 
 obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
 
diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index d7fe2c7..908d510 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -101,6 +101,7 @@ static void vfio_platform_release(void *device_data)
 
if (!(--vdev->refcnt)) {
vfio_platform_regions_cleanup(vdev);
+   vfio_platform_irq_cleanup(vdev);
}
 
mutex_unlock(&driver_lock);
@@ -122,6 +123,10 @@ static int vfio_platform_open(void *device_data)
ret = vfio_platform_regions_init(vdev);
if (ret)
goto err_reg;
+
+   ret = vfio_platform_irq_init(vdev);
+   if (ret)
+   goto err_irq;
}
 
vdev->refcnt++;
@@ -129,6 +134,8 @@ static int vfio_platform_open(void *device_data)
mutex_unlock(&driver_lock);
return 0;
 
+err_irq:
+   vfio_platform_regions_cleanup(vdev);
 err_reg:
mutex_unlock(&driver_lock);
module_put(THIS_MODULE);
@@ -154,7 +161,7 @@ static long vfio_platform_ioctl(void *device_data,
 
info.flags = vdev->flags;
info.num_regions = vdev->num_regions;
-   info.num_irqs = 0;
+   info.num_irqs = vdev->num_irqs;
 
return copy_to_user((void __user *)arg, &info, minsz);
 
@@ -179,10 +186,26 @@ static long vfio_platform_ioctl(void *device_data,
 
return copy_to_user((void __user *)arg, &info, minsz);
 
-   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
-   return -EINVAL;
+   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO) {
+   struct vfio_irq_info info;
+
+   minsz = offsetofend(struct vfio_irq_info, count);
+
+   if (copy_from_user(&info, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (info.argsz < minsz)
+   return -EINVAL;
+
+   if (info.index >= vdev->num_irqs)
+   return -EINVAL;
+
+   info.flags = vdev->irqs[info.index].flags;
+   info.count = vdev->irqs[info.index].count;
+
+   return copy_to_user((void __user *)arg, &info, minsz);
 
-   else if (cmd == VFIO_DEVICE_SET_IRQS)
+   } else if (cmd == VFIO_DEVICE_SET_IRQS)
return -EINVAL;
 
else if (cmd == VFIO_DEVICE_RESET)
diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
new file mode 100644
index 000..c6c3ec1
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -0,0 +1,51 @@
+/*
+ * VFIO platform devices interrupt handling
+ *
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+int vfio_platform_irq_init(struct vfio_platform_device *vdev)
+{
+   int cnt = 0, i;
+
+   while (vdev->get_irq(vdev, cnt) >= 0)
+   cnt++;
+
+   vdev->irqs = kcalloc(cnt, sizeof(struct vfio_platform_irq), GFP_KERNEL);
+   if (!vdev->irqs)
+   return -ENOMEM;
+
+   for (i = 0; i < cnt; i++) {
+   vdev->irqs[i].flags = 0;
+   vdev->irqs[i].count = 1;
+   }
+
+   vdev->num_irqs = cnt;
+
+   return 0;
+}
+
+void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
+{
+   vdev->num_irqs = 0;
+

[PATCH v14 08/20] vfio/platform: read and write support for the device fd

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

VFIO returns a file descriptor which we can use to access the memory
regions of the device. Usually the user will mmap memory regions that are
addressable on page boundaries; however, for memory regions where this is
not the case we cannot provide mmap functionality due to security concerns.
For this reason we also allow read and write access to those regions
through the file descriptor.

We implement this functionality only for MMIO regions of platform devices;
PIO regions are not handled at this point.
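
As a rough sketch of the resulting userspace interface (hypothetical helper
name, error handling omitted), a 32-bit register at byte offset 'reg' of
region 'index' can then be read through the device fd:

#include <stdint.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static uint32_t read_reg32(int device_fd, unsigned int index, uint64_t reg)
{
	struct vfio_region_info info = {
		.argsz = sizeof(info),
		.index = index,
	};
	uint32_t val = 0;

	/* the region info offset locates the region within the device fd */
	ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &info);
	pread(device_fd, &val, sizeof(val), info.offset + reg);

	return val;
}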

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_common.c  | 150 ++
 drivers/vfio/platform/vfio_platform_private.h |   1 +
 2 files changed, 151 insertions(+)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index 47f6309..4df66f5 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -51,6 +51,10 @@ static int vfio_platform_regions_init(struct 
vfio_platform_device *vdev)
switch (resource_type(res)) {
case IORESOURCE_MEM:
vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_MMIO;
+   vdev->regions[i].flags |= VFIO_REGION_INFO_FLAG_READ;
+   if (!(res->flags & IORESOURCE_READONLY))
+   vdev->regions[i].flags |=
+   VFIO_REGION_INFO_FLAG_WRITE;
break;
case IORESOURCE_IO:
vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO;
@@ -70,6 +74,11 @@ err:
 
 static void vfio_platform_regions_cleanup(struct vfio_platform_device *vdev)
 {
+   int i;
+
+   for (i = 0; i < vdev->num_regions; i++)
+   iounmap(vdev->regions[i].ioaddr);
+
vdev->num_regions = 0;
kfree(vdev->regions);
 }
@@ -172,15 +181,156 @@ static long vfio_platform_ioctl(void *device_data,
return -ENOTTY;
 }
 
+static ssize_t vfio_platform_read_mmio(struct vfio_platform_region reg,
+  char __user *buf, size_t count,
+  loff_t off)
+{
+   unsigned int done = 0;
+
+   if (!reg.ioaddr) {
+   reg.ioaddr =
+   ioremap_nocache(reg.addr, reg.size);
+
+   if (!reg.ioaddr)
+   return -ENOMEM;
+   }
+
+   while (count) {
+   size_t filled;
+
+   if (count >= 4 && !(off % 4)) {
+   u32 val;
+
+   val = ioread32(reg.ioaddr + off);
+   if (copy_to_user(buf, &val, 4))
+   goto err;
+
+   filled = 4;
+   } else if (count >= 2 && !(off % 2)) {
+   u16 val;
+
+   val = ioread16(reg.ioaddr + off);
+   if (copy_to_user(buf, &val, 2))
+   goto err;
+
+   filled = 2;
+   } else {
+   u8 val;
+
+   val = ioread8(reg.ioaddr + off);
+   if (copy_to_user(buf, &val, 1))
+   goto err;
+
+   filled = 1;
+   }
+
+
+   count -= filled;
+   done += filled;
+   off += filled;
+   buf += filled;
+   }
+
+   return done;
+err:
+   return -EFAULT;
+}
+
 static ssize_t vfio_platform_read(void *device_data, char __user *buf,
  size_t count, loff_t *ppos)
 {
+   struct vfio_platform_device *vdev = device_data;
+   unsigned int index = VFIO_PLATFORM_OFFSET_TO_INDEX(*ppos);
+   loff_t off = *ppos & VFIO_PLATFORM_OFFSET_MASK;
+
+   if (index >= vdev->num_regions)
+   return -EINVAL;
+
+   if (!(vdev->regions[index].flags & VFIO_REGION_INFO_FLAG_READ))
+   return -EINVAL;
+
+   if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_MMIO)
+   return vfio_platform_read_mmio(vdev->regions[index],
+   buf, count, off);
+   else if (vdev->regions[index].type & VFIO_PLATFORM_REGION_TYPE_PIO)
+   return -EINVAL; /* not implemented */
+
return -EINVAL;
 }
 
+static ssize_t vfio_platform_write_mmio(struct vfio_platform_region reg,
+   const char __user *buf, size_t count,
+   loff_t off)
+{
+   unsigned int done = 0;
+
+   if (!reg.ioaddr) {
+   reg.ioaddr =
+   ioremap_nocache(reg.addr, reg.size);
+
+   if (!reg.ioaddr)
+   return -ENOMEM;
+   }
+
+   while (count) {
+   size_t fill

[PATCH v14 06/20] vfio/platform: return info for bound device

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

A VFIO userspace driver will start by opening the VFIO device
that corresponds to an IOMMU group, and will use the ioctl interface
to get the basic device info, such as number of memory regions and
interrupts, and their properties. This patch enables the
VFIO_DEVICE_GET_INFO ioctl call.
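
The beginning of such a userspace driver might look like the sketch below
(the group number and device name are made up for the example, and error
checking is omitted):

#include <fcntl.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

int main(void)
{
	struct vfio_group_status status = { .argsz = sizeof(status) };
	struct vfio_device_info info = { .argsz = sizeof(info) };
	int container, group, device;

	container = open("/dev/vfio/vfio", O_RDWR);
	group = open("/dev/vfio/42", O_RDWR);	/* hypothetical group number */

	ioctl(group, VFIO_GROUP_GET_STATUS, &status);
	ioctl(group, VFIO_GROUP_SET_CONTAINER, &container);
	ioctl(container, VFIO_SET_IOMMU, VFIO_TYPE1_IOMMU);

	/* hypothetical platform device name */
	device = ioctl(group, VFIO_GROUP_GET_DEVICE_FD, "fff51000.ethernet");

	/* the ioctl enabled by this patch */
	ioctl(device, VFIO_DEVICE_GET_INFO, &info);

	return 0;
}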

Signed-off-by: Antonios Motakis 
[Baptiste Reynal: added include in vfio_platform_common.c]
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_common.c | 24 +---
 1 file changed, 21 insertions(+), 3 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index 34d023b..c2f853a 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -18,6 +18,7 @@
 #include 
 #include 
 #include 
+#include 
 #include 
 
 #include "vfio_platform_private.h"
@@ -38,10 +39,27 @@ static int vfio_platform_open(void *device_data)
 static long vfio_platform_ioctl(void *device_data,
unsigned int cmd, unsigned long arg)
 {
-   if (cmd == VFIO_DEVICE_GET_INFO)
-   return -EINVAL;
+   struct vfio_platform_device *vdev = device_data;
+   unsigned long minsz;
+
+   if (cmd == VFIO_DEVICE_GET_INFO) {
+   struct vfio_device_info info;
+
+   minsz = offsetofend(struct vfio_device_info, num_irqs);
+
+   if (copy_from_user(&info, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (info.argsz < minsz)
+   return -EINVAL;
+
+   info.flags = vdev->flags;
+   info.num_regions = 0;
+   info.num_irqs = 0;
+
+   return copy_to_user((void __user *)arg, &info, minsz);
 
-   else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
+   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
return -EINVAL;
 
else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
-- 
2.3.1



[PATCH v14 14/20] vfio: add a vfio_ prefix to virqfd_enable and virqfd_disable and export

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

We want to reuse virqfd functionality in multiple VFIO drivers; before
moving these functions to core VFIO, add the vfio_ prefix to the
virqfd_enable and virqfd_disable functions, and export them so they can
be used from other modules.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/pci/vfio_pci_intrs.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index f88bfdf..4d38c93 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -126,10 +126,10 @@ static void virqfd_inject(struct work_struct *work)
virqfd->thread(virqfd->vdev, virqfd->data);
 }
 
-static int virqfd_enable(struct vfio_pci_device *vdev,
-int (*handler)(struct vfio_pci_device *, void *),
-void (*thread)(struct vfio_pci_device *, void *),
-void *data, struct virqfd **pvirqfd, int fd)
+int vfio_virqfd_enable(struct vfio_pci_device *vdev,
+  int (*handler)(struct vfio_pci_device *, void *),
+  void (*thread)(struct vfio_pci_device *, void *),
+  void *data, struct virqfd **pvirqfd, int fd)
 {
struct fd irqfd;
struct eventfd_ctx *ctx;
@@ -215,9 +215,9 @@ err_fd:
 
return ret;
 }
+EXPORT_SYMBOL_GPL(vfio_virqfd_enable);
 
-static void virqfd_disable(struct vfio_pci_device *vdev,
-  struct virqfd **pvirqfd)
+void vfio_virqfd_disable(struct vfio_pci_device *vdev, struct virqfd **pvirqfd)
 {
unsigned long flags;
 
@@ -237,6 +237,7 @@ static void virqfd_disable(struct vfio_pci_device *vdev,
 */
flush_workqueue(vfio_irqfd_cleanup_wq);
 }
+EXPORT_SYMBOL_GPL(vfio_virqfd_disable);
 
 /*
  * INTx
@@ -440,8 +441,8 @@ static int vfio_intx_set_signal(struct vfio_pci_device 
*vdev, int fd)
 static void vfio_intx_disable(struct vfio_pci_device *vdev)
 {
vfio_intx_set_signal(vdev, -1);
-   virqfd_disable(vdev, &vdev->ctx[0].unmask);
-   virqfd_disable(vdev, &vdev->ctx[0].mask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[0].unmask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[0].mask);
vdev->irq_type = VFIO_PCI_NUM_IRQS;
vdev->num_ctx = 0;
kfree(vdev->ctx);
@@ -605,8 +606,8 @@ static void vfio_msi_disable(struct vfio_pci_device *vdev, 
bool msix)
vfio_msi_set_block(vdev, 0, vdev->num_ctx, NULL, msix);
 
for (i = 0; i < vdev->num_ctx; i++) {
-   virqfd_disable(vdev, &vdev->ctx[i].unmask);
-   virqfd_disable(vdev, &vdev->ctx[i].mask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[i].unmask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[i].mask);
}
 
if (msix) {
@@ -639,11 +640,12 @@ static int vfio_pci_set_intx_unmask(struct 
vfio_pci_device *vdev,
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int32_t fd = *(int32_t *)data;
if (fd >= 0)
-   return virqfd_enable(vdev, vfio_pci_intx_unmask_handler,
-vfio_send_intx_eventfd, NULL,
-&vdev->ctx[0].unmask, fd);
+   return vfio_virqfd_enable(vdev,
+ vfio_pci_intx_unmask_handler,
+ vfio_send_intx_eventfd, NULL,
+ &vdev->ctx[0].unmask, fd);
 
-   virqfd_disable(vdev, &vdev->ctx[0].unmask);
+   vfio_virqfd_disable(vdev, &vdev->ctx[0].unmask);
}
 
return 0;
-- 
2.3.1



[PATCH v14 12/20] vfio/platform: trigger an interrupt via eventfd

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

This patch allows setting an eventfd for a platform device's interrupt,
and also triggering the interrupt eventfd from userspace for testing.
Level-sensitive interrupts are marked as maskable and are handled in
a later patch. Edge-triggered interrupts are not advertised as maskable
and are implemented here using a simple and efficient IRQ handler.
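
The expected use from userspace is roughly the following sketch
(hypothetical helper name, no error handling): bind IRQ index 'index' to an
eventfd, which a read() or poll() loop can then wait on.

#include <string.h>
#include <stdint.h>
#include <sys/eventfd.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static int attach_irq_eventfd(int device_fd, unsigned int index)
{
	char buf[sizeof(struct vfio_irq_set) + sizeof(int32_t)];
	struct vfio_irq_set *set = (struct vfio_irq_set *)buf;
	int32_t efd = eventfd(0, EFD_CLOEXEC);

	memset(buf, 0, sizeof(buf));
	set->argsz = sizeof(buf);
	set->flags = VFIO_IRQ_SET_DATA_EVENTFD | VFIO_IRQ_SET_ACTION_TRIGGER;
	set->index = index;
	set->start = 0;
	set->count = 1;
	memcpy(&set->data, &efd, sizeof(efd));

	/* the driver request_irq()s the line and signals efd when it fires */
	ioctl(device_fd, VFIO_DEVICE_SET_IRQS, set);

	return efd;
}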

Signed-off-by: Antonios Motakis 
[Baptiste Reynal: fix masked interrupt initialization]
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_irq.c | 94 ++-
 drivers/vfio/platform/vfio_platform_private.h |  2 +
 2 files changed, 94 insertions(+), 2 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_irq.c 
b/drivers/vfio/platform/vfio_platform_irq.c
index df5c919..611ec80 100644
--- a/drivers/vfio/platform/vfio_platform_irq.c
+++ b/drivers/vfio/platform/vfio_platform_irq.c
@@ -39,12 +39,92 @@ static int vfio_platform_set_irq_unmask(struct 
vfio_platform_device *vdev,
return -EINVAL;
 }
 
+static irqreturn_t vfio_irq_handler(int irq, void *dev_id)
+{
+   struct vfio_platform_irq *irq_ctx = dev_id;
+
+   eventfd_signal(irq_ctx->trigger, 1);
+
+   return IRQ_HANDLED;
+}
+
+static int vfio_set_trigger(struct vfio_platform_device *vdev, int index,
+   int fd, irq_handler_t handler)
+{
+   struct vfio_platform_irq *irq = &vdev->irqs[index];
+   struct eventfd_ctx *trigger;
+   int ret;
+
+   if (irq->trigger) {
+   free_irq(irq->hwirq, irq);
+   kfree(irq->name);
+   eventfd_ctx_put(irq->trigger);
+   irq->trigger = NULL;
+   }
+
+   if (fd < 0) /* Disable only */
+   return 0;
+
+   irq->name = kasprintf(GFP_KERNEL, "vfio-irq[%d](%s)",
+   irq->hwirq, vdev->name);
+   if (!irq->name)
+   return -ENOMEM;
+
+   trigger = eventfd_ctx_fdget(fd);
+   if (IS_ERR(trigger)) {
+   kfree(irq->name);
+   return PTR_ERR(trigger);
+   }
+
+   irq->trigger = trigger;
+
+   ret = request_irq(irq->hwirq, handler, 0, irq->name, irq);
+   if (ret) {
+   kfree(irq->name);
+   eventfd_ctx_put(trigger);
+   irq->trigger = NULL;
+   return ret;
+   }
+
+   return 0;
+}
+
 static int vfio_platform_set_irq_trigger(struct vfio_platform_device *vdev,
 unsigned index, unsigned start,
 unsigned count, uint32_t flags,
 void *data)
 {
-   return -EINVAL;
+   struct vfio_platform_irq *irq = &vdev->irqs[index];
+   irq_handler_t handler;
+
+   if (vdev->irqs[index].flags & VFIO_IRQ_INFO_AUTOMASKED)
+   return -EINVAL; /* not implemented */
+   else
+   handler = vfio_irq_handler;
+
+   if (!count && (flags & VFIO_IRQ_SET_DATA_NONE))
+   return vfio_set_trigger(vdev, index, -1, handler);
+
+   if (start != 0 || count != 1)
+   return -EINVAL;
+
+   if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
+   int32_t fd = *(int32_t *)data;
+
+   return vfio_set_trigger(vdev, index, fd, handler);
+   }
+
+   if (flags & VFIO_IRQ_SET_DATA_NONE) {
+   handler(irq->hwirq, irq);
+
+   } else if (flags & VFIO_IRQ_SET_DATA_BOOL) {
+   uint8_t trigger = *(uint8_t *)data;
+
+   if (trigger)
+   handler(irq->hwirq, irq);
+   }
+
+   return 0;
 }
 
 int vfio_platform_set_irqs_ioctl(struct vfio_platform_device *vdev,
@@ -90,7 +170,12 @@ int vfio_platform_irq_init(struct vfio_platform_device 
*vdev)
if (hwirq < 0)
goto err;
 
-   vdev->irqs[i].flags = 0;
+   vdev->irqs[i].flags = VFIO_IRQ_INFO_EVENTFD;
+
+   if (irq_get_trigger_type(hwirq) & IRQ_TYPE_LEVEL_MASK)
+   vdev->irqs[i].flags |= VFIO_IRQ_INFO_MASKABLE
+   | VFIO_IRQ_INFO_AUTOMASKED;
+
vdev->irqs[i].count = 1;
vdev->irqs[i].hwirq = hwirq;
}
@@ -105,6 +190,11 @@ err:
 
 void vfio_platform_irq_cleanup(struct vfio_platform_device *vdev)
 {
+   int i;
+
+   for (i = 0; i < vdev->num_irqs; i++)
+   vfio_set_trigger(vdev, i, -1, NULL);
+
vdev->num_irqs = 0;
kfree(vdev->irqs);
 }
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
index b119a6c..aa01cc3 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -31,6 +31,8 @@ struct vfio_platform_irq {
u32 flags;
u32 count;
int hwirq;
+   char

[PATCH v14 17/20] vfio: pass an opaque pointer on virqfd initialization

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

VFIO_PCI passes the VFIO device structure *vdev to the virqfd handlers
that implement masking/unmasking of IRQs via an eventfd. We can replace
it in the virqfd infrastructure with an opaque pointer, so that the
mechanism can be used from other VFIO bus drivers.
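
A hypothetical consumer in another bus driver could then look like the
sketch below ('foo_device' and its helpers are made-up names, and the
virqfd declarations are assumed to be reachable from a shared header, as
done later in this series):

struct foo_device {
	struct virqfd *unmask;
	bool masked;
};

/* called from the eventfd wakeup path; opaque is whatever we registered */
static int foo_unmask_handler(void *opaque, void *unused)
{
	struct foo_device *fdev = opaque;

	fdev->masked = false;
	return 0;	/* no deferred work needed */
}

static int foo_set_unmask_eventfd(struct foo_device *fdev, int fd)
{
	/* an opaque pointer is handed in instead of a vfio_pci_device */
	return vfio_virqfd_enable(fdev, foo_unmask_handler, NULL, NULL,
				  &fdev->unmask, fd);
}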

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/pci/vfio_pci_intrs.c | 30 --
 1 file changed, 16 insertions(+), 14 deletions(-)

diff --git a/drivers/vfio/pci/vfio_pci_intrs.c 
b/drivers/vfio/pci/vfio_pci_intrs.c
index 7d3c135..1a16da3 100644
--- a/drivers/vfio/pci/vfio_pci_intrs.c
+++ b/drivers/vfio/pci/vfio_pci_intrs.c
@@ -31,10 +31,10 @@
  * IRQfd - generic
  */
 struct virqfd {
-   struct vfio_pci_device  *vdev;
+	void			*opaque;
	struct eventfd_ctx	*eventfd;
-	int			(*handler)(struct vfio_pci_device *, void *);
-	void			(*thread)(struct vfio_pci_device *, void *);
+	int			(*handler)(void *, void *);
+	void			(*thread)(void *, void *);
	void			*data;
	struct work_struct	inject;
	wait_queue_t		wait;
@@ -74,7 +74,7 @@ static int virqfd_wakeup(wait_queue_t *wait, unsigned mode, 
int sync, void *key)
if (flags & POLLIN) {
/* An event has been signaled, call function */
if ((!virqfd->handler ||
-virqfd->handler(virqfd->vdev, virqfd->data)) &&
+virqfd->handler(virqfd->opaque, virqfd->data)) &&
virqfd->thread)
schedule_work(&virqfd->inject);
}
@@ -124,12 +124,12 @@ static void virqfd_inject(struct work_struct *work)
 {
struct virqfd *virqfd = container_of(work, struct virqfd, inject);
if (virqfd->thread)
-   virqfd->thread(virqfd->vdev, virqfd->data);
+   virqfd->thread(virqfd->opaque, virqfd->data);
 }
 
-int vfio_virqfd_enable(struct vfio_pci_device *vdev,
-  int (*handler)(struct vfio_pci_device *, void *),
-  void (*thread)(struct vfio_pci_device *, void *),
+int vfio_virqfd_enable(void *opaque,
+  int (*handler)(void *, void *),
+  void (*thread)(void *, void *),
   void *data, struct virqfd **pvirqfd, int fd)
 {
struct fd irqfd;
@@ -143,7 +143,7 @@ int vfio_virqfd_enable(struct vfio_pci_device *vdev,
return -ENOMEM;
 
virqfd->pvirqfd = pvirqfd;
-   virqfd->vdev = vdev;
+   virqfd->opaque = opaque;
virqfd->handler = handler;
virqfd->thread = thread;
virqfd->data = data;
@@ -196,7 +196,7 @@ int vfio_virqfd_enable(struct vfio_pci_device *vdev,
 * before we registered and trigger it as if we didn't miss it.
 */
if (events & POLLIN) {
-   if ((!handler || handler(vdev, data)) && thread)
+   if ((!handler || handler(opaque, data)) && thread)
schedule_work(&virqfd->inject);
}
 
@@ -243,8 +243,10 @@ EXPORT_SYMBOL_GPL(vfio_virqfd_disable);
 /*
  * INTx
  */
-static void vfio_send_intx_eventfd(struct vfio_pci_device *vdev, void *unused)
+static void vfio_send_intx_eventfd(void *opaque, void *unused)
 {
+   struct vfio_pci_device *vdev = opaque;
+
if (likely(is_intx(vdev) && !vdev->virq_disabled))
eventfd_signal(vdev->ctx[0].trigger, 1);
 }
@@ -287,9 +289,9 @@ void vfio_pci_intx_mask(struct vfio_pci_device *vdev)
  * a signal is necessary, which can then be handled via a work queue
  * or directly depending on the caller.
  */
-static int vfio_pci_intx_unmask_handler(struct vfio_pci_device *vdev,
-   void *unused)
+static int vfio_pci_intx_unmask_handler(void *opaque, void *unused)
 {
+   struct vfio_pci_device *vdev = opaque;
struct pci_dev *pdev = vdev->pdev;
unsigned long flags;
int ret = 0;
@@ -641,7 +643,7 @@ static int vfio_pci_set_intx_unmask(struct vfio_pci_device 
*vdev,
} else if (flags & VFIO_IRQ_SET_DATA_EVENTFD) {
int32_t fd = *(int32_t *)data;
if (fd >= 0)
-   return vfio_virqfd_enable(vdev,
+   return vfio_virqfd_enable((void *) vdev,
  vfio_pci_intx_unmask_handler,
  vfio_send_intx_eventfd, NULL,
  &vdev->ctx[0].unmask, fd);
-- 
2.3.1



[PATCH v14 07/20] vfio/platform: return info for device memory mapped IO regions

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

This patch enables the VFIO_DEVICE_GET_REGION_INFO ioctl, which allows
the user to learn about the available MMIO resources of a device.
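
For illustration, userspace can walk the regions like this (hypothetical
helper name, num_regions taken from VFIO_DEVICE_GET_INFO, error handling
omitted):

#include <stdio.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

static void dump_regions(int device_fd, unsigned int num_regions)
{
	unsigned int i;

	for (i = 0; i < num_regions; i++) {
		struct vfio_region_info reg = {
			.argsz = sizeof(reg),
			.index = i,
		};

		ioctl(device_fd, VFIO_DEVICE_GET_REGION_INFO, &reg);

		/* offset encodes the region index; size/flags describe it */
		printf("region %u: offset 0x%llx size 0x%llx flags 0x%x\n",
		       i, (unsigned long long)reg.offset,
		       (unsigned long long)reg.size, reg.flags);
	}
}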

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_common.c  | 106 +-
 drivers/vfio/platform/vfio_platform_private.h |  22 ++
 2 files changed, 124 insertions(+), 4 deletions(-)

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
index c2f853a..47f6309 100644
--- a/drivers/vfio/platform/vfio_platform_common.c
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -23,17 +23,97 @@
 
 #include "vfio_platform_private.h"
 
+static DEFINE_MUTEX(driver_lock);
+
+static int vfio_platform_regions_init(struct vfio_platform_device *vdev)
+{
+   int cnt = 0, i;
+
+   while (vdev->get_resource(vdev, cnt))
+   cnt++;
+
+   vdev->regions = kcalloc(cnt, sizeof(struct vfio_platform_region),
+   GFP_KERNEL);
+   if (!vdev->regions)
+   return -ENOMEM;
+
+   for (i = 0; i < cnt;  i++) {
+   struct resource *res =
+   vdev->get_resource(vdev, i);
+
+   if (!res)
+   goto err;
+
+   vdev->regions[i].addr = res->start;
+   vdev->regions[i].size = resource_size(res);
+   vdev->regions[i].flags = 0;
+
+   switch (resource_type(res)) {
+   case IORESOURCE_MEM:
+   vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_MMIO;
+   break;
+   case IORESOURCE_IO:
+   vdev->regions[i].type = VFIO_PLATFORM_REGION_TYPE_PIO;
+   break;
+   default:
+   goto err;
+   }
+   }
+
+   vdev->num_regions = cnt;
+
+   return 0;
+err:
+   kfree(vdev->regions);
+   return -EINVAL;
+}
+
+static void vfio_platform_regions_cleanup(struct vfio_platform_device *vdev)
+{
+   vdev->num_regions = 0;
+   kfree(vdev->regions);
+}
+
 static void vfio_platform_release(void *device_data)
 {
+   struct vfio_platform_device *vdev = device_data;
+
+   mutex_lock(&driver_lock);
+
+   if (!(--vdev->refcnt)) {
+   vfio_platform_regions_cleanup(vdev);
+   }
+
+   mutex_unlock(&driver_lock);
+
module_put(THIS_MODULE);
 }
 
 static int vfio_platform_open(void *device_data)
 {
+   struct vfio_platform_device *vdev = device_data;
+   int ret;
+
if (!try_module_get(THIS_MODULE))
return -ENODEV;
 
+   mutex_lock(&driver_lock);
+
+   if (!vdev->refcnt) {
+   ret = vfio_platform_regions_init(vdev);
+   if (ret)
+   goto err_reg;
+   }
+
+   vdev->refcnt++;
+
+   mutex_unlock(&driver_lock);
return 0;
+
+err_reg:
+   mutex_unlock(&driver_lock);
+   module_put(THIS_MODULE);
+   return ret;
 }
 
 static long vfio_platform_ioctl(void *device_data,
@@ -54,15 +134,33 @@ static long vfio_platform_ioctl(void *device_data,
return -EINVAL;
 
info.flags = vdev->flags;
-   info.num_regions = 0;
+   info.num_regions = vdev->num_regions;
info.num_irqs = 0;
 
return copy_to_user((void __user *)arg, &info, minsz);
 
-   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
-   return -EINVAL;
+   } else if (cmd == VFIO_DEVICE_GET_REGION_INFO) {
+   struct vfio_region_info info;
+
+   minsz = offsetofend(struct vfio_region_info, offset);
+
+   if (copy_from_user(&info, (void __user *)arg, minsz))
+   return -EFAULT;
+
+   if (info.argsz < minsz)
+   return -EINVAL;
+
+   if (info.index >= vdev->num_regions)
+   return -EINVAL;
+
+   /* map offset to the physical address  */
+   info.offset = VFIO_PLATFORM_INDEX_TO_OFFSET(info.index);
+   info.size = vdev->regions[info.index].size;
+   info.flags = vdev->regions[info.index].flags;
+
+   return copy_to_user((void __user *)arg, &info, minsz);
 
-   else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
+   } else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
return -EINVAL;
 
else if (cmd == VFIO_DEVICE_SET_IRQS)
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
index c046988..3551f6d 100644
--- a/drivers/vfio/platform/vfio_platform_private.h
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -18,7 +18,29 @@
 #include 
 #include 
 
+#define VFIO_PLATFORM_OFFSET_SHIFT   40
+#define VFIO_PLATFORM_OFFSET_MASK (((u64)(1) << VFIO_PLATFORM_OFFSET_SHIFT) - 
1)
+
+#define VFIO_PLATFORM_OFFSET_

[PATCH v14 19/20] vfio: initialize the virqfd workqueue in VFIO generic code

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Now we have finally completely decoupled virqfd from VFIO_PCI. We can
initialize it from the VFIO generic code, in order to safely use it from
multiple independent VFIO bus drivers.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/Makefile   | 4 +++-
 drivers/vfio/pci/Makefile   | 3 +--
 drivers/vfio/pci/vfio_pci.c | 8 
 drivers/vfio/vfio.c | 8 
 4 files changed, 12 insertions(+), 11 deletions(-)

diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index dadf0ca..d798b09 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -1,4 +1,6 @@
-obj-$(CONFIG_VFIO) += vfio.o
+vfio_core-y := vfio.o virqfd.o
+
+obj-$(CONFIG_VFIO) += vfio_core.o
 obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
diff --git a/drivers/vfio/pci/Makefile b/drivers/vfio/pci/Makefile
index c7c8644..1310792 100644
--- a/drivers/vfio/pci/Makefile
+++ b/drivers/vfio/pci/Makefile
@@ -1,5 +1,4 @@
 
-vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o \
- ../virqfd.o
+vfio-pci-y := vfio_pci.o vfio_pci_intrs.o vfio_pci_rdwr.o vfio_pci_config.o
 
 obj-$(CONFIG_VFIO_PCI) += vfio-pci.o
diff --git a/drivers/vfio/pci/vfio_pci.c b/drivers/vfio/pci/vfio_pci.c
index 668d37c..2f865d07 100644
--- a/drivers/vfio/pci/vfio_pci.c
+++ b/drivers/vfio/pci/vfio_pci.c
@@ -1030,7 +1030,6 @@ put_devs:
 static void __exit vfio_pci_cleanup(void)
 {
pci_unregister_driver(&vfio_pci_driver);
-   vfio_virqfd_exit();
vfio_pci_uninit_perm_bits();
 }
 
@@ -1043,11 +1042,6 @@ static int __init vfio_pci_init(void)
if (ret)
return ret;
 
-   /* Start the virqfd cleanup handler */
-   ret = vfio_virqfd_init();
-   if (ret)
-   goto out_virqfd;
-
/* Register and scan for devices */
ret = pci_register_driver(&vfio_pci_driver);
if (ret)
@@ -1056,8 +1050,6 @@ static int __init vfio_pci_init(void)
return 0;
 
 out_driver:
-   vfio_virqfd_exit();
-out_virqfd:
vfio_pci_uninit_perm_bits();
return ret;
 }
diff --git a/drivers/vfio/vfio.c b/drivers/vfio/vfio.c
index 4cde855..23ba12a 100644
--- a/drivers/vfio/vfio.c
+++ b/drivers/vfio/vfio.c
@@ -1553,6 +1553,11 @@ static int __init vfio_init(void)
if (ret)
goto err_cdev_add;
 
+   /* Start the virqfd cleanup handler used by some VFIO bus drivers */
+   ret = vfio_virqfd_init();
+   if (ret)
+   goto err_virqfd;
+
pr_info(DRIVER_DESC " version: " DRIVER_VERSION "\n");
 
/*
@@ -1565,6 +1570,8 @@ static int __init vfio_init(void)
 
return 0;
 
+err_virqfd:
+   cdev_del(&vfio.group_cdev);
 err_cdev_add:
unregister_chrdev_region(vfio.group_devt, MINORMASK);
 err_alloc_chrdev:
@@ -1579,6 +1586,7 @@ static void __exit vfio_cleanup(void)
 {
WARN_ON(!list_empty(&vfio.group_list));
 
+   vfio_virqfd_exit();
idr_destroy(&vfio.group_idr);
cdev_del(&vfio.group_cdev);
unregister_chrdev_region(vfio.group_devt, MINORMASK);
-- 
2.3.1



[PATCH v4 1/5] vfio: implement iommu driver capabilities with an enum

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Currently a VFIO driver's IOMMU capabilities are encoded as a series of
numerical defines. Replace this with an enum for future maintainability.
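
The numerical values are user ABI (they are probed at runtime through
VFIO_CHECK_EXTENSION), so the enum only groups them without changing them.
A minimal userspace sketch of such a probe, assuming an already opened
container fd:

#include <sys/ioctl.h>
#include <linux/vfio.h>

static int pick_iommu_type(int container)
{
	/* VFIO_CHECK_EXTENSION returns > 0 when the capability is present */
	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1v2_IOMMU))
		return VFIO_TYPE1v2_IOMMU;
	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_TYPE1_IOMMU))
		return VFIO_TYPE1_IOMMU;

	return -1;	/* neither type 1 backend is available */
}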

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 include/uapi/linux/vfio.h | 24 +++-
 1 file changed, 11 insertions(+), 13 deletions(-)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 82889c3..5fb3d46 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -19,22 +19,20 @@
 
 /* Kernel & User level defines for VFIO IOCTLs. */
 
-/* Extensions */
-
-#define VFIO_TYPE1_IOMMU   1
-#define VFIO_SPAPR_TCE_IOMMU   2
-#define VFIO_TYPE1v2_IOMMU 3
 /*
- * IOMMU enforces DMA cache coherence (ex. PCIe NoSnoop stripping).  This
- * capability is subject to change as groups are added or removed.
+ * Capabilities exposed by the VFIO IOMMU driver. Some capabilities are subject
+ * to change as groups are added or removed.
  */
-#define VFIO_DMA_CC_IOMMU  4
-
-/* Check if EEH is supported */
-#define VFIO_EEH   5
+enum vfio_iommu_cap {
+   VFIO_TYPE1_IOMMU = 1,
+   VFIO_SPAPR_TCE_IOMMU = 2,
+   VFIO_TYPE1v2_IOMMU = 3,
+   VFIO_DMA_CC_IOMMU = 4,  /* IOMMU enforces DMA cache coherence
+  (ex. PCIe NoSnoop stripping) */
+   VFIO_EEH = 5,   /* Check if EEH is supported */
+   VFIO_TYPE1_NESTING_IOMMU = 6,   /* Two-stage IOMMU, implies v2  */
+};
 
-/* Two-stage IOMMU */
-#define VFIO_TYPE1_NESTING_IOMMU   6   /* Implies v2 */
 
 /*
  * The IOCTL interface is designed for extensibility by embedding the
-- 
2.3.1



[PATCH v4 3/5] vfio: type1: replace domain wide protection flags with supported capabilities

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

VFIO_IOMMU_TYPE1 keeps, for each domain it knows about, a set of protection
flags that it always applies to all mappings in the domain. This is used for
domains that support IOMMU_CAP_CACHE_COHERENCY.

Refactor this slightly, by keeping track instead that a given domain
supports the capability, and applying the IOMMU_CACHE protection flag when
doing the actual DMA mappings.

This will allow us to reuse the behavior for IOMMU_CAP_NOEXEC, which we
also want to keep track of, but without applying it to all domains that
support it unless the user explicitly requests it.

Signed-off-by: Antonios Motakis 
[Baptiste Reynal: Use bit shifting for domain->caps]
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/vfio_iommu_type1.c | 31 ++-
 1 file changed, 22 insertions(+), 9 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 57d8c37..998619b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -65,7 +65,7 @@ struct vfio_domain {
struct iommu_domain *domain;
	struct list_head	next;
	struct list_head	group_list;
-	int			prot;	/* IOMMU_CACHE */
+	int			caps;
	bool			fgsp;	/* Fine-grained super pages */
 };
 
@@ -507,7 +507,7 @@ static int map_try_harder(struct vfio_domain *domain, 
dma_addr_t iova,
for (i = 0; i < npage; i++, pfn++, iova += PAGE_SIZE) {
ret = iommu_map(domain->domain, iova,
(phys_addr_t)pfn << PAGE_SHIFT,
-   PAGE_SIZE, prot | domain->prot);
+   PAGE_SIZE, prot);
if (ret)
break;
}
@@ -525,11 +525,16 @@ static int vfio_iommu_map(struct vfio_iommu *iommu, 
dma_addr_t iova,
int ret;
 
list_for_each_entry(d, &iommu->domain_list, next) {
+   int dprot = prot;
+
+   if (d->caps & (1 << IOMMU_CAP_CACHE_COHERENCY))
+   dprot |= IOMMU_CACHE;
+
ret = iommu_map(d->domain, iova, (phys_addr_t)pfn << PAGE_SHIFT,
-   npage << PAGE_SHIFT, prot | d->prot);
+   npage << PAGE_SHIFT, dprot);
if (ret) {
if (ret != -EBUSY ||
-   map_try_harder(d, iova, pfn, npage, prot))
+   map_try_harder(d, iova, pfn, npage, dprot))
goto unwind;
}
 
@@ -644,6 +649,10 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
struct vfio_domain *d;
struct rb_node *n;
int ret;
+   int dprot = 0;
+
+   if (domain->caps & (1 << IOMMU_CAP_CACHE_COHERENCY))
+   dprot |= IOMMU_CACHE;
 
/* Arbitrarily pick the first domain in the list for lookups */
d = list_first_entry(&iommu->domain_list, struct vfio_domain, next);
@@ -677,7 +686,7 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
size += PAGE_SIZE;
 
ret = iommu_map(domain->domain, iova, phys,
-   size, dma->prot | domain->prot);
+   size, dma->prot | dprot);
if (ret)
return ret;
 
@@ -702,13 +711,17 @@ static void vfio_test_domain_fgsp(struct vfio_domain 
*domain)
 {
struct page *pages;
int ret, order = get_order(PAGE_SIZE * 2);
+   int dprot = 0;
+
+   if (domain->caps & (1 << IOMMU_CAP_CACHE_COHERENCY))
+   dprot |= IOMMU_CACHE;
 
pages = alloc_pages(GFP_KERNEL | __GFP_ZERO, order);
if (!pages)
return;
 
ret = iommu_map(domain->domain, 0, page_to_phys(pages), PAGE_SIZE * 2,
-   IOMMU_READ | IOMMU_WRITE | domain->prot);
+   IOMMU_READ | IOMMU_WRITE | dprot);
if (!ret) {
size_t unmapped = iommu_unmap(domain->domain, 0, PAGE_SIZE);
 
@@ -787,7 +800,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
}
 
if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
-   domain->prot |= IOMMU_CACHE;
+   domain->caps |= (1 << IOMMU_CAP_CACHE_COHERENCY);
 
/*
 * Try to match an existing compatible domain.  We don't want to
@@ -798,7 +811,7 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
 */
list_for_each_entry(d, &iommu->domain_list, next) {
if (d->domain->ops == domain->domain->ops &&
-   d->prot == domain->prot) {
+   d->caps == domain->caps) {
iommu_detach_group(domain->domain, iommu_group);
if (!iommu_attach_group(d->domain, iommu_group)) {
 

[PATCH v4 2/5] vfio: introduce the VFIO_DMA_MAP_FLAG_NOEXEC flag

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

We introduce the VFIO_DMA_MAP_FLAG_NOEXEC flag to the VFIO dma map call,
and expose its availability via the capability VFIO_DMA_NOEXEC_IOMMU.
This way the user can control whether the XN flag will be set on the
requested mappings. The IOMMU_NOEXEC flag needs to be available for all
the IOMMUs of the container used.
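
A userspace sketch of the intended use, against the uapi introduced by this
series (hypothetical helper name, error handling omitted):

#include <stdint.h>
#include <sys/ioctl.h>
#include <linux/vfio.h>

/* Map 'size' bytes at 'vaddr' to IOVA 'iova', execute-never if available. */
static int dma_map_noexec(int container, void *vaddr, uint64_t iova,
			  uint64_t size)
{
	struct vfio_iommu_type1_dma_map map = {
		.argsz = sizeof(map),
		.flags = VFIO_DMA_MAP_FLAG_READ | VFIO_DMA_MAP_FLAG_WRITE,
		.vaddr = (uintptr_t)vaddr,
		.iova  = iova,
		.size  = size,
	};

	/* only request XN when every IOMMU in the container supports it */
	if (ioctl(container, VFIO_CHECK_EXTENSION, VFIO_DMA_NOEXEC_IOMMU) > 0)
		map.flags |= VFIO_DMA_MAP_FLAG_NOEXEC;

	return ioctl(container, VFIO_IOMMU_MAP_DMA, &map);
}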

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 include/uapi/linux/vfio.h | 6 ++
 1 file changed, 6 insertions(+)

diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index 5fb3d46..30801a7 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -31,6 +31,7 @@ enum vfio_iommu_cap {
   (ex. PCIe NoSnoop stripping) */
VFIO_EEH = 5,   /* Check if EEH is supported */
VFIO_TYPE1_NESTING_IOMMU = 6,   /* Two-stage IOMMU, implies v2  */
+   VFIO_DMA_NOEXEC_IOMMU = 7,
 };
 
 
@@ -397,12 +398,17 @@ struct vfio_iommu_type1_info {
  *
  * Map process virtual addresses to IO virtual addresses using the
  * provided struct vfio_dma_map. Caller sets argsz. READ &/ WRITE required.
+ *
+ * To use the VFIO_DMA_MAP_FLAG_NOEXEC flag, the container must support the
+ * VFIO_DMA_NOEXEC_IOMMU capability. If mappings are created using this flag,
+ * any groups subsequently added to the container must support this capability.
  */
 struct vfio_iommu_type1_dma_map {
__u32   argsz;
__u32   flags;
 #define VFIO_DMA_MAP_FLAG_READ (1 << 0)/* readable from device 
*/
 #define VFIO_DMA_MAP_FLAG_WRITE (1 << 1)   /* writable from device */
+#define VFIO_DMA_MAP_FLAG_NOEXEC (1 << 2)  /* not executable from device */
__u64   vaddr;  /* Process virtual address */
__u64   iova;   /* IO virtual address */
__u64   size;   /* Size of mapping (bytes) */
-- 
2.3.1



[PATCH v14 03/20] vfio: platform: add the VFIO PLATFORM module to Kconfig

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Enable building the VFIO PLATFORM driver, which allows Linux platform
devices to be used with VFIO.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/Kconfig   | 1 +
 drivers/vfio/Makefile  | 1 +
 drivers/vfio/platform/Kconfig  | 9 +
 drivers/vfio/platform/Makefile | 4 
 4 files changed, 15 insertions(+)
 create mode 100644 drivers/vfio/platform/Kconfig
 create mode 100644 drivers/vfio/platform/Makefile

diff --git a/drivers/vfio/Kconfig b/drivers/vfio/Kconfig
index 14e27ab..d5322a4 100644
--- a/drivers/vfio/Kconfig
+++ b/drivers/vfio/Kconfig
@@ -27,3 +27,4 @@ menuconfig VFIO
  If you don't know what to do here, say N.
 
 source "drivers/vfio/pci/Kconfig"
+source "drivers/vfio/platform/Kconfig"
diff --git a/drivers/vfio/Makefile b/drivers/vfio/Makefile
index 0b035b1..dadf0ca 100644
--- a/drivers/vfio/Makefile
+++ b/drivers/vfio/Makefile
@@ -3,3 +3,4 @@ obj-$(CONFIG_VFIO_IOMMU_TYPE1) += vfio_iommu_type1.o
 obj-$(CONFIG_VFIO_IOMMU_SPAPR_TCE) += vfio_iommu_spapr_tce.o
 obj-$(CONFIG_VFIO_SPAPR_EEH) += vfio_spapr_eeh.o
 obj-$(CONFIG_VFIO_PCI) += pci/
+obj-$(CONFIG_VFIO_PLATFORM) += platform/
diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
new file mode 100644
index 000..c51af17
--- /dev/null
+++ b/drivers/vfio/platform/Kconfig
@@ -0,0 +1,9 @@
+config VFIO_PLATFORM
+   tristate "VFIO support for platform devices"
+   depends on VFIO && EVENTFD && ARM
+   help
+ Support for platform devices with VFIO. This is required to make
+ use of platform devices present on the system using the VFIO
+ framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
new file mode 100644
index 000..279862b
--- /dev/null
+++ b/drivers/vfio/platform/Makefile
@@ -0,0 +1,4 @@
+
+vfio-platform-y := vfio_platform.o vfio_platform_common.o
+
+obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
-- 
2.3.1



[PATCH v14 05/20] vfio: amba: add the VFIO for AMBA devices module to Kconfig

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Enable building the VFIO AMBA driver. VFIO_AMBA depends on VFIO_PLATFORM,
since it shares a portion of the code and is essentially implemented as a
platform device whose resources are discovered via AMBA-specific APIs in
the kernel.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/Kconfig  | 10 ++
 drivers/vfio/platform/Makefile |  4 
 2 files changed, 14 insertions(+)

diff --git a/drivers/vfio/platform/Kconfig b/drivers/vfio/platform/Kconfig
index c51af17..c0a3bff 100644
--- a/drivers/vfio/platform/Kconfig
+++ b/drivers/vfio/platform/Kconfig
@@ -7,3 +7,13 @@ config VFIO_PLATFORM
  framework.
 
  If you don't know what to do here, say N.
+
+config VFIO_AMBA
+   tristate "VFIO support for AMBA devices"
+   depends on VFIO_PLATFORM && ARM_AMBA
+   help
+ Support for ARM AMBA devices with VFIO. This is required to make
+ use of ARM AMBA devices present on the system using the VFIO
+ framework.
+
+ If you don't know what to do here, say N.
diff --git a/drivers/vfio/platform/Makefile b/drivers/vfio/platform/Makefile
index 279862b..1957170 100644
--- a/drivers/vfio/platform/Makefile
+++ b/drivers/vfio/platform/Makefile
@@ -2,3 +2,7 @@
 vfio-platform-y := vfio_platform.o vfio_platform_common.o
 
 obj-$(CONFIG_VFIO_PLATFORM) += vfio-platform.o
+
+vfio-amba-y := vfio_amba.o
+
+obj-$(CONFIG_VFIO_AMBA) += vfio-amba.o
-- 
2.3.1



[PATCH v14 04/20] vfio: amba: VFIO support for AMBA devices

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Add support for discovering AMBA devices with VFIO and handle them
similarly to Linux platform devices.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_amba.c | 115 ++
 include/uapi/linux/vfio.h |   1 +
 2 files changed, 116 insertions(+)
 create mode 100644 drivers/vfio/platform/vfio_amba.c

diff --git a/drivers/vfio/platform/vfio_amba.c 
b/drivers/vfio/platform/vfio_amba.c
new file mode 100644
index 000..ff0331f
--- /dev/null
+++ b/drivers/vfio/platform/vfio_amba.c
@@ -0,0 +1,115 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+#define DRIVER_VERSION  "0.10"
+#define DRIVER_AUTHOR   "Antonios Motakis "
+#define DRIVER_DESC "VFIO for AMBA devices - User Level meta-driver"
+
+/* probing devices from the AMBA bus */
+
+static struct resource *get_amba_resource(struct vfio_platform_device *vdev,
+ int i)
+{
+   struct amba_device *adev = (struct amba_device *) vdev->opaque;
+
+   if (i == 0)
+   return &adev->res;
+
+   return NULL;
+}
+
+static int get_amba_irq(struct vfio_platform_device *vdev, int i)
+{
+   struct amba_device *adev = (struct amba_device *) vdev->opaque;
+   int ret = 0;
+
+   if (i < AMBA_NR_IRQS)
+   ret = adev->irq[i];
+
+   /* zero is an unset IRQ for AMBA devices */
+   return ret ? ret : -ENXIO;
+}
+
+static int vfio_amba_probe(struct amba_device *adev, const struct amba_id *id)
+{
+   struct vfio_platform_device *vdev;
+   int ret;
+
+   vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
+   if (!vdev)
+   return -ENOMEM;
+
+   vdev->name = kasprintf(GFP_KERNEL, "vfio-amba-%08x", adev->periphid);
+   if (!vdev->name) {
+   kfree(vdev);
+   return -ENOMEM;
+   }
+
+   vdev->opaque = (void *) adev;
+   vdev->flags = VFIO_DEVICE_FLAGS_AMBA;
+   vdev->get_resource = get_amba_resource;
+   vdev->get_irq = get_amba_irq;
+
+   ret = vfio_platform_probe_common(vdev, &adev->dev);
+   if (ret) {
+   kfree(vdev->name);
+   kfree(vdev);
+   }
+
+   return ret;
+}
+
+static int vfio_amba_remove(struct amba_device *adev)
+{
+   struct vfio_platform_device *vdev;
+
+   vdev = vfio_platform_remove_common(&adev->dev);
+   if (vdev) {
+   kfree(vdev->name);
+   kfree(vdev);
+   return 0;
+   }
+
+   return -EINVAL;
+}
+
+static struct amba_id pl330_ids[] = {
+   { 0, 0 },
+};
+
+MODULE_DEVICE_TABLE(amba, pl330_ids);
+
+static struct amba_driver vfio_amba_driver = {
+   .probe = vfio_amba_probe,
+   .remove = vfio_amba_remove,
+   .id_table = pl330_ids,
+   .drv = {
+   .name = "vfio-amba",
+   .owner = THIS_MODULE,
+   },
+};
+
+module_amba_driver(vfio_amba_driver);
+
+MODULE_VERSION(DRIVER_VERSION);
+MODULE_LICENSE("GPL v2");
+MODULE_AUTHOR(DRIVER_AUTHOR);
+MODULE_DESCRIPTION(DRIVER_DESC);
diff --git a/include/uapi/linux/vfio.h b/include/uapi/linux/vfio.h
index e33b04b..da07c1a 100644
--- a/include/uapi/linux/vfio.h
+++ b/include/uapi/linux/vfio.h
@@ -160,6 +160,7 @@ struct vfio_device_info {
 #define VFIO_DEVICE_FLAGS_RESET(1 << 0)/* Device supports 
reset */
 #define VFIO_DEVICE_FLAGS_PCI  (1 << 1)/* vfio-pci device */
 #define VFIO_DEVICE_FLAGS_PLATFORM (1 << 2)/* vfio-platform device */
+#define VFIO_DEVICE_FLAGS_AMBA  (1 << 3)   /* vfio-amba device */
__u32   num_regions;/* Max region index + 1 */
__u32   num_irqs;   /* Max IRQ index + 1 */
 };
-- 
2.3.1



[PATCH v4 5/5] vfio: type1: implement the VFIO_DMA_MAP_FLAG_NOEXEC flag

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Some IOMMU drivers, such as the ARM SMMU driver, make available the
IOMMU_NOEXEC flag to set the page tables for a device as XN (execute never).
This affects devices such as the ARM PL330 DMA Controller, which respects
this flag and will refuse to fetch DMA instructions from memory where the
XN flag has been set.

The flag can be used only if all IOMMU domains behind the container support
the IOMMU_NOEXEC flag. Also, if any mappings are created with the flag, any
new domains with devices will have to support it as well.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/vfio_iommu_type1.c | 25 -
 1 file changed, 24 insertions(+), 1 deletion(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 0ea371b..2bbd311 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -596,6 +596,12 @@ static int vfio_dma_do_map(struct vfio_iommu *iommu,
if (!prot || !size || (size | iova | vaddr) & mask)
return -EINVAL;
 
+   if (map->flags & VFIO_DMA_MAP_FLAG_NOEXEC) {
+   if (!vfio_domains_have_iommu_cap(iommu, IOMMU_CAP_NOEXEC))
+   return -EINVAL;
+   prot |= IOMMU_NOEXEC;
+   }
+
/* Don't allow IOVA or virtual address wrap */
if (iova + size - 1 < iova || vaddr + size - 1 < vaddr)
return -EINVAL;
@@ -686,6 +692,14 @@ static int vfio_iommu_replay(struct vfio_iommu *iommu,
dma = rb_entry(n, struct vfio_dma, node);
iova = dma->iova;
 
+   /*
+* if any of the mappings to be replayed has the NOEXEC flag
+* set, then the new iommu domain must support it
+*/
+   if ((dma->prot & IOMMU_NOEXEC) &&
+   !(domain->caps & IOMMU_CAP_NOEXEC))
+   return -EINVAL;
+
while (iova < dma->iova + dma->size) {
phys_addr_t phys = iommu_iova_to_phys(d->domain, iova);
size_t size;
@@ -819,6 +833,9 @@ static int vfio_iommu_type1_attach_group(void *iommu_data,
if (iommu_capable(bus, IOMMU_CAP_CACHE_COHERENCY))
domain->caps |= (1 << IOMMU_CAP_CACHE_COHERENCY);
 
+   if (iommu_capable(bus, IOMMU_CAP_NOEXEC))
+   domain->caps |= IOMMU_CAP_NOEXEC;
+
/*
 * Try to match an existing compatible domain.  We don't want to
 * preclude an IOMMU driver supporting multiple bus_types and being
@@ -982,6 +999,11 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
return 0;
return vfio_domains_have_iommu_cap(iommu,
  IOMMU_CAP_CACHE_COHERENCY);
+   case VFIO_DMA_NOEXEC_IOMMU:
+   if (!iommu)
+   return 0;
+   return vfio_domains_have_iommu_cap(iommu,
+  IOMMU_CAP_NOEXEC);
default:
return 0;
}
@@ -1005,7 +1027,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
} else if (cmd == VFIO_IOMMU_MAP_DMA) {
struct vfio_iommu_type1_dma_map map;
uint32_t mask = VFIO_DMA_MAP_FLAG_READ |
-   VFIO_DMA_MAP_FLAG_WRITE;
+   VFIO_DMA_MAP_FLAG_WRITE |
+   VFIO_DMA_MAP_FLAG_NOEXEC;
 
minsz = offsetofend(struct vfio_iommu_type1_dma_map, size);
 
-- 
2.3.1



[PATCH v4 4/5] vfio: type1: replace vfio_domains_have_iommu_cache with generic function

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

Replace the function vfio_domains_have_iommu_cache() with a more generic
function vfio_domains_have_iommu_cap() which allows to check all domains
of an vfio_iommu structure for a given cached capability.

Signed-off-by: Antonios Motakis 
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/vfio_iommu_type1.c | 37 +++--
 1 file changed, 19 insertions(+), 18 deletions(-)

diff --git a/drivers/vfio/vfio_iommu_type1.c b/drivers/vfio/vfio_iommu_type1.c
index 998619b..0ea371b 100644
--- a/drivers/vfio/vfio_iommu_type1.c
+++ b/drivers/vfio/vfio_iommu_type1.c
@@ -82,6 +82,23 @@ struct vfio_group {
	struct list_head	next;
 };
 
+static int vfio_domains_have_iommu_cap(struct vfio_iommu *iommu, int cap)
+{
+   struct vfio_domain *domain;
+   int ret = 1;
+
+   mutex_lock(&iommu->lock);
+   list_for_each_entry(domain, &iommu->domain_list, next) {
+   if (!(domain->caps & cap)) {
+   ret = 0;
+   break;
+   }
+   }
+   mutex_unlock(&iommu->lock);
+
+   return ret;
+}
+
 /*
  * This code handles mapping and unmapping of user data buffers
  * into DMA'ble space using the IOMMU
@@ -948,23 +965,6 @@ static void vfio_iommu_type1_release(void *iommu_data)
kfree(iommu);
 }
 
-static int vfio_domains_have_iommu_cache(struct vfio_iommu *iommu)
-{
-   struct vfio_domain *domain;
-   int ret = 1;
-
-   mutex_lock(&iommu->lock);
-   list_for_each_entry(domain, &iommu->domain_list, next) {
-   if (!(domain->caps & IOMMU_CAP_CACHE_COHERENCY)) {
-   ret = 0;
-   break;
-   }
-   }
-   mutex_unlock(&iommu->lock);
-
-   return ret;
-}
-
 static long vfio_iommu_type1_ioctl(void *iommu_data,
   unsigned int cmd, unsigned long arg)
 {
@@ -980,7 +980,8 @@ static long vfio_iommu_type1_ioctl(void *iommu_data,
case VFIO_DMA_CC_IOMMU:
if (!iommu)
return 0;
-   return vfio_domains_have_iommu_cache(iommu);
+   return vfio_domains_have_iommu_cap(iommu,
+ IOMMU_CAP_CACHE_COHERENCY);
default:
return 0;
}
-- 
2.3.1



[PATCH v14 01/20] vfio/platform: initial skeleton of VFIO support for platform devices

2015-03-02 Thread Baptiste Reynal
From: Antonios Motakis 

This patch forms the common skeleton code for platform device support
with VFIO. It will include the core functionality of VFIO_PLATFORM;
however, binding to the device and discovering the device resources will
be done with the help of a separate file where any Linux platform-bus
specific code will reside.

This will allow us to also implement support for discovering AMBA devices
and their resources, while still reusing a large part of the VFIO_PLATFORM
implementation.
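
A bus binding then only has to provide the resource/IRQ callbacks and call
the common probe function; below is a hypothetical sketch modelled on the
AMBA binding later in this series (the 'foo' names are made up):

#include <linux/module.h>
#include <linux/platform_device.h>
#include <linux/slab.h>
#include <linux/vfio.h>

#include "vfio_platform_private.h"

static struct resource *get_foo_resource(struct vfio_platform_device *vdev,
					 int i)
{
	struct platform_device *pdev = vdev->opaque;

	return platform_get_resource(pdev, IORESOURCE_MEM, i);
}

static int get_foo_irq(struct vfio_platform_device *vdev, int i)
{
	struct platform_device *pdev = vdev->opaque;

	return platform_get_irq(pdev, i);
}

static int vfio_foo_probe(struct platform_device *pdev)
{
	struct vfio_platform_device *vdev;
	int ret;

	vdev = kzalloc(sizeof(*vdev), GFP_KERNEL);
	if (!vdev)
		return -ENOMEM;

	vdev->name = kasprintf(GFP_KERNEL, "vfio-foo-%s", dev_name(&pdev->dev));
	if (!vdev->name) {
		kfree(vdev);
		return -ENOMEM;
	}

	vdev->opaque = pdev;
	vdev->flags = VFIO_DEVICE_FLAGS_PLATFORM;
	vdev->get_resource = get_foo_resource;
	vdev->get_irq = get_foo_irq;

	ret = vfio_platform_probe_common(vdev, &pdev->dev);
	if (ret) {
		kfree(vdev->name);
		kfree(vdev);
	}

	return ret;
}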

Signed-off-by: Antonios Motakis 
[Baptiste Reynal: added includes in vfio_platform_private.h]
Signed-off-by: Baptiste Reynal 
---
 drivers/vfio/platform/vfio_platform_common.c  | 121 ++
 drivers/vfio/platform/vfio_platform_private.h |  39 +
 2 files changed, 160 insertions(+)
 create mode 100644 drivers/vfio/platform/vfio_platform_common.c
 create mode 100644 drivers/vfio/platform/vfio_platform_private.h

diff --git a/drivers/vfio/platform/vfio_platform_common.c 
b/drivers/vfio/platform/vfio_platform_common.c
new file mode 100644
index 000..34d023b
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_common.c
@@ -0,0 +1,121 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ */
+
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+#include 
+
+#include "vfio_platform_private.h"
+
+static void vfio_platform_release(void *device_data)
+{
+   module_put(THIS_MODULE);
+}
+
+static int vfio_platform_open(void *device_data)
+{
+   if (!try_module_get(THIS_MODULE))
+   return -ENODEV;
+
+   return 0;
+}
+
+static long vfio_platform_ioctl(void *device_data,
+   unsigned int cmd, unsigned long arg)
+{
+   if (cmd == VFIO_DEVICE_GET_INFO)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_GET_REGION_INFO)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_GET_IRQ_INFO)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_SET_IRQS)
+   return -EINVAL;
+
+   else if (cmd == VFIO_DEVICE_RESET)
+   return -EINVAL;
+
+   return -ENOTTY;
+}
+
+static ssize_t vfio_platform_read(void *device_data, char __user *buf,
+ size_t count, loff_t *ppos)
+{
+   return -EINVAL;
+}
+
+static ssize_t vfio_platform_write(void *device_data, const char __user *buf,
+  size_t count, loff_t *ppos)
+{
+   return -EINVAL;
+}
+
+static int vfio_platform_mmap(void *device_data, struct vm_area_struct *vma)
+{
+   return -EINVAL;
+}
+
+static const struct vfio_device_ops vfio_platform_ops = {
+   .name   = "vfio-platform",
+   .open   = vfio_platform_open,
+   .release= vfio_platform_release,
+   .ioctl  = vfio_platform_ioctl,
+   .read   = vfio_platform_read,
+   .write  = vfio_platform_write,
+   .mmap   = vfio_platform_mmap,
+};
+
+int vfio_platform_probe_common(struct vfio_platform_device *vdev,
+  struct device *dev)
+{
+   struct iommu_group *group;
+   int ret;
+
+   if (!vdev)
+   return -EINVAL;
+
+   group = iommu_group_get(dev);
+   if (!group) {
+   pr_err("VFIO: No IOMMU group for device %s\n", vdev->name);
+   return -EINVAL;
+   }
+
+   ret = vfio_add_group_dev(dev, &vfio_platform_ops, vdev);
+   if (ret) {
+   iommu_group_put(group);
+   return ret;
+   }
+
+   return 0;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_probe_common);
+
+struct vfio_platform_device *vfio_platform_remove_common(struct device *dev)
+{
+   struct vfio_platform_device *vdev;
+
+   vdev = vfio_del_group_dev(dev);
+   if (vdev)
+   iommu_group_put(dev->iommu_group);
+
+   return vdev;
+}
+EXPORT_SYMBOL_GPL(vfio_platform_remove_common);
diff --git a/drivers/vfio/platform/vfio_platform_private.h 
b/drivers/vfio/platform/vfio_platform_private.h
new file mode 100644
index 000..c046988
--- /dev/null
+++ b/drivers/vfio/platform/vfio_platform_private.h
@@ -0,0 +1,39 @@
+/*
+ * Copyright (C) 2013 - Virtual Open Systems
+ * Author: Antonios Motakis 
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License, version 2, as
+ * published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be use

Re: [PATCH 5/6] target-arm/kvm64: fix save/restore of SPSR regs

2015-03-02 Thread Christoffer Dall
Hi Alex,

Seems like you accidentally sent out two copies of this patch, hopefully
I'm reviewing the right one...

On Wed, Feb 25, 2015 at 04:02:38PM +, Alex Bennée wrote:
> From: Christoffer Dall 
> 
> The current code was negatively indexing the cpu state array and not
> synchronizing banked spsr register state with the current mode's spsr
> state, causing occasional failures with migration.
> 
> Some munging is done to take care of the aarch64 mapping and also to
> ensure the most current value of the spsr is updated to the banked
> registers (relevant for KVM<->TCG migration).
> 
> Signed-off-by: Christoffer Dall 
> Signed-off-by: Alex Bennée 
> 
> ---
> v2 (ajb)
>   - minor tweaks and clarifications
> 
> diff --git a/target-arm/kvm64.c b/target-arm/kvm64.c
> index c60e989..1e36b0a 100644
> --- a/target-arm/kvm64.c
> +++ b/target-arm/kvm64.c
> @@ -140,6 +140,7 @@ int kvm_arch_put_registers(CPUState *cs, int level)
>  uint64_t val;
>  int i;
>  int ret;
> +unsigned int el;
>  
>  ARMCPU *cpu = ARM_CPU(cs);
>  CPUARMState *env = &cpu->env;
> @@ -206,9 +207,25 @@ int kvm_arch_put_registers(CPUState *cs, int level)
>  return ret;
>  }
>  
> +/* Saved Program State Registers
> + *
> + * Before we restore from the banked_spsr[] array we need to
> + * ensure that any modifications to env->spsr are correctly
> + * reflected and map aarch64 exception levels if required.
> + */
> +el = arm_current_el(env);
> +if (is_a64(env) && el > 0) {
> +g_assert(el == 1);
> +/* KVM maps KVM_SPSR_SVC to KVM_SPSR_EL1 for aarch64 */
> +env->banked_spsr[1] = env->banked_spsr[0];
> +env->banked_spsr[aarch64_banked_spsr_index(el)] = env->spsr;
> +} else {
> +env->banked_spsr[el] = env->spsr;

is this valid if (is_a64(env) && el == 0)?  I thought that if you're
in el == 0, then env->banked_spsr[x] is the most up-to-date one, not
env->spsr?

for !is_a64(env) this looks wrong, for the same reason as above if el ==
0, but also because I think you need
bank_number(env->uncached_cpsr & CPSR_M) to index into the array.

> +}
> +
>  for (i = 0; i < KVM_NR_SPSR; i++) {
>  reg.id = AARCH64_CORE_REG(spsr[i]);
> -reg.addr = (uintptr_t) &env->banked_spsr[i - 1];
> +reg.addr = (uintptr_t) &env->banked_spsr[i+1];
> ret = kvm_vcpu_ioctl(cs, KVM_SET_ONE_REG, &reg);
>  if (ret) {
>  return ret;
> @@ -253,6 +270,7 @@ int kvm_arch_get_registers(CPUState *cs)
>  struct kvm_one_reg reg;
>  uint64_t val;
>  uint32_t fpr;
> +unsigned int el;
>  int i;
>  int ret;
>  
> @@ -325,15 +343,32 @@ int kvm_arch_get_registers(CPUState *cs)
>  return ret;
>  }
>  
> +/* Fetch the SPSR registers
> + *
> + * KVM has an array of state indexed for all the possible aarch32
> + * privilege levels. Although not all are valid at all points
> + * there are some transitions possible which can access old state
> + * so it is worth keeping them all.
> + */
>  for (i = 0; i < KVM_NR_SPSR; i++) {
>  reg.id = AARCH64_CORE_REG(spsr[i]);
> -reg.addr = (uintptr_t) &env->banked_spsr[i - 1];
> +reg.addr = (uintptr_t) &env->banked_spsr[i+1];
> ret = kvm_vcpu_ioctl(cs, KVM_GET_ONE_REG, &reg);
>  if (ret) {
>  return ret;
>  }
>  }
>  
> +el = arm_current_el(env);
> +if (is_a64(env) && el > 0) {
> +g_assert(el == 1);
> +/* KVM maps KVM_SPSR_SVC to KVM_SPSR_EL1 for aarch64 */
> +env->banked_spsr[0] = env->banked_spsr[1];
> +env->spsr = env->banked_spsr[aarch64_banked_spsr_index(el)];
> +} else {
> +env->spsr = env->banked_spsr[el];

same concern with bank_number as above.

> +}
> +
>  /* Advanced SIMD and FP registers */
>  for (i = 0; i < 32; i++) {
>  reg.id = AARCH64_SIMD_CORE_REG(fp_regs.vregs[i]);
> -- 
> 2.3.0
> 

Thanks,
-Christoffer


Re: [PATCH] arm/arm64: KVM: fix missing unlock on error in kvm_vgic_create()

2015-03-02 Thread Christoffer Dall
On Fri, Feb 27, 2015 at 07:41:45PM +0800, weiyj...@163.com wrote:
> From: Wei Yongjun 
> 
> Add the missing unlock before return from function kvm_vgic_create()
> in the error handling case.
> 
> Signed-off-by: Wei Yongjun 

Thanks, applied.
-Christoffer


[Bug 88671] Radeon driver fails to reset hardware properly after kvm guest reboot

2015-03-02 Thread bugzilla-daemon
https://bugzilla.kernel.org/show_bug.cgi?id=88671

--- Comment #7 from Alex Williamson  ---
(In reply to Tom Stellard from comment #6)
> I've been playing with this a little more and it seems to be working
> correctly,
> but radeon dynamic power management (dpm) always fails to initialize on the
> second guest boot.  My questions are:
> 
> 1. What methods are being used by kvm/qemu/libvirt to reset the GPU on guest
> shutdown?

Secondary PCI bus reset from the parent bridge.

> 2. Is the problem only caused by the fact that GPU reset is not implemented
> correctly in the radeon driver, or are there improvements that are needed in
> kvm/qemu/libvirt in order to get this working?

Without a device-specific reset, we're doing the most thorough standard reset
available to us.  I've also tried to use some of the reset mechanisms
implemented in the radeon FOSS driver but they offered no improvement over the
bus reset.  There seems to be some state retained in the device that is not
cleared via bus reset.
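
For reference, the secondary bus reset itself boils down to pulsing the
Secondary Bus Reset bit in the parent bridge's control register.  A
simplified, untested sketch of that sequence (the in-kernel helper does
the same with properly tuned delays and locking) would look something
like:

#include <linux/pci.h>
#include <linux/delay.h>

/* illustration only: pulse the Secondary Bus Reset bit on the bridge
 * above the device, then give the devices below it time to recover */
static void sketch_secondary_bus_reset(struct pci_dev *bridge)
{
        u16 ctrl;

        pci_read_config_word(bridge, PCI_BRIDGE_CONTROL, &ctrl);

        pci_write_config_word(bridge, PCI_BRIDGE_CONTROL,
                              ctrl | PCI_BRIDGE_CTL_BUS_RESET);
        msleep(2);                      /* hold reset asserted */

        pci_write_config_word(bridge, PCI_BRIDGE_CONTROL, ctrl);
        ssleep(1);                      /* settle time before re-use */
}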



[PATCH v2 0/2] Series short description

2015-03-02 Thread Joel Schopp
Review comments on v1, which used kvm_emulate_wbinvd(), pointed out that the
kvm_emulate_* helpers were inconsistent about skipping the emulated
instruction, while kvm_emulate() always skips.  The first patch cleans up the
existing users, while the second makes use of the updated
kvm_emulate_wbinvd() in svm.

---

Joel Schopp (2):
  kvm: x86: make kvm_emulate_* consistent
  x86: svm: make wbinvd faster


 arch/x86/kvm/svm.c |   11 ---
 arch/x86/kvm/vmx.c |9 +++--
 arch/x86/kvm/x86.c |   23 ---
 3 files changed, 31 insertions(+), 12 deletions(-)

--



[PATCH v2 1/2] kvm: x86: make kvm_emulate_* consistent

2015-03-02 Thread Joel Schopp
Currently kvm_emulate() skips the instruction but the kvm_emulate_* helpers
sometimes don't.  The end result is that callers end up doing the skip
themselves.  Let's make them consistent.

Signed-off-by: Joel Schopp 
---
 arch/x86/kvm/svm.c |2 --
 arch/x86/kvm/vmx.c |9 +++--
 arch/x86/kvm/x86.c |   23 ---
 3 files changed, 23 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index d319e0c..0c9e377 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -1929,14 +1929,12 @@ static int nop_on_interception(struct vcpu_svm *svm)
 static int halt_interception(struct vcpu_svm *svm)
 {
svm->next_rip = kvm_rip_read(&svm->vcpu) + 1;
-   skip_emulated_instruction(&svm->vcpu);
return kvm_emulate_halt(&svm->vcpu);
 }
 
 static int vmmcall_interception(struct vcpu_svm *svm)
 {
svm->next_rip = kvm_rip_read(&svm->vcpu) + 3;
-   skip_emulated_instruction(&svm->vcpu);
kvm_emulate_hypercall(&svm->vcpu);
return 1;
 }
diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
index 14c1a18..b7dcd3c 100644
--- a/arch/x86/kvm/vmx.c
+++ b/arch/x86/kvm/vmx.c
@@ -4995,7 +4995,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
-   return kvm_emulate_halt(vcpu);
+   return kvm_emulate_halt_noskip(vcpu);
}
return 1;
}
@@ -5522,13 +5522,11 @@ static int handle_interrupt_window(struct kvm_vcpu *vcpu)
 
 static int handle_halt(struct kvm_vcpu *vcpu)
 {
-   skip_emulated_instruction(vcpu);
return kvm_emulate_halt(vcpu);
 }
 
 static int handle_vmcall(struct kvm_vcpu *vcpu)
 {
-   skip_emulated_instruction(vcpu);
kvm_emulate_hypercall(vcpu);
return 1;
 }
@@ -5559,7 +5557,6 @@ static int handle_rdpmc(struct kvm_vcpu *vcpu)
 
 static int handle_wbinvd(struct kvm_vcpu *vcpu)
 {
-   skip_emulated_instruction(vcpu);
kvm_emulate_wbinvd(vcpu);
return 1;
 }
@@ -5898,7 +5895,7 @@ static int handle_invalid_guest_state(struct kvm_vcpu *vcpu)
 
if (vcpu->arch.halt_request) {
vcpu->arch.halt_request = 0;
-   ret = kvm_emulate_halt(vcpu);
+   ret = kvm_emulate_halt_noskip(vcpu);
goto out;
}
 
@@ -9513,7 +9510,7 @@ static int nested_vmx_run(struct kvm_vcpu *vcpu, bool launch)
vmcs12->launch_state = 1;
 
if (vmcs12->guest_activity_state == GUEST_ACTIVITY_HLT)
-   return kvm_emulate_halt(vcpu);
+   return kvm_emulate_halt_noskip(vcpu);
 
vmx->nested.nested_run_pending = 1;
 
diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c
index bd7a70b..96a8333 100644
--- a/arch/x86/kvm/x86.c
+++ b/arch/x86/kvm/x86.c
@@ -4706,7 +4706,7 @@ static void emulator_invlpg(struct x86_emulate_ctxt *ctxt, ulong address)
kvm_mmu_invlpg(emul_to_vcpu(ctxt), address);
 }
 
-int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu)
 {
if (!need_emulate_wbinvd(vcpu))
return X86EMUL_CONTINUE;
@@ -4723,11 +4723,19 @@ int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
wbinvd();
return X86EMUL_CONTINUE;
 }
+
+int kvm_emulate_wbinvd(struct kvm_vcpu *vcpu)
+{
+   kvm_x86_ops->skip_emulated_instruction(vcpu);
+   return kvm_emulate_wbinvd_noskip(vcpu);
+}
 EXPORT_SYMBOL_GPL(kvm_emulate_wbinvd);
 
+
+
 static void emulator_wbinvd(struct x86_emulate_ctxt *ctxt)
 {
-   kvm_emulate_wbinvd(emul_to_vcpu(ctxt));
+   kvm_emulate_wbinvd_noskip(emul_to_vcpu(ctxt));
 }
 
 int emulator_get_dr(struct x86_emulate_ctxt *ctxt, int dr, unsigned long *dest)
@@ -5817,7 +5825,7 @@ void kvm_arch_exit(void)
free_percpu(shared_msrs);
 }
 
-int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu)
 {
++vcpu->stat.halt_exits;
if (irqchip_in_kernel(vcpu->kvm)) {
@@ -5828,6 +5836,13 @@ int kvm_emulate_halt(struct kvm_vcpu *vcpu)
return 0;
}
 }
+EXPORT_SYMBOL_GPL(kvm_emulate_halt_noskip);
+
+int kvm_emulate_halt(struct kvm_vcpu *vcpu)
+{
+   kvm_x86_ops->skip_emulated_instruction(vcpu);
+   return kvm_emulate_halt_noskip(vcpu);
+}
 EXPORT_SYMBOL_GPL(kvm_emulate_halt);
 
 int kvm_hv_hypercall(struct kvm_vcpu *vcpu)
@@ -5912,6 +5927,8 @@ int kvm_emulate_hypercall(struct kvm_vcpu *vcpu)
unsigned long nr, a0, a1, a2, a3, ret;
int op_64_bit, r = 1;
 
+   kvm_x86_ops->skip_emulated_instruction(vcpu);
+
if (kvm_hv_hypercall_enabled(vcpu->kvm))
return kvm_hv_hypercall(vcpu);
 


[PATCH v2 2/2] x86: svm: make wbinvd faster

2015-03-02 Thread Joel Schopp
From: David Kaplan 
No need to re-decode WBINVD since we know what it is from the intercept.

Signed-off-by: David Kaplan 
[extracted from a larger unrelated patch, forward ported, tested]
Signed-off-by: Joel Schopp 
---
 arch/x86/kvm/svm.c |9 -
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
index 0c9e377..794bca7 100644
--- a/arch/x86/kvm/svm.c
+++ b/arch/x86/kvm/svm.c
@@ -2774,6 +2774,13 @@ static int skinit_interception(struct vcpu_svm *svm)
return 1;
 }
 
+static int wbinvd_interception(struct vcpu_svm *svm)
+{
+   kvm_emulate_wbinvd(&svm->vcpu);
+   return 1;
+}
+
+
 static int xsetbv_interception(struct vcpu_svm *svm)
 {
u64 new_bv = kvm_read_edx_eax(&svm->vcpu);
@@ -3374,7 +3381,7 @@ static int (*const svm_exit_handlers[])(struct vcpu_svm *svm) = {
[SVM_EXIT_STGI] = stgi_interception,
[SVM_EXIT_CLGI] = clgi_interception,
[SVM_EXIT_SKINIT]   = skinit_interception,
-   [SVM_EXIT_WBINVD]   = emulate_on_interception,
+   [SVM_EXIT_WBINVD]   = wbinvd_interception,
[SVM_EXIT_MONITOR]  = monitor_interception,
[SVM_EXIT_MWAIT]= mwait_interception,
[SVM_EXIT_XSETBV]   = xsetbv_interception,



Re: [BUG] Balloon malfunctions with memory hotplug

2015-03-02 Thread Luiz Capitulino
On Mon, 2 Mar 2015 11:52:34 +0530
Amit Shah  wrote:

> > >Another important detail is that, I *suspect* that a very similar
> > >bug already exists with 32-bit guests even without memory
> > >hotplug: what happens if you assign 6GB to a 32-bit without PAE
> > >support? I think the same problem we're seeing with memory
> > >hotplug will happen and solution 1 won't fix this, although
> > >no one seems to care about 32-bit guests...
> 
> Not just 32-bit guests; even 64-bit guests restricted with mem= on the
> cmdline.  

You're right. So, it's an already existing issue that becomes very
apparent with memory hotplug.

> I know we've discussed this in the past, and I recall
> virtio-balloon v2 was going to address this all; sadly I've not kept
> up to date with it.

Me neither :(


Re: [PATCH v2 2/2] x86: svm: make wbinvd faster

2015-03-02 Thread Radim Krčmář
2015-03-02 12:04-0600, Joel Schopp:
> From: David Kaplan 
> No need to re-decode WBINVD since we know what it is from the intercept.
> 
> Signed-off-by: David Kaplan 
> [extracted from a larger unrelated patch, forward ported, tested]
> Signed-off-by: Joel Schopp 
> ---

Reviewed-by: Radim Krčmář 

> diff --git a/arch/x86/kvm/svm.c b/arch/x86/kvm/svm.c
> @@ -2774,6 +2774,13 @@ static int skinit_interception(struct vcpu_svm *svm)
>   return 1;
>  }
>  
> +static int wbinvd_interception(struct vcpu_svm *svm)
> +{
> + kvm_emulate_wbinvd(&svm->vcpu);
> + return 1;
> +}
> +
> +

(Squashing these lines would have been a nice improvement.)

>  static int xsetbv_interception(struct vcpu_svm *svm)
>  {
>   u64 new_bv = kvm_read_edx_eax(&svm->vcpu);


Re: [PATCH v2 1/2] kvm: x86: make kvm_emulate_* consistent

2015-03-02 Thread Radim Krčmář
2015-03-02 12:04-0600, Joel Schopp:
> Currently kvm_emulate() skips the instruction but the kvm_emulate_* helpers
> sometimes don't.  The end result is that callers end up doing the skip
> themselves.  Let's make them consistent.
> 
> Signed-off-by: Joel Schopp 
> ---
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> @@ -4995,7 +4995,7 @@ static int handle_rmode_exception(struct kvm_vcpu *vcpu,
>   if (emulate_instruction(vcpu, 0) == EMULATE_DONE) {
>   if (vcpu->arch.halt_request) {
>   vcpu->arch.halt_request = 0;
> - return kvm_emulate_halt(vcpu);
> + return kvm_emulate_halt_noskip(vcpu);

noskip is used without being declared ... it shouldn't compile.

*_noskip makes the usual case harder to understand: we just want to halt
the vcpu, so name it more directly ... like kvm_vcpu_halt()?
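
(Whatever the final name is, the new variant also needs a prototype
visible to vmx.c -- a sketch of the declarations, using this version's
names, would be something like:)

/* sketch only: declarations for the new non-skipping variants, with the
 * names used in this version of the patch; kvm_vcpu is forward-declared
 * just to keep the sketch self-contained */
struct kvm_vcpu;
int kvm_emulate_halt_noskip(struct kvm_vcpu *vcpu);
int kvm_emulate_wbinvd_noskip(struct kvm_vcpu *vcpu);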

