On Sat, Mar 16, 2013 at 11:23:16AM +0100, Jan Kiszka wrote:
> From: Jan Kiszka <jan.kis...@siemens.com>
> 
> The basic idea is to always transfer the pending event injection on
> vmexit into the architectural state of the VCPU and then drop it from
> there if it turns out that we left L2 to enter L1.
> 
> VMX and SVM are now identical in how they recover event injections from
> unperformed vmlaunch/vmresume: We detect whether VM_ENTRY_INTR_INFO_FIELD
> still contains a valid event and, if so, transfer its content into L1's
> idt_vectoring_info_field.
> 
But how can this happen with the VMX code? VMX has the nested_run_pending
flag, which prevents #vmexit emulation from happening before the vmlaunch
has been performed. This means that VM_ENTRY_INTR_INFO_FIELD should never
be valid during #vmexit emulation, since it is marked invalid during
vmlaunch.
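
For reference, here is the ordering I have in mind, sketched in pseudo-C
(the function names match the current code, but the flow is my reading of
it, not code from this patch):

    /* nested_vmx_run(), emulating vmlaunch/vmresume for L1: */
    vmx->nested.nested_run_pending = 1;
    /* ... vmcs02 is loaded and vmcs12->vm_entry_intr_info_field is
     * copied into the hardware VM_ENTRY_INTR_INFO_FIELD ... */

    /* vmx_vcpu_run(): the entry into L2 is actually performed. On a
     * successful entry with event injection the CPU clears the valid
     * bit of VM_ENTRY_INTR_INFO_FIELD. */

    /* nested_vmx_vmexit()/prepare_vmcs12() can only run after that,
     * because nested_run_pending blocks #vmexit emulation until the
     * entry has been performed, so the field should already be
     * invalid here. */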

> However, we differ on how to deal with events that L0 wanted to inject
> into L2. Likely, this case is still broken in SVM. For VMX, the function
> vmcs12_save_pending_events deals with transferring pending L0 events
> into the queue of L1. That is mandatory as L1 may decide to switch the
> guest state completely, invalidating or preserving the pending events
> for later injection (including on a different node, once we support
> migration).
> 
> Note that we treat directly injected NMIs differently as they can hit
> both L1 and L2. In this case, we let L0 retry the injection over L1
> after leaving L2.
> 
Hmm, where does the SDM say that NMIs behave this way?

> To avoid incorrectly leaking an event that L1 wants to inject into the
> architectural VCPU state, we skip the cancellation on a nested run.
> 
How can the leak happen?
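
If I understand the patch correctly, the scenario it guards against is
something like this (my reconstruction, so I may be missing a step):

    /* vmlaunch is emulated; L1's event is now pending in vmcs02's
     * VM_ENTRY_INTR_INFO_FIELD: */
    nested_vmx_run(vcpu, true /* launch */);
    /* the entry is then aborted before it is performed, e.g. because
     * of a pending signal, and vcpu_enter_guest() cancels injection: */
    vmx_cancel_injection(vcpu);
    /* without the nested_run_pending check this would read the field
     * back into the vcpu->arch event queues, i.e. L1's event would
     * leak into the architectural state as if L0 had queued it */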

> Signed-off-by: Jan Kiszka <jan.kis...@siemens.com>
> ---
>  arch/x86/kvm/vmx.c |  118 ++++++++++++++++++++++++++++++++++++++--------------
>  1 files changed, 87 insertions(+), 31 deletions(-)
> 
> diff --git a/arch/x86/kvm/vmx.c b/arch/x86/kvm/vmx.c
> index 126d047..ca74358 100644
> --- a/arch/x86/kvm/vmx.c
> +++ b/arch/x86/kvm/vmx.c
> @@ -6492,8 +6492,6 @@ static void __vmx_complete_interrupts(struct kvm_vcpu *vcpu,
>  
>  static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>  {
> -     if (is_guest_mode(&vmx->vcpu))
> -             return;
>       __vmx_complete_interrupts(&vmx->vcpu, vmx->idt_vectoring_info,
>                                 VM_EXIT_INSTRUCTION_LEN,
>                                 IDT_VECTORING_ERROR_CODE);
> @@ -6501,7 +6499,7 @@ static void vmx_complete_interrupts(struct vcpu_vmx *vmx)
>  
>  static void vmx_cancel_injection(struct kvm_vcpu *vcpu)
>  {
> -     if (is_guest_mode(vcpu))
> +     if (to_vmx(vcpu)->nested.nested_run_pending)
>               return;
>       __vmx_complete_interrupts(vcpu,
>                                 vmcs_read32(VM_ENTRY_INTR_INFO_FIELD),
> @@ -6534,21 +6532,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>       struct vcpu_vmx *vmx = to_vmx(vcpu);
>       unsigned long debugctlmsr;
>  
> -     if (is_guest_mode(vcpu) && !vmx->nested.nested_run_pending) {
> -             struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> -             if (vmcs12->idt_vectoring_info_field &
> -                             VECTORING_INFO_VALID_MASK) {
> -                     vmcs_write32(VM_ENTRY_INTR_INFO_FIELD,
> -                             vmcs12->idt_vectoring_info_field);
> -                     vmcs_write32(VM_ENTRY_INSTRUCTION_LEN,
> -                             vmcs12->vm_exit_instruction_len);
> -                     if (vmcs12->idt_vectoring_info_field &
> -                                     VECTORING_INFO_DELIVER_CODE_MASK)
> -                             vmcs_write32(VM_ENTRY_EXCEPTION_ERROR_CODE,
> -                                     vmcs12->idt_vectoring_error_code);
> -             }
> -     }
> -
>       /* Record the guest's net vcpu time for enforced NMI injections. */
>       if (unlikely(!cpu_has_virtual_nmis() && vmx->soft_vnmi_blocked))
>               vmx->entry_time = ktime_get();
> @@ -6707,17 +6690,6 @@ static void __noclone vmx_vcpu_run(struct kvm_vcpu *vcpu)
>  
>       vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);
>  
> -     if (is_guest_mode(vcpu)) {
> -             struct vmcs12 *vmcs12 = get_vmcs12(vcpu);
> -             vmcs12->idt_vectoring_info_field = vmx->idt_vectoring_info;
> -             if (vmx->idt_vectoring_info & VECTORING_INFO_VALID_MASK) {
> -                     vmcs12->idt_vectoring_error_code =
> -                             vmcs_read32(IDT_VECTORING_ERROR_CODE);
> -                     vmcs12->vm_exit_instruction_len =
> -                             vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
> -             }
> -     }
> -
>       vmx->loaded_vmcs->launched = 1;
>  
>       vmx->exit_reason = vmcs_read32(VM_EXIT_REASON);
> @@ -7324,6 +7296,52 @@ vmcs12_guest_cr4(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>                       vcpu->arch.cr4_guest_owned_bits));
>  }
>  
> +static void vmcs12_save_pending_events(struct kvm_vcpu *vcpu,
> +                                    struct vmcs12 *vmcs12)
> +{
> +     u32 idt_vectoring;
> +     unsigned int nr;
> +
> +     /*
> +      * We only transfer exceptions and maskable interrupts. It is fine
> +      * if L0 retries injecting a pending NMI over L1.
> +      */
> +     if (vcpu->arch.exception.pending) {
> +             nr = vcpu->arch.exception.nr;
> +             idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
> +
> +             if (kvm_exception_is_soft(nr)) {
> +                     vmcs12->vm_exit_instruction_len =
> +                             vcpu->arch.event_exit_inst_len;
> +                     idt_vectoring |= INTR_TYPE_SOFT_EXCEPTION;
> +             } else
> +                     idt_vectoring |= INTR_TYPE_HARD_EXCEPTION;
> +
> +             if (vcpu->arch.exception.has_error_code) {
> +                     idt_vectoring |= VECTORING_INFO_DELIVER_CODE_MASK;
> +                     vmcs12->idt_vectoring_error_code =
> +                             vcpu->arch.exception.error_code;
> +             }
> +
> +             vmcs12->idt_vectoring_info_field = idt_vectoring;
> +     } else if (vcpu->arch.interrupt.pending) {
> +             nr = vcpu->arch.interrupt.nr;
> +             idt_vectoring = nr | VECTORING_INFO_VALID_MASK;
> +
> +             if (vcpu->arch.interrupt.soft) {
> +                     idt_vectoring |= INTR_TYPE_SOFT_INTR;
> +                     vmcs12->vm_exit_instruction_len =
> +                             vcpu->arch.event_exit_inst_len;
> +             } else
> +                     idt_vectoring |= INTR_TYPE_EXT_INTR;
> +
> +             vmcs12->idt_vectoring_info_field = idt_vectoring;
> +     }
> +
> +     kvm_clear_exception_queue(vcpu);
> +     kvm_clear_interrupt_queue(vcpu);
> +}
> +
>  /*
>   * prepare_vmcs12 is part of what we need to do when the nested L2 guest 
> exits
>   * and we want to prepare to run its L1 parent. L1 keeps a vmcs for L2 
> (vmcs12),
> @@ -7415,9 +7433,47 @@ static void prepare_vmcs12(struct kvm_vcpu *vcpu, struct vmcs12 *vmcs12)
>       vmcs12->vm_exit_instruction_len = vmcs_read32(VM_EXIT_INSTRUCTION_LEN);
>       vmcs12->vmx_instruction_info = vmcs_read32(VMX_INSTRUCTION_INFO);
>  
> -     /* clear vm-entry fields which are to be cleared on exit */
> -     if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY))
> +     if (!(vmcs12->vm_exit_reason & VMX_EXIT_REASONS_FAILED_VMENTRY)) {
> +             if ((vmcs12->vm_entry_intr_info_field &
> +                  INTR_INFO_VALID_MASK) &&
> +                 (vmcs_read32(VM_ENTRY_INTR_INFO_FIELD) &
> +                  INTR_INFO_VALID_MASK)) {
Again, I do not see how this condition can be true.

> +                     /*
> +                      * Preserve the event that L1 was about to inject:
> +                      * on the emulated vmexit it would have been
> +                      * returned in IDT_VECTORING_INFO_FIELD.
> +                      */
> +                     vmcs12->idt_vectoring_info_field =
> +                             vmcs12->vm_entry_intr_info_field;
> +                     vmcs12->idt_vectoring_error_code =
> +                             vmcs12->vm_entry_exception_error_code;
> +                     vmcs12->vm_exit_instruction_len =
> +                             vmcs12->vm_entry_instruction_len;
> +                     vmcs_write32(VM_ENTRY_INTR_INFO_FIELD, 0);
> +
> +                     /*
> +                      * Below, we do not drop NMIs that targeted L2, as
> +                      * they can also be reinjected over L1. But if this
> +                      * event was an NMI, it was synthetic and came from L1.
> +                      */
> +                     vcpu->arch.nmi_injected = false;
> +             } else
> +                     /*
> +                      * Transfer the event L0 may have wanted to inject
> +                      * into L2 to IDT_VECTORING_INFO_FIELD.
> +                      */
I do not understand the comment. This transfers an event from the event
queue into vmcs12. Since vmx_complete_interrupts() transfers an event that
L1 tried to inject into the event queue as well, here we handle not only
L0->L2 events, but also L1->L2 events. In fact, I think only the "else"
part of this if() is needed.
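
To spell out what I mean, the post-patch flow looks roughly like this
(my summary in pseudo-C, not code from the patch):

    /* 1. L2 exits: vmx_vcpu_run() latches the vectoring info. */
    vmx->idt_vectoring_info = vmcs_read32(IDT_VECTORING_INFO_FIELD);

    /* 2. vmx_complete_interrupts() now runs in guest mode too and
     *    moves the interrupted event - whether L0 or L1 originally
     *    injected it - into vcpu->arch.exception/interrupt/nmi. */
    vmx_complete_interrupts(vmx);

    /* 3. On the nested #vmexit, prepare_vmcs12() writes those queues
     *    back into vmcs12->idt_vectoring_info_field via
     *    vmcs12_save_pending_events(), so the else branch already
     *    covers L1->L2 events as well. */
    vmcs12_save_pending_events(vcpu, vmcs12);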

> +                     vmcs12_save_pending_events(vcpu, vmcs12);
> +
> +             /* clear vm-entry fields which are to be cleared on exit */
>               vmcs12->vm_entry_intr_info_field &= ~INTR_INFO_VALID_MASK;
> +     }
> +
> +     /*
> +      * Drop what we picked up for L2 via vmx_complete_interrupts. It is
> +      * preserved above and would only end up incorrectly in L1.
> +      */
> +     kvm_clear_exception_queue(vcpu);
> +     kvm_clear_interrupt_queue(vcpu);
>  }
>  
>  /*
> -- 
> 1.7.3.4

--
                        Gleb.